├── pics
    ├── tags.png
    └── users.png
├── .gitignore
├── main.py
├── README.md
├── LICENSE
├── model.py
└── utils.py


/pics/tags.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Wang-Yu-Qing/UTPM/HEAD/pics/tags.png


--------------------------------------------------------------------------------
/pics/users.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Wang-Yu-Qing/UTPM/HEAD/pics/users.png


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | data/
  2 | 
  3 | *.pickle
  4 | 
  5 | # Byte-compiled / optimized / DLL files
  6 | __pycache__/
  7 | *.py[cod]
  8 | *$py.class
  9 | 
 10 | # C extensions
 11 | *.so
 12 | 
 13 | # Distribution / packaging
 14 | .Python
 15 | build/
 16 | develop-eggs/
 17 | dist/
 18 | downloads/
 19 | eggs/
 20 | .eggs/
 21 | lib/
 22 | lib64/
 23 | parts/
 24 | sdist/
 25 | var/
 26 | wheels/
 27 | pip-wheel-metadata/
 28 | share/python-wheels/
 29 | *.egg-info/
 30 | .installed.cfg
 31 | *.egg
 32 | MANIFEST
 33 | 
 34 | # PyInstaller
 35 | #  Usually these files are written by a python script from a template
 36 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 37 | *.manifest
 38 | *.spec
 39 | 
 40 | # Installer logs
 41 | pip-log.txt
 42 | pip-delete-this-directory.txt
 43 | 
 44 | # Unit test / coverage reports
 45 | htmlcov/
 46 | .tox/
 47 | .nox/
 48 | .coverage
 49 | .coverage.*
 50 | .cache
 51 | nosetests.xml
 52 | coverage.xml
 53 | *.cover
 54 | *.py,cover
 55 | .hypothesis/
 56 | .pytest_cache/
 57 | 
 58 | # Translations
 59 | *.mo
 60 | *.pot
 61 | 
 62 | # Django stuff:
 63 | *.log
 64 | local_settings.py
 65 | db.sqlite3
 66 | db.sqlite3-journal
 67 | 
 68 | # Flask stuff:
 69 | instance/
 70 | .webassets-cache
 71 | 
 72 | # Scrapy stuff:
 73 | .scrapy
 74 | 
 75 | # Sphinx documentation
 76 | docs/_build/
 77 | 
 78 | # PyBuilder
 79 | target/
 80 | 
 81 | # Jupyter Notebook
 82 | .ipynb_checkpoints
 83 | 
 84 | # IPython
 85 | profile_default/
 86 | ipython_config.py
 87 | 
 88 | # pyenv
 89 | .python-version
 90 | 
 91 | # pipenv
 92 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 93 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 94 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 95 | #   install all needed dependencies.
 96 | #Pipfile.lock
 97 | 
 98 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
 99 | __pypackages__/
100 | 
101 | # Celery stuff
102 | celerybeat-schedule
103 | celerybeat.pid
104 | 
105 | # SageMath parsed files
106 | *.sage.py
107 | 
108 | # Environments
109 | .env
110 | .venv
111 | env/
112 | venv/
113 | ENV/
114 | env.bak/
115 | venv.bak/
116 | 
117 | # Spyder project settings
118 | .spyderproject
119 | .spyproject
120 | 
121 | # Rope project settings
122 | .ropeproject
123 | 
124 | # mkdocs documentation
125 | /site
126 | 
127 | # mypy
128 | .mypy_cache/
129 | .dmypy.json
130 | dmypy.json
131 | 
132 | # Pyre type checker
133 | .pyre/
134 | 


--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
 1 | import tensorflow as tf
 2 | 
 3 | 
 4 | import os
 5 | os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
 6 | 
 7 | 
 8 | from utils import *
 9 | from model import UTPM
10 | 
11 | 
# dtype of every model weight
DTYPE = tf.float32
# encoded id reserved for padding
PAD_VALUE = 0
# number of worker processes used when building user samples
NUM_WORKERS = 4


if __name__ == "__main__":
    # End-to-end pipeline: (optionally) build tf records, train the model,
    # evaluate it, plot the embeddings and print tag similarity search results.
    args = parse_args()
    movie_tag_rel, tag_encoder, tag_decoder = extract_movie_tag_relation(
        "data/ml-20m/genome-scores.csv",
        args.tags_per_movie,
        args.min_tag_score,
        args.min_tag_freq,
    )
    movie_cate_rel, cate_encoder, cate_decoder = extract_movie_cate_relation("data/ml-20m/movies.csv")
    all_tags = list(set(tag_encoder.values()))
    print("Number of tags: ", len(tag_encoder))

    if args.prepare_tfrecords:
        print("Start building user samples")
        all_users_samples = build_user_samples_mp(
            "data/ml-20m/ratings.csv",
            all_tags,
            movie_tag_rel,
            movie_cate_rel,
            NUM_WORKERS,
            args,
        )
        print("Samples build done.")
        # randomly split train and test users and their samples
        train_samples, test_samples = split_train_test(all_users_samples)
        print("Start writing tf records.")
        for samples, path in (
            (train_samples, 'data/train_samples.tfrecords'),
            (test_samples, 'data/test_samples.tfrecords'),
        ):
            write_tf_records(samples, path)

    train_dataset, test_dataset = read_tf_records(args.batch_size)

    model = UTPM(
        n_tags=len(tag_decoder),
        n_cates=len(cate_decoder),
        E=args.E,
        T=args.T,
        D=args.D,
        C=args.C,
        U=args.U,
        dtype=DTYPE,
        pad_value=PAD_VALUE,
        lr=args.lr,
        log_step=args.log_step,
        epochs=args.epochs,
        use_cross=args.use_cross,
    )

    model.train(train_dataset)
    # round-trip through the pickle file so evaluation runs on the persisted model
    save(model, 'model.pickle')
    model = load('model.pickle')

    # tag raw id -> embedding
    tags_embeds = {
        tag_decoder[encoded_id]: embed
        for encoded_id, embed in enumerate(model.query_tags_embeds())
    }

    users_embeds = evaluate(model, test_dataset, tags_embeds, args.U)

    # 2D plots of the tag and user embeddings
    tag_names = read_tag_name('data/ml-20m/genome-tags.csv')
    ordered_names = [tag_names[raw_id] for raw_id in tags_embeds]
    ordered_vecs = list(tags_embeds.values())
    tsne(np.array(ordered_vecs), 'pics/tags.png', names=ordered_names)
    tsne(users_embeds, 'pics/users.png')

    # Print out tag similarity search result
    # NOTE: these may print out a lot to the terminal
    idx2name = dict(enumerate(ordered_names))
    tag_vecs = np.array(ordered_vecs)

    search_index = faiss.IndexFlatIP(args.U)
    search_index.add(tag_vecs)

    all_dis, all_neigh = search_index.search(tag_vecs, k=5)
    for query_idx, (dis, neigh) in enumerate(zip(all_dis, all_neigh)):
        target_tag = idx2name[query_idx]
        print("{} -->".format(target_tag))
        for _dis, neigh_idx in zip(dis, neigh):
            print('\t', idx2name[neigh_idx], _dis)
96 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # UTPM
 2 | Code for paper: [Learning to Build User-tag Profile in Recommendation System](https://dl.acm.org/doi/abs/10.1145/3340531.3412719)
 3 | 
 4 | ## Dataset
 5 | Download link: [MovieLens 20M](https://grouplens.org/datasets/movielens/20m/)
  6 | Create a `data` folder, then unzip the dataset into it so that the CSV files are located under `data/ml-20m/`.
 7 | 
 8 | ## Tag similarity query result
 9 | Here are some tag similarity search results based on the trained embedding vectors:
10 | ```
11 | happy ending -->
12 |          happy ending 1.0
13 |          cute! 0.9432591
14 |          fantasy 0.89542687
15 |          cute 0.88582855
16 |          fun 0.8504139
17 | adaptation -->
18 |          adaptation 0.9999999
19 |          books 0.986031
20 |          based on book 0.91810507
21 |          mythology 0.88804483
22 |          magic 0.8615897
23 | book -->
24 |          book 0.99999994
25 |          island 0.8349991
26 |          hospital 0.83032197
27 |          book was better 0.82220745
28 |          midlife crisis 0.80883765
29 | book was better -->
30 |          book was better 0.99999994
31 |          book 0.82220745
32 |          midlife crisis 0.8160943
33 |          homophobia 0.8155084
34 |          stereotypes 0.7798702
35 | literature -->
36 |          literature 0.9999999
37 |          literary adaptation 0.88163626
38 |          passionate 0.8808603
39 |          18th century 0.8807162
40 |          based on a play 0.86564
41 | father son relationship -->
42 |          father son relationship 0.9999999
43 |          vengeance 0.80616903
44 |          police corruption 0.79352015
45 |          oscar (best foreign language film) 0.78094
46 |          tragedy 0.77240366
47 | storytelling -->
48 |          storytelling 0.9999999
49 |          small town 0.7059536
50 |          love story 0.5919437
51 |          romantic 0.53870004
52 |          paris 0.53361225
53 | ```
54 | 
55 | ## Tag & user embedding distribution
 56 | The trained tag and user embeddings are reduced to 2D with t-SNE; the plots below show the resulting distributions of tags and users.
57 | ![tags](pics/tags.png)
58 | ![users](pics/users.png)
59 | 
60 | ## Precision@K
 61 | Currently **cannot reproduce the results in the paper**. I think the paper's data preprocessing (e.g. filtering out some long-tail users / movies) differs from mine, and this is crucial to the evaluation result. However, these details are not given in the paper, so the gap still exists.
62 | 
63 | With `user_frac` set to 0.5:
64 | * precision@1: 14.87%
65 | * precision@2: 7.46%
66 | * precision@3: 6.67%
67 | 
68 | ## Run the model
69 | Create `pics` folder for saving t-sne embedding distribution pics. Run with default config `python main.py`
70 | 
 71 | If you don't have a GPU but want to see results quickly, it is recommended to set `user_frac` to a small value (say 0.3) so that only a sample of the users is used.
72 | 
73 | If everything is properly set, you should see outputs like the following:
74 | 
75 | ```
76 | epoch: 000 | step: 00000 | batch_loss: 6.6119 | epoch_avg_loss: 6.6119 | step_time: 0.35378
77 | epoch: 000 | step: 00100 | batch_loss: 3.0183 | epoch_avg_loss: 4.4048 | step_time: 0.01175
78 | epoch: 000 | step: 00200 | batch_loss: 5.4969 | epoch_avg_loss: 3.7021 | step_time: 0.01200
79 | epoch: 000 | step: 00300 | batch_loss: 4.1210 | epoch_avg_loss: 3.3223 | step_time: 0.01197
80 | epoch: 000 | step: 00400 | batch_loss: 1.5884 | epoch_avg_loss: 3.0219 | step_time: 0.01407
81 | epoch: 000 | step: 00500 | batch_loss: 0.5027 | epoch_avg_loss: 2.7848 | step_time: 0.01389
82 | epoch: 000 | step: 00600 | batch_loss: 3.1471 | epoch_avg_loss: 2.5947 | step_time: 0.00737
83 | epoch: 000 | step: 00700 | batch_loss: 0.9000 | epoch_avg_loss: 2.4497 | step_time: 0.00798
84 | ......
85 | ```
86 | 
 87 | To change the default config, pass arguments when launching main.py; see utils.py for details on the arguments.
88 | 
89 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/model.py:
--------------------------------------------------------------------------------
  1 | import time as time
  2 | import tensorflow as tf
  3 | 
  4 | 
  5 | class UTPM:
  6 |     def __init__(self, n_tags, n_cates, E, T, D, C, U, dtype, pad_value, lr, log_step, epochs, use_cross):
  7 |         self.E = E
  8 |         self.U = U
  9 |         self.pad_value = pad_value
 10 |         self.log_step = log_step
 11 |         self.epochs = epochs
 12 |         self.dtype = dtype
 13 |         self.use_cross = use_cross
 14 |         # init embedding weights
 15 |         self.all_embeds = {
 16 |             "tag": self.init_trainable_weights([n_tags, E], "tag_embeds"),
 17 |             "cate": self.init_trainable_weights([n_cates, E], "cate_embeds"),
 18 |             "tag_label": self.init_trainable_weights([n_tags, U], "tag_label_embeds"),
 19 |         }
 20 |         if use_cross:
 21 |             self.all_embeds["cross"] = self.init_trainable_weights([2 * E, C], "cross_embeds")
 22 | 
 23 |         # embedding op for padding value lookup
 24 |         self.reset_pad_embedding()
 25 | 
 26 |         # heads query weights for all fields' features
 27 |         self.Q = self.init_trainable_weights([2, T, 1], "Q") # (n_head, T, 1)
 28 |         # transform weights & bias for each field
 29 |         self.W_fields = self.init_trainable_weights([2, 2, E, T], "W")  # (n_head, n_list_field, E, T)
 30 |         # in the paper, bias is shared across heads, we just use different bias for each head here.
 31 |         self.B_fields = self.init_trainable_weights([2, 2, T], "B") # (n_head, n_list_field, T)
 32 | 
 33 |         # transform weights & bias for final attention merge (merge fields embedding)
 34 |         self.W_final = self.init_trainable_weights([2, E, T], "FW")
 35 |         self.B_final = self.init_trainable_weights([2, T], "FB")
 36 | 
 37 |         # fc weights
 38 |         if self.use_cross:
 39 |             self.fc1 = self.init_trainable_weights([int(E + 2 * E * (2 * E - 1) / 2), D], "fc1")
 40 |             self.fc2 = self.init_trainable_weights([D, U], "fc2")
 41 |         else:
 42 |             self.fc1 = self.init_trainable_weights([2 * E, D], "fc1")
 43 |             self.fc2 = self.init_trainable_weights([D, U], "fc2")
 44 |         
 45 |         self.trainable_weights = [
 46 |             self.all_embeds["tag"], 
 47 |             self.all_embeds["cate"], 
 48 |             self.all_embeds["tag_label"],
 49 |             self.Q, 
 50 |             self.W_fields, 
 51 |             self.B_fields, 
 52 |             self.W_final, 
 53 |             self.B_final, 
 54 |             self.fc1, 
 55 |             self.fc2
 56 |         ]
 57 |         
 58 |         self.opt = tf.optimizers.Adam(learning_rate=lr)
 59 |          
 60 |     def init_trainable_weights(self, shape, name):
 61 |         return tf.Variable(tf.random.truncated_normal(shape, stddev=1.0 / shape[1], dtype=self.dtype),
 62 |                            dtype=self.dtype,
 63 |                            name=name,
 64 |                            trainable=True)
 65 |     
 66 |     def reset_pad_embedding(self):
 67 |         tf.compat.v1.scatter_update(self.all_embeds["tag"],
 68 |                                     self.pad_value,
 69 |                                     tf.zeros([self.E,], dtype=self.dtype)),
 70 |         tf.compat.v1.scatter_update(self.all_embeds["cate"], 
 71 |                                     self.pad_value,
 72 |                                     tf.zeros([self.E,], dtype=self.dtype)),
 73 |         tf.compat.v1.scatter_update(self.all_embeds["tag_label"], 
 74 |                                     self.pad_value,
 75 |                                     tf.zeros([self.U,], dtype=self.dtype))
 76 | 
 77 |     def head_attention(self, embeds, head_idx, Q, W, B, return_weights=False):
 78 |         """
 79 |             Basic attention merge operation defined in paper's equation (1)
 80 | 
 81 |             @embeds: (batch_size, n, E), batch of all t_i
 82 |             @head_idx: scalar
 83 |             @Q: (n_head, T, 1)
 84 |             @W: W (n_head, E, T)
 85 |             @B: B (n_head, T)
 86 |         """
 87 |         W_head = W[head_idx] # (E, T)
 88 |         B_head = B[head_idx] # (T, )
 89 |         Q_head = Q[head_idx] # (T, 1)
 90 |         
 91 |         # when using matmul, must convert vector to matrix
 92 |         matmul_W = tf.matmul(embeds, W_head) # (batch_size, n, T)
 93 |         add_B = matmul_W + B_head # (batch_size, n, T)
 94 |         relued = tf.expand_dims(tf.nn.relu(add_B), 2) # (batch_size, n, 1, T)
 95 |         matmul_Q = tf.squeeze(tf.matmul(relued, Q_head), axis=[2, 3]) # (batch_size, n)
 96 |         alphas = tf.expand_dims(tf.nn.softmax(matmul_Q), 1) # (batch_size, 1, n)
 97 |         # no squeeze for further list fields merged embedding concat with single fea embedding
 98 |         res = tf.matmul(alphas, embeds) # (batch_size, 1, E)
 99 |         
100 |         if return_weights:
101 |             return res, tf.squeeze(alphas, axis=1)
102 |         else:
103 |             return res
104 |     
105 |     def merge_features(self, i, fea_embeds, attention_weights=None):
106 |         """
107 |             Merge feature embedding from the given list field into field embedding
108 |             @i: list field idx
109 |             @fea_embeds: features embedding of the target list field, (batch_size, n_fea, E)
110 |         """
111 |         # get target list field's W and B
112 |         # (n_head, E, T)
113 |         W = self.W_fields[:, i, :, :]
114 |         # (n_head, T)
115 |         B = self.B_fields[:, i, :]
116 |         
117 |         if attention_weights is not None:
118 |             h0_fea_merged, h0_weights = self.head_attention(fea_embeds, 0, self.Q, W, B, True) # (batch_size, 1, E)
119 |             h1_fea_merged, h1_weights = self.head_attention(fea_embeds, 1, self.Q, W, B, True) # (batch_size, 1, E)
120 |             attention_weights[str(i) + "_h0"] = h0_weights
121 |             attention_weights[str(i) + "_h1"] = h1_weights
122 |         else:
123 |             # merge list feature embeds to produce one embedding for the list feature
124 |             h0_fea_merged = self.head_attention(fea_embeds, 0, self.Q, W, B) # (batch_size, 1, E)
125 |             h1_fea_merged = self.head_attention(fea_embeds, 1, self.Q, W, B) # (batch_size, 1, E)
126 |         
127 |         return tf.squeeze(h0_fea_merged, axis=1), tf.squeeze(h1_fea_merged, axis=1)
128 | 
129 |     def attention_forward(self, pos_tags, pos_cates, return_weights=False):
130 |         list_fields_embeds = {}
131 |         # query embeddings
132 |         list_fields_embeds["pos_tag"] = tf.nn.embedding_lookup(self.all_embeds["tag"], pos_tags)
133 |         list_fields_embeds["pos_cate"] = tf.nn.embedding_lookup(self.all_embeds["cate"], pos_cates)
134 |         
135 |         h0_fields_embeds, h1_fields_embeds = [], []
136 |         attention_weights = {}
137 |         # First merge: merge each list field's feature value embeddings into each list field's embedding.
138 |         for i, (field_name, fea_embeds) in enumerate(list_fields_embeds.items()):
139 |             if return_weights:
140 |                 h0_fea_merged, h1_fea_merged = self.merge_features(i, fea_embeds, attention_weights)
141 |             else:
142 |                 h0_fea_merged, h1_fea_merged = self.merge_features(i, fea_embeds)
143 |             h0_fields_embeds.append(h0_fea_merged)
144 |             h1_fields_embeds.append(h1_fea_merged)
145 | 
146 |         h0_fields_embeds = tf.stack(h0_fields_embeds, axis=1) # (batch_size, n_fea, E)
147 |         h1_fields_embeds = tf.stack(h1_fields_embeds, axis=1) # (batch_size, n_fea, E)
148 | 
149 |         # Second merge: merge all fields embedding into final embedding
150 |         h0_batch_res = self.head_attention(h0_fields_embeds, 0, self.Q, self.W_final, self.B_final) # (batch_size, 1, E)
151 |         h1_batch_res = self.head_attention(h1_fields_embeds, 1, self.Q, self.W_final, self.B_final) # (batch_size, 1, E)
152 |         
153 |         # (batch_size, 2E)
154 |         if return_weights:
155 |             return tf.squeeze(tf.concat([h0_batch_res, h1_batch_res], axis=2), axis=1), attention_weights
156 |         else:
157 |             return tf.squeeze(tf.concat([h0_batch_res, h1_batch_res], axis=2), axis=1)
158 | 
159 |     def brute_force_cross(self, x):
160 |         res = []
161 |         for i in range(x.shape[1] - 1):
162 |             for j in range(i + 1, x.shape[1]):
163 |                 # (2, C)
164 |                 embeds = tf.nn.embedding_lookup(self.all_embeds["cross"], [i, j])
165 |                 # (1, )
166 |                 vi_vj = tf.tensordot(embeds[0], embeds[1], axes=1)
167 |                 # (batch_size, )
168 |                 xi_xj = x[:, i] * x[:, j]
169 |                 # (batch_size, )
170 |                 res.append(xi_xj * vi_vj)
171 |                 
172 |         # (batch_size, 2 * E + 0.5 * 2 * E * (2 * E - 1))
173 |         return tf.concat([x, tf.stack(res, axis=1)], axis=1)
174 | 
175 |     def forward(self, pos_tags, pos_cates):
176 |         x = self.attention_forward(pos_tags, pos_cates)
177 |         if self.use_cross:
178 |             x = self.brute_force_cross(x)
179 |         x = tf.nn.relu(tf.matmul(x, self.fc1))
180 |         batch_user_embeds = tf.nn.relu(tf.matmul(x, self.fc2))
181 |         batch_user_embeds = tf.math.l2_normalize(batch_user_embeds, axis=1)
182 | 
183 |         return batch_user_embeds
184 | 
185 |     def forward_with_attention_details(self, batch_samples):
186 |         x, attention_weights = self.attention_forward(batch_samples, return_weights=True)
187 |         if self.use_cross:
188 |             x = self.brute_force_cross(x)
189 |         x = tf.nn.relu(tf.matmul(x, self.fc1))
190 |         batch_user_embeds = tf.nn.relu(tf.matmul(x, self.fc2))
191 | 
192 |         return batch_user_embeds, attention_weights
193 |     
194 |     def loss(self, batch_user_embeds, batch_target_movie_tags, batch_labels):
195 |         # (batch_size, n_tags, U)
196 |         batch_target_tags_embeds = tf.nn.embedding_lookup(self.all_embeds["tag_label"], batch_target_movie_tags)
197 |         batch_target_tags_embeds = tf.math.l2_normalize(batch_target_tags_embeds, axis=1)
198 |         # (batch_size, )
199 |         y_k = tf.math.sigmoid(tf.reduce_sum(tf.squeeze(tf.matmul(batch_target_tags_embeds, tf.expand_dims(batch_user_embeds, axis=2)), axis=2), axis=1))
200 |         # log(x) needs x > 0 for both x = y_k and x = 1 - y_k
201 |         y_k = tf.math.minimum(y_k, 1 - 1e-06)
202 |         y_k = tf.math.maximum(y_k, 0 + 1e-06)
203 | 
204 |         return (-1 / batch_labels.shape[0]) * tf.reduce_sum(batch_labels * tf.math.log(y_k) + (1 - batch_labels) * tf.math.log(1 - y_k), axis=0)
205 | 
206 |     def train(self, train_dataset):
207 |         last_epoch_avg_loss = float("inf")
208 |         for epoch in range(self.epochs):
209 |             epoch_total_loss = 0
210 |             for step, _batch_samples in enumerate(train_dataset):
211 |                 tic = time.time()
212 |                 user_id = _batch_samples[0]
213 |                 # X
214 |                 pos_tag = _batch_samples[1]
215 |                 pos_cate = _batch_samples[2]
216 |                 # Y
217 |                 target_movie_tag = _batch_samples[3]
218 |                 labels = _batch_samples[4]
219 | 
220 |                 with tf.GradientTape() as tape:
221 |                     user_embeds = self.forward(pos_tag, pos_cate)
222 |                     batch_loss = self.loss(user_embeds, target_movie_tag, labels)
223 |                 
224 |                 epoch_total_loss += batch_loss
225 |                 epoch_avg_loss = epoch_total_loss / (step + 1)
226 | 
227 |                 toc = time.time()
228 |                 if step % self.log_step == 0:
229 |                     print("epoch: {:03d} | step: {:05d} | batch_loss: {:.4f} | epoch_avg_loss: {:.4f} | step_time: {:.5f}".\
230 |                         format(epoch + 1, step, batch_loss, epoch_avg_loss, toc - tic))
231 | 
232 |                 grads = tape.gradient(batch_loss, self.trainable_weights)
233 |                 self.opt.apply_gradients(zip(grads, self.trainable_weights))
234 |                 
235 |                 self.reset_pad_embedding()
236 | 
237 |             print("Epoch {} done, epoch avg loss: {}".format(epoch + 1, epoch_avg_loss))
238 | 
239 |             last_epoch_avg_loss = epoch_avg_loss
240 | 
241 |     def query_tags_embeds(self):
242 |         """
243 |             Use trained tag label embedding vecs as tag embeds during prediction.
244 |             Not using tag embedding in the input layer, 
245 |             because the prediction during model training is based on the dot product
246 |             of the movie's tags label embeddings.
247 | 
248 |             return tag embedding table, idx is encoded id
249 |         """
250 |         return tf.math.l2_normalize(self.all_embeds["tag_label"], axis=1).numpy()
251 |     
252 | 


--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
  1 | import faiss
  2 | import random
  3 | import argparse
  4 | import pickle
  5 | import numpy as np
  6 | import tensorflow as tf
  7 | import multiprocessing
  8 | from itertools import repeat
  9 | from sklearn.manifold import TSNE
 10 | import matplotlib.pyplot as plt
 11 | 
 12 | 
 13 | def parse_args():
 14 |     argparser = argparse.ArgumentParser()
 15 |     argparser.add_argument('--epochs', type=int, default=20)
 16 |     argparser.add_argument('--batch_size', type=int, default=256)
 17 |     argparser.add_argument('--E', type=int, default=16)
 18 |     argparser.add_argument('--T', type=int, default=8)
 19 |     argparser.add_argument('--U', type=int, default=16)
 20 |     argparser.add_argument('--C', type=int, default=4)
 21 |     argparser.add_argument('--D', type=int, default=32)
 22 |     argparser.add_argument('--lr', type=float, default=0.01, help="learning rate")
 23 |     argparser.add_argument('--log_step', type=int, default=500)
 24 |     argparser.add_argument('--pad_value', type=int, default=0)
 25 |     # turn on this will increase forward function's time complexity a lot
 26 |     argparser.add_argument('--use_cross', type=int, default=0, help="whether to use cross layer")
 27 |     argparser.add_argument('--user_frac', default=0.5, type=int, help="fraction of users to be used in training and testing")
 28 |     argparser.add_argument('--max_user_samples', type=int, default=30, help="max labels per user")
 29 |     argparser.add_argument('--min_movies_per_user', type=int, default=50, help="min movies for a valid user")
 30 |     argparser.add_argument('--max_movies_per_user', type=int, default=150, help="max movies for a valid user")
 31 |     argparser.add_argument('--tags_per_movie', type=int, default=10, help="tags per movie")
 32 |     argparser.add_argument('--min_tag_score', type=float, default=0.7, help="min tag score")
 33 |     argparser.add_argument('--min_tag_freq', type=int, default=50, help="min tag freq")
 34 |     argparser.add_argument('--user_his_min_freq', type=int, default=5, help="min valid tag / cate freq in one user's history")
 35 |     argparser.add_argument('--n_values_per_field', type=int, default=100, help="number of values per field")
 36 |     argparser.add_argument('--n_list_fea', type=int, default=2, help="number of list features")
 37 |     argparser.add_argument('--n_neg', type=int, default=5, help="number of negative target per positive")
 38 |     argparser.add_argument('--n_neg_target', type=int, default=20, help="number of tags per negative target")
 39 |     argparser.add_argument('--prepare_tfrecords', default=1, type=int, help="whether to prepare tfrecords, need be set to 1 for first run.")
 40 | 
 41 |     args = argparser.parse_args()
 42 | 
 43 |     return args
 44 | 
 45 | 
 46 | def read_tag_name(filepath):
 47 |     tag_name = {'<pad>': '<pad>'}
 48 |     with open(filepath, "r") as f:
 49 |         f.readline()
 50 |         for line in f.readlines():
 51 |             line = line[:-1]
 52 |             splitted = line.split(",")
 53 |             tag_name[int(splitted[0])] = splitted[1]
 54 |     
 55 |     return tag_name
 56 | 
 57 | 
 58 | def extract_tags(tag_scores, movie_tag_rel, last_movie_id, tags_per_movie, min_tag_score):
 59 |     # as decribed by the paper, restrict max number of tags for each movie
 60 |     tags = sorted(tag_scores, key=lambda x: x[1], reverse=True)[:tags_per_movie]
 61 |     # and only keep tags score higher than thred
 62 |     movie_tag_rel[last_movie_id] = set([x[0] for x in tags if x[1] > min_tag_score])
 63 |     tag_scores.clear()
 64 | 
 65 | 
 66 | def filter_movie_tag(movie_tag_rel, min_tag_freq, tags_per_movie):
 67 |     # filter out tags that cover too little movies
 68 |     tag_freq = {}
 69 |     for movie_id, tags in movie_tag_rel.items():
 70 |         for tag in tags:
 71 |             try:
 72 |                 tag_freq[tag] += 1
 73 |             except KeyError:
 74 |                 tag_freq[tag] = 1
 75 |     valid_tags = set([x[0] for x in tag_freq.items() if x[1] >= min_tag_freq])
 76 | 
 77 |     # filter out invalid tags from movies, and drop movies whose tag number is not enough
 78 |     invalid_movies = set()
 79 |     for movie_id, tags in movie_tag_rel.items():
 80 |         # filter out invalid tags from movie
 81 |         movie_tag_rel[movie_id] = [x for x in tags if x in valid_tags]
 82 |         if len(movie_tag_rel[movie_id]) < tags_per_movie:
 83 |             invalid_movies.add(movie_id)
 84 | 
 85 |     return {x[0]: x[1] for x in movie_tag_rel.items() if x[0] not in invalid_movies}
 86 | 
 87 | 
 88 | def extract_movie_tag_relation(filepath, tags_per_movie, min_tag_score, min_tag_freq):
 89 |     movie_tag_rel = {}
 90 |     with open(filepath, "r") as f:
 91 |         f.readline()
 92 |         last_movie_id, tag_scores = None, []
 93 |         for line in f.readlines():
 94 |             line = line.strip()
 95 |             splitted = line.split(",")
 96 |             movie_id, tag_id, score = int(splitted[0]), int(splitted[1]), float(splitted[2])
 97 |             # use 0 as padding value, make sure original id not starting from 0
 98 |             if last_movie_id is not None and movie_id != last_movie_id:
 99 |                 extract_tags(tag_scores, movie_tag_rel, last_movie_id, tags_per_movie, min_tag_score)
100 |             tag_scores.append((tag_id, score))
101 |             last_movie_id = movie_id
102 |         
103 |         extract_tags(tag_scores, movie_tag_rel, last_movie_id, tags_per_movie, min_tag_score)
104 | 
105 |     # filter
106 |     movie_tag_rel = filter_movie_tag(movie_tag_rel, min_tag_freq, tags_per_movie)    
107 | 
108 |     # encode tags
109 |     tag_encoder, tag_decoder, tag_id = {"<pad>": 0}, ["<pad>"], 1
110 |     for movie_id, raw_tags in movie_tag_rel.items():
111 |         encoded_tags = []
112 |         for raw_tag_id in raw_tags:
113 |             if raw_tag_id not in tag_encoder:
114 |                 tag_encoder[raw_tag_id] = tag_id
115 |                 tag_decoder.append(raw_tag_id)
116 |                 tag_id += 1
117 |             
118 |             encoded_tags.append(tag_encoder[raw_tag_id])
119 |         movie_tag_rel[movie_id] = encoded_tags
120 | 
121 |     return movie_tag_rel, tag_encoder, tag_decoder
122 | 
123 | 
def extract_movie_cate_relation(filepath):
    """
        Build the movie -> encoded category ids relation from a movies file.

        The file has one header row; on every other row the first
        comma-separated field is the movie id and the last one is a
        "|"-separated list of categories.

        Returns (movie_cate_rel, cate_encoder, cate_decoder). Encoded id 0 is
        reserved for "<pad>"; categories get ids from 1 in order of first
        appearance.
    """
    # 0 is the padding value, so real categories start at 1
    cate_encoder = {"<pad>": 0}
    cate_decoder = ["<pad>"]
    movie_cate_rel = {}
    with open(filepath, "r", encoding="utf-8") as f:
        # skip the header row
        f.readline()
        for raw_line in f:
            fields = raw_line.strip().split(",")
            cates = fields[-1].split("|")
            movie_id = int(fields[0])
            cates_encoded = []
            for cate in cates:
                if cate not in cate_encoder:
                    # next free id equals the current decoder length
                    cate_encoder[cate] = len(cate_decoder)
                    cate_decoder.append(cate)
                cates_encoded.append(cate_encoder[cate])
            movie_cate_rel[movie_id] = cates_encoded

    return movie_cate_rel, cate_encoder, cate_decoder
146 | 
147 | 
def extract_user_behaviors(ratings_filepath, args):
    """
        Read the ratings file and return a sampled user -> behaviors mapping.

        Every row after the header is "user id,movie id,rating,timestamp".
        Ratings <= 1.5 become (movie_id, 0, timestamp), ratings >= 3.5 become
        (movie_id, 1, timestamp); ratings in between are ignored.

        A user is kept when the number of recorded behaviors lies within
        [args.min_movies_per_user, args.max_movies_per_user] and a uniform
        random draw falls below args.user_frac.
    """
    user_behaviors = {}

    with open(ratings_filepath, "r") as f:
        # skip the header row
        f.readline()
        for raw_line in f:
            fields = raw_line.strip().split(",")
            user_id, movie_id = int(fields[0]), int(fields[1])
            rating, timestamp = float(fields[2]), int(fields[3])
            records = user_behaviors.setdefault(user_id, [])
            if rating <= 1.5:
                records.append((movie_id, 0, timestamp))
            elif rating >= 3.5:
                records.append((movie_id, 1, timestamp))

    selected = {}
    for user_id, behaviors in user_behaviors.items():
        # one draw per user, valid or not, so the random sequence does not
        # depend on the validity filter
        draw = random.uniform(0, 1)
        n_movies = len(behaviors)
        out_of_range = n_movies < args.min_movies_per_user or n_movies > args.max_movies_per_user
        if draw < args.user_frac and not out_of_range:
            selected[user_id] = behaviors

    return selected
173 | 
174 | 
def extract_pos_tags_cates(movie_id, label, pos_tags, pos_cates, movie_tag_rel, movie_cate_rel):
    """
        Append the tags and categories of a positively labelled movie.

        Nothing is appended when the movie is missing from either relation or
        when label is not 1. Repeated tags are kept on purpose: repetition
        gives a stronger signal of the user's interest in that tag.
    """
    try:
        tags = movie_tag_rel[movie_id]
        cates = movie_cate_rel[movie_id]
    except KeyError:
        return
    if label != 1:
        return
    pos_tags.extend(tags)
    pos_cates.extend(cates)
188 | 
189 | 
def extract_tags_labels(movie_id, label, tags_labels, movie_tag_rel):
    """
        Append (tags of the movie, label) to tags_labels.

        Movies without an entry in movie_tag_rel are skipped silently.
    """
    try:
        movie_tags = movie_tag_rel[movie_id]
    except KeyError:
        return
    else:
        tags_labels.append((movie_tags, label))
196 | 
197 | 
def pad_or_cut(values, pad_value, length):
    """
        Bring a list to exactly `length` elements.

        Shorter lists are right-padded with pad_value (a new list is
        returned). Longer lists are cut to a random selection of `length`
        entries drawn without replacement, so every returned entry comes from
        a distinct position of the input. A list that already has the
        requested length is returned as is.
    """
    n_values = len(values)
    if n_values < length:
        return values + [pad_value] * (length - n_values)
    if n_values > length:
        # random.choices would draw with replacement and could repeat one
        # position several times while dropping others
        return random.sample(values, k=length)
    return values
205 | 
206 | 
def build_user_samples_mp(ratings_filepath, all_tags, movie_tag_rel, movie_cate_rel, num_workers, args):
    """
        Build the samples of every selected user in a pool of worker processes.

        User behaviors are read with extract_user_behaviors, then
        build_user_samples is called once per user through a
        multiprocessing.Pool of num_workers processes.

        Returns a list with one sample list per user, in the iteration order
        of the user behaviors mapping.
    """
    user_behaviors = extract_user_behaviors(ratings_filepath, args)
    # one positional argument tuple per user, in build_user_samples' parameter order
    tasks = [
        (
            user_id,
            all_tags,
            movie_tag_rel,
            movie_cate_rel,
            behaviors,
            args.user_his_min_freq,
            args.n_values_per_field,
            args.max_user_samples,
            args.n_neg,
            args.tags_per_movie,
        )
        for user_id, behaviors in user_behaviors.items()
    ]
    with multiprocessing.Pool(num_workers) as pool:
        all_samples = pool.starmap(build_user_samples, tasks)

    return all_samples
227 | 
228 | 
def filter_low_freq(all, min_freq):
    """
        Remove the values that occur fewer than min_freq times.

        min_freq is the minimum frequency a value needs to be kept, so a
        value seen exactly min_freq times survives (same convention as the
        `>= min_tag_freq` test in filter_movie_tag). Order and repetitions of
        the surviving values are preserved.
    """
    value_freq = {}
    for value in all:
        value_freq[value] = value_freq.get(value, 0) + 1

    return [x for x in all if value_freq[x] >= min_freq]
240 | 
241 | 
def build_user_samples(
        user_id,
        all_tags,
        movie_tag_rel,
        movie_cate_rel,
        user_behavior,
        user_his_min_freq,
        n_values_per_field,
        max_user_samples,
        n_neg,
        tags_per_movie
    ):
    """
        Build the training / testing samples of one user.

        user_behavior is a list of (movie_id, label, timestamp). The oldest
        80% of the records form the history, from which the positive tag and
        category fields are built (low-frequency values removed, then padded
        or cut to n_values_per_field). The remaining records are the targets.

        Returns a list of [user_id, pos_tags, pos_cates, target_tags, label].
        Every positive target is followed by n_neg negative samples whose
        tags are drawn at random from all_tags. An empty behavior list gives
        an empty result.
    """
    if not user_behavior:
        return []

    user_behavior = sorted(user_behavior, key=lambda x: x[2])
    # as described in the paper, use the first 80% records to build fields
    split_idx = int(len(user_behavior) * 0.8)
    history, future = user_behavior[:split_idx], user_behavior[split_idx:]

    # extract history positive tags and cates
    his_pos_tags, his_pos_cates = [], []
    for movie_id, label, _ in history:
        extract_pos_tags_cates(movie_id, label, his_pos_tags, his_pos_cates, movie_tag_rel, movie_cate_rel)

    # filter out tags and cates with low freq
    his_pos_tags = filter_low_freq(his_pos_tags, user_his_min_freq)
    his_pos_cates = filter_low_freq(his_pos_cates, user_his_min_freq)

    his_pos_tags = pad_or_cut(his_pos_tags, 0, n_values_per_field)
    his_pos_cates = pad_or_cut(his_pos_cates, 0, n_values_per_field)

    # as described by the paper, restrict max samples for each user;
    # sample without replacement so no target record is duplicated
    if len(future) > max_user_samples:
        future = random.sample(future, max_user_samples)

    # extract label tags from future
    tags_labels = []
    for movie_id, label, _ in future:
        # one movie produces (tags, label)
        extract_tags_labels(movie_id, label, tags_labels, movie_tag_rel)

    user_samples = []
    for target_tags, target_label in tags_labels:
        user_samples.append([user_id, his_pos_tags, his_pos_cates, target_tags, target_label])
        if target_label == 1:
            # negative sampling
            for _ in range(n_neg):
                neg_tags = random.choices(all_tags, k=tags_per_movie)
                user_samples.append([user_id, his_pos_tags, his_pos_cates, neg_tags, 0])

    return user_samples
294 | 
295 | 
def split_train_test(all_users_samples):
    """
        Randomly assign each user's sample list to the train or test split.

        One integer is drawn from 1..10 per user; values above 2 send the
        user to train, the others to test. Returns (train_samples,
        test_samples), both keeping the input order.
    """
    train_samples, test_samples = [], []
    for user_samples in all_users_samples:
        goes_to_train = random.randint(1, 10) > 2
        bucket = train_samples if goes_to_train else test_samples
        bucket.append(user_samples)

    return train_samples, test_samples
306 | 
307 | 
def _bytes_feature(value):
    """Return a tf.train.Feature holding a one-element BytesList built from a string / byte value."""
    eager_tensor_type = type(tf.constant(0))
    if isinstance(value, eager_tensor_type):
        # a tensor cannot go into a BytesList directly: take its value
        value = value.numpy()
    bytes_list = tf.train.BytesList(value=[value])
    return tf.train.Feature(bytes_list=bytes_list)
313 | 
def _float_feature(value):
    """Return a tf.train.Feature holding a one-element FloatList built from a float / double."""
    float_list = tf.train.FloatList(value=[value])
    return tf.train.Feature(float_list=float_list)
317 | 
def _int64_feature(value):
    """Return a tf.train.Feature holding a one-element Int64List built from a bool / enum / int / uint."""
    int64_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=int64_list)
321 | 
def serialize_array(array):
    """Serialize an array with tf.io.serialize_tensor and return the result."""
    return tf.io.serialize_tensor(array)
325 | 
def parse_single_sample(user_id, pos_tags, pos_cates, target_tags, label):
    """
        Pack one sample into a tf.train.Example.

        user_id is stored as int64, label as float, and the three id lists
        (pos_tags, pos_cates, target_tags) as serialized tensors in bytes
        features.
    """
    feature = {}
    feature['user_id'] = _int64_feature(user_id)
    feature['pos_tags'] = _bytes_feature(serialize_array(pos_tags))
    feature['pos_cates'] = _bytes_feature(serialize_array(pos_cates))
    feature['target_movie_tags'] = _bytes_feature(serialize_array(target_tags))
    feature['label'] = _float_feature(label)

    features = tf.train.Features(feature=feature)
    return tf.train.Example(features=features)
336 | 
337 | 
def write_tf_records(users_samples, filepath):
    """
        Write every sample of every user to a TFRecord file at filepath.

        users_samples is a list of per-user sample lists; each sample is
        indexed as [user_id, pos_tags, pos_cates, target_tags, label].
    """
    with tf.io.TFRecordWriter(filepath) as writer:
        for user_samples in users_samples:
            for sample in user_samples:
                example = parse_single_sample(sample[0], sample[1], sample[2], sample[3], sample[4])
                serialized = example.SerializeToString()
                writer.write(serialized)
345 | 
346 | 
def decode_one_tfrecord(sample):
    """
        Decode one serialized example written by write_tf_records.

        Returns (user_id, pos_tags, pos_cates, target_movie_tags, label); the
        three id lists are parsed back from their serialized form as int32
        tensors.
    """
    schema = {
      'user_id': tf.io.FixedLenFeature([], tf.int64),
      'pos_tags': tf.io.FixedLenFeature([], tf.string),
      'pos_cates': tf.io.FixedLenFeature([], tf.string),
      'target_movie_tags': tf.io.FixedLenFeature([], tf.string),
      'label': tf.io.FixedLenFeature([], tf.float32)
    }
    parsed = tf.io.parse_single_example(sample, schema)

    # scalar fields are used directly
    user_id = parsed["user_id"]
    # list fields were stored as serialized tensors
    pos_tags = tf.io.parse_tensor(parsed["pos_tags"], out_type=tf.int32)
    pos_cates = tf.io.parse_tensor(parsed["pos_cates"], out_type=tf.int32)
    target_movie_tags = tf.io.parse_tensor(parsed["target_movie_tags"], out_type=tf.int32)
    label = parsed["label"]

    return user_id, pos_tags, pos_cates, target_movie_tags, label
365 | 
366 | 
def read_tf_records(batch_size):
    """
        Open the train / test TFRecord files as batched tf.data datasets.

        The train dataset is shuffled at sample level *before* batching.
        Shuffling after batching only permutes whole batches, so every batch
        would keep the same consecutive records in every epoch. The buffer
        spans 1024 batches' worth of samples, i.e. the same number of records
        the previous batch-level buffer of 1024 held.

        Returns (train_dataset, test_dataset); the test dataset keeps the
        file order.
    """
    train_dataset = (
        tf.data.TFRecordDataset("data/train_samples.tfrecords")
        .map(decode_one_tfrecord)
        .shuffle(1024 * batch_size)
        .batch(batch_size)
    )
    test_dataset = tf.data.TFRecordDataset("data/test_samples.tfrecords").map(decode_one_tfrecord).batch(batch_size)

    return train_dataset, test_dataset
372 | 
373 | 
def save(obj, filepath):
    """Pickle obj into the file at filepath, overwriting any existing content."""
    with open(filepath, "wb") as f:
        pickle.dump(obj, f)
377 | 
378 | 
def load(filepath):
    """
        Return the object unpickled from the file at filepath.

        NOTE: unpickling can run arbitrary code; only load files this
        project produced itself.
    """
    with open(filepath, "rb") as f:
        return pickle.load(f)
382 | 
383 | 
def evaluate(model, test_dataset, tag_embeds, U):
    """
        Print precision@K (K = 1..5) of tag retrieval on the test set.

        tag_embeds maps tag id -> embedding vector of size U. User embeddings
        come from model.forward; for each user (samples with label 1 only)
        the K tags with the highest dot product are searched with a faiss
        inner-product index and compared with the user's true tags, i.e. the
        union of the target movie tags and the history tags of those samples.

        Returns the array of user embeddings, one row per evaluated user.
    """
    # NOTE: faiss answers with positions in the indexed array, so keep
    # position -> tag id alongside the vectors
    idx_2_tag_id = list(tag_embeds.keys())
    tag_vecs = np.array(list(tag_embeds.values()))

    # brute-force index using the dot product as similarity
    tag_embeds_index = faiss.IndexFlatIP(U)
    tag_embeds_index.add(tag_vecs)

    # query each user's embedding using the trained model
    user_embeds, user_true_tags = {}, {}
    for batch in test_dataset:
        user_ids = batch[0]
        # X
        pos_tag, pos_cate = batch[1], batch[2]
        # Y
        target_movie_tags, labels = batch[3], batch[4]

        batch_user_embeds = model.forward(pos_tag, pos_cate)

        rows = zip(user_ids, pos_tag, target_movie_tags, labels, batch_user_embeds)
        for user_id, his_tags, target_tags, label, user_embed in rows:
            # only evaluate on user true interest
            if label.numpy() != 1:
                continue
            user_id = user_id.numpy()
            user_embeds[user_id] = user_embed
            user_true_tags.setdefault(user_id, set()).update(
                target_tags.numpy().tolist() + his_tags.numpy().tolist()
            )

    # position in the query array -> user id
    idx_2_user_id = list(user_embeds.keys())
    user_vecs = np.array(list(user_embeds.values()))

    for K in range(1, 6):
        _, neigh = tag_embeds_index.search(user_vecs, K)
        user_true_tags_pred = {
            idx_2_user_id[user_idx]: [idx_2_tag_id[idx] for idx in neighbors]
            for user_idx, neighbors in enumerate(neigh)
        }

        print("precision@{}: {}".format(K, precision_at_K(user_true_tags_pred, user_true_tags, K)))

    return np.array(list(user_embeds.values()))
443 | 
444 | 
def precision_at_K(user_true_tags_pred, user_true_tags, K):
    """
        Average per-user hit ratio of the predicted tags.

        For each user with a non-empty true tag set, the score is the number
        of predicted tags found in that set, divided by min(K, number of true
        tags). Users without true tags cannot be scored and are skipped.

        Returns the mean score over the scored users, or 0.0 when no user can
        be scored (instead of dividing by zero).
    """
    total, n_valid_user = 0.0, 0
    for user_id, pred_tags in user_true_tags_pred.items():
        true_tags = user_true_tags.get(user_id)
        if not true_tags:
            continue
        hit = sum(1 for tag in pred_tags if tag in true_tags)
        total += hit / min(K, len(true_tags))
        n_valid_user += 1

    if n_valid_user == 0:
        return 0.0
    return total / n_valid_user
458 | 
459 | 
def tsne(embeds, filename, names=None):
    """
        Project embeds to 2D with t-SNE and save a scatter plot to filename.

        When names is given, the i-th point is annotated with names[i].
    """
    points = TSNE(n_components=2).fit_transform(embeds)
    xs = [point[0] for point in points]
    ys = [point[1] for point in points]

    # start from an empty figure, the pyplot state is shared between calls
    plt.clf()
    plt.scatter(xs, ys, alpha=0.7)
    if names:
        for i, name in enumerate(names):
            position = (points[i][0], points[i][1])
            plt.annotate(name, position, size=3)
    plt.savefig(filename, dpi=250)
468 | 


--------------------------------------------------------------------------------