├── .gitignore ├── LICENSE ├── README.md ├── logs ├── cold_item.png ├── cold_user.png ├── dropoutnet_citeu_cold.png ├── dropoutnet_citeu_warm.png ├── logo.png ├── logo.svg ├── logo_alt.png ├── logo_fill.png ├── logobox.jpg └── warm.png ├── tf1 ├── data.py ├── main.py ├── main_citeu.py ├── main_cold_citeu.py ├── main_warm_citeu.py ├── model.py └── utils.py ├── tf2 ├── data.py ├── main.py ├── model.py └── utils.py └── torch ├── data.py ├── main.py ├── model.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Copyright Notice: © Copyright 2018 The Toronto-Dominion Bank and/or its affiliates 3 | 4 | Permission is hereby granted, subject to the conditions below, free of charge, to 5 | any person obtaining a copy of this software and associated documentation files 6 | (the "Software"), to use, copy, distribute, publish and modify the Software, only 7 | for research and for no other purpose. For clarity, and without limitation, this 8 | licence does not permit use of Software or any part thereof for commercial purposes. 9 | 10 | Patents: This permission does not grant any patent licenses in the Software. 11 | 12 | Conditions: 13 | 1. 
The above copyright notice and the following disclaimer shall be included in all 14 | copies or substantial portions of the Software. 15 | 2. You must give appropriate credit, provide a link to the license, and indicate if 16 | changes were made. You may do so in any reasonable manner, but not in any way that 17 | suggests that the copyright holder endorses you or your use of the Software. 18 | 19 | Names and Trademarks: No permission to use the names or trademarks of the copyright 20 | holder are granted, except as required for reasonable and customary use in describing 21 | the origin of the Software and reproducing the content of the copyright notice. 22 | 23 | DISCLAIMER: THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 24 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 25 | PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 26 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF 27 | CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE 28 | OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 29 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 |

3 | 4 |

5 | 6 | ## NeurIPS'17 DropoutNet: Addressing Cold Start in Recommender Systems 7 | Authors: [Maksims Volkovs](http://www.cs.toronto.edu/~mvolkovs), [Guangwei Yu](http://www.cs.toronto.edu/~guangweiyu), Tomi Poutanen 8 | [[paper](http://www.cs.toronto.edu/~mvolkovs/nips2017_deepcf.pdf)] 9 | 10 | **UPDATE:** We added TensorFlow2 and PyTorch implementations! 11 | - [`tf1`: original TF1 implementation](tf1) 12 | - [`tf2`: TF2 using compat mode (RecSys dataset only)](tf2) 13 | - [`torch`: PyTorch (RecSys dataset only)](torch) 14 | 15 | 16 | 17 | ## Introduction 18 | This repository contains a full implementation of the DropoutNet model and includes both training and evaluation routines. We also provide the [ACM RecSys 2017 Challenge](http://2017.recsyschallenge.com) dataset that we further split into three subsets for warm start, user cold start and item cold start evaluation. The aim is to train a *single* model that can be applied to all three tasks and we report validation accuracy on each task during training. 19 | 20 | Furthermore, per request, we also provide scripts and all necessary data to run the Citeulike cold-start experiment. See the section on Citeulike below for further details as well as links to the packaged data. 21 | 22 | 23 | 24 | ## Dataset 25 | 26 | To run the model, download the dataset from [here](https://github.com/layer6ai-labs/DropoutNet-Data/blob/master/recsys2017.pub.tar.gz). 27 | With this dataset we have also included a pre-trained Weighted 28 | Factorization model (WMF)\[Hu et al., 2008\], which is used as preference input to the DropoutNet. WMF produces competitive performance on warm start but doesn't generalize to cold start. So this code demonstrates how to apply DropoutNet to provide cold start capability to WMF. 
The format of the data is as follows: 29 | ``` 30 | recsys2017.pub 31 | └─ eval // use path to this folder in --data-dir 32 | ├─ trained // WMF model 33 | │ └─ warm 34 | │ ├─ U.csv.bin // numpy binarized WMF user preference latent vectors (U) 35 | │ └─ V.csv.bin // numpy binarized WMF item preference latent vectors (V) 36 | ├─ warm 37 | │ ├─ test_cold_item.csv // validation interactions for item cold start 38 | │ ├─ test_cold_item_item_ids.csv // target item ids for item cold start 39 | │ ├─ test_cold_user.csv // validation interactions for user cold start 40 | │ ├─ test_cold_user_item_ids.csv // target item ids for user cold start 41 | │ ├─ test_warm.csv // validation interactions for warm start 42 | │ ├─ test_warm_item_ids.csv // target item ids for warm start 43 | │ └─ train.csv // training interactions 44 | ├─ item_features_0based.txt // item features in libsvm format 45 | └─ user_features_0based.txt // user features in libsvm format 46 | 47 | interactions are stored in csv as: 48 | <USER_ID>,<ITEM_ID>,<INTERACTION_TYPE>,<DATE> 49 | where INTERACTION_TYPE is one of: 50 | 0: impression 51 | 1: click 52 | 2: bookmark 53 | 3: reply 54 | 5: recruiter interest 55 | ``` 56 | 57 | 58 | 59 | ## Running training code 60 | 61 | 1. Download the dataset, extract and keep the directory structure. 62 | 63 | 2. run `main.py` 64 | * for usage, run with `main.py --help` 65 | * default setting trains a two layer neural network with hyperparameters selected for the RecSys data 66 | * gpu is used for training by default and cpu for inference 67 | 3. (Optionally) launch tensorboard to monitor progress by `tensorboard --logdir=<tb-log-path>` 68 | 69 | During training recall@50,100,...,500 accuracy is shown every 50K updates for warm start, user cold start and item cold start validation sets. 70 | 71 | Notes: 72 | 73 | * Make sure `--data-dir` points to the `eval/` folder, not the root 74 | * On our environment (described above) 50K updates take approximately 14 minutes with the default GPU/CPU setting. 
75 | * By default, training happens on GPU while inference and batch generation is on CPU. 76 | 77 | ## Validation Curves 78 |

79 | 80 | 81 | 82 |

83 | 84 | ## Citeulike 85 | In addition to RecSys, we also provide a pipeline to run the publicly available Citeulike data. Note that, as mentioned in the paper, we evaluate cold start the same way as the CTR paper while the warm start evaluation is modified. For convenience, we have provided our evaluation split for both cold and warm start, item features, as well as the WMF user item preference latent vectors available [here](https://github.com/layer6ai-labs/DropoutNet-Data/blob/master/citeu.tar.gz). 86 | 87 | The citeulike warm and cold models are trained separately as their validation sets differ. Please use the scripts 88 | `main_cold_citeu.py` and `main_warm_citeu.py` to run the experiments on the Citeulike dataset. 89 | 90 | Point `--data-dir` to your extracted `eval` folder after extracting `citeu.tar.gz`. Sample training runs with respective validation performance are shown below per 1000 updates. 91 | 92 |

93 | 94 | 95 |

96 | 97 | -------------------------------------------------------------------------------- /logs/cold_item.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/layer6ai-labs/DropoutNet/60c9aa73568deec30374ad1bbc29a65c9d47f115/logs/cold_item.png -------------------------------------------------------------------------------- /logs/cold_user.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/layer6ai-labs/DropoutNet/60c9aa73568deec30374ad1bbc29a65c9d47f115/logs/cold_user.png -------------------------------------------------------------------------------- /logs/dropoutnet_citeu_cold.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/layer6ai-labs/DropoutNet/60c9aa73568deec30374ad1bbc29a65c9d47f115/logs/dropoutnet_citeu_cold.png -------------------------------------------------------------------------------- /logs/dropoutnet_citeu_warm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/layer6ai-labs/DropoutNet/60c9aa73568deec30374ad1bbc29a65c9d47f115/logs/dropoutnet_citeu_warm.png -------------------------------------------------------------------------------- /logs/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/layer6ai-labs/DropoutNet/60c9aa73568deec30374ad1bbc29a65c9d47f115/logs/logo.png -------------------------------------------------------------------------------- /logs/logo.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Page 1 5 | Created with Sketch. 
def load_eval_data(test_file, test_id_file, name, cold, train_data, citeu=False):
    """Read one evaluation split from disk and wrap it in an ``EvalData``.

    Args:
        test_file: path to a header-less CSV of int32 test interactions.
        test_id_file: path to a text file holding one target item id per line.
        name: label used in the timing / log messages.
        cold: forwarded to ``EvalData`` as ``is_cold``.
        train_data: training interactions, forwarded to ``EvalData`` as ``train``.
        citeu: when True each row has three columns (uid, iid, inter);
            otherwise a fourth ``date`` column is expected.

    Returns:
        The constructed ``EvalData`` instance.
    """
    timer = utils.timer()
    with open(test_id_file) as f:
        test_item_ids = [int(line) for line in f]

    # header=None is the documented way to say "no header row"; the previous
    # header=-1 is rejected by current pandas releases.
    flat = pd.read_csv(test_file, delimiter=",", header=None, dtype=np.int32).values.ravel()

    record_fields = [('uid', np.int32), ('iid', np.int32), ('inter', np.int32)]
    if not citeu:
        record_fields.append(('date', np.int32))
    # Reinterpret the flat int32 buffer as one structured record per CSV row.
    test_data = flat.view(dtype=record_fields)
    timer.toc('read %s triplets %s' % (name, test_data.shape)).tic()

    eval_data = EvalData(
        test_data,
        test_item_ids,
        is_cold=cold,
        train=train_data
    )
    timer.toc('loaded %s' % name).tic()
    print(eval_data.get_stats_string())
    return eval_data


class EvalData:
    """
    EvalData:
        EvalData packages test triplet (user, item, score) into appropriate formats for evaluation

    Compact Indices:
        Specifically, this builds compact indices and stores mapping between original and compact indices.
        Compact indices only contains:
            1) items in test set
            2) users who interacted with such test items
        These compact indices speed up testing significantly by ignoring irrelevant users or items

    Args:
        test_triplets(int triplets): user-item-interaction_value triplet to build the test data
        test_item_ids(list of int): original ids of the items to evaluate on
        is_cold(boolean): whether test data is used for cold start problem
        train(int triplets): user-item-interaction_value triplet from train data

    Attributes:
        is_cold(boolean): whether test data is used for cold start problem
        test_item_ids(list of int): maps compressed item ids to original item ids (via position)
        test_item_ids_map(dictionary of int->int): maps original item ids to compressed item ids
        test_user_ids(list of int): maps compressed user ids to original user ids (via position)
        test_user_ids_map(dictionary of int->int): maps original user ids to compressed user ids
        R_test_inf(scipy lil matrix): pre-built compressed test matrix
        R_train_inf(scipy lil matrix): pre-built compressed train matrix for testing
            (None for cold-start data)

        other relevant input/output exposed from tensorflow graph

    Raises:
        Exception: if ``is_cold`` disagrees with the overlap found between
            ``train`` and the compact test users/items.
    """

    def __init__(self, test_triplets, test_item_ids, is_cold, train):
        # build map both-ways between compact and original indices
        # compact indices only contains:
        #  1) items in test set
        #  2) users who interacted with such test items

        self.is_cold = is_cold

        self.test_item_ids = test_item_ids
        self.test_item_ids_map = {iid: i for i, iid in enumerate(self.test_item_ids)}

        # keep only the (user, item) pairs whose item is one of the target items
        test_pairs = [(t[0], t[1]) for t in test_triplets if t[1] in self.test_item_ids_map]
        self.test_user_ids = np.unique(test_triplets['uid'])
        self.test_user_ids_map = {user_id: i for i, user_id in enumerate(self.test_user_ids)}

        test_rows = [self.test_user_ids_map[u] for u, _ in test_pairs]
        test_cols = [self.test_item_ids_map[i] for _, i in test_pairs]
        self.R_test_inf = scipy.sparse.coo_matrix(
            (np.ones(len(test_rows)), (test_rows, test_cols)),
            shape=(len(self.test_user_ids), len(self.test_item_ids))
        ).tolil(copy=False)

        # train interactions that fall inside the compact (test user, test item) grid
        train_ij_for_inf = [
            (self.test_user_ids_map[_t[0]], self.test_item_ids_map[_t[1]])
            for _t in train
            if _t[1] in self.test_item_ids_map and _t[0] in self.test_user_ids_map
        ]
        # Compare by value: `is 0` / `is not 0` test object identity, which only
        # works by accident of small-int caching.
        if self.is_cold and len(train_ij_for_inf) != 0:
            raise Exception('using cold dataset, but data is not cold!')
        if not self.is_cold and len(train_ij_for_inf) == 0:
            raise Exception('using warm datset, but data is not warm!')

        if self.is_cold:
            self.R_train_inf = None
        else:
            # Materialise the two index sequences explicitly instead of handing
            # a lazy zip object to scipy.
            train_rows, train_cols = zip(*train_ij_for_inf)
            self.R_train_inf = scipy.sparse.coo_matrix(
                (np.ones(len(train_ij_for_inf)), (train_rows, train_cols)),
                shape=self.R_test_inf.shape
            ).tolil(copy=False)

        # allocate fields (populated by init_tf)
        self.U_pref_test = None
        self.V_pref_test = None
        self.V_content_test = None
        self.U_content_test = None
        self.tf_eval_train = None
        self.tf_eval_test = None
        self.eval_batch = None

    def init_tf(self, user_factors, item_factors, user_content, item_content, eval_run_batchsize):
        """Slice the model inputs down to the compact test users/items.

        Args:
            user_factors: row-indexable user preference matrix.
            item_factors: row-indexable item preference matrix.
            user_content: row-indexable user content matrix, or None when
                there are no user features.
            item_content: row-indexable item content matrix (dense or scipy sparse).
            eval_run_batchsize: number of test users per evaluation batch.
        """
        self.U_pref_test = user_factors[self.test_user_ids, :]
        self.V_pref_test = item_factors[self.test_item_ids, :]
        self.V_content_test = item_content[self.test_item_ids, :]
        if scipy.sparse.issparse(self.V_content_test):
            self.V_content_test = self.V_content_test.todense()
        # `!= None` would be an elementwise comparison on array / sparse inputs;
        # an identity check is what is meant here.
        if user_content is not None:
            self.U_content_test = user_content[self.test_user_ids, :]
            if scipy.sparse.issparse(self.U_content_test):
                self.U_content_test = self.U_content_test.todense()

        eval_l = self.R_test_inf.shape[0]
        # [start, stop) row ranges over the compact test users
        self.eval_batch = [(x, min(x + eval_run_batchsize, eval_l)) for x
                           in range(0, eval_l, eval_run_batchsize)]

        self.tf_eval_train = []
        self.tf_eval_test = []

        if not self.is_cold:
            for (eval_start, eval_finish) in self.eval_batch:
                batch_coo = self.R_train_inf[eval_start:eval_finish, :].tocoo()
                # list(...) so that len() below works on Python 3 as well
                _ui = list(zip(batch_coo.row, batch_coo.col))
                self.tf_eval_train.append(
                    tf.SparseTensorValue(
                        indices=_ui,
                        # large negative value at every (user, item) pair seen in training
                        values=np.full(len(_ui), -100000, dtype=np.float32),
                        dense_shape=[eval_finish - eval_start, self.R_train_inf.shape[1]]
                    )
                )

    def get_stats_string(self):
        """Return a multi-line summary of user/item counts and matrix sizes."""
        if self.is_cold:
            train_stats = 'no R_train_inf for cold'
        else:
            train_stats = 'shape=%s nnz=[%d]' % (
                str(self.R_train_inf.shape), len(self.R_train_inf.nonzero()[0])
            )
        return ('\tn_test_users:[%d]\n\tn_test_items:[%d]' % (len(self.test_user_ids), len(self.test_item_ids))
                + '\n\tR_train_inf: %s' % train_stats
                + '\n\tR_test_inf: shape=%s nnz=[%d]' % (
                    str(self.R_test_inf.shape), len(self.R_test_inf.nonzero()[0])
                ))
# Number of user / item rows in the pre-trained WMF factor files (max id + 1).
n_users = 1497020 + 1
n_items = 1306054 + 1


def main():
    """Train DropoutNet on the RecSys data and periodically report recall.

    Reads its configuration from the module-level ``args`` namespace that the
    ``__main__`` guard populates via argparse. For every user batch it asks the
    graph for top-N and random target scores, trains on those targets with
    preference-input dropout, and every ``args.eval_every`` user batches
    evaluates recall on the warm, cold-user and cold-item splits, optionally
    checkpointing and writing TensorBoard summaries.

    Raises:
        Exception: if the accumulated training loss becomes NaN.
    """
    data_path = args.data_dir
    checkpoint_path = args.checkpoint_path
    tb_log_path = args.tb_log_path
    model_select = args.model_select

    rank_out = args.rank
    user_batch_size = 1000
    n_scores_user = 2500
    data_batch_size = 100
    dropout = args.dropout
    # materialised so it behaves as a plain sequence on Python 2 and 3 alike
    recall_at = list(range(50, 550, 50))
    eval_batch_size = 1000
    max_data_per_step = 2500000
    eval_every = args.eval_every
    num_epoch = 10

    _lr = args.lr
    _decay_lr_every = 50
    _lr_decay = 0.1

    experiment = '%s_%s' % (
        datetime.datetime.now().strftime('%Y-%m-%d-%H:%M:%S'),
        # floor division keeps the name as e.g. "8-4" under true division too
        '-'.join(str(x // 100) for x in model_select) if model_select else 'simple'
    )
    _tf_ckpt_file = None if checkpoint_path is None else checkpoint_path + experiment + '/tf_checkpoint'

    print('running: ' + experiment)

    dat = load_data(data_path)
    u_pref_scaled = dat['u_pref_scaled']
    v_pref_scaled = dat['v_pref_scaled']
    eval_warm = dat['eval_warm']
    eval_cold_user = dat['eval_cold_user']
    eval_cold_item = dat['eval_cold_item']
    user_content = dat['user_content']
    item_content = dat['item_content']
    u_pref = dat['u_pref']
    v_pref = dat['v_pref']
    user_indices = dat['user_indices']

    timer = utils.timer(name='main').tic()

    # append pref factors for faster dropout: the extra all-zero last row is
    # what a "dropped" user / item index points at
    v_pref_expanded = np.vstack([v_pref_scaled, np.zeros_like(v_pref_scaled[0, :])])
    v_pref_last = v_pref_scaled.shape[0]
    u_pref_expanded = np.vstack([u_pref_scaled, np.zeros_like(u_pref_scaled[0, :])])
    u_pref_last = u_pref_scaled.shape[0]
    timer.toc('initialized numpy data for tf')

    # prep eval
    timer.tic()
    eval_warm.init_tf(u_pref_scaled, v_pref_scaled, user_content, item_content, eval_batch_size)
    timer.toc('initialized eval_warm for tf').tic()
    eval_cold_user.init_tf(u_pref_scaled, v_pref_scaled, user_content, item_content, eval_batch_size)
    timer.toc('initialized eval_cold_user for tf').tic()
    eval_cold_item.init_tf(u_pref_scaled, v_pref_scaled, user_content, item_content, eval_batch_size)
    timer.toc('initialized eval_cold_item for tf').tic()

    dropout_net = model.DeepCF(latent_rank_in=u_pref.shape[1],
                               user_content_rank=user_content.shape[1],
                               item_content_rank=item_content.shape[1],
                               model_select=model_select,
                               rank_out=rank_out)

    config = tf.ConfigProto(allow_soft_placement=True)

    with tf.device(args.model_device):
        dropout_net.build_model()

    with tf.device(args.inf_device):
        dropout_net.build_predictor(recall_at, n_scores_user)

    with tf.Session(config=config) as sess:
        tf_saver = None if _tf_ckpt_file is None else tf.train.Saver()
        train_writer = None if tb_log_path is None else tf.summary.FileWriter(
            tb_log_path + experiment, sess.graph)
        tf.global_variables_initializer().run()
        tf.local_variables_initializer().run()
        timer.toc('initialized tf')

        row_index = np.copy(user_indices)
        n_step = 0
        best_cold_user = 0
        best_cold_item = 0
        best_warm = 0
        n_batch_trained = 0
        best_step = 0
        for epoch in range(num_epoch):
            np.random.shuffle(row_index)
            for b in utils.batch(row_index, user_batch_size):
                n_step += 1
                # prep targets
                target_users = np.repeat(b, n_scores_user)
                target_users_rand = np.repeat(np.arange(len(b)), n_scores_user)
                target_items_rand = [np.random.choice(v_pref.shape[0], n_scores_user) for _ in b]
                target_items_rand = np.array(target_items_rand).flatten()
                target_ui_rand = np.transpose(np.vstack([target_users_rand, target_items_rand]))
                [target_scores, target_items, random_scores] = sess.run(
                    [dropout_net.tf_topk_vals, dropout_net.tf_topk_inds, dropout_net.preds_random],
                    feed_dict={
                        dropout_net.U_pref_tf: u_pref[b, :],
                        dropout_net.V_pref_tf: v_pref,
                        dropout_net.rand_target_ui: target_ui_rand
                    }
                )
                # merge topN and randomN items per user
                target_scores = np.append(target_scores, random_scores)
                target_items = np.append(target_items, target_items_rand)
                target_users = np.append(target_users, target_users)

                tf.local_variables_initializer().run()
                n_targets = len(target_scores)
                perm = np.random.permutation(n_targets)
                # cap the number of (user, item) targets trained on per user batch
                n_targets = min(n_targets, max_data_per_step)
                data_batch = [(n, min(n + data_batch_size, n_targets))
                              for n in range(0, n_targets, data_batch_size)]
                f_batch = 0
                for (start, stop) in data_batch:
                    batch_perm = perm[start:stop]
                    batch_users = target_users[batch_perm]
                    batch_items = target_items[batch_perm]
                    if dropout != 0:
                        # redirect a random `dropout` fraction of the preference
                        # lookups to the appended all-zero row
                        n_to_drop = int(np.floor(dropout * len(batch_perm)))
                        perm_user = np.random.permutation(len(batch_perm))[:n_to_drop]
                        perm_item = np.random.permutation(len(batch_perm))[:n_to_drop]
                        batch_v_pref = np.copy(batch_items)
                        batch_u_pref = np.copy(batch_users)
                        batch_v_pref[perm_user] = v_pref_last
                        batch_u_pref[perm_item] = u_pref_last
                    else:
                        batch_v_pref = batch_items
                        batch_u_pref = batch_users

                    _, _, loss_out = sess.run(
                        [dropout_net.preds, dropout_net.updates, dropout_net.loss],
                        feed_dict={
                            dropout_net.Uin: u_pref_expanded[batch_u_pref, :],
                            dropout_net.Vin: v_pref_expanded[batch_v_pref, :],
                            dropout_net.Ucontent: user_content[batch_users, :].todense(),
                            dropout_net.Vcontent: item_content[batch_items, :].todense(),
                            #
                            dropout_net.target: target_scores[batch_perm],
                            dropout_net.lr_placeholder: _lr,
                            dropout_net.phase: 1
                        }
                    )
                    f_batch += loss_out
                    if np.isnan(f_batch):
                        raise Exception('f is nan')

                n_batch_trained += len(data_batch)
                if n_step % _decay_lr_every == 0:
                    _lr = _lr_decay * _lr
                    print('decayed lr:' + str(_lr))
                if n_step % eval_every == 0:
                    recall_warm = utils.batch_eval_recall(
                        sess, dropout_net.eval_preds_warm, eval_feed_dict=dropout_net.get_eval_dict,
                        recall_k=recall_at, eval_data=eval_warm)
                    recall_cold_user = utils.batch_eval_recall(
                        sess, dropout_net.eval_preds_cold,
                        eval_feed_dict=dropout_net.get_eval_dict,
                        recall_k=recall_at, eval_data=eval_cold_user)
                    recall_cold_item = utils.batch_eval_recall(
                        sess, dropout_net.eval_preds_cold,
                        eval_feed_dict=dropout_net.get_eval_dict,
                        recall_k=recall_at, eval_data=eval_cold_item)

                    # checkpoint whenever the summed recall over all three splits improves
                    if np.sum(recall_warm + recall_cold_user + recall_cold_item) > np.sum(
                            best_warm + best_cold_user + best_cold_item):
                        best_cold_user = recall_cold_user
                        best_cold_item = recall_cold_item
                        best_warm = recall_warm
                        best_step = n_step
                        if tf_saver is not None:
                            tf_saver.save(sess, _tf_ckpt_file)

                    timer.toc('%d [%d]b [%d]tot f=%.2f best[%d]' % (
                        n_step, len(data_batch), n_batch_trained, f_batch, best_step
                    )).tic()
                    print('\t\t\t' + ' '.join([('@' + str(i)).ljust(6) for i in recall_at]))
                    print('warm start\t%s\ncold user\t%s\ncold item\t%s' % (
                        ' '.join(['%.4f' % i for i in recall_warm]),
                        ' '.join(['%.4f' % i for i in recall_cold_user]),
                        ' '.join(['%.4f' % i for i in recall_cold_item])
                    ))
                    summaries = []
                    for i, k in enumerate(recall_at):
                        # only every second cut-off (multiples of 100) goes to TensorBoard
                        if k % 100 == 0:
                            summaries.extend([
                                tf.Summary.Value(tag="recall@" + str(k) + " warm", simple_value=recall_warm[i]),
                                tf.Summary.Value(tag="recall@" + str(k) + " cold_user",
                                                 simple_value=recall_cold_user[i]),
                                tf.Summary.Value(tag="recall@" + str(k) + " cold_item",
                                                 simple_value=recall_cold_item[i])
                            ])
                    recall_summary = tf.Summary(value=summaries)
                    if train_writer is not None:
                        train_writer.add_summary(recall_summary, n_step)


def load_data(data_path):
    """Load every input needed for RecSys training and evaluation.

    Args:
        data_path: path to the ``eval`` folder of the extracted dataset.

    Returns:
        dict with keys ``u_pref``, ``v_pref`` (raw float32 factors reshaped to
        ``(n_users, 200)`` / ``(n_items, 200)``), ``u_pref_scaled``,
        ``v_pref_scaled`` (second return value of ``utils.prep_standardize``),
        ``user_content``, ``item_content`` (scipy LIL matrices),
        ``user_indices`` (unique user ids in the training split) and
        ``eval_warm``, ``eval_cold_user``, ``eval_cold_item`` (the results of
        ``data.load_eval_data``).
    """
    timer = utils.timer(name='main').tic()
    split_folder = os.path.join(data_path, 'warm')

    u_file = os.path.join(data_path, 'trained/warm/U.csv.bin')
    v_file = os.path.join(data_path, 'trained/warm/V.csv.bin')
    user_content_file = os.path.join(data_path, 'user_features_0based.txt')
    item_content_file = os.path.join(data_path, 'item_features_0based.txt')
    train_file = os.path.join(split_folder, 'train.csv')
    test_warm_file = os.path.join(split_folder, 'test_warm.csv')
    test_warm_iid_file = os.path.join(split_folder, 'test_warm_item_ids.csv')
    test_cold_user_file = os.path.join(split_folder, 'test_cold_user.csv')
    test_cold_user_iid_file = os.path.join(split_folder, 'test_cold_user_item_ids.csv')
    test_cold_item_file = os.path.join(split_folder, 'test_cold_item.csv')
    test_cold_item_iid_file = os.path.join(split_folder, 'test_cold_item_item_ids.csv')

    dat = {}
    # load preference data
    timer.tic()
    u_pref = np.fromfile(u_file, dtype=np.float32).reshape(n_users, 200)
    v_pref = np.fromfile(v_file, dtype=np.float32).reshape(n_items, 200)
    dat['u_pref'] = u_pref
    dat['v_pref'] = v_pref

    timer.toc('loaded U:%s,V:%s' % (str(u_pref.shape), str(v_pref.shape))).tic()

    # pre-process
    _, dat['u_pref_scaled'] = utils.prep_standardize(u_pref)
    _, dat['v_pref_scaled'] = utils.prep_standardize(v_pref)
    timer.toc('standardized U,V').tic()

    # load content data
    timer.tic()
    user_content, _ = datasets.load_svmlight_file(user_content_file, zero_based=True, dtype=np.float32)
    dat['user_content'] = user_content.tolil(copy=False)
    timer.toc('loaded user feature sparse matrix: %s' % (str(user_content.shape))).tic()
    item_content, _ = datasets.load_svmlight_file(item_content_file, zero_based=True, dtype=np.float32)
    dat['item_content'] = item_content.tolil(copy=False)
    timer.toc('loaded item feature sparse matrix: %s' % (str(item_content.shape))).tic()

    # load split
    timer.tic()
    # header=None is the documented "no header row" value; header=-1 is
    # rejected by current pandas releases.
    train = pd.read_csv(train_file, delimiter=",", header=None, dtype=np.int32).values.ravel().view(
        dtype=[('uid', np.int32), ('iid', np.int32), ('inter', np.int32), ('date', np.int32)])
    dat['user_indices'] = np.unique(train['uid'])
    timer.toc('read train triplets %s' % train.shape).tic()

    dat['eval_warm'] = data.load_eval_data(test_warm_file, test_warm_iid_file, name='eval_warm', cold=False,
                                           train_data=train)
    dat['eval_cold_user'] = data.load_eval_data(test_cold_user_file, test_cold_user_iid_file, name='eval_cold_user',
                                                cold=True,
                                                train_data=train)
    dat['eval_cold_item'] = data.load_eval_data(test_cold_item_file, test_cold_item_iid_file, name='eval_cold_item',
                                                cold=True,
                                                train_data=train)
    return dat


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Demo script to run DropoutNet on RecSys data",
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--data-dir', type=str, required=True, help='path to eval in the downloaded folder')

    parser.add_argument('--model-device', type=str, default='/gpu:0', help='device to use for training')
    parser.add_argument('--inf-device', type=str, default='/cpu:0', help='device to use for inference')
    parser.add_argument('--checkpoint-path', type=str, default=None,
                        help='path to dump checkpoint data from TensorFlow')
    parser.add_argument('--tb-log-path', type=str, default=None,
                        help='path to dump TensorBoard logs')
    parser.add_argument('--model-select', nargs='+', type=int,
                        default=[800, 400],
                        help='specify the fully-connected architecture, starting from input,'
                             ' numbers indicate numbers of hidden units',
                        )
    parser.add_argument('--rank', type=int, default=200, help='output rank of latent model')
    parser.add_argument('--dropout', type=float, default=0.5, help='DropoutNet dropout')
    parser.add_argument('--eval-every', type=int, default=2, help='evaluate every X user-batch')
    parser.add_argument('--lr', type=float, default=0.005, help='starting learning rate')

    # main() reads this module-level namespace
    args = parser.parse_args()
    main()
dat['v_pref_scaled'] 50 | eval_warm = dat['eval_warm'] 51 | item_content = dat['item_content'] 52 | u_pref = dat['u_pref'] 53 | v_pref = dat['v_pref'] 54 | user_indices = dat['user_indices'] 55 | 56 | timer = utils.timer(name='main').tic() 57 | 58 | # append pref factors for faster dropout 59 | v_pref_expanded = np.vstack([v_pref_scaled, np.zeros_like(v_pref_scaled[0, :])]) 60 | v_pref_last = v_pref_scaled.shape[0] 61 | u_pref_expanded = np.vstack([u_pref_scaled, np.zeros_like(u_pref_scaled[0, :])]) 62 | u_pref_last = u_pref_scaled.shape[0] 63 | timer.toc('initialized numpy data for tf') 64 | 65 | # prep eval 66 | eval_batch_size = eval_batch_size 67 | timer.tic() 68 | eval_warm.init_tf(u_pref_scaled, v_pref_scaled, None, item_content, eval_batch_size) 69 | timer.toc('initialized eval_warm for tf').tic() 70 | 71 | dropout_net = model.DeepCF(latent_rank_in=u_pref.shape[1], 72 | user_content_rank=0, 73 | item_content_rank=item_content.shape[1], 74 | model_select=model_select, 75 | rank_out=rank_out) 76 | 77 | config = tf.ConfigProto(allow_soft_placement=True) 78 | 79 | with tf.device(args.model_device): 80 | dropout_net.build_model() 81 | 82 | with tf.device(args.inf_device): 83 | dropout_net.build_predictor(recall_at, n_scores_user) 84 | 85 | with tf.Session(config=config) as sess: 86 | tf_saver = None if _tf_ckpt_file is None else tf.train.Saver() 87 | train_writer = None if tb_log_path is None else tf.summary.FileWriter( 88 | tb_log_path + experiment, sess.graph) 89 | tf.global_variables_initializer().run() 90 | tf.local_variables_initializer().run() 91 | timer.toc('initialized tf') 92 | 93 | row_index = np.copy(user_indices) 94 | n_step = 0 95 | best_warm = 0 96 | n_batch_trained = 0 97 | best_step = 0 98 | for epoch in range(num_epoch): 99 | np.random.shuffle(row_index) 100 | for b in utils.batch(row_index, user_batch_size): 101 | n_step += 1 102 | # prep targets 103 | target_users = np.repeat(b, n_scores_user) 104 | target_users_rand = 
np.repeat(np.arange(len(b)), n_scores_user) 105 | target_items_rand = [np.random.choice(v_pref.shape[0], n_scores_user) for _ in b] 106 | target_items_rand = np.array(target_items_rand).flatten() 107 | target_ui_rand = np.transpose(np.vstack([target_users_rand, target_items_rand])) 108 | [target_scores, target_items, random_scores] = sess.run( 109 | [dropout_net.tf_topk_vals, dropout_net.tf_topk_inds, dropout_net.preds_random], 110 | feed_dict={ 111 | dropout_net.U_pref_tf: u_pref[b, :], 112 | dropout_net.V_pref_tf: v_pref, 113 | dropout_net.rand_target_ui: target_ui_rand 114 | } 115 | ) 116 | # merge topN and randomN items per user 117 | target_scores = np.append(target_scores, random_scores) 118 | target_items = np.append(target_items, target_items_rand) 119 | target_users = np.append(target_users, target_users) 120 | 121 | tf.local_variables_initializer().run() 122 | n_targets = len(target_scores) 123 | perm = np.random.permutation(n_targets) 124 | n_targets = min(n_targets, max_data_per_step) 125 | data_batch = [(n, min(n + data_batch_size, n_targets)) for n in xrange(0, n_targets, data_batch_size)] 126 | f_batch = 0 127 | for (start, stop) in data_batch: 128 | batch_perm = perm[start:stop] 129 | batch_users = target_users[batch_perm] 130 | batch_items = target_items[batch_perm] 131 | if dropout != 0: 132 | n_to_drop = int(np.floor(dropout * len(batch_perm))) 133 | perm_user = np.random.permutation(len(batch_perm))[:n_to_drop] 134 | perm_item = np.random.permutation(len(batch_perm))[:n_to_drop] 135 | batch_v_pref = np.copy(batch_items) 136 | batch_u_pref = np.copy(batch_users) 137 | batch_v_pref[perm_user] = v_pref_last 138 | batch_u_pref[perm_item] = u_pref_last 139 | else: 140 | batch_v_pref = batch_items 141 | batch_u_pref = batch_users 142 | 143 | _, _, loss_out = sess.run( 144 | [dropout_net.preds, dropout_net.updates, dropout_net.loss], 145 | feed_dict={ 146 | dropout_net.Uin: u_pref_expanded[batch_u_pref, :], 147 | dropout_net.Vin: 
v_pref_expanded[batch_v_pref, :], 148 | dropout_net.Vcontent: item_content[batch_items, :].todense(), 149 | # 150 | dropout_net.target: target_scores[batch_perm], 151 | dropout_net.lr_placeholder: _lr, 152 | dropout_net.phase: 1 153 | } 154 | ) 155 | f_batch += loss_out 156 | if np.isnan(f_batch): 157 | raise Exception('f is nan') 158 | 159 | n_batch_trained += len(data_batch) 160 | if n_step % _decay_lr_every == 0: 161 | _lr = _lr_decay * _lr 162 | print('decayed lr:' + str(_lr)) 163 | if n_step % eval_every == 0: 164 | recall_warm = utils.batch_eval_recall( 165 | sess, dropout_net.eval_preds_warm, eval_feed_dict=dropout_net.get_eval_dict, 166 | recall_k=recall_at, eval_data=eval_warm) 167 | 168 | # checkpoint 169 | if np.sum(recall_warm) > np.sum(best_warm): 170 | best_warm = recall_warm 171 | best_step = n_step 172 | if tf_saver is not None: 173 | tf_saver.save(sess, _tf_ckpt_file) 174 | 175 | timer.toc('%d [%d]b [%d]tot f=%.2f best[%d]' % ( 176 | n_step, len(data_batch), n_batch_trained, f_batch, best_step 177 | )).tic() 178 | print ('\t\t\t'+' '.join([('@'+str(i)).ljust(6) for i in recall_at])) 179 | print('warm start\t%s' % ( 180 | ' '.join(['%.4f' % i for i in recall_warm]), 181 | )) 182 | summaries = [] 183 | for i, k in enumerate(recall_at): 184 | if k % 100 == 0: 185 | summaries.extend([ 186 | tf.Summary.Value(tag="recall@" + str(k) + " warm", simple_value=recall_warm[i]), 187 | ]) 188 | recall_summary = tf.Summary(value=summaries) 189 | if train_writer is not None: 190 | train_writer.add_summary(recall_summary, n_step) 191 | 192 | 193 | def load_data(data_path): 194 | timer = utils.timer(name='main').tic() 195 | split_folder = os.path.join(data_path, 'warm') 196 | 197 | u_file = os.path.join(data_path, 'trained/warm/U.bin') 198 | v_file = os.path.join(data_path, 'trained/warm/V.bin') 199 | item_content_file = os.path.join(data_path, 'item_features_0based.txt') 200 | train_file = os.path.join(split_folder, 'train.csv') 201 | test_warm_file = 
if __name__ == "__main__":
    # Command-line entry point: declare the options, parse them into the
    # module-level ``args`` namespace that main() reads, then start training.
    parser = argparse.ArgumentParser(
        description="Demo script to run DropoutNet on RecSys data",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )

    # (flag, add_argument keyword arguments), in the order shown by --help.
    _cli_options = [
        ('--data-dir', dict(type=str, required=True,
                            help='path to eval in the downloaded folder')),
        ('--model-device', dict(type=str, default='/gpu:0',
                                help='device to use for training')),
        ('--inf-device', dict(type=str, default='/cpu:0',
                              help='device to use for inference')),
        ('--checkpoint-path', dict(type=str, default=None,
                                   help='path to dump checkpoint data from TensorFlow')),
        ('--tb-log-path', dict(type=str, default=None,
                               help='path to dump TensorBoard logs')),
        ('--model-select', dict(nargs='+', type=int, default=[200],
                                help='specify the fully-connected architecture, starting from input,'
                                     ' numbers indicate numbers of hidden units')),
        ('--rank', dict(type=int, default=200, help='output rank of latent model')),
        ('--dropout', dict(type=float, default=0.5, help='DropoutNet dropout')),
        ('--eval-every', dict(type=int, default=1, help='evaluate every X user-batch')),
        ('--lr', dict(type=float, default=0.05, help='starting learning rate')),
    ]
    for _flag, _spec in _cli_options:
        parser.add_argument(_flag, **_spec)

    args = parser.parse_args()
    main()
datetime.datetime.now().strftime('%Y-%m-%d-%H:%M:%S'), 40 | '-'.join(str(x / 100) for x in model_select) if model_select else 'simple' 41 | ) 42 | _tf_ckpt_file = None if checkpoint_path is None else checkpoint_path + experiment + '/tf_checkpoint' 43 | 44 | print('running: ' + experiment) 45 | 46 | dat = load_data(data_path) 47 | u_pref_scaled = dat['u_pref_scaled'] 48 | v_pref_scaled = dat['v_pref_scaled'] 49 | eval_cold = dat['eval_cold'] 50 | item_content = dat['item_content'] 51 | u_pref = dat['u_pref'] 52 | v_pref = dat['v_pref'] 53 | user_indices = dat['user_indices'] 54 | 55 | timer = utils.timer(name='main').tic() 56 | 57 | # append pref factors for faster dropout 58 | v_pref_expanded = np.vstack([v_pref_scaled, np.zeros_like(v_pref_scaled[0, :])]) 59 | v_pref_last = v_pref_scaled.shape[0] 60 | u_pref_expanded = np.vstack([u_pref_scaled, np.zeros_like(u_pref_scaled[0, :])]) 61 | u_pref_last = u_pref_scaled.shape[0] 62 | timer.toc('initialized numpy data for tf') 63 | 64 | # prep eval 65 | eval_batch_size = eval_batch_size 66 | timer.tic() 67 | eval_cold.init_tf(u_pref_scaled, v_pref_scaled, None, item_content, eval_batch_size) 68 | timer.toc('initialized eval for tf').tic() 69 | 70 | dropout_net = model.DeepCF(latent_rank_in=u_pref.shape[1], 71 | user_content_rank=0, 72 | item_content_rank=item_content.shape[1], 73 | model_select=model_select, 74 | rank_out=rank_out) 75 | 76 | config = tf.ConfigProto(allow_soft_placement=True) 77 | 78 | with tf.device(args.model_device): 79 | dropout_net.build_model() 80 | 81 | with tf.device(args.inf_device): 82 | dropout_net.build_predictor(recall_at, n_scores_user) 83 | 84 | if args.progress: 85 | from tqdm import tqdm 86 | with tf.Session(config=config) as sess: 87 | tf_saver = None if _tf_ckpt_file is None else tf.train.Saver() 88 | train_writer = None if tb_log_path is None else tf.summary.FileWriter( 89 | tb_log_path + experiment, sess.graph) 90 | tf.global_variables_initializer().run() 91 | 
tf.local_variables_initializer().run() 92 | timer.toc('initialized tf') 93 | 94 | row_index = np.copy(user_indices) 95 | n_step = 0 96 | best_cold = 0 97 | n_batch_trained = 0 98 | best_step = 0 99 | for epoch in range(num_epoch): 100 | np.random.shuffle(row_index) 101 | for b in utils.batch(row_index, user_batch_size): 102 | n_step += 1 103 | # prep targets 104 | target_users = np.repeat(b, n_scores_user) 105 | target_users_rand = np.repeat(np.arange(len(b)), n_scores_user) 106 | target_items_rand = [np.random.choice(v_pref.shape[0], n_scores_user) for _ in b] 107 | target_items_rand = np.array(target_items_rand).flatten() 108 | target_ui_rand = np.transpose(np.vstack([target_users_rand, target_items_rand])) 109 | [target_scores, target_items, random_scores] = sess.run( 110 | [dropout_net.tf_topk_vals, dropout_net.tf_topk_inds, dropout_net.preds_random], 111 | feed_dict={ 112 | dropout_net.U_pref_tf: u_pref[b, :], 113 | dropout_net.V_pref_tf: v_pref, 114 | dropout_net.rand_target_ui: target_ui_rand 115 | } 116 | ) 117 | # merge topN and randomN items per user 118 | target_scores = np.append(target_scores, random_scores) 119 | target_items = np.append(target_items, target_items_rand) 120 | target_users = np.append(target_users, target_users) 121 | 122 | tf.local_variables_initializer().run() 123 | n_targets = len(target_scores) 124 | perm = np.random.permutation(n_targets) 125 | n_targets = min(n_targets, max_data_per_step) 126 | data_batch = [(n, min(n + data_batch_size, n_targets)) for n in xrange(0, n_targets, data_batch_size)] 127 | f_batch = 0 128 | gen = data_batch 129 | if args.progress: 130 | gen = tqdm(gen) 131 | for (start, stop) in gen: 132 | batch_perm = perm[start:stop] 133 | batch_users = target_users[batch_perm] 134 | batch_items = target_items[batch_perm] 135 | if dropout != 0: 136 | n_to_drop = int(np.floor(dropout * len(batch_perm))) 137 | perm_user = np.random.permutation(len(batch_perm))[:n_to_drop] 138 | perm_item = 
np.random.permutation(len(batch_perm))[:n_to_drop] 139 | batch_v_pref = np.copy(batch_items) 140 | batch_u_pref = np.copy(batch_users) 141 | batch_v_pref[perm_user] = v_pref_last 142 | batch_u_pref[perm_item] = u_pref_last 143 | else: 144 | batch_v_pref = batch_items 145 | batch_u_pref = batch_users 146 | item_content_batch = item_content[batch_items, :] 147 | if sp.issparse(item_content): 148 | item_content_batch = item_content_batch.todense() 149 | 150 | _, _, loss_out = sess.run( 151 | [dropout_net.preds, dropout_net.updates, dropout_net.loss], 152 | feed_dict={ 153 | dropout_net.Uin: u_pref_expanded[batch_u_pref, :], 154 | dropout_net.Vin: v_pref_expanded[batch_v_pref, :], 155 | dropout_net.Vcontent: item_content_batch, 156 | # 157 | dropout_net.target: target_scores[batch_perm], 158 | dropout_net.lr_placeholder: _lr, 159 | dropout_net.phase: 1 160 | } 161 | ) 162 | f_batch += loss_out 163 | if np.isnan(f_batch): 164 | raise Exception('f is nan') 165 | 166 | n_batch_trained += len(data_batch) 167 | if n_step % _decay_lr_every == 0: 168 | _lr = _lr_decay * _lr 169 | print('decayed lr:' + str(_lr)) 170 | if n_step % eval_every == 0: 171 | recall_cold = utils.batch_eval_recall( 172 | sess, dropout_net.eval_preds_cold, eval_feed_dict=dropout_net.get_eval_dict, 173 | recall_k=recall_at, eval_data=eval_cold) 174 | 175 | # checkpoint 176 | if np.sum(recall_cold) > np.sum(best_cold): 177 | best_cold = recall_cold 178 | best_step = n_step 179 | if tf_saver is not None: 180 | tf_saver.save(sess, _tf_ckpt_file) 181 | 182 | timer.toc('%d [%d]b [%d]tot f=%.2f best[%d]' % ( 183 | n_step, len(data_batch), n_batch_trained, f_batch, best_step 184 | )).tic() 185 | print ('\t\t\t'+' '.join([('@'+str(i)).ljust(6) for i in recall_at])) 186 | print('cold start\t%s' % ( 187 | ' '.join(['%.4f' % i for i in recall_cold]), 188 | )) 189 | print('best epoch[%d]\t%s' % ( 190 | best_step, 191 | ' '.join(['%.4f' % i for i in best_cold] ), 192 | )) 193 | summaries = [] 194 | for i, k in 
def tfidf(R):
    """Apply a log-scaled TF-IDF weighting to a sparse count matrix.

    Every non-zero entry ``c`` in column ``j`` becomes
    ``log(c + 1) * log(n_rows / (1 + df_j))``, where ``df_j`` is the number
    of rows with a non-zero entry in column ``j``.

    Args:
        R: scipy sparse matrix (rows x columns) of counts. It is not
            modified. Input that is not floating point is converted to
            float64 first, so the divisions below are never integer
            divisions.

    Returns:
        A scipy sparse matrix with the same shape as ``R``.
    """
    n_rows, n_cols = R.shape

    counts = sp.csr_matrix(R, copy=True)
    if not np.issubdtype(counts.dtype, np.inexact):
        counts = counts.astype(np.float64)

    # Binary occurrence matrix: 1 wherever a non-zero count is stored.
    presence = counts.copy()
    presence.data[presence.data != 0] = 1.0

    # Term frequency: log(count + 1) on the stored entries only.
    term_freq = counts + presence
    term_freq.data = np.log(term_freq.data)

    # Inverse document frequency per column, smoothed by 1 in the denominator.
    doc_freq = np.asarray(presence.sum(axis=0)).ravel()
    idf = np.log(n_rows / (1.0 + doc_freq))

    # Multiplying by a diagonal matrix scales each column by its idf;
    # dot() is a matrix product for every scipy sparse container.
    return term_freq.dot(sp.spdiags(idf, 0, n_cols, n_cols))
if __name__ == "__main__":
    # Command-line entry point: parse the options into the module-level
    # ``args`` namespace that main() reads, echo them, then start training.
    parser = argparse.ArgumentParser(description="Demo script to run DropoutNet on RecSys data",
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--data-dir', type=str, required=True, help='path to eval in the downloaded folder')

    parser.add_argument('--model-device', type=str, default='/gpu:0', help='device to use for training')
    parser.add_argument('--inf-device', type=str, default='/cpu:0', help='device to use for inference')
    parser.add_argument('--checkpoint-path', type=str, default=None,
                        help='path to dump checkpoint data from TensorFlow')
    parser.add_argument('--tb-log-path', type=str, default=None,
                        help='path to dump TensorBoard logs')
    parser.add_argument('--model-select', nargs='+', type=int,
                        default=[200],
                        help='specify the fully-connected architecture, starting from input,'
                             ' numbers indicate numbers of hidden units',
                        )
    parser.add_argument('--rank', type=int, default=200, help='output rank of latent model')
    parser.add_argument('--dropout', type=float, default=0.5, help='DropoutNet dropout')
    parser.add_argument('--eval-every', type=int, default=1, help='evaluate every X user-batch')
    parser.add_argument('--lr', type=float, default=0.005, help='starting learning rate')
    parser.add_argument('--progress', action='store_true', help='show tqdm progress (requires tqdm) during training')

    # parse_args() already exits on unrecognised arguments, so a second
    # parse_known_args() pass could only return the same namespace again.
    args = parser.parse_args()
    for key, value in vars(args).items():
        print(key + ":" + str(value))
    main()
None if checkpoint_path is None else checkpoint_path + experiment + '/tf_checkpoint' 44 | 45 | print('running: ' + experiment) 46 | 47 | dat = load_data(data_path) 48 | u_pref_scaled = dat['u_pref_scaled'] 49 | v_pref_scaled = dat['v_pref_scaled'] 50 | eval_warm = dat['eval_warm'] 51 | item_content = dat['item_content'] 52 | u_pref = dat['u_pref'] 53 | v_pref = dat['v_pref'] 54 | user_indices = dat['user_indices'] 55 | 56 | timer = utils.timer(name='main').tic() 57 | 58 | # append pref factors for faster dropout 59 | v_pref_expanded = np.vstack([v_pref_scaled, np.zeros_like(v_pref_scaled[0, :])]) 60 | v_pref_last = v_pref_scaled.shape[0] 61 | u_pref_expanded = np.vstack([u_pref_scaled, np.zeros_like(u_pref_scaled[0, :])]) 62 | u_pref_last = u_pref_scaled.shape[0] 63 | timer.toc('initialized numpy data for tf') 64 | 65 | # prep eval 66 | eval_batch_size = eval_batch_size 67 | timer.tic() 68 | eval_warm.init_tf(u_pref_scaled, v_pref_scaled, None, item_content, eval_batch_size) 69 | timer.toc('initialized eval for tf').tic() 70 | 71 | dropout_net = model.DeepCF(latent_rank_in=u_pref.shape[1], 72 | user_content_rank=0, 73 | item_content_rank=item_content.shape[1], 74 | model_select=model_select, 75 | rank_out=rank_out) 76 | 77 | config = tf.ConfigProto(allow_soft_placement=True) 78 | 79 | with tf.device(args.model_device): 80 | dropout_net.build_model() 81 | 82 | with tf.device(args.inf_device): 83 | dropout_net.build_predictor(recall_at, n_scores_user) 84 | 85 | if args.progress: 86 | from tqdm import tqdm 87 | with tf.Session(config=config) as sess: 88 | tf_saver = None if _tf_ckpt_file is None else tf.train.Saver() 89 | train_writer = None if tb_log_path is None else tf.summary.FileWriter( 90 | tb_log_path + experiment, sess.graph) 91 | tf.global_variables_initializer().run() 92 | tf.local_variables_initializer().run() 93 | timer.toc('initialized tf') 94 | 95 | row_index = np.copy(user_indices) 96 | n_step = 0 97 | best_warm = 0 98 | n_batch_trained = 0 99 | 
best_step = 0 100 | for epoch in range(num_epoch): 101 | np.random.shuffle(row_index) 102 | for b in utils.batch(row_index, user_batch_size): 103 | n_step += 1 104 | # prep targets 105 | target_users = np.repeat(b, n_scores_user) 106 | target_users_rand = np.repeat(np.arange(len(b)), n_scores_user) 107 | target_items_rand = [np.random.choice(v_pref.shape[0], n_scores_user) for _ in b] 108 | target_items_rand = np.array(target_items_rand).flatten() 109 | target_ui_rand = np.transpose(np.vstack([target_users_rand, target_items_rand])) 110 | [target_scores, target_items, random_scores] = sess.run( 111 | [dropout_net.tf_topk_vals, dropout_net.tf_topk_inds, dropout_net.preds_random], 112 | feed_dict={ 113 | dropout_net.U_pref_tf: u_pref[b, :], 114 | dropout_net.V_pref_tf: v_pref, 115 | dropout_net.rand_target_ui: target_ui_rand 116 | } 117 | ) 118 | # merge topN and randomN items per user 119 | target_scores = np.append(target_scores, random_scores) 120 | target_items = np.append(target_items, target_items_rand) 121 | target_users = np.append(target_users, target_users) 122 | 123 | tf.local_variables_initializer().run() 124 | n_targets = len(target_scores) 125 | perm = np.random.permutation(n_targets) 126 | n_targets = min(n_targets, max_data_per_step) 127 | data_batch = [(n, min(n + data_batch_size, n_targets)) for n in xrange(0, n_targets, data_batch_size)] 128 | f_batch = 0 129 | gen = data_batch 130 | if args.progress: 131 | gen = tqdm(gen) 132 | for (start, stop) in gen: 133 | batch_perm = perm[start:stop] 134 | batch_users = target_users[batch_perm] 135 | batch_items = target_items[batch_perm] 136 | if dropout != 0: 137 | n_to_drop = int(np.floor(dropout * len(batch_perm))) 138 | perm_user = np.random.permutation(len(batch_perm))[:n_to_drop] 139 | perm_item = np.random.permutation(len(batch_perm))[:n_to_drop] 140 | batch_v_pref = np.copy(batch_items) 141 | batch_u_pref = np.copy(batch_users) 142 | batch_v_pref[perm_user] = v_pref_last 143 | batch_u_pref[perm_item] 
= u_pref_last 144 | else: 145 | batch_v_pref = batch_items 146 | batch_u_pref = batch_users 147 | item_content_batch = item_content[batch_items, :] 148 | if sp.issparse(item_content): 149 | item_content_batch = item_content_batch.todense() 150 | 151 | _, _, loss_out = sess.run( 152 | [dropout_net.preds, dropout_net.updates, dropout_net.loss], 153 | feed_dict={ 154 | dropout_net.Uin: u_pref_expanded[batch_u_pref, :], 155 | dropout_net.Vin: v_pref_expanded[batch_v_pref, :], 156 | dropout_net.Vcontent: item_content_batch, 157 | # 158 | dropout_net.target: target_scores[batch_perm], 159 | dropout_net.lr_placeholder: _lr, 160 | dropout_net.phase: 1 161 | } 162 | ) 163 | f_batch += loss_out 164 | if np.isnan(f_batch): 165 | raise Exception('f is nan') 166 | 167 | n_batch_trained += len(data_batch) 168 | if n_step % _decay_lr_every == 0: 169 | _lr = _lr_decay * _lr 170 | print('decayed lr:' + str(_lr)) 171 | if n_step % eval_every == 0: 172 | recall_warm = utils.batch_eval_recall( 173 | sess, dropout_net.eval_preds_warm, eval_feed_dict=dropout_net.get_eval_dict, 174 | recall_k=recall_at, eval_data=eval_warm) 175 | 176 | # checkpoint 177 | if np.sum(recall_warm) > np.sum(best_warm): 178 | best_warm = recall_warm 179 | best_step = n_step 180 | if tf_saver is not None: 181 | tf_saver.save(sess, _tf_ckpt_file) 182 | 183 | timer.toc('%d [%d]b [%d]tot f=%.2f best[%d]' % ( 184 | n_step, len(data_batch), n_batch_trained, f_batch, best_step 185 | )).tic() 186 | print ('\t\t\t'+' '.join([('@'+str(i)).ljust(6) for i in recall_at])) 187 | print('warm start\t%s' % ( 188 | ' '.join(['%.4f' % i for i in recall_warm]), 189 | )) 190 | print('best epoch[%d]\t%s' % ( 191 | best_step, 192 | ' '.join(['%.4f' % i for i in best_warm] ), 193 | )) 194 | summaries = [] 195 | for i, k in enumerate(recall_at): 196 | if k % 100 == 0: 197 | summaries.extend([ 198 | tf.Summary.Value(tag="recall@" + str(k) + " warm", simple_value=recall_warm[i]), 199 | ]) 200 | recall_summary = 
def load_data(data_path):
    """Load the warm-split inputs consumed by main().

    Args:
        data_path: root directory containing ``trained/warm/`` (WRMF factor
            text files), ``warm/`` (train/test CSV files) and
            ``item_features_0based.txt`` (svmlight item features).

    Returns:
        dict with the keys
        ``u_pref`` / ``v_pref``: factor matrices reshaped to
            ``(n_users, 200)`` and ``(n_items, 200)``;
        ``u_pref_scaled`` / ``v_pref_scaled``: second return value of
            ``utils.prep_standardize`` for each factor matrix;
        ``item_content``: TF-IDF weighted item features reduced to 300
            components with randomized SVD and then standardized;
        ``user_indices``: unique user ids found in the training file;
        ``eval_warm``: evaluation data built by ``data.load_eval_data``.
    """
    timer = utils.timer(name='main').tic()
    split_folder = os.path.join(data_path, 'warm')

    u_file = os.path.join(data_path, 'trained/warm/WRMF_warm_rank200_reg1_alpha10_iter10.U.txt')
    v_file = os.path.join(data_path, 'trained/warm/WRMF_warm_rank200_reg1_alpha10_iter10.V.txt')
    item_content_file = os.path.join(data_path, 'item_features_0based.txt')
    train_file = os.path.join(split_folder, 'train.csv')
    test_warm_file = os.path.join(split_folder, 'test.csv')
    test_warm_iid_file = os.path.join(split_folder, 'test_item_ids.csv')

    dat = {}
    # load preference data
    timer.tic()

    u_pref = np.loadtxt(u_file).reshape(n_users, 200)
    v_pref = np.loadtxt(v_file).reshape(n_items, 200)

    dat['u_pref'] = u_pref
    dat['v_pref'] = v_pref

    timer.toc('loaded U:%s,V:%s' % (str(u_pref.shape), str(v_pref.shape))).tic()

    # pre-process
    _, dat['u_pref_scaled'] = utils.prep_standardize(u_pref)
    _, dat['v_pref_scaled'] = utils.prep_standardize(v_pref)

    timer.toc('standardized U,V').tic()

    # load content data
    timer.tic()
    item_content, _ = datasets.load_svmlight_file(item_content_file, zero_based=True, dtype=np.float32)

    item_content = tfidf(item_content)

    # Reduce the TF-IDF features to 300 dimensions: U * S gives the item
    # coordinates in the truncated SVD basis.
    from sklearn.utils.extmath import randomized_svd
    u, s, _ = randomized_svd(item_content, n_components=300, n_iter=5)
    item_content = u * s
    _, item_content = utils.prep_standardize(item_content)

    if sp.issparse(item_content):
        dat['item_content'] = item_content.tolil(copy=False)
    else:
        dat['item_content'] = item_content
    timer.toc('loaded item feature sparse matrix: %s' % (str(item_content.shape))).tic()

    # load split
    timer.tic()
    # header=None is the documented way to read a CSV without a header row;
    # the former header=-1 is rejected by current pandas releases.
    # The int32 rows are then reinterpreted as (uid, iid, inter) records.
    train = pd.read_csv(train_file, delimiter=",", header=None, dtype=np.int32).values.ravel().view(
        dtype=[('uid', np.int32), ('iid', np.int32), ('inter', np.int32)])
    dat['user_indices'] = np.unique(train['uid'])
    timer.toc('read train triplets %s' % train.shape).tic()

    dat['eval_warm'] = data.load_eval_data(test_warm_file, test_warm_iid_file, name='eval_warm', cold=False,
                                           train_data=train, citeu=True)
    return dat
def dense_batch_fc_tanh(x, units, phase, scope, do_norm=False):
    """
    Build one tanh block of a DeepCF tower:
    fully-connected layer -> optional batch norm -> tanh activation.

    x: input tensor; the size of its second dimension sets the weight shape
    units: number of hidden units of the fully-connected layer
    phase: training flag, passed to batch norm as is_training
    scope: variable scope of the block, also used as prefix of the names
    do_norm: when True, batch norm is applied before the activation
    """
    fan_in = x.get_shape().as_list()[1]

    with tf.variable_scope(scope):
        weights = tf.get_variable(
            scope + '_w',
            shape=[fan_in, units],
            initializer=tf.truncated_normal_initializer(stddev=0.01))
        bias = tf.get_variable(
            scope + '_b',
            shape=[1, units],
            initializer=tf.zeros_initializer())

        pre_activation = tf.matmul(x, weights) + bias
        if do_norm:
            pre_activation = tf.contrib.layers.batch_norm(
                pre_activation,
                decay=0.9,
                center=True,
                scale=True,
                is_training=phase,
                scope=scope + '_bn')

        return tf.nn.tanh(pre_activation, scope + '_tanh')
input 46 | item_content_rank: rank of item content input 47 | model_select: array of number of hidden unit, 48 | i.e. [200,100] indicate two hidden layer with 200 units followed by 100 units 49 | rank_out: rank of latent model output 50 | 51 | """ 52 | 53 | def __init__(self, latent_rank_in, user_content_rank, item_content_rank, model_select, rank_out): 54 | 55 | self.rank_in = latent_rank_in 56 | self.phi_u_dim = user_content_rank 57 | self.phi_v_dim = item_content_rank 58 | self.model_select = model_select 59 | self.rank_out = rank_out 60 | 61 | # inputs 62 | self.Uin = None 63 | self.Vin = None 64 | self.Ucontent = None 65 | self.Vcontent = None 66 | self.phase = None 67 | self.target = None 68 | self.eval_trainR = None 69 | self.U_pref_tf = None 70 | self.V_pref_tf = None 71 | self.rand_target_ui = None 72 | 73 | # outputs in the model 74 | 75 | self.preds = None 76 | self.updates = None 77 | self.loss = None 78 | 79 | self.U_embedding = None 80 | self.V_embedding = None 81 | 82 | self.lr_placeholder = None 83 | 84 | # predictor 85 | self.tf_topk_vals = None 86 | self.tf_topk_inds = None 87 | self.preds_random = None 88 | self.tf_latent_topk_cold = None 89 | self.tf_latent_topk_warm = None 90 | self.eval_preds_warm = None 91 | self.eval_preds_cold = None 92 | 93 | def build_model(self): 94 | """ 95 | set up tf components for main DeepCF net 96 | call after setting up desired tf state (cpu/gpu etc...) 
97 | 98 | Note: should use GPU 99 | """ 100 | self.lr_placeholder = tf.placeholder(tf.float32, shape=[], name='learn_rate') 101 | self.phase = tf.placeholder(tf.bool, name='phase') 102 | self.target = tf.placeholder(tf.float32, shape=[None], name='target') 103 | 104 | self.Uin = tf.placeholder(tf.float32, shape=[None, self.rank_in], name='U_in_raw') 105 | self.Vin = tf.placeholder(tf.float32, shape=[None, self.rank_in], name='V_in_raw') 106 | if self.phi_u_dim>0: 107 | self.Ucontent = tf.placeholder(tf.float32, shape=[None, self.phi_u_dim], name='U_content') 108 | u_concat = tf.concat([self.Uin, self.Ucontent], 1) 109 | else: 110 | u_concat = self.Uin 111 | 112 | if self.phi_v_dim>0: 113 | self.Vcontent = tf.placeholder(tf.float32, shape=[None, self.phi_v_dim], name='V_content') 114 | v_concat = tf.concat([self.Vin, self.Vcontent], 1) 115 | else: 116 | v_concat = self.Vin 117 | 118 | print ('\tu_concat.shape=%s' % str(u_concat.get_shape())) 119 | print ('\tv_concat.shape=%s' % str(v_concat.get_shape())) 120 | 121 | u_last = u_concat 122 | v_last = v_concat 123 | for ihid, hid in enumerate(self.model_select): 124 | u_last = dense_batch_fc_tanh(u_last, hid, self.phase, 'user_layer_%d' % (ihid + 1), do_norm=True) 125 | v_last = dense_batch_fc_tanh(v_last, hid, self.phase, 'item_layer_%d' % (ihid + 1), do_norm=True) 126 | 127 | with tf.variable_scope("self.U_embedding"): 128 | u_emb_w = tf.Variable(tf.truncated_normal([u_last.get_shape().as_list()[1], self.rank_out], stddev=0.01), 129 | name='u_emb_w') 130 | u_emb_b = tf.Variable(tf.zeros([1, self.rank_out]), name='u_emb_b') 131 | self.U_embedding = tf.matmul(u_last, u_emb_w) + u_emb_b 132 | 133 | with tf.variable_scope("V_embedding"): 134 | v_emb_w = tf.Variable(tf.truncated_normal([v_last.get_shape().as_list()[1], self.rank_out], stddev=0.01), 135 | name='v_emb_w') 136 | v_emb_b = tf.Variable(tf.zeros([1, self.rank_out]), name='v_emb_b') 137 | self.V_embedding = tf.matmul(v_last, v_emb_w) + v_emb_b 138 | 139 | with 
tf.variable_scope("loss"): 140 | preds = tf.multiply(self.U_embedding, self.V_embedding) 141 | self.preds = tf.reduce_sum(preds, 1) 142 | self.loss = tf.reduce_mean(tf.squared_difference(self.preds, self.target)) 143 | 144 | update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) 145 | with tf.control_dependencies(update_ops): 146 | # Ensures that we execute the update_ops before performing the train_step 147 | self.updates = tf.train.MomentumOptimizer(self.lr_placeholder, 0.9).minimize(self.loss) 148 | 149 | def build_predictor(self, recall_at, num_candidates): 150 | """ 151 | set up tf components for prediction and target selection 152 | call after setting up desired tf state (cpu/gpu etc...) 153 | 154 | Note: should use CPU, as large inputs are expected 155 | 156 | :param recall_at: truncation to compute recall 157 | :param num_candidates: number of candidates 158 | :return: 159 | """ 160 | self.eval_trainR = tf.sparse_placeholder( 161 | dtype=tf.float32, shape=[None, None], name='trainR_sparse_CPU') 162 | 163 | with tf.variable_scope("eval"): 164 | embedding_prod_cold = tf.matmul(self.U_embedding, self.V_embedding, transpose_b=True, name='pred_all_items') 165 | embedding_prod_warm = tf.sparse_add(embedding_prod_cold, self.eval_trainR) 166 | _, self.eval_preds_cold = tf.nn.top_k(embedding_prod_cold, k=recall_at[-1], sorted=True, 167 | name='topK_net_cold') 168 | _, self.eval_preds_warm = tf.nn.top_k(embedding_prod_warm, k=recall_at[-1], sorted=True, 169 | name='topK_net_warm') 170 | with tf.variable_scope("select_targets"): 171 | self.U_pref_tf = tf.placeholder(tf.float32, shape=[None, self.rank_in], name='u_pref') 172 | self.V_pref_tf = tf.placeholder(tf.float32, shape=[None, self.rank_in], name='v_pref') 173 | self.rand_target_ui = tf.placeholder(tf.int32, shape=[None, None], name='rand_target_ui') 174 | preds_pref = tf.matmul(self.U_pref_tf, self.V_pref_tf, transpose_b=True) 175 | tf_topk_vals, tf_topk_inds = tf.nn.top_k(preds_pref, k=num_candidates, 
sorted=True, name='top_targets') 176 | self.tf_topk_vals = tf.reshape(tf_topk_vals, [-1], name='select_y_vals') 177 | self.tf_topk_inds = tf.reshape(tf_topk_inds, [-1], name='select_y_inds') 178 | preds_random = tf.gather_nd(preds_pref, self.rand_target_ui) 179 | self.preds_random = tf.reshape(preds_random, [-1], name='random_y_inds') 180 | 181 | # tf matmul-topk to get eval on latent 182 | with tf.variable_scope("latent_eval"): 183 | preds_pref_latent_warm = tf.sparse_add(preds_pref, self.eval_trainR) 184 | _, self.tf_latent_topk_cold = tf.nn.top_k(preds_pref, k=recall_at[-1], sorted=True, name='topK_latent_cold') 185 | _, self.tf_latent_topk_warm = tf.nn.top_k(preds_pref_latent_warm, k=recall_at[-1], sorted=True, 186 | name='topK_latent_warm') 187 | 188 | def get_eval_dict(self, _i, _eval_start, _eval_finish, eval_data): 189 | """ 190 | packaging method to iterate evaluation data, select from start:finish 191 | should be passed directly to batch method 192 | 193 | :param _i: slice id 194 | :param _eval_start: integer beginning of slice 195 | :param _eval_finish: integer end of slice 196 | :param eval_data: package EvalData obj 197 | :return: 198 | """ 199 | _eval_dict = { 200 | self.Uin: eval_data.U_pref_test[_eval_start:_eval_finish, :], 201 | self.Vin: eval_data.V_pref_test, 202 | self.Vcontent: eval_data.V_content_test, 203 | self.phase: 0 204 | } 205 | if self.Ucontent!=None: 206 | _eval_dict[self.Ucontent]= eval_data.U_content_test[_eval_start:_eval_finish, :] 207 | if not eval_data.is_cold: 208 | _eval_dict[self.eval_trainR] = eval_data.tf_eval_train[_i] 209 | return _eval_dict 210 | 211 | def get_eval_dict_latent(self, _i, _eval_start, _eval_finish, eval_data, u_pref, v_pref): 212 | """ 213 | packaging method to iterate evaluation data, select from start:finish 214 | uses preference input 215 | should be passed directly to batch method 216 | 217 | :param _i: slice id 218 | :param _eval_start: integer beginning of slice 219 | :param _eval_finish: integer 
def batch(iterable, _n=1, drop=True):
    """
    returns batched version of some iterable

    Every full batch of size _n is yielded, including the last one when the
    length divides evenly. Only a shorter trailing remainder is subject to
    `drop`.

    :param iterable: sliceable object with a length (list, numpy array, ...)
    :param _n: batch size
    :param drop: if true, drop extra if batch size does not divide evenly,
        otherwise keep them (last batch might be shorter)
    :return: generator over slices of iterable
    """
    it_len = len(iterable)
    for ndx in range(0, it_len, _n):
        # `<=` so that a batch ending exactly at the end still counts as full
        if ndx + _n <= it_len:
            yield iterable[ndx:ndx + _n]
        elif not drop:
            yield iterable[ndx:it_len]
def batch_eval_recall(_sess, tf_eval, eval_feed_dict, recall_k, eval_data):
    """
    given EvalData and DropoutNet compute graph in TensorFlow, runs batch evaluation

    :param _sess: tf session
    :param tf_eval: the evaluate output symbol in tf
    :param eval_feed_dict: callable building the feed dict of one slice, called as
        eval_feed_dict(slice_id, start, stop, eval_data)
    :param recall_k: list of thresholds to compute recall at (information retrieval recall)
    :param eval_data: EvalData instance
    :return: recall list at thresholds matching recall_k
    """
    # one session run per evaluation slice, outputs stacked in slice order
    topk_chunks = [
        _sess.run(tf_eval, feed_dict=eval_feed_dict(slice_id, lo, hi, eval_data))
        for slice_id, (lo, hi) in enumerate(eval_data.eval_batch)
    ]
    topk_all = np.concatenate(topk_chunks)
    tf.local_variables_initializer().run()

    # keep only the rows (users) that have at least one test target
    test_rows = eval_data.R_test_inf.rows
    users_with_targets = np.arange(len(test_rows))[[len(r) > 0 for r in test_rows]]
    topk_all = topk_all[users_with_targets, :]

    recall = []
    for at_k in recall_k:
        hits_k = topk_all[:, :at_k]
        truth = eval_data.R_test_inf[users_with_targets, :]

        # indicator matrix of the top-k predictions, filled through the lil internals
        predicted = scipy.sparse.lil_matrix(truth.shape)
        predicted.rows = hits_k
        predicted.data = np.ones_like(hits_k)

        overlap = truth.multiply(predicted)
        recall.append(np.mean(np.divide((np.sum(overlap, 1)), np.sum(truth, 1))))
    return recall
class EvalData:
    """
    EvalData:
        EvalData packages test triplet (user, item, score) into appropriate formats for evaluation

        Compact Indices:
            Specifically, this builds compact indices and stores mapping between original and compact indices.
            Compact indices only contains:
                1) items in test set
                2) users who interacted with such test items
            These compact indices speed up testing significantly by ignoring irrelevant users or items

    Args:
        test_triplets(int triplets): user-item-interaction_value triplet to build the test data
        test_item_ids(list of int): original ids of the items in the test set
        is_cold(boolean): whether test data is used for cold start problem
        train(int triplets): user-item-interaction_value triplet from train data

    Attributes:
        is_cold(boolean): whether test data is used for cold start problem
        test_item_ids(list of int): maps compressed item ids to original item ids (via position)
        test_item_ids_map(dictionary of int->int): maps original item ids to compressed item ids
        test_user_ids(list of int): maps compressed user ids to original user ids (via position)
        test_user_ids_map(dictionary of int->int): maps original user ids to compressed user ids
        R_test_inf(scipy lil matrix): pre-built compressed test matrix
        R_train_inf(scipy lil matrix): pre-built compressed train matrix for testing

        other relevant input/output exposed from tensorflow graph

    Raises:
        Exception: if is_cold is set but train overlaps the test users/items,
            or if is_cold is not set and there is no such overlap

    """

    def __init__(self, test_triplets, test_item_ids, is_cold, train):
        # build map both-ways between compact and original indices
        # compact indices only contains:
        #  1) items in test set
        #  2) users who interacted with such test items

        self.is_cold = is_cold

        self.test_item_ids = test_item_ids
        # test_item_ids_map
        self.test_item_ids_map = {iid: i for i, iid in enumerate(self.test_item_ids)}

        _test_ij_for_inf = [(t[0], t[1]) for t in test_triplets if t[1] in self.test_item_ids_map]
        # test_user_ids
        self.test_user_ids = np.unique(test_triplets['uid'])
        # test_user_ids_map
        self.test_user_ids_map = {user_id: i for i, user_id in enumerate(self.test_user_ids)}

        _test_i_for_inf = [self.test_user_ids_map[_t[0]] for _t in _test_ij_for_inf]
        _test_j_for_inf = [self.test_item_ids_map[_t[1]] for _t in _test_ij_for_inf]
        self.R_test_inf = scipy.sparse.coo_matrix(
            (np.ones(len(_test_i_for_inf)),
             (_test_i_for_inf, _test_j_for_inf)),
            shape=(len(self.test_user_ids), len(self.test_item_ids))
        ).tolil(copy=False)

        # train interactions that fall on test users AND test items (compact indices)
        train_ij_for_inf = [(self.test_user_ids_map[_t[0]], self.test_item_ids_map[_t[1]]) for _t
                            in train
                            if _t[1] in self.test_item_ids_map and _t[0] in self.test_user_ids_map]
        # value comparison: `is` / `is not` against an int literal tests identity
        if self.is_cold and len(train_ij_for_inf) != 0:
            raise Exception('using cold dataset, but data is not cold!')
        if not self.is_cold and len(train_ij_for_inf) == 0:
            raise Exception('using warm datset, but data is not warm!')

        if self.is_cold:
            self.R_train_inf = None
        else:
            # materialise the coordinates instead of handing scipy a lazy zip iterator
            _train_i_for_inf, _train_j_for_inf = zip(*train_ij_for_inf)
            self.R_train_inf = scipy.sparse.coo_matrix(
                (np.ones(len(train_ij_for_inf)),
                 (_train_i_for_inf, _train_j_for_inf)),
                shape=self.R_test_inf.shape
            ).tolil(copy=False)

        # allocate fields
        self.U_pref_test = None
        self.V_pref_test = None
        self.V_content_test = None
        self.U_content_test = None
        self.tf_eval_train = None
        self.tf_eval_test = None
        self.eval_batch = None

    def init_tf(self, user_factors, item_factors, user_content, item_content, eval_run_batchsize):
        """
        slice the model inputs down to the test users/items and prepare evaluation batches

        Args:
            user_factors: user preference matrix, indexed by original user id
            item_factors: item preference matrix, indexed by original item id
            user_content: user content matrix (dense or scipy sparse), or None when there is none
            item_content: item content matrix (dense or scipy sparse)
            eval_run_batchsize(int): number of test users per evaluation batch
        """
        self.U_pref_test = user_factors[self.test_user_ids, :]
        self.V_pref_test = item_factors[self.test_item_ids, :]
        self.V_content_test = item_content[self.test_item_ids, :]
        if scipy.sparse.issparse(self.V_content_test):
            self.V_content_test = self.V_content_test.todense()
        # identity test: `!= None` is element-wise on arrays and has no single truth value
        if user_content is not None:
            self.U_content_test = user_content[self.test_user_ids, :]
            if scipy.sparse.issparse(self.U_content_test):
                self.U_content_test = self.U_content_test.todense()
        eval_l = self.R_test_inf.shape[0]
        # (start, stop) row ranges over the compact test users
        self.eval_batch = [(x, min(x + eval_run_batchsize, eval_l)) for x
                           in range(0, eval_l, eval_run_batchsize)]

        self.tf_eval_train = []
        self.tf_eval_test = []

        if not self.is_cold:
            for (eval_start, eval_finish) in self.eval_batch:
                _ui = self.R_train_inf[eval_start:eval_finish, :].tocoo()
                _ui = list(zip(_ui.row, _ui.col))
                # one sparse value per batch holding -100000 at every train interaction
                self.tf_eval_train.append(
                    tf.compat.v1.SparseTensorValue(
                        indices=_ui,
                        values=np.full(len(_ui), -100000, dtype=np.float32),
                        dense_shape=[eval_finish - eval_start, self.R_train_inf.shape[1]]
                    )
                )

    def get_stats_string(self):
        """return a printable summary of user/item counts and matrix shapes"""
        return ('\tn_test_users:[%d]\n\tn_test_items:[%d]' % (len(self.test_user_ids), len(self.test_item_ids))
                + '\n\tR_train_inf: %s' % (
                    'no R_train_inf for cold' if self.is_cold else 'shape=%s nnz=[%d]' % (
                        str(self.R_train_inf.shape), len(self.R_train_inf.nonzero()[0])
                    )
                )
                + '\n\tR_test_inf: shape=%s nnz=[%d]' % (
                    str(self.R_test_inf.shape), len(self.R_test_inf.nonzero()[0])
                ))
'-'.join(str(x / 100) for x in model_select) if model_select else 'simple' 42 | ) 43 | _tf_ckpt_file = None if checkpoint_path is None else checkpoint_path + experiment + '/tf_checkpoint' 44 | 45 | print('running: ' + experiment) 46 | 47 | dat = load_data(data_path) 48 | u_pref_scaled = dat['u_pref_scaled'] 49 | v_pref_scaled = dat['v_pref_scaled'] 50 | eval_warm = dat['eval_warm'] 51 | eval_cold_user = dat['eval_cold_user'] 52 | eval_cold_item = dat['eval_cold_item'] 53 | user_content = dat['user_content'] 54 | item_content = dat['item_content'] 55 | u_pref = dat['u_pref'] 56 | v_pref = dat['v_pref'] 57 | user_indices = dat['user_indices'] 58 | 59 | timer = utils.timer(name='main').tic() 60 | 61 | # append pref factors for faster dropout 62 | v_pref_expanded = np.vstack([v_pref_scaled, np.zeros_like(v_pref_scaled[0, :])]) 63 | v_pref_last = v_pref_scaled.shape[0] 64 | u_pref_expanded = np.vstack([u_pref_scaled, np.zeros_like(u_pref_scaled[0, :])]) 65 | u_pref_last = u_pref_scaled.shape[0] 66 | timer.toc('initialized numpy data for tf') 67 | 68 | # prep eval 69 | eval_batch_size = eval_batch_size 70 | timer.tic() 71 | eval_warm.init_tf(u_pref_scaled, v_pref_scaled, user_content, item_content, eval_batch_size) 72 | timer.toc('initialized eval_warm for tf').tic() 73 | eval_cold_user.init_tf(u_pref_scaled, v_pref_scaled, user_content, item_content, eval_batch_size) 74 | timer.toc('initialized eval_cold_user for tf').tic() 75 | eval_cold_item.init_tf(u_pref_scaled, v_pref_scaled, user_content, item_content, eval_batch_size) 76 | timer.toc('initialized eval_cold_item for tf').tic() 77 | 78 | dropout_net = model.DeepCF(latent_rank_in=u_pref.shape[1], 79 | user_content_rank=user_content.shape[1], 80 | item_content_rank=item_content.shape[1], 81 | model_select=model_select, 82 | rank_out=rank_out) 83 | 84 | config = tf.compat.v1.ConfigProto(allow_soft_placement=True) 85 | 86 | with tf.device(args.model_device): 87 | dropout_net.build_model() 88 | 89 | with 
tf.device(args.inf_device): 90 | dropout_net.build_predictor(recall_at, n_scores_user) 91 | 92 | with tf.compat.v1.Session(config=config) as sess: 93 | tf_saver = None if _tf_ckpt_file is None else tf.compat.v1.train.Saver() 94 | train_writer = None if tb_log_path is None else tf.compat.v1.summary.FileWriter( 95 | tb_log_path + experiment, sess.graph) 96 | tf.compat.v1.global_variables_initializer().run() 97 | tf.compat.v1.local_variables_initializer().run() 98 | timer.toc('initialized tf') 99 | 100 | row_index = np.copy(user_indices) 101 | n_step = 0 102 | best_cold_user = 0 103 | best_cold_item = 0 104 | best_warm = 0 105 | n_batch_trained = 0 106 | best_step = 0 107 | for epoch in range(num_epoch): 108 | np.random.shuffle(row_index) 109 | for b in utils.batch(row_index, user_batch_size): 110 | n_step += 1 111 | # prep targets 112 | target_users = np.repeat(b, n_scores_user) 113 | target_users_rand = np.repeat(np.arange(len(b)), n_scores_user) 114 | target_items_rand = [np.random.choice(v_pref.shape[0], n_scores_user) for _ in b] 115 | target_items_rand = np.array(target_items_rand).flatten() 116 | target_ui_rand = np.transpose(np.vstack([target_users_rand, target_items_rand])) 117 | [target_scores, target_items, random_scores] = sess.run( 118 | [dropout_net.tf_topk_vals, dropout_net.tf_topk_inds, dropout_net.preds_random], 119 | feed_dict={ 120 | dropout_net.U_pref_tf: u_pref[b, :], 121 | dropout_net.V_pref_tf: v_pref, 122 | dropout_net.rand_target_ui: target_ui_rand 123 | } 124 | ) 125 | # merge topN and randomN items per user 126 | target_scores = np.append(target_scores, random_scores) 127 | target_items = np.append(target_items, target_items_rand) 128 | target_users = np.append(target_users, target_users) 129 | 130 | tf.compat.v1.local_variables_initializer().run() 131 | n_targets = len(target_scores) 132 | perm = np.random.permutation(n_targets) 133 | n_targets = min(n_targets, max_data_per_step) 134 | data_batch = [(n, min(n + data_batch_size, n_targets)) 
for n in range(0, n_targets, data_batch_size)] 135 | f_batch = 0 136 | for (start, stop) in data_batch: 137 | batch_perm = perm[start:stop] 138 | batch_users = target_users[batch_perm] 139 | batch_items = target_items[batch_perm] 140 | if dropout != 0: 141 | n_to_drop = int(np.floor(dropout * len(batch_perm))) 142 | perm_user = np.random.permutation(len(batch_perm))[:n_to_drop] 143 | perm_item = np.random.permutation(len(batch_perm))[:n_to_drop] 144 | batch_v_pref = np.copy(batch_items) 145 | batch_u_pref = np.copy(batch_users) 146 | batch_v_pref[perm_user] = v_pref_last 147 | batch_u_pref[perm_item] = u_pref_last 148 | else: 149 | batch_v_pref = batch_items 150 | batch_u_pref = batch_users 151 | 152 | _, _, loss_out = sess.run( 153 | [dropout_net.preds, dropout_net.updates, dropout_net.loss], 154 | feed_dict={ 155 | dropout_net.Uin: u_pref_expanded[batch_u_pref, :], 156 | dropout_net.Vin: v_pref_expanded[batch_v_pref, :], 157 | dropout_net.Ucontent: user_content[batch_users, :].todense(), 158 | dropout_net.Vcontent: item_content[batch_items, :].todense(), 159 | # 160 | dropout_net.target: target_scores[batch_perm], 161 | dropout_net.lr_placeholder: _lr, 162 | dropout_net.phase: 1 163 | } 164 | ) 165 | f_batch += loss_out 166 | if np.isnan(f_batch): 167 | raise Exception('f is nan') 168 | 169 | n_batch_trained += len(data_batch) 170 | if n_step % _decay_lr_every == 0: 171 | _lr = _lr_decay * _lr 172 | print('decayed lr:' + str(_lr)) 173 | if n_step % eval_every == 0: 174 | recall_warm = utils.batch_eval_recall( 175 | sess, dropout_net.eval_preds_warm, eval_feed_dict=dropout_net.get_eval_dict, 176 | recall_k=recall_at, eval_data=eval_warm) 177 | recall_cold_user = utils.batch_eval_recall( 178 | sess, dropout_net.eval_preds_cold, 179 | eval_feed_dict=dropout_net.get_eval_dict, 180 | recall_k=recall_at, eval_data=eval_cold_user) 181 | recall_cold_item = utils.batch_eval_recall( 182 | sess, dropout_net.eval_preds_cold, 183 | eval_feed_dict=dropout_net.get_eval_dict, 
def load_data(data_path):
    """
    load everything main() needs from the downloaded data folder

    Reads the latent factors, the user/item content features, the train
    triplets and the three evaluation splits, printing a timing line after
    each stage.

    :param data_path: root folder of the downloaded data
    :return: dict with keys u_pref, v_pref, u_pref_scaled, v_pref_scaled,
        user_content, item_content, user_indices, eval_warm, eval_cold_user
        and eval_cold_item
    """
    timer = utils.timer(name='main').tic()
    split_folder = os.path.join(data_path, 'warm')

    # input locations
    u_file = os.path.join(data_path, 'trained/warm/U.csv.bin')
    v_file = os.path.join(data_path, 'trained/warm/V.csv.bin')
    user_content_file = os.path.join(data_path, 'user_features_0based.txt')
    item_content_file = os.path.join(data_path, 'item_features_0based.txt')
    train_file = os.path.join(split_folder, 'train.csv')
    # (result key, triplet file, item-id file, is cold split)
    eval_splits = (
        ('eval_warm', 'test_warm.csv', 'test_warm_item_ids.csv', False),
        ('eval_cold_user', 'test_cold_user.csv', 'test_cold_user_item_ids.csv', True),
        ('eval_cold_item', 'test_cold_item.csv', 'test_cold_item_item_ids.csv', True),
    )
    triplet_dtype = [('uid', np.int32), ('iid', np.int32), ('inter', np.int32), ('date', np.int32)]

    dat = {}

    # latent preference factors, stored as raw float32 with 200 columns
    timer.tic()
    u_pref = np.fromfile(u_file, dtype=np.float32).reshape(n_users, 200)
    v_pref = np.fromfile(v_file, dtype=np.float32).reshape(n_items, 200)
    dat['u_pref'], dat['v_pref'] = u_pref, v_pref
    timer.toc('loaded U:%s,V:%s' % (str(u_pref.shape), str(v_pref.shape))).tic()

    # standardized copies, the fitted scalers are discarded
    for key, pref in (('u_pref_scaled', u_pref), ('v_pref_scaled', v_pref)):
        _, dat[key] = utils.prep_standardize(pref)
    timer.toc('standardized U,V').tic()

    # content features in svmlight format
    timer.tic()
    user_content, _ = datasets.load_svmlight_file(user_content_file, zero_based=True, dtype=np.float32)
    dat['user_content'] = user_content.tolil(copy=False)
    timer.toc('loaded user feature sparse matrix: %s' % (str(user_content.shape))).tic()
    item_content, _ = datasets.load_svmlight_file(item_content_file, zero_based=True, dtype=np.float32)
    dat['item_content'] = item_content.tolil(copy=False)
    timer.toc('loaded item feature sparse matrix: %s' % (str(item_content.shape))).tic()

    # train triplets, viewed as a structured array
    timer.tic()
    raw_train = pd.read_csv(train_file, delimiter=",", header=None, dtype=np.int32)
    train = raw_train.values.ravel().view(dtype=triplet_dtype)
    dat['user_indices'] = np.unique(train['uid'])
    timer.toc('read train triplets %s' % train.shape).tic()

    # evaluation splits, all checked against the same train triplets
    for key, triplet_name, iid_name, is_cold in eval_splits:
        dat[key] = data.load_eval_data(os.path.join(split_folder, triplet_name),
                                       os.path.join(split_folder, iid_name),
                                       name=key,
                                       cold=is_cold,
                                       train_data=train)
    return dat
class DeepCF:
    """
    main model class implementing DeepCF
    also stores states for fast candidate generation

    latent_rank_in: rank of preference model input
    user_content_rank: rank of user content input (0 disables the user content input)
    item_content_rank: rank of item content input (0 disables the item content input)
    model_select: array of number of hidden unit,
        i.e. [200,100] indicate two hidden layer with 200 units followed by 100 units
    rank_out: rank of latent model output

    Usage order: construct, then build_model(), then build_predictor();
    the get_eval_dict* helpers produce feed dicts for the resulting graph.
    """

    def __init__(self, latent_rank_in, user_content_rank, item_content_rank, model_select, rank_out):
        """Store the hyper-parameters; every graph handle starts as None until built."""

        self.rank_in = latent_rank_in
        self.phi_u_dim = user_content_rank
        self.phi_v_dim = item_content_rank
        self.model_select = model_select
        self.rank_out = rank_out

        # inputs (placeholders created by build_model / build_predictor)
        self.Uin = None
        self.Vin = None
        self.Ucontent = None  # stays None when phi_u_dim == 0
        self.Vcontent = None  # stays None when phi_v_dim == 0
        self.phase = None
        self.target = None
        self.eval_trainR = None
        self.U_pref_tf = None
        self.V_pref_tf = None
        self.rand_target_ui = None

        # outputs in the model
        self.preds = None
        self.updates = None
        self.loss = None

        self.U_embedding = None
        self.V_embedding = None

        self.lr_placeholder = None

        # predictor
        self.tf_topk_vals = None
        self.tf_topk_inds = None
        self.preds_random = None
        self.tf_latent_topk_cold = None
        self.tf_latent_topk_warm = None
        self.eval_preds_warm = None
        self.eval_preds_cold = None

    def build_model(self):
        """
        set up tf components for main DeepCF net
        call after setting up desired tf state (cpu/gpu etc...)

        Creates the input placeholders, one tower of `dense_batch_fc_tanh`
        blocks per side (user / item), a linear embedding on top of each
        tower, the dot-product prediction, the mean-squared-error loss and
        the momentum-SGD training op (`self.updates`).

        Note: should use GPU
        """
        self.lr_placeholder = tf.compat.v1.placeholder(tf.float32, shape=[], name='learn_rate')
        self.phase = tf.compat.v1.placeholder(tf.bool, name='phase')
        self.target = tf.compat.v1.placeholder(tf.float32, shape=[None], name='target')

        self.Uin = tf.compat.v1.placeholder(tf.float32, shape=[None, self.rank_in], name='U_in_raw')
        self.Vin = tf.compat.v1.placeholder(tf.float32, shape=[None, self.rank_in], name='V_in_raw')

        # content inputs are optional: a rank of 0 means "preference input only"
        if self.phi_u_dim > 0:
            self.Ucontent = tf.compat.v1.placeholder(tf.float32, shape=[None, self.phi_u_dim], name='U_content')
            u_concat = tf.concat([self.Uin, self.Ucontent], 1)
        else:
            u_concat = self.Uin

        if self.phi_v_dim > 0:
            self.Vcontent = tf.compat.v1.placeholder(tf.float32, shape=[None, self.phi_v_dim], name='V_content')
            v_concat = tf.concat([self.Vin, self.Vcontent], 1)
        else:
            v_concat = self.Vin

        print('\tu_concat.shape=%s' % str(u_concat.get_shape()))
        print('\tv_concat.shape=%s' % str(v_concat.get_shape()))

        # both towers share the same layer sizes but have separate variables
        u_last = u_concat
        v_last = v_concat
        for ihid, hid in enumerate(self.model_select):
            u_last = dense_batch_fc_tanh(u_last, hid, self.phase, 'user_layer_%d' % (ihid + 1), do_norm=True)
            v_last = dense_batch_fc_tanh(v_last, hid, self.phase, 'item_layer_%d' % (ihid + 1), do_norm=True)

        # NOTE(review): the scope name "self.U_embedding" looks like a
        # search-and-replace accident (the item side uses "V_embedding").
        # It is kept verbatim because it is part of the variable names and
        # renaming it would invalidate existing checkpoints.
        with tf.compat.v1.variable_scope("self.U_embedding"):
            u_emb_w = tf.Variable(
                tf.random.truncated_normal([u_last.get_shape().as_list()[1], self.rank_out], stddev=0.01),
                name='u_emb_w')
            u_emb_b = tf.Variable(tf.zeros([1, self.rank_out]), name='u_emb_b')
            self.U_embedding = tf.matmul(u_last, u_emb_w) + u_emb_b

        with tf.compat.v1.variable_scope("V_embedding"):
            v_emb_w = tf.Variable(
                tf.random.truncated_normal([v_last.get_shape().as_list()[1], self.rank_out], stddev=0.01),
                name='v_emb_w')
            v_emb_b = tf.Variable(tf.zeros([1, self.rank_out]), name='v_emb_b')
            self.V_embedding = tf.matmul(v_last, v_emb_w) + v_emb_b

        with tf.compat.v1.variable_scope("loss"):
            # row-wise dot product between paired user / item embeddings
            preds = tf.multiply(self.U_embedding, self.V_embedding)
            self.preds = tf.reduce_sum(preds, 1)
            self.loss = tf.reduce_mean(tf.math.squared_difference(self.preds, self.target))

        update_ops = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            # Ensures that we execute the update_ops before performing the train_step
            self.updates = tf.compat.v1.train.MomentumOptimizer(self.lr_placeholder, 0.9).minimize(self.loss)

    def build_predictor(self, recall_at, num_candidates):
        """
        set up tf components for prediction and target selection
        call after setting up desired tf state (cpu/gpu etc...)
        requires build_model() to have run (uses U_embedding / V_embedding)

        Note: should use CPU, as large inputs are expected

        :param recall_at: truncation to compute recall; only the last element
                          is used here, as the k of every top-k op
        :param num_candidates: number of candidates (k of the target-selection top-k)
        :return: None, results are stored on the instance
        """
        self.eval_trainR = tf.compat.v1.sparse_placeholder(
            dtype=tf.float32, shape=[None, None], name='trainR_sparse_CPU')

        with tf.compat.v1.variable_scope("eval"):
            # all-pairs scores: every user in the batch against every item fed in
            embedding_prod_cold = tf.matmul(self.U_embedding, self.V_embedding, transpose_b=True, name='pred_all_items')
            # warm evaluation adds the sparse train matrix to the scores
            # (presumably large negative entries masking train items -- confirm
            # against the code that builds tf_eval_train)
            embedding_prod_warm = tf.compat.v1.sparse_add(embedding_prod_cold, self.eval_trainR)
            _, self.eval_preds_cold = tf.nn.top_k(embedding_prod_cold, k=recall_at[-1], sorted=True,
                                                  name='topK_net_cold')
            _, self.eval_preds_warm = tf.nn.top_k(embedding_prod_warm, k=recall_at[-1], sorted=True,
                                                  name='topK_net_warm')
        with tf.compat.v1.variable_scope("select_targets"):
            self.U_pref_tf = tf.compat.v1.placeholder(tf.float32, shape=[None, self.rank_in], name='u_pref')
            self.V_pref_tf = tf.compat.v1.placeholder(tf.float32, shape=[None, self.rank_in], name='v_pref')
            self.rand_target_ui = tf.compat.v1.placeholder(tf.int32, shape=[None, None], name='rand_target_ui')
            preds_pref = tf.matmul(self.U_pref_tf, self.V_pref_tf, transpose_b=True)
            tf_topk_vals, tf_topk_inds = tf.nn.top_k(preds_pref, k=num_candidates, sorted=True, name='top_targets')
            self.tf_topk_vals = tf.reshape(tf_topk_vals, [-1], name='select_y_vals')
            self.tf_topk_inds = tf.reshape(tf_topk_inds, [-1], name='select_y_inds')
            # scores at the (user, item) index pairs supplied through rand_target_ui
            preds_random = tf.gather_nd(preds_pref, self.rand_target_ui)
            self.preds_random = tf.reshape(preds_random, [-1], name='random_y_inds')

        # tf matmul-topk to get eval on latent
        with tf.compat.v1.variable_scope("latent_eval"):
            preds_pref_latent_warm = tf.compat.v1.sparse_add(preds_pref, self.eval_trainR)
            _, self.tf_latent_topk_cold = tf.nn.top_k(preds_pref, k=recall_at[-1], sorted=True, name='topK_latent_cold')
            _, self.tf_latent_topk_warm = tf.nn.top_k(preds_pref_latent_warm, k=recall_at[-1], sorted=True,
                                                      name='topK_latent_warm')

    def get_eval_dict(self, _i, _eval_start, _eval_finish, eval_data):
        """
        packaging method to iterate evaluation data, select from start:finish
        should be passed directly to batch method

        Content placeholders that were never created (rank 0 in the
        constructor) are left out of the dict, so a preference-only model
        does not end up with a ``None`` feed key.

        :param _i: slice id
        :param _eval_start: integer beginning of slice
        :param _eval_finish: integer end of slice
        :param eval_data: package EvalData obj
        :return: feed dict mapping placeholders to the data of this slice
        """
        _eval_dict = {
            # users are sliced per batch, items are always fed in full
            self.Uin: eval_data.U_pref_test[_eval_start:_eval_finish, :],
            self.Vin: eval_data.V_pref_test,
            self.phase: 0
        }
        # identity checks: `!=` on a tensor may be overloaded element-wise
        if self.Vcontent is not None:
            _eval_dict[self.Vcontent] = eval_data.V_content_test
        if self.Ucontent is not None:
            _eval_dict[self.Ucontent] = eval_data.U_content_test[_eval_start:_eval_finish, :]
        if not eval_data.is_cold:
            # warm evaluation also needs this batch's sparse train matrix
            _eval_dict[self.eval_trainR] = eval_data.tf_eval_train[_i]
        return _eval_dict

    def get_eval_dict_latent(self, _i, _eval_start, _eval_finish, eval_data, u_pref, v_pref):
        """
        packaging method to iterate evaluation data, select from start:finish
        uses preference input
        should be passed directly to batch method

        :param _i: slice id
        :param _eval_start: integer beginning of slice
        :param _eval_finish: integer end of slice
        :param eval_data: package EvalData obj
        :param u_pref: user latent input to slice
        :param v_pref: item latent input to slice
        :return: feed dict for the latent-only (select_targets / latent_eval) ops
        """
        _eval_dict = {
            # rows are picked by original ids stored on eval_data
            self.U_pref_tf: u_pref[eval_data.test_user_ids[_eval_start:_eval_finish], :],
            self.V_pref_tf: v_pref[eval_data.test_item_ids, :]
        }
        if not eval_data.is_cold:
            _eval_dict[self.eval_trainR] = eval_data.tf_eval_train[_i]
        return _eval_dict
def batch(iterable, _n=1, drop=True):
    """
    returns batched version of some iterable

    Every full batch of ``_n`` elements is yielded, including the last one
    when ``_n`` divides the length evenly. Only a shorter trailing remainder
    is subject to ``drop``.

    :param iterable: sliceable sequence with a length (list, numpy array, ...)
    :param _n: batch size
    :param drop: if true, drop extra if batch size does not divide evenly,
        otherwise keep them (last batch might be shorter)
    :return: generator over consecutive slices of iterable
    """
    it_len = len(iterable)
    for ndx in range(0, it_len, _n):
        # `<=` (not `<`): a batch ending exactly at the end is still a full
        # batch and must not be treated as a remainder.
        if ndx + _n <= it_len:
            yield iterable[ndx:ndx + _n]
        elif not drop:
            yield iterable[ndx:it_len]
dense input and compute standardized version 109 | 110 | Note: 111 | cap at 5 std 112 | 113 | :param x: 2D numpy data array to standardize (column-wise) 114 | :return: the object to perform scale (stores mean/std) for inference, as well as the scaled x 115 | """ 116 | scaler = prep.StandardScaler().fit(x) 117 | x_scaled = scaler.transform(x) 118 | x_scaled[x_scaled > 5] = 5 119 | x_scaled[x_scaled < -5] = -5 120 | x_scaled[np.absolute(x_scaled) < 1e-5] = 0 121 | return scaler, x_scaled 122 | 123 | 124 | def batch_eval_recall(_sess, tf_eval, eval_feed_dict, recall_k, eval_data): 125 | """ 126 | given EvalData and DropoutNet compute graph in TensorFlow, runs batch evaluation 127 | 128 | :param _sess: tf session 129 | :param tf_eval: the evaluate output symbol in tf 130 | :param eval_feed_dict: method to parse tf, pick from EvalData method 131 | :param recall_k: list of thresholds to compute recall at (information retrieval recall) 132 | :param eval_data: EvalData instance 133 | :return: recall array at thresholds matching recall_k 134 | """ 135 | tf_eval_preds_batch = [] 136 | for (batch, (eval_start, eval_stop)) in enumerate(eval_data.eval_batch): 137 | tf_eval_preds = _sess.run(tf_eval, 138 | feed_dict=eval_feed_dict( 139 | batch, eval_start, eval_stop, eval_data)) 140 | tf_eval_preds_batch.append(tf_eval_preds) 141 | tf_eval_preds = np.concatenate(tf_eval_preds_batch) 142 | tf.compat.v1.local_variables_initializer().run() 143 | 144 | # filter non-zero targets 145 | y_nz = [len(x) > 0 for x in eval_data.R_test_inf.rows] 146 | y_nz = np.arange(len(eval_data.R_test_inf.rows))[y_nz] 147 | 148 | preds_all = tf_eval_preds[y_nz, :] 149 | 150 | recall = [] 151 | for at_k in recall_k: 152 | preds_k = preds_all[:, :at_k] 153 | y = eval_data.R_test_inf[y_nz, :] 154 | 155 | x = scipy.sparse.lil_matrix(y.shape) 156 | # x.rows = preds_k 157 | # x.data = np.ones_like(preds_k) 158 | x.data = np.array([z.tolist() for z in np.ones_like(preds_k)]+[[]])[:-1] 159 | x.rows = 
class EvalData:
    """
    EvalData:
        EvalData packages test triplet (user, item, score) into appropriate formats for evaluation

        Compact Indices:
            Specifically, this builds compact indices and stores mapping between original and compact indices.
            Compact indices only contains:
                1) items in test set
                2) users who interacted with such test items
            These compact indices speed up testing significantly by ignoring irrelevant users or items

    Args:
        test_triplets(int triplets): user-item-interaction_value triplet to build the test data
            (structured array with at least a 'uid' field; field 0 is the user, field 1 the item)
        test_item_ids(list of int): original ids of the items to evaluate on
        is_cold(boolean): whether test data is used for cold start problem
        train(int triplets): user-item-interaction_value triplet from train data

    Attributes:
        is_cold(boolean): whether test data is used for cold start problem
        test_item_ids(list of int): maps compressed item ids to original item ids (via position)
        test_item_ids_map(dictionary of int->int): maps original item ids to compressed item ids
        test_user_ids(list of int): maps compressed user ids to original user ids (via position)
        test_user_ids_map(dictionary of int->int): maps original user ids to compressed user ids
        R_test_inf(scipy lil matrix): pre-built compressed test matrix
        R_train_inf(scipy lil matrix): pre-built compressed train matrix for testing
            (None for cold data)

        the remaining fields are filled by init_tf

    Raises:
        Exception: if is_cold disagrees with the data, i.e. cold data whose
            test users/items appear in train, or warm data where none do
    """

    def __init__(self, test_triplets, test_item_ids, is_cold, train):
        # build map both-ways between compact and original indices
        # compact indices only contains:
        #  1) items in test set
        #  2) users who interacted with such test items

        self.is_cold = is_cold

        self.test_item_ids = test_item_ids
        # test_item_ids_map
        self.test_item_ids_map = {iid: i for i, iid in enumerate(self.test_item_ids)}

        _test_ij_for_inf = [(t[0], t[1]) for t in test_triplets if t[1] in self.test_item_ids_map]
        # test_user_ids
        self.test_user_ids = np.unique(test_triplets['uid'])
        # test_user_ids_map
        self.test_user_ids_map = {user_id: i for i, user_id in enumerate(self.test_user_ids)}

        _test_i_for_inf = [self.test_user_ids_map[_t[0]] for _t in _test_ij_for_inf]
        _test_j_for_inf = [self.test_item_ids_map[_t[1]] for _t in _test_ij_for_inf]
        self.R_test_inf = scipy.sparse.coo_matrix(
            (np.ones(len(_test_i_for_inf)),
             (_test_i_for_inf, _test_j_for_inf)),
            shape=(len(self.test_user_ids), len(self.test_item_ids))
        ).tolil(copy=False)

        # train interactions restricted to (and re-indexed by) the test users/items
        train_ij_for_inf = [(self.test_user_ids_map[_t[0]], self.test_item_ids_map[_t[1]]) for _t
                            in train
                            if _t[1] in self.test_item_ids_map and _t[0] in self.test_user_ids_map]
        if self.is_cold and len(train_ij_for_inf) != 0:
            raise Exception('using cold dataset, but data is not cold!')
        if not self.is_cold and len(train_ij_for_inf) == 0:
            raise Exception('using warm datset, but data is not warm!')

        if self.is_cold:
            self.R_train_inf = None
        else:
            # Materialise the coordinates: zip() is a one-shot iterator and the
            # sparse constructor may need to walk the coordinates more than
            # once. The list is non-empty here (checked above), so the
            # unpacking is safe.
            train_i, train_j = zip(*train_ij_for_inf)
            self.R_train_inf = scipy.sparse.coo_matrix(
                (np.ones(len(train_ij_for_inf)), (train_i, train_j)),
                shape=self.R_test_inf.shape
            ).tolil(copy=False)

        # allocate fields
        self.U_pref_test = None
        self.V_pref_test = None
        self.V_content_test = None
        self.U_content_test = None
        self.tf_eval_train = None
        self.tf_eval_test = None
        self.eval_batch = None

    def init_tf(self, user_factors, item_factors, user_content, item_content, eval_run_batchsize):
        """
        Slice the global inputs down to the test users/items and pre-build the
        per-batch evaluation state.

        :param user_factors: 2D array of user preference factors, indexed by original user id
        :param item_factors: 2D array of item preference factors, indexed by original item id
        :param user_content: user content matrix (dense or scipy sparse), or None to skip it
        :param item_content: item content matrix (dense or scipy sparse)
        :param eval_run_batchsize: number of test users per evaluation batch
        :return: None; sets U_pref_test, V_pref_test, U_content_test,
                 V_content_test, eval_batch, tf_eval_train and tf_eval_test
        """
        self.U_pref_test = user_factors[self.test_user_ids, :]
        self.V_pref_test = item_factors[self.test_item_ids, :]
        self.V_content_test = item_content[self.test_item_ids, :]
        if scipy.sparse.issparse(self.V_content_test):
            self.V_content_test = self.V_content_test.todense()
        # Identity check: `!= None` is element-wise on dense arrays (making the
        # `if` raise) and triggers a pointless comparison on sparse matrices.
        if user_content is not None:
            self.U_content_test = user_content[self.test_user_ids, :]
            if scipy.sparse.issparse(self.U_content_test):
                self.U_content_test = self.U_content_test.todense()
        eval_l = self.R_test_inf.shape[0]
        # half-open (start, stop) user ranges; the last one may be shorter
        self.eval_batch = [(x, min(x + eval_run_batchsize, eval_l)) for x
                           in range(0, eval_l, eval_run_batchsize)]

        self.tf_eval_train = []
        self.tf_eval_test = []

        if not self.is_cold:
            for (eval_start, eval_finish) in self.eval_batch:
                _ui = self.R_train_inf[eval_start:eval_finish, :].tocoo()
                # Stack to shape (2, nnz) so that a batch without any train
                # interaction still yields valid 2D (empty) indices.
                indices = np.vstack([_ui.row, _ui.col]).astype(np.int64)
                self.tf_eval_train.append(
                    torch.sparse_coo_tensor(
                        indices=torch.from_numpy(indices),
                        # one large negative value per train interaction
                        values=torch.full((indices.shape[1],), -100000, dtype=torch.float32),
                        size=[eval_finish - eval_start, self.R_train_inf.shape[1]]
                    )
                )

    def get_stats_string(self):
        """Return a multi-line summary: user/item counts and shape/nnz of the train and test matrices."""
        return ('\tn_test_users:[%d]\n\tn_test_items:[%d]' % (len(self.test_user_ids), len(self.test_item_ids))
                + '\n\tR_train_inf: %s' % (
                    'no R_train_inf for cold' if self.is_cold else 'shape=%s nnz=[%d]' % (
                        str(self.R_train_inf.shape), len(self.R_train_inf.nonzero()[0])
                    )
                )
                + '\n\tR_test_inf: shape=%s nnz=[%d]' % (
                    str(self.R_test_inf.shape), len(self.R_test_inf.nonzero()[0])
                ))
in model_select) if model_select else 'simple' 42 | ) 43 | print('running: ' + experiment) 44 | 45 | dat = load_data(data_path) 46 | u_pref_scaled = dat['u_pref_scaled'] 47 | v_pref_scaled = dat['v_pref_scaled'] 48 | eval_warm = dat['eval_warm'] 49 | eval_cold_user = dat['eval_cold_user'] 50 | eval_cold_item = dat['eval_cold_item'] 51 | user_content = dat['user_content'] 52 | item_content = dat['item_content'] 53 | u_pref = dat['u_pref'] 54 | v_pref = dat['v_pref'] 55 | user_indices = dat['user_indices'] 56 | 57 | timer = utils.timer(name='main').tic() 58 | 59 | # append pref factors for faster dropout 60 | v_pref_expanded = np.vstack([v_pref_scaled, np.zeros_like(v_pref_scaled[0, :])]) 61 | v_pref_last = v_pref_scaled.shape[0] 62 | u_pref_expanded = np.vstack([u_pref_scaled, np.zeros_like(u_pref_scaled[0, :])]) 63 | u_pref_last = u_pref_scaled.shape[0] 64 | timer.toc('initialized numpy data') 65 | 66 | # prep eval 67 | eval_batch_size = eval_batch_size 68 | timer.tic() 69 | eval_warm.init_tf(u_pref_scaled, v_pref_scaled, user_content, item_content, eval_batch_size) 70 | timer.toc('initialized eval_warm').tic() 71 | eval_cold_user.init_tf(u_pref_scaled, v_pref_scaled, user_content, item_content, eval_batch_size) 72 | timer.toc('initialized eval_cold_user').tic() 73 | eval_cold_item.init_tf(u_pref_scaled, v_pref_scaled, user_content, item_content, eval_batch_size) 74 | timer.toc('initialized eval_cold_item').tic() 75 | 76 | dropout_net = model.get_model(latent_rank_in=u_pref.shape[1], 77 | user_content_rank=user_content.shape[1], 78 | item_content_rank=item_content.shape[1], 79 | model_select=model_select, 80 | rank_out=rank_out) 81 | 82 | row_index = np.copy(user_indices) 83 | n_step = 0 84 | best_cold_user = 0 85 | best_cold_item = 0 86 | best_warm = 0 87 | n_batch_trained = 0 88 | best_step = 0 89 | optimizer = torch.optim.SGD(dropout_net.parameters(), args.lr, momentum=0.9) 90 | crit = torch.nn.MSELoss() 91 | d_train = torch.device(args.model_device) 92 | d_eval 
= torch.device(args.inf_device) 93 | 94 | scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=_decay_lr_every, gamma=_lr_decay) 95 | dropout_net.to(d_train) 96 | dropout_net.train() 97 | 98 | for epoch in range(num_epoch): 99 | np.random.shuffle(row_index) 100 | for b in utils.batch(row_index, user_batch_size): 101 | n_step += 1 102 | # prep targets 103 | target_users = np.repeat(b, n_scores_user) 104 | target_users_rand = np.repeat(np.arange(len(b)), n_scores_user) 105 | target_items_rand = [np.random.choice(v_pref.shape[0], n_scores_user) for _ in b] 106 | target_items_rand = np.array(target_items_rand).flatten() 107 | target_ui_rand = np.transpose(np.vstack([target_users_rand, target_items_rand])) 108 | 109 | preds_pref = np.matmul(u_pref[b, :], v_pref.T) 110 | preds_pref = torch.tensor(preds_pref) 111 | target_scores, target_items = torch.topk(preds_pref, k=n_scores_user, sorted=True) 112 | random_scores = preds_pref.detach().cpu().numpy()[target_ui_rand[:,0],target_ui_rand[:,1]] 113 | 114 | 115 | # merge topN and randomN items per user 116 | target_scores = np.append(target_scores, random_scores) 117 | target_items = np.append(target_items, target_items_rand) 118 | target_users = np.append(target_users, target_users) 119 | 120 | n_targets = len(target_scores) 121 | perm = np.random.permutation(n_targets) 122 | n_targets = min(n_targets, max_data_per_step) 123 | data_batch = [(n, min(n + data_batch_size, n_targets)) for n in range(0, n_targets, data_batch_size)] 124 | f_batch = 0 125 | pbar = tqdm(data_batch, desc='ubatch') 126 | 127 | for (start, stop) in pbar: 128 | batch_perm = perm[start:stop] 129 | batch_users = target_users[batch_perm] 130 | batch_items = target_items[batch_perm] 131 | if dropout != 0: 132 | n_to_drop = int(np.floor(dropout * len(batch_perm))) 133 | perm_user = np.random.permutation(len(batch_perm))[:n_to_drop] 134 | perm_item = np.random.permutation(len(batch_perm))[:n_to_drop] 135 | batch_v_pref = np.copy(batch_items) 136 | 
batch_u_pref = np.copy(batch_users) 137 | batch_v_pref[perm_user] = v_pref_last 138 | batch_u_pref[perm_item] = u_pref_last 139 | else: 140 | batch_v_pref = batch_items 141 | batch_u_pref = batch_users 142 | 143 | Uin = u_pref_expanded[batch_u_pref, :] 144 | Vin = v_pref_expanded[batch_v_pref, :] 145 | Ucontent = user_content[batch_users, :].todense() 146 | Vcontent = item_content[batch_items, :].todense() 147 | targets = target_scores[batch_perm] 148 | 149 | Uin = torch.tensor(Uin).to(d_train) 150 | Vin = torch.tensor(Vin).to(d_train) 151 | Ucontent = torch.tensor(Ucontent).to(d_train) 152 | Vcontent = torch.tensor(Vcontent).to(d_train) 153 | targets = torch.tensor(targets).to(d_train) 154 | 155 | preds, U_embedding, V_embedding = dropout_net.forward(Uin, Vin, Ucontent, Vcontent) 156 | loss = crit(preds, targets) 157 | loss_out = loss.item() 158 | 159 | optimizer.zero_grad() 160 | loss.backward() 161 | optimizer.step() 162 | f_batch += loss_out 163 | if np.isnan(f_batch): 164 | raise Exception('f is nan') 165 | n_batch_trained += 1 166 | pbar.set_description(f'updates={n_batch_trained/1000:.0f}k f={loss_out:.4f} f_tot={f_batch:.2f}') 167 | # step after every ubatch, decay is based on # of ubatch 168 | scheduler.step() 169 | 170 | if n_step % eval_every == 0: 171 | dropout_net.to(d_eval) 172 | dropout_net.eval() 173 | 174 | recall_warm = dropout_net.evaluate(recall_k=recall_at, eval_data=eval_warm, device=d_eval) 175 | recall_cold_user = dropout_net.evaluate(recall_k=recall_at, eval_data=eval_cold_user, device=d_eval) 176 | recall_cold_item = dropout_net.evaluate(recall_k=recall_at, eval_data=eval_cold_item, device=d_eval) 177 | 178 | dropout_net.to(d_train) 179 | dropout_net.train() 180 | 181 | # checkpoint 182 | agg_cur = np.sum(recall_warm + recall_cold_user + recall_cold_item) 183 | agg_best = np.sum(best_warm + best_cold_user + best_cold_item) 184 | if agg_cur > agg_best: 185 | best_cold_user = recall_cold_user 186 | best_cold_item = recall_cold_item 187 | 
def load_data(data_path):
    """
    Read every input of the demo from ``data_path`` and return them in a dict.

    Keys of the returned dict, in insertion order:
        u_pref, v_pref                 raw float32 factor matrices read from the .bin files
        u_pref_scaled, v_pref_scaled   second output of utils.prep_standardize on the above
        user_content, item_content     svmlight feature files converted to lil matrices
        user_indices                   unique user ids found in the train split
        eval_warm, eval_cold_user, eval_cold_item
                                       objects returned by data.load_eval_data

    :param data_path: root folder of the downloaded data
    :return: dict as described above
    """
    timer = utils.timer(name='main').tic()
    split_folder = os.path.join(data_path, 'warm')

    def in_split(filename):
        # all split files live side by side in <data_path>/warm
        return os.path.join(split_folder, filename)

    factor_files = (
        os.path.join(data_path, 'trained/warm/U.csv.bin'),
        os.path.join(data_path, 'trained/warm/V.csv.bin'),
    )
    content_files = (
        os.path.join(data_path, 'user_features_0based.txt'),
        os.path.join(data_path, 'item_features_0based.txt'),
    )
    # (result key, triplet file, item-id file, cold flag), in load order
    eval_specs = (
        ('eval_warm', in_split('test_warm.csv'), in_split('test_warm_item_ids.csv'), False),
        ('eval_cold_user', in_split('test_cold_user.csv'), in_split('test_cold_user_item_ids.csv'), True),
        ('eval_cold_item', in_split('test_cold_item.csv'), in_split('test_cold_item_item_ids.csv'), True),
    )
    # every row of the train csv is reinterpreted as one 4-field int32 record
    triplet_dtype = [('uid', np.int32), ('iid', np.int32), ('inter', np.int32), ('date', np.int32)]

    dat = {}

    # load preference data
    timer.tic()
    raw_u = np.fromfile(factor_files[0], dtype=np.float32).reshape(n_users, 200)
    raw_v = np.fromfile(factor_files[1], dtype=np.float32).reshape(n_items, 200)
    dat['u_pref'], dat['v_pref'] = raw_u, raw_v
    timer.toc('loaded U:%s,V:%s' % (str(raw_u.shape), str(raw_v.shape))).tic()

    # pre-process: only the scaled copies are kept, the scaler objects are dropped
    dat['u_pref_scaled'] = utils.prep_standardize(raw_u)[1]
    dat['v_pref_scaled'] = utils.prep_standardize(raw_v)[1]
    timer.toc('standardized U,V').tic()

    # load content data
    timer.tic()
    user_features = datasets.load_svmlight_file(content_files[0], zero_based=True, dtype=np.float32)[0]
    dat['user_content'] = user_features.tolil(copy=False)
    timer.toc('loaded user feature sparse matrix: %s' % (str(user_features.shape))).tic()
    item_features = datasets.load_svmlight_file(content_files[1], zero_based=True, dtype=np.float32)[0]
    dat['item_content'] = item_features.tolil(copy=False)
    timer.toc('loaded item feature sparse matrix: %s' % (str(item_features.shape))).tic()

    # load split
    timer.tic()
    train_frame = pd.read_csv(in_split('train.csv'), delimiter=",", header=None, dtype=np.int32)
    train = train_frame.values.ravel().view(dtype=triplet_dtype)
    dat['user_indices'] = np.unique(train['uid'])
    timer.toc('read train triplets %s' % train.shape).tic()

    for key, triplet_file, item_id_file, is_cold in eval_specs:
        dat[key] = data.load_eval_data(triplet_file, item_id_file, name=key, cold=is_cold,
                                       train_data=train)
    return dat
def truncated_normal_(tensor, mean=0, std=1):
    """
    Fill ``tensor`` in place with samples from a truncated normal distribution.

    Samples are drawn from a standard normal restricted to [-2, 2], then
    scaled by ``std`` and shifted by ``mean``, so every entry lies within two
    standard deviations of ``mean``.

    :param tensor: floating-point torch tensor to overwrite
    :param mean: centre of the distribution
    :param std: standard deviation of the underlying (untruncated) normal
    :return: None, ``tensor`` is modified in place
    """
    # The built-in initializer guarantees the bounds. Selecting the first
    # in-range draw among a fixed number of candidates would let an
    # out-of-range value through whenever every candidate misses.
    nn.init.trunc_normal_(tensor, mean=0.0, std=1.0, a=-2.0, b=2.0)
    # scale/shift without recording autograd history, so this is safe on
    # parameters that require grad
    with torch.no_grad():
        tensor.mul_(std).add_(mean)


@torch.no_grad()
def init_weights(net):
    """
    Initialise one module; meant to be passed to ``nn.Module.apply``.

    Linear layers (including subclasses of ``nn.Linear``) get truncated-normal
    weights with std 0.01 and a zero bias. Every other module type is left
    untouched.

    :param net: module visited by ``apply``
    """
    if isinstance(net, nn.Linear):
        truncated_normal_(net.weight, std=0.01)
        if net.bias is not None:
            nn.init.constant_(net.bias, 0)


def get_model(latent_rank_in, user_content_rank, item_content_rank, model_select, rank_out):
    """
    Build a DeepCF model and initialise its linear layers with ``init_weights``.

    :param latent_rank_in: rank of preference model input
    :param user_content_rank: rank of user content input
    :param item_content_rank: rank of item content input
    :param model_select: list with the number of hidden units per layer
    :param rank_out: rank of latent model output
    :return: the initialised DeepCF instance
    """
    model = DeepCF(latent_rank_in, user_content_rank, item_content_rank, model_select, rank_out)
    model.apply(init_weights)
    return model
class TanHBlock(nn.Module):
    """Linear layer followed by batch normalization and a tanh activation."""

    def __init__(self, dim_in, dim_out):
        """
        :param dim_in: number of input features
        :param dim_out: number of output features
        """
        super().__init__()
        self.layer = nn.Linear(dim_in, dim_out)
        self.bn = nn.BatchNorm1d(
            num_features=dim_out,
            momentum=0.01,
            eps=0.001
        )

    def forward(self, x):
        """Apply linear -> batch norm -> tanh to ``x``."""
        out = self.layer(x)
        out = self.bn(out)
        out = torch.tanh(out)
        return out


class DeepCF(nn.Module):
    """
    main model class implementing DeepCF
    also stores states for fast candidate generation
    latent_rank_in: rank of preference model input
    user_content_rank: rank of user content input
    item_content_rank: rank of item content input
    model_select: list of number of hidden units,
        i.e. [200,100] indicate two hidden layer with 200 units followed by 100 units
    rank_out: rank of latent model output
    """

    def __init__(self, latent_rank_in, user_content_rank, item_content_rank, model_select, rank_out):
        super().__init__()
        self.rank_in = latent_rank_in
        self.phi_u_dim = user_content_rank
        self.phi_v_dim = item_content_rank
        self.model_select = model_select
        self.rank_out = rank_out

        # inputs
        self.phase = None
        self.target = None
        self.eval_trainR = None
        self.U_pref_tf = None
        self.V_pref_tf = None
        self.rand_target_ui = None

        # outputs in the model
        self.updates = None

        # predictor
        self.tf_topk_vals = None
        self.tf_topk_inds = None
        self.preds_random = None
        self.tf_latent_topk_cold = None
        self.tf_latent_topk_warm = None
        self.eval_preds_warm = None
        self.eval_preds_cold = None

        # content features are concatenated to the preference input only when present
        u_dim = self.rank_in + self.phi_u_dim if self.phi_u_dim > 0 else self.rank_in
        v_dim = self.rank_in + self.phi_v_dim if self.phi_v_dim > 0 else self.rank_in

        print ('\tu_concat rank=%s' % str(u_dim))
        print ('\tv_concat rank=%s' % str(v_dim))

        # user and item towers share the hidden layout but not the weights
        u_dims = [u_dim] + self.model_select
        v_dims = [v_dim] + self.model_select
        self.u_layers = nn.ModuleList(TanHBlock(u_dims[i], u_dims[i + 1]) for i in range(len(u_dims) - 1))
        self.v_layers = nn.ModuleList(TanHBlock(v_dims[i], v_dims[i + 1]) for i in range(len(v_dims) - 1))

        self.u_emb = nn.Linear(u_dims[-1], self.rank_out)
        self.v_emb = nn.Linear(v_dims[-1], self.rank_out)

    def encode(self, Uin, Vin, Ucontent, Vcontent):
        """
        Run the user and item towers.

        :param Uin: user preference input
        :param Vin: item preference input
        :param Ucontent: user content input, ignored when ``phi_u_dim`` is 0
        :param Vcontent: item content input, ignored when ``phi_v_dim`` is 0
        :return: tuple (U_embedding, V_embedding)
        """
        if self.phi_u_dim > 0:
            u_concat = torch.cat((Uin, Ucontent), 1)
        else:
            u_concat = Uin

        if self.phi_v_dim > 0:
            v_concat = torch.cat((Vin, Vcontent), 1)
        else:
            v_concat = Vin

        u_out = u_concat
        for layer in self.u_layers:
            u_out = layer(u_out)
        U_embedding = self.u_emb(u_out)

        v_out = v_concat
        for layer in self.v_layers:
            v_out = layer(v_out)
        V_embedding = self.v_emb(v_out)
        return U_embedding, V_embedding

    def forward(self, Uin, Vin, Ucontent, Vcontent):
        """
        Score aligned (user, item) pairs.

        :return: tuple (preds, U_embedding, V_embedding) where ``preds`` is the
                 row-wise dot product of the two embeddings
        """
        U_embedding, V_embedding = self.encode(Uin, Vin, Ucontent, Vcontent)

        preds = U_embedding * V_embedding
        preds = torch.sum(preds, 1)
        return preds, U_embedding, V_embedding

    @staticmethod
    def _to_tensor(array, device=None):
        """Convert ``array`` to a tensor on ``device``; ``None`` is passed through unchanged."""
        if array is None:
            return None
        tensor = torch.tensor(array)
        return tensor if device is None else tensor.to(device)

    @torch.no_grad()
    def evaluate(self, recall_k, eval_data, device=None):
        """
        given EvalData runs batch evaluation
        :param recall_k: list of thresholds to compute recall at (information retrieval recall);
                         the last entry is used as the top-k cut-off
        :param eval_data: EvalData instance
        :param device: optional torch device the inputs are moved to before encoding
        :return: list of recall values at thresholds matching recall_k
        """
        # the file-level ``import scipy`` alone does not guarantee that the
        # ``sparse`` submodule is loaded
        import scipy.sparse

        # Item-side inputs are the same for every user batch: convert them once.
        # Content inputs may legitimately be None, which _to_tensor keeps as None
        # instead of failing on ``None.to(device)``.
        Vin = self._to_tensor(eval_data.V_pref_test, device)
        Vcontent = self._to_tensor(eval_data.V_content_test, device)

        tf_eval_preds_batch = []
        for (batch, (eval_start, eval_stop)) in enumerate(tqdm(eval_data.eval_batch, desc='eval', leave=False)):

            Uin = self._to_tensor(eval_data.U_pref_test[eval_start:eval_stop, :], device)
            if self.phi_u_dim > 0:
                Ucontent = self._to_tensor(eval_data.U_content_test[eval_start:eval_stop, :], device)
            else:
                Ucontent = None

            U_embedding, V_embedding = self.encode(Uin, Vin, Ucontent, Vcontent)
            embedding_prod = torch.matmul(U_embedding, V_embedding.t())

            if not eval_data.is_cold:
                # NOTE(review): tf_eval_train[batch] is added as-is; presumably it
                # already lives on ``device`` - confirm against the data loader
                eval_trainR = eval_data.tf_eval_train[batch]
                embedding_prod = embedding_prod + eval_trainR

            _, eval_preds = torch.topk(embedding_prod, k=recall_k[-1], sorted=True)
            tf_eval_preds_batch.append(eval_preds.detach().cpu().numpy())

        tf_eval_preds = np.concatenate(tf_eval_preds_batch)

        # filter non-zero targets
        y_nz = np.flatnonzero([len(x) > 0 for x in eval_data.R_test_inf.rows])

        preds_all = tf_eval_preds[y_nz, :]
        # ground truth does not depend on the threshold: slice and sum it once
        y = eval_data.R_test_inf[y_nz, :]
        y_total = np.sum(y, 1)

        recall = []
        for at_k in tqdm(recall_k, desc='recall', leave=False):
            preds_k = preds_all[:, :at_k]

            # indicator matrix of the top-k predictions, filled through the LIL
            # internals; the trailing [[]] forces numpy to build a 1-D object
            # array of lists instead of a 2-D array
            x = scipy.sparse.lil_matrix(y.shape)
            x.data = np.array([z.tolist() for z in np.ones_like(preds_k)]+[[]],dtype=object)[:-1]
            x.rows = np.array([z.tolist() for z in preds_k]+[[]],dtype=object)[:-1]
            z = y.multiply(x)
            recall.append(np.mean(np.divide((np.sum(z, 1)), y_total)))
        return recall
class timer(object):
    def __init__(self, name='default'):
        """
        timer object to record running time of functions, not for micro-benchmarking
        usage is:
            $ timer = utils.timer('name').tic()
            $ timer.toc('process A').tic()


        :param name: label for the timer
        """
        self._start_time = None
        self._name = name
        self.tic()

    def tic(self):
        """Start (or restart) the timer and return ``self`` for chaining."""
        self._start_time = time.time()
        return self

    def toc(self, message):
        """
        Print the time elapsed since the last ``tic`` and return ``self``.

        The timer must be running: after ``reset`` call ``tic`` first.

        :param message: text printed next to the elapsed time, ``None`` prints an empty label
        """
        elapsed = time.time() - self._start_time
        message = '' if message is None else message
        print('[{0:s}] {1:s} elapsed [{2:s}]'.format(self._name, message, timer._format(elapsed)))
        return self

    def reset(self):
        """Clear the start time and return ``self``; ``tic`` must be called before the next ``toc``."""
        self._start_time = None
        return self

    @staticmethod
    def _format(s):
        """
        Render a duration in seconds as days/hr/min/s, omitting leading zero units.

        :param s: duration in seconds
        :return: string such as ``'1 hr2 min3 s'``
        """
        delta = datetime.timedelta(seconds=s)
        # timedelta keeps whole days separately, so the day count does not
        # wrap around at month boundaries for very long durations
        minutes, seconds = divmod(delta.seconds, 60)
        hours, minutes = divmod(minutes, 60)
        text = ''
        if delta.days > 0:
            text = text + '{:d} days'.format(delta.days)
        if hours > 0:
            text = text + '{:d} hr'.format(hours)
        if minutes > 0:
            text = text + '{:d} min'.format(minutes)
        text = text + '{:d} s'.format(seconds)
        return text


def batch(iterable, _n=1, drop=True):
    """
    returns batched version of some iterable
    :param iterable: sliceable sequence supporting ``len``
    :param _n: batch size
    :param drop: if true, drop extra if batch size does not divide evenly,
        otherwise keep them (last batch might be shorter)
    :return: generator over consecutive slices of ``iterable``
    """
    it_len = len(iterable)
    for ndx in range(0, it_len, _n):
        # ``<=`` so that a final batch of exactly ``_n`` items counts as complete
        if ndx + _n <= it_len:
            yield iterable[ndx:ndx + _n]
        elif drop is False:
            yield iterable[ndx:it_len]
def tfidf(x):
    """
    compute tfidf of a sparse document-by-term matrix

    Term frequency is ``log(1 + count)``; inverse document frequency is
    ``log(n_docs - 1) - log(1 + document_frequency)``.

    :param x: scipy sparse matrix, document by terms; it is not modified
    :return: sparse matrix of the same shape holding the tf-idf weights
    """
    # the file-level ``import scipy`` alone does not guarantee that the
    # ``sparse`` submodule is loaded
    import scipy.sparse

    x_idf = np.log(x.shape[0] - 1) - np.log(1 + np.asarray(np.sum(x > 0, axis=0)).ravel())
    x_idf = np.asarray(x_idf)
    x_idf_diag = scipy.sparse.lil_matrix((len(x_idf), len(x_idf)))
    x_idf_diag.setdiag(x_idf)
    # copy=True: converting an already-CSR matrix would otherwise hand back the
    # caller's own object, and overwriting ``.data`` below would corrupt it
    x_tf = x.tocsr(copy=True)
    x_tf.data = np.log(x_tf.data + 1)
    x_tfidf = x_tf * x_idf_diag
    return x_tfidf


def _cap_and_zero(x_scaled):
    """Clip ``x_scaled`` to [-5, 5] in place and zero entries with magnitude below 1e-5."""
    x_scaled[x_scaled > 5] = 5
    x_scaled[x_scaled < -5] = -5
    x_scaled[np.absolute(x_scaled) < 1e-5] = 0
    return x_scaled


def prep_standardize(x):
    """
    standardize the non-zero rows of a dense array, leaving all-zero rows at zero

    Note:
        cap at 5 std

    :param x: 2D numpy array to standardize (column-wise); it is copied, not modified
    :return: the object to perform scale (stores mean/std) for inference, as well as the scaled x
    """
    # rows that are entirely zero are excluded from the fit and stay zero
    x_nzrow = x.any(axis=1)
    scaler = prep.StandardScaler().fit(x[x_nzrow, :])
    x_scaled = np.copy(x)
    x_scaled[x_nzrow, :] = scaler.transform(x_scaled[x_nzrow, :])
    _cap_and_zero(x_scaled)
    return scaler, x_scaled


def prep_standardize_dense(x):
    """
    takes dense input and compute standardized version

    Note:
        cap at 5 std

    :param x: 2D numpy data array to standardize (column-wise)
    :return: the object to perform scale (stores mean/std) for inference, as well as the scaled x
    """
    scaler = prep.StandardScaler().fit(x)
    x_scaled = scaler.transform(x)
    _cap_and_zero(x_scaled)
    return scaler, x_scaled