├── .gitignore
├── LICENSE
├── README.md
├── logs
    ├── cold_item.png
    ├── cold_user.png
    ├── dropoutnet_citeu_cold.png
    ├── dropoutnet_citeu_warm.png
    ├── logo.png
    ├── logo.svg
    ├── logo_alt.png
    ├── logo_fill.png
    ├── logobox.jpg
    └── warm.png
├── tf1
    ├── data.py
    ├── main.py
    ├── main_citeu.py
    ├── main_cold_citeu.py
    ├── main_warm_citeu.py
    ├── model.py
    └── utils.py
├── tf2
    ├── data.py
    ├── main.py
    ├── model.py
    └── utils.py
└── torch
    ├── data.py
    ├── main.py
    ├── model.py
    └── utils.py


/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | env/
 12 | build/
 13 | develop-eggs/
 14 | dist/
 15 | downloads/
 16 | eggs/
 17 | .eggs/
 18 | lib/
 19 | lib64/
 20 | parts/
 21 | sdist/
 22 | var/
 23 | wheels/
 24 | *.egg-info/
 25 | .installed.cfg
 26 | *.egg
 27 | 
 28 | # PyInstaller
 29 | #  Usually these files are written by a python script from a template
 30 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 31 | *.manifest
 32 | *.spec
 33 | 
 34 | # Installer logs
 35 | pip-log.txt
 36 | pip-delete-this-directory.txt
 37 | 
 38 | # Unit test / coverage reports
 39 | htmlcov/
 40 | .tox/
 41 | .coverage
 42 | .coverage.*
 43 | .cache
 44 | nosetests.xml
 45 | coverage.xml
 46 | *.cover
 47 | .hypothesis/
 48 | 
 49 | # Translations
 50 | *.mo
 51 | *.pot
 52 | 
 53 | # Django stuff:
 54 | *.log
 55 | local_settings.py
 56 | 
 57 | # Flask stuff:
 58 | instance/
 59 | .webassets-cache
 60 | 
 61 | # Scrapy stuff:
 62 | .scrapy
 63 | 
 64 | # Sphinx documentation
 65 | docs/_build/
 66 | 
 67 | # PyBuilder
 68 | target/
 69 | 
 70 | # Jupyter Notebook
 71 | .ipynb_checkpoints
 72 | 
 73 | # pyenv
 74 | .python-version
 75 | 
 76 | # celery beat schedule file
 77 | celerybeat-schedule
 78 | 
 79 | # SageMath parsed files
 80 | *.sage.py
 81 | 
 82 | # dotenv
 83 | .env
 84 | 
 85 | # virtualenv
 86 | .venv
 87 | venv/
 88 | ENV/
 89 | 
 90 | # Spyder project settings
 91 | .spyderproject
 92 | .spyproject
 93 | 
 94 | # Rope project settings
 95 | .ropeproject
 96 | 
 97 | # mkdocs documentation
 98 | /site
 99 | 
100 | # mypy
101 | .mypy_cache/
102 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | 
 2 | Copyright Notice: © Copyright 2018 The Toronto-Dominion Bank and/or its affiliates
 3 | 
 4 | Permission is hereby granted, subject to the conditions below, free of charge, to 
 5 | any person obtaining a copy of this software and associated documentation files 
 6 | (the "Software"), to use, copy, distribute, publish and modify the Software, only 
 7 | for research and for no other purpose. For clarity, and without limitation, this 
 8 | licence does not permit use of Software or any part thereof for commercial purposes.
 9 | 
10 | Patents: This permission does not grant any patent licenses in the Software.
11 | 
12 | Conditions: 
13 | 1. The above copyright notice and the following disclaimer shall be included in all 
14 | copies or substantial portions of the Software.
15 | 2. You must give appropriate credit, provide a link to the license, and indicate if 
16 | changes were made. You may do so in any reasonable manner, but not in any way that 
17 | suggests that the copyright holder endorses you or your use of the Software.
18 | 
19 | Names and Trademarks: No permission to use the names or trademarks of the copyright 
20 | holder are granted, except as required for reasonable and customary use in describing 
21 | the origin of the Software  and reproducing the content of the copyright notice.
22 | 
23 | DISCLAIMER: THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
24 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 
25 | PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 
26 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF 
27 | CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE 
28 | OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
29 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | 
 2 | <p align="center">
 3 | <a href="https://layer6.ai/"><img src="https://github.com/layer6ai-labs/DropoutNet/blob/master/logs/logobox.jpg" width="180"></a>
 4 | </p>
 5 | 
 6 | ## NeurIPS'17 DropoutNet: Addressing Cold Start in Recommender Systems
 7 | Authors: [Maksims Volkovs](http://www.cs.toronto.edu/~mvolkovs), [Guangwei Yu](http://www.cs.toronto.edu/~guangweiyu), Tomi Poutanen  
 8 | [[paper](http://www.cs.toronto.edu/~mvolkovs/nips2017_deepcf.pdf)]
 9 | 
 10 | **UPDATE:** We added TensorFlow2 and PyTorch implementations!
11 | - [`tf1`: original TF1 implementation](tf1)
12 | - [`tf2`: TF2 using compat mode (RecSys dataset only)](tf2)
13 | - [`torch`: PyTorch (RecSys dataset only)](torch)
14 | 
15 | <a name="intro"/>
16 | 
17 | ## Introduction
 18 | This repository contains a full implementation of the DropoutNet model and includes both training and evaluation routines. We also provide the [ACM RecSys 2017 Challenge](http://2017.recsyschallenge.com) dataset that we further split into three subsets for warm start, user cold start and item cold start evaluation. The aim is to train a *single* model that can be applied to all three tasks, and we report validation accuracy on each task during training.
19 | 
20 | Furthermore per request, we also provide scripts and all necessary data to run the Citeulike cold-start experiment. See section on Citeulike below for further details as well as links to the packaged data.
21 | 
22 | <a name="dataset"/>
23 | 
24 | ## Dataset
25 | 
26 | To run the model, download the dataset from [here](https://github.com/layer6ai-labs/DropoutNet-Data/blob/master/recsys2017.pub.tar.gz).
 27 | With this dataset we have also included a pre-trained Weighted Matrix
 28 | Factorization (WMF) model \[Hu et al., 2008\], which is used as preference input to the DropoutNet. WMF produces competitive performance on warm start but doesn't generalize to cold start. So this code demonstrates how to apply DropoutNet to provide cold start capability to WMF. The format of the data is as follows:
29 | ```
30 | recsys2017.pub				
31 | └─ eval					// use path to this folder in --data-dir
32 |    ├─ trained				// WMF model
33 |    │  └─ warm				
34 |    │     ├─ U.csv.bin			// numpy binarized WMF user preference latent vectors (U)
35 |    │     └─ V.csv.bin			// numpy binarized WMF item preference latent vectors (V)
36 |    ├─ warm				
37 |    │  ├─ test_cold_item.csv		// validation interactions for item cold start 
38 |    │  ├─ test_cold_item_item_ids.csv	// targets item ids for item cold start
39 |    │  ├─ test_cold_user.csv    		// validation interactions for user cold start
40 |    │  ├─ test_cold_user_item_ids.csv	// target user ids for user cold start
41 |    │  ├─ test_warm.csv			// validation interactions for warm start
42 |    │  ├─ test_warm_item_ids.csv		// target item ids for warm start
43 |    │  └─ train.csv			// training interactions
44 |    ├─ item_features_0based.txt		// item features in libsvm format
45 |    └─ user_features_0based.txt		// user features in libsvm format
46 |       
47 | interactions are stored in csv as:
48 |   <USER_ID>,<ITEM_ID>,<INTERACTION_TYPE>,<TIMESTAMP>
49 | where INTERACTION_TYPE is one of:
50 |   0: impression
51 |   1: click
52 |   2: bookmark
53 |   3: reply
54 |   5: recruiter interest
55 | ```
56 | 
57 | <a name="demo"/>
58 | 
59 | ## Running training code
60 | 
61 | 1. Download the dataset, extract and keep the directory structure.
62 | 
63 | 2. run `main.py`
64 |     * for usage, run with `main.py --help`
65 |     * default setting trains a two layer neural network with hyperparameters selected for the RecSys data
66 |     * gpu is used for training by default and cpu for inference
67 | 3. (Optionally) launch tensorboard to monitor progress by `tensorboard --logdir=<log_path>`
68 | 
69 | During training recall@50,100,...,500 accuracy is shown every 50K updates for warm start, user cold start and item cold start validation sets.
70 | 
71 | Notes:
72 | 
73 | * Make sure `--data-dir` points to the `eval/` folder, not the root
74 | * On our environment (described above) 50K updates takes approximately 14 minutes with the default GPU/CPU setting.
75 | * By default, training happens on GPU while inference and batch generation is on CPU.
76 | 
77 | ## Validation Curves
78 | <p align="center">
79 | <img src="https://github.com/layer6ai-labs/DropoutNet/blob/master/logs/warm.png" width="500">
80 | <img src="https://github.com/layer6ai-labs/DropoutNet/blob/master/logs/cold_user.png" width="500">
81 | <img src="https://github.com/layer6ai-labs/DropoutNet/blob/master/logs/cold_item.png" width="500">
82 | </p>
83 | 
84 | ## Citeulike
 85 | In addition to Recsys, we also provide a pipeline to run the publicly available Citeulike data. Note that, as mentioned in the paper, we evaluate cold start the same way as the CTR paper while the warm start evaluation is modified. For convenience, we have provided our evaluation split for both cold and warm start, item features, as well as the WMF user item preference latent vectors available [here](https://github.com/layer6ai-labs/DropoutNet-Data/blob/master/citeu.tar.gz).
86 | 
87 | The citeulike warm and cold models are trained separately as their validation sets differ. Please use the scripts
88 | `main_cold_citeu.py` and `main_warm_citeu.py` to run the experiments on the Citeulike dataset.
89 | 
90 | Point `--data-dir` to your extracted `eval` folder after extracting `citeu.tar.gz`. Sample training runs with respective validation performance are shown below per 1000 updates.
91 | 
92 | <p align="center">
93 | <img src="https://github.com/layer6ai-labs/DropoutNet/blob/master/logs/dropoutnet_citeu_cold.png" width="500">
94 | <img src="https://github.com/layer6ai-labs/DropoutNet/blob/master/logs/dropoutnet_citeu_warm.png" width="500">
95 | </p>
96 | 
97 | 


--------------------------------------------------------------------------------
/logs/cold_item.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/layer6ai-labs/DropoutNet/60c9aa73568deec30374ad1bbc29a65c9d47f115/logs/cold_item.png


--------------------------------------------------------------------------------
/logs/cold_user.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/layer6ai-labs/DropoutNet/60c9aa73568deec30374ad1bbc29a65c9d47f115/logs/cold_user.png


--------------------------------------------------------------------------------
/logs/dropoutnet_citeu_cold.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/layer6ai-labs/DropoutNet/60c9aa73568deec30374ad1bbc29a65c9d47f115/logs/dropoutnet_citeu_cold.png


--------------------------------------------------------------------------------
/logs/dropoutnet_citeu_warm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/layer6ai-labs/DropoutNet/60c9aa73568deec30374ad1bbc29a65c9d47f115/logs/dropoutnet_citeu_warm.png


--------------------------------------------------------------------------------
/logs/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/layer6ai-labs/DropoutNet/60c9aa73568deec30374ad1bbc29a65c9d47f115/logs/logo.png


--------------------------------------------------------------------------------
/logs/logo.svg:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <svg width="111px" height="35px" viewBox="0 0 111 35" version="1.1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
 3 |     <!-- Generator: Sketch 49.3 (51167) - http://www.bohemiancoding.com/sketch -->
 4 |     <title>Page 1</title>
 5 |     <desc>Created with Sketch.</desc>
 6 |     <defs>
 7 |         <polygon id="path-1" points="0.8251 0.0004 20.391 0.0004 20.391 27.4234 0.8251 27.4234"></polygon>
 8 |         <polygon id="path-3" points="0 0.3351 10.6479 0.3351 10.6479 27.0561 0 27.0561"></polygon>
 9 |         <polygon id="path-5" points="0.3921 0.6801 21.1431 0.6801 21.1431 28 0.3921 28"></polygon>
10 |     </defs>
11 |     <g id="Page-1" stroke="none" stroke-width="1" fill="none" fill-rule="evenodd">
12 |         <g>
13 |             <g id="Group-3" transform="translate(90.000000, 0.000000)">
14 |                 <mask id="mask-2" fill="white">
15 |                     <use xlink:href="#path-1"></use>
16 |                 </mask>
17 |                 <g id="Clip-2"></g>
18 |                 <path d="M14.3291,21.9994 C13.4601,22.9364 12.3111,23.4094 10.9151,23.4094 C9.4941,23.4094 8.3501,22.9194 7.5171,21.9524 C6.6861,20.9884 6.2661,19.7804 6.2661,18.3624 C6.2661,16.9214 6.6861,15.6984 7.5171,14.7284 C8.3501,13.7554 9.5061,13.2614 10.9511,13.2614 C12.3351,13.2614 13.4751,13.7294 14.3381,14.6534 C15.2001,15.5764 15.6371,16.8104 15.6371,18.3174 C15.6371,19.8254 15.1971,21.0644 14.3291,21.9994 M11.6911,9.3564 C8.9681,9.3564 6.8871,10.2374 5.6331,12.1454 C5.6331,12.1454 5.6041,11.6954 5.6331,10.9574 C5.7091,9.0124 6.0321,7.4584 6.9151,6.1114 C7.8951,4.6154 9.1181,4.0134 10.9871,4.0134 C12.1021,4.0134 13.0471,4.2784 13.7951,4.8014 C14.5301,5.3144 14.9931,6.0564 15.1731,7.0024 L19.7171,6.0364 C19.3281,4.3914 18.4811,3.0164 17.1801,1.9454 C15.6091,0.6544 13.5871,0.0004 11.1681,0.0004 C7.7731,0.0004 5.1741,1.2584 3.4421,3.7394 C1.7061,6.2284 0.8251,9.7504 0.8251,14.2084 C0.8251,18.3904 1.6901,21.6744 3.3951,23.9714 C5.0981,26.2624 7.6151,27.4234 10.8791,27.4234 C13.6791,27.4234 15.9891,26.5684 17.7451,24.8844 C19.5011,23.2004 20.3911,20.9764 20.3911,18.2724 C20.3911,15.6414 19.5641,13.4744 17.9331,11.8314 C16.3041,10.1884 14.2031,9.3564 11.6911,9.3564" id="Fill-1" fill="#000000" mask="url(#mask-2)"></path>
19 |             </g>
20 |             <g id="Group-6">
21 |                 <mask id="mask-4" fill="white">
22 |                     <use xlink:href="#path-3"></use>
23 |                 </mask>
24 |                 <g id="Clip-5"></g>
25 |                 <path d="M10.6479,27.0561 L10.6479,23.1171 L7.4739,23.1171 L7.4739,0.3351 L-0.0001,0.3351 L-0.0001,4.2561 L3.1169,4.2561 L3.1169,24.8361 C3.1349,26.1771 4.2209,27.0151 5.4459,27.0561 L10.6479,27.0561 Z" id="Fill-4" fill="#000000" mask="url(#mask-4)"></path>
26 |             </g>
27 |             <path d="M20.1538,23.9613 C18.0488,23.9613 16.7908,22.9753 16.7908,21.3233 C16.7908,19.5493 18.3788,18.5713 21.2598,18.5713 L25.1568,18.5713 L25.1568,19.6073 C25.1568,22.4353 22.5798,23.9613 20.1538,23.9613 Z M32.6178,23.1223 L29.4198,23.1223 L29.4198,14.4943 C29.4198,9.9143 26.5778,7.2883 21.4888,7.2883 C16.8408,7.2883 13.7538,9.4833 13.2198,13.0753 L17.5108,13.9883 C17.6568,12.0163 19.1208,10.9023 21.4128,10.9023 C23.8278,10.9023 25.1568,12.1643 25.1568,14.4553 L25.1568,15.4153 L21.2598,15.4153 C15.7208,15.4153 12.4138,17.7523 12.4138,21.6673 C12.4138,25.1633 15.0318,27.4233 19.0858,27.4233 C22.4798,27.4233 24.2518,25.9933 25.1628,24.6953 L25.1628,24.8353 C25.1808,26.1763 26.2678,27.0153 27.4928,27.0563 L32.6178,27.0563 L32.6178,23.1223 Z" id="Fill-7" fill="#000000"></path>
28 |             <g id="Group-11" transform="translate(30.000000, 7.000000)">
29 |                 <mask id="mask-6" fill="white">
30 |                     <use xlink:href="#path-5"></use>
31 |                 </mask>
32 |                 <g id="Clip-10"></g>
33 |                 <path d="M16.4341,0.6801 L11.3421,16.2591 L5.9461,0.6801 L1.1221,0.6801 C1.1221,0.6801 8.3801,19.8241 8.5101,20.1551 C8.7191,20.7121 8.8241,21.1641 8.8241,21.6171 C8.8241,23.1851 7.9531,23.9861 6.4551,23.9861 L0.3921,23.9861 L0.3921,28.0001 L6.1791,28.0001 C9.0951,28.0001 11.5121,27.3481 12.8931,23.5111 L21.1431,0.6801 L16.4341,0.6801 Z" id="Fill-9" fill="#000000" mask="url(#mask-6)"></path>
34 |             </g>
35 |             <path d="M74.9033,7.6655 C73.6773,7.7065 72.7093,8.5495 72.6913,9.8905 L72.6913,27.0565 L77.0373,27.0565 L77.0373,11.6905 L85.5823,11.6905 L85.5823,7.6655 L74.9033,7.6655 Z" id="Fill-12" fill="#000000"></path>
36 |             <path d="M60.7178,10.7699 C63.4798,10.7699 65.6208,12.7099 65.6998,15.2249 L55.6138,15.2249 C56.1018,12.5029 57.9598,10.7699 60.7178,10.7699 Z M65.4768,20.6519 C64.7378,23.0129 62.5748,23.7499 60.6978,23.7499 C57.7698,23.7499 55.5808,21.9989 55.4668,18.4569 L70.0988,18.4569 L70.0988,17.5849 C70.0988,11.4719 66.5798,7.2889 60.6788,7.2889 C55.0668,7.2889 50.9938,11.5389 50.9938,17.3939 C50.9938,23.3929 54.7478,27.4229 60.7178,27.4229 C65.1968,27.4229 68.4338,25.2869 69.7308,21.5269 L65.4768,20.6519 Z" id="Fill-14" fill="#000000"></path>
37 |         </g>
38 |     </g>
39 | </svg>


--------------------------------------------------------------------------------
/logs/logo_alt.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/layer6ai-labs/DropoutNet/60c9aa73568deec30374ad1bbc29a65c9d47f115/logs/logo_alt.png


--------------------------------------------------------------------------------
/logs/logo_fill.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/layer6ai-labs/DropoutNet/60c9aa73568deec30374ad1bbc29a65c9d47f115/logs/logo_fill.png


--------------------------------------------------------------------------------
/logs/logobox.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/layer6ai-labs/DropoutNet/60c9aa73568deec30374ad1bbc29a65c9d47f115/logs/logobox.jpg


--------------------------------------------------------------------------------
/logs/warm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/layer6ai-labs/DropoutNet/60c9aa73568deec30374ad1bbc29a65c9d47f115/logs/warm.png


--------------------------------------------------------------------------------
/tf1/data.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import tensorflow as tf
  3 | import scipy.sparse
  4 | import utils
  5 | import pandas as pd
  6 | 
  7 | """
  8 | This module contains class and methods related to data used in DropoutNet  
  9 | """
 10 | 
 11 | 
 12 | def load_eval_data(test_file, test_id_file, name, cold, train_data, citeu=False):
 13 |     timer = utils.timer()
 14 |     with open(test_id_file) as f:
 15 |         test_item_ids = [int(line) for line in f]
 16 |         test_data = pd.read_csv(test_file, delimiter=",", header=-1, dtype=np.int32).values.ravel()
 17 |         if citeu:
 18 |             test_data = test_data.view(
 19 |             dtype=[('uid', np.int32), ('iid', np.int32), ('inter', np.int32)])
 20 |         else:
 21 |             test_data = test_data.view(
 22 |             dtype=[('uid', np.int32), ('iid', np.int32), ('inter', np.int32), ('date', np.int32)])
 23 |         timer.toc('read %s triplets %s' % (name, test_data.shape)).tic()
 24 |         eval_data = EvalData(
 25 |             test_data,
 26 |             test_item_ids,
 27 |             is_cold=cold,
 28 |             train=train_data
 29 |         )
 30 |         timer.toc('loaded %s' % name).tic()
 31 |         print(eval_data.get_stats_string())
 32 |         return eval_data
 33 | 
 34 | 
 35 | class EvalData:
 36 |     """
 37 |     EvalData:
 38 |         EvalData packages test triplet (user, item, score) into appropriate formats for evaluation
 39 | 
 40 |         Compact Indices:
 41 |             Specifically, this builds compact indices and stores mapping between original and compact indices.
 42 |             Compact indices only contains:
 43 |                 1) items in test set
 44 |                 2) users who interacted with such test items
 45 |             These compact indices speed up testing significantly by ignoring irrelevant users or items
 46 | 
 47 |         Args:
 48 |             test_triplets(int triplets): user-item-interaction_value triplet to build the test data
 49 |             train(int triplets): user-item-interaction_value triplet from train data
 50 | 
 51 |         Attributes:
 52 |             is_cold(boolean): whether test data is used for cold start problem
 53 |             test_item_ids(list of int): maps compressed item ids to original item ids (via position)
 54 |             test_item_ids_map(dictionary of int->int): maps original item ids to compressed item ids
 55 |             test_user_ids(list of int): maps compressed user ids to original user ids (via position)
 56 |             test_user_ids_map(dictionary of int->int): maps original user ids to compressed user ids
 57 |             R_test_inf(scipy lil matrix): pre-built compressed test matrix
 58 |             R_train_inf(scipy lil matrix): pre-built compressed train matrix for testing
 59 | 
 60 |             other relevant input/output exposed from tensorflow graph
 61 | 
 62 |     """
 63 | 
 64 |     def __init__(self, test_triplets, test_item_ids, is_cold, train):
 65 |         # build map both-ways between compact and original indices
 66 |         # compact indices only contains:
 67 |         #  1) items in test set
 68 |         #  2) users who interacted with such test items
 69 | 
 70 |         self.is_cold = is_cold
 71 | 
 72 |         self.test_item_ids = test_item_ids
 73 |         # test_item_ids_map
 74 |         self.test_item_ids_map = {iid: i for i, iid in enumerate(self.test_item_ids)}
 75 | 
 76 |         _test_ij_for_inf = [(t[0], t[1]) for t in test_triplets if t[1] in self.test_item_ids_map]
 77 |         # test_user_ids
 78 |         self.test_user_ids = np.unique(test_triplets['uid'])
 79 |         # test_user_ids_map
 80 |         self.test_user_ids_map = {user_id: i for i, user_id in enumerate(self.test_user_ids)}
 81 | 
 82 |         _test_i_for_inf = [self.test_user_ids_map[_t[0]] for _t in _test_ij_for_inf]
 83 |         _test_j_for_inf = [self.test_item_ids_map[_t[1]] for _t in _test_ij_for_inf]
 84 |         self.R_test_inf = scipy.sparse.coo_matrix(
 85 |             (np.ones(len(_test_i_for_inf)),
 86 |              (_test_i_for_inf, _test_j_for_inf)),
 87 |             shape=[len(self.test_user_ids), len(self.test_item_ids)]
 88 |         ).tolil(copy=False)
 89 | 
 90 |         train_ij_for_inf = [(self.test_user_ids_map[_t[0]], self.test_item_ids_map[_t[1]]) for _t
 91 |                             in train
 92 |                             if _t[1] in self.test_item_ids_map and _t[0] in self.test_user_ids_map]
 93 |         if self.is_cold and len(train_ij_for_inf) is not 0:
 94 |             raise Exception('using cold dataset, but data is not cold!')
 95 |         if not self.is_cold and len(train_ij_for_inf) is 0:
 96 |             raise Exception('using warm datset, but data is not warm!')
 97 | 
 98 |         self.R_train_inf = None if self.is_cold else scipy.sparse.coo_matrix((
 99 |             np.ones(len(train_ij_for_inf)),
100 |             zip(*train_ij_for_inf)), shape=self.R_test_inf.shape).tolil(copy=False)
101 | 
102 |         # allocate fields
103 |         self.U_pref_test = None
104 |         self.V_pref_test = None
105 |         self.V_content_test = None
106 |         self.U_content_test = None
107 |         self.tf_eval_train = None
108 |         self.tf_eval_test = None
109 |         self.eval_batch = None
110 | 
111 |     def init_tf(self, user_factors, item_factors, user_content, item_content, eval_run_batchsize):
112 |         self.U_pref_test = user_factors[self.test_user_ids, :]
113 |         self.V_pref_test = item_factors[self.test_item_ids, :]
114 |         self.V_content_test = item_content[self.test_item_ids, :]
115 |         if scipy.sparse.issparse(self.V_content_test):
116 |             self.V_content_test = self.V_content_test.todense()
117 |         if user_content!=None:
118 |             self.U_content_test = user_content[self.test_user_ids, :]
119 |             if scipy.sparse.issparse(self.U_content_test):
120 |                 self.U_content_test = self.U_content_test.todense()
121 |         eval_l = self.R_test_inf.shape[0]
122 |         self.eval_batch = [(x, min(x + eval_run_batchsize, eval_l)) for x
123 |                            in xrange(0, eval_l, eval_run_batchsize)]
124 | 
125 |         self.tf_eval_train = []
126 |         self.tf_eval_test = []
127 | 
128 |         if not self.is_cold:
129 |             for (eval_start, eval_finish) in self.eval_batch:
130 |                 _ui = self.R_train_inf[eval_start:eval_finish, :].tocoo()
131 |                 _ui = zip(_ui.row, _ui.col)
132 |                 self.tf_eval_train.append(
133 |                     tf.SparseTensorValue(
134 |                         indices=_ui,
135 |                         values=np.full(len(_ui), -100000, dtype=np.float32),
136 |                         dense_shape=[eval_finish - eval_start, self.R_train_inf.shape[1]]
137 |                     )
138 |                 )
139 | 
140 |     def get_stats_string(self):
141 |         return ('\tn_test_users:[%d]\n\tn_test_items:[%d]' % (len(self.test_user_ids), len(self.test_item_ids))
142 |                 + '\n\tR_train_inf: %s' % (
143 |                     'no R_train_inf for cold' if self.is_cold else 'shape=%s nnz=[%d]' % (
144 |                         str(self.R_train_inf.shape), len(self.R_train_inf.nonzero()[0])
145 |                     )
146 |                 )
147 |                 + '\n\tR_test_inf: shape=%s nnz=[%d]' % (
148 |                     str(self.R_test_inf.shape), len(self.R_test_inf.nonzero()[0])
149 |                 ))
150 | 


--------------------------------------------------------------------------------
/tf1/main.py:
--------------------------------------------------------------------------------
  1 | import utils
  2 | import numpy as np
  3 | import pandas as pd
  4 | import tensorflow as tf
  5 | import datetime
  6 | from sklearn import datasets
  7 | import data
  8 | import model
  9 | 
 10 | import argparse
 11 | import os
 12 | 
 13 | n_users = 1497020 + 1
 14 | n_items = 1306054 + 1
 15 | 
 16 | 
 17 | def main():
 18 |     data_path = args.data_dir
 19 |     checkpoint_path = args.checkpoint_path
 20 |     tb_log_path = args.tb_log_path
 21 |     model_select = args.model_select
 22 | 
 23 |     rank_out = args.rank
 24 |     user_batch_size = 1000
 25 |     n_scores_user = 2500
 26 |     data_batch_size = 100
 27 |     dropout = args.dropout
 28 |     recall_at = range(50, 550, 50)
 29 |     eval_batch_size = 1000
 30 |     max_data_per_step = 2500000
 31 |     eval_every = args.eval_every
 32 |     num_epoch = 10
 33 | 
 34 |     _lr = args.lr
 35 |     _decay_lr_every = 50
 36 |     _lr_decay = 0.1
 37 | 
 38 |     experiment = '%s_%s' % (
 39 |         datetime.datetime.now().strftime('%Y-%m-%d-%H:%M:%S'),
 40 |         '-'.join(str(x / 100) for x in model_select) if model_select else 'simple'
 41 |     )
 42 |     _tf_ckpt_file = None if checkpoint_path is None else checkpoint_path + experiment + '/tf_checkpoint'
 43 | 
 44 |     print('running: ' + experiment)
 45 | 
 46 |     dat = load_data(data_path)
 47 |     u_pref_scaled = dat['u_pref_scaled']
 48 |     v_pref_scaled = dat['v_pref_scaled']
 49 |     eval_warm = dat['eval_warm']
 50 |     eval_cold_user = dat['eval_cold_user']
 51 |     eval_cold_item = dat['eval_cold_item']
 52 |     user_content = dat['user_content']
 53 |     item_content = dat['item_content']
 54 |     u_pref = dat['u_pref']
 55 |     v_pref = dat['v_pref']
 56 |     user_indices = dat['user_indices']
 57 | 
 58 |     timer = utils.timer(name='main').tic()
 59 | 
 60 |     # append pref factors for faster dropout
 61 |     v_pref_expanded = np.vstack([v_pref_scaled, np.zeros_like(v_pref_scaled[0, :])])
 62 |     v_pref_last = v_pref_scaled.shape[0]
 63 |     u_pref_expanded = np.vstack([u_pref_scaled, np.zeros_like(u_pref_scaled[0, :])])
 64 |     u_pref_last = u_pref_scaled.shape[0]
 65 |     timer.toc('initialized numpy data for tf')
 66 | 
 67 |     # prep eval
 68 |     eval_batch_size = eval_batch_size
 69 |     timer.tic()
 70 |     eval_warm.init_tf(u_pref_scaled, v_pref_scaled, user_content, item_content, eval_batch_size)
 71 |     timer.toc('initialized eval_warm for tf').tic()
 72 |     eval_cold_user.init_tf(u_pref_scaled, v_pref_scaled, user_content, item_content, eval_batch_size)
 73 |     timer.toc('initialized eval_cold_user for tf').tic()
 74 |     eval_cold_item.init_tf(u_pref_scaled, v_pref_scaled, user_content, item_content, eval_batch_size)
 75 |     timer.toc('initialized eval_cold_item for tf').tic()
 76 | 
 77 |     dropout_net = model.DeepCF(latent_rank_in=u_pref.shape[1],
 78 |                                user_content_rank=user_content.shape[1],
 79 |                                item_content_rank=item_content.shape[1],
 80 |                                model_select=model_select,
 81 |                                rank_out=rank_out)
 82 | 
 83 |     config = tf.ConfigProto(allow_soft_placement=True)
 84 | 
 85 |     with tf.device(args.model_device):
 86 |         dropout_net.build_model()
 87 | 
 88 |     with tf.device(args.inf_device):
 89 |         dropout_net.build_predictor(recall_at, n_scores_user)
 90 | 
 91 |     with tf.Session(config=config) as sess:
 92 |         tf_saver = None if _tf_ckpt_file is None else tf.train.Saver()
 93 |         train_writer = None if tb_log_path is None else tf.summary.FileWriter(
 94 |             tb_log_path + experiment, sess.graph)
 95 |         tf.global_variables_initializer().run()
 96 |         tf.local_variables_initializer().run()
 97 |         timer.toc('initialized tf')
 98 | 
 99 |         row_index = np.copy(user_indices)
100 |         n_step = 0
101 |         best_cold_user = 0
102 |         best_cold_item = 0
103 |         best_warm = 0
104 |         n_batch_trained = 0
105 |         best_step = 0
106 |         for epoch in range(num_epoch):
107 |             np.random.shuffle(row_index)
108 |             for b in utils.batch(row_index, user_batch_size):
109 |                 n_step += 1
110 |                 # prep targets
111 |                 target_users = np.repeat(b, n_scores_user)
112 |                 target_users_rand = np.repeat(np.arange(len(b)), n_scores_user)
113 |                 target_items_rand = [np.random.choice(v_pref.shape[0], n_scores_user) for _ in b]
114 |                 target_items_rand = np.array(target_items_rand).flatten()
115 |                 target_ui_rand = np.transpose(np.vstack([target_users_rand, target_items_rand]))
116 |                 [target_scores, target_items, random_scores] = sess.run(
117 |                     [dropout_net.tf_topk_vals, dropout_net.tf_topk_inds, dropout_net.preds_random],
118 |                     feed_dict={
119 |                         dropout_net.U_pref_tf: u_pref[b, :],
120 |                         dropout_net.V_pref_tf: v_pref,
121 |                         dropout_net.rand_target_ui: target_ui_rand
122 |                     }
123 |                 )
124 |                 # merge topN and randomN items per user
125 |                 target_scores = np.append(target_scores, random_scores)
126 |                 target_items = np.append(target_items, target_items_rand)
127 |                 target_users = np.append(target_users, target_users)
128 | 
129 |                 tf.local_variables_initializer().run()
130 |                 n_targets = len(target_scores)
131 |                 perm = np.random.permutation(n_targets)
132 |                 n_targets = min(n_targets, max_data_per_step)
133 |                 data_batch = [(n, min(n + data_batch_size, n_targets)) for n in xrange(0, n_targets, data_batch_size)]
134 |                 f_batch = 0
135 |                 for (start, stop) in data_batch:
136 |                     batch_perm = perm[start:stop]
137 |                     batch_users = target_users[batch_perm]
138 |                     batch_items = target_items[batch_perm]
139 |                     if dropout != 0:
140 |                         n_to_drop = int(np.floor(dropout * len(batch_perm)))
141 |                         perm_user = np.random.permutation(len(batch_perm))[:n_to_drop]
142 |                         perm_item = np.random.permutation(len(batch_perm))[:n_to_drop]
143 |                         batch_v_pref = np.copy(batch_items)
144 |                         batch_u_pref = np.copy(batch_users)
145 |                         batch_v_pref[perm_user] = v_pref_last
146 |                         batch_u_pref[perm_item] = u_pref_last
147 |                     else:
148 |                         batch_v_pref = batch_items
149 |                         batch_u_pref = batch_users
150 | 
151 |                     _, _, loss_out = sess.run(
152 |                         [dropout_net.preds, dropout_net.updates, dropout_net.loss],
153 |                         feed_dict={
154 |                             dropout_net.Uin: u_pref_expanded[batch_u_pref, :],
155 |                             dropout_net.Vin: v_pref_expanded[batch_v_pref, :],
156 |                             dropout_net.Ucontent: user_content[batch_users, :].todense(),
157 |                             dropout_net.Vcontent: item_content[batch_items, :].todense(),
158 |                             #
159 |                             dropout_net.target: target_scores[batch_perm],
160 |                             dropout_net.lr_placeholder: _lr,
161 |                             dropout_net.phase: 1
162 |                         }
163 |                     )
164 |                     f_batch += loss_out
165 |                     if np.isnan(f_batch):
166 |                         raise Exception('f is nan')
167 | 
168 |                 n_batch_trained += len(data_batch)
169 |                 if n_step % _decay_lr_every == 0:
170 |                     _lr = _lr_decay * _lr
171 |                     print('decayed lr:' + str(_lr))
172 |                 if n_step % eval_every == 0:
173 |                     recall_warm = utils.batch_eval_recall(
174 |                         sess, dropout_net.eval_preds_warm, eval_feed_dict=dropout_net.get_eval_dict,
175 |                         recall_k=recall_at, eval_data=eval_warm)
176 |                     recall_cold_user = utils.batch_eval_recall(
177 |                         sess, dropout_net.eval_preds_cold,
178 |                         eval_feed_dict=dropout_net.get_eval_dict,
179 |                         recall_k=recall_at, eval_data=eval_cold_user)
180 |                     recall_cold_item = utils.batch_eval_recall(
181 |                         sess, dropout_net.eval_preds_cold,
182 |                         eval_feed_dict=dropout_net.get_eval_dict,
183 |                         recall_k=recall_at, eval_data=eval_cold_item)
184 | 
185 |                     # checkpoint
186 |                     if np.sum(recall_warm + recall_cold_user + recall_cold_item) > np.sum(
187 |                                             best_warm + best_cold_user + best_cold_item):
188 |                         best_cold_user = recall_cold_user
189 |                         best_cold_item = recall_cold_item
190 |                         best_warm = recall_warm
191 |                         best_step = n_step
192 |                         if tf_saver is not None:
193 |                             tf_saver.save(sess, _tf_ckpt_file)
194 | 
195 |                     timer.toc('%d [%d]b [%d]tot f=%.2f best[%d]' % (
196 |                         n_step, len(data_batch), n_batch_trained, f_batch, best_step
197 |                     )).tic()
198 |                     print ('\t\t\t'+' '.join([('@'+str(i)).ljust(6) for i in recall_at]))
199 |                     print('warm start\t%s\ncold user\t%s\ncold item\t%s' % (
200 |                         ' '.join(['%.4f' % i for i in recall_warm]),
201 |                         ' '.join(['%.4f' % i for i in recall_cold_user]),
202 |                         ' '.join(['%.4f' % i for i in recall_cold_item])
203 |                     ))
204 |                     summaries = []
205 |                     for i, k in enumerate(recall_at):
206 |                         if k % 100 == 0:
207 |                             summaries.extend([
208 |                                 tf.Summary.Value(tag="recall@" + str(k) + " warm", simple_value=recall_warm[i]),
209 |                                 tf.Summary.Value(tag="recall@" + str(k) + " cold_user",
210 |                                                  simple_value=recall_cold_user[i]),
211 |                                 tf.Summary.Value(tag="recall@" + str(k) + " cold_item",
212 |                                                  simple_value=recall_cold_item[i])
213 |                             ])
214 |                     recall_summary = tf.Summary(value=summaries)
215 |                     if train_writer is not None:
216 |                         train_writer.add_summary(recall_summary, n_step)
217 | 
218 | 
def load_data(data_path):
    """Load latent factors, content features and the warm/cold eval splits.

    All inputs are read from fixed file names below ``data_path``: the
    pre-trained factor matrices from ``trained/warm``, the svmlight-format
    content features from the root folder, and the train / test CSV files
    from the ``warm`` sub-folder.

    Args:
        data_path: root directory of the downloaded dataset.

    Returns:
        dict with the keys
        ``u_pref`` / ``v_pref``: raw factor matrices, ``(n_users, 200)`` and
        ``(n_items, 200)`` float32;
        ``u_pref_scaled`` / ``v_pref_scaled``: second value returned by
        ``utils.prep_standardize`` for each matrix;
        ``user_content`` / ``item_content``: content features as LIL sparse
        matrices;
        ``user_indices``: unique user ids found in the training file;
        ``eval_warm`` / ``eval_cold_user`` / ``eval_cold_item``: objects
        returned by ``data.load_eval_data`` for the three test splits.
    """
    timer = utils.timer(name='main').tic()
    split_folder = os.path.join(data_path, 'warm')

    u_file = os.path.join(data_path, 'trained/warm/U.csv.bin')
    v_file = os.path.join(data_path, 'trained/warm/V.csv.bin')
    user_content_file = os.path.join(data_path, 'user_features_0based.txt')
    item_content_file = os.path.join(data_path, 'item_features_0based.txt')
    train_file = os.path.join(split_folder, 'train.csv')
    test_warm_file = os.path.join(split_folder, 'test_warm.csv')
    test_warm_iid_file = os.path.join(split_folder, 'test_warm_item_ids.csv')
    test_cold_user_file = os.path.join(split_folder, 'test_cold_user.csv')
    test_cold_user_iid_file = os.path.join(split_folder, 'test_cold_user_item_ids.csv')
    test_cold_item_file = os.path.join(split_folder, 'test_cold_item.csv')
    test_cold_item_iid_file = os.path.join(split_folder, 'test_cold_item_item_ids.csv')

    dat = {}
    # load preference data
    timer.tic()
    # The factor files are flat float32 dumps; reshape raises if the file size
    # does not match the module-level n_users / n_items row counts.
    u_pref = np.fromfile(u_file, dtype=np.float32).reshape(n_users, 200)
    v_pref = np.fromfile(v_file, dtype=np.float32).reshape(n_items, 200)
    dat['u_pref'] = u_pref
    dat['v_pref'] = v_pref

    timer.toc('loaded U:%s,V:%s' % (str(u_pref.shape), str(v_pref.shape))).tic()

    # pre-process
    # NOTE(review): only the second return value of prep_standardize is kept;
    # presumably it is the standardized matrix -- confirm in utils.
    _, dat['u_pref_scaled'] = utils.prep_standardize(u_pref)
    _, dat['v_pref_scaled'] = utils.prep_standardize(v_pref)
    timer.toc('standardized U,V').tic()

    # load content data
    timer.tic()
    user_content, _ = datasets.load_svmlight_file(user_content_file, zero_based=True, dtype=np.float32)
    dat['user_content'] = user_content.tolil(copy=False)
    timer.toc('loaded user feature sparse matrix: %s' % (str(user_content.shape))).tic()
    item_content, _ = datasets.load_svmlight_file(item_content_file, zero_based=True, dtype=np.float32)
    dat['item_content'] = item_content.tolil(copy=False)
    timer.toc('loaded item feature sparse matrix: %s' % (str(item_content.shape))).tic()

    # load split
    timer.tic()
    # header=None (not -1): the file has no header row, and negative header
    # values are rejected by current pandas releases.
    # The int32 table is flattened and re-viewed so that every 4-column row
    # becomes one structured record (uid, iid, inter, date).
    train = pd.read_csv(train_file, delimiter=",", header=None, dtype=np.int32).values.ravel().view(
        dtype=[('uid', np.int32), ('iid', np.int32), ('inter', np.int32), ('date', np.int32)])
    dat['user_indices'] = np.unique(train['uid'])
    timer.toc('read train triplets %s' % train.shape).tic()

    dat['eval_warm'] = data.load_eval_data(test_warm_file, test_warm_iid_file, name='eval_warm', cold=False,
                                           train_data=train)
    dat['eval_cold_user'] = data.load_eval_data(test_cold_user_file, test_cold_user_iid_file, name='eval_cold_user',
                                                cold=True,
                                                train_data=train)
    dat['eval_cold_item'] = data.load_eval_data(test_cold_item_file, test_cold_item_iid_file, name='eval_cold_item',
                                                cold=True,
                                                train_data=train)
    return dat
275 | 
276 | 
if __name__ == "__main__":
    # Command-line entry point: parse the options into the module-level
    # ``args`` and start training. main() takes no arguments, so it
    # presumably reads this global -- keep the name.
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Demo script to run DropoutNet on RecSys data",
    )

    # dataset location (the only mandatory option)
    parser.add_argument(
        '--data-dir', required=True, type=str,
        help='path to eval in the downloaded folder',
    )

    # device placement
    parser.add_argument(
        '--model-device', default='/gpu:0', type=str,
        help='device to use for training',
    )
    parser.add_argument(
        '--inf-device', default='/cpu:0', type=str,
        help='device to use for inference',
    )

    # optional outputs
    parser.add_argument(
        '--checkpoint-path', default=None, type=str,
        help='path to dump checkpoint data from TensorFlow',
    )
    parser.add_argument(
        '--tb-log-path', default=None, type=str,
        help='path to dump TensorBoard logs',
    )

    # model and optimisation settings
    parser.add_argument(
        '--model-select', default=[800, 400], type=int, nargs='+',
        help='specify the fully-connected architecture, starting from input,'
             ' numbers indicate numbers of hidden units',
    )
    parser.add_argument(
        '--rank', default=200, type=int,
        help='output rank of latent model',
    )
    parser.add_argument(
        '--dropout', default=0.5, type=float,
        help='DropoutNet dropout',
    )
    parser.add_argument(
        '--eval-every', default=2, type=int,
        help='evaluate every X user-batch',
    )
    parser.add_argument(
        '--lr', default=0.005, type=float,
        help='starting learning rate',
    )

    args = parser.parse_args()
    main()
300 | 


--------------------------------------------------------------------------------
/tf1/main_citeu.py:
--------------------------------------------------------------------------------
  1 | import utils
  2 | import numpy as np
  3 | import pandas as pd
  4 | import tensorflow as tf
  5 | import datetime
  6 | from sklearn import datasets
  7 | import data
  8 | import model
  9 | 
 10 | import argparse
 11 | import os
 12 | 
# Row counts used by load_data() to reshape the flat U / V factor files
# (each matrix is read as <count> x 200).
# NOTE(review): the "+ 1" presumably leaves room for ids that start at 1 -- confirm.
n_users = 5551 + 1
n_items = 16980 + 1

# Make only CUDA device 0 visible to this process.
os.environ["CUDA_VISIBLE_DEVICES"]="0"
 17 | 
 18 | def main():
 19 |     data_path = args.data_dir
 20 |     checkpoint_path = args.checkpoint_path
 21 |     tb_log_path = args.tb_log_path
 22 |     model_select = args.model_select
 23 | 
 24 |     rank_out = args.rank
 25 |     user_batch_size = 1000
 26 |     n_scores_user = 2500
 27 |     data_batch_size = 100
 28 |     dropout = args.dropout
 29 |     recall_at = range(10, 110, 10)
 30 |     eval_batch_size = 1000
 31 |     max_data_per_step = 2500000
 32 |     eval_every = args.eval_every
 33 |     num_epoch = 10
 34 | 
 35 |     _lr = args.lr
 36 |     _decay_lr_every = 50
 37 |     _lr_decay = 0.1
 38 | 
 39 |     experiment = '%s_%s' % (
 40 |         datetime.datetime.now().strftime('%Y-%m-%d-%H:%M:%S'),
 41 |         '-'.join(str(x / 100) for x in model_select) if model_select else 'simple'
 42 |     )
 43 |     _tf_ckpt_file = None if checkpoint_path is None else checkpoint_path + experiment + '/tf_checkpoint'
 44 | 
 45 |     print('running: ' + experiment)
 46 | 
 47 |     dat = load_data(data_path)
 48 |     u_pref_scaled = dat['u_pref_scaled']
 49 |     v_pref_scaled = dat['v_pref_scaled']
 50 |     eval_warm = dat['eval_warm']
 51 |     item_content = dat['item_content']
 52 |     u_pref = dat['u_pref']
 53 |     v_pref = dat['v_pref']
 54 |     user_indices = dat['user_indices']
 55 | 
 56 |     timer = utils.timer(name='main').tic()
 57 | 
 58 |     # append pref factors for faster dropout
 59 |     v_pref_expanded = np.vstack([v_pref_scaled, np.zeros_like(v_pref_scaled[0, :])])
 60 |     v_pref_last = v_pref_scaled.shape[0]
 61 |     u_pref_expanded = np.vstack([u_pref_scaled, np.zeros_like(u_pref_scaled[0, :])])
 62 |     u_pref_last = u_pref_scaled.shape[0]
 63 |     timer.toc('initialized numpy data for tf')
 64 | 
 65 |     # prep eval
 66 |     eval_batch_size = eval_batch_size
 67 |     timer.tic()
 68 |     eval_warm.init_tf(u_pref_scaled, v_pref_scaled, None, item_content, eval_batch_size)
 69 |     timer.toc('initialized eval_warm for tf').tic()
 70 | 
 71 |     dropout_net = model.DeepCF(latent_rank_in=u_pref.shape[1],
 72 |                                user_content_rank=0,
 73 |                                item_content_rank=item_content.shape[1],
 74 |                                model_select=model_select,
 75 |                                rank_out=rank_out)
 76 | 
 77 |     config = tf.ConfigProto(allow_soft_placement=True)
 78 | 
 79 |     with tf.device(args.model_device):
 80 |         dropout_net.build_model()
 81 | 
 82 |     with tf.device(args.inf_device):
 83 |         dropout_net.build_predictor(recall_at, n_scores_user)
 84 | 
 85 |     with tf.Session(config=config) as sess:
 86 |         tf_saver = None if _tf_ckpt_file is None else tf.train.Saver()
 87 |         train_writer = None if tb_log_path is None else tf.summary.FileWriter(
 88 |             tb_log_path + experiment, sess.graph)
 89 |         tf.global_variables_initializer().run()
 90 |         tf.local_variables_initializer().run()
 91 |         timer.toc('initialized tf')
 92 | 
 93 |         row_index = np.copy(user_indices)
 94 |         n_step = 0
 95 |         best_warm = 0
 96 |         n_batch_trained = 0
 97 |         best_step = 0
 98 |         for epoch in range(num_epoch):
 99 |             np.random.shuffle(row_index)
100 |             for b in utils.batch(row_index, user_batch_size):
101 |                 n_step += 1
102 |                 # prep targets
103 |                 target_users = np.repeat(b, n_scores_user)
104 |                 target_users_rand = np.repeat(np.arange(len(b)), n_scores_user)
105 |                 target_items_rand = [np.random.choice(v_pref.shape[0], n_scores_user) for _ in b]
106 |                 target_items_rand = np.array(target_items_rand).flatten()
107 |                 target_ui_rand = np.transpose(np.vstack([target_users_rand, target_items_rand]))
108 |                 [target_scores, target_items, random_scores] = sess.run(
109 |                     [dropout_net.tf_topk_vals, dropout_net.tf_topk_inds, dropout_net.preds_random],
110 |                     feed_dict={
111 |                         dropout_net.U_pref_tf: u_pref[b, :],
112 |                         dropout_net.V_pref_tf: v_pref,
113 |                         dropout_net.rand_target_ui: target_ui_rand
114 |                     }
115 |                 )
116 |                 # merge topN and randomN items per user
117 |                 target_scores = np.append(target_scores, random_scores)
118 |                 target_items = np.append(target_items, target_items_rand)
119 |                 target_users = np.append(target_users, target_users)
120 | 
121 |                 tf.local_variables_initializer().run()
122 |                 n_targets = len(target_scores)
123 |                 perm = np.random.permutation(n_targets)
124 |                 n_targets = min(n_targets, max_data_per_step)
125 |                 data_batch = [(n, min(n + data_batch_size, n_targets)) for n in xrange(0, n_targets, data_batch_size)]
126 |                 f_batch = 0
127 |                 for (start, stop) in data_batch:
128 |                     batch_perm = perm[start:stop]
129 |                     batch_users = target_users[batch_perm]
130 |                     batch_items = target_items[batch_perm]
131 |                     if dropout != 0:
132 |                         n_to_drop = int(np.floor(dropout * len(batch_perm)))
133 |                         perm_user = np.random.permutation(len(batch_perm))[:n_to_drop]
134 |                         perm_item = np.random.permutation(len(batch_perm))[:n_to_drop]
135 |                         batch_v_pref = np.copy(batch_items)
136 |                         batch_u_pref = np.copy(batch_users)
137 |                         batch_v_pref[perm_user] = v_pref_last
138 |                         batch_u_pref[perm_item] = u_pref_last
139 |                     else:
140 |                         batch_v_pref = batch_items
141 |                         batch_u_pref = batch_users
142 | 
143 |                     _, _, loss_out = sess.run(
144 |                         [dropout_net.preds, dropout_net.updates, dropout_net.loss],
145 |                         feed_dict={
146 |                             dropout_net.Uin: u_pref_expanded[batch_u_pref, :],
147 |                             dropout_net.Vin: v_pref_expanded[batch_v_pref, :],
148 |                             dropout_net.Vcontent: item_content[batch_items, :].todense(),
149 |                             #
150 |                             dropout_net.target: target_scores[batch_perm],
151 |                             dropout_net.lr_placeholder: _lr,
152 |                             dropout_net.phase: 1
153 |                         }
154 |                     )
155 |                     f_batch += loss_out
156 |                     if np.isnan(f_batch):
157 |                         raise Exception('f is nan')
158 | 
159 |                 n_batch_trained += len(data_batch)
160 |                 if n_step % _decay_lr_every == 0:
161 |                     _lr = _lr_decay * _lr
162 |                     print('decayed lr:' + str(_lr))
163 |                 if n_step % eval_every == 0:
164 |                     recall_warm = utils.batch_eval_recall(
165 |                         sess, dropout_net.eval_preds_warm, eval_feed_dict=dropout_net.get_eval_dict,
166 |                         recall_k=recall_at, eval_data=eval_warm)
167 | 
168 |                     # checkpoint
169 |                     if np.sum(recall_warm) > np.sum(best_warm):
170 |                         best_warm = recall_warm
171 |                         best_step = n_step
172 |                         if tf_saver is not None:
173 |                             tf_saver.save(sess, _tf_ckpt_file)
174 | 
175 |                     timer.toc('%d [%d]b [%d]tot f=%.2f best[%d]' % (
176 |                         n_step, len(data_batch), n_batch_trained, f_batch, best_step
177 |                     )).tic()
178 |                     print ('\t\t\t'+' '.join([('@'+str(i)).ljust(6) for i in recall_at]))
179 |                     print('warm start\t%s' % (
180 |                         ' '.join(['%.4f' % i for i in recall_warm]),
181 |                     ))
182 |                     summaries = []
183 |                     for i, k in enumerate(recall_at):
184 |                         if k % 100 == 0:
185 |                             summaries.extend([
186 |                                 tf.Summary.Value(tag="recall@" + str(k) + " warm", simple_value=recall_warm[i]),
187 |                             ])
188 |                     recall_summary = tf.Summary(value=summaries)
189 |                     if train_writer is not None:
190 |                         train_writer.add_summary(recall_summary, n_step)
191 | 
192 | 
def load_data(data_path):
    """Load latent factors, item content and the warm evaluation split.

    All inputs are read from fixed file names below ``data_path``: the
    pre-trained factor matrices from ``trained/warm``, the svmlight-format
    item features from the root folder, and the train / test CSV files from
    the ``warm`` sub-folder.

    Args:
        data_path: root directory of the downloaded dataset.

    Returns:
        dict with the keys
        ``u_pref`` / ``v_pref``: raw factor matrices, ``(n_users, 200)`` and
        ``(n_items, 200)``;
        ``u_pref_scaled`` / ``v_pref_scaled``: second value returned by
        ``utils.prep_standardize`` for each matrix;
        ``item_content``: item features as a LIL sparse matrix;
        ``user_indices``: unique user ids found in the training file;
        ``eval_warm``: object returned by ``data.load_eval_data``.
    """
    timer = utils.timer(name='main').tic()
    split_folder = os.path.join(data_path, 'warm')

    u_file = os.path.join(data_path, 'trained/warm/U.bin')
    v_file = os.path.join(data_path, 'trained/warm/V.bin')
    item_content_file = os.path.join(data_path, 'item_features_0based.txt')
    train_file = os.path.join(split_folder, 'train.csv')
    test_warm_file = os.path.join(split_folder, 'test.csv')
    test_warm_iid_file = os.path.join(split_folder, 'test_item_ids.csv')

    dat = {}
    # load preference data
    timer.tic()
    # The factor files are flat big-endian float32 dumps ('>f4'); reshape
    # raises if the file size does not match n_users / n_items.
    u_pref = np.fromfile(u_file, dtype='>f4').reshape(n_users, 200)
    v_pref = np.fromfile(v_file, dtype='>f4').reshape(n_items, 200)
    dat['u_pref'] = u_pref
    dat['v_pref'] = v_pref

    timer.toc('loaded U:%s,V:%s' % (str(u_pref.shape), str(v_pref.shape))).tic()

    # pre-process
    # NOTE(review): only the second return value of prep_standardize is kept;
    # presumably it is the standardized matrix -- confirm in utils.
    _, dat['u_pref_scaled'] = utils.prep_standardize(u_pref)
    _, dat['v_pref_scaled'] = utils.prep_standardize(v_pref)
    timer.toc('standardized U,V').tic()

    # load content data
    timer.tic()
    item_content, _ = datasets.load_svmlight_file(item_content_file, zero_based=True, dtype=np.float32)
    dat['item_content'] = item_content.tolil(copy=False)
    timer.toc('loaded item feature sparse matrix: %s' % (str(item_content.shape))).tic()

    # load split
    timer.tic()
    # header=None (not -1): the file has no header row, and negative header
    # values are rejected by current pandas releases.
    # The int32 table is flattened and re-viewed so that every 3-column row
    # becomes one structured record (uid, iid, inter).
    train = pd.read_csv(train_file, delimiter=",", header=None, dtype=np.int32).values.ravel().view(
        dtype=[('uid', np.int32), ('iid', np.int32), ('inter', np.int32)])
    dat['user_indices'] = np.unique(train['uid'])
    timer.toc('read train triplets %s' % train.shape).tic()

    dat['eval_warm'] = data.load_eval_data(test_warm_file, test_warm_iid_file, name='eval_warm', cold=False,
                                           train_data=train,citeu=True)
    return dat
235 | 
236 | 
if __name__ == "__main__":
    # Command-line entry point: parse the options into the module-level
    # ``args`` namespace that main() reads, then start training.
    # The description names the CiteULike dataset this script is sized for
    # (it previously carried the RecSys text copied from main.py).
    parser = argparse.ArgumentParser(description="Demo script to run DropoutNet on CiteULike data",
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--data-dir', type=str, required=True, help='path to eval in the downloaded folder')

    parser.add_argument('--model-device', type=str, default='/gpu:0', help='device to use for training')
    parser.add_argument('--inf-device', type=str, default='/cpu:0', help='device to use for inference')
    parser.add_argument('--checkpoint-path', type=str, default=None,
                        help='path to dump checkpoint data from TensorFlow')
    parser.add_argument('--tb-log-path', type=str, default=None,
                        help='path to dump TensorBoard logs')
    parser.add_argument('--model-select', nargs='+', type=int,
                        default=[200],
                        help='specify the fully-connected architecture, starting from input,'
                             ' numbers indicate numbers of hidden units',
                        )
    parser.add_argument('--rank', type=int, default=200, help='output rank of latent model')
    parser.add_argument('--dropout', type=float, default=0.5, help='DropoutNet dropout')
    parser.add_argument('--eval-every', type=int, default=1, help='evaluate every X user-batch')
    parser.add_argument('--lr', type=float, default=0.05, help='starting learning rate')

    args = parser.parse_args()
    main()
260 | 


--------------------------------------------------------------------------------
/tf1/main_cold_citeu.py:
--------------------------------------------------------------------------------
  1 | import utils
  2 | import numpy as np
  3 | import pandas as pd
  4 | import tensorflow as tf
  5 | import datetime
  6 | from sklearn import datasets
  7 | import data
  8 | import model
  9 | import scipy.sparse as sp
 10 | 
 11 | import argparse
 12 | import os
 13 | 
# Dataset dimensions as module-level constants.
# NOTE(review): presumably the row counts of the user / item factor matrices,
# with "+ 1" leaving room for ids that start at 1 -- confirm against load_data.
n_users = 5551 + 1
n_items = 16980 + 1
 16 | 
 17 | def main():
 18 |     data_path = args.data_dir
 19 |     checkpoint_path = args.checkpoint_path
 20 |     tb_log_path = args.tb_log_path
 21 |     model_select = args.model_select
 22 | 
 23 |     rank_out = args.rank
 24 |     user_batch_size = 1000
 25 |     n_scores_user = 2500
 26 |     data_batch_size = 100
 27 |     dropout = args.dropout
 28 |     recall_at = range(10, 110, 10)
 29 |     eval_batch_size = 1000
 30 |     max_data_per_step = 2500000
 31 |     eval_every = args.eval_every
 32 |     num_epoch = 500
 33 | 
 34 |     _lr = args.lr
 35 |     _decay_lr_every = 100
 36 |     _lr_decay = 0.1
 37 | 
 38 |     experiment = '%s_%s' % (
 39 |         datetime.datetime.now().strftime('%Y-%m-%d-%H:%M:%S'),
 40 |         '-'.join(str(x / 100) for x in model_select) if model_select else 'simple'
 41 |     )
 42 |     _tf_ckpt_file = None if checkpoint_path is None else checkpoint_path + experiment + '/tf_checkpoint'
 43 | 
 44 |     print('running: ' + experiment)
 45 | 
 46 |     dat = load_data(data_path)
 47 |     u_pref_scaled = dat['u_pref_scaled']
 48 |     v_pref_scaled = dat['v_pref_scaled']
 49 |     eval_cold = dat['eval_cold']
 50 |     item_content = dat['item_content']
 51 |     u_pref = dat['u_pref']
 52 |     v_pref = dat['v_pref']
 53 |     user_indices = dat['user_indices']
 54 | 
 55 |     timer = utils.timer(name='main').tic()
 56 | 
 57 |     # append pref factors for faster dropout
 58 |     v_pref_expanded = np.vstack([v_pref_scaled, np.zeros_like(v_pref_scaled[0, :])])
 59 |     v_pref_last = v_pref_scaled.shape[0]
 60 |     u_pref_expanded = np.vstack([u_pref_scaled, np.zeros_like(u_pref_scaled[0, :])])
 61 |     u_pref_last = u_pref_scaled.shape[0]
 62 |     timer.toc('initialized numpy data for tf')
 63 | 
 64 |     # prep eval
 65 |     eval_batch_size = eval_batch_size
 66 |     timer.tic()
 67 |     eval_cold.init_tf(u_pref_scaled, v_pref_scaled, None, item_content, eval_batch_size)
 68 |     timer.toc('initialized eval for tf').tic()
 69 | 
 70 |     dropout_net = model.DeepCF(latent_rank_in=u_pref.shape[1],
 71 |                                user_content_rank=0,
 72 |                                item_content_rank=item_content.shape[1],
 73 |                                model_select=model_select,
 74 |                                rank_out=rank_out)
 75 | 
 76 |     config = tf.ConfigProto(allow_soft_placement=True)
 77 | 
 78 |     with tf.device(args.model_device):
 79 |         dropout_net.build_model()
 80 | 
 81 |     with tf.device(args.inf_device):
 82 |         dropout_net.build_predictor(recall_at, n_scores_user)
 83 | 
 84 |     if args.progress:
 85 |         from tqdm import tqdm
 86 |     with tf.Session(config=config) as sess:
 87 |         tf_saver = None if _tf_ckpt_file is None else tf.train.Saver()
 88 |         train_writer = None if tb_log_path is None else tf.summary.FileWriter(
 89 |             tb_log_path + experiment, sess.graph)
 90 |         tf.global_variables_initializer().run()
 91 |         tf.local_variables_initializer().run()
 92 |         timer.toc('initialized tf')
 93 | 
 94 |         row_index = np.copy(user_indices)
 95 |         n_step = 0
 96 |         best_cold = 0
 97 |         n_batch_trained = 0
 98 |         best_step = 0
 99 |         for epoch in range(num_epoch):
100 |             np.random.shuffle(row_index)
101 |             for b in utils.batch(row_index, user_batch_size):
102 |                 n_step += 1
103 |                 # prep targets
104 |                 target_users = np.repeat(b, n_scores_user)
105 |                 target_users_rand = np.repeat(np.arange(len(b)), n_scores_user)
106 |                 target_items_rand = [np.random.choice(v_pref.shape[0], n_scores_user) for _ in b]
107 |                 target_items_rand = np.array(target_items_rand).flatten()
108 |                 target_ui_rand = np.transpose(np.vstack([target_users_rand, target_items_rand]))
109 |                 [target_scores, target_items, random_scores] = sess.run(
110 |                     [dropout_net.tf_topk_vals, dropout_net.tf_topk_inds, dropout_net.preds_random],
111 |                     feed_dict={
112 |                         dropout_net.U_pref_tf: u_pref[b, :],
113 |                         dropout_net.V_pref_tf: v_pref,
114 |                         dropout_net.rand_target_ui: target_ui_rand
115 |                     }
116 |                 )
117 |                 # merge topN and randomN items per user
118 |                 target_scores = np.append(target_scores, random_scores)
119 |                 target_items = np.append(target_items, target_items_rand)
120 |                 target_users = np.append(target_users, target_users)
121 | 
122 |                 tf.local_variables_initializer().run()
123 |                 n_targets = len(target_scores)
124 |                 perm = np.random.permutation(n_targets)
125 |                 n_targets = min(n_targets, max_data_per_step)
126 |                 data_batch = [(n, min(n + data_batch_size, n_targets)) for n in xrange(0, n_targets, data_batch_size)]
127 |                 f_batch = 0
128 |                 gen = data_batch
129 |                 if args.progress:
130 |                     gen = tqdm(gen)
131 |                 for (start, stop) in gen:
132 |                     batch_perm = perm[start:stop]
133 |                     batch_users = target_users[batch_perm]
134 |                     batch_items = target_items[batch_perm]
135 |                     if dropout != 0:
136 |                         n_to_drop = int(np.floor(dropout * len(batch_perm)))
137 |                         perm_user = np.random.permutation(len(batch_perm))[:n_to_drop]
138 |                         perm_item = np.random.permutation(len(batch_perm))[:n_to_drop]
139 |                         batch_v_pref = np.copy(batch_items)
140 |                         batch_u_pref = np.copy(batch_users)
141 |                         batch_v_pref[perm_user] = v_pref_last
142 |                         batch_u_pref[perm_item] = u_pref_last
143 |                     else:
144 |                         batch_v_pref = batch_items
145 |                         batch_u_pref = batch_users
146 |                     item_content_batch = item_content[batch_items, :]
147 |                     if sp.issparse(item_content):
148 |                         item_content_batch = item_content_batch.todense()
149 | 
150 |                     _, _, loss_out = sess.run(
151 |                         [dropout_net.preds, dropout_net.updates, dropout_net.loss],
152 |                         feed_dict={
153 |                             dropout_net.Uin: u_pref_expanded[batch_u_pref, :],
154 |                             dropout_net.Vin: v_pref_expanded[batch_v_pref, :],
155 |                             dropout_net.Vcontent: item_content_batch,
156 |                             #
157 |                             dropout_net.target: target_scores[batch_perm],
158 |                             dropout_net.lr_placeholder: _lr,
159 |                             dropout_net.phase: 1
160 |                         }
161 |                     )
162 |                     f_batch += loss_out
163 |                     if np.isnan(f_batch):
164 |                         raise Exception('f is nan')
165 | 
166 |                 n_batch_trained += len(data_batch)
167 |                 if n_step % _decay_lr_every == 0:
168 |                     _lr = _lr_decay * _lr
169 |                     print('decayed lr:' + str(_lr))
170 |                 if n_step % eval_every == 0:
171 |                     recall_cold = utils.batch_eval_recall(
172 |                         sess, dropout_net.eval_preds_cold, eval_feed_dict=dropout_net.get_eval_dict,
173 |                         recall_k=recall_at, eval_data=eval_cold)
174 | 
175 |                     # checkpoint
176 |                     if np.sum(recall_cold) > np.sum(best_cold):
177 |                         best_cold = recall_cold
178 |                         best_step = n_step
179 |                         if tf_saver is not None:
180 |                             tf_saver.save(sess, _tf_ckpt_file)
181 | 
182 |                     timer.toc('%d [%d]b [%d]tot f=%.2f best[%d]' % (
183 |                         n_step, len(data_batch), n_batch_trained, f_batch, best_step
184 |                     )).tic()
185 |                     print ('\t\t\t'+' '.join([('@'+str(i)).ljust(6) for i in recall_at]))
186 |                     print('cold start\t%s' % (
187 |                         ' '.join(['%.4f' % i for i in recall_cold]),
188 |                     ))
189 |                     print('best epoch[%d]\t%s' % (
190 |                         best_step,
191 |                         ' '.join(['%.4f' % i for i in best_cold] ),
192 |                     ))
193 |                     summaries = []
194 |                     for i, k in enumerate(recall_at):
195 |                         if k % 100 == 0:
196 |                             summaries.extend([
197 |                                 tf.Summary.Value(tag="recall@" + str(k) + " cold", simple_value=recall_cold[i]),
198 |                             ])
199 |                     recall_summary = tf.Summary(value=summaries)
200 |                     if train_writer is not None:
201 |                         train_writer.add_summary(recall_summary, n_step)
202 | 
def tfidf(R):
    """
    log-scaled tf-idf weighting of a sparse matrix

    R: scipy sparse matrix; its row count is used as the idf numerator and the
       per-column count of non-zero entries as the document frequency
    returns: sparse matrix shaped like R, holding log(R + 1) on the stored
       entries with every column scaled by log(n_rows / (1 + doc frequency))
    """
    n_rows, n_cols = R.shape[0], R.shape[1]

    # 0/1 indicator of the non-zero entries of R
    presence = R.copy()
    presence[presence != 0] = 1.0

    # term-frequency part: log(1 + value), applied to the stored values only
    log_counts = R + presence
    log_counts.data = np.log(log_counts.data)

    # idf part, placed on a diagonal so the product below rescales each column
    doc_freq = np.sum(presence, 0)
    col_scale = sp.spdiags(np.log(n_rows / (1 + doc_freq)), 0, n_cols, n_cols)
    return log_counts * col_scale
215 | 
def load_data(data_path):
    """
    load everything the cold-start experiment needs from disk

    data_path: root folder containing the 'cold' split, the 'trained/cold'
        WRMF factor files and 'item_features_0based.txt'
    returns: dict with keys
        u_pref / v_pref: factor matrices reshaped to (n_users, 200) / (n_items, 200)
        u_pref_scaled / v_pref_scaled: second output of utils.prep_standardize on them
        item_content: standardized 300-component SVD projection of the tf-idf features
        user_indices: sorted unique user ids found in the training triplets
        eval_cold: object returned by data.load_eval_data for the cold test split
    """
    timer = utils.timer(name='main').tic()
    split_folder = os.path.join(data_path, 'cold')

    u_file = os.path.join(data_path, 'trained/cold/WRMF_cold_rank200_reg1_alpha10_iter10.U.txt')
    v_file = os.path.join(data_path, 'trained/cold/WRMF_cold_rank200_reg1_alpha10_iter10.V.txt')
    item_content_file = os.path.join(data_path, 'item_features_0based.txt')
    train_file = os.path.join(split_folder, 'train.csv')
    test_cold_file = os.path.join(split_folder, 'test.csv')
    test_cold_iid_file = os.path.join(split_folder, 'test_item_ids.csv')

    dat = {}
    # load preference data (factors are stored as whitespace-delimited text)
    timer.tic()
    u_pref = np.loadtxt(u_file).reshape(n_users, 200)
    v_pref = np.loadtxt(v_file).reshape(n_items, 200)

    dat['u_pref'] = u_pref
    dat['v_pref'] = v_pref

    timer.toc('loaded U:%s,V:%s' % (str(u_pref.shape), str(v_pref.shape))).tic()

    # pre-process; only the second return value of prep_standardize is kept
    _, dat['u_pref_scaled'] = utils.prep_standardize(u_pref)
    _, dat['v_pref_scaled'] = utils.prep_standardize(v_pref)

    timer.toc('standardized U,V').tic()

    # load content data
    timer.tic()
    item_content, _ = datasets.load_svmlight_file(item_content_file, zero_based=True, dtype=np.float32)

    item_content = tfidf(item_content)

    # compress the tf-idf features to 300 dense components before standardizing
    from sklearn.utils.extmath import randomized_svd
    u, s, _ = randomized_svd(item_content, n_components=300, n_iter=5)
    item_content = u * s
    _, item_content = utils.prep_standardize(item_content)

    if sp.issparse(item_content):
        dat['item_content'] = item_content.tolil(copy=False)
    else:
        dat['item_content'] = item_content
    timer.toc('loaded item feature sparse matrix: %s' % (str(item_content.shape))).tic()

    # load split
    timer.tic()
    # header=None marks the file as header-less; the former header=-1 spelling
    # is rejected by current pandas releases
    train = pd.read_csv(train_file, delimiter=",", header=None, dtype=np.int32).values.ravel().view(
        dtype=[('uid', np.int32), ('iid', np.int32), ('inter', np.int32)])
    dat['user_indices'] = np.unique(train['uid'])
    timer.toc('read train triplets %s' % train.shape).tic()

    dat['eval_cold'] = data.load_eval_data(test_cold_file, test_cold_iid_file, name='eval_cold', cold=True,
                                           train_data=train, citeu=True)
    return dat
274 | 
275 | 
if __name__ == "__main__":
    # command-line entry point: build the option parser, echo the parsed
    # settings and start training
    parser = argparse.ArgumentParser(description="Demo script to run DropoutNet on RecSys data",
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--data-dir', type=str, required=True, help='path to eval in the downloaded folder')

    parser.add_argument('--model-device', type=str, default='/gpu:0', help='device to use for training')
    parser.add_argument('--inf-device', type=str, default='/cpu:0', help='device to use for inference')
    parser.add_argument('--checkpoint-path', type=str, default=None,
                        help='path to dump checkpoint data from TensorFlow')
    parser.add_argument('--tb-log-path', type=str, default=None,
                        help='path to dump TensorBoard logs')
    parser.add_argument('--model-select', nargs='+', type=int,
                        default=[200],
                        help='specify the fully-connected architecture, starting from input,'
                             ' numbers indicate numbers of hidden units',
                        )
    parser.add_argument('--rank', type=int, default=200, help='output rank of latent model')
    parser.add_argument('--dropout', type=float, default=0.5, help='DropoutNet dropout')
    parser.add_argument('--eval-every', type=int, default=1, help='evaluate every X user-batch')
    parser.add_argument('--lr', type=float, default=0.005, help='starting learning rate')
    parser.add_argument('--progress', action='store_true', help='show tqdm progress (requires tqdm) during training')

    # `args` must stay a module-level name: main() reads its settings from it.
    # One strict parse is enough; parse_args already rejects unknown options,
    # so the former follow-up parse_known_args() call was redundant.
    args = parser.parse_args()
    for key, value in vars(args).items():
        print(key + ":" + str(value))
    main()
303 | 


--------------------------------------------------------------------------------
/tf1/main_warm_citeu.py:
--------------------------------------------------------------------------------
  1 | import utils
  2 | import numpy as np
  3 | import pandas as pd
  4 | import tensorflow as tf
  5 | import datetime
  6 | from sklearn import datasets
  7 | import data
  8 | import model
  9 | import scipy.sparse as sp
 10 | 
 11 | import argparse
 12 | import os
 13 | 
 14 | 
# row counts of the user / item factor matrices: load_data reshapes the loaded
# U and V files to (n_users, 200) and (n_items, 200)
# NOTE(review): the "+ 1" presumably reserves one extra row beyond the 5551 users
# and 16980 items (e.g. for ids that do not start at 0) -- confirm against the data files
n_users = 5551 + 1
n_items = 16980 + 1
 17 | 
 18 | def main():
 19 |     data_path = args.data_dir
 20 |     checkpoint_path = args.checkpoint_path
 21 |     tb_log_path = args.tb_log_path
 22 |     model_select = args.model_select
 23 | 
 24 |     rank_out = args.rank
 25 |     user_batch_size = 1000
 26 |     n_scores_user = 2500
 27 |     data_batch_size = 100
 28 |     dropout = args.dropout
 29 |     recall_at = range(10, 110, 10)
 30 |     eval_batch_size = 1000
 31 |     max_data_per_step = 2500000
 32 |     eval_every = args.eval_every
 33 |     num_epoch = 200
 34 | 
 35 |     _lr = args.lr
 36 |     _decay_lr_every = 100
 37 |     _lr_decay = 0.1
 38 | 
 39 |     experiment = '%s_%s' % (
 40 |         datetime.datetime.now().strftime('%Y-%m-%d-%H:%M:%S'),
 41 |         '-'.join(str(x / 100) for x in model_select) if model_select else 'simple'
 42 |     )
 43 |     _tf_ckpt_file = None if checkpoint_path is None else checkpoint_path + experiment + '/tf_checkpoint'
 44 | 
 45 |     print('running: ' + experiment)
 46 | 
 47 |     dat = load_data(data_path)
 48 |     u_pref_scaled = dat['u_pref_scaled']
 49 |     v_pref_scaled = dat['v_pref_scaled']
 50 |     eval_warm = dat['eval_warm']
 51 |     item_content = dat['item_content']
 52 |     u_pref = dat['u_pref']
 53 |     v_pref = dat['v_pref']
 54 |     user_indices = dat['user_indices']
 55 | 
 56 |     timer = utils.timer(name='main').tic()
 57 | 
 58 |     # append pref factors for faster dropout
 59 |     v_pref_expanded = np.vstack([v_pref_scaled, np.zeros_like(v_pref_scaled[0, :])])
 60 |     v_pref_last = v_pref_scaled.shape[0]
 61 |     u_pref_expanded = np.vstack([u_pref_scaled, np.zeros_like(u_pref_scaled[0, :])])
 62 |     u_pref_last = u_pref_scaled.shape[0]
 63 |     timer.toc('initialized numpy data for tf')
 64 | 
 65 |     # prep eval
 66 |     eval_batch_size = eval_batch_size
 67 |     timer.tic()
 68 |     eval_warm.init_tf(u_pref_scaled, v_pref_scaled, None, item_content, eval_batch_size)
 69 |     timer.toc('initialized eval for tf').tic()
 70 | 
 71 |     dropout_net = model.DeepCF(latent_rank_in=u_pref.shape[1],
 72 |                                user_content_rank=0,
 73 |                                item_content_rank=item_content.shape[1],
 74 |                                model_select=model_select,
 75 |                                rank_out=rank_out)
 76 | 
 77 |     config = tf.ConfigProto(allow_soft_placement=True)
 78 | 
 79 |     with tf.device(args.model_device):
 80 |         dropout_net.build_model()
 81 | 
 82 |     with tf.device(args.inf_device):
 83 |         dropout_net.build_predictor(recall_at, n_scores_user)
 84 | 
 85 |     if args.progress:
 86 |         from tqdm import tqdm
 87 |     with tf.Session(config=config) as sess:
 88 |         tf_saver = None if _tf_ckpt_file is None else tf.train.Saver()
 89 |         train_writer = None if tb_log_path is None else tf.summary.FileWriter(
 90 |             tb_log_path + experiment, sess.graph)
 91 |         tf.global_variables_initializer().run()
 92 |         tf.local_variables_initializer().run()
 93 |         timer.toc('initialized tf')
 94 | 
 95 |         row_index = np.copy(user_indices)
 96 |         n_step = 0
 97 |         best_warm = 0
 98 |         n_batch_trained = 0
 99 |         best_step = 0
100 |         for epoch in range(num_epoch):
101 |             np.random.shuffle(row_index)
102 |             for b in utils.batch(row_index, user_batch_size):
103 |                 n_step += 1
104 |                 # prep targets
105 |                 target_users = np.repeat(b, n_scores_user)
106 |                 target_users_rand = np.repeat(np.arange(len(b)), n_scores_user)
107 |                 target_items_rand = [np.random.choice(v_pref.shape[0], n_scores_user) for _ in b]
108 |                 target_items_rand = np.array(target_items_rand).flatten()
109 |                 target_ui_rand = np.transpose(np.vstack([target_users_rand, target_items_rand]))
110 |                 [target_scores, target_items, random_scores] = sess.run(
111 |                     [dropout_net.tf_topk_vals, dropout_net.tf_topk_inds, dropout_net.preds_random],
112 |                     feed_dict={
113 |                         dropout_net.U_pref_tf: u_pref[b, :],
114 |                         dropout_net.V_pref_tf: v_pref,
115 |                         dropout_net.rand_target_ui: target_ui_rand
116 |                     }
117 |                 )
118 |                 # merge topN and randomN items per user
119 |                 target_scores = np.append(target_scores, random_scores)
120 |                 target_items = np.append(target_items, target_items_rand)
121 |                 target_users = np.append(target_users, target_users)
122 | 
123 |                 tf.local_variables_initializer().run()
124 |                 n_targets = len(target_scores)
125 |                 perm = np.random.permutation(n_targets)
126 |                 n_targets = min(n_targets, max_data_per_step)
127 |                 data_batch = [(n, min(n + data_batch_size, n_targets)) for n in xrange(0, n_targets, data_batch_size)]
128 |                 f_batch = 0
129 |                 gen = data_batch
130 |                 if args.progress:
131 |                     gen = tqdm(gen)
132 |                 for (start, stop) in gen:
133 |                     batch_perm = perm[start:stop]
134 |                     batch_users = target_users[batch_perm]
135 |                     batch_items = target_items[batch_perm]
136 |                     if dropout != 0:
137 |                         n_to_drop = int(np.floor(dropout * len(batch_perm)))
138 |                         perm_user = np.random.permutation(len(batch_perm))[:n_to_drop]
139 |                         perm_item = np.random.permutation(len(batch_perm))[:n_to_drop]
140 |                         batch_v_pref = np.copy(batch_items)
141 |                         batch_u_pref = np.copy(batch_users)
142 |                         batch_v_pref[perm_user] = v_pref_last
143 |                         batch_u_pref[perm_item] = u_pref_last
144 |                     else:
145 |                         batch_v_pref = batch_items
146 |                         batch_u_pref = batch_users
147 |                     item_content_batch = item_content[batch_items, :]
148 |                     if sp.issparse(item_content):
149 |                         item_content_batch = item_content_batch.todense()
150 | 
151 |                     _, _, loss_out = sess.run(
152 |                         [dropout_net.preds, dropout_net.updates, dropout_net.loss],
153 |                         feed_dict={
154 |                             dropout_net.Uin: u_pref_expanded[batch_u_pref, :],
155 |                             dropout_net.Vin: v_pref_expanded[batch_v_pref, :],
156 |                             dropout_net.Vcontent: item_content_batch,
157 |                             #
158 |                             dropout_net.target: target_scores[batch_perm],
159 |                             dropout_net.lr_placeholder: _lr,
160 |                             dropout_net.phase: 1
161 |                         }
162 |                     )
163 |                     f_batch += loss_out
164 |                     if np.isnan(f_batch):
165 |                         raise Exception('f is nan')
166 | 
167 |                 n_batch_trained += len(data_batch)
168 |                 if n_step % _decay_lr_every == 0:
169 |                     _lr = _lr_decay * _lr
170 |                     print('decayed lr:' + str(_lr))
171 |                 if n_step % eval_every == 0:
172 |                     recall_warm = utils.batch_eval_recall(
173 |                         sess, dropout_net.eval_preds_warm, eval_feed_dict=dropout_net.get_eval_dict,
174 |                         recall_k=recall_at, eval_data=eval_warm)
175 | 
176 |                     # checkpoint
177 |                     if np.sum(recall_warm) > np.sum(best_warm):
178 |                         best_warm = recall_warm
179 |                         best_step = n_step
180 |                         if tf_saver is not None:
181 |                             tf_saver.save(sess, _tf_ckpt_file)
182 | 
183 |                     timer.toc('%d [%d]b [%d]tot f=%.2f best[%d]' % (
184 |                         n_step, len(data_batch), n_batch_trained, f_batch, best_step
185 |                     )).tic()
186 |                     print ('\t\t\t'+' '.join([('@'+str(i)).ljust(6) for i in recall_at]))
187 |                     print('warm start\t%s' % (
188 |                         ' '.join(['%.4f' % i for i in recall_warm]),
189 |                     ))
190 |                     print('best epoch[%d]\t%s' % (
191 |                         best_step,
192 |                         ' '.join(['%.4f' % i for i in best_warm] ),
193 |                     ))
194 |                     summaries = []
195 |                     for i, k in enumerate(recall_at):
196 |                         if k % 100 == 0:
197 |                             summaries.extend([
198 |                                 tf.Summary.Value(tag="recall@" + str(k) + " warm", simple_value=recall_warm[i]),
199 |                             ])
200 |                     recall_summary = tf.Summary(value=summaries)
201 |                     if train_writer is not None:
202 |                         train_writer.add_summary(recall_summary, n_step)
203 | 
def tfidf(R):
    """
    weight a sparse matrix with a log term frequency times a log inverse
    document frequency

    R: scipy sparse matrix; the number of rows is the idf numerator and the
       number of non-zero entries per column the document frequency
    returns: sparse matrix of the same shape, log(R + 1) on the stored entries
       with column j multiplied by log(n_rows / (1 + df_j))
    """
    num_rows = R.shape[0]
    num_cols = R.shape[1]

    # indicator matrix: 1.0 wherever R is non-zero
    nonzero_mask = R.copy()
    nonzero_mask[nonzero_mask != 0] = 1.0

    # log(1 + value) on the stored values
    weighted = R + nonzero_mask
    weighted.data = np.log(weighted.data)

    # per-column idf, wrapped in a diagonal matrix for the column rescaling
    column_df = np.sum(nonzero_mask, 0)
    column_idf = np.log(num_rows / (1 + column_df))
    idf_diag = sp.spdiags(column_idf, 0, num_cols, num_cols)
    return weighted * idf_diag
216 | 
def load_data(data_path):
    """
    load everything the warm-start experiment needs from disk

    data_path: root folder containing the 'warm' split, the 'trained/warm'
        WRMF factor files and 'item_features_0based.txt'
    returns: dict with keys
        u_pref / v_pref: factor matrices reshaped to (n_users, 200) / (n_items, 200)
        u_pref_scaled / v_pref_scaled: second output of utils.prep_standardize on them
        item_content: standardized 300-component SVD projection of the tf-idf features
        user_indices: sorted unique user ids found in the training triplets
        eval_warm: object returned by data.load_eval_data for the warm test split
    """
    timer = utils.timer(name='main').tic()
    split_folder = os.path.join(data_path, 'warm')

    u_file = os.path.join(data_path, 'trained/warm/WRMF_warm_rank200_reg1_alpha10_iter10.U.txt')
    v_file = os.path.join(data_path, 'trained/warm/WRMF_warm_rank200_reg1_alpha10_iter10.V.txt')
    item_content_file = os.path.join(data_path, 'item_features_0based.txt')
    train_file = os.path.join(split_folder, 'train.csv')
    test_warm_file = os.path.join(split_folder, 'test.csv')
    test_warm_iid_file = os.path.join(split_folder, 'test_item_ids.csv')

    dat = {}
    # load preference data (factors are stored as whitespace-delimited text)
    timer.tic()

    u_pref = np.loadtxt(u_file).reshape(n_users, 200)
    v_pref = np.loadtxt(v_file).reshape(n_items, 200)

    dat['u_pref'] = u_pref
    dat['v_pref'] = v_pref

    timer.toc('loaded U:%s,V:%s' % (str(u_pref.shape), str(v_pref.shape))).tic()

    # pre-process; only the second return value of prep_standardize is kept
    _, dat['u_pref_scaled'] = utils.prep_standardize(u_pref)
    _, dat['v_pref_scaled'] = utils.prep_standardize(v_pref)

    timer.toc('standardized U,V').tic()

    # load content data
    timer.tic()
    item_content, _ = datasets.load_svmlight_file(item_content_file, zero_based=True, dtype=np.float32)

    item_content = tfidf(item_content)

    # compress the tf-idf features to 300 dense components before standardizing
    from sklearn.utils.extmath import randomized_svd
    u, s, _ = randomized_svd(item_content, n_components=300, n_iter=5)
    item_content = u * s
    _, item_content = utils.prep_standardize(item_content)

    if sp.issparse(item_content):
        dat['item_content'] = item_content.tolil(copy=False)
    else:
        dat['item_content'] = item_content
    timer.toc('loaded item feature sparse matrix: %s' % (str(item_content.shape))).tic()

    # load split
    timer.tic()
    # header=None marks the file as header-less; the former header=-1 spelling
    # is rejected by current pandas releases
    train = pd.read_csv(train_file, delimiter=",", header=None, dtype=np.int32).values.ravel().view(
        dtype=[('uid', np.int32), ('iid', np.int32), ('inter', np.int32)])
    dat['user_indices'] = np.unique(train['uid'])
    timer.toc('read train triplets %s' % train.shape).tic()

    dat['eval_warm'] = data.load_eval_data(test_warm_file, test_warm_iid_file, name='eval_warm', cold=False,
                                           train_data=train, citeu=True)
    return dat
274 | 
275 | 
if __name__ == "__main__":
    # command-line entry point: build the option parser, echo the parsed
    # settings and start training
    parser = argparse.ArgumentParser(description="Demo script to run DropoutNet on RecSys data",
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--data-dir', type=str, required=True, help='path to eval in the downloaded folder')

    parser.add_argument('--model-device', type=str, default='/gpu:0', help='device to use for training')
    parser.add_argument('--inf-device', type=str, default='/cpu:0', help='device to use for inference')
    parser.add_argument('--checkpoint-path', type=str, default=None,
                        help='path to dump checkpoint data from TensorFlow')
    parser.add_argument('--tb-log-path', type=str, default=None,
                        help='path to dump TensorBoard logs')
    parser.add_argument('--model-select', nargs='+', type=int,
                        default=[500],
                        help='specify the fully-connected architecture, starting from input,'
                             ' numbers indicate numbers of hidden units',
                        )
    parser.add_argument('--rank', type=int, default=200, help='output rank of latent model')
    parser.add_argument('--dropout', type=float, default=0.5, help='DropoutNet dropout')
    parser.add_argument('--eval-every', type=int, default=1, help='evaluate every X user-batch')
    parser.add_argument('--lr', type=float, default=0.005, help='starting learning rate')
    parser.add_argument('--progress', action='store_true', help='show tqdm progress (requires tqdm) during training')

    # `args` must stay a module-level name: main() reads its settings from it.
    # One strict parse is enough; parse_args already rejects unknown options,
    # so the former follow-up parse_known_args() call was redundant.
    args = parser.parse_args()
    for key, value in vars(args).items():
        print(key + ":" + str(value))
    main()
303 | 


--------------------------------------------------------------------------------
/tf1/model.py:
--------------------------------------------------------------------------------
  1 | import tensorflow as tf
  2 | 
  3 | 
  4 | def dense_batch_fc_tanh(x, units, phase, scope, do_norm=False):
  5 |     """
  6 |     convenience function to build tanh blocks in DeepCF
  7 |     tanh is found to work better for DeepCF nets
  8 |     constitutes of: FC -> batch norm -> tanh activation
  9 | 
 10 |     x: input
 11 |     units: # of hidden units in FC
 12 |     phase: boolean flag whether we are training, required by batch norm
 13 |     scope: name of block
 14 |     do_norm: boolean flag to do batch norm after FC
 15 |     """
 16 | 
 17 |     with tf.variable_scope(scope):
 18 |         init = tf.truncated_normal_initializer(stddev=0.01)
 19 |         h1_w = tf.get_variable(scope + '_w',
 20 |                                shape=[x.get_shape().as_list()[1], units],
 21 |                                initializer=init)
 22 |         h1_b = tf.get_variable(scope + '_b',
 23 |                                shape=[1, units],
 24 |                                initializer=tf.zeros_initializer())
 25 |         h1 = tf.matmul(x, h1_w) + h1_b
 26 |         if do_norm:
 27 |             h2 = tf.contrib.layers.batch_norm(
 28 |                 h1,
 29 |                 decay=0.9,
 30 |                 center=True,
 31 |                 scale=True,
 32 |                 is_training=phase,
 33 |                 scope=scope + '_bn')
 34 |             return tf.nn.tanh(h2, scope + '_tanh')
 35 |         else:
 36 |             return tf.nn.tanh(h1, scope + '_tanh')
 37 | 
 38 | 
 39 | class DeepCF:
 40 |     """
 41 |     main model class implementing DeepCF
 42 |     also stores states for fast candidate generation
 43 | 
 44 |     latent_rank_in: rank of preference model input
 45 |     user_content_rank: rank of user content input
 46 |     item_content_rank: rank of item content input
 47 |     model_select: array of number of hidden unit,
 48 |         i.e. [200,100] indicate two hidden layer with 200 units followed by 100 units
 49 |     rank_out: rank of latent model output
 50 | 
 51 |     """
 52 | 
 53 |     def __init__(self, latent_rank_in, user_content_rank, item_content_rank, model_select, rank_out):
 54 | 
 55 |         self.rank_in = latent_rank_in
 56 |         self.phi_u_dim = user_content_rank
 57 |         self.phi_v_dim = item_content_rank
 58 |         self.model_select = model_select
 59 |         self.rank_out = rank_out
 60 | 
 61 |         # inputs
 62 |         self.Uin = None
 63 |         self.Vin = None
 64 |         self.Ucontent = None
 65 |         self.Vcontent = None
 66 |         self.phase = None
 67 |         self.target = None
 68 |         self.eval_trainR = None
 69 |         self.U_pref_tf = None
 70 |         self.V_pref_tf = None
 71 |         self.rand_target_ui = None
 72 | 
 73 |         # outputs in the model
 74 | 
 75 |         self.preds = None
 76 |         self.updates = None
 77 |         self.loss = None
 78 | 
 79 |         self.U_embedding = None
 80 |         self.V_embedding = None
 81 | 
 82 |         self.lr_placeholder = None
 83 | 
 84 |         # predictor
 85 |         self.tf_topk_vals = None
 86 |         self.tf_topk_inds = None
 87 |         self.preds_random = None
 88 |         self.tf_latent_topk_cold = None
 89 |         self.tf_latent_topk_warm = None
 90 |         self.eval_preds_warm = None
 91 |         self.eval_preds_cold = None
 92 | 
 93 |     def build_model(self):
 94 |         """
 95 |         set up tf components for main DeepCF net
 96 |         call after setting up desired tf state (cpu/gpu etc...)
 97 | 
 98 |         Note: should use GPU
 99 |         """
100 |         self.lr_placeholder = tf.placeholder(tf.float32, shape=[], name='learn_rate')
101 |         self.phase = tf.placeholder(tf.bool, name='phase')
102 |         self.target = tf.placeholder(tf.float32, shape=[None], name='target')
103 | 
104 |         self.Uin = tf.placeholder(tf.float32, shape=[None, self.rank_in], name='U_in_raw')
105 |         self.Vin = tf.placeholder(tf.float32, shape=[None, self.rank_in], name='V_in_raw')
106 |         if self.phi_u_dim>0:
107 |             self.Ucontent = tf.placeholder(tf.float32, shape=[None, self.phi_u_dim], name='U_content')
108 |             u_concat = tf.concat([self.Uin, self.Ucontent], 1)
109 |         else:
110 |             u_concat = self.Uin
111 | 
112 |         if self.phi_v_dim>0:
113 |             self.Vcontent = tf.placeholder(tf.float32, shape=[None, self.phi_v_dim], name='V_content')
114 |             v_concat = tf.concat([self.Vin, self.Vcontent], 1)
115 |         else:
116 |             v_concat = self.Vin
117 | 
118 |         print ('\tu_concat.shape=%s' % str(u_concat.get_shape()))
119 |         print ('\tv_concat.shape=%s' % str(v_concat.get_shape()))
120 | 
121 |         u_last = u_concat
122 |         v_last = v_concat
123 |         for ihid, hid in enumerate(self.model_select):
124 |             u_last = dense_batch_fc_tanh(u_last, hid, self.phase, 'user_layer_%d' % (ihid + 1), do_norm=True)
125 |             v_last = dense_batch_fc_tanh(v_last, hid, self.phase, 'item_layer_%d' % (ihid + 1), do_norm=True)
126 | 
127 |         with tf.variable_scope("self.U_embedding"):
128 |             u_emb_w = tf.Variable(tf.truncated_normal([u_last.get_shape().as_list()[1], self.rank_out], stddev=0.01),
129 |                                   name='u_emb_w')
130 |             u_emb_b = tf.Variable(tf.zeros([1, self.rank_out]), name='u_emb_b')
131 |             self.U_embedding = tf.matmul(u_last, u_emb_w) + u_emb_b
132 | 
133 |         with tf.variable_scope("V_embedding"):
134 |             v_emb_w = tf.Variable(tf.truncated_normal([v_last.get_shape().as_list()[1], self.rank_out], stddev=0.01),
135 |                                   name='v_emb_w')
136 |             v_emb_b = tf.Variable(tf.zeros([1, self.rank_out]), name='v_emb_b')
137 |             self.V_embedding = tf.matmul(v_last, v_emb_w) + v_emb_b
138 | 
139 |         with tf.variable_scope("loss"):
140 |             preds = tf.multiply(self.U_embedding, self.V_embedding)
141 |             self.preds = tf.reduce_sum(preds, 1)
142 |             self.loss = tf.reduce_mean(tf.squared_difference(self.preds, self.target))
143 | 
144 |         update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
145 |         with tf.control_dependencies(update_ops):
146 |             # Ensures that we execute the update_ops before performing the train_step
147 |             self.updates = tf.train.MomentumOptimizer(self.lr_placeholder, 0.9).minimize(self.loss)
148 | 
149 |     def build_predictor(self, recall_at, num_candidates):
150 |         """
151 |         set up tf components for prediction and target selection
152 |         call after setting up desired tf state (cpu/gpu etc...)
153 | 
154 |         Note: should use CPU, as large inputs are expected
155 | 
156 |         :param recall_at: truncation to compute recall
157 |         :param num_candidates: number of candidates
158 |         :return:
159 |         """
160 |         self.eval_trainR = tf.sparse_placeholder(
161 |             dtype=tf.float32, shape=[None, None], name='trainR_sparse_CPU')
162 | 
163 |         with tf.variable_scope("eval"):
164 |             embedding_prod_cold = tf.matmul(self.U_embedding, self.V_embedding, transpose_b=True, name='pred_all_items')
165 |             embedding_prod_warm = tf.sparse_add(embedding_prod_cold, self.eval_trainR)
166 |             _, self.eval_preds_cold = tf.nn.top_k(embedding_prod_cold, k=recall_at[-1], sorted=True,
167 |                                                   name='topK_net_cold')
168 |             _, self.eval_preds_warm = tf.nn.top_k(embedding_prod_warm, k=recall_at[-1], sorted=True,
169 |                                                   name='topK_net_warm')
170 |         with tf.variable_scope("select_targets"):
171 |             self.U_pref_tf = tf.placeholder(tf.float32, shape=[None, self.rank_in], name='u_pref')
172 |             self.V_pref_tf = tf.placeholder(tf.float32, shape=[None, self.rank_in], name='v_pref')
173 |             self.rand_target_ui = tf.placeholder(tf.int32, shape=[None, None], name='rand_target_ui')
174 |             preds_pref = tf.matmul(self.U_pref_tf, self.V_pref_tf, transpose_b=True)
175 |             tf_topk_vals, tf_topk_inds = tf.nn.top_k(preds_pref, k=num_candidates, sorted=True, name='top_targets')
176 |             self.tf_topk_vals = tf.reshape(tf_topk_vals, [-1], name='select_y_vals')
177 |             self.tf_topk_inds = tf.reshape(tf_topk_inds, [-1], name='select_y_inds')
178 |             preds_random = tf.gather_nd(preds_pref, self.rand_target_ui)
179 |             self.preds_random = tf.reshape(preds_random, [-1], name='random_y_inds')
180 | 
181 |         # tf matmul-topk to get eval on latent
182 |         with tf.variable_scope("latent_eval"):
183 |             preds_pref_latent_warm = tf.sparse_add(preds_pref, self.eval_trainR)
184 |             _, self.tf_latent_topk_cold = tf.nn.top_k(preds_pref, k=recall_at[-1], sorted=True, name='topK_latent_cold')
185 |             _, self.tf_latent_topk_warm = tf.nn.top_k(preds_pref_latent_warm, k=recall_at[-1], sorted=True,
186 |                                                       name='topK_latent_warm')
187 | 
188 |     def get_eval_dict(self, _i, _eval_start, _eval_finish, eval_data):
189 |         """
190 |         packaging method to iterate evaluation data, select from start:finish
191 |         should be passed directly to batch method
192 | 
193 |         :param _i: slice id
194 |         :param _eval_start: integer beginning of slice
195 |         :param _eval_finish: integer end of slice
196 |         :param eval_data: package EvalData obj
197 |         :return:
198 |         """
199 |         _eval_dict = {
200 |             self.Uin: eval_data.U_pref_test[_eval_start:_eval_finish, :],
201 |             self.Vin: eval_data.V_pref_test,
202 |             self.Vcontent: eval_data.V_content_test,
203 |             self.phase: 0
204 |         }
205 |         if self.Ucontent!=None: 
206 |             _eval_dict[self.Ucontent]= eval_data.U_content_test[_eval_start:_eval_finish, :]
207 |         if not eval_data.is_cold:
208 |             _eval_dict[self.eval_trainR] = eval_data.tf_eval_train[_i]
209 |         return _eval_dict
210 | 
211 |     def get_eval_dict_latent(self, _i, _eval_start, _eval_finish, eval_data, u_pref, v_pref):
212 |         """
213 |         packaging method to iterate evaluation data, select from start:finish
214 |         uses preference input
215 |         should be passed directly to batch method
216 | 
217 |         :param _i: slice id
218 |         :param _eval_start: integer beginning of slice
219 |         :param _eval_finish: integer end of slice
220 |         :param eval_data: package EvalData obj
221 |         :param u_pref: user latent input to slice
222 |         :param v_pref: item latent input to slice
223 |         :return:
224 |         """
225 |         _eval_dict = {
226 |             self.U_pref_tf: u_pref[eval_data.test_user_ids[_eval_start:_eval_finish], :],
227 |             self.V_pref_tf: v_pref[eval_data.test_item_ids, :]
228 |         }
229 |         if not eval_data.is_cold:
230 |             _eval_dict[self.eval_trainR] = eval_data.tf_eval_train[_i]
231 |         return _eval_dict
232 | 


--------------------------------------------------------------------------------
/tf1/utils.py:
--------------------------------------------------------------------------------
  1 | import time
  2 | import datetime
  3 | import numpy as np
  4 | import scipy
  5 | import tensorflow as tf
  6 | from sklearn import preprocessing as prep
  7 | 
  8 | 
  9 | class timer(object):
 10 |     def __init__(self, name='default'):
 11 |         """
 12 |         timer object to record running time of functions, not for micro-benchmarking
 13 |         usage is:
 14 |             $ timer = utils.timer('name').tic()
 15 |             $ timer.toc('process A').tic()
 16 | 
 17 | 
 18 |         :param name: label for the timer
 19 |         """
 20 |         self._start_time = None
 21 |         self._name = name
 22 |         self.tic()
 23 | 
 24 |     def tic(self):
 25 |         self._start_time = time.time()
 26 |         return self
 27 | 
 28 |     def toc(self, message):
 29 |         elapsed = time.time() - self._start_time
 30 |         message = '' if message is None else message
 31 |         print('[{0:s}] {1:s} elapsed [{2:s}]'.format(self._name, message, timer._format(elapsed)))
 32 |         return self
 33 | 
 34 |     def reset(self):
 35 |         self._start_time = None
 36 |         return self
 37 | 
 38 |     @staticmethod
 39 |     def _format(s):
 40 |         delta = datetime.timedelta(seconds=s)
 41 |         d = datetime.datetime(1, 1, 1) + delta
 42 |         s = ''
 43 |         if (d.day - 1) > 0:
 44 |             s = s + '{:d} days'.format(d.day - 1)
 45 |         if d.hour > 0:
 46 |             s = s + '{:d} hr'.format(d.hour)
 47 |         if d.minute > 0:
 48 |             s = s + '{:d} min'.format(d.minute)
 49 |         s = s + '{:d} s'.format(d.second)
 50 |         return s
 51 | 
 52 | 
 53 | def batch(iterable, _n=1, drop=True):
 54 |     """
 55 |     returns batched version of some iterable
 56 |     :param iterable: iterable object as input
 57 |     :param _n: batch size
 58 |     :param drop: if true, drop extra if batch size does not divide evenly,
 59 |         otherwise keep them (last batch might be shorter)
 60 |     :return: batched version of iterable
 61 |     """
 62 |     it_len = len(iterable)
 63 |     for ndx in range(0, it_len, _n):
 64 |         if ndx + _n < it_len:
 65 |             yield iterable[ndx:ndx + _n]
 66 |         elif drop is False:
 67 |             yield iterable[ndx:it_len]
 68 | 
 69 | 
 70 | def tfidf(x):
 71 |     """
 72 |     compute tfidf of numpy array x
 73 |     :param x: input array, document by terms
 74 |     :return:
 75 |     """
 76 |     x_idf = np.log(x.shape[0] - 1) - np.log(1 + np.asarray(np.sum(x > 0, axis=0)).ravel())
 77 |     x_idf = np.asarray(x_idf)
 78 |     x_idf_diag = scipy.sparse.lil_matrix((len(x_idf), len(x_idf)))
 79 |     x_idf_diag.setdiag(x_idf)
 80 |     x_tf = x.tocsr()
 81 |     x_tf.data = np.log(x_tf.data + 1)
 82 |     x_tfidf = x_tf * x_idf_diag
 83 |     return x_tfidf
 84 | 
 85 | 
 86 | def prep_standardize(x):
 87 |     """
 88 |     takes sparse input and compute standardized version
 89 | 
 90 |     Note:
 91 |         cap at 5 std
 92 | 
 93 |     :param x: 2D scipy sparse data array to standardize (column-wise), must support row indexing
 94 |     :return: the object to perform scale (stores mean/std) for inference, as well as the scaled x
 95 |     """
 96 |     x_nzrow = x.any(axis=1)
 97 |     scaler = prep.StandardScaler().fit(x[x_nzrow, :])
 98 |     x_scaled = np.copy(x)
 99 |     x_scaled[x_nzrow, :] = scaler.transform(x_scaled[x_nzrow, :])
100 |     x_scaled[x_scaled > 5] = 5
101 |     x_scaled[x_scaled < -5] = -5
102 |     x_scaled[np.absolute(x_scaled) < 1e-5] = 0
103 |     return scaler, x_scaled
104 | 
105 | 
def prep_standardize_dense(x):
    """
    standardize every row of a dense array column-wise

    Note:
        cap at 5 std, magnitudes below 1e-5 are set to 0

    :param x: 2D numpy data array to standardize (column-wise)
    :return: the object to perform scale (stores mean/std) for inference, as well as the scaled x
    """
    scaler = prep.StandardScaler()
    scaler.fit(x)
    x_scaled = scaler.transform(x)
    # cap at +/- 5 std in place
    np.clip(x_scaled, -5, 5, out=x_scaled)
    near_zero = np.absolute(x_scaled) < 1e-5
    x_scaled[near_zero] = 0
    return scaler, x_scaled
122 | 
123 | 
def batch_eval_recall(_sess, tf_eval, eval_feed_dict, recall_k, eval_data):
    """
    given EvalData and DropoutNet compute graph in TensorFlow, runs batch evaluation

    predictions are computed slice by slice over eval_data.eval_batch, then recall
    is averaged over the users that have at least one test interaction

    :param _sess: tf session
    :param tf_eval: the evaluate output symbol in tf
    :param eval_feed_dict: method to parse tf, pick from EvalData method;
        called as eval_feed_dict(slice_id, start, stop, eval_data)
    :param recall_k: list of thresholds to compute recall at (information retrieval recall)
    :param eval_data: EvalData instance
    :return: recall array at thresholds matching recall_k
    """
    tf_eval_preds_batch = []
    # `slice_id` rather than `batch`, which is also the name of a helper in this module
    for (slice_id, (eval_start, eval_stop)) in enumerate(eval_data.eval_batch):
        tf_eval_preds = _sess.run(tf_eval,
                                  feed_dict=eval_feed_dict(
                                      slice_id, eval_start, eval_stop, eval_data))
        tf_eval_preds_batch.append(tf_eval_preds)
    tf_eval_preds = np.concatenate(tf_eval_preds_batch)
    tf.local_variables_initializer().run()

    # filter non-zero targets
    y_nz = [len(x) > 0 for x in eval_data.R_test_inf.rows]
    y_nz = np.arange(len(eval_data.R_test_inf.rows))[y_nz]

    preds_all = tf_eval_preds[y_nz, :]
    # ground truth and its per-user totals do not depend on the threshold
    y = eval_data.R_test_inf[y_nz, :]
    y_totals = np.sum(y, 1)

    recall = []
    for at_k in recall_k:
        preds_k = preds_all[:, :at_k]

        # indicator matrix of the first at_k predicted columns per user, built by
        # writing the rows/data fields of an empty lil_matrix directly
        x = scipy.sparse.lil_matrix(y.shape)
        x.rows = preds_k
        x.data = np.ones_like(preds_k)

        # element-wise product keeps only predicted items that are true targets
        z = y.multiply(x)
        recall.append(np.mean(np.divide((np.sum(z, 1)), y_totals)))
    return recall
162 | 


--------------------------------------------------------------------------------
/tf2/data.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import tensorflow as tf
  3 | import scipy.sparse
  4 | import utils
  5 | import pandas as pd
  6 | 
  7 | """
  8 | This module contains class and methods related to data used in DropoutNet  
  9 | """
 10 | 
 11 | 
 12 | def load_eval_data(test_file, test_id_file, name, cold, train_data, citeu=False):
 13 |     timer = utils.timer()
 14 |     with open(test_id_file) as f:
 15 |         test_item_ids = [int(line) for line in f]
 16 |         test_data = pd.read_csv(test_file, delimiter=",", header=None, dtype=np.int32).values.ravel()
 17 |         if citeu:
 18 |             test_data = test_data.view(
 19 |             dtype=[('uid', np.int32), ('iid', np.int32), ('inter', np.int32)])
 20 |         else:
 21 |             test_data = test_data.view(
 22 |             dtype=[('uid', np.int32), ('iid', np.int32), ('inter', np.int32), ('date', np.int32)])
 23 |         timer.toc('read %s triplets %s' % (name, test_data.shape)).tic()
 24 |         eval_data = EvalData(
 25 |             test_data,
 26 |             test_item_ids,
 27 |             is_cold=cold,
 28 |             train=train_data
 29 |         )
 30 |         timer.toc('loaded %s' % name).tic()
 31 |         print(eval_data.get_stats_string())
 32 |         return eval_data
 33 | 
 34 | 
 35 | class EvalData:
 36 |     """
 37 |     EvalData:
 38 |         EvalData packages test triplet (user, item, score) into appropriate formats for evaluation
 39 | 
 40 |         Compact Indices:
 41 |             Specifically, this builds compact indices and stores mapping between original and compact indices.
 42 |             Compact indices only contains:
 43 |                 1) items in test set
 44 |                 2) users who interacted with such test items
 45 |             These compact indices speed up testing significantly by ignoring irrelevant users or items
 46 | 
 47 |         Args:
 48 |             test_triplets(int triplets): user-item-interaction_value triplet to build the test data
 49 |             train(int triplets): user-item-interaction_value triplet from train data
 50 | 
 51 |         Attributes:
 52 |             is_cold(boolean): whether test data is used for cold start problem
 53 |             test_item_ids(list of int): maps compressed item ids to original item ids (via position)
 54 |             test_item_ids_map(dictionary of int->int): maps original item ids to compressed item ids
 55 |             test_user_ids(list of int): maps compressed user ids to original user ids (via position)
 56 |             test_user_ids_map(dictionary of int->int): maps original user ids to compressed user ids
 57 |             R_test_inf(scipy lil matrix): pre-built compressed test matrix
 58 |             R_train_inf(scipy lil matrix): pre-built compressed train matrix for testing
 59 | 
 60 |             other relevant input/output exposed from tensorflow graph
 61 | 
 62 |     """
 63 | 
 64 |     def __init__(self, test_triplets, test_item_ids, is_cold, train):
 65 |         # build map both-ways between compact and original indices
 66 |         # compact indices only contains:
 67 |         #  1) items in test set
 68 |         #  2) users who interacted with such test items
 69 | 
 70 |         self.is_cold = is_cold
 71 | 
 72 |         self.test_item_ids = test_item_ids
 73 |         # test_item_ids_map
 74 |         self.test_item_ids_map = {iid: i for i, iid in enumerate(self.test_item_ids)}
 75 | 
 76 |         _test_ij_for_inf = [(t[0], t[1]) for t in test_triplets if t[1] in self.test_item_ids_map]
 77 |         # test_user_ids
 78 |         self.test_user_ids = np.unique(test_triplets['uid'])
 79 |         # test_user_ids_map
 80 |         self.test_user_ids_map = {user_id: i for i, user_id in enumerate(self.test_user_ids)}
 81 | 
 82 |         _test_i_for_inf = [self.test_user_ids_map[_t[0]] for _t in _test_ij_for_inf]
 83 |         _test_j_for_inf = [self.test_item_ids_map[_t[1]] for _t in _test_ij_for_inf]
 84 |         self.R_test_inf = scipy.sparse.coo_matrix(
 85 |             (np.ones(len(_test_i_for_inf)),
 86 |              (_test_i_for_inf, _test_j_for_inf)),
 87 |             shape=[len(self.test_user_ids), len(self.test_item_ids)]
 88 |         ).tolil(copy=False)
 89 | 
 90 |         train_ij_for_inf = [(self.test_user_ids_map[_t[0]], self.test_item_ids_map[_t[1]]) for _t
 91 |                             in train
 92 |                             if _t[1] in self.test_item_ids_map and _t[0] in self.test_user_ids_map]
 93 |         if self.is_cold and len(train_ij_for_inf) is not 0:
 94 |             raise Exception('using cold dataset, but data is not cold!')
 95 |         if not self.is_cold and len(train_ij_for_inf) is 0:
 96 |             raise Exception('using warm datset, but data is not warm!')
 97 | 
 98 |         self.R_train_inf = None if self.is_cold else scipy.sparse.coo_matrix((
 99 |             np.ones(len(train_ij_for_inf)),
100 |             zip(*train_ij_for_inf)), shape=self.R_test_inf.shape).tolil(copy=False)
101 | 
102 |         # allocate fields
103 |         self.U_pref_test = None
104 |         self.V_pref_test = None
105 |         self.V_content_test = None
106 |         self.U_content_test = None
107 |         self.tf_eval_train = None
108 |         self.tf_eval_test = None
109 |         self.eval_batch = None
110 | 
111 |     def init_tf(self, user_factors, item_factors, user_content, item_content, eval_run_batchsize):
112 |         self.U_pref_test = user_factors[self.test_user_ids, :]
113 |         self.V_pref_test = item_factors[self.test_item_ids, :]
114 |         self.V_content_test = item_content[self.test_item_ids, :]
115 |         if scipy.sparse.issparse(self.V_content_test):
116 |             self.V_content_test = self.V_content_test.todense()
117 |         if user_content!=None:
118 |             self.U_content_test = user_content[self.test_user_ids, :]
119 |             if scipy.sparse.issparse(self.U_content_test):
120 |                 self.U_content_test = self.U_content_test.todense()
121 |         eval_l = self.R_test_inf.shape[0]
122 |         self.eval_batch = [(x, min(x + eval_run_batchsize, eval_l)) for x
123 |                            in range(0, eval_l, eval_run_batchsize)]
124 | 
125 |         self.tf_eval_train = []
126 |         self.tf_eval_test = []
127 | 
128 |         if not self.is_cold:
129 |             for (eval_start, eval_finish) in self.eval_batch:
130 |                 _ui = self.R_train_inf[eval_start:eval_finish, :].tocoo()
131 |                 _ui = list(zip(_ui.row, _ui.col))
132 |                 self.tf_eval_train.append(
133 |                     tf.compat.v1.SparseTensorValue(
134 |                         indices=_ui,
135 |                         values=np.full(len(_ui), -100000, dtype=np.float32),
136 |                         dense_shape=[eval_finish - eval_start, self.R_train_inf.shape[1]]
137 |                     )
138 |                 )
139 | 
140 |     def get_stats_string(self):
141 |         return ('\tn_test_users:[%d]\n\tn_test_items:[%d]' % (len(self.test_user_ids), len(self.test_item_ids))
142 |                 + '\n\tR_train_inf: %s' % (
143 |                     'no R_train_inf for cold' if self.is_cold else 'shape=%s nnz=[%d]' % (
144 |                         str(self.R_train_inf.shape), len(self.R_train_inf.nonzero()[0])
145 |                     )
146 |                 )
147 |                 + '\n\tR_test_inf: shape=%s nnz=[%d]' % (
148 |                     str(self.R_test_inf.shape), len(self.R_test_inf.nonzero()[0])
149 |                 ))
150 | 


--------------------------------------------------------------------------------
/tf2/main.py:
--------------------------------------------------------------------------------
  1 | import utils
  2 | import numpy as np
  3 | import pandas as pd
  4 | import tensorflow as tf
  5 | import datetime
  6 | from sklearn import datasets
  7 | import data
  8 | import model
  9 | 
 10 | import argparse
 11 | import os
 12 | 
# row counts of the pretrained user / item factor files; load_data() reshapes U and V with them
# NOTE(review): the "+ 1" presumably makes room for 0-based ids up to the listed maximum -- confirm
n_users = 1497020 + 1
n_items = 1306054 + 1
# graph mode: main() runs the model through tf.compat.v1.Session
tf.compat.v1.disable_eager_execution()
 16 | 
 17 | 
 18 | def main():
 19 |     data_path = args.data_dir
 20 |     checkpoint_path = args.checkpoint_path
 21 |     tb_log_path = args.tb_log_path
 22 |     model_select = args.model_select
 23 | 
 24 |     rank_out = args.rank
 25 |     user_batch_size = 1000
 26 |     n_scores_user = 2500
 27 |     data_batch_size = 100
 28 |     dropout = args.dropout
 29 |     recall_at = range(50, 550, 50)
 30 |     eval_batch_size = 1000
 31 |     max_data_per_step = 2500000
 32 |     eval_every = args.eval_every
 33 |     num_epoch = 10
 34 | 
 35 |     _lr = args.lr
 36 |     _decay_lr_every = 50
 37 |     _lr_decay = 0.1
 38 | 
 39 |     experiment = '%s_%s' % (
 40 |         datetime.datetime.now().strftime('%Y-%m-%d-%H:%M:%S'),
 41 |         '-'.join(str(x / 100) for x in model_select) if model_select else 'simple'
 42 |     )
 43 |     _tf_ckpt_file = None if checkpoint_path is None else checkpoint_path + experiment + '/tf_checkpoint'
 44 | 
 45 |     print('running: ' + experiment)
 46 | 
 47 |     dat = load_data(data_path)
 48 |     u_pref_scaled = dat['u_pref_scaled']
 49 |     v_pref_scaled = dat['v_pref_scaled']
 50 |     eval_warm = dat['eval_warm']
 51 |     eval_cold_user = dat['eval_cold_user']
 52 |     eval_cold_item = dat['eval_cold_item']
 53 |     user_content = dat['user_content']
 54 |     item_content = dat['item_content']
 55 |     u_pref = dat['u_pref']
 56 |     v_pref = dat['v_pref']
 57 |     user_indices = dat['user_indices']
 58 | 
 59 |     timer = utils.timer(name='main').tic()
 60 | 
 61 |     # append pref factors for faster dropout
 62 |     v_pref_expanded = np.vstack([v_pref_scaled, np.zeros_like(v_pref_scaled[0, :])])
 63 |     v_pref_last = v_pref_scaled.shape[0]
 64 |     u_pref_expanded = np.vstack([u_pref_scaled, np.zeros_like(u_pref_scaled[0, :])])
 65 |     u_pref_last = u_pref_scaled.shape[0]
 66 |     timer.toc('initialized numpy data for tf')
 67 | 
 68 |     # prep eval
 69 |     eval_batch_size = eval_batch_size
 70 |     timer.tic()
 71 |     eval_warm.init_tf(u_pref_scaled, v_pref_scaled, user_content, item_content, eval_batch_size)
 72 |     timer.toc('initialized eval_warm for tf').tic()
 73 |     eval_cold_user.init_tf(u_pref_scaled, v_pref_scaled, user_content, item_content, eval_batch_size)
 74 |     timer.toc('initialized eval_cold_user for tf').tic()
 75 |     eval_cold_item.init_tf(u_pref_scaled, v_pref_scaled, user_content, item_content, eval_batch_size)
 76 |     timer.toc('initialized eval_cold_item for tf').tic()
 77 | 
 78 |     dropout_net = model.DeepCF(latent_rank_in=u_pref.shape[1],
 79 |                                user_content_rank=user_content.shape[1],
 80 |                                item_content_rank=item_content.shape[1],
 81 |                                model_select=model_select,
 82 |                                rank_out=rank_out)
 83 | 
 84 |     config = tf.compat.v1.ConfigProto(allow_soft_placement=True)
 85 | 
 86 |     with tf.device(args.model_device):
 87 |         dropout_net.build_model()
 88 | 
 89 |     with tf.device(args.inf_device):
 90 |         dropout_net.build_predictor(recall_at, n_scores_user)
 91 | 
 92 |     with tf.compat.v1.Session(config=config) as sess:
 93 |         tf_saver = None if _tf_ckpt_file is None else tf.compat.v1.train.Saver()
 94 |         train_writer = None if tb_log_path is None else tf.compat.v1.summary.FileWriter(
 95 |             tb_log_path + experiment, sess.graph)
 96 |         tf.compat.v1.global_variables_initializer().run()
 97 |         tf.compat.v1.local_variables_initializer().run()
 98 |         timer.toc('initialized tf')
 99 | 
100 |         row_index = np.copy(user_indices)
101 |         n_step = 0
102 |         best_cold_user = 0
103 |         best_cold_item = 0
104 |         best_warm = 0
105 |         n_batch_trained = 0
106 |         best_step = 0
107 |         for epoch in range(num_epoch):
108 |             np.random.shuffle(row_index)
109 |             for b in utils.batch(row_index, user_batch_size):
110 |                 n_step += 1
111 |                 # prep targets
112 |                 target_users = np.repeat(b, n_scores_user)
113 |                 target_users_rand = np.repeat(np.arange(len(b)), n_scores_user)
114 |                 target_items_rand = [np.random.choice(v_pref.shape[0], n_scores_user) for _ in b]
115 |                 target_items_rand = np.array(target_items_rand).flatten()
116 |                 target_ui_rand = np.transpose(np.vstack([target_users_rand, target_items_rand]))
117 |                 [target_scores, target_items, random_scores] = sess.run(
118 |                     [dropout_net.tf_topk_vals, dropout_net.tf_topk_inds, dropout_net.preds_random],
119 |                     feed_dict={
120 |                         dropout_net.U_pref_tf: u_pref[b, :],
121 |                         dropout_net.V_pref_tf: v_pref,
122 |                         dropout_net.rand_target_ui: target_ui_rand
123 |                     }
124 |                 )
125 |                 # merge topN and randomN items per user
126 |                 target_scores = np.append(target_scores, random_scores)
127 |                 target_items = np.append(target_items, target_items_rand)
128 |                 target_users = np.append(target_users, target_users)
129 | 
130 |                 tf.compat.v1.local_variables_initializer().run()
131 |                 n_targets = len(target_scores)
132 |                 perm = np.random.permutation(n_targets)
133 |                 n_targets = min(n_targets, max_data_per_step)
134 |                 data_batch = [(n, min(n + data_batch_size, n_targets)) for n in range(0, n_targets, data_batch_size)]
135 |                 f_batch = 0
136 |                 for (start, stop) in data_batch:
137 |                     batch_perm = perm[start:stop]
138 |                     batch_users = target_users[batch_perm]
139 |                     batch_items = target_items[batch_perm]
140 |                     if dropout != 0:
141 |                         n_to_drop = int(np.floor(dropout * len(batch_perm)))
142 |                         perm_user = np.random.permutation(len(batch_perm))[:n_to_drop]
143 |                         perm_item = np.random.permutation(len(batch_perm))[:n_to_drop]
144 |                         batch_v_pref = np.copy(batch_items)
145 |                         batch_u_pref = np.copy(batch_users)
146 |                         batch_v_pref[perm_user] = v_pref_last
147 |                         batch_u_pref[perm_item] = u_pref_last
148 |                     else:
149 |                         batch_v_pref = batch_items
150 |                         batch_u_pref = batch_users
151 | 
152 |                     _, _, loss_out = sess.run(
153 |                         [dropout_net.preds, dropout_net.updates, dropout_net.loss],
154 |                         feed_dict={
155 |                             dropout_net.Uin: u_pref_expanded[batch_u_pref, :],
156 |                             dropout_net.Vin: v_pref_expanded[batch_v_pref, :],
157 |                             dropout_net.Ucontent: user_content[batch_users, :].todense(),
158 |                             dropout_net.Vcontent: item_content[batch_items, :].todense(),
159 |                             #
160 |                             dropout_net.target: target_scores[batch_perm],
161 |                             dropout_net.lr_placeholder: _lr,
162 |                             dropout_net.phase: 1
163 |                         }
164 |                     )
165 |                     f_batch += loss_out
166 |                     if np.isnan(f_batch):
167 |                         raise Exception('f is nan')
168 | 
169 |                 n_batch_trained += len(data_batch)
170 |                 if n_step % _decay_lr_every == 0:
171 |                     _lr = _lr_decay * _lr
172 |                     print('decayed lr:' + str(_lr))
173 |                 if n_step % eval_every == 0:
174 |                     recall_warm = utils.batch_eval_recall(
175 |                         sess, dropout_net.eval_preds_warm, eval_feed_dict=dropout_net.get_eval_dict,
176 |                         recall_k=recall_at, eval_data=eval_warm)
177 |                     recall_cold_user = utils.batch_eval_recall(
178 |                         sess, dropout_net.eval_preds_cold,
179 |                         eval_feed_dict=dropout_net.get_eval_dict,
180 |                         recall_k=recall_at, eval_data=eval_cold_user)
181 |                     recall_cold_item = utils.batch_eval_recall(
182 |                         sess, dropout_net.eval_preds_cold,
183 |                         eval_feed_dict=dropout_net.get_eval_dict,
184 |                         recall_k=recall_at, eval_data=eval_cold_item)
185 | 
186 |                     # checkpoint
187 |                     if np.sum(recall_warm + recall_cold_user + recall_cold_item) > np.sum(
188 |                                             best_warm + best_cold_user + best_cold_item):
189 |                         best_cold_user = recall_cold_user
190 |                         best_cold_item = recall_cold_item
191 |                         best_warm = recall_warm
192 |                         best_step = n_step
193 |                         if tf_saver is not None:
194 |                             tf_saver.save(sess, _tf_ckpt_file)
195 | 
196 |                     timer.toc('%d [%d]b [%d]tot f=%.2f best[%d]' % (
197 |                         n_step, len(data_batch), n_batch_trained, f_batch, best_step
198 |                     )).tic()
199 |                     print ('\t\t\t'+' '.join([('@'+str(i)).ljust(6) for i in recall_at]))
200 |                     print('warm start\t%s\ncold user\t%s\ncold item\t%s' % (
201 |                         ' '.join(['%.4f' % i for i in recall_warm]),
202 |                         ' '.join(['%.4f' % i for i in recall_cold_user]),
203 |                         ' '.join(['%.4f' % i for i in recall_cold_item])
204 |                     ))
205 |                     summaries = []
206 |                     for i, k in enumerate(recall_at):
207 |                         if k % 100 == 0:
208 |                             summaries.extend([
209 |                                 tf.compat.v1.Summary.Value(tag="recall@" + str(k) + " warm", simple_value=recall_warm[i]),
210 |                                 tf.compat.v1.Summary.Value(tag="recall@" + str(k) + " cold_user",
211 |                                                  simple_value=recall_cold_user[i]),
212 |                                 tf.compat.v1.Summary.Value(tag="recall@" + str(k) + " cold_item",
213 |                                                  simple_value=recall_cold_item[i])
214 |                             ])
215 |                     recall_summary = tf.compat.v1.Summary(value=summaries)
216 |                     if train_writer is not None:
217 |                         train_writer.add_summary(recall_summary, n_step)
218 | 
219 | 
def load_data(data_path):
    """
    load everything main() needs from the data folder

    reads the pretrained factors, standardizes them, loads the svmlight content
    features, the train triplets and the three evaluation splits under <data_path>/warm

    :param data_path: root folder of the downloaded data
    :return: dict with keys u_pref, v_pref, u_pref_scaled, v_pref_scaled, user_content,
        item_content, user_indices, eval_warm, eval_cold_user, eval_cold_item
    """
    timer = utils.timer(name='main').tic()
    warm_dir = os.path.join(data_path, 'warm')

    # (result key, triplet file, item id file, is cold split)
    eval_splits = (
        ('eval_warm', 'test_warm.csv', 'test_warm_item_ids.csv', False),
        ('eval_cold_user', 'test_cold_user.csv', 'test_cold_user_item_ids.csv', True),
        ('eval_cold_item', 'test_cold_item.csv', 'test_cold_item_item_ids.csv', True),
    )

    # load preference data
    timer.tic()
    u_pref = np.fromfile(
        os.path.join(data_path, 'trained/warm/U.csv.bin'), dtype=np.float32).reshape(n_users, 200)
    v_pref = np.fromfile(
        os.path.join(data_path, 'trained/warm/V.csv.bin'), dtype=np.float32).reshape(n_items, 200)
    dat = {'u_pref': u_pref, 'v_pref': v_pref}
    timer.toc('loaded U:%s,V:%s' % (str(u_pref.shape), str(v_pref.shape))).tic()

    # pre-process, the fitted scalers are not kept
    for key, factors in (('u_pref_scaled', u_pref), ('v_pref_scaled', v_pref)):
        _, dat[key] = utils.prep_standardize(factors)
    timer.toc('standardized U,V').tic()

    # load content data
    timer.tic()
    user_content, _ = datasets.load_svmlight_file(
        os.path.join(data_path, 'user_features_0based.txt'), zero_based=True, dtype=np.float32)
    dat['user_content'] = user_content.tolil(copy=False)
    timer.toc('loaded user feature sparse matrix: %s' % (str(user_content.shape))).tic()
    item_content, _ = datasets.load_svmlight_file(
        os.path.join(data_path, 'item_features_0based.txt'), zero_based=True, dtype=np.float32)
    dat['item_content'] = item_content.tolil(copy=False)
    timer.toc('loaded item feature sparse matrix: %s' % (str(item_content.shape))).tic()

    # load split
    timer.tic()
    train_values = pd.read_csv(
        os.path.join(warm_dir, 'train.csv'), delimiter=",", header=None, dtype=np.int32).values
    # reinterpret the flat int32 buffer as one (uid, iid, inter, date) record per row
    train = train_values.ravel().view(
        dtype=[('uid', np.int32), ('iid', np.int32), ('inter', np.int32), ('date', np.int32)])
    dat['user_indices'] = np.unique(train['uid'])
    timer.toc('read train triplets %s' % train.shape).tic()

    for key, triplet_name, item_id_name, is_cold in eval_splits:
        dat[key] = data.load_eval_data(
            os.path.join(warm_dir, triplet_name),
            os.path.join(warm_dir, item_id_name),
            name=key,
            cold=is_cold,
            train_data=train)
    return dat
276 | 
277 | 
if __name__ == "__main__":
    # command-line entry point: options are parsed into the module-level `args` read by main()
    parser = argparse.ArgumentParser(
        description="Demo script to run DropoutNet on RecSys data",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )

    # data location
    parser.add_argument(
        '--data-dir', type=str, required=True,
        help='path to eval in the downloaded folder',
    )

    # device placement
    parser.add_argument(
        '--model-device', type=str, default='/gpu:0',
        help='device to use for training',
    )
    parser.add_argument(
        '--inf-device', type=str, default='/cpu:0',
        help='device to use for inference',
    )

    # optional outputs
    parser.add_argument(
        '--checkpoint-path', type=str, default=None,
        help='path to dump checkpoint data from TensorFlow',
    )
    parser.add_argument(
        '--tb-log-path', type=str, default=None,
        help='path to dump TensorBoard logs',
    )

    # model and training hyper-parameters
    parser.add_argument(
        '--model-select', nargs='+', type=int, default=[800, 400],
        help='specify the fully-connected architecture, starting from input,'
             ' numbers indicate numbers of hidden units',
    )
    parser.add_argument(
        '--rank', type=int, default=200,
        help='output rank of latent model',
    )
    parser.add_argument(
        '--dropout', type=float, default=0.5,
        help='DropoutNet dropout',
    )
    parser.add_argument(
        '--eval-every', type=int, default=2,
        help='evaluate every X user-batch',
    )
    parser.add_argument(
        '--lr', type=float, default=0.005,
        help='starting learning rate',
    )

    args = parser.parse_args()
    main()
301 | 


--------------------------------------------------------------------------------
/tf2/model.py:
--------------------------------------------------------------------------------
  1 | import tensorflow as tf
  2 | 
  3 | 
  4 | def dense_batch_fc_tanh(x, units, phase, scope, do_norm=False):
  5 |     """
  6 |     convenience function to build tanh blocks in DeepCF
  7 |     tanh is found to work better for DeepCF nets
  8 |     constitutes of: FC -> batch norm -> tanh activation
  9 | 
 10 |     x: input
 11 |     units: # of hidden units in FC
 12 |     phase: boolean flag whether we are training, required by batch norm
 13 |     scope: name of block
 14 |     do_norm: boolean flag to do batch norm after FC
 15 |     """
 16 | 
 17 |     with tf.compat.v1.variable_scope(scope):
 18 |         init = tf.compat.v1.truncated_normal_initializer(stddev=0.01)
 19 |         h1_w = tf.compat.v1.get_variable(scope + '_w',
 20 |                                shape=[x.get_shape().as_list()[1], units],
 21 |                                initializer=init)
 22 |         h1_b = tf.compat.v1.get_variable(scope + '_b',
 23 |                                shape=[1, units],
 24 |                                initializer=tf.zeros_initializer())
 25 |         h1 = tf.matmul(x, h1_w) + h1_b
 26 |         if do_norm:
 27 |             h2 = tf.keras.layers.BatchNormalization(
 28 |                 center=True,
 29 |                 scale=True,
 30 |                 trainable=True)(h1)
 31 |             return tf.nn.tanh(h2, scope + '_tanh')
 32 |         else:
 33 |             return tf.nn.tanh(h1, scope + '_tanh')
 34 | 
 35 | 
 36 | class DeepCF:
 37 |     """
 38 |     main model class implementing DeepCF
 39 |     also stores states for fast candidate generation
 40 | 
 41 |     latent_rank_in: rank of preference model input
 42 |     user_content_rank: rank of user content input
 43 |     item_content_rank: rank of item content input
 44 |     model_select: array of number of hidden unit,
 45 |         i.e. [200,100] indicate two hidden layer with 200 units followed by 100 units
 46 |     rank_out: rank of latent model output
 47 | 
 48 |     """
 49 | 
 50 |     def __init__(self, latent_rank_in, user_content_rank, item_content_rank, model_select, rank_out):
 51 | 
 52 |         self.rank_in = latent_rank_in
 53 |         self.phi_u_dim = user_content_rank
 54 |         self.phi_v_dim = item_content_rank
 55 |         self.model_select = model_select
 56 |         self.rank_out = rank_out
 57 | 
 58 |         # inputs
 59 |         self.Uin = None
 60 |         self.Vin = None
 61 |         self.Ucontent = None
 62 |         self.Vcontent = None
 63 |         self.phase = None
 64 |         self.target = None
 65 |         self.eval_trainR = None
 66 |         self.U_pref_tf = None
 67 |         self.V_pref_tf = None
 68 |         self.rand_target_ui = None
 69 | 
 70 |         # outputs in the model
 71 | 
 72 |         self.preds = None
 73 |         self.updates = None
 74 |         self.loss = None
 75 | 
 76 |         self.U_embedding = None
 77 |         self.V_embedding = None
 78 | 
 79 |         self.lr_placeholder = None
 80 | 
 81 |         # predictor
 82 |         self.tf_topk_vals = None
 83 |         self.tf_topk_inds = None
 84 |         self.preds_random = None
 85 |         self.tf_latent_topk_cold = None
 86 |         self.tf_latent_topk_warm = None
 87 |         self.eval_preds_warm = None
 88 |         self.eval_preds_cold = None
 89 | 
 90 |     def build_model(self):
 91 |         """
 92 |         set up tf components for main DeepCF net
 93 |         call after setting up desired tf state (cpu/gpu etc...)
 94 | 
 95 |         Note: should use GPU
 96 |         """
 97 |         self.lr_placeholder = tf.compat.v1.placeholder(tf.float32, shape=[], name='learn_rate')
 98 |         self.phase = tf.compat.v1.placeholder(tf.bool, name='phase')
 99 |         self.target = tf.compat.v1.placeholder(tf.float32, shape=[None], name='target')
100 | 
101 |         self.Uin = tf.compat.v1.placeholder(tf.float32, shape=[None, self.rank_in], name='U_in_raw')
102 |         self.Vin = tf.compat.v1.placeholder(tf.float32, shape=[None, self.rank_in], name='V_in_raw')
103 |         if self.phi_u_dim>0:
104 |             self.Ucontent = tf.compat.v1.placeholder(tf.float32, shape=[None, self.phi_u_dim], name='U_content')
105 |             u_concat = tf.concat([self.Uin, self.Ucontent], 1)
106 |         else:
107 |             u_concat = self.Uin
108 | 
109 |         if self.phi_v_dim>0:
110 |             self.Vcontent = tf.compat.v1.placeholder(tf.float32, shape=[None, self.phi_v_dim], name='V_content')
111 |             v_concat = tf.concat([self.Vin, self.Vcontent], 1)
112 |         else:
113 |             v_concat = self.Vin
114 | 
115 |         print ('\tu_concat.shape=%s' % str(u_concat.get_shape()))
116 |         print ('\tv_concat.shape=%s' % str(v_concat.get_shape()))
117 | 
118 |         u_last = u_concat
119 |         v_last = v_concat
120 |         for ihid, hid in enumerate(self.model_select):
121 |             u_last = dense_batch_fc_tanh(u_last, hid, self.phase, 'user_layer_%d' % (ihid + 1), do_norm=True)
122 |             v_last = dense_batch_fc_tanh(v_last, hid, self.phase, 'item_layer_%d' % (ihid + 1), do_norm=True)
123 | 
124 |         with tf.compat.v1.variable_scope("self.U_embedding"):
125 |             u_emb_w = tf.Variable(tf.random.truncated_normal([u_last.get_shape().as_list()[1], self.rank_out], stddev=0.01),
126 |                                   name='u_emb_w')
127 |             u_emb_b = tf.Variable(tf.zeros([1, self.rank_out]), name='u_emb_b')
128 |             self.U_embedding = tf.matmul(u_last, u_emb_w) + u_emb_b
129 | 
130 |         with tf.compat.v1.variable_scope("V_embedding"):
131 |             v_emb_w = tf.Variable(tf.random.truncated_normal([v_last.get_shape().as_list()[1], self.rank_out], stddev=0.01),
132 |                                   name='v_emb_w')
133 |             v_emb_b = tf.Variable(tf.zeros([1, self.rank_out]), name='v_emb_b')
134 |             self.V_embedding = tf.matmul(v_last, v_emb_w) + v_emb_b
135 | 
136 |         with tf.compat.v1.variable_scope("loss"):
137 |             preds = tf.multiply(self.U_embedding, self.V_embedding)
138 |             self.preds = tf.reduce_sum(preds, 1)
139 |             self.loss = tf.reduce_mean(tf.math.squared_difference(self.preds, self.target))
140 | 
141 |         update_ops = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.UPDATE_OPS)
142 |         with tf.control_dependencies(update_ops):
143 |             # Ensures that we execute the update_ops before performing the train_step
144 |             self.updates = tf.compat.v1.train.MomentumOptimizer(self.lr_placeholder, 0.9).minimize(self.loss)
145 | 
146 |     def build_predictor(self, recall_at, num_candidates):
147 |         """
148 |         set up tf components for prediction and target selection
149 |         call after setting up desired tf state (cpu/gpu etc...)
150 | 
151 |         Note: should use CPU, as large inputs are expected
152 | 
153 |         :param recall_at: truncation to compute recall
154 |         :param num_candidates: number of candidates
155 |         :return:
156 |         """
157 |         self.eval_trainR = tf.compat.v1.sparse_placeholder(
158 |             dtype=tf.float32, shape=[None, None], name='trainR_sparse_CPU')
159 | 
160 |         with tf.compat.v1.variable_scope("eval"):
161 |             embedding_prod_cold = tf.matmul(self.U_embedding, self.V_embedding, transpose_b=True, name='pred_all_items')
162 |             embedding_prod_warm = tf.compat.v1.sparse_add(embedding_prod_cold, self.eval_trainR)
163 |             _, self.eval_preds_cold = tf.nn.top_k(embedding_prod_cold, k=recall_at[-1], sorted=True,
164 |                                                   name='topK_net_cold')
165 |             _, self.eval_preds_warm = tf.nn.top_k(embedding_prod_warm, k=recall_at[-1], sorted=True,
166 |                                                   name='topK_net_warm')
167 |         with tf.compat.v1.variable_scope("select_targets"):
168 |             self.U_pref_tf = tf.compat.v1.placeholder(tf.float32, shape=[None, self.rank_in], name='u_pref')
169 |             self.V_pref_tf = tf.compat.v1.placeholder(tf.float32, shape=[None, self.rank_in], name='v_pref')
170 |             self.rand_target_ui = tf.compat.v1.placeholder(tf.int32, shape=[None, None], name='rand_target_ui')
171 |             preds_pref = tf.matmul(self.U_pref_tf, self.V_pref_tf, transpose_b=True)
172 |             tf_topk_vals, tf_topk_inds = tf.nn.top_k(preds_pref, k=num_candidates, sorted=True, name='top_targets')
173 |             self.tf_topk_vals = tf.reshape(tf_topk_vals, [-1], name='select_y_vals')
174 |             self.tf_topk_inds = tf.reshape(tf_topk_inds, [-1], name='select_y_inds')
175 |             preds_random = tf.gather_nd(preds_pref, self.rand_target_ui)
176 |             self.preds_random = tf.reshape(preds_random, [-1], name='random_y_inds')
177 | 
178 |         # tf matmul-topk to get eval on latent
179 |         with tf.compat.v1.variable_scope("latent_eval"):
180 |             preds_pref_latent_warm = tf.compat.v1.sparse_add(preds_pref, self.eval_trainR)
181 |             _, self.tf_latent_topk_cold = tf.nn.top_k(preds_pref, k=recall_at[-1], sorted=True, name='topK_latent_cold')
182 |             _, self.tf_latent_topk_warm = tf.nn.top_k(preds_pref_latent_warm, k=recall_at[-1], sorted=True,
183 |                                                       name='topK_latent_warm')
184 | 
185 |     def get_eval_dict(self, _i, _eval_start, _eval_finish, eval_data):
186 |         """
187 |         packaging method to iterate evaluation data, select from start:finish
188 |         should be passed directly to batch method
189 | 
190 |         :param _i: slice id
191 |         :param _eval_start: integer beginning of slice
192 |         :param _eval_finish: integer end of slice
193 |         :param eval_data: package EvalData obj
194 |         :return:
195 |         """
196 |         _eval_dict = {
197 |             self.Uin: eval_data.U_pref_test[_eval_start:_eval_finish, :],
198 |             self.Vin: eval_data.V_pref_test,
199 |             self.Vcontent: eval_data.V_content_test,
200 |             self.phase: 0
201 |         }
202 |         if self.Ucontent!=None: 
203 |             _eval_dict[self.Ucontent]= eval_data.U_content_test[_eval_start:_eval_finish, :]
204 |         if not eval_data.is_cold:
205 |             _eval_dict[self.eval_trainR] = eval_data.tf_eval_train[_i]
206 |         return _eval_dict
207 | 
208 |     def get_eval_dict_latent(self, _i, _eval_start, _eval_finish, eval_data, u_pref, v_pref):
209 |         """
210 |         packaging method to iterate evaluation data, select from start:finish
211 |         uses preference input
212 |         should be passed directly to batch method
213 | 
214 |         :param _i: slice id
215 |         :param _eval_start: integer beginning of slice
216 |         :param _eval_finish: integer end of slice
217 |         :param eval_data: package EvalData obj
218 |         :param u_pref: user latent input to slice
219 |         :param v_pref: item latent input to slice
220 |         :return:
221 |         """
222 |         _eval_dict = {
223 |             self.U_pref_tf: u_pref[eval_data.test_user_ids[_eval_start:_eval_finish], :],
224 |             self.V_pref_tf: v_pref[eval_data.test_item_ids, :]
225 |         }
226 |         if not eval_data.is_cold:
227 |             _eval_dict[self.eval_trainR] = eval_data.tf_eval_train[_i]
228 |         return _eval_dict
229 | 


--------------------------------------------------------------------------------
/tf2/utils.py:
--------------------------------------------------------------------------------
  1 | import time
  2 | import datetime
  3 | import numpy as np
  4 | import scipy
  5 | import tensorflow as tf
  6 | from sklearn import preprocessing as prep
  7 | 
  8 | 
  9 | class timer(object):
 10 |     def __init__(self, name='default'):
 11 |         """
 12 |         timer object to record running time of functions, not for micro-benchmarking
 13 |         usage is:
 14 |             $ timer = utils.timer('name').tic()
 15 |             $ timer.toc('process A').tic()
 16 | 
 17 | 
 18 |         :param name: label for the timer
 19 |         """
 20 |         self._start_time = None
 21 |         self._name = name
 22 |         self.tic()
 23 | 
 24 |     def tic(self):
 25 |         self._start_time = time.time()
 26 |         return self
 27 | 
 28 |     def toc(self, message):
 29 |         elapsed = time.time() - self._start_time
 30 |         message = '' if message is None else message
 31 |         print('[{0:s}] {1:s} elapsed [{2:s}]'.format(self._name, message, timer._format(elapsed)))
 32 |         return self
 33 | 
 34 |     def reset(self):
 35 |         self._start_time = None
 36 |         return self
 37 | 
 38 |     @staticmethod
 39 |     def _format(s):
 40 |         delta = datetime.timedelta(seconds=s)
 41 |         d = datetime.datetime(1, 1, 1) + delta
 42 |         s = ''
 43 |         if (d.day - 1) > 0:
 44 |             s = s + '{:d} days'.format(d.day - 1)
 45 |         if d.hour > 0:
 46 |             s = s + '{:d} hr'.format(d.hour)
 47 |         if d.minute > 0:
 48 |             s = s + '{:d} min'.format(d.minute)
 49 |         s = s + '{:d} s'.format(d.second)
 50 |         return s
 51 | 
 52 | 
 53 | def batch(iterable, _n=1, drop=True):
 54 |     """
 55 |     returns batched version of some iterable
 56 |     :param iterable: iterable object as input
 57 |     :param _n: batch size
 58 |     :param drop: if true, drop extra if batch size does not divide evenly,
 59 |         otherwise keep them (last batch might be shorter)
 60 |     :return: batched version of iterable
 61 |     """
 62 |     it_len = len(iterable)
 63 |     for ndx in range(0, it_len, _n):
 64 |         if ndx + _n < it_len:
 65 |             yield iterable[ndx:ndx + _n]
 66 |         elif drop is False:
 67 |             yield iterable[ndx:it_len]
 68 | 
 69 | 
 70 | def tfidf(x):
 71 |     """
 72 |     compute tfidf of numpy array x
 73 |     :param x: input array, document by terms
 74 |     :return:
 75 |     """
 76 |     x_idf = np.log(x.shape[0] - 1) - np.log(1 + np.asarray(np.sum(x > 0, axis=0)).ravel())
 77 |     x_idf = np.asarray(x_idf)
 78 |     x_idf_diag = scipy.sparse.lil_matrix((len(x_idf), len(x_idf)))
 79 |     x_idf_diag.setdiag(x_idf)
 80 |     x_tf = x.tocsr()
 81 |     x_tf.data = np.log(x_tf.data + 1)
 82 |     x_tfidf = x_tf * x_idf_diag
 83 |     return x_tfidf
 84 | 
 85 | 
 86 | def prep_standardize(x):
 87 |     """
 88 |     takes sparse input and compute standardized version
 89 | 
 90 |     Note:
 91 |         cap at 5 std
 92 | 
 93 |     :param x: 2D scipy sparse data array to standardize (column-wise), must support row indexing
 94 |     :return: the object to perform scale (stores mean/std) for inference, as well as the scaled x
 95 |     """
 96 |     x_nzrow = x.any(axis=1)
 97 |     scaler = prep.StandardScaler().fit(x[x_nzrow, :])
 98 |     x_scaled = np.copy(x)
 99 |     x_scaled[x_nzrow, :] = scaler.transform(x_scaled[x_nzrow, :])
100 |     x_scaled[x_scaled > 5] = 5
101 |     x_scaled[x_scaled < -5] = -5
102 |     x_scaled[np.absolute(x_scaled) < 1e-5] = 0
103 |     return scaler, x_scaled
104 | 
105 | 
def prep_standardize_dense(x):
    """
    standardize every row of a dense array column-wise

    Note:
        values are capped at +/- 5 std and entries with magnitude below 1e-5 are zeroed

    :param x: 2D numpy data array to standardize (column-wise)
    :return: the fitted scaler (stores mean/std) for inference, as well as the scaled x
    """
    cap = 5
    scaler = prep.StandardScaler().fit(x)
    x_scaled = scaler.transform(x)

    above = x_scaled > cap
    x_scaled[above] = cap
    below = x_scaled < -cap
    x_scaled[below] = -cap

    negligible = np.absolute(x_scaled) < 1e-5
    x_scaled[negligible] = 0
    return scaler, x_scaled
122 | 
123 | 
def batch_eval_recall(_sess, tf_eval, eval_feed_dict, recall_k, eval_data):
    """
    given EvalData and DropoutNet compute graph in TensorFlow, runs batch evaluation

    :param _sess: tf session
    :param tf_eval: the evaluate output symbol in tf, expected to yield per-user
        ranked item indices (one row per user, best item first)
    :param eval_feed_dict: method to parse tf, pick from EvalData method,
        called as eval_feed_dict(batch_id, start, stop, eval_data)
    :param recall_k: list of thresholds to compute recall at (information retrieval recall)
    :param eval_data: EvalData instance
    :return: recall array at thresholds matching recall_k
    """
    preds_per_batch = []
    for batch_id, (eval_start, eval_stop) in enumerate(eval_data.eval_batch):
        preds_per_batch.append(
            _sess.run(tf_eval,
                      feed_dict=eval_feed_dict(batch_id, eval_start, eval_stop, eval_data)))
    tf_eval_preds = np.concatenate(preds_per_batch)
    tf.compat.v1.local_variables_initializer().run()

    # filter non-zero targets: users without any test interaction have no recall
    has_target = np.array([len(row) > 0 for row in eval_data.R_test_inf.rows], dtype=bool)
    y_nz = np.flatnonzero(has_target)

    preds_all = tf_eval_preds[y_nz, :]
    y = eval_data.R_test_inf[y_nz, :]
    n_relevant = np.sum(y, 1)
    user_ids = np.arange(preds_all.shape[0])

    recall = []
    for at_k in recall_k:
        preds_k = preds_all[:, :at_k]

        # indicator matrix with a 1 at every (user, recommended item) position;
        # built from coordinates so no ragged object arrays are needed
        x = scipy.sparse.csr_matrix(
            (np.ones(preds_k.size),
             (np.repeat(user_ids, preds_k.shape[1]), preds_k.ravel())),
            shape=y.shape)

        hits = y.multiply(x)
        recall.append(np.mean(np.divide(np.sum(hits, 1), n_relevant)))
    return recall
164 | 


--------------------------------------------------------------------------------
/torch/data.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import torch
  3 | import scipy.sparse
  4 | import utils
  5 | import pandas as pd
  6 | 
  7 | """
  8 | This module contains class and methods related to data used in DropoutNet  
  9 | """
 10 | 
 11 | 
 12 | def load_eval_data(test_file, test_id_file, name, cold, train_data, citeu=False):
 13 |     timer = utils.timer(name='utils')
 14 |     with open(test_id_file) as f:
 15 |         test_item_ids = [int(line) for line in f]
 16 |         test_data = pd.read_csv(test_file, delimiter=",", header=None, dtype=np.int32).values.ravel()
 17 |         if citeu:
 18 |             test_data = test_data.view(
 19 |             dtype=[('uid', np.int32), ('iid', np.int32), ('inter', np.int32)])
 20 |         else:
 21 |             test_data = test_data.view(
 22 |             dtype=[('uid', np.int32), ('iid', np.int32), ('inter', np.int32), ('date', np.int32)])
 23 |         timer.toc('read %s triplets %s' % (name, test_data.shape)).tic()
 24 |         eval_data = EvalData(
 25 |             test_data,
 26 |             test_item_ids,
 27 |             is_cold=cold,
 28 |             train=train_data
 29 |         )
 30 |         timer.toc('loaded %s' % name).tic()
 31 |         print(eval_data.get_stats_string())
 32 |         return eval_data
 33 | 
 34 | 
 35 | class EvalData:
 36 |     """
 37 |     EvalData:
 38 |         EvalData packages test triplet (user, item, score) into appropriate formats for evaluation
 39 | 
 40 |         Compact Indices:
 41 |             Specifically, this builds compact indices and stores mapping between original and compact indices.
 42 |             Compact indices only contains:
 43 |                 1) items in test set
 44 |                 2) users who interacted with such test items
 45 |             These compact indices speed up testing significantly by ignoring irrelevant users or items
 46 | 
 47 |         Args:
 48 |             test_triplets(int triplets): user-item-interaction_value triplet to build the test data
 49 |             train(int triplets): user-item-interaction_value triplet from train data
 50 | 
 51 |         Attributes:
 52 |             is_cold(boolean): whether test data is used for cold start problem
 53 |             test_item_ids(list of int): maps compressed item ids to original item ids (via position)
 54 |             test_item_ids_map(dictionary of int->int): maps original item ids to compressed item ids
 55 |             test_user_ids(list of int): maps compressed user ids to original user ids (via position)
 56 |             test_user_ids_map(dictionary of int->int): maps original user ids to compressed user ids
 57 |             R_test_inf(scipy lil matrix): pre-built compressed test matrix
 58 |             R_train_inf(scipy lil matrix): pre-built compressed train matrix for testing
 59 | 
 60 |             other relevant input/output exposed from tensorflow graph
 61 | 
 62 |     """
 63 | 
 64 |     def __init__(self, test_triplets, test_item_ids, is_cold, train):
 65 |         # build map both-ways between compact and original indices
 66 |         # compact indices only contains:
 67 |         #  1) items in test set
 68 |         #  2) users who interacted with such test items
 69 | 
 70 |         self.is_cold = is_cold
 71 | 
 72 |         self.test_item_ids = test_item_ids
 73 |         # test_item_ids_map
 74 |         self.test_item_ids_map = {iid: i for i, iid in enumerate(self.test_item_ids)}
 75 | 
 76 |         _test_ij_for_inf = [(t[0], t[1]) for t in test_triplets if t[1] in self.test_item_ids_map]
 77 |         # test_user_ids
 78 |         self.test_user_ids = np.unique(test_triplets['uid'])
 79 |         # test_user_ids_map
 80 |         self.test_user_ids_map = {user_id: i for i, user_id in enumerate(self.test_user_ids)}
 81 | 
 82 |         _test_i_for_inf = [self.test_user_ids_map[_t[0]] for _t in _test_ij_for_inf]
 83 |         _test_j_for_inf = [self.test_item_ids_map[_t[1]] for _t in _test_ij_for_inf]
 84 |         self.R_test_inf = scipy.sparse.coo_matrix(
 85 |             (np.ones(len(_test_i_for_inf)),
 86 |              (_test_i_for_inf, _test_j_for_inf)),
 87 |             shape=[len(self.test_user_ids), len(self.test_item_ids)]
 88 |         ).tolil(copy=False)
 89 | 
 90 |         train_ij_for_inf = [(self.test_user_ids_map[_t[0]], self.test_item_ids_map[_t[1]]) for _t
 91 |                             in train
 92 |                             if _t[1] in self.test_item_ids_map and _t[0] in self.test_user_ids_map]
 93 |         if self.is_cold and len(train_ij_for_inf) != 0:
 94 |             raise Exception('using cold dataset, but data is not cold!')
 95 |         if not self.is_cold and len(train_ij_for_inf) == 0:
 96 |             raise Exception('using warm datset, but data is not warm!')
 97 | 
 98 |         self.R_train_inf = None if self.is_cold else scipy.sparse.coo_matrix((
 99 |             np.ones(len(train_ij_for_inf)),
100 |             zip(*train_ij_for_inf)), shape=self.R_test_inf.shape).tolil(copy=False)
101 | 
102 |         # allocate fields
103 |         self.U_pref_test = None
104 |         self.V_pref_test = None
105 |         self.V_content_test = None
106 |         self.U_content_test = None
107 |         self.tf_eval_train = None
108 |         self.tf_eval_test = None
109 |         self.eval_batch = None
110 | 
111 |     def init_tf(self, user_factors, item_factors, user_content, item_content, eval_run_batchsize):
112 |         self.U_pref_test = user_factors[self.test_user_ids, :]
113 |         self.V_pref_test = item_factors[self.test_item_ids, :]
114 |         self.V_content_test = item_content[self.test_item_ids, :]
115 |         if scipy.sparse.issparse(self.V_content_test):
116 |             self.V_content_test = self.V_content_test.todense()
117 |         if user_content!=None:
118 |             self.U_content_test = user_content[self.test_user_ids, :]
119 |             if scipy.sparse.issparse(self.U_content_test):
120 |                 self.U_content_test = self.U_content_test.todense()
121 |         eval_l = self.R_test_inf.shape[0]
122 |         self.eval_batch = [(x, min(x + eval_run_batchsize, eval_l)) for x
123 |                            in range(0, eval_l, eval_run_batchsize)]
124 | 
125 |         self.tf_eval_train = []
126 |         self.tf_eval_test = []
127 | 
128 |         if not self.is_cold:
129 |             for (eval_start, eval_finish) in self.eval_batch:
130 |                 _ui = self.R_train_inf[eval_start:eval_finish, :].tocoo()
131 |                 _ui = list(zip(_ui.row, _ui.col))
132 |                 self.tf_eval_train.append(
133 |                     torch.sparse_coo_tensor(
134 |                         indices=np.array(_ui).T,
135 |                         values=np.full(len(_ui), -100000, dtype=np.float32),
136 |                         size=[eval_finish - eval_start, self.R_train_inf.shape[1]]
137 |                     )
138 |                 )
139 | 
140 |     def get_stats_string(self):
141 |         return ('\tn_test_users:[%d]\n\tn_test_items:[%d]' % (len(self.test_user_ids), len(self.test_item_ids))
142 |                 + '\n\tR_train_inf: %s' % (
143 |                     'no R_train_inf for cold' if self.is_cold else 'shape=%s nnz=[%d]' % (
144 |                         str(self.R_train_inf.shape), len(self.R_train_inf.nonzero()[0])
145 |                     )
146 |                 )
147 |                 + '\n\tR_test_inf: shape=%s nnz=[%d]' % (
148 |                     str(self.R_test_inf.shape), len(self.R_test_inf.nonzero()[0])
149 |                 ))
150 | 


--------------------------------------------------------------------------------
/torch/main.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import pandas as pd
  3 | import scipy
  4 | import torch
  5 | import datetime
  6 | from sklearn import datasets
  7 | from tqdm import tqdm
  8 | import argparse
  9 | import os
 10 | 
 11 | import utils
 12 | import data
 13 | import model
 14 | 
 15 | n_users = 1497020 + 1
 16 | n_items = 1306054 + 1
 17 | 
 18 | def main():
 19 |     data_path           = args.data_dir
 20 |     checkpoint_path     = args.checkpoint_path
 21 |     tb_log_path         = args.tb_log_path
 22 |     model_select        = args.model_select
 23 | 
 24 |     rank_out            = args.rank
 25 |     user_batch_size     = 1000
 26 |     n_scores_user       = 2500
 27 |     data_batch_size     = 100
 28 |     dropout             = args.dropout
 29 |     recall_at           = range(50, 550, 50)
 30 |     eval_batch_size     = 1000
 31 |     max_data_per_step   = 2500000
 32 |     eval_every          = args.eval_every
 33 |     num_epoch           = 10
 34 | 
 35 |     _lr = args.lr
 36 |     _decay_lr_every = 50
 37 |     _lr_decay = 0.1
 38 | 
 39 |     experiment = '%s_%s' % (
 40 |         datetime.datetime.now().strftime('%Y-%m-%d-%H:%M:%S'),
 41 |         '-'.join(str(x / 100) for x in model_select) if model_select else 'simple'
 42 |     )
 43 |     print('running: ' + experiment)
 44 | 
 45 |     dat = load_data(data_path)
 46 |     u_pref_scaled = dat['u_pref_scaled']
 47 |     v_pref_scaled = dat['v_pref_scaled']
 48 |     eval_warm = dat['eval_warm']
 49 |     eval_cold_user = dat['eval_cold_user']
 50 |     eval_cold_item = dat['eval_cold_item']
 51 |     user_content = dat['user_content']
 52 |     item_content = dat['item_content']
 53 |     u_pref = dat['u_pref']
 54 |     v_pref = dat['v_pref']
 55 |     user_indices = dat['user_indices']
 56 | 
 57 |     timer = utils.timer(name='main').tic()
 58 | 
 59 |     # append pref factors for faster dropout
 60 |     v_pref_expanded = np.vstack([v_pref_scaled, np.zeros_like(v_pref_scaled[0, :])])
 61 |     v_pref_last = v_pref_scaled.shape[0]
 62 |     u_pref_expanded = np.vstack([u_pref_scaled, np.zeros_like(u_pref_scaled[0, :])])
 63 |     u_pref_last = u_pref_scaled.shape[0]
 64 |     timer.toc('initialized numpy data')
 65 | 
 66 |     # prep eval
 67 |     eval_batch_size = eval_batch_size
 68 |     timer.tic()
 69 |     eval_warm.init_tf(u_pref_scaled, v_pref_scaled, user_content, item_content, eval_batch_size)
 70 |     timer.toc('initialized eval_warm').tic()
 71 |     eval_cold_user.init_tf(u_pref_scaled, v_pref_scaled, user_content, item_content, eval_batch_size)
 72 |     timer.toc('initialized eval_cold_user').tic()
 73 |     eval_cold_item.init_tf(u_pref_scaled, v_pref_scaled, user_content, item_content, eval_batch_size)
 74 |     timer.toc('initialized eval_cold_item').tic()
 75 | 
 76 |     dropout_net = model.get_model(latent_rank_in=u_pref.shape[1],
 77 |                                user_content_rank=user_content.shape[1],
 78 |                                item_content_rank=item_content.shape[1],
 79 |                                model_select=model_select,
 80 |                                rank_out=rank_out)
 81 | 
 82 |     row_index = np.copy(user_indices)
 83 |     n_step = 0
 84 |     best_cold_user = 0
 85 |     best_cold_item = 0
 86 |     best_warm = 0
 87 |     n_batch_trained = 0
 88 |     best_step = 0
 89 |     optimizer = torch.optim.SGD(dropout_net.parameters(), args.lr, momentum=0.9)
 90 |     crit = torch.nn.MSELoss()
 91 |     d_train = torch.device(args.model_device)
 92 |     d_eval = torch.device(args.inf_device)
 93 | 
 94 |     scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=_decay_lr_every, gamma=_lr_decay)
 95 |     dropout_net.to(d_train)
 96 |     dropout_net.train()
 97 | 
 98 |     for epoch in range(num_epoch):
 99 |         np.random.shuffle(row_index)
100 |         for b in utils.batch(row_index, user_batch_size):
101 |             n_step += 1
102 |             # prep targets
103 |             target_users = np.repeat(b, n_scores_user)
104 |             target_users_rand = np.repeat(np.arange(len(b)), n_scores_user)
105 |             target_items_rand = [np.random.choice(v_pref.shape[0], n_scores_user) for _ in b]
106 |             target_items_rand = np.array(target_items_rand).flatten()
107 |             target_ui_rand = np.transpose(np.vstack([target_users_rand, target_items_rand]))
108 |             
109 |             preds_pref = np.matmul(u_pref[b, :], v_pref.T)
110 |             preds_pref = torch.tensor(preds_pref)
111 |             target_scores, target_items = torch.topk(preds_pref, k=n_scores_user, sorted=True)
112 |             random_scores = preds_pref.detach().cpu().numpy()[target_ui_rand[:,0],target_ui_rand[:,1]]
113 | 
114 | 
115 |             # merge topN and randomN items per user
116 |             target_scores = np.append(target_scores, random_scores)
117 |             target_items = np.append(target_items, target_items_rand)
118 |             target_users = np.append(target_users, target_users)
119 | 
120 |             n_targets = len(target_scores)
121 |             perm = np.random.permutation(n_targets)
122 |             n_targets = min(n_targets, max_data_per_step)
123 |             data_batch = [(n, min(n + data_batch_size, n_targets)) for n in range(0, n_targets, data_batch_size)]
124 |             f_batch = 0
125 |             pbar = tqdm(data_batch, desc='ubatch')
126 |             
127 |             for (start, stop) in pbar:
128 |                 batch_perm = perm[start:stop]
129 |                 batch_users = target_users[batch_perm]
130 |                 batch_items = target_items[batch_perm]
131 |                 if dropout != 0:
132 |                     n_to_drop = int(np.floor(dropout * len(batch_perm)))
133 |                     perm_user = np.random.permutation(len(batch_perm))[:n_to_drop]
134 |                     perm_item = np.random.permutation(len(batch_perm))[:n_to_drop]
135 |                     batch_v_pref = np.copy(batch_items)
136 |                     batch_u_pref = np.copy(batch_users)
137 |                     batch_v_pref[perm_user] = v_pref_last
138 |                     batch_u_pref[perm_item] = u_pref_last
139 |                 else:
140 |                     batch_v_pref = batch_items
141 |                     batch_u_pref = batch_users
142 | 
143 |                 Uin = u_pref_expanded[batch_u_pref, :]
144 |                 Vin = v_pref_expanded[batch_v_pref, :]
145 |                 Ucontent = user_content[batch_users, :].todense()
146 |                 Vcontent = item_content[batch_items, :].todense()
147 |                 targets = target_scores[batch_perm]
148 |                 
149 |                 Uin = torch.tensor(Uin).to(d_train)
150 |                 Vin = torch.tensor(Vin).to(d_train)
151 |                 Ucontent = torch.tensor(Ucontent).to(d_train)
152 |                 Vcontent = torch.tensor(Vcontent).to(d_train)
153 |                 targets = torch.tensor(targets).to(d_train)
154 |                 
155 |                 preds, U_embedding, V_embedding = dropout_net.forward(Uin, Vin, Ucontent, Vcontent)
156 |                 loss = crit(preds, targets)
157 |                 loss_out = loss.item()
158 |                 
159 |                 optimizer.zero_grad()
160 |                 loss.backward()
161 |                 optimizer.step()
162 |                 f_batch += loss_out
163 |                 if np.isnan(f_batch):
164 |                     raise Exception('f is nan')
165 |                 n_batch_trained += 1
166 |                 pbar.set_description(f'updates={n_batch_trained/1000:.0f}k f={loss_out:.4f} f_tot={f_batch:.2f}')
167 |             # step after every ubatch, decay is based on # of ubatch
168 |             scheduler.step()
169 | 
170 |             if n_step % eval_every == 0:
171 |                 dropout_net.to(d_eval)
172 |                 dropout_net.eval()
173 | 
174 |                 recall_warm      = dropout_net.evaluate(recall_k=recall_at, eval_data=eval_warm,      device=d_eval)
175 |                 recall_cold_user = dropout_net.evaluate(recall_k=recall_at, eval_data=eval_cold_user, device=d_eval)
176 |                 recall_cold_item = dropout_net.evaluate(recall_k=recall_at, eval_data=eval_cold_item, device=d_eval)
177 | 
178 |                 dropout_net.to(d_train)
179 |                 dropout_net.train()
180 | 
181 |                 # checkpoint
182 |                 agg_cur = np.sum(recall_warm + recall_cold_user + recall_cold_item) 
183 |                 agg_best = np.sum(best_warm + best_cold_user + best_cold_item)
184 |                 if agg_cur > agg_best:
185 |                     best_cold_user = recall_cold_user
186 |                     best_cold_item = recall_cold_item
187 |                     best_warm      = recall_warm
188 |                     best_step      = n_step
189 | 
190 |                 timer.toc('%d [%d]b [%d]tot f=%.2f best[%d]' % (
191 |                     n_step, len(data_batch), n_batch_trained, f_batch, best_step
192 |                 )).tic()
193 |                 print ('\t\t'+' '.join([('@'+str(i)).ljust(6) for i in recall_at]))
194 |                 print('warm start\t%s\ncold user\t%s\ncold item\t%s' % (
195 |                     ' '.join(['%.4f' % i for i in recall_warm]),
196 |                     ' '.join(['%.4f' % i for i in recall_cold_user]),
197 |                     ' '.join(['%.4f' % i for i in recall_cold_item])
198 |                 ))
199 | 
200 | 
def load_data(data_path):
    """
    read every input needed for training/evaluation from ``data_path``

    The returned dict holds, in this order:
        u_pref / v_pref                 raw float32 preference matrices, reshaped to
                                        (n_users, 200) and (n_items, 200)
        u_pref_scaled / v_pref_scaled   second output of utils.prep_standardize
        user_content / item_content     svmlight feature matrices converted to LIL
        user_indices                    unique user ids found in the training split
        eval_warm / eval_cold_user / eval_cold_item
                                        results of data.load_eval_data for each test split

    :param data_path: root folder of the dataset
    :return: dict with the entries listed above
    """
    timer = utils.timer(name='main').tic()
    split_folder = os.path.join(data_path, 'warm')
    dat = {}

    # preference (latent) matrices stored as flat float32 binaries
    timer.tic()
    u_pref = np.fromfile(
        os.path.join(data_path, 'trained/warm/U.csv.bin'), dtype=np.float32
    ).reshape(n_users, 200)
    v_pref = np.fromfile(
        os.path.join(data_path, 'trained/warm/V.csv.bin'), dtype=np.float32
    ).reshape(n_items, 200)
    dat['u_pref'] = u_pref
    dat['v_pref'] = v_pref
    timer.toc('loaded U:%s,V:%s' % (str(u_pref.shape), str(v_pref.shape))).tic()

    # standardized copies; the fitted scalers are not kept
    dat['u_pref_scaled'] = utils.prep_standardize(u_pref)[1]
    dat['v_pref_scaled'] = utils.prep_standardize(v_pref)[1]
    timer.toc('standardized U,V').tic()

    # content features (svmlight format, labels are ignored)
    timer.tic()
    user_content, _user_labels = datasets.load_svmlight_file(
        os.path.join(data_path, 'user_features_0based.txt'), zero_based=True, dtype=np.float32)
    dat['user_content'] = user_content.tolil(copy=False)
    timer.toc('loaded user feature sparse matrix: %s' % (str(user_content.shape))).tic()
    item_content, _item_labels = datasets.load_svmlight_file(
        os.path.join(data_path, 'item_features_0based.txt'), zero_based=True, dtype=np.float32)
    dat['item_content'] = item_content.tolil(copy=False)
    timer.toc('loaded item feature sparse matrix: %s' % (str(item_content.shape))).tic()

    # training interactions, viewed as one structured record per csv row
    timer.tic()
    record_type = [('uid', np.int32), ('iid', np.int32), ('inter', np.int32), ('date', np.int32)]
    raw_train = pd.read_csv(
        os.path.join(split_folder, 'train.csv'), delimiter=",", header=None, dtype=np.int32)
    train = raw_train.values.ravel().view(dtype=record_type)
    dat['user_indices'] = np.unique(train['uid'])
    timer.toc('read train triplets %s' % train.shape).tic()

    # evaluation splits: (dict key, interactions file, item id file, cold flag)
    eval_splits = (
        ('eval_warm', 'test_warm.csv', 'test_warm_item_ids.csv', False),
        ('eval_cold_user', 'test_cold_user.csv', 'test_cold_user_item_ids.csv', True),
        ('eval_cold_item', 'test_cold_item.csv', 'test_cold_item_item_ids.csv', True),
    )
    for key, test_name, iid_name, is_cold in eval_splits:
        dat[key] = data.load_eval_data(
            os.path.join(split_folder, test_name),
            os.path.join(split_folder, iid_name),
            name=key,
            cold=is_cold,
            train_data=train,
        )
    return dat
257 | 
258 | 
if __name__ == "__main__":
    # Command-line entry point: declare the options, parse them into the
    # module-level ``args`` and hand control to main(), which is called
    # without arguments (presumably it reads ``args`` as a global).
    parser = argparse.ArgumentParser(description="Demo script to run DropoutNet on RecSys data",
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    # the only mandatory option: root folder of the dataset
    parser.add_argument('--data-dir', type=str, required=True, help='path to eval in the downloaded folder')

    # devices: training defaults to the first CUDA device, inference to the CPU
    parser.add_argument('--model-device', type=str, default='cuda:0', help='device to use for training')
    parser.add_argument('--inf-device', type=str, default='cpu', help='device to use for inference')
    # NOTE(review): the help text below still mentions TensorFlow although this is
    # the torch entry point - looks like a leftover from the TF version, confirm
    parser.add_argument('--checkpoint-path', type=str, default=None,
                        help='path to dump checkpoint data from TensorFlow')
    parser.add_argument('--tb-log-path', type=str, default=None,
                        help='path to dump TensorBoard logs')
    # network shape: one or more integers, one per hidden layer
    parser.add_argument('--model-select', nargs='+', type=int,
                        default=[800, 400],
                        help='specify the fully-connected architecture, starting from input,'
                             ' numbers indicate numbers of hidden units',
                        )
    # training hyper-parameters
    parser.add_argument('--rank', type=int, default=200, help='output rank of latent model')
    parser.add_argument('--dropout', type=float, default=0.5, help='DropoutNet dropout')
    parser.add_argument('--eval-every', type=int, default=2, help='evaluate every X user-batch')
    parser.add_argument('--lr', type=float, default=0.005, help='starting learning rate')

    args = parser.parse_args()
    main()
282 | 


--------------------------------------------------------------------------------
/torch/model.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import torch.nn as nn
  3 | import scipy
  4 | import numpy as np
  5 | 
  6 | from tqdm import tqdm
  7 | 
  8 | 
  9 | def truncated_normal_(tensor, mean=0, std=1):
 10 |     size = tensor.shape
 11 |     tmp = tensor.new_empty(size + (4,)).normal_()
 12 |     valid = (tmp < 2) & (tmp > -2)
 13 |     ind = valid.max(-1, keepdim=True)[1]
 14 |     tensor.data.copy_(tmp.gather(-1, ind).squeeze(-1))
 15 |     tensor.data.mul_(std).add_(mean)
 16 | 
 17 | @torch.no_grad()
 18 | def init_weights(net):
 19 |     if type(net) == nn.Linear:
 20 |         #torch.nn.init.normal_(net.weight, mean=0, std=0.01)
 21 |         truncated_normal_(net.weight, std=0.01)
 22 |         if net.bias is not None:
 23 |             torch.nn.init.constant_(net.bias, 0)
 24 | 
 25 | 
 26 | def get_model(latent_rank_in, user_content_rank, item_content_rank, model_select, rank_out):
 27 |     model = DeepCF(latent_rank_in, user_content_rank, item_content_rank, model_select, rank_out)
 28 |     model.apply(init_weights)
 29 |     return model
 30 |         
 31 | 
 32 | 
 33 | class TanHBlock(nn.Module):
 34 |     def __init__(self, dim_in, dim_out):
 35 |         super(TanHBlock, self).__init__()
 36 |         self.layer = nn.Linear(dim_in, dim_out)
 37 |         self.bn = nn.BatchNorm1d(
 38 |                 num_features=dim_out,
 39 |                 momentum=0.01,
 40 |                 eps=0.001
 41 |                 )
 42 | 
 43 |     
 44 |     def forward(self, x):
 45 |         out = self.layer(x)
 46 |         out = self.bn(out)
 47 |         out = torch.tanh(out)
 48 |         return out
 49 | 
 50 | class DeepCF(nn.Module):
 51 |     """
 52 |     main model class implementing DeepCF
 53 |     also stores states for fast candidate generation
 54 |     latent_rank_in: rank of preference model input
 55 |     user_content_rank: rank of user content input
 56 |     item_content_rank: rank of item content input
 57 |     model_select: array of number of hidden unit,
 58 |         i.e. [200,100] indicate two hidden layer with 200 units followed by 100 units
 59 |     rank_out: rank of latent model output
 60 |     """
 61 | 
 62 |     def __init__(self, latent_rank_in, user_content_rank, item_content_rank, model_select, rank_out):
 63 |         super(DeepCF, self).__init__()
 64 |         self.rank_in = latent_rank_in
 65 |         self.phi_u_dim = user_content_rank
 66 |         self.phi_v_dim = item_content_rank
 67 |         self.model_select = model_select
 68 |         self.rank_out = rank_out
 69 | 
 70 |         # inputs
 71 |         self.phase = None
 72 |         self.target = None
 73 |         self.eval_trainR = None
 74 |         self.U_pref_tf = None
 75 |         self.V_pref_tf = None
 76 |         self.rand_target_ui = None
 77 | 
 78 |         # outputs in the model
 79 |         self.updates = None
 80 | 
 81 |         # predictor
 82 |         self.tf_topk_vals = None
 83 |         self.tf_topk_inds = None
 84 |         self.preds_random = None
 85 |         self.tf_latent_topk_cold = None
 86 |         self.tf_latent_topk_warm = None
 87 |         self.eval_preds_warm = None
 88 |         self.eval_preds_cold = None
 89 |         
 90 |         u_dim = self.rank_in + self.phi_u_dim if self.phi_u_dim > 0 else self.rank_in
 91 |         v_dim = self.rank_in + self.phi_v_dim if self.phi_v_dim > 0 else self.rank_in
 92 | 
 93 |         print ('\tu_concat rank=%s' % str(u_dim))
 94 |         print ('\tv_concat rank=%s' % str(v_dim))
 95 |         
 96 |         u_dims = [u_dim] + self.model_select
 97 |         v_dims = [v_dim] + self.model_select
 98 |         self.u_layers = nn.ModuleList(TanHBlock(u_dims[i], u_dims[i + 1]) for i in range(len(u_dims) - 1))
 99 |         self.v_layers = nn.ModuleList(TanHBlock(v_dims[i], v_dims[i + 1]) for i in range(len(v_dims) - 1))
100 |         
101 |         self.u_emb = nn.Linear(u_dims[-1], self.rank_out)
102 |         self.v_emb = nn.Linear(v_dims[-1], self.rank_out)
103 | 
104 |     def encode(self, Uin, Vin, Ucontent, Vcontent):
105 |         
106 |         if self.phi_u_dim>0:
107 |             u_concat = torch.cat((Uin, Ucontent), 1)
108 |         else:
109 |             u_concat = Uin
110 | 
111 |         if self.phi_v_dim>0:
112 |             v_concat = torch.cat((Vin, Vcontent), 1)
113 |         else:
114 |             v_concat = Vin
115 |             
116 |         u_out = u_concat
117 |         for layer in self.u_layers:
118 |             u_out = layer(u_out)
119 |         U_embedding = self.u_emb(u_out)
120 |         
121 |         v_out = v_concat
122 |         for layer in self.v_layers:
123 |             v_out = layer(v_out)
124 |         V_embedding = self.v_emb(v_out)
125 |         return U_embedding, V_embedding
126 |         
127 |     def forward(self, Uin, Vin, Ucontent, Vcontent):
128 |         
129 |         U_embedding, V_embedding = self.encode(Uin, Vin, Ucontent, Vcontent)
130 |         
131 |         preds = U_embedding * V_embedding
132 |         preds = torch.sum(preds, 1)
133 |         return preds, U_embedding, V_embedding
134 | 
135 |     @torch.no_grad()
136 |     def evaluate(self, recall_k, eval_data, device=None):
137 |         """
138 |         given EvalData runs batch evaluation
139 |         :param recall_k: list of thresholds to compute recall at (information retrieval recall)
140 |         :param eval_data: EvalData instance
141 |         :return: recall array at thresholds matching recall_k
142 |         """
143 |         d = device
144 | 
145 |         tf_eval_preds_batch = []
146 |         for (batch, (eval_start, eval_stop)) in enumerate(tqdm(eval_data.eval_batch, desc='eval', leave=False)):
147 | 
148 |             Uin = eval_data.U_pref_test[eval_start:eval_stop, :]
149 |             Vin = eval_data.V_pref_test
150 |             Vcontent = eval_data.V_content_test
151 | 
152 |             if self.phi_u_dim > 0: 
153 |                 Ucontent= eval_data.U_content_test[eval_start:eval_stop, :]
154 |             else:
155 |                 Ucontent = None
156 | 
157 |             Uin = torch.tensor(Uin)
158 |             Vin = torch.tensor(Vin)
159 |             if Ucontent is not None:
160 |                 Ucontent = torch.tensor(Ucontent)
161 |             if Vcontent is not None:
162 |                 Vcontent = torch.tensor(Vcontent)
163 |             if d is not None:
164 |                 Uin = Uin.to(d)
165 |                 Vin = Vin.to(d)
166 |                 Ucontent = Ucontent.to(d)
167 |                 Vcontent = Vcontent.to(d)
168 |             U_embedding, V_embedding = self.encode(Uin, Vin, Ucontent, Vcontent)
169 |             embedding_prod = torch.matmul(U_embedding, V_embedding.t())
170 | 
171 | 
172 |             if not eval_data.is_cold:
173 |                 eval_trainR = eval_data.tf_eval_train[batch]
174 |                 embedding_prod = embedding_prod + eval_trainR
175 | 
176 |             _, eval_preds = torch.topk(embedding_prod, k=recall_k[-1], sorted=True)
177 |             tf_eval_preds_batch.append(eval_preds.detach().cpu().numpy())
178 | 
179 | 
180 |         tf_eval_preds = np.concatenate(tf_eval_preds_batch)
181 | 
182 |         # filter non-zero targets
183 |         y_nz = [len(x) > 0 for x in eval_data.R_test_inf.rows]
184 |         y_nz = np.arange(len(eval_data.R_test_inf.rows))[y_nz]
185 | 
186 |         preds_all = tf_eval_preds[y_nz, :]
187 | 
188 |         recall = []
189 |         for at_k in tqdm(recall_k, desc='recall', leave=False):
190 |             preds_k = preds_all[:, :at_k]
191 |             y = eval_data.R_test_inf[y_nz, :]
192 | 
193 |             x = scipy.sparse.lil_matrix(y.shape)
194 |             x.data = np.array([z.tolist() for z in np.ones_like(preds_k)]+[[]],dtype=object)[:-1]
195 |             x.rows = np.array([z.tolist() for z in preds_k]+[[]],dtype=object)[:-1]
196 |             z = y.multiply(x)
197 |             recall.append(np.mean(np.divide((np.sum(z, 1)), np.sum(y, 1))))
198 |         return recall
199 | 


--------------------------------------------------------------------------------
/torch/utils.py:
--------------------------------------------------------------------------------
  1 | import time
  2 | import datetime
  3 | import numpy as np
  4 | import scipy
  5 | from sklearn import preprocessing as prep
  6 | 
  7 | 
  8 | class timer(object):
  9 |     def __init__(self, name='default'):
 10 |         """
 11 |         timer object to record running time of functions, not for micro-benchmarking
 12 |         usage is:
 13 |             $ timer = utils.timer('name').tic()
 14 |             $ timer.toc('process A').tic()
 15 | 
 16 | 
 17 |         :param name: label for the timer
 18 |         """
 19 |         self._start_time = None
 20 |         self._name = name
 21 |         self.tic()
 22 | 
 23 |     def tic(self):
 24 |         self._start_time = time.time()
 25 |         return self
 26 | 
 27 |     def toc(self, message):
 28 |         elapsed = time.time() - self._start_time
 29 |         message = '' if message is None else message
 30 |         print('[{0:s}] {1:s} elapsed [{2:s}]'.format(self._name, message, timer._format(elapsed)))
 31 |         return self
 32 | 
 33 |     def reset(self):
 34 |         self._start_time = None
 35 |         return self
 36 | 
 37 |     @staticmethod
 38 |     def _format(s):
 39 |         delta = datetime.timedelta(seconds=s)
 40 |         d = datetime.datetime(1, 1, 1) + delta
 41 |         s = ''
 42 |         if (d.day - 1) > 0:
 43 |             s = s + '{:d} days'.format(d.day - 1)
 44 |         if d.hour > 0:
 45 |             s = s + '{:d} hr'.format(d.hour)
 46 |         if d.minute > 0:
 47 |             s = s + '{:d} min'.format(d.minute)
 48 |         s = s + '{:d} s'.format(d.second)
 49 |         return s
 50 | 
 51 | 
 52 | def batch(iterable, _n=1, drop=True):
 53 |     """
 54 |     returns batched version of some iterable
 55 |     :param iterable: iterable object as input
 56 |     :param _n: batch size
 57 |     :param drop: if true, drop extra if batch size does not divide evenly,
 58 |         otherwise keep them (last batch might be shorter)
 59 |     :return: batched version of iterable
 60 |     """
 61 |     it_len = len(iterable)
 62 |     for ndx in range(0, it_len, _n):
 63 |         if ndx + _n < it_len:
 64 |             yield iterable[ndx:ndx + _n]
 65 |         elif drop is False:
 66 |             yield iterable[ndx:it_len]
 67 | 
 68 | 
 69 | def tfidf(x):
 70 |     """
 71 |     compute tfidf of numpy array x
 72 |     :param x: input array, document by terms
 73 |     :return:
 74 |     """
 75 |     x_idf = np.log(x.shape[0] - 1) - np.log(1 + np.asarray(np.sum(x > 0, axis=0)).ravel())
 76 |     x_idf = np.asarray(x_idf)
 77 |     x_idf_diag = scipy.sparse.lil_matrix((len(x_idf), len(x_idf)))
 78 |     x_idf_diag.setdiag(x_idf)
 79 |     x_tf = x.tocsr()
 80 |     x_tf.data = np.log(x_tf.data + 1)
 81 |     x_tfidf = x_tf * x_idf_diag
 82 |     return x_tfidf
 83 | 
 84 | 
 85 | def prep_standardize(x):
 86 |     """
 87 |     takes sparse input and compute standardized version
 88 | 
 89 |     Note:
 90 |         cap at 5 std
 91 | 
 92 |     :param x: 2D scipy sparse data array to standardize (column-wise), must support row indexing
 93 |     :return: the object to perform scale (stores mean/std) for inference, as well as the scaled x
 94 |     """
 95 |     x_nzrow = x.any(axis=1)
 96 |     scaler = prep.StandardScaler().fit(x[x_nzrow, :])
 97 |     x_scaled = np.copy(x)
 98 |     x_scaled[x_nzrow, :] = scaler.transform(x_scaled[x_nzrow, :])
 99 |     x_scaled[x_scaled > 5] = 5
100 |     x_scaled[x_scaled < -5] = -5
101 |     x_scaled[np.absolute(x_scaled) < 1e-5] = 0
102 |     return scaler, x_scaled
103 | 
104 | 
def prep_standardize_dense(x):
    """
    column-wise standardization of a dense array, fitted on all rows

    Note:
        values are capped at +/- 5 std and magnitudes below 1e-5 are set to 0

    :param x: 2D numpy data array to standardize (column-wise)
    :return: the object to perform scale (stores mean/std) for inference, as well as the scaled x
    """
    scaler = prep.StandardScaler().fit(x)
    x_scaled = scaler.transform(x)

    # cap outliers, then flush tiny magnitudes to exactly zero
    np.clip(x_scaled, -5, 5, out=x_scaled)
    tiny = np.absolute(x_scaled) < 1e-5
    x_scaled[tiny] = 0
    return scaler, x_scaled
121 | 


--------------------------------------------------------------------------------