├── conf ├── reddit │ └── VAEmultilayer.config └── yelp_SIGIR │ └── VAEmultilayer.config ├── models ├── BaseModel.py ├── __init__.py └── VAEmultilayer.py ├── LICENSE ├── utils ├── Params.py ├── Logger.py ├── KAVgenerator.py ├── Table.py ├── HPShelper.py ├── io.py ├── Dataset.py ├── Tools.py ├── Trainer.py └── Evaluator.py ├── README.md ├── .gitignore └── main.py /conf/reddit/VAEmultilayer.config: -------------------------------------------------------------------------------- 1 | { 2 | "hidden_dim": 50, 3 | "act": "tanh", 4 | "sparse_normalization": false, 5 | "anneal_cap": 0.2, 6 | "total_anneal_steps": 0, 7 | "learning_rate": 0.001, 8 | "weight_decay": 1e-4, 9 | "batch_size": 800, 10 | "test_batch_size": 800, 11 | "num_epochs": 20, 12 | "early_stop": false, 13 | "patience": 100, 14 | "dropout_ratio": 0.2, 15 | "weighted_recon": 0.3 16 | } -------------------------------------------------------------------------------- /models/BaseModel.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | 3 | class BaseModel(nn.Module): 4 | def __init__(self): 5 | super(BaseModel, self).__init__() 6 | 7 | def forward(self, *input): 8 | pass 9 | 10 | def train_one_epoch(self, *input): 11 | pass 12 | 13 | def predict(self, eval_users, eval_pos, test_batch_size): 14 | pass 15 | 16 | def before_evaluate(self): 17 | pass -------------------------------------------------------------------------------- /conf/yelp_SIGIR/VAEmultilayer.config: -------------------------------------------------------------------------------- 1 | { 2 | "hidden_dim": 50, 3 | "act": "tanh", 4 | "sparse_normalization": false, 5 | "anneal_cap": 0.2, 6 | "total_anneal_steps": 0, 7 | "learning_rate": 0.001, 8 | "weight_decay": 1e-4, 9 | "batch_size": 800, 10 | "test_batch_size": 800, 11 | "num_epochs": 20, 12 | "early_stop": false, 13 | "patience": 100, 14 | "dropout_ratio": 0.2, 15 | "weighted_recon": 0.3 16 | } -------------------------------------------------------------------------------- /models/__init__.py: -------------------------------------------------------------------------------- 1 | # Non-eural 2 | # from models.ItemKNN import ItemKNN 3 | # from models.PureSVD import PureSVD 4 | # from models.SLIMElastic import SLIM 5 | # from models.P3a import P3a 6 | # from models.RP3b import RP3b 7 | # from models.EASE import EASE 8 | 9 | # VAEs 10 | from models.VAEmultilayer import VAEmultilayer 11 | 12 | # from models.DAE import DAE 13 | # from models.CDAE import CDAE 14 | # from models.BPRMF import BPRMF 15 | # from models.MultVAE import MultVAE 16 | # from models.NGCF import NGCF 17 | # from models.LightGCN import LightGCN 18 | 19 | # __all__ = ['ItemKNN', 'PureSVD', 'P3a', 'RP3b', 'SLIM', 'EASE', 'DAE', 'CDAE', 'BPRMF', 'MultVAE', 'AE'] 20 | __all__ = ['VAEmultilayer'] 21 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Zhaolin Gao 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above 
copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /utils/Params.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | """ 4 | Codes from: 5 | https://github.com/cs230-stanford/cs230-code-examples/blob/master/pytorch/nlp/utils.py 6 | """ 7 | 8 | 9 | class Params(): 10 | """Class that loads hyperparameters from a json file. 11 | Example: 12 | ``` 13 | params = Params(json_path) 14 | print(params.learning_rate) 15 | params.learning_rate = 0.5 # change the value of learning_rate in params 16 | ``` 17 | """ 18 | 19 | def __init__(self): 20 | return 21 | 22 | def update_dict(self, dic): 23 | self.__dict__.update(dic) 24 | 25 | def save(self, json_path): 26 | with open(json_path, 'w') as f: 27 | json.dump(self.__dict__, f, indent=4) 28 | 29 | def update(self, json_path): 30 | """Loads parameters from json file""" 31 | with open(json_path) as f: 32 | params = json.load(f) 33 | self.__dict__.update(params) 34 | 35 | def __str__(self): 36 | # return string representation of 'Parameters' class 37 | # print(Parameters) or str(Parameters) 38 | ret = '======== [Config] ========\n' 39 | for k in self.__dict__: 40 | ret += '%s: %s\n' % (str(k), str(self.__dict__[k])) 41 | ret += '\n' 42 | return ret 43 | 44 | @property 45 | def dict(self): 46 | """ 47 | Gives dict-like access to params instance by 48 | `params.dict['learning_rate'] 49 | """ 50 | return self.__dict__ 51 | 52 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # TD-VAE-CF 2 | 3 | This repo covers the implementation for our paper: 4 | 5 | Zhaolin Gao, Tianshu Shen, Zheda Mai, Mohamed Reda Bouadjenek, Isaac Waller, Ashton Anderson, Ron Bodkin, and Scott Sanner. "Mitigating the Filter Bubble while Maintaining Relevance: Targeted Diversification with VAE-based Recommender Systems" In Proceedings of the 45th International ACM SIGIR Conference on Research and Development in Information Retrieval (SIGIR 22). 6 | 7 | ## Instructions 8 | 9 | 1. Download dataset from `https://drive.google.com/drive/folders/1o1izS1Mjptmq8SG5lodc2F6guiifxna7?usp=sharing`. 10 | 11 | 2. Modify the `api_key` in line 96 of `main.py` to your api key on comet_ml. 12 | 13 | 3. 
Train and evaluate: 14 | ``` 15 | python main.py --data_name yelp_SIGIR --target veg_bbq --lamb LAMB_VALUE --std STD_VALUE 16 | python main.py --data_name yelp_SIGIR --target fried_salad --lamb LAMB_VALUE --std STD_VALUE 17 | python main.py --data_name reddit --target men_women --lamb LAMB_VALUE --std STD_VALUE 18 | python main.py --data_name reddit --target rep_dem --lamb LAMB_VALUE --std STD_VALUE 19 | ``` 20 | 21 | ## Citation 22 | 23 | If you find this code useful in your research, please cite the following paper: 24 | 25 | @inproceedings{gao2022sigir, 26 | title={Mitigating the Filter Bubble while Maintaining Relevance: Targeted Diversification with VAE-based Recommender Systems}, 27 | author={Zhaolin Gao, Tianshu Shen, Zheda Mai, Mohamed Reda Bouadjenek, Isaac Waller, Ashton Anderson, Ron Bodkin, Scott Sanner}, 28 | booktitle={Proceedings of the 45th International ACM SIGIR Conference on Research and Development in Information Retrieval}, 29 | year={2022} 30 | } 31 | 32 | ## Credit 33 | 34 | Reddit dataset is obtained using [[PushShift](https://github.com/pushshift/api)] 35 | -------------------------------------------------------------------------------- /utils/Logger.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import time 4 | import logging 5 | 6 | class Logger: 7 | def __init__(self, log_dir): 8 | self.logger = logging.getLogger('RecSys') 9 | self.logger.setLevel(logging.INFO) 10 | 11 | # File handler 12 | self.log_dir = self.get_log_dir(log_dir) 13 | fh = logging.FileHandler(os.path.join(self.log_dir, 'log.txt')) 14 | fh.setLevel(logging.DEBUG) 15 | fh_format = logging.Formatter('%(asctime)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S') 16 | fh.setFormatter(fh_format) 17 | self.logger.addHandler(fh) 18 | 19 | # Console handler 20 | ch = logging.StreamHandler(sys.stdout) 21 | ch.setLevel(logging.INFO) 22 | ch_format = logging.Formatter('%(message)s') 23 | ch.setFormatter(ch_format) 24 | self.logger.addHandler(ch) 25 | 26 | def info(self, msg): 27 | self.logger.info(msg) 28 | 29 | def get_log_dir(self, log_dir): 30 | if not os.path.exists(log_dir): 31 | os.makedirs(log_dir) 32 | 33 | log_dirs = os.listdir(log_dir) 34 | if len(log_dirs) == 0: 35 | idx = 0 36 | else: 37 | idx_lis = [] 38 | for d in log_dirs: 39 | try: 40 | current_idx = int(d.split('_')[0]) 41 | idx_lis.append(current_idx) 42 | except: 43 | continue 44 | 45 | idx_list = sorted(idx_lis) 46 | idx = idx_list[-1] + 1 47 | 48 | cur_log_dir = '%d_%s' % (idx, time.strftime('%Y%m%d-%H%M')) 49 | full_log_dir = os.path.join(log_dir, cur_log_dir) 50 | if not os.path.exists(full_log_dir): 51 | os.mkdir(full_log_dir) 52 | 53 | return full_log_dir -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /utils/KAVgenerator.py: -------------------------------------------------------------------------------- 1 | from sklearn import linear_model 2 | import numpy as np 3 | from tqdm import tqdm 4 | 5 | 6 | class KAVgenerator: 7 | def __init__(self, positive_embeddings, negative_embeddings): 8 | '''Takes in the embeddings for the postivie political spectrum dimension, 9 | as well as the negative political spectrum dimension embeddings''' 10 | self.positive_embeddings = positive_embeddings 11 | self.negative_embeddings = negative_embeddings 12 | self.num_positive = self.positive_embeddings.shape[0] 13 | self.num_negative = self.negative_embeddings.shape[0] 14 | self.dim_activate = self.positive_embeddings.shape[1] 15 | 16 | def _get_cav(self, num_neg=1,num_vec=100): 17 | vectors = [] 18 | if self.num_positive == 0 or self.num_negative == 0: 19 | #activation vectors are all zero vector 20 | return np.zeros(shape=(num_vec, self.dim_activate)) 21 | 22 | for _ in range(num_vec): 23 | # Get positive and negative sample id's, and their vector embeddings 24 | positive_samples = np.random.choice(list(range(self.positive_embeddings.shape[0])), num_neg) 25 | negative_samples = np.random.choice(list(range(self.negative_embeddings.shape[0])), num_neg) 26 | v_positive_samples = self.positive_embeddings[positive_samples] 27 | v_negative_samples = self.negative_embeddings[negative_samples] 28 | 29 | X = np.vstack((v_positive_samples,v_negative_samples)) 30 | Y = [1]*len(positive_samples) + [0]*len(negative_samples) 31 | lm = linear_model.LogisticRegression() 32 | lm.fit(X, Y) 33 | vectors.append(lm.coef_[0]) 34 | return 
self.normalize_rows(np.vstack(vectors)) 35 | 36 | def get_all_cav(self, num_negatives,num_cav): 37 | ret = [] 38 | print("Generate Activation Vector") 39 | # get the directional vector for this component 40 | kavs = self._get_cav(num_negatives, num_cav) 41 | ret.append(kavs) 42 | #kp by sample by dim 43 | return np.stack(ret,axis=0) 44 | # return kavs 45 | 46 | def get_all_mean_cav(self, num_negatives, num_cav): 47 | all_cav = self.get_all_cav(num_negatives, num_cav) 48 | # print(all_cav.shape) 49 | return np.mean(all_cav, axis=1) 50 | 51 | def normalize_rows(self, x): 52 | #return x 53 | """ 54 | function that normalizes each row of the matrix x to have unit length. 55 | Args: 56 | ``x``: A numpy matrix of shape (n, m) 57 | Returns: 58 | ``x``: The normalized (by row) numpy matrix. 59 | """ 60 | return x/np.linalg.norm(x, ord=2, axis=1, keepdims=True) 61 | -------------------------------------------------------------------------------- /utils/Table.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from collections import OrderedDict 3 | 4 | class Table: 5 | """ 6 | 7 | Class to save and show result neatly. 8 | First column is always 'NAME' column. 9 | 10 | """ 11 | def __init__(self, table_name='table', header=None, splitter='||', int_formatter='%3d', float_formatter='%.4f'): 12 | """ 13 | Initialize table setting. 14 | 15 | :param list header: list of string, table headers. 16 | :param str splitter: 17 | :param str int_formatter: 18 | :param str float_formatter: 19 | """ 20 | self.table_name = table_name 21 | self.header = header 22 | if self.header is not None: 23 | self.set_headers(self.header) 24 | self.num_rows = 0 25 | self.splitter = splitter 26 | self.int_formatter = int_formatter 27 | self.float_formatter = float_formatter 28 | 29 | def set_headers(self, header): 30 | """ 31 | Set table headers as given and clear all data. 32 | 33 | :param list header: list of header strings 34 | :return: None 35 | """ 36 | self.header = ['NAME'] + header 37 | self.data = OrderedDict([(h, []) for h in self.header]) 38 | self.max_len = OrderedDict([(h, len(h)) for h in self.header]) 39 | # {h: len(h) for h in self.header} 40 | 41 | def add_row(self, row_name, row_dict): 42 | """ 43 | Add new row into the table. 44 | 45 | :param str row_name: name of the row, which will be the first column 46 | :param dict row_dict: dictionary containing column name as a key and column value as value. 47 | :return: None 48 | """ 49 | 50 | # If header is not defined, fetch from input dict 51 | if self.header is None: 52 | self.set_headers(list(row_dict.keys())) 53 | 54 | # If input dict has new column, make one 55 | for key in row_dict: 56 | if key not in self.data: 57 | self.data[key] = ['-'] * self.num_rows 58 | self.header.append(key) 59 | 60 | for h in self.header: 61 | if h == 'NAME': 62 | self.data['NAME'].append(row_name) 63 | self.max_len[h] = max(self.max_len['NAME'], len(row_name)) 64 | else: 65 | # If input dict doesn't have values for table header, make empty value. 
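# Note on the formatting below: columns absent from row_dict are filled with '-', int and
# float values are rendered with int_formatter / float_formatter respectively, and max_len
# tracks the widest cell per column so row_to_line can left-align every field.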
66 | if h not in row_dict: 67 | row_dict[h] = '-' 68 | 69 | # convert input dict to string 70 | d = row_dict[h] 71 | 72 | if isinstance(d, (int, np.integer)): 73 | d_str = self.int_formatter % d 74 | elif isinstance(d, (float, np.float)): 75 | d_str = self.float_formatter % d 76 | elif isinstance(d, str): 77 | d_str = d 78 | else: 79 | print('Table add row WARNING: Type %s converted to string' % type(d)) 80 | d_str = str(d) 81 | # raise NotImplementedError('Type %s not implemented.' % type(d)) 82 | 83 | self.data[h].append(d_str) 84 | self.max_len[h] = max(self.max_len[h], len(d_str)) 85 | self.num_rows += 1 86 | 87 | def row_to_line(self, row_values): 88 | """ 89 | Convert a row into string form 90 | 91 | :param list row_values: list of row values as string 92 | :return: string form of a row 93 | """ 94 | value_str = [] 95 | for i, header in enumerate(self.header): 96 | max_length = self.max_len[header] 97 | length = len(row_values[i]) 98 | diff = max_length - length 99 | 100 | # Left align 101 | s = row_values[i] + ' ' * diff 102 | value_str.append(s) 103 | 104 | return self.splitter + ' ' + (' %s ' % self.splitter).join(value_str) + ' ' + self.splitter 105 | 106 | def to_string(self): 107 | """ 108 | Convert a table into string form 109 | 110 | :return: string form of the table 111 | """ 112 | size_per_col = {h: self.max_len[h] + 2 + len(self.splitter) for h in self.header} 113 | line_len = sum([size_per_col[c] for c in size_per_col]) + len(self.splitter) 114 | table_str = '\n' 115 | 116 | # TABLE NAME 117 | table_str += self.table_name + '\n' 118 | 119 | # HEADER 120 | line = self.row_to_line(self.header) 121 | table_str += '=' * line_len + '\n' 122 | table_str += line + '\n' 123 | table_str += self.splitter + '-' * (line_len - len(self.splitter) * 2) + self.splitter + '\n' 124 | 125 | # DATA 126 | for row_values in zip(*self.data.values()): 127 | line = self.row_to_line(row_values) 128 | table_str += line + '\n' 129 | table_str += '=' * line_len + '\n' 130 | return table_str 131 | 132 | def show(self): 133 | print(self.to_string()) 134 | 135 | @property 136 | def shape(self): 137 | return (self.num_rows, self.num_cols) 138 | 139 | @property 140 | def num_cols(self): 141 | return len(self.header) -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | from comet_ml import Experiment 4 | import numpy as np 5 | import pandas as pd 6 | import torch 7 | import models 8 | from utils.Dataset import Dataset, Reddit_Dataset 9 | from utils.Evaluator import Evaluator 10 | from utils.HPShelper import conf_dict_generator 11 | from utils.Logger import Logger 12 | from utils.Params import Params 13 | from utils.Trainer import Trainer 14 | from utils.io import load_dataframe_csv, save_dataframe_csv 15 | 16 | def fit(experiment_, model_name, data_name_, target_, lamb_, std_, dataset_, log_directory, device_, skip_eval, plot_graph, run_samples): 17 | # dictionary generate from experiment 18 | d = conf_dict_generator[model_name](experiment_) 19 | d['skip_eval'] = skip_eval 20 | conf_dict = Params() 21 | conf_dict.update_dict(d) 22 | 23 | model_base = getattr(models, model_name) 24 | if 'contrast' in model_name: 25 | model_ = model_base(conf_dict, dataset_.num_users, dataset_.num_items, dataset_.num_keyphrases, device_) 26 | else: 27 | model_ = model_base(conf_dict, dataset_.num_users, dataset_.num_items, device_) 28 | 29 | evaluator = 
Evaluator(rec_atK=[5, 10, 15, 20, 50], explain_atK=[5, 10, 15, 20, 50], lamb=lamb_, std=std_) 30 | logger = Logger(log_directory) 31 | logger.info(conf_dict) 32 | logger.info(dataset_) 33 | 34 | trainer = Trainer( 35 | dataname=data_name_, 36 | target=target_, 37 | dataset=dataset_, 38 | model=model_, 39 | evaluator=evaluator, 40 | logger=logger, 41 | conf=conf_dict, 42 | experiment=experiment_, 43 | plot_graph=plot_graph, # plot the stats for embeddings 44 | run_samples=run_samples # run a 2D use case 45 | ) 46 | 47 | trainer.train() 48 | return (trainer.best_rec_score, trainer.best_uk_score, 49 | trainer.best_epoch, model_) 50 | 51 | 52 | if __name__ == "__main__": 53 | 54 | parser = argparse.ArgumentParser() 55 | parser.add_argument('--model_name', type=str, default='VAEmultilayer') 56 | parser.add_argument('--data_name', type=str, default='yelp_SIGIR') 57 | parser.add_argument('--target', type=str, default='veg_bbq', help='[veg_bbq, fried_salad, men_women, rep_dem]') 58 | parser.add_argument('--fold_name', type=str, default='fold0') 59 | parser.add_argument('--top_items', type=int, default=10, help='used to indicate top labels for each item') 60 | parser.add_argument('--top_users', type=int, help='if cuting the matrix with top user numbers') 61 | parser.add_argument('--rating_threshold', type=float, default=1, 62 | help='used to indicate user liked items for generating uk matrices') 63 | parser.add_argument('--lamb', type=float, default=0.6) 64 | parser.add_argument('--std', type=float, default=10) 65 | 66 | parser.add_argument('--plot_graph', action='store_true', help='Whether plotting the statistical graphs') 67 | parser.add_argument('--skip_eval', action='store_true') 68 | parser.add_argument('--run_samples', action='store_true') 69 | parser.add_argument('--conf', type=str, default='VAEmultilayer.config') 70 | parser.add_argument('--seed', type=int, default=201231) 71 | p = parser.parse_args() 72 | 73 | np.random.seed(p.seed) 74 | torch.random.manual_seed(p.seed) 75 | 76 | # where the training data files are stored 77 | log_dir = "{}/{}/{}/".format("./saves", p.data_name, p.model_name) 78 | config_dir = "{}/{}/{}".format("./conf", p.data_name, p.conf) 79 | table_dir = "{}/{}/{}/".format("./tables", p.data_name, p.model_name) 80 | print('config_dir:', config_dir, 'table_dir:', table_dir) 81 | 82 | with open(config_dir) as f: 83 | conf = json.load(f) 84 | 85 | if p.data_name in ['yelp_SIGIR']: 86 | data_dir = "{}/{}/{}/".format("./data", p.data_name, p.fold_name) 87 | dataset = Dataset(data_dir=data_dir, top_keyphrases=p.top_items, rating_threshold=p.rating_threshold, 88 | top_users=p.top_users) 89 | elif p.data_name in ['reddit']: 90 | data_dir = "{}/{}/".format("./data", p.data_name) 91 | dataset = Reddit_Dataset(data_dir=data_dir, top_keyphrases=p.top_items, target=p.target) 92 | 93 | device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') 94 | 95 | project_name = p.data_name + '-' + 'main' 96 | experiment = Experiment(api_key='', project_name=project_name) 97 | experiment.log_parameters(conf) 98 | 99 | # training 100 | try: 101 | rec_score, uk_score, epoch, model = fit(experiment, p.model_name, p.data_name, p.target, p.lamb, p.std, 102 | dataset, log_dir, device, skip_eval=p.skip_eval, 103 | plot_graph=p.plot_graph, run_samples=p.run_samples) 104 | 105 | experiment.log_metric("best_epoch", epoch) 106 | experiment.log_metrics({k: v[0] for k, v in rec_score.items()}) 107 | if uk_score is not None: 108 | experiment.log_metrics({k: v[0] for k, v in 
uk_score.items()}) 109 | 110 | experiment.log_others({ 111 | "model_desc": p.model_name 112 | }) 113 | 114 | # save results table 115 | result_dict = conf_dict_generator[p.model_name](experiment) 116 | result_dict['best_epoch'] = epoch 117 | try: 118 | df = load_dataframe_csv(table_dir, p.conf.split('.')[0]+'.csv') 119 | except: 120 | df = pd.DataFrame(columns=result_dict.keys()) 121 | 122 | for name in rec_score.keys(): 123 | result_dict[name] = [round(rec_score[name][0], 4), round(rec_score[name][1], 4)] 124 | if uk_score is not None: 125 | for name in uk_score.keys(): 126 | result_dict[name] = [round(uk_score[name][0], 4), round(uk_score[name][1], 4)] 127 | 128 | df = df.append(result_dict, ignore_index=True) 129 | 130 | save_dataframe_csv(df, table_dir, p.conf.split('.')[0]) 131 | 132 | finally: 133 | experiment.end() 134 | -------------------------------------------------------------------------------- /models/VAEmultilayer.py: -------------------------------------------------------------------------------- 1 | """ 2 | Dawen Liang et al., Variational Autoencoders for Collaborative Filtering. WWW 2018. 3 | https://arxiv.org/pdf/1802.05814 4 | """ 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | import numpy as np 9 | from models.BaseModel import BaseModel 10 | from utils.Tools import activation_map, gaussian_nll 11 | 12 | class VAEmultilayer(BaseModel): 13 | def __init__(self, model_conf, num_users, num_items, device, 14 | observation_std=0.01): 15 | super(VAEmultilayer, self).__init__() 16 | 17 | self.hidden_dim = model_conf.hidden_dim 18 | self.num_users = num_users 19 | self.num_items = num_items 20 | self.act = model_conf.act 21 | self.weighted_recon = model_conf.weighted_recon 22 | self.weight_decay = model_conf.weight_decay 23 | self.sparse_normalization = model_conf.sparse_normalization 24 | self.dropout_ratio = model_conf.dropout_ratio 25 | self.observation_std = observation_std 26 | self.encoder = nn.ModuleList() 27 | self.encoder.append(nn.Linear(self.num_items, self.hidden_dim*4)) 28 | self.encoder.append(activation_map(self.act)) 29 | self.encoder.append(nn.Linear(self.hidden_dim*4, self.hidden_dim*2)) 30 | for layer in self.encoder: 31 | if 'weight' in dir(layer): 32 | torch.nn.init.xavier_uniform_(layer.weight) 33 | torch.nn.init.zeros_(layer.bias) 34 | 35 | self.decoder = nn.Linear(self.hidden_dim, self.num_items, bias=False) 36 | torch.nn.init.xavier_uniform_(self.decoder.weight) 37 | # torch.nn.init.zeros_(self.decoder.bias) 38 | 39 | self.total_anneal_steps = model_conf.total_anneal_steps 40 | self.anneal_cap = model_conf.anneal_cap 41 | 42 | self.anneal = 0. 43 | self.update_count = 0 44 | self.device = device 45 | self.to(self.device) 46 | 47 | def forward(self, rating_matrix): 48 | # encoder 49 | mu_q, logvar_q = self.get_mu_logvar(rating_matrix) 50 | std_q = self.logvar2std(logvar_q) 51 | eps = torch.randn_like(std_q) # reparametrization trick 52 | sampled_z = mu_q + self.training * eps * std_q # apply reparameterization if in training mode? 
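# Reparameterization trick, spelled out: eps ~ N(0, I) comes from torch.randn_like, and
# self.training acts as a 0/1 gate. In training mode sampled_z = mu_q + eps * std_q, which
# keeps the sample differentiable w.r.t. mu_q and std_q; in eval mode the noise term is
# zeroed out and sampled_z reduces to the deterministic mean mu_q.
# Illustrative arithmetic: with mu_q = 0.3, std_q = 0.2 and a drawn eps = -1.0, a training
# forward pass uses sampled_z = 0.3 + (-1.0) * 0.2 = 0.1, while eval would use 0.3.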
53 | 54 | output = self.decoder(sampled_z) # pass through the decoder 55 | 56 | if self.training: 57 | # 0.5 * sum(1 + log(sigma^2) - mu^2 - sigma^2) 58 | # not averaged yet 59 | # kl_loss = -0.5 * torch.sum(1 + logvar_q - mu_q.pow(2) - logvar_q.exp()) 60 | kl_loss = -0.5 * torch.mean(1 + logvar_q - mu_q.pow(2) - logvar_q.exp()) 61 | return output, kl_loss 62 | else: # evaluation mode 63 | return output 64 | 65 | def get_mu_logvar(self, rating_matrix): 66 | 67 | if self.training and self.dropout_ratio >0 : 68 | rating_matrix = F.dropout(rating_matrix, p=self.dropout_ratio) * (1 - self.dropout_ratio) 69 | 70 | if self.sparse_normalization: 71 | deno = torch.sum(rating_matrix>0, axis=1, keepdim=True) + 1e-5 72 | rating_matrix = rating_matrix / deno 73 | 74 | # un-embedded 75 | h = rating_matrix 76 | for layer in self.encoder: # pass through encoder layer 77 | h = layer(h) 78 | mu_q = h[:, :self.hidden_dim] 79 | logvar_q = h[:, self.hidden_dim:] # log sigmod^2 80 | return mu_q, logvar_q 81 | 82 | def logvar2std(self, logvar): 83 | return torch.exp(0.5 * logvar) # sigmod 84 | 85 | def train_one_epoch(self, train_matrix, optimizer, batch_size, verbose, **kwargs): 86 | self.train() 87 | 88 | num_training = train_matrix.shape[0] 89 | num_batches = int(np.ceil(num_training / batch_size)) 90 | 91 | perm = np.random.permutation(num_training) 92 | 93 | loss = 0.0 94 | for b in range(num_batches): 95 | optimizer.zero_grad() 96 | 97 | if (b + 1) * batch_size >= num_training: 98 | batch_idx = perm[b * batch_size:] 99 | else: 100 | batch_idx = perm[b * batch_size: (b + 1) * batch_size] 101 | batch_matrix = torch.FloatTensor(train_matrix[batch_idx].toarray()).to(self.device) 102 | 103 | # used for assignment of beta value 104 | if self.total_anneal_steps > 0: 105 | self.anneal = min(self.anneal_cap, 1. 
* self.update_count / self.total_anneal_steps) 106 | else: 107 | self.anneal = self.anneal_cap 108 | 109 | pred_matrix, kl_loss = self.forward(batch_matrix) 110 | 111 | '''Gaussian log-likelihood loss''' 112 | mask = batch_matrix != 0 113 | sigma = self.observation_std * torch.ones([], device=pred_matrix.device) 114 | # recon_loss = torch.sum(gaussian_nll(pred_matrix, sigma, batch_matrix) * mask) / torch.sum(mask) 115 | recon_loss = gaussian_nll(pred_matrix * mask, sigma, batch_matrix * mask) 116 | 117 | # for the unobserved entries 118 | mask0 = batch_matrix == 0 119 | sigma0 = self.observation_std * torch.ones([], device=pred_matrix.device) 120 | # recon_loss0 = torch.sum(gaussian_nll(pred_matrix, sigma0, batch_matrix) * mask0) / torch.sum(mask0) 121 | recon_loss0 = gaussian_nll(pred_matrix * mask0, sigma0, batch_matrix * mask0) 122 | 123 | # recon_loss = torch.sum(gaussian_nll(pred_matrix, sigma, batch_matrix) * mask) 124 | 125 | # l2 norm regularization, also regularizing the keyphrases' stdev embeddings 126 | l2_reg = torch.tensor(0., requires_grad=True) 127 | for layer in self.encoder: 128 | if 'weight' in dir(layer): 129 | l2_reg = l2_reg + torch.norm(layer.weight) 130 | 131 | l2_reg = l2_reg + torch.norm(self.decoder.weight) 132 | 133 | # vae loss with annealing 134 | batch_loss = recon_loss + self.weighted_recon * recon_loss0\ 135 | + kl_loss * self.anneal\ 136 | + self.weight_decay * l2_reg 137 | 138 | batch_loss.backward() 139 | optimizer.step() 140 | 141 | self.update_count += 1 142 | 143 | loss += batch_loss 144 | if verbose and b % 50 == 0: 145 | print('(%3d / %3d) loss = %.4f' % (b, num_batches, batch_loss)) 146 | return loss.detach().cpu() 147 | 148 | # make predictions for recommendation 149 | def predict(self, input_matrix): 150 | ''' 151 | Args: 152 | input_matrix: a input UI matrix 153 | Returns: 154 | pred_matrix: a predicted UI matrix 155 | ''' 156 | with torch.no_grad(): 157 | input_batch_matrix = torch.FloatTensor(input_matrix.toarray()).to(self.device) 158 | pred_batch_matrix = self.forward(input_batch_matrix).cpu().numpy() 159 | 160 | return pred_batch_matrix 161 | 162 | # def predict(self, input_matrix, test_matrix, test_batch_size): 163 | # total_preds = [] 164 | # total_ys = [] 165 | # with torch.no_grad(): 166 | # num_data = input_matrix.shape[0] 167 | # num_batches = int(np.ceil(num_data / test_batch_size)) 168 | # perm = list(range(num_data)) 169 | # for b in range(num_batches): 170 | # if (b + 1) * test_batch_size >= num_data: 171 | # batch_idx = perm[b * test_batch_size:] 172 | # else: 173 | # batch_idx = perm[b * test_batch_size: (b + 1) * test_batch_size] 174 | # 175 | # input_batch_matrix = torch.FloatTensor(input_matrix[batch_idx].toarray()).to(self.device) 176 | # test_batch_matrix = torch.FloatTensor(test_matrix[batch_idx].toarray()) 177 | # 178 | # pred_batch_matrix = self.forward(input_batch_matrix).cpu().numpy() 179 | # 180 | # preds = pred_batch_matrix[test_batch_matrix != 0] 181 | # ys = test_batch_matrix[test_batch_matrix != 0] 182 | # if len(ys) > 0: 183 | # total_preds.append(preds) 184 | # total_ys.append(ys) 185 | # 186 | # total_preds = np.concatenate(total_preds) 187 | # total_ys = np.concatenate(total_ys) 188 | # 189 | # return total_preds, total_ys -------------------------------------------------------------------------------- /utils/HPShelper.py: -------------------------------------------------------------------------------- 1 | def get_POP_conf_dict(experiment): 2 | d = { 3 | # 4 | 'num_epochs': 
experiment.get_parameter("num_epochs"), 5 | 'early_stop': experiment.get_parameter("early_stop"), 6 | 'learning_rate': experiment.get_parameter("learning_rate"), 7 | 'batch_size': experiment.get_parameter("batch_size"), 8 | 'test_batch_size': experiment.get_parameter("test_batch_size"), 9 | 'patience': experiment.get_parameter("patience"), 10 | } 11 | return d 12 | 13 | def get_AE_conf_dict(experiment): 14 | d = { 15 | # 16 | 'act': experiment.get_parameter("act"), 17 | 'hidden_dim': experiment.get_parameter("hidden_dim"), 18 | 'sparse_normalization': experiment.get_parameter("sparse_normalization"), 19 | # 20 | 'learning_rate': experiment.get_parameter("learning_rate"), 21 | 'weight_decay' : experiment.get_parameter("weight_decay"), 22 | 'batch_size': experiment.get_parameter("batch_size"), 23 | 'test_batch_size': experiment.get_parameter("test_batch_size"), 24 | 'early_stop': experiment.get_parameter("early_stop"), 25 | 'num_epochs': experiment.get_parameter("num_epochs"), 26 | 'patience': experiment.get_parameter("patience") 27 | } 28 | return d 29 | 30 | def get_model_name(experiment): 31 | return experiment.get_parameter("model_name") 32 | 33 | def get_VAE_conf_dict(experiment): 34 | d = get_AE_conf_dict(experiment) 35 | vae = { 36 | 'anneal_cap': experiment.get_parameter("anneal_cap"), 37 | 'total_anneal_steps': experiment.get_parameter("total_anneal_steps"), 38 | } 39 | d.update(vae) 40 | return d 41 | 42 | def get_VAEsigma_conf_dict(experiment): 43 | d = get_AE_conf_dict(experiment) 44 | vae = { 45 | 'decoder_bias': experiment.get_parameter("decoder_bias"), 46 | 'training_type': experiment.get_parameter("training_type"), 47 | 'global_variance': experiment.get_parameter("global_variance") 48 | } 49 | d.update(vae) 50 | return d 51 | 52 | def get_VAEmultilayer_conf_dict(experiment): 53 | d = { 54 | # 55 | 'act': experiment.get_parameter("act"), 56 | 'hidden_dim': experiment.get_parameter("hidden_dim"), 57 | 'sparse_normalization': experiment.get_parameter("sparse_normalization"), 58 | 'dropout_ratio': experiment.get_parameter("dropout_ratio"), 59 | # 60 | 'learning_rate': experiment.get_parameter("learning_rate"), 61 | 'weight_decay' : experiment.get_parameter("weight_decay"), 62 | 'batch_size': experiment.get_parameter("batch_size"), 63 | 'test_batch_size': experiment.get_parameter("test_batch_size"), 64 | 'early_stop': experiment.get_parameter("early_stop"), 65 | 'num_epochs': experiment.get_parameter("num_epochs"), 66 | 'patience': experiment.get_parameter("patience"), 67 | 'anneal_cap': experiment.get_parameter("anneal_cap"), 68 | 'total_anneal_steps': experiment.get_parameter("total_anneal_steps"), 69 | 'weighted_recon': experiment.get_parameter("weighted_recon") 70 | } 71 | return d 72 | 73 | def get_VAEsinglelayer_conf_dict(experiment): 74 | d = { 75 | 'act': experiment.get_parameter("act"), 76 | # 77 | 'hidden_dim': experiment.get_parameter("hidden_dim"), 78 | 'sparse_normalization': experiment.get_parameter("sparse_normalization"), 79 | 'dropout_ratio': experiment.get_parameter("dropout_ratio"), 80 | # 81 | 'learning_rate': experiment.get_parameter("learning_rate"), 82 | 'weight_decay' : experiment.get_parameter("weight_decay"), 83 | 'batch_size': experiment.get_parameter("batch_size"), 84 | 'test_batch_size': experiment.get_parameter("test_batch_size"), 85 | 'early_stop': experiment.get_parameter("early_stop"), 86 | 'num_epochs': experiment.get_parameter("num_epochs"), 87 | 'patience': experiment.get_parameter("patience"), 88 | 'anneal_cap': 
experiment.get_parameter("anneal_cap"), 89 | 'total_anneal_steps': experiment.get_parameter("total_anneal_steps"), 90 | } 91 | return d 92 | 93 | def get_QVAE_conf_dict(experiment): 94 | d = { 95 | 'act': experiment.get_parameter("act"), 96 | # 97 | 'hidden_dim': experiment.get_parameter("hidden_dim"), 98 | 'sparse_normalization': experiment.get_parameter("sparse_normalization"), 99 | 'dropout_ratio': experiment.get_parameter("dropout_ratio"), # not being tuned for now 100 | # 101 | 'learning_rate': experiment.get_parameter("learning_rate"), 102 | 'weight_decay': experiment.get_parameter("weight_decay"), 103 | 'max_log_var': experiment.get_parameter("max_log_var"), 104 | 'batch_size': experiment.get_parameter("batch_size"), 105 | 'test_batch_size': experiment.get_parameter("test_batch_size"), 106 | 'early_stop': experiment.get_parameter("early_stop"), 107 | 'num_epochs': experiment.get_parameter("num_epochs"), 108 | 'patience': experiment.get_parameter("patience"), 109 | 'anneal_cap': experiment.get_parameter("anneal_cap"), 110 | 'total_anneal_steps': experiment.get_parameter("total_anneal_steps"), 111 | } 112 | return d 113 | 114 | def get_VAEsigmamultilayer_conf_dict(experiment): 115 | d = { 116 | # 117 | 'hidden_dim': experiment.get_parameter("hidden_dim"), 118 | 'sparse_normalization': experiment.get_parameter("sparse_normalization"), 119 | 'dropout_ratio': experiment.get_parameter("dropout_ratio"), 120 | # 121 | 'learning_rate': experiment.get_parameter("learning_rate"), 122 | 'weight_decay' : experiment.get_parameter("weight_decay"), 123 | 'batch_size': experiment.get_parameter("batch_size"), 124 | 'test_batch_size': experiment.get_parameter("test_batch_size"), 125 | 'early_stop': experiment.get_parameter("early_stop"), 126 | 'num_epochs': experiment.get_parameter("num_epochs"), 127 | 'patience': experiment.get_parameter("patience"), 128 | 'decoder_bias': experiment.get_parameter("decoder_bias"), 129 | 'training_type': experiment.get_parameter("training_type"), 130 | 'global_variance': experiment.get_parameter("global_variance") 131 | } 132 | return d 133 | 134 | def get_VAEcontrast_conf_dict(experiment): 135 | d = { 136 | 'act': experiment.get_parameter("act"), 137 | # 138 | 'hidden_dim': experiment.get_parameter("hidden_dim"), 139 | 'sparse_normalization': experiment.get_parameter("sparse_normalization"), 140 | 'dropout_ratio': experiment.get_parameter("dropout_ratio"), 141 | 'pos_uk_num': experiment.get_parameter("pos_uk_num"), 142 | 'neg_uk_num': experiment.get_parameter("neg_uk_num"), 143 | 'pos_kk_num': experiment.get_parameter("pos_kk_num"), 144 | 'neg_kk_num': experiment.get_parameter("neg_kk_num"), 145 | 'kernel_method': experiment.get_parameter("kernel_method"), 146 | 'temperature_tau_u': experiment.get_parameter("temperature_tau_u"), 147 | 'temperature_tau_k': experiment.get_parameter("temperature_tau_k"), 148 | # 149 | 'learning_rate': experiment.get_parameter("learning_rate"), 150 | 'weight_decay' : experiment.get_parameter("weight_decay"), 151 | 'batch_size': experiment.get_parameter("batch_size"), 152 | 'test_batch_size': experiment.get_parameter("test_batch_size"), 153 | 'early_stop': experiment.get_parameter("early_stop"), 154 | 'num_epochs': experiment.get_parameter("num_epochs"), 155 | 'patience': experiment.get_parameter("patience"), 156 | 'anneal_cap': experiment.get_parameter("anneal_cap"), 157 | 'total_anneal_steps': experiment.get_parameter("total_anneal_steps"), 158 | 'use_default_hp': experiment.get_parameter("use_default_hp"), 159 | 'hp_contrastive_u': 
experiment.get_parameter("hp_contrastive_u"), 160 | 'weighted_recon': experiment.get_parameter("weighted_recon") 161 | # 'hp_contrastive_k': experiment.get_parameter("hp_contrastive_k") 162 | # 'max_var': experiment.get_parameter("max_var") 163 | } 164 | return d 165 | 166 | conf_dict_generator = { 167 | 'POP': get_POP_conf_dict, 168 | 'AE': get_AE_conf_dict, 169 | 'VAE': get_VAE_conf_dict, 170 | 'VAEsigma': get_VAEsigma_conf_dict, 171 | 'VAEcontrast': get_VAEcontrast_conf_dict, 172 | 'VAEcontrast_multilayer': get_VAEcontrast_conf_dict, 173 | 'VAEmultilayer_contrast': get_VAEcontrast_conf_dict, 174 | 'VAEcontrast_multilayer_wcontext': get_VAEcontrast_conf_dict, 175 | 'VAEsinglelayer': get_VAEsinglelayer_conf_dict, 176 | 'VAEmultilayer': get_VAEmultilayer_conf_dict, 177 | 'VAEsigmamultilayer': get_VAEsigmamultilayer_conf_dict, 178 | 'QVAE': get_QVAE_conf_dict, 179 | 'QVAE_multi': get_QVAE_conf_dict, # save hps with single layer's settings 180 | } -------------------------------------------------------------------------------- /utils/io.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import json 3 | import pickle 4 | import pandas as pd 5 | import time 6 | import scipy.sparse as sparse 7 | from scipy.sparse import csr_matrix, save_npz, load_npz 8 | import numpy as np 9 | import os 10 | from os import listdir 11 | from os.path import isfile, join 12 | 13 | def save_dataframe_csv(df, path, name): 14 | if not os.path.exists(path): 15 | os.makedirs(path) 16 | 17 | csv_filename = "{:s}.csv".format(name) 18 | df.to_csv(path + csv_filename, index=False) 19 | print('Dataframe Saved Successfully: ', path + csv_filename) 20 | 21 | 22 | def load_dataframe_csv(path, name): 23 | return pd.read_csv(path + name) 24 | 25 | 26 | def save_numpy_csr(matrix, path, model): 27 | save_npz('{}{}'.format(path, model), matrix) 28 | 29 | 30 | def load_numpy_csr(path, name): 31 | return load_npz(path + name).tocsr() 32 | 33 | 34 | def save_numpy(matrix, path, model): 35 | np.save('{}{}'.format(path, model), matrix) 36 | 37 | 38 | # def load_numpy(path, name): 39 | # return np.load(path + name) 40 | 41 | def load_numpy(path, name): 42 | return load_npz(path + name).tocsr() 43 | 44 | 45 | def saveDictToJson(dictionary, path, fileName): # , trainOrTest='train' 46 | json_fileName = "{:s}.json".format(fileName) 47 | # if (trainOrTest == 'train'): 48 | # json.dump(dictionary, open(path + json_fileName, 'w')) 49 | # else: 50 | json.dump(dictionary, open(path + json_fileName, 'w')) 51 | 52 | def loadTextJson(path, fileName): 53 | file = path + fileName + '.txt' 54 | with open(file, "r") as fp: 55 | b = json.load(fp) 56 | return b 57 | 58 | def loadDict(file_dir): # trainOrTest='train' 59 | # json_fileName = "{:s}.json".format(fileName) 60 | # Read data from file: 61 | # if (trainOrTest == 'train'): 62 | # dataDict = json.load(open(path + fileName)) 63 | # else: 64 | dataDict = json.load(open(file_dir)) 65 | return dataDict 66 | 67 | 68 | def get_yelp_df(filename='Export_CleanedReview.json', sampling=True, top_user_num=7000, 69 | top_item_num=5000): 70 | """ 71 | Get the pandas dataframe 72 | Sampling only the top users/items by density 73 | Implicit representation applies 74 | """ 75 | with open(filename, 'r') as f: 76 | data = f.readlines() 77 | data = list(map(json.loads, data)) 78 | 79 | data = data[0] 80 | # Get all the data from the dggeata file 81 | df = pd.DataFrame(data) 82 | 83 | df.rename(columns={'stars': 'review_stars', 'text': 'review_text', 'cool': 
'review_cool', 84 | 'funny': 'review_funny', 'useful': 'review_useful'}, 85 | inplace=True) 86 | 87 | df['business_num_id'] = df.business_id.astype('category'). \ 88 | cat.rename_categories(range(0, df.business_id.nunique())) 89 | df['business_num_id'] = df['business_num_id'].astype('int') 90 | 91 | df['user_num_id'] = df.user_id.astype('category'). \ 92 | cat.rename_categories(range(0, df.user_id.nunique())) 93 | df['user_num_id'] = df['user_num_id'].astype('int') 94 | 95 | df['timestamp'] = df['date'].apply(date_to_timestamp) 96 | 97 | if sampling: 98 | df = filter_yelp_df(df, top_user_num=top_user_num, top_item_num=top_item_num) 99 | # Refresh num id 100 | df['business_num_id'] = df.business_id.astype('category'). \ 101 | cat.rename_categories(range(0, df.business_id.nunique())) 102 | df['business_num_id'] = df['business_num_id'].astype('int') 103 | 104 | df['user_num_id'] = df.user_id.astype('category'). \ 105 | cat.rename_categories(range(0, df.user_id.nunique())) 106 | df['user_num_id'] = df['user_num_id'].astype('int') 107 | 108 | df = df.reset_index(drop=True) 109 | 110 | return df 111 | 112 | # implemented code to depopularize items 113 | def filter_yelp_df(df, top_user_num=7000, top_item_num=5000): 114 | # total_items = len(df.business_num_id.unique()) 115 | # Getting the reviews where starts are above 3 116 | df_implicit = df[(df['review_stars'] > 3) & (df['ghost'] == False) & (df['user_id'] != 'CxDOIDnH8gp9KXzpBHJYXw')] 117 | frequent_user_id = df_implicit['user_num_id'].value_counts().head(top_user_num).index.values 118 | frequent_item_id = df_implicit['business_num_id'].value_counts().head(top_item_num).index.values 119 | # frequent_item_id = np.random.choice(total_items, top_item_num, replace=False) 120 | return df.loc[(df['user_num_id'].isin(frequent_user_id)) & (df['business_num_id'].isin(frequent_item_id))] 121 | 122 | 123 | def date_to_timestamp(date): 124 | dt = datetime.datetime.strptime(date, '%Y-%m-%d') 125 | return time.mktime(dt.timetuple()) 126 | 127 | 128 | def df_to_sparse(df, row_name='userId', col_name='movieId', value_name='rating', 129 | shape=None): 130 | rows = df[row_name] 131 | cols = df[col_name] 132 | if value_name is not None: 133 | values = df[value_name] 134 | else: 135 | values = [1] * len(rows) 136 | 137 | return csr_matrix((values, (rows, cols)), shape=shape) 138 | 139 | def get_file_names(folder_path, extension='.yml'): 140 | return [f for f in listdir(folder_path) if isfile(join(folder_path, f)) and f.endswith(extension)] 141 | 142 | def write_file(folder_path, file_name, content, exe=False): 143 | full_path = folder_path+'/'+file_name 144 | with open(full_path, 'w') as the_file: 145 | the_file.write(content) 146 | 147 | if exe: 148 | st = os.stat(full_path) 149 | os.chmod(full_path, st.st_mode | stat.S_IEXEC) 150 | 151 | """ 152 | 153 | Matrix Generation 154 | 155 | """ 156 | def get_rating_timestamp_matrix(df): 157 | rating_matrix = df_to_sparse(df, row_name='user_num_id', 158 | col_name='business_num_id', 159 | value_name='review_stars', 160 | shape=None) 161 | 162 | timestamp_matrix = df_to_sparse(df, row_name='user_num_id', 163 | col_name='business_num_id', 164 | value_name='timestamp', 165 | shape=None) 166 | 167 | return rating_matrix, timestamp_matrix 168 | 169 | 170 | def get_IC_matrix(df): 171 | lst = df.categories.values.tolist() 172 | cat = [] 173 | for i in range(len(lst)): 174 | if lst[i] is None: 175 | print(i) 176 | cat.extend(lst[i].split(', ')) 177 | 178 | unique_cat = set(cat) 179 | # set categories id 180 | df_cat = 
pd.DataFrame(list(unique_cat), columns=["Categories"]) 181 | df_cat['cat_id'] = df_cat.Categories.astype('category').cat.rename_categories(range(0, df_cat.Categories.nunique())) 182 | dict_cat = df_cat.set_index('Categories')['cat_id'].to_dict() 183 | 184 | df_I_C = pd.DataFrame(columns=['business_num_id', 'cat_id']) 185 | 186 | for i in range((df['business_num_id'].unique().shape)[0]): 187 | df_temp = df[df['business_num_id'] == i].iloc[:1] 188 | temp_lst = df_temp['categories'].to_list()[0].split(",") 189 | for j in range(len(temp_lst)): 190 | df_I_C = df_I_C.append({'business_num_id': i, 'cat_id': dict_cat[temp_lst[j].strip()]}, ignore_index=True) 191 | 192 | IC_Matrix = df_to_sparse(df_I_C, row_name='business_num_id', 193 | col_name='cat_id', 194 | value_name=None, 195 | shape=None) 196 | return IC_Matrix, dict_cat 197 | 198 | 199 | def getImplicitMatrix(sparseMatrix, threashold=0): 200 | temp_matrix = sparse.csr_matrix(sparseMatrix.shape) 201 | temp_matrix[(sparseMatrix > threashold).nonzero()] = 1 202 | return temp_matrix 203 | 204 | 205 | def get_UC_Matrix(IC_Matrix, rtrain_implicit): 206 | U_C_matrix_explicit = rtrain_implicit * IC_Matrix 207 | U_C_matrix_implicit = getImplicitMatrix(U_C_matrix_explicit, 3) 208 | return U_C_matrix_explicit, U_C_matrix_implicit 209 | 210 | 211 | def get_csr_matrix(df, rowname, colname, value=None, shape=None): 212 | row = df[rowname] 213 | col = df[colname] 214 | if value == None: 215 | value = [1] * len(row) 216 | return csr_matrix((value, (row, col)), shape=shape) 217 | 218 | 219 | # get original dataframe, returns idx2item, keyphrase2idx, idx2keyphrase 220 | def get_idx_mapping(df): 221 | idx_2_itemName = dict(zip(df.business_num_id, df.name)) 222 | idx_2_itemId = dict(zip(df.business_num_id, df.business_id)) 223 | itemidx_2_category = dict(zip(df.business_num_id, df.categories)) 224 | return idx_2_itemName, idx_2_itemId, itemidx_2_category 225 | 226 | 227 | def pickle_dump(file, path, file_name): 228 | file_name = path + file_name + '.pickle' 229 | with open(file_name, 'wb') as handle: 230 | pickle.dump(file, handle, protocol=pickle.HIGHEST_PROTOCOL) 231 | 232 | 233 | def pickle_load(path, file_name): 234 | file_name = path + file_name + '.pickle' 235 | with open(file_name, 'rb') as handle: 236 | return pickle.load(handle) 237 | 238 | 239 | # Create folders under data dir 240 | # For the usage of storing use case data 241 | def get_data_dir(data_dir): 242 | if not os.path.exists(data_dir): 243 | os.makedirs(data_dir) 244 | 245 | data_dirs = os.listdir(data_dir) 246 | if len(data_dirs) == 0: 247 | idx = 0 248 | else: 249 | idx_lis = [] 250 | for d in data_dirs: 251 | try: 252 | current_idx = int(d.split('_')[0]) 253 | idx_lis.append(current_idx) 254 | except: 255 | continue 256 | 257 | idx_list = sorted(idx_lis) 258 | idx = idx_list[-1] + 1 259 | 260 | cur_data_dir = '%d_%s' % (idx, time.strftime('%Y%m%d-%H%M')) 261 | full_data_dir = os.path.join(data_dir, cur_data_dir) 262 | if not os.path.exists(full_data_dir): 263 | os.mkdir(full_data_dir) 264 | 265 | return full_data_dir 266 | 267 | """ 268 | NOT USED 269 | """ 270 | def get_test_df(ratings_tr, tags_tr, tags_val): 271 | ''' 272 | Remove user/item/tag which only exist in validation set(remove cold-start case) 273 | ''' 274 | 275 | valid_user = ratings_tr['userId'].unique() 276 | valid_item = ratings_tr['itemId'].unique() 277 | valid_tag = tags_tr['tagId'].unique() 278 | 279 | tags_val = tags_val.loc[tags_val['userId'].isin(valid_user) & 280 | tags_val['itemId'].isin(valid_item) & 281 | 
tags_val['tagId'].isin(valid_tag)] 282 | 283 | return tags_val.groupby(['userId', 'itemId'])['tagId'].apply(list).reset_index(name='tagIds') 284 | -------------------------------------------------------------------------------- /utils/Dataset.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import numpy as np 4 | from utils.io import load_numpy_csr 5 | from pathlib import Path 6 | import pickle 7 | import pandas as pd 8 | import scipy.sparse as sp 9 | from tqdm import tqdm 10 | 11 | class Reddit_Dataset: 12 | def __init__(self, data_dir, top_keyphrases, target, min_ppmi=0): 13 | if os.path.isfile(data_dir + 'uk_train.npz'): 14 | print('Reading data from %s' % data_dir) 15 | self.test_matrix = sp.load_npz(data_dir + 'test_matrix.npz') 16 | self.uk_train = sp.load_npz(data_dir + 'uk_train.npz') 17 | a_file = open(data_dir+"item_idtoi.pkl", "rb") 18 | self.item_idtoi = pickle.load(a_file) 19 | a_file.close() 20 | a_file = open(data_dir+"user_idtoi.pkl", "rb") 21 | self.user_idtoi = pickle.load(a_file) 22 | a_file.close() 23 | else: 24 | print('Generating data from %s' % data_dir) 25 | user_index, item_index = 0, 0 26 | self.user_idtoi, self.item_idtoi = {}, {} 27 | ppmi, train_users, train_items = [], [], [] 28 | df = pd.read_csv(data_dir+'user_community_matrix_train.csv') 29 | for index, row in tqdm(df.iterrows(), total=df.shape[0]): 30 | if row['ppmi'] > min_ppmi: 31 | ppmi.append(row['ppmi']) 32 | if row['author'] in self.user_idtoi: 33 | train_users.append(self.user_idtoi[row['author']]) 34 | else: 35 | self.user_idtoi[row['author']] = user_index 36 | train_users.append(user_index) 37 | user_index += 1 38 | if row['subreddit'] in self.item_idtoi: 39 | train_items.append(self.item_idtoi[row['subreddit']]) 40 | else: 41 | self.item_idtoi[row['subreddit']] = item_index 42 | train_items.append(item_index) 43 | item_index += 1 44 | self.uk_train = sp.csr_matrix((ppmi, (train_users, train_items)), shape=(user_index, item_index)) 45 | sp.save_npz(data_dir + 'uk_train.npz', self.uk_train) 46 | 47 | test_users, test_items = [], [] 48 | df = pd.read_csv(data_dir+'user_community_matrix_test.csv') 49 | for index, row in tqdm(df.iterrows(), total=df.shape[0]): 50 | if not row['appears_in_train'] and row['author'] in self.user_idtoi and row['subreddit'] in self.item_idtoi: 51 | test_users.append(self.user_idtoi[row['author']]) 52 | test_items.append(self.item_idtoi[row['subreddit']]) 53 | self.test_matrix = sp.csr_matrix((np.ones(len(test_users)), (test_users, test_items)), shape=(user_index, item_index)) 54 | sp.save_npz(data_dir + 'test_matrix.npz', self.test_matrix) 55 | 56 | a_file = open(data_dir+"item_idtoi.pkl", "wb") 57 | pickle.dump(self.item_idtoi, a_file) 58 | a_file.close() 59 | 60 | a_file = open(data_dir+"user_idtoi.pkl", "wb") 61 | pickle.dump(self.user_idtoi, a_file) 62 | a_file.close() 63 | 64 | # select top users 65 | top_users = 5000 66 | if target == 'rep_dem': 67 | tag1 = [self.item_idtoi['democrats'], self.item_idtoi['DemocratsforDiversity'], \ 68 | self.item_idtoi['DemocraticSocialism'], self.item_idtoi['Forum_Democratie'], \ 69 | self.item_idtoi['Impeach_Trump'], self.item_idtoi['neoliberal'], self.item_idtoi['AskALiberal'], \ 70 | self.item_idtoi['Liberal'], self.item_idtoi['Classical_Liberals'], self.item_idtoi['JoeBiden']] 71 | tag2 = [self.item_idtoi['Republican'], self.item_idtoi['Conservative'], self.item_idtoi['ConservativesOnly'], \ 72 | self.item_idtoi['askaconservative'], 
self.item_idtoi['conservatives'], \ 73 | self.item_idtoi['republicans'], self.item_idtoi['Trumpgret'], \ 74 | self.item_idtoi['IronFrontUSA'], self.item_idtoi['AskThe_Donald'], self.item_idtoi['AskTrumpSupporters'], \ 75 | self.item_idtoi['TheBidenshitshow']] 76 | elif target == 'men_women': 77 | tag1 = [self.item_idtoi['women'], self.item_idtoi['WomenWhoDontSell'], self.item_idtoi['AskWomen'], \ 78 | self.item_idtoi['AskWomenOver30'], self.item_idtoi['askwomenadvice'], self.item_idtoi['WomensHealth']] 79 | tag2 = [self.item_idtoi['Divorce_Men'], self.item_idtoi['AskMen'], self.item_idtoi['MensRights'], \ 80 | self.item_idtoi['AskMenAdvice'], self.item_idtoi['AskMenOver30']] 81 | tag1_influ = np.argpartition(np.array(np.sum(self.uk_train[:, tag1], axis=1)).flatten(), -top_users)[-top_users:] 82 | tag2_influ = np.argpartition(np.array(np.sum(self.uk_train[:, tag2], axis=1)).flatten(), -top_users)[-top_users:] 83 | top_users = np.unique(np.concatenate((tag1_influ, tag2_influ))) 84 | self.uk_train = self.uk_train[top_users] 85 | self.test_matrix = self.test_matrix[top_users] 86 | 87 | self.train_matrix = self.uk_train.copy() 88 | self.train_matrix[self.train_matrix > min_ppmi] = 1 89 | self.num_users, self.num_items = self.train_matrix.shape 90 | 91 | print(self.num_users, self.num_items) 92 | print(np.sum(self.train_matrix)) 93 | print(np.sum(self.train_matrix)/self.num_users/self.num_items) 94 | 95 | def eval_data(self): 96 | return self.train_matrix, self.test_matrix 97 | 98 | def all_uk(self): 99 | return self.uk_train 100 | 101 | 102 | class Dataset: 103 | def __init__(self, data_dir, top_keyphrases, rating_threshold, top_users=None): 104 | print('Read data from %s' % data_dir) 105 | self.data_dir = data_dir 106 | self.data_name = self.data_dir.split('/')[-2] 107 | glob_data_dir = self.data_dir.split('fold')[0] 108 | self.train_matrix, self.raw_test_matrix, self.test_matrix, self.uk_train, self.uk_test,\ 109 | self.train_item_keyphrase_matrix, self.ik_label_matrix\ 110 | = self.load_data(data_dir, 111 | top_keyphrases, 112 | rating_threshold, 113 | top_users) 114 | 115 | self.num_users, self.num_items = self.train_matrix.shape 116 | print(self.num_users, self.num_items) 117 | print(np.sum(self.train_matrix)) 118 | print(np.sum(self.train_matrix)/self.num_users/self.num_items) 119 | self.num_keyphrases = self.uk_train.shape[1] 120 | 121 | # log user's rating frequency (from UI) 122 | binary_ui_train = np.zeros(self.train_matrix.shape) 123 | binary_ui_train = sp.lil_matrix(binary_ui_train) 124 | binary_ui_train[self.train_matrix>0] = 1 125 | 126 | binary_uk_train = np.zeros(self.uk_train.shape) 127 | binary_uk_train = sp.lil_matrix(binary_uk_train) 128 | binary_uk_train[self.uk_train > 0] = 1 129 | 130 | self.log_freq_array = np.array(np.log1p(binary_ui_train.sum(axis=1))) # [num_users * 1] 131 | # log user's keyphrase frequency (from UK) 132 | self.log_freq_array_keyphrase = np.array(np.log1p(binary_uk_train.sum(axis=1))) 133 | 134 | # get keyphrases' user and keyphrase log frequency information 135 | binary_ik_train = np.zeros(self.ik_label_matrix.shape) 136 | binary_ik_train = sp.lil_matrix(binary_ik_train) 137 | binary_ik_train[self.ik_label_matrix > 0] = 1 138 | 139 | self.log_freq_array_ku = np.array(np.log1p((binary_uk_train.T).sum(axis=1))) 140 | self.log_freq_array_ki = np.array(np.log1p((binary_ik_train.T).sum(axis=1))) 141 | 142 | # get distributed keyphrase probabilities - for negative sampling 143 | # self.word_prob = self.get_word_prob(power=0.75) 144 | # self.kk_ppmi = 
self.get_kk_ppmi() 145 | 146 | self.idx_2_keyphrase, self.keyphrase_2_idx = self.load_idx_keyphrase_dic(glob_data_dir) 147 | 148 | # def load_data(self, data_path, freq_threshold, confidence_threshold): 149 | def load_data(self, data_path, top_keyphrases, rating_threshold, top_users): 150 | # load npz files, we're binarizing the data for this task 151 | with open(Path(data_path) / 'tr_data.pkl', 'rb') as f: 152 | train_matrix = pickle.load(f) 153 | train_matrix[train_matrix > 0] = 1 154 | 155 | with open(Path(data_path) / 'te_data.pkl', 'rb') as f: 156 | test_matrix = pickle.load(f) 157 | raw_test_matrix = test_matrix.copy() 158 | test_matrix[test_matrix > 0] = 1 159 | 160 | # shrinking down the data size 161 | if top_users: 162 | train_matrix = train_matrix[:top_users,:] 163 | raw_test_matrix = raw_test_matrix[:top_users,:] 164 | test_matrix = test_matrix[:top_users,:] 165 | 166 | df_tags = pd.read_csv(str(Path(data_path) / 'tr_tags.csv')) 167 | rows, cols, values = df_tags.item, df_tags.tag, np.ones(len(df_tags)) 168 | ik_matrix = sp.csr_matrix((values, (rows, cols)), dtype='float64', shape=(train_matrix.shape[1], 169 | len(df_tags.tag.unique()))) 170 | IK_binary = label_IK(ik_matrix, top_keyphrases=top_keyphrases) 171 | uk_train = label_UK(train_matrix, IK_binary, rating_threshold=1) 172 | uk_test = label_UK(test_matrix, IK_binary, rating_threshold=1) 173 | 174 | # train_matrix = load_numpy_csr(data_path, 'R_train.npz') 175 | # test_matrix = load_numpy_csr(data_path, 'R_test.npz') 176 | # UK matrix 177 | # uk_train = load_numpy_csr(data_path, 'UK_train.npz') 178 | # uk_test = load_numpy_csr(data_path, 'UK_test.npz') 179 | # IK matrix 180 | # ik_matrix = load_numpy_csr(data_path, 'IK_train.npz') 181 | 182 | return train_matrix, raw_test_matrix, test_matrix, uk_train, uk_test, ik_matrix, IK_binary 183 | 184 | def load_idx_keyphrase_dic(self, data_path): 185 | keyphrase_2_idx = json.load(open(data_path + "tag_id_dict.json")) 186 | idx_2_keyphrase = {int(v): k for k, v in keyphrase_2_idx.copy().items()} 187 | return idx_2_keyphrase, keyphrase_2_idx 188 | 189 | def eval_data(self): 190 | return self.train_matrix, self.test_matrix 191 | 192 | def eval_data_uk(self): 193 | return self.uk_train, self.uk_test 194 | 195 | def all_data(self): 196 | return self.train_matrix 197 | 198 | def all_uk(self): 199 | return self.uk_train 200 | 201 | def __str__(self): 202 | # return string representation of 'Dataset' class 203 | # print(Dataset) or str(Dataset) 204 | ret = '======== [Dataset] ========\n' 205 | # ret += 'Train file: %s\n' % self.train_file 206 | # ret += 'Test file : %s\n' % self.test_file 207 | ret += 'Number of users : %d\n' % self.num_users 208 | ret += 'Number of items : %d\n' % self.num_items 209 | ret += 'Number of Keyphrases : %d\n' % self.num_keyphrases 210 | ret += 'None-zero training entries: %d\n' % self.train_matrix.nnz 211 | ret += 'None-zero testing entries: %d\n' % self.test_matrix.nnz 212 | ret += '\n' 213 | return ret 214 | 215 | def label_IK(ik_matrix, top_keyphrases): 216 | IK_binary = ik_matrix.toarray() 217 | num_items, num_keyphrases = IK_binary.shape 218 | 219 | # generate top 10 labels for each item according to frequency number 220 | for item in range(num_items): 221 | item_keyphrase = IK_binary[item] 222 | nonzero_keyphrases_index = item_keyphrase.nonzero()[0] 223 | nonzero_keyphrases_frequency = item_keyphrase[nonzero_keyphrases_index] 224 | 225 | # sort to get the top candidate keyphrases to label each item 226 | candidate_index = 
nonzero_keyphrases_index[np.argsort(-nonzero_keyphrases_frequency)[:top_keyphrases]] 227 | binarized_keyphrase = np.zeros(num_keyphrases) 228 | binarized_keyphrase[candidate_index] = 1 229 | IK_binary[item] = binarized_keyphrase 230 | 231 | return sp.csr_matrix(IK_binary) 232 | 233 | def label_UK(ui_matrix, ik_matrix, rating_threshold): 234 | # ui_matrix: original user-item rating matrix, explicit 235 | # ik_matrix: labeled-item matrix 236 | # rating_threshold: binarizing user-item matrix 237 | 238 | # get binarized rating data, treat as topical preference 239 | ui_matrix_binary = np.zeros(ui_matrix.shape) 240 | ui_matrix_binary = sp.lil_matrix(ui_matrix_binary) 241 | 242 | # rating 3 threshold 243 | ui_matrix_binary[ui_matrix >= rating_threshold] = 1 244 | ui_matrix_binary = sp.csr_matrix(ui_matrix_binary) 245 | ui_matrix_binary.eliminate_zeros() 246 | 247 | UK_matrix = ui_matrix_binary @ ik_matrix 248 | UK_matrix.eliminate_zeros() 249 | assert UK_matrix.shape[0] == ui_matrix.shape[0] 250 | assert UK_matrix.shape[1] == ik_matrix.shape[1] 251 | 252 | # return the generated UK matrix, not necessarily binary 253 | return UK_matrix 254 | -------------------------------------------------------------------------------- /utils/Tools.py: -------------------------------------------------------------------------------- 1 | import math 2 | # import torch.nn.functional as F 3 | from random import sample 4 | from sklearn.metrics.pairwise import cosine_similarity 5 | import numpy as np 6 | import torch 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | 10 | ''' 11 | ELK code referring to: https://github.com/schelotto/Gaussian_Word_Embedding/blob/master/main.py 12 | ''' 13 | 14 | # pass in embeddings of one target keyphrase with the embeddings of all the keyphrases and compute the similarity score 15 | # with all the keyphrases in the corpus 16 | def compute_cosSimilarity(target_vector, all_vectors): 17 | return cosine_similarity(target_vector, all_vectors).flatten() 18 | 19 | 20 | def gaussian_nll(mu, sigma, x): 21 | # return 0.5 * torch.pow((x - mu) / log_sigma.exp(), 2) + log_sigma + 0.5 * np.log(2 * np.pi) 22 | se = 0.5 * torch.sum(torch.pow((x - mu), 2)) / (len(torch.nonzero(mu)) * (2 * torch.pow(sigma, 2))) + torch.log(sigma) 23 | return se / (len(mu)) 24 | 25 | 26 | def softclip(tensor, min): 27 | """ Clips the tensor values at the minimum value min in a softway. Taken from Handful of Trials """ 28 | result_tensor = min + F.softplus(tensor - min) # plus |min|, relu activated 29 | 30 | return result_tensor 31 | 32 | 33 | def apply_activation(act_name, x): 34 | if act_name == 'sigmoid': 35 | return torch.sigmoid(x) 36 | elif act_name == 'tanh': 37 | return torch.tanh(x) 38 | elif act_name == 'relu': 39 | return torch.relu(x) 40 | elif act_name == 'elu': 41 | return torch.elu(x) 42 | elif act_name == 'linear': 43 | return x 44 | else: 45 | raise NotImplementedError('Choose appropriate activation function. (current input: %s)' % act_name) 46 | 47 | 48 | def activation_map(act_name): 49 | if act_name == 'sigmoid': 50 | return nn.Sigmoid() 51 | elif act_name == 'tanh': 52 | return nn.Tanh() 53 | elif act_name == 'relu': 54 | return nn.ReLU() 55 | elif act_name == 'elu': 56 | return nn.ELU() 57 | else: 58 | raise NotImplementedError('Choose appropriate activation function. 
(current input: %s)' % act_name) 59 | 60 | 61 | def kernel_selection(method, mu_i, mu_j, var_i, var_j, c=1e-5): 62 | # so far only implemented kernels for Guassian distributions 63 | # could experiment with the ones for point vectors 64 | if method == 'ELK': 65 | return elk_metric(mu_i, mu_j, var_i, var_j, c) 66 | elif method == 'BK': 67 | return bk_metric(mu_i, mu_j, var_i, var_j, c) 68 | elif method == 'W2': 69 | return wasserstein2_distance(mu_i, mu_j, var_i, var_j) 70 | elif method == 'MB': 71 | return mahalanobis_distance(mu_i, mu_j, var_i, var_j, c) 72 | elif method == 'Euclidean': 73 | return euclidean_distance(mu_i, mu_j) 74 | else: 75 | raise NotImplementedError('Choose appropriate kernel function. (current input: %s)' % method) 76 | 77 | 78 | # Expected Likelihood Kernel (ELK) 79 | def elk_metric(mu_i, mu_j, var_i, var_j, c): 80 | """ 81 | param mu_i: mu of word i: [batch, embed] 82 | param mu_j: mu of word j: [batch, embed] 83 | param var_i: variance of word i: [batch, embed] 84 | param var_j: variance of word j: [batch, embed] 85 | param exp: if apply exponentiation to the returned value 86 | param c: constant term added to denominator (since positive samples don't need exponentials) 87 | return: the energy function between the two batchs of data: [batch] 88 | """ 89 | embedding_shape = mu_i.shape[1] 90 | 91 | # print(var_i.device, var_j.device) 92 | # assertion of batch size equality 93 | assert mu_i.size()[0] == mu_j.size()[0] 94 | 95 | # log volume of ellipse 96 | det_fac = torch.sum(torch.log(var_i + var_j + c), 1) 97 | # det_fac = torch.sum(torch.log(var_i + var_j), 1) 98 | 99 | # mahalanobis distance between the means 100 | diff_mu = torch.sum((mu_i - mu_j) ** 2 / (var_i + var_j + c), 1) 101 | # diff_mu = torch.sum((mu_i - mu_j) ** 2 / (var_i + var_j), 1) 102 | 103 | # return torch.exp(0.5 * (det_fac + diff_mu)) 104 | # returning the original value 105 | return -0.5 * (det_fac + diff_mu + embedding_shape * math.log(2 * math.pi)) 106 | 107 | 108 | # Bhattacharyya kernel 109 | def bk_metric(mu_i, mu_j, var_i, var_j, c): 110 | # Computing the sigma values 111 | # sigma_i = var_i ** 0.5 112 | # sigma_j = var_j ** 0.5 113 | 114 | # sigma sum 115 | sigma_sum = 2 * torch.sum(torch.log(var_i ** 0.5 / (var_j ** 0.5 + c) + (var_j ** 0.5) / ((var_i ** 0.5) + c)), 1) 116 | 117 | # mahalanobis distance between the means 118 | diff_mu = torch.sum((mu_i - mu_j) ** 2 / (var_i + var_j + c), 1) 119 | 120 | return -0.25 * (sigma_sum + diff_mu) 121 | 122 | # Mahalanobis distance 123 | def mahalanobis_distance(mu_i, mu_j, var_i, var_j, c): 124 | # mahalanobis distance between the means 125 | ma_distance = torch.sum((mu_i - mu_j) ** 2 / (var_i + var_j + c), 1) 126 | 127 | return -1 * ma_distance 128 | 129 | 130 | # 2-Wasserstein distance between Gaussian distributions 131 | def wasserstein2_distance(mu_i, mu_j, var_i, var_j): 132 | # Computing the sigma values 133 | sigma_i = var_i ** 0.5 134 | sigma_j = var_j ** 0.5 135 | 136 | diff_mu = torch.sum((mu_i - mu_j) ** 2, 1) 137 | diff_sigma = torch.sum((sigma_i - sigma_j) ** 2, 1) 138 | 139 | return -(diff_mu + diff_sigma) 140 | 141 | 142 | ''' 143 | Similarity measures for mean embeddings only 144 | ''' 145 | 146 | 147 | def euclidean_distance(mu_i, mu_j): 148 | diff_mu = torch.sum((mu_i - mu_j) ** 2, 1) 149 | # return torch.cdist(mu_i, mu_j, p=2) 150 | return -diff_mu 151 | 152 | 153 | class RunningAverage: 154 | def __init__(self): 155 | self.sum = 0 156 | self.history = [] 157 | self.total = 0 158 | 159 | def update(self, value): 160 | 
self.sum += value 161 | self.history.append(value) 162 | self.total += 1 163 | 164 | @property 165 | def mean(self): 166 | return self.sum / self.total 167 | 168 | 169 | # Need to chagne the sample number parameter 170 | # Delete avoid self sampling parameter 171 | def sampling(idx, matrix, uk_pos_num, uk_neg_num, kk_pos_num, kk_neg_num): 172 | # matrix_test 173 | # sample_method 174 | ''' 175 | Args: 176 | idx: index used to locate user or keyphrase for sampling 177 | matrix: matrix uk or kk used to assist sampling for contrastive learning 178 | matrix_test: the test matrix used for evaluating performance, test positive samples should not be negative samples 179 | sample_number: number of positive and negative samples, equal number 180 | avoid_self_sample: needed when sampling for KK, not for UK 181 | 182 | Returns: 183 | pos_samples: an array that stores the positive sample idx [idx.shape[0] * sample_number] 184 | neg_samples: an array that stores the negative sample idx [idx.shape[0] * sample_number] 185 | ''' 186 | 187 | # for the yelp_SIGIR dataset 188 | def random_sample(): 189 | # for each anchoring user point in the batch 190 | for i in range(pos_entries.shape[0]): 191 | # negative entries would be the ones that are not positive. 192 | # if test_pos_entries is not None: 193 | # neg_entry = list(set(range(matrix.shape[1])) - set(pos_entries[i]) - set(test_pos_entries[i])) 194 | # else: 195 | pos = np.random.choice(pos_entries[i], uk_pos_num).tolist() 196 | neg_entry = list(set(range(matrix.shape[1])) - set(pos)) 197 | 198 | neg = np.random.choice(neg_entry, uk_neg_num).tolist() 199 | pos_samples.append(pos) 200 | neg_samples.append(neg) 201 | 202 | # for each anchoring keyphrase from the positive keyphrase 203 | # Need to handle the case where there's only 1 positive keyphrase 204 | for j in range(len(pos)): 205 | # sample from the previously defined lists. 206 | current_pos_candidates = pos_entries[i].copy() 207 | if pos[j] in current_pos_candidates: 208 | current_pos_candidates.remove(pos[j]) # avoid self sampling 209 | pos_k = np.random.choice(current_pos_candidates, kk_pos_num).tolist() 210 | 211 | current_neg_candidates = list(set(range(matrix.shape[1])) - set(pos_k)) 212 | if pos[j] in current_neg_candidates: 213 | current_neg_candidates.remove(pos[j]) # avoid self sampling 214 | 215 | neg_k = np.random.choice(current_neg_candidates, kk_neg_num).tolist() 216 | pos_samples_kk.append(pos_k) 217 | neg_samples_kk.append(neg_k) 218 | 219 | # for the yelp_SIGIR dataset 220 | def experiment_random_sample(): 221 | # for each anchoring user point in the batch 222 | for i in range(pos_entries.shape[0]): 223 | pos = np.random.choice(pos_entries[i], uk_pos_num).tolist() 224 | neg_entry = list(set(range(matrix.shape[1])) - set(pos)) 225 | 226 | neg = np.random.choice(neg_entry, uk_neg_num).tolist() 227 | pos_samples.append(pos) 228 | neg_samples.append(neg) 229 | 230 | # for each anchoring keyphrase from the positive keyphrase 231 | # Need to handle the case where there's only 1 positive keyphrase 232 | for j in range(len(pos)): 233 | # sample from the previously defined lists. 
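# Unlike random_sample above, this experimental variant anchors the keyphrase-keyphrase
# positives on the keyphrases just drawn for the user (`pos`) rather than on the user's
# full positive list (`pos_entries[i]`); the negative candidates are built the same way.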
234 | current_pos_candidates = pos.copy() 235 | if pos[j] in current_pos_candidates: 236 | current_pos_candidates.remove(pos[j]) # avoid self sampling 237 | pos_k = np.random.choice(current_pos_candidates, kk_pos_num).tolist() 238 | 239 | current_neg_candidates = list(set(range(matrix.shape[1])) - set(pos_k)) 240 | if pos[j] in current_neg_candidates: 241 | current_neg_candidates.remove(pos[j]) # avoid self sampling 242 | 243 | neg_k = np.random.choice(current_neg_candidates, kk_neg_num).tolist() 244 | pos_samples_kk.append(pos_k) 245 | neg_samples_kk.append(neg_k) 246 | 247 | # for uk 248 | pos_samples = [] # m positive keyphrases 249 | neg_samples = [] # n negative keyphrasess 250 | 251 | # for kk 252 | pos_samples_kk = [] # m' positive keyphrasess 253 | neg_samples_kk = [] # n' negative keyphrases 254 | 255 | # gets an array of lists, which contains the positive column indices [idx.shape[0] * sample_numer] 256 | pos_entries = matrix[idx].tolil().rows 257 | 258 | # if matrix_test is not None: 259 | # test_pos_entries = matrix_test[idx].tolil().rows 260 | # else: 261 | # test_pos_entries = None 262 | 263 | # perform random sampling 264 | random_sample() 265 | # experiment_random_sample() 266 | 267 | # convert the samples to arrays 268 | pos_samples = np.array(pos_samples) 269 | neg_samples = np.array(neg_samples) 270 | 271 | pos_samples_kk = np.array(pos_samples_kk) 272 | neg_samples_kk = np.array(neg_samples_kk) 273 | 274 | # assertions 275 | assert pos_samples.shape[0] == pos_entries.shape[0] 276 | assert neg_samples.shape[0] == pos_samples.shape[0] 277 | try: 278 | assert pos_samples.shape[1] == uk_pos_num 279 | except: 280 | print('debugging') 281 | assert neg_samples.shape[1] == uk_neg_num 282 | assert pos_samples_kk.shape[0] == pos_entries.shape[0] * uk_pos_num 283 | assert neg_samples_kk.shape[0] == pos_entries.shape[0] * uk_pos_num 284 | assert pos_samples_kk.shape[1] == kk_pos_num 285 | assert neg_samples_kk.shape[1] == kk_neg_num 286 | 287 | # for arr_idx in range(len(pos_samples)): 288 | # if test_pos_entries is not None and sample_method != 'simple_distribution': 289 | # assert len(set(pos_samples[arr_idx]).intersection(test_pos_entries[arr_idx])) == 0 290 | # assert len(set(neg_samples[arr_idx]).intersection(test_pos_entries[arr_idx])) == 0 291 | # if avoid_self_sample: 292 | # assert idx[arr_idx] not in pos_samples[arr_idx] 293 | # assert idx[arr_idx] not in neg_samples[arr_idx] 294 | return pos_samples, neg_samples, pos_samples_kk, neg_samples_kk 295 | 296 | 297 | # generate samples based on Gaussian embeddings 298 | def generate_sample_embeddings(keyphrase_idx_check, sample_num, mean_embed, stdev_embed): 299 | # generate sample embeddings for all 300 | for index, kidx in enumerate(keyphrase_idx_check): 301 | # sample embedding for current keyphrase 302 | sample_embed = np.array([np.random.normal(mean_embed[kidx], stdev_embed[kidx]) 303 | for _ in range(sample_num)]) 304 | if index == 0: 305 | sampled_embedding = sample_embed.copy() 306 | else: 307 | sampled_embedding = np.vstack((sampled_embedding, sample_embed)) 308 | 309 | # loop through the rest of 70 random samples 310 | for kidx in sample(range(mean_embed.shape[0]), 70): 311 | if kidx in keyphrase_idx_check: 312 | continue 313 | else: 314 | # sample embedding for current keyphrase 315 | sample_embed = np.array([np.random.normal(mean_embed[kidx], stdev_embed[kidx]) 316 | for _ in range(sample_num)]) 317 | sampled_embedding = np.vstack((sampled_embedding, sample_embed)) 318 | 319 | return sampled_embedding 320 | 321 | 
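# Note on the helper below: for a set of kernel scores x_1..x_n, the log-sum-exp is
# computed in the numerically stable form
#     log(sum_i exp(x_i)) = c + log(sum_i exp(x_i - c)),  with c = max_i x_i,
# which avoids overflow when kernel values are large (e.g. for x = [1000, 1001],
# exp(1001) overflows, while exp(0) + exp(-1) does not).
# The positive and negative kernel scores of each anchor are concatenated, the per-anchor
# maximum is used as c, and the resulting log-sum is repeated back to the shape of the
# positive scores so it can act as the log-denominator term of the contrastive objective.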
322 | def logsumexp(pos_x, neg_x, num_pos_samples, num_neg_samples): 323 | # for the same anchoring point, concatenating smaples horizontally 324 | input_shape = pos_x.shape 325 | pos_x = pos_x.reshape(-1, num_pos_samples) 326 | neg_x = neg_x.reshape(-1, num_neg_samples) 327 | assert pos_x.shape[0] == neg_x.shape[0] 328 | concat = torch.cat((pos_x, neg_x), 1).detach() 329 | # find the maximum sampling point for each anchor [batch_size * 1] 330 | c, _ = torch.max(concat, dim=1, keepdim=True) 331 | 332 | pos_x_exp = torch.exp(pos_x - c) 333 | neg_x_exp = torch.exp(neg_x - c) 334 | 335 | sum_kernels = c + torch.log(torch.sum(pos_x_exp, axis=1, keepdim=True) + torch.sum(neg_x_exp, axis=1, keepdim=True)) 336 | sum_kernels = torch.repeat_interleave(sum_kernels, repeats=num_pos_samples, dim=0) 337 | 338 | return sum_kernels.reshape(input_shape) # summing of the denominator 339 | 340 | 341 | class AverageMeter(object): 342 | """Computes and stores the average and current value""" 343 | 344 | def __init__(self): 345 | self.reset() 346 | 347 | def reset(self): 348 | self.sum = 0 349 | self.count = 0 350 | 351 | def update(self, val, n): 352 | self.sum += val * n 353 | self.count += n 354 | 355 | def avg(self): 356 | if self.count == 0: 357 | return 0 358 | return float(self.sum) / self.count 359 | -------------------------------------------------------------------------------- /utils/Trainer.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | import pandas as pd 6 | import seaborn as sns 7 | import torch 8 | from sklearn.manifold import TSNE 9 | 10 | from utils.Table import Table 11 | from utils.Tools import generate_sample_embeddings 12 | from utils.io import pickle_dump 13 | 14 | 15 | class Trainer: 16 | def __init__(self, dataname, target, dataset, model, evaluator, logger, conf, experiment=None, 17 | plot_graph=False, run_samples=False): 18 | self.dataname = dataname 19 | self.dataset = dataset 20 | self.train_matrix, self.test_matrix = self.dataset.eval_data() 21 | self.uk_train = self.dataset.all_uk() 22 | 23 | # yelp 24 | if dataname == 'yelp_SIGIR': 25 | self.strong_tag1 = [] 26 | self.strong_tag2 = [] 27 | temp = self.uk_train.toarray() 28 | top_tags = np.argsort(-temp, axis=1)[:, :10] 29 | if target == 'veg_bbq': 30 | tag1 = 170 31 | tag2 = 203 32 | for u in range(len(top_tags)): 33 | tag1_in = tag1 in top_tags[u] 34 | tag2_in = tag2 in top_tags[u] 35 | if tag1_in and not tag2_in and temp[u, tag1]-temp[u, tag2]>1: 36 | self.strong_tag1.append(u) 37 | elif not tag1_in and tag2_in and temp[u, tag2]-temp[u, tag1]>2: 38 | self.strong_tag2.append(u) 39 | elif target == 'fried_salad': 40 | tag1 = 47 41 | tag2 = 37 42 | for u in range(len(top_tags)): 43 | tag1_in = tag1 in top_tags[u] 44 | tag2_in = tag2 in top_tags[u] 45 | if tag1_in and not tag2_in: 46 | self.strong_tag1.append(u) 47 | elif not tag1_in and tag2_in: 48 | self.strong_tag2.append(u) 49 | ik_label_matrix = dataset.ik_label_matrix.toarray() 50 | self.item_tag1 = ik_label_matrix[:, tag1]-ik_label_matrix[:, tag2] 51 | self.item_tag1[self.item_tag1<0] = 0 52 | self.item_tag1 = self.item_tag1.nonzero()[0] 53 | self.item_tag2 = ik_label_matrix[:, tag2]-ik_label_matrix[:, tag1] 54 | self.item_tag2[self.item_tag2<0] = 0 55 | self.item_tag2 = self.item_tag2.nonzero()[0] 56 | self.tagged_items = [self.item_tag1, self.item_tag2] 57 | print(len(self.strong_tag1), len(self.strong_tag2)) 58 | print(len(self.item_tag1), 
len(self.item_tag2)) 59 | 60 | elif dataname == 'reddit': 61 | 62 | # reddit 63 | item_idtoi = self.dataset.item_idtoi 64 | if target == 'rep_dem': 65 | tag1 = [item_idtoi['democrats'], item_idtoi['DemocratsforDiversity'], \ 66 | item_idtoi['DemocraticSocialism'], item_idtoi['Forum_Democratie'], \ 67 | item_idtoi['Impeach_Trump'], item_idtoi['neoliberal'], item_idtoi['AskALiberal'], \ 68 | item_idtoi['Liberal'], item_idtoi['Classical_Liberals'], item_idtoi['JoeBiden']] 69 | tag2 = [item_idtoi['Republican'], item_idtoi['Conservative'], item_idtoi['ConservativesOnly'], \ 70 | item_idtoi['askaconservative'], item_idtoi['conservatives'], \ 71 | item_idtoi['republicans'], item_idtoi['Trumpgret'], \ 72 | item_idtoi['IronFrontUSA'], item_idtoi['AskThe_Donald'], item_idtoi['AskTrumpSupporters'], \ 73 | item_idtoi['TheBidenshitshow']] 74 | thresh_1, thresh_2 = 2, 4 75 | elif target == 'men_women': 76 | tag1 = [item_idtoi['women'], item_idtoi['WomenWhoDontSell'], item_idtoi['AskWomen'], \ 77 | item_idtoi['AskWomenOver30'], item_idtoi['askwomenadvice'], item_idtoi['WomensHealth']] 78 | tag2 = [item_idtoi['Divorce_Men'], item_idtoi['AskMen'], item_idtoi['MensRights'], \ 79 | item_idtoi['AskMenAdvice'], item_idtoi['AskMenOver30']] 80 | thresh_1, thresh_2 = 2, 1 81 | self.item_tag1 = tag1 82 | self.item_tag2 = tag2 83 | self.tagged_items = [tag1, tag2] 84 | temp = self.uk_train.toarray() 85 | top_tags = np.argsort(-temp, axis=1)[:, :10] 86 | self.strong_tag1 = [] 87 | self.strong_tag2 = [] 88 | for i in range(len(temp)): 89 | tag1_in = np.sum(np.isin(top_tags[i], tag1)) 90 | tag2_in = np.sum(np.isin(top_tags[i], tag2)) 91 | if tag1_in and not tag2_in and tag1_in > thresh_1: 92 | self.strong_tag1.append(i) 93 | elif not tag1_in and tag2_in and tag2_in > thresh_2: 94 | self.strong_tag2.append(i) 95 | print(len(self.strong_tag1), len(self.strong_tag2)) 96 | print(len(self.item_tag1), len(self.item_tag2)) 97 | 98 | 99 | self.model = model 100 | self.evaluator = evaluator 101 | self.logger = logger 102 | self.conf = conf 103 | self.experiment = experiment 104 | self.plot_graphs = plot_graph 105 | self.run_samples = run_samples 106 | 107 | self.num_epochs = conf.num_epochs 108 | self.lr = conf.learning_rate 109 | self.batch_size = conf.batch_size 110 | self.test_batch_size = conf.test_batch_size 111 | 112 | self.early_stop = conf.early_stop 113 | self.patience = conf.patience 114 | self.endure = 0 115 | self.skip_eval = conf.skip_eval 116 | 117 | self.best_epoch = -1 118 | self.best_score = None 119 | self.best_params = None 120 | self.best_rec_score = None 121 | self.best_uk_score = None 122 | 123 | # save the best keyphrase embeddings during best epochs 124 | self.mean_embeddings = None 125 | self.stdev_embeddings = None 126 | 127 | # for use case, save selected user and keyphrase embeddings during training 128 | self.sampled_user_idx = 799 # a single number 129 | self.sample_user_embeddings = None 130 | self.sample_user_embeddings_std = None 131 | self.sampled_keyphrase_idx = [225, 429, 674] # a list of keyphrase idx 132 | self.sampled_keyphrase_embeddings = None 133 | self.sampled_keyphrase_embeddings_std = None 134 | self.sampled_epoch = [10, 50, 100, 200, 250, 300] # a list of training epochs to sample 135 | self.score_comparison_df = pd.DataFrame(columns=['MSE', 'uk_rprec', 'epoch']) 136 | 137 | def train(self): 138 | self.logger.info(self.conf) 139 | 140 | # pass module parameters to the optimizer 141 | if len(list(self.model.parameters())) > 0: 142 | optimizer = 
torch.optim.RMSprop(self.model.parameters(), self.lr) 143 | # optimizer = torch.optim.RMSprop(self.model.parameters(), self.lr) 144 | else: 145 | optimizer = None 146 | 147 | # create table for logging 148 | score_table = Table(table_name='Scores') 149 | 150 | for epoch in range(1, self.num_epochs + 1): 151 | # train for an epoch 152 | epoch_start = time.time() 153 | loss = self.model.train_one_epoch(train_matrix=self.train_matrix, 154 | # uk_matrix=self.uk_train, 155 | # uk_test=self.uk_valid, 156 | optimizer=optimizer, 157 | batch_size=self.batch_size, 158 | verbose=False) 159 | # experiment=self.experiment) # verbose/printing false 160 | 161 | # log epoch loss 162 | if self.experiment: self.experiment.log_metric(name='epoch_loss', value=loss, epoch=epoch) 163 | print("Epoch:", epoch, "Loss:", loss) 164 | 165 | train_elapsed = time.time() - epoch_start 166 | 167 | #and epoch >= 50 168 | if (not self.skip_eval and epoch % 20 == 0) or (self.skip_eval and epoch == self.num_epochs): 169 | if not self.skip_eval and self.early_stop: # get scores during training only 170 | # recommendation performance 171 | rec_score = self.evaluator.evaluate_recommendations(self.dataname, self.item_tag1, self.item_tag2, 172 | self.tagged_items, epoch, self.strong_tag1, 173 | self.strong_tag2, self.model, self.train_matrix, 174 | self.test_matrix, mse_only=False, 175 | ndcg_only=True, analytical=False, 176 | test_batch_size=self.test_batch_size) 177 | 178 | else: # At the end of training epochs, during evaluation 179 | # recommendation performance 180 | rec_score = self.evaluator.evaluate_recommendations(self.dataname, self.item_tag1, self.item_tag2, 181 | self.tagged_items, epoch, self.strong_tag1, 182 | self.strong_tag2, self.model, self.train_matrix, 183 | self.test_matrix, mse_only=False, 184 | ndcg_only=False, analytical=False, 185 | test_batch_size=self.test_batch_size) 186 | 187 | # score we want to check during training 188 | score = {"Loss": float(loss), 189 | "RMSE": rec_score['RMSE'][0]} 190 | if "NDCG" in rec_score.keys(): 191 | score['NDCG'] = rec_score['NDCG'][0] 192 | 193 | score_str = ' '.join(['%s=%.4f' % (m, score[m]) for m in score]) 194 | epoch_elapsed = time.time() - epoch_start 195 | 196 | self.logger.info('[Epoch %3d/%3d, epoch time: %.2f, train_time: %.2f] %s' % ( 197 | epoch, self.num_epochs, epoch_elapsed, train_elapsed, score_str)) 198 | 199 | # log for comet ml, per 10 epochs 200 | if self.experiment: 201 | self.experiment.log_metric(name='RMSE', value=score['RMSE'], \ 202 | epoch=epoch) 203 | if "NDCG" in rec_score.keys(): 204 | self.experiment.log_metric(name='NDCG', value=score['NDCG'], 205 | epoch=epoch) 206 | # update if ... 
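# Keep the checkpoint with the best monitored metric (NDCG); otherwise advance the
# early-stopping counter. Evaluation runs every 20 epochs and `endure` grows by 10 per
# evaluation, so training stops once it reaches `patience` when early_stop is enabled.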
207 | standard = 'NDCG' 208 | if self.best_score is None or score[standard] > self.best_score[standard]: 209 | self.best_epoch = epoch 210 | self.best_score = score 211 | self.best_rec_score = rec_score 212 | self.best_params = self.model.parameters() 213 | 214 | self.endure = 0 215 | 216 | # log stats plot, every 50 epoch is enough 217 | if self.plot_graphs and epoch >= 50 and epoch % 50 == 0: 218 | self.log_stats_plot(epoch) 219 | else: 220 | self.endure += 10 221 | if self.early_stop and self.endure >= self.patience: 222 | print('Early Stop Triggered...') 223 | break 224 | 225 | # log plot at the end of training, and log last epoch embeddings 226 | if self.plot_graphs: 227 | self.log_stats_plot(epoch) 228 | # close plt records 229 | plt.clf() 230 | plt.cla() 231 | plt.close() 232 | 233 | print('Training Finished.') 234 | score_table.add_row('Best at epoch %d' % self.best_epoch, self.best_score) 235 | self.logger.info(score_table.to_string()) 236 | 237 | # create scatter plot for user embedding values 238 | def create_scatter(self, embedding, axis_value): 239 | log_freq = self.dataset.log_freq_array 240 | log_freq_keyphrase = self.dataset.log_freq_array_keyphrase 241 | avg_stdev = np.mean(embedding, axis=1) 242 | fig, (ax1, ax2) = plt.subplots(1, 2, sharey=True, figsize=(8, 5)) 243 | ax1.scatter(log_freq, avg_stdev) 244 | ax1.set_xlabel('log total rating frequency') 245 | ax1.set_ylabel('avg_{}'.format(axis_value)) 246 | 247 | ax2.scatter(log_freq_keyphrase, avg_stdev) 248 | ax2.set_xlabel('log total keyphrase frequency') 249 | ax2.set_ylabel('avg_{}'.format(axis_value)) 250 | 251 | fig.suptitle('Rating frequencies & keyphrase mentioning frequencies vs. averaged {}'.format(axis_value)) 252 | 253 | return fig 254 | 255 | def create_scatter_keyhrase(self, axis_value): 256 | if axis_value == 'stdev': 257 | embedding = np.array(torch.exp(0.5 * self.model.keyphrase_log_var.weight.data)) 258 | elif axis_value == 'mean': 259 | embedding = np.array(self.model.keyphrase_mu.weight.data) 260 | else: 261 | raise NotImplementedError('Choose appropriate embedding parameter. (current input: %s)' % axis_value) 262 | 263 | log_freq_ku = self.dataset.log_freq_array_ku 264 | log_freq_ki = self.dataset.log_freq_array_ki 265 | 266 | avg_embedding = np.mean(embedding, axis=1) 267 | 268 | fig, (ax1, ax2) = plt.subplots(1, 2, sharey=True, figsize=(8, 5)) 269 | ax1.scatter(log_freq_ku, avg_embedding) 270 | ax1.set_xlabel('log user mentioning frequency') 271 | ax1.set_ylabel('avg_{}'.format(axis_value)) 272 | 273 | ax2.scatter(log_freq_ki, avg_embedding) 274 | ax2.set_xlabel('log item labeled frequency') 275 | ax2.set_ylabel('avg_{}'.format(axis_value)) 276 | 277 | fig.suptitle('User mention frequencies & item labeled frequencies vs. 
averaged {}'.format(axis_value)) 278 | # file_dir = log_dir + conf_name + '.png' 279 | # plt.savefig(file_dir) 280 | 281 | return fig 282 | 283 | def create_TSNE(self): 284 | # manual modifications 285 | keyphrase_list = ['kbbq', 'bbq', 'kebab', 'korean bbq', 'korean', 'pizza', 'pizzeria libretto', 286 | 'pistachio', 'pita bread'] 287 | sample_num = 150 288 | keyphrase_idx_check = [self.dataset.keyphrase_2_idx[word] for word in keyphrase_list] 289 | 290 | sampled_embedding = generate_sample_embeddings(keyphrase_idx_check=keyphrase_idx_check, 291 | sample_num=sample_num, 292 | mean_embed=self.mean_embeddings, 293 | stdev_embed=self.stdev_embeddings) 294 | tsne = TSNE(n_components=2, perplexity=5, early_exaggeration=4) # , verbose=1 , n_iter=500, , perplexity=50 295 | tsne_results = tsne.fit_transform(sampled_embedding) 296 | 297 | # feed sample embedding to tsne & create plots 298 | df_plot = { 299 | 'keyphrase': np.array( 300 | [[self.dataset.idx_2_keyphrase[idx]] * sample_num for idx in keyphrase_idx_check]).flatten()} 301 | df_plot = pd.DataFrame(data=df_plot) 302 | 303 | samples_to_plot = len(keyphrase_idx_check) * sample_num 304 | df_plot['tsne-2d-one'] = tsne_results[:samples_to_plot, 0] 305 | df_plot['tsne-2d-two'] = tsne_results[:samples_to_plot, 1] 306 | 307 | tsne = TSNE(n_components=2, perplexity=5, early_exaggeration=4) # , verbose=1 , n_iter=500, , perplexity=50 308 | tsne_results = tsne.fit_transform(sampled_embedding) 309 | 310 | # feed sample embedding to tsne & create plots 311 | df_plot = { 312 | 'keyphrase': np.array( 313 | [[self.dataset.idx_2_keyphrase[idx]] * sample_num for idx in keyphrase_idx_check]).flatten()} 314 | df_plot = pd.DataFrame(data=df_plot) 315 | 316 | samples_to_plot = len(keyphrase_idx_check) * sample_num 317 | df_plot['tsne-2d-one'] = tsne_results[:samples_to_plot, 0] 318 | df_plot['tsne-2d-two'] = tsne_results[:samples_to_plot, 1] 319 | 320 | plt.figure(figsize=(10, 8)) 321 | sns_plot = sns.scatterplot( 322 | x="tsne-2d-one", y="tsne-2d-two", 323 | hue="keyphrase", 324 | palette=sns.color_palette("hls", len(df_plot.keyphrase.unique())), 325 | data=df_plot, 326 | legend="full", 327 | alpha=0.3 328 | ) 329 | 330 | return sns_plot.get_figure() 331 | 332 | def plot_comparison_plot(self): 333 | plt.clf() 334 | plt.figure() 335 | sns_plot = sns.scatterplot(data=self.score_comparison_df, 336 | x="MSE", 337 | y="uk_rprec", 338 | hue="epoch") 339 | 340 | return sns_plot.get_figure() 341 | 342 | def log_stats_plot(self, epoch_num): 343 | mean_embedding, stdev_embedding = self.get_mu_S() 344 | 345 | stats_figure = self.create_scatter(stdev_embedding, 'stdev') 346 | # stats_figure_mean = self.create_scatter(mean_embedding, 'mean') 347 | 348 | stats_figure_k = self.create_scatter_keyhrase('stdev') 349 | stats_figure_mean_k = self.create_scatter_keyhrase('mean') 350 | 351 | self.experiment.log_figure(figure_name='stats_fig_' + str(epoch_num), figure=stats_figure, overwrite=True) 352 | # self.experiment.log_figure(figure_name='stats_fig_mean_' + str(epoch_num), figure=stats_figure_mean, 353 | # overwrite=True) 354 | self.experiment.log_figure(figure_name='stats_fig_keyphrase_' + str(epoch_num), figure=stats_figure_k, 355 | overwrite=True) 356 | # self.experiment.log_figure(figure_name='stats_fig_mean_keyphrase_' + str(epoch_num), 357 | # figure=stats_figure_mean_k, overwrite=True) 358 | 359 | # close plt records 360 | plt.clf() 361 | plt.cla() 362 | plt.close() 363 | 364 | def save_best_embeddings(self, mean_embeddigns, stdev_embeddings): 365 | self.mean_embeddings 
= mean_embeddigns 366 | self.stdev_embeddings = stdev_embeddings 367 | 368 | def log_embeddings_asset(self, mean_embeddings, stdev_embeddings, epoch): 369 | embedded_mean_df = pd.DataFrame(mean_embeddings, columns=list(range(1, mean_embeddings.shape[1] + 1)), 370 | index=list(self.dataset.keyphrase_2_idx.keys())) 371 | 372 | embedded_std_df = pd.DataFrame(stdev_embeddings, columns=list(range(1, stdev_embeddings.shape[1] + 1)), 373 | index=list(self.dataset.keyphrase_2_idx.keys())) 374 | 375 | # self.experiment.log_dataframe_profile(dataframe=embedded_std_df, name='embedded_stdev') 376 | # self.experiment.log_dataframe_profile(dataframe=embedded_mean_df, name='embedded_mean') 377 | self.experiment.log_table('embedded_means{}.csv'.format(epoch), embedded_mean_df) 378 | self.experiment.log_table('embedded_stdev{}.csv'.format(epoch), embedded_std_df) 379 | 380 | def get_mu_S(self): 381 | input_matrix = self.dataset.all_data() 382 | i = torch.FloatTensor(input_matrix.toarray()).to(self.model.device) 383 | with torch.no_grad(): 384 | mu, logvar = self.model.get_mu_logvar(i) 385 | std = self.model.logvar2std(logvar) 386 | mu, std = mu.cpu().data.numpy(), std.cpu().data.numpy() 387 | 388 | return mu, std 389 | -------------------------------------------------------------------------------- /utils/Evaluator.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from utils.Tools import kernel_selection 4 | from utils.KAVgenerator import KAVgenerator 5 | import matplotlib.pyplot as plt 6 | from tqdm import tqdm 7 | import pandas as pd 8 | import time 9 | 10 | class Evaluator: 11 | def __init__(self, rec_atK, explain_atK, lamb, std): 12 | self.rec_atK = rec_atK # a list of the topK indecies 13 | self.rec_maxK = max(self.rec_atK) 14 | self.explain_atK = explain_atK 15 | self.explain_maxK = max(self.explain_atK) 16 | 17 | self.global_metrics = { 18 | "R-Precision": r_precision, 19 | "NDCG": ndcg 20 | } 21 | 22 | self.local_metrics = { 23 | "Precision": precisionk, 24 | "Recall": recallk, 25 | "MAP": average_precisionk, 26 | "NDCG": ndcg 27 | } 28 | 29 | self.global_metrics_embeddings = { 30 | "UK_R-Precision": r_precision, 31 | "UK_NDCG": ndcg 32 | } 33 | 34 | self.local_metrics_embeddings = { 35 | "UK_NDCG": ndcg, 36 | "UK_Precision": precisionk, 37 | "UK_Recall": recallk, 38 | "UK_MAP": average_precisionk 39 | } 40 | 41 | self.lamb = lamb 42 | self.std = std 43 | 44 | # evaluate Gaussian embeddings, explanations and keyphraes relationships 45 | def evaluate_embeddings(self, model, train_matrix_uk, test_matrix_uk, 46 | mu_user, var_user, ndcg_only, data_name, analytical=False): 47 | """ 48 | Args: 49 | model: passed in model, e.g., VAE, VAE-contrast 50 | train_matrix_uk: test matrix of UK 51 | test_matrix_uk: test matrix of UK 52 | mu_user: mean embedding for users with all historical item entires known 53 | var_user: sigma embedding for users with all historical item entires known 54 | analytical: False if getting the confidence interval value 55 | 56 | Returns: a dictionary of metric scores 57 | """ 58 | # switch to evaluation mode 59 | model.eval() 60 | model.before_evaluate() 61 | 62 | mu_user = torch.from_numpy(mu_user).to(model.device) 63 | var_user = torch.from_numpy(var_user).to(model.device) 64 | 65 | assert mu_user.shape[0] == train_matrix_uk.shape[0] 66 | assert torch.all(torch.gt(var_user, torch.zeros(size=var_user.size()).to(var_user.device))) 67 | 68 | with torch.no_grad(): 69 | keyphrase_mean_embeddings = 
model.keyphrase_mu.weight.data 70 | keyphrase_var_embeddings = torch.exp(model.keyphrase_log_var.weight.data) 71 | 72 | # Get corresponding keyphrases predictions 73 | predicted_uk = self.kernel_predict(train_matrix_uk, test_matrix_uk, mu_user, var_user, 74 | keyphrase_mean_embeddings, keyphrase_var_embeddings, 75 | model.kernel_method, model.temperature_tau_u, data_name) 76 | 77 | uk_results = self.evaluation(predicted_uk, test_matrix_uk, eval_type='embeddings', ndcg_only=ndcg_only, 78 | analytical=analytical) 79 | 80 | return uk_results 81 | 82 | def kernel_predict(self, train_matrix, test_matrix, mu_anchor, var_anchor, mu_samples, var_samples, 83 | kernel_method, temperature, data_name): 84 | 85 | # maxK = self.explain_maxK 86 | pos_entries = train_matrix.tolil().rows # array of lists, to not consider 87 | # ground_entries = test_matrix.tolil().rows 88 | 89 | prediction = [] 90 | for i in range(pos_entries.shape[0]): # for each user 91 | 92 | # skipping the negative keyphrase cases 93 | if len(pos_entries[i]) == 0: 94 | prediction.append(np.zeros(self.explain_maxK, dtype=np.float32)) 95 | 96 | else: 97 | # topK = max(len(ground_entries[i]), maxK) # max of number of ground truth entries and topK 98 | # only care about those unk entries 99 | # if 'yelp_SIGIR' not in data_name: # for the non-yelp_SIGIR datasets 100 | # unk_entries = list(set(range(train_matrix.shape[1]))) 101 | # else: 102 | # unk_entries = list(set(range(train_matrix.shape[1])) - set(pos_entries[i])) 103 | unk_entries = list(set(range(train_matrix.shape[1]))) 104 | mu_anchor_i = torch.repeat_interleave(mu_anchor[i].reshape(1, -1), repeats=len(unk_entries), dim=0).to(mu_anchor.device) 105 | var_anchor_i = torch.repeat_interleave(var_anchor[i].reshape(1, -1), repeats=len(unk_entries), dim=0).to(mu_anchor.device) 106 | 107 | # corresponding unknown keyphrases' embeddings 108 | mu_sample_i = mu_samples[unk_entries] 109 | var_sample_i = var_samples[unk_entries] 110 | 111 | assert mu_anchor_i.shape == mu_sample_i.shape 112 | assert var_anchor_i.shape == var_sample_i.shape 113 | 114 | # Becomes the predictions 115 | kernel = torch.divide(kernel_selection(kernel_method, mu_anchor_i, 116 | mu_sample_i, var_anchor_i, 117 | var_sample_i), temperature) 118 | 119 | # check kernel shape correspondence 120 | assert kernel.shape[0] == len(unk_entries) 121 | 122 | # select argmax 123 | top_index = (torch.argsort(kernel, dim=-1, descending=True)[:self.explain_maxK]).cpu().data.numpy() 124 | top_predict = np.array(unk_entries)[top_index] 125 | 126 | prediction.append(top_predict) 127 | 128 | # predicted item indecies 129 | # predicted_items = np.vstack(prediction) 130 | predicted_items = prediction.copy() 131 | assert len(predicted_items) == train_matrix.shape[0] 132 | return predicted_items 133 | 134 | def evaluate_recommendations(self, dataname, item_tag1, item_tag2, tagged_items, epoch, strong_tag1, strong_tag2, model, input_matrix, 135 | test_matrix, mse_only, ndcg_only, test_batch_size, analytical=False): 136 | # switch to evaluation mode 137 | model.eval() 138 | # operations before evaluation, does not perform for VAE models 139 | model.before_evaluate() 140 | 141 | # get prediction data, in matrix form 142 | # get prediction data, in matrix form, not masking, for recommendation results 143 | pred_matrix = model.predict(input_matrix) 144 | pred_matrix = np.array(pred_matrix) 145 | assert pred_matrix.shape == input_matrix.shape 146 | RMSE = round(np.sqrt(np.mean((input_matrix.toarray() - pred_matrix) ** 2)), 4) 147 | # preds, ys = 
model.predict(input_matrix, test_matrix, test_batch_size=test_batch_size) 148 | # RMSE = round(np.sqrt((np.sum((preds - ys) ** 2)) / len(ys)),4) 149 | 150 | if mse_only: 151 | recommendation_results = {"RMSE": (RMSE,0)} 152 | else: 153 | # get predicted item index 154 | prediction = [] 155 | # get prediction data, in matrix form, not masking, for recommendation results 156 | # pred_matrix = model.simple_predict(input_matrix) 157 | # assert pred_matrix.shape == input_matrix.shape 158 | num_users = pred_matrix.shape[0] 159 | 160 | # Prediction section 161 | for user_index in range(num_users): 162 | vector_prediction = pred_matrix[user_index] 163 | vector_train = input_matrix[user_index] 164 | 165 | if len(vector_train.nonzero()[0]) > 0: 166 | vector_predict = sub_routine(vector_prediction, vector_train, topK=self.rec_maxK) 167 | else: 168 | vector_predict = np.zeros(self.rec_maxK, dtype=np.int64) 169 | 170 | prediction.append(vector_predict) 171 | 172 | # predicted item indecies 173 | predicted_items = prediction.copy() 174 | recommendation_results = self.evaluation(predicted_items, test_matrix, eval_type='recommendations', 175 | ndcg_only=ndcg_only, analytical=analytical) 176 | recommendation_results["RMSE"] = (RMSE,0) 177 | 178 | # CAVs 179 | user_embeddings, user_logvar = model.get_mu_logvar(torch.FloatTensor(input_matrix.toarray()).to(model.device)) 180 | user_embeddings = np.array(user_embeddings.detach().cpu().numpy()) 181 | item_embeddings = model.decoder.weight.detach().cpu().numpy() 182 | if dataname == 'yelp_SIGIR': 183 | generator = KAVgenerator(item_embeddings[item_tag1], item_embeddings[item_tag2]) 184 | elif dataname == 'reddit': 185 | generator = KAVgenerator(user_embeddings[strong_tag1], user_embeddings[strong_tag2]) 186 | cavs = np.squeeze(generator.get_all_mean_cav(20, 10)) 187 | metrics = ['Recall@5', 'Recall@10', 'Recall@20', 'Recall@50', 'NDCG@5', 'NDCG@10', 'NDCG@20', 'NDCG@50'] 188 | 189 | # MMR 190 | num_users = pred_matrix.shape[0] 191 | temp = input_matrix.toarray() 192 | temp[temp > 0] = np.inf 193 | 194 | # MMR using CAV 195 | mmr_predictions = np.zeros((num_users, self.rec_maxK), dtype=np.int32) 196 | item_item_sim = np.dot(item_embeddings, cavs) 197 | user_item_sim = np.dot(user_embeddings, cavs) 198 | user_item_sim = -np.absolute(np.expand_dims(item_item_sim, 0) - np.expand_dims(user_item_sim, 1))-temp 199 | item_item_sim = -np.absolute(np.expand_dims(item_item_sim, 0) - np.expand_dims(item_item_sim, 1)) 200 | for i in tqdm(range(mmr_predictions.shape[1])): 201 | if i == 0: 202 | mmr_predictions[:, 0] = np.argmax(user_item_sim, axis=1) 203 | user_item_sim[np.arange(num_users), mmr_predictions[:, 0]] = np.NINF 204 | continue 205 | for u in range(num_users): 206 | mmr_predictions[u, i] = np.argmax(self.lamb*user_item_sim[u]-(1-self.lamb)*np.max(item_item_sim[mmr_predictions[u, :i]], axis=0)) 207 | user_item_sim[u, mmr_predictions[u, i]] = np.NINF 208 | mmr_results = self.evaluation(mmr_predictions, test_matrix, eval_type='recommendations', 209 | ndcg_only=ndcg_only, analytical=analytical) 210 | # get diversity for MMR 211 | _, s_precision_2, ks_test, prob1, prob2, prob_product = diversity_metric(mmr_predictions, tagged_items, input_matrix) 212 | print('T-MMR', self.lamb, ':', s_precision_2, ks_test, prob_product) 213 | for m in metrics: 214 | print(mmr_results[m][0]) 215 | print('\n') 216 | 217 | # MMR using cosine sim 218 | mmr_predictions = np.zeros((num_users, self.rec_maxK), dtype=np.int32) 219 | user_item_sim = np.dot(user_embeddings, item_embeddings.T)-temp 
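# Greedy MMR re-ranking with dot-product relevance: at each step the item maximizing
# lamb * relevance(user, item) - (1 - lamb) * max similarity to the already-selected items
# is appended. Items seen in training were pushed to -inf via `temp`, and every chosen
# item is masked with -inf afterwards so it cannot be picked again.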
220 | item_item_sim = np.dot(item_embeddings, item_embeddings.T) 221 | for i in tqdm(range(mmr_predictions.shape[1])): 222 | if i == 0: 223 | mmr_predictions[:, 0] = np.argmax(user_item_sim, axis=1) 224 | user_item_sim[np.arange(num_users), mmr_predictions[:, 0]] = np.NINF 225 | continue 226 | for u in range(num_users): 227 | mmr_predictions[u, i] = np.argmax(self.lamb*user_item_sim[u]-(1-self.lamb)*np.max(item_item_sim[mmr_predictions[u, :i]], axis=0)) 228 | user_item_sim[u, mmr_predictions[u, i]] = np.NINF 229 | mmr_results = self.evaluation(mmr_predictions, test_matrix, eval_type='recommendations', 230 | ndcg_only=ndcg_only, analytical=analytical) 231 | # get diversity for MMR 232 | _, s_precision_2, ks_test, prob1, prob2, prob_product = diversity_metric(mmr_predictions, tagged_items, input_matrix) 233 | print('U-MMR', self.lamb, ':', s_precision_2, ks_test, prob_product) 234 | for m in metrics: 235 | print(mmr_results[m][0]) 236 | print('\n') 237 | del temp 238 | del user_item_sim 239 | del item_item_sim 240 | 241 | # get diversity for VAE-CF 242 | _, s_precision_2, ks_test, prob1, prob2, prob_product = diversity_metric(prediction, tagged_items, input_matrix) 243 | print('VAE-CF:', s_precision_2, ks_test, prob_product) 244 | for m in metrics: 245 | print(recommendation_results[m][0]) 246 | print('\n') 247 | 248 | # VAE-CF + CAV appendix for flatten the filter bubble 249 | cav_norm = np.sqrt(sum(cavs**2)) 250 | new_user_embeddings = user_embeddings - (np.expand_dims(np.dot(user_embeddings, cavs), 1)/(cav_norm**2))*cavs 251 | 252 | new_predictions = None 253 | for i in range(10): 254 | div_new_user_embeddings = new_user_embeddings+np.random.normal(scale=self.std, size=(user_embeddings.shape[0], 1)) 255 | if i == 0: 256 | new_predictions = np.dot(div_new_user_embeddings, item_embeddings.T) 257 | else: 258 | new_predictions += np.dot(div_new_user_embeddings, item_embeddings.T) 259 | new_predictions /= 10 260 | 261 | # get predicted item index 262 | new_prediction = [] 263 | 264 | # Prediction section 265 | for user_index in range(num_users): 266 | vector_prediction = new_predictions[user_index] 267 | vector_train = input_matrix[user_index] 268 | 269 | if len(vector_train.nonzero()[0]) > 0: 270 | vector_predict = sub_routine(vector_prediction, vector_train, topK=self.rec_maxK) 271 | else: 272 | vector_predict = np.zeros(self.rec_maxK, dtype=np.int64) 273 | 274 | new_prediction.append(vector_predict) 275 | 276 | # predicted item indecies 277 | predicted_items = new_prediction.copy() 278 | new_results = self.evaluation(predicted_items, test_matrix, eval_type='recommendations', 279 | ndcg_only=ndcg_only, analytical=analytical) 280 | 281 | # get diversity for CAV 282 | _, s_precision_2, ks_test, prob1, prob2, prob_product = diversity_metric(new_prediction, tagged_items, input_matrix) 283 | print('TD-VAE-CF Flatten', self.std, ':', s_precision_2, ks_test, prob_product) 284 | for m in metrics: 285 | print(new_results[m][0]) 286 | print('\n') 287 | 288 | # VAE-CF + TCAV 289 | cav_norm = np.sqrt(sum(cavs**2)) 290 | new_user_embeddings = user_embeddings - (1-self.lamb)*(np.expand_dims(np.dot(user_embeddings, cavs), 1)/(cav_norm**2))*cavs 291 | new_predictions = np.dot(new_user_embeddings, item_embeddings.T) 292 | 293 | # get predicted item index 294 | new_prediction = [] 295 | 296 | # Prediction section 297 | for user_index in range(num_users): 298 | vector_prediction = new_predictions[user_index] 299 | vector_train = input_matrix[user_index] 300 | 301 | if len(vector_train.nonzero()[0]) > 0: 302 | 
vector_predict = sub_routine(vector_prediction, vector_train, topK=self.rec_maxK) 303 | else: 304 | vector_predict = np.zeros(self.rec_maxK, dtype=np.int64) 305 | 306 | new_prediction.append(vector_predict) 307 | 308 | # predicted item indecies 309 | predicted_items = new_prediction.copy() 310 | new_results = self.evaluation(predicted_items, test_matrix, eval_type='recommendations', 311 | ndcg_only=ndcg_only, analytical=analytical) 312 | 313 | # get diversity for CAV 314 | _, s_precision_2, ks_test, prob1, prob2, prob_product = diversity_metric(new_prediction, tagged_items, input_matrix) 315 | print('TD-VAE-CF', self.lamb, ':', s_precision_2, ks_test, prob_product) 316 | for m in metrics: 317 | print(new_results[m][0]) 318 | print('\n') 319 | 320 | return recommendation_results 321 | 322 | # function to perform evaluation on metrics 323 | def evaluation(self, predicted_items, test_matrix, eval_type, ndcg_only, analytical=False): 324 | if eval_type == 'recommendations' and ndcg_only: 325 | local_metrics = None 326 | global_metrics = {"NDCG": ndcg} 327 | atK = self.rec_atK 328 | elif eval_type == 'recommendations' and not ndcg_only: 329 | local_metrics = self.local_metrics 330 | global_metrics = self.global_metrics 331 | atK = self.rec_atK 332 | elif eval_type == 'embeddings'and ndcg_only: 333 | local_metrics = None 334 | global_metrics = {"UK_NDCG": ndcg} 335 | atK = self.explain_atK 336 | elif eval_type == 'embeddings' and not ndcg_only: 337 | local_metrics = self.local_metrics_embeddings 338 | global_metrics = self.global_metrics_embeddings 339 | atK = self.explain_atK 340 | else: 341 | raise NotImplementedError("Please select proper evaluation type, current choice: %s" % eval_type) 342 | 343 | num_users = test_matrix.shape[0] 344 | 345 | # evaluation section 346 | output = dict() 347 | 348 | # The @K metrics 349 | if local_metrics: 350 | for k in atK: 351 | results = {name: [] for name in local_metrics.keys()} 352 | 353 | # topK_Predict = predicted_items[:, :k] 354 | for user_index in range(num_users): 355 | # vector_predict = topK_Predict[user_index] 356 | vector_predict = predicted_items[user_index][:k] 357 | if (len(vector_predict.nonzero()[0]) > 0): 358 | vector_true_dense = test_matrix[user_index].nonzero()[1] 359 | 360 | if vector_true_dense.size > 0: # only if length of validation set is not 0 361 | hits = np.isin(vector_predict, vector_true_dense) 362 | for name in local_metrics.keys(): 363 | results[name].append(local_metrics[name](vector_true_dense=vector_true_dense, 364 | vector_predict=vector_predict, 365 | hits=hits)) 366 | 367 | results_summary = dict() 368 | if analytical: 369 | for name in local_metrics.keys(): 370 | results_summary['{0}@{1}'.format(name, k)] = results[name] 371 | else: 372 | for name in local_metrics.keys(): 373 | results_summary['{0}@{1}'.format(name, k)] = (np.average(results[name]), 374 | 1.96 * np.std(results[name]) / np.sqrt( 375 | len(results[name]))) 376 | output.update(results_summary) 377 | 378 | # The global metrics 379 | results = {name: [] for name in global_metrics.keys()} 380 | for user_index in range(num_users): 381 | vector_predict = predicted_items[user_index] 382 | 383 | if len(vector_predict.nonzero()[0]) > 0: 384 | vector_true_dense = test_matrix[user_index].nonzero()[1] 385 | hits = np.isin(vector_predict, vector_true_dense) 386 | 387 | if vector_true_dense.size > 0: 388 | for name in global_metrics.keys(): 389 | results[name].append(global_metrics[name](vector_true_dense=vector_true_dense, 390 | vector_predict=vector_predict, 
hits=hits)) 391 | results_summary = dict() 392 | if analytical: 393 | for name in global_metrics.keys(): 394 | results_summary[name] = results[name] 395 | else: 396 | for name in global_metrics.keys(): 397 | results_summary[name] = ( 398 | np.average(results[name]), 1.96 * np.std(results[name]) / np.sqrt(len(results[name]))) 399 | output.update(results_summary) 400 | 401 | return output 402 | 403 | 404 | def diversity_metric(prediction, tagged_items, input_matrix): 405 | ks_test = [] 406 | s_precision = [] 407 | s_precision_2 = [] 408 | prob1 = [] 409 | prob2 = [] 410 | for u in tqdm(range(len(prediction)), total=len(prediction)): 411 | tag1_in = (np.isin(prediction[u], tagged_items[0]).nonzero()[0]).tolist() 412 | tag2_in = (np.isin(prediction[u], tagged_items[1]).nonzero()[0]).tolist() 413 | if not tag1_in and not tag2_in: 414 | continue 415 | if not tag1_in and tag2_in: 416 | ks_test.append(1) 417 | s_precision.append(0) 418 | prob1.append(0) 419 | prob2.append(1) 420 | continue 421 | elif tag1_in and not tag2_in: 422 | ks_test.append(1) 423 | s_precision.append(0) 424 | prob1.append(1) 425 | prob2.append(0) 426 | continue 427 | 428 | # calculate s-precision 429 | s_precision.append((min(tag1_in[0], tag2_in[0])+1)/(max(tag1_in[0], tag2_in[0])+1)) 430 | s_precision_2.append(2/(max(tag1_in[0], tag2_in[0])+1)) 431 | 432 | # calculate probabilities 433 | tag1_in_training = np.isin(input_matrix[u].nonzero()[1], tagged_items[0]).sum() 434 | tag2_in_training = np.isin(input_matrix[u].nonzero()[1], tagged_items[1]).sum() 435 | ratio_tag1 = len(tag1_in) / (len(tagged_items[0]) - tag1_in_training) 436 | ratio_tag2 = len(tag2_in) / (len(tagged_items[1]) - tag2_in_training) 437 | prob1.append(ratio_tag1/(ratio_tag1+ratio_tag2)) 438 | prob2.append(ratio_tag2/(ratio_tag1+ratio_tag2)) 439 | 440 | # calculate ks-test 441 | tag1_step = 1/len(tag1_in) 442 | tag2_step = 1/len(tag2_in) 443 | i_1, i_2, diff, r_1, r_2 = 0, 0, 0, 0, 0 444 | while True: 445 | if i_1 == len(tag1_in)-1: 446 | diff = max(diff, 1 - r_2) 447 | break 448 | elif i_2 == len(tag2_in)-1: 449 | diff = max(diff, 1 - r_1) 450 | break 451 | if tag1_in[i_1] < tag2_in[i_2]: 452 | i_1 += 1 453 | r_1 += tag1_step 454 | diff = max(diff, abs(r_1 - r_2)) 455 | elif tag1_in[i_1] > tag2_in[i_2]: 456 | i_2 += 1 457 | r_2 += tag2_step 458 | diff = max(diff, abs(r_1 - r_2)) 459 | else: 460 | i_1 += 1 461 | r_1 += tag1_step 462 | i_2 += 1 463 | r_2 += tag2_step 464 | diff = max(diff, abs(r_1 - r_2)) 465 | ks_test.append(diff) 466 | 467 | ks_test = np.mean(np.array(ks_test)) 468 | s_precision = np.mean(np.array(s_precision)) 469 | s_precision_2 = np.mean(np.array(s_precision_2)) 470 | prob1 = np.array(prob1) 471 | prob2 = np.array(prob2) 472 | prob_product = np.mean(prob1*prob2) 473 | 474 | return s_precision, s_precision_2, 1-ks_test, prob1, prob2, prob_product 475 | 476 | 477 | def sub_routine(vector_predict, vector_train, topK): 478 | train_index = vector_train.nonzero()[1] 479 | 480 | # take the top recommended items 481 | candidate_index = np.argpartition(-vector_predict, topK + len(train_index))[:topK + len(train_index)] 482 | vector_predict = candidate_index[vector_predict[candidate_index].argsort()[::-1]] 483 | 484 | # vector_predict = np.argsort(-vector_predict)[:topK + len(train_index)] 485 | vector_predict = np.delete(vector_predict, np.isin(vector_predict, train_index).nonzero()[0]) 486 | 487 | return vector_predict[:topK] 488 | 489 | 490 | def recallk(vector_true_dense, hits, **unused): 491 | hits = len(hits.nonzero()[0]) 492 | return 
float(hits) / len(vector_true_dense) 493 | 494 | 495 | def precisionk(vector_predict, hits, **unused):  # Precision@K: fraction of the K predictions that appear in the test set 496 | hits = len(hits.nonzero()[0]) 497 | return float(hits) / len(vector_predict) 498 | 499 | 500 | def average_precisionk(vector_predict, hits, **unused):  # AP@K: running precision at each rank 1..K, averaged over all K positions 501 | precisions = np.cumsum(hits, dtype=np.float32) / range(1, len(vector_predict) + 1) 502 | return np.mean(precisions) 503 | 504 | 505 | def r_precision(vector_true_dense, vector_predict, **unused):  # R-Precision: precision within the top R predictions, R = number of test items 506 | vector_predict_short = vector_predict[:len(vector_true_dense)] 507 | hits = len(np.isin(vector_predict_short, vector_true_dense).nonzero()[0]) 508 | return float(hits) / len(vector_true_dense) 509 | 510 | 511 | def _dcg_support(size):  # discount weights 1 / log2(rank + 1) for ranks 1..size 512 | arr = np.arange(1, size + 1) + 1 513 | return 1. / np.log2(arr) 514 | 515 | 516 | def ndcg(vector_true_dense, vector_predict, hits):  # NDCG: DCG over the hit positions normalized by the ideal DCG for |test items| hits 517 | idcg = np.sum(_dcg_support(len(vector_true_dense))) 518 | dcg_base = _dcg_support(len(vector_predict)) 519 | dcg_base[np.logical_not(hits)] = 0 520 | dcg = np.sum(dcg_base) 521 | return dcg / idcg 522 | --------------------------------------------------------------------------------
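A minimal standalone sketch of the targeted-diversification step used in Evaluator.evaluate_recommendations (the TD-VAE-CF branch): each user embedding has (1 - lamb) of its component along the concept activation vector (CAV) removed, and items are then ranked by inner product. The toy sizes, random embeddings, and the lamb value below are illustrative assumptions only; in the repo the CAV comes from KAVgenerator.get_all_mean_cav and the embeddings from the trained VAE.

# Illustrative sketch, not part of the repo: toy shapes and random data stand in for the
# trained user/item embeddings and the learned CAV.
import numpy as np

rng = np.random.default_rng(0)
num_users, num_items, dim = 4, 10, 8              # toy sizes (assumption)
user_embeddings = rng.normal(size=(num_users, dim))
item_embeddings = rng.normal(size=(num_items, dim))
cav = rng.normal(size=dim)                        # stands in for the mean CAV
lamb = 0.5                                        # relevance / diversification trade-off

# remove (1 - lamb) of each user's component along the CAV direction
cav_norm_sq = np.sum(cav ** 2)
projection = np.outer(user_embeddings @ cav / cav_norm_sq, cav)   # (num_users, dim)
diversified_users = user_embeddings - (1 - lamb) * projection

# rank items by inner product with the diversified user embeddings
scores = diversified_users @ item_embeddings.T                    # (num_users, num_items)
top_k = np.argsort(-scores, axis=1)[:, :5]
print(top_k)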