├── models ├── __pycache__ │ ├── model.cpython-38.pyc │ ├── Predictor.cpython-37.pyc │ ├── Predictor.cpython-38.pyc │ ├── RGCNModel.cpython-38.pyc │ ├── model_test.cpython-38.pyc │ ├── RGCNModel_steam.cpython-38.pyc │ ├── SimpleRGCNModel.cpython-38.pyc │ ├── RGCNModel_steam_rank.cpython-37.pyc │ └── RGCNModel_steam_rank.cpython-38.pyc ├── Predictor.py └── model.py ├── utils ├── __pycache__ │ ├── parser.cpython-37.pyc │ ├── parser.cpython-38.pyc │ ├── metrics.cpython-37.pyc │ ├── metrics.cpython-38.pyc │ ├── dataloader.cpython-37.pyc │ ├── dataloader.cpython-38.pyc │ ├── NegativeSampler.cpython-37.pyc │ ├── NegativeSampler.cpython-38.pyc │ ├── dataloader_steam.cpython-37.pyc │ ├── dataloader_steam.cpython-38.pyc │ ├── dataloader_item_graph.cpython-38.pyc │ ├── dataloader_steam_filtered.cpython-37.pyc │ └── dataloader_steam_filtered.cpython-38.pyc ├── NegativeSampler.py ├── parser.py ├── dataloader_item_graph.py ├── metrics.py └── dataloader_steam.py ├── README.md └── main.py /models/__pycache__/model.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YangLiangwei/SCGRec/HEAD/models/__pycache__/model.cpython-38.pyc -------------------------------------------------------------------------------- /utils/__pycache__/parser.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YangLiangwei/SCGRec/HEAD/utils/__pycache__/parser.cpython-37.pyc -------------------------------------------------------------------------------- /utils/__pycache__/parser.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YangLiangwei/SCGRec/HEAD/utils/__pycache__/parser.cpython-38.pyc -------------------------------------------------------------------------------- /utils/__pycache__/metrics.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YangLiangwei/SCGRec/HEAD/utils/__pycache__/metrics.cpython-37.pyc -------------------------------------------------------------------------------- /utils/__pycache__/metrics.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YangLiangwei/SCGRec/HEAD/utils/__pycache__/metrics.cpython-38.pyc -------------------------------------------------------------------------------- /models/__pycache__/Predictor.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YangLiangwei/SCGRec/HEAD/models/__pycache__/Predictor.cpython-37.pyc -------------------------------------------------------------------------------- /models/__pycache__/Predictor.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YangLiangwei/SCGRec/HEAD/models/__pycache__/Predictor.cpython-38.pyc -------------------------------------------------------------------------------- /models/__pycache__/RGCNModel.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YangLiangwei/SCGRec/HEAD/models/__pycache__/RGCNModel.cpython-38.pyc -------------------------------------------------------------------------------- /models/__pycache__/model_test.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YangLiangwei/SCGRec/HEAD/models/__pycache__/model_test.cpython-38.pyc -------------------------------------------------------------------------------- /utils/__pycache__/dataloader.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YangLiangwei/SCGRec/HEAD/utils/__pycache__/dataloader.cpython-37.pyc -------------------------------------------------------------------------------- /utils/__pycache__/dataloader.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YangLiangwei/SCGRec/HEAD/utils/__pycache__/dataloader.cpython-38.pyc -------------------------------------------------------------------------------- /models/__pycache__/RGCNModel_steam.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YangLiangwei/SCGRec/HEAD/models/__pycache__/RGCNModel_steam.cpython-38.pyc -------------------------------------------------------------------------------- /models/__pycache__/SimpleRGCNModel.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YangLiangwei/SCGRec/HEAD/models/__pycache__/SimpleRGCNModel.cpython-38.pyc -------------------------------------------------------------------------------- /utils/__pycache__/NegativeSampler.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YangLiangwei/SCGRec/HEAD/utils/__pycache__/NegativeSampler.cpython-37.pyc -------------------------------------------------------------------------------- /utils/__pycache__/NegativeSampler.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YangLiangwei/SCGRec/HEAD/utils/__pycache__/NegativeSampler.cpython-38.pyc -------------------------------------------------------------------------------- /utils/__pycache__/dataloader_steam.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YangLiangwei/SCGRec/HEAD/utils/__pycache__/dataloader_steam.cpython-37.pyc -------------------------------------------------------------------------------- /utils/__pycache__/dataloader_steam.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YangLiangwei/SCGRec/HEAD/utils/__pycache__/dataloader_steam.cpython-38.pyc -------------------------------------------------------------------------------- /models/__pycache__/RGCNModel_steam_rank.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YangLiangwei/SCGRec/HEAD/models/__pycache__/RGCNModel_steam_rank.cpython-37.pyc -------------------------------------------------------------------------------- /models/__pycache__/RGCNModel_steam_rank.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YangLiangwei/SCGRec/HEAD/models/__pycache__/RGCNModel_steam_rank.cpython-38.pyc -------------------------------------------------------------------------------- /utils/__pycache__/dataloader_item_graph.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YangLiangwei/SCGRec/HEAD/utils/__pycache__/dataloader_item_graph.cpython-38.pyc -------------------------------------------------------------------------------- /utils/__pycache__/dataloader_steam_filtered.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YangLiangwei/SCGRec/HEAD/utils/__pycache__/dataloader_steam_filtered.cpython-37.pyc -------------------------------------------------------------------------------- /utils/__pycache__/dataloader_steam_filtered.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YangLiangwei/SCGRec/HEAD/utils/__pycache__/dataloader_steam_filtered.cpython-38.pyc -------------------------------------------------------------------------------- /models/Predictor.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import dgl.function as fn 3 | class HeteroDotProductPredictor(nn.Module): 4 | def forward(self, graph, h, etype): 5 | with graph.local_scope(): 6 | graph.ndata['h'] = h 7 | graph.apply_edges(fn.u_dot_v('h', 'h', 'score'), etype = etype) 8 | return graph.edges[etype].data['score'] 9 | -------------------------------------------------------------------------------- /utils/NegativeSampler.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pdb 3 | import torch 4 | 5 | class NegativeSampler(object): 6 | def __init__(self, dic): 7 | self.dic_user_game = dic 8 | 9 | def __call__(self, g, eids_dict): 10 | result_dict = {} 11 | for etype, eids in eids_dict.items(): 12 | src_type, edge, dst_type = etype 13 | src, _ = g.find_edges(eids, etype = etype) 14 | dst = [] 15 | for i in range(src.shape[0]): 16 | s = int(src[i]) 17 | while True: 18 | negitem = np.random.randint(0, g.num_nodes(dst_type)) 19 | if negitem in self.dic_user_game[s]: 20 | continue 21 | else: 22 | break 23 | dst.append(negitem) 24 | dst = torch.tensor(dst) 25 | result_dict[etype] = (src, dst) 26 | return result_dict 27 | 28 | 29 | -------------------------------------------------------------------------------- /utils/parser.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | def parse_args(): 4 | parser = argparse.ArgumentParser() 5 | parser.add_argument('--data', default = 'Ciao', type = str, 6 | help = 'Dataset to use') 7 | parser.add_argument('--train_percent', default = 0.8, type = float, 8 | help = 'training_percent') 9 | parser.add_argument('--embed_size', default = 32, type = int, 10 | help = 'embedding size for all layer') 11 | parser.add_argument('--lr', default = 0.03, type = float, 12 | help = 'learning rate') 13 | parser.add_argument('--model', default = 'RGCN', type = str, 14 | help = 'model selection') 15 | parser.add_argument('--epoch', default = 1000, type = int, 16 | help = 'epoch number') 17 | parser.add_argument('--early_stop', default = 10, type = int, 18 | help = 'early_stop validation') 19 | parser.add_argument('--batch_size', default = 1024, type = int, 20 | help = 'batch size') 21 | parser.add_argument('--layers', default = 1, type = int, 22 | help = 'layer number') 23 | parser.add_argument('--gpu', default = -1, type = int, 24 | help = '-1 for cpu, 0 for gpu:0') 25 | parser.add_argument('--k', default = [5, 10, 20], type = list, 26 | help = 'negative sampler number for each node') 27 | parser.add_argument('--g', default = 0.1, type = float, 28 | help = 'hyper-parameter for aggregation weight') 29 | parser.add_argument('--social_g', default = 0.1, type = float, 30 | help = 'hyper-parameter for aggregation weight') 31 | parser.add_argument('--item_g', default = 0.1, type = float, 32 | help = 'hyper-parameter for aggregation weight') 33 | 34 | args = parser.parse_args() 35 | return args 36 | 37 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Large-scale Personalized Video Game Recommendation via Social-aware Contextualized Graph Neural Network 2 | 3 | > Authors: Liangwei Yang, Zhiwei Liu, Yu Wang, Chen Wang, Ziwei Fan, Philip S. Yu 4 | > Affiliation: University of Illinois at Chicago 5 | 6 | ![](./assets/draftrec_model.png) 7 | 8 | 9 | > **Abstract:** 10 | Because of the large number of online games available nowadays, online game recommender systems are necessary for users and online game platforms. The former can discover more potential online games of their interests, and the latter can attract users to dwell longer in the platform. This paper investigates the characteristics of user behaviors with respect to the online games on the Steam platform. Based on the observations, we argue that a satisfying recommender system for online games is able to characterize: personalization, game contextualization and social connection. However, simultaneously solving all is rather challenging for game recommendation. Firstly, personalization for game recommendation requires the incorporation of the dwelling time of engaged games, which are ignored in existing methods. 11 | Secondly, game contextualization should reflect the complex and high-order properties of those relations. Last but not least, it is problematic to use social connections directly for game recommendations due to the massive noise within social connections. To this end, we propose a Social-aware Contextualized Graph Neural Recommender System~(SCGRec), which harnesses three perspectives to improve game recommendation. We conduct a comprehensive analysis of users' online game behaviors, which motivates the necessity of handling those three characteristics in the online game recommendation. 12 | 13 | ## Dataset 14 | 15 | [Google drive link](https://drive.google.com/file/d/1F9kr_YWimBtexJEH-zkDzCOwl1q7GmFp/view) 16 | 17 | ![](./assets/dataset.png) 18 | 19 | ## How to run 20 | python main.py 21 | 22 | ## Cite 23 | 24 | 25 | ``` 26 | @inproceedings{SCGRec, 27 | author = {Liangwei Yang and 28 | Zhiwei Liu and 29 | Yu Wang and 30 | Chen Wang and 31 | Ziwei Fan and 32 | Philip S. Yu}, 33 | title = {Large-scale Personalized Video Game Recommendation via Social-aware 34 | Contextualized Graph Neural Network}, 35 | booktitle = {{WWW} '22: The {ACM} Web Conference 2022, Virtual Event, Lyon, France, 36 | April 25 - 29, 2022}, 37 | pages = {3376--3386}, 38 | publisher = {{ACM}}, 39 | year = {2022}, 40 | url = {https://doi.org/10.1145/3485447.3512273}, 41 | doi = {10.1145/3485447.3512273}, 42 | timestamp = {Tue, 26 Apr 2022 16:02:09 +0200}, 43 | biburl = {https://dblp.org/rec/conf/www/YangLWWFY22.bib}, 44 | bibsource = {dblp computer science bibliography, https://dblp.org} 45 | } 46 | ``` 47 | 48 | 49 | -------------------------------------------------------------------------------- /utils/dataloader_item_graph.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from dgl.data.utils import save_graphs 4 | from tqdm import tqdm 5 | from scipy import stats 6 | from .NegativeSampler import NegativeSampler 7 | import pdb 8 | import torch 9 | import logging 10 | logging.basicConfig(stream = sys.stdout, level = logging.INFO) 11 | import numpy as np 12 | import dgl 13 | from dgl.data import DGLDataset 14 | import pandas as pd 15 | from sklearn import preprocessing 16 | from dgl.data import DGLDataset 17 | 18 | class Dataloader_item_graph(DGLDataset): 19 | def __init__(self, graph, app_id_path, publisher_path, developer_path, genre_path): 20 | self.app_id_path = app_id_path 21 | self.publisher_path = publisher_path 22 | self.developer_path = developer_path 23 | self.genre_path = genre_path 24 | 25 | logging.info("reading item graph") 26 | self.app_id_mapping = self.read_id_mapping(self.app_id_path) 27 | self.publisher = self.read_mapping(self.publisher_path) 28 | self.developer = self.read_mapping(self.developer_path) 29 | self.genre = self.read_mapping(self.genre_path) 30 | 31 | graph_data = { 32 | ('game', 'co_publisher', 'game'): self.publisher, 33 | ('game', 'co_developer', 'game'): self.developer, 34 | ('game', 'co_genre', 'game'): self.genre 35 | } 36 | self.graph = dgl.heterograph(graph_data) 37 | self.graph.nodes['game'].data['h'] = graph.ndata['h']['game'].float() 38 | 39 | 40 | def read_mapping(self, path): 41 | mapping = {} 42 | with open(path, 'r') as f: 43 | for line in f: 44 | line = line.strip().split(',') 45 | if line[1] != '': 46 | if line[0] not in mapping: 47 | mapping[self.app_id_mapping[line[0]]] = [line[1]] 48 | else: 49 | mapping[self.app_id_mapping[line[0]]].append(line[1]) 50 | for key in mapping: 51 | mapping[key] = set(mapping[key]) 52 | src = [] 53 | dst = [] 54 | keys = list(mapping.keys()) 55 | for i in range(len(keys) - 1): 56 | for j in range(i +1, len(keys)): 57 | game1 = keys[i] 58 | game2 = keys[j] 59 | if len(mapping[game1] & mapping[game2]) > 0: 60 | src.extend([game1, game2]) 61 | dst.extend([game2, game1]) 62 | return (torch.tensor(src), torch.tensor(dst)) 63 | 64 | def read_id_mapping(self, path): 65 | mapping = {} 66 | count = 0 67 | with open(path, 'r') as f: 68 | for line in f: 69 | line = line.strip() 70 | if line not in mapping: 71 | mapping[line] = count 72 | count += 1 73 | return mapping 74 | -------------------------------------------------------------------------------- /models/model.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | from tqdm import tqdm 3 | import pdb 4 | import torch.nn.functional as F 5 | import torch 6 | from dgl.nn import SAGEConv 7 | import dgl 8 | import dgl.function as fn 9 | import dgl.nn as dglnn 10 | from dgl.nn import GATConv 11 | from dgl.nn import GraphConv 12 | 13 | class Proposed_model(nn.Module): 14 | def __init__(self, args, graph, item_graph): 15 | super().__init__() 16 | self.args = args 17 | self.hid_dim = args.embed_size 18 | self.layer_num = args.layers 19 | 20 | # self.user_embedding = torch.nn.Parameter(torch.randn(graph.nodes('user').shape[0], self.hid_dim)) 21 | # self.item_embedding = torch.nn.Parameter(torch.randn(graph.nodes('game').shape[0], self.hid_dim)) 22 | 23 | self.user_embedding = torch.nn.Parameter(torch.load('./baselines/user_embedding.pt')) 24 | self.item_embedding = torch.nn.Parameter(torch.load('./baselines/item_embedding.pt')) 25 | 26 | self.item_conv = SAGEConv(self.hid_dim, self.hid_dim, 'mean') 27 | self.social_GAT = GATConv(self.hid_dim, self.hid_dim, num_heads = 1, allow_zero_in_degree = True) 28 | self.social_conv = SAGEConv(self.hid_dim, self.hid_dim, 'mean') 29 | self.linear = torch.nn.Linear(3 * self.hid_dim, self.hid_dim) 30 | 31 | self.build_model(item_graph) 32 | 33 | def build_layer(self, idx, graph): 34 | if idx == 0: 35 | input_dim = graph.ndata['h'].shape[1] 36 | else: 37 | input_dim = self.hid_dim 38 | dic = { 39 | rel: GraphConv(input_dim, self.hid_dim, weight = True, bias = False) 40 | for rel in graph.etypes 41 | } 42 | return dglnn.HeteroGraphConv(dic, aggregate = 'mean') 43 | 44 | def build_model(self, graph): 45 | self.layers = nn.ModuleList() 46 | for idx in range(self.layer_num): 47 | h2h = self.build_layer(idx, graph) 48 | self.layers.append(h2h) 49 | 50 | def forward(self, graph, item_graph, social_graph): 51 | 52 | h_game = item_graph.ndata['h'] 53 | for layer in self.layers: 54 | h_game = layer(item_graph, {'game': h_game})['game'] 55 | 56 | graph_game2user = dgl.edge_type_subgraph(graph, ['played by']) 57 | 58 | weight = graph.edata['weight'][('game', 'played by', 'user')] 59 | h_user_aggregate = self.item_conv(graph_game2user, (h_game, self.user_embedding), edge_weight = weight) 60 | 61 | _, social_weight = self.social_GAT(social_graph, h_user_aggregate, get_attention = True) 62 | social_weight = social_weight.sum(1) 63 | h_user_social = self.social_conv(social_graph, self.user_embedding, edge_weight = social_weight) 64 | 65 | user_embed = (1 - self.args.social_g - self.args.item_g) * self.user_embedding + self.args.item_g * h_user_aggregate + self.args.social_g * h_user_social 66 | 67 | return {"user": user_embed, "game": self.item_embedding} 68 | -------------------------------------------------------------------------------- /utils/metrics.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from sklearn.metrics import roc_auc_score, log_loss, mean_squared_error 4 | 5 | denominator_table = np.log2(np.arange(2, 102)) 6 | 7 | def MAE(score, label): 8 | return torch.mean(torch.abs(score - label)) 9 | 10 | def RMSE(score, label): 11 | return torch.sqrt(torch.mean((score - label) ** 2)) 12 | 13 | def recall(rank, ground_truth, N): 14 | return len(set(rank[:N]) & set(ground_truth)) / float(len(set(ground_truth))) 15 | 16 | def precision_at_k(r, k): 17 | """Score is precision @ k 18 | Relevance is binary (nonzero is relevant). 19 | Returns: 20 | Precision @ k 21 | Raises: 22 | ValueError: len(r) must be >= k 23 | """ 24 | assert k >= 1 25 | r = np.asarray(r)[:k] 26 | return np.mean(r) 27 | 28 | def average_precision(r,cut): 29 | """Score is average precision (area under PR curve) 30 | Relevance is binary (nonzero is relevant). 31 | Returns: 32 | Average precision 33 | """ 34 | r = np.asarray(r) 35 | out = [precision_at_k(r, k + 1) for k in range(cut) if r[k]] 36 | if not out: 37 | return 0. 38 | return np.sum(out)/float(min(cut, np.sum(r))) 39 | 40 | 41 | def mean_average_precision(rs): 42 | """Score is mean average precision 43 | Relevance is binary (nonzero is relevant). 44 | Returns: 45 | Mean average precision 46 | """ 47 | return np.mean([average_precision(r) for r in rs]) 48 | 49 | 50 | def dcg_at_k(r, k, method=1): 51 | """Score is discounted cumulative gain (dcg) 52 | Relevance is positive real values. Can use binary 53 | as the previous methods. 54 | Returns: 55 | Discounted cumulative gain 56 | """ 57 | r = np.asfarray(r)[:k] 58 | if r.size: 59 | if method == 0: 60 | return r[0] + np.sum(r[1:] / np.log2(np.arange(2, r.size + 1))) 61 | elif method == 1: 62 | return np.sum(r / np.log2(np.arange(2, r.size + 2))) 63 | else: 64 | raise ValueError('method must be 0 or 1.') 65 | return 0. 66 | 67 | 68 | def ndcg_at_k(r, k, method=1): 69 | """Score is normalized discounted cumulative gain (ndcg) 70 | Relevance is positive real values. Can use binary 71 | as the previous methods. 72 | Returns: 73 | Normalized discounted cumulative gain 74 | """ 75 | dcg_max = dcg_at_k(sorted(r, reverse=True), k, method) 76 | if not dcg_max: 77 | return 0. 78 | return dcg_at_k(r, k, method) / dcg_max 79 | 80 | 81 | def recall_at_k(r, k): 82 | all_pos_num = sum(r) 83 | r = np.asfarray(r)[:k] 84 | return np.sum(r) / all_pos_num 85 | 86 | 87 | def hit_at_k(r, k): 88 | r = np.array(r)[:k] 89 | if np.sum(r) > 0: 90 | return 1. 91 | else: 92 | return 0. 93 | 94 | def F1(pre, rec): 95 | if pre + rec > 0: 96 | return (2.0 * pre * rec) / (pre + rec) 97 | else: 98 | return 0. 99 | 100 | def auc(ground_truth, prediction): 101 | try: 102 | res = roc_auc_score(y_true=ground_truth, y_score=prediction) 103 | except Exception: 104 | res = 0. 105 | return res 106 | 107 | def logloss(ground_truth, prediction): 108 | # preds = [max(min(p, 1. - 10e-12), 10e-12) for p in prediction] 109 | logloss = log_loss(np.asarray(ground_truth), np.asarray(prediction)) 110 | return logloss 111 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import dgl 3 | import dgl.function as fn 4 | sys.path.append('../') 5 | import os 6 | import multiprocessing as mp 7 | # mp.set_start_method('spawn') 8 | from tqdm import tqdm 9 | import pdb 10 | import random 11 | import numpy as np 12 | import torch 13 | import torch.nn as nn 14 | import logging 15 | logging.basicConfig(stream = sys.stdout, level = logging.INFO) 16 | from utils.parser import parse_args 17 | from utils.metrics import MAE, RMSE, ndcg_at_k, recall_at_k, hit_at_k, precision_at_k 18 | from utils.dataloader_steam import Dataloader_steam 19 | from utils.dataloader_item_graph import Dataloader_item_graph 20 | from models.RGCNModel_steam_rank import RGCNModel_steam_rank 21 | from models.Predictor import HeteroDotProductPredictor 22 | from models.model import Proposed_model 23 | 24 | def validate(train_mask, dic, h, ls_k): 25 | users = torch.tensor(list(dic.keys())).long() 26 | user_embedding = h['user'][users] 27 | game_embedding = h['game'] 28 | rating = torch.mm(user_embedding, game_embedding.t()) 29 | rating[train_mask] = -float('inf') 30 | 31 | valid_mask = torch.zeros_like(train_mask) 32 | for i in range(users.shape[0]): 33 | user = int(users[i]) 34 | items = torch.tensor(dic[user]) 35 | valid_mask[i, items] = 1 36 | 37 | _, indices = torch.sort(rating, descending = True) 38 | ls = [valid_mask[i,:][indices[i, :]] for i in range(valid_mask.shape[0])] 39 | result = torch.stack(ls).float() 40 | 41 | res = [] 42 | for k in ls_k: 43 | discount = (torch.tensor([i for i in range(k)]) + 2).log2() 44 | ideal, _ = result.sort(descending = True) 45 | idcg = (ideal[:, :k] / discount).sum(dim = 1) 46 | dcg = (result[:, :k] / discount).sum(dim = 1) 47 | ndcg = torch.mean(dcg / idcg) 48 | 49 | recall = torch.mean(result[:, :k].sum(1) / result.sum(1)) 50 | hit = torch.mean((result[:, :k].sum(1) > 0).float()) 51 | precision = torch.mean(result[:, :k].mean(1)) 52 | 53 | logging_result = "For k = {}, ndcg = {}, recall = {}, hit = {}, precision = {}".format(k, ndcg, recall, hit, precision) 54 | logging.info(logging_result) 55 | res.append(logging_result) 56 | return ndcg, str(res) 57 | 58 | 59 | def construct_negative_graph(graph, etype): 60 | utype, _ , vtype = etype 61 | src, _ = graph.edges(etype = etype) 62 | dst = torch.randint(graph.num_nodes(vtype), size = src.shape) 63 | return dgl.heterograph({etype: (src, dst)}, num_nodes_dict = {ntype: graph.number_of_nodes(ntype) for ntype in graph.ntypes}) 64 | 65 | def setup_seed(seed): 66 | torch.manual_seed(seed) 67 | torch.cuda.manual_seed_all(seed) 68 | np.random.seed(seed) 69 | random.seed(seed) 70 | torch.backends.cudnn.deterministic = True 71 | 72 | if __name__ == '__main__': 73 | args = parse_args() 74 | setup_seed(2020) 75 | 76 | if args.gpu >= 0 and torch.cuda.is_available(): 77 | device = 'cuda:{}'.format(args.gpu) 78 | else: 79 | device = 'cpu' 80 | 81 | path = '/home/yangliangwei/datasets/steam/' 82 | 83 | user_id_path = path + '/users.txt' 84 | app_id_path = path + '/app_id.txt' 85 | app_info_path = path + '/App_ID_Info.txt' 86 | friends_path = path + '/friends.txt' 87 | developer_path = path + '/Games_Developers.txt' 88 | publisher_path = path + '/Games_Publishers.txt' 89 | genres_path = path + '/Games_Genres.txt' 90 | 91 | DataLoader = Dataloader_steam(args, path, user_id_path, app_id_path, app_info_path, friends_path, developer_path, publisher_path, genres_path) 92 | 93 | graph = DataLoader.graph 94 | DataLoader_item = Dataloader_item_graph(graph, app_id_path, publisher_path, developer_path, genres_path) 95 | 96 | graph_item = DataLoader_item.graph 97 | 98 | graph_social = dgl.edge_type_subgraph(graph, [('user', 'friend of', 'user')]) 99 | 100 | graph = dgl.edge_type_subgraph(graph, [('user', 'play', 'game'), ('game', 'played by', 'user')]) 101 | graph.update_all(fn.copy_edge('percentile', 'm'), fn.sum('m', 'total'), etype = 'played by') 102 | graph.apply_edges(func = fn.e_div_v('percentile', 'total', 'weight'), etype = 'played by') 103 | 104 | valid_user = list(DataLoader.valid_data.keys()) 105 | train_mask = torch.zeros(len(valid_user), graph.num_nodes('game')) 106 | for i in range(len(valid_user)): 107 | user = valid_user[i] 108 | item_train = torch.tensor(DataLoader.dic_user_game[user]) 109 | train_mask[i, :][item_train] = 1 110 | train_mask = train_mask.bool() 111 | 112 | model = Proposed_model(args, graph, graph_item) 113 | 114 | predictor = HeteroDotProductPredictor() 115 | model.to(device) 116 | opt = torch.optim.Adam(model.parameters(), lr = args.lr) 117 | 118 | stop_count = 0 119 | ndcg_val_best = 0 120 | ls_k = args.k 121 | 122 | total_epoch = 0 123 | for epoch in range(args.epoch): 124 | model.train() 125 | graph_neg = construct_negative_graph(graph, ('user', 'play', 'game')) 126 | h = model(graph, graph_item, graph_social) 127 | 128 | score = predictor(graph, h, ('user', 'play', 'game')) 129 | score_neg = predictor(graph_neg, h, ('user', 'play', 'game')) 130 | loss = -(score - score_neg).sigmoid().log().sum() 131 | logging.info("loss = {}".format(loss)) 132 | opt.zero_grad() 133 | loss.backward() 134 | opt.step() 135 | total_epoch += 1 136 | 137 | # score, h = model.forward_all(graph, 'play') 138 | logging.info('Epoch {}'.format(epoch)) 139 | if total_epoch > 1: 140 | model.eval() 141 | logging.info("begin validation") 142 | 143 | ndcg, _ = validate(train_mask, DataLoader.valid_data, h, ls_k) 144 | 145 | if ndcg > ndcg_val_best: 146 | ndcg_val_best = ndcg 147 | stop_count = 0 148 | logging.info("begin test") 149 | ndcg_test, test_result = validate(train_mask, DataLoader.test_data, h, ls_k) 150 | else: 151 | stop_count += 1 152 | if stop_count > args.early_stop: 153 | logging.info('early stop') 154 | break 155 | 156 | logging.info('Final ndcg {}'.format(ndcg_test)) 157 | logging.info(test_result) 158 | -------------------------------------------------------------------------------- /utils/dataloader_steam.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from dgl.data.utils import save_graphs 4 | from tqdm import tqdm 5 | from scipy import stats 6 | from .NegativeSampler import NegativeSampler 7 | import pdb 8 | import torch 9 | import logging 10 | logging.basicConfig(stream = sys.stdout, level = logging.INFO) 11 | import numpy as np 12 | import dgl 13 | from dgl.data import DGLDataset 14 | import pandas as pd 15 | from sklearn import preprocessing 16 | 17 | class Dataloader_steam_filtered(DGLDataset): 18 | def __init__(self, args, root_path, user_id_path, app_id_path, app_info_path, friends_path, developer_path, publisher_path, genres_path, device = 'cpu', name = 'steam'): 19 | logging.info("steam dataloader init") 20 | 21 | self.args = args 22 | self.root_path = root_path 23 | self.user_id_path = user_id_path 24 | self.app_id_path = app_id_path 25 | self.app_info_path = app_info_path 26 | self.friends_path = friends_path 27 | self.developer_path = developer_path 28 | self.publisher_path = publisher_path 29 | self.genres_path = genres_path 30 | self.device = device 31 | self.graph_path = self.root_path + '/graph.bin' 32 | self.game_path = self.root_path + '/train_game.txt' 33 | self.time_path = self.root_path + '/train_time.txt' 34 | self.valid_path = self.root_path + '/valid_game.txt' 35 | self.test_path = self.root_path + '/test_game.txt' 36 | 37 | logging.info("reading user id mapping from {}".format(self.user_id_path)) 38 | self.user_id_mapping = self.read_id_mapping(self.user_id_path) 39 | logging.info("reading app id mapping from {}".format(self.app_id_path)) 40 | self.app_id_mapping = self.read_id_mapping(self.app_id_path) 41 | 42 | logging.info("build valid data") 43 | self.valid_data = self.build_valid_data(self.valid_path) 44 | 45 | logging.info("build test data") 46 | self.test_data = self.build_valid_data(self.test_path) 47 | 48 | if os.path.exists(self.graph_path): 49 | logging.info("loading preprocessed data") 50 | self.graph = dgl.load_graphs(self.graph_path) 51 | self.graph = self.graph[0][0] 52 | logging.info("reading user game information") 53 | self.dic_user_game = self.read_dic_user_game(self.game_path) 54 | 55 | else: 56 | self.process() 57 | dgl.save_graphs(self.graph_path, self.graph) 58 | 59 | self.dataloader = self.build_dataloader(self.args, self.graph) 60 | 61 | def build_valid_data(self, path): 62 | users = {} 63 | with open(path, 'r') as f: 64 | lines = f.readlines() 65 | for line in lines: 66 | line = line.strip().split(',') 67 | user = self.user_id_mapping[line[0]] 68 | games = [self.app_id_mapping[game] for game in line[1:]] 69 | users[user] = games 70 | return users 71 | 72 | def build_dataloader(self, args, graph): 73 | sampler = dgl.dataloading.MultiLayerFullNeighborSampler(args.layers, return_eids = False) 74 | train_id = torch.tensor([i for i in range(graph.edges(etype = 'play')[0].shape[0])], dtype = torch.long) 75 | dataloader = dgl.dataloading.EdgeDataLoader( 76 | graph, {('user', 'play', 'game'): train_id}, 77 | sampler, negative_sampler = NegativeSampler(self.dic_user_game), batch_size = args.batch_size, shuffle = True, num_workers = 2 78 | ) 79 | return dataloader 80 | 81 | 82 | def process(self): 83 | logging.info("reading app info from {}".format(self.app_info_path)) 84 | self.app_info = self.read_app_info(self.app_info_path) 85 | 86 | logging.info("reading publisher from {}".format(self.publisher_path)) 87 | self.publisher = self.read_mapping(self.publisher_path) 88 | 89 | logging.info("reading developer from {}".format(self.developer_path)) 90 | self.developer = self.read_mapping(self.developer_path) 91 | 92 | logging.info("reading genre from {}".format(self.genres_path)) 93 | self.genre = self.read_mapping(self.genres_path) 94 | 95 | logging.info("reading user item play time from {}".format(self.game_path)) 96 | 97 | self.user_game, self.dic_user_game = self.read_play_time_rank(self.game_path, self.time_path) 98 | 99 | logging.info("reading friend list from {}".format(self.friends_path)) 100 | self.friends = self.read_friends(self.friends_path) 101 | 102 | graph_data = { 103 | ('user', 'friend of', 'user'): (self.friends[:, 0], self.friends[:, 1]), 104 | 105 | ('game', 'developed by', 'developer'): (torch.tensor(list(self.developer.keys())), torch.tensor(list(self.developer.values()))), 106 | 107 | ('developer', 'develop', 'game'): (torch.tensor(list(self.developer.values())), torch.tensor(list(self.developer.keys()))), 108 | 109 | ('game', 'published by', 'publisher'): (torch.tensor(list(self.publisher.keys())), torch.tensor(list(self.publisher.values()))), 110 | 111 | ('publisher', 'publish', 'game'): (torch.tensor(list(self.publisher.values())), torch.tensor(list(self.publisher.keys()))), 112 | 113 | ('game', 'genre', 'type'): (torch.tensor(list(self.genre.keys())), torch.tensor(list(self.genre.values()))), 114 | 115 | ('type', 'genred', 'game'): (torch.tensor(list(self.genre.values())), torch.tensor(list(self.genre.keys()))), 116 | 117 | ('user', 'play', 'game'): (self.user_game[:, 0].long(), self.user_game[:, 1].long()), 118 | 119 | ('game', 'played by', 'user'): (self.user_game[:, 1].long(), self.user_game[:, 0].long()) 120 | } 121 | graph = dgl.heterograph(graph_data) 122 | 123 | ls_feature = [] 124 | 125 | for node in graph.nodes('game'): 126 | node = int(node) 127 | if node in self.app_info: 128 | ls_feature.append(self.app_info[node]) 129 | 130 | ls_feature = np.vstack(ls_feature) 131 | feature_mean = ls_feature.mean(0) 132 | 133 | ls_feature = [] 134 | 135 | count_total = 0 136 | count_without_feature = 0 137 | for node in graph.nodes('game'): 138 | count_total += 1 139 | node = int(node) 140 | if node in self.app_info: 141 | ls_feature.append(self.app_info[node]) 142 | else: 143 | count_without_feature += 1 144 | ls_feature.append(feature_mean) 145 | logging.info("total game number is {}, games without features number is {}".format(count_total,count_without_feature )) 146 | 147 | graph.nodes['game'].data['h'] = torch.tensor(np.vstack(ls_feature)) 148 | graph.edges['play'].data['time'] = self.user_game[:, 2] 149 | graph.edges['played by'].data['time'] = self.user_game[:, 2] 150 | graph.edges['play'].data['percentile'] = self.user_game[:, 3] 151 | graph.edges['played by'].data['percentile'] = self.user_game[:, 3] 152 | self.graph = graph 153 | 154 | def __getitem__(self, i): 155 | pass 156 | 157 | def __len__(self): 158 | pass 159 | 160 | def generate_percentile(self, ls): 161 | dic = {} 162 | for ls_i in ls: 163 | if ls_i[1] in dic: 164 | dic[ls_i[1]].append(ls_i[2]) 165 | else: 166 | dic[ls_i[1]] = [ls_i[2]] 167 | for key in tqdm(dic): 168 | dic[key] = sorted(list(set(dic[key]))) 169 | dic_percentile = {} 170 | 171 | for key in tqdm(dic): 172 | dic_percentile[key] = {} 173 | length = len(dic[key]) 174 | for i in range(len(dic[key])): 175 | time = dic[key][i] 176 | dic_percentile[key][time] = (i + 1) / length 177 | 178 | 179 | for i in tqdm(range(len(ls))): 180 | ls[i].append(dic_percentile[ls[i][1]][ls[i][2]]) 181 | return ls 182 | 183 | 184 | def read_dic_user_game(self, game_path): 185 | dic_game = {} 186 | with open(game_path, 'r') as f_game: 187 | lines_game = f_game.readlines() 188 | for i in tqdm(range(len(lines_game))): 189 | line_game = lines_game[i].strip().split(',') 190 | user = self.user_id_mapping[line_game[0]] 191 | 192 | dic_game[user] = [] 193 | for j in range(1, len(line_game)): 194 | game = self.app_id_mapping[line_game[j]] 195 | dic_game[user].append(game) 196 | return dic_game 197 | 198 | 199 | def read_play_time_rank(self, game_path, time_path): 200 | ls = [] 201 | dic_game = {} 202 | dic_time = {} 203 | with open(game_path, 'r') as f_game: 204 | with open(time_path, 'r') as f_time: 205 | lines_game = f_game.readlines() 206 | lines_time = f_time.readlines() 207 | for i in tqdm(range(len(lines_game))): 208 | line_game = lines_game[i].strip().split(',') 209 | line_time = lines_time[i].strip().split(',') 210 | user = self.user_id_mapping[line_game[0]] 211 | dic_game[user] = [] 212 | 213 | for j in range(1, len(line_game)): 214 | game = self.app_id_mapping[line_game[j]] 215 | dic_game[user].append(game) 216 | time = line_time[j] 217 | if time == r'\N': 218 | ls.append([user, game, 0]) 219 | else: 220 | ls.append([user, game, float(time)]) 221 | logging.info('generate percentiles') 222 | ls = self.generate_percentile(ls) 223 | return torch.tensor(ls), dic_game 224 | 225 | def read_play_time(self, path): 226 | ls = [] 227 | with open(path, 'r', encoding = 'utf8') as f: 228 | for line in f: 229 | line = line.strip().split(',') 230 | if line[-1] == r'\N': 231 | ls.append([self.user_id_mapping[line[0]], self.app_id_mapping[line[1]], 0]) 232 | else: 233 | ls.append([self.user_id_mapping[line[0]], self.app_id_mapping[line[1]], int(line[2])]) 234 | logging.info('generate percentiles') 235 | ls = self.generate_percentile(ls) 236 | return torch.tensor(ls) 237 | 238 | def read_id_mapping(self, path): 239 | mapping = {} 240 | count = 0 241 | with open(path, 'r') as f: 242 | for line in f: 243 | line = line.strip() 244 | if line not in mapping: 245 | mapping[line] = count 246 | count += 1 247 | return mapping 248 | 249 | def read_app_info(self, path): 250 | dic = {} 251 | df = pd.read_csv(path, header = None) 252 | df = pd.get_dummies(df, columns = [2]) 253 | df_time = pd.to_datetime(df.iloc[:, 3]) 254 | date_end = pd.to_datetime('2013-06-25') 255 | time_sub = date_end - df_time 256 | time_sub = time_sub.dt.days 257 | df = pd.concat([df, time_sub], axis = 1) 258 | column_num = len(df.columns) 259 | column_index = [2] 260 | column_index.extend([i for i in range(4, column_num)]) 261 | 262 | logging.info("begin feature engineering") 263 | df.iloc[:, 4].replace(to_replace = -1, value = np.nan, inplace = True) 264 | mean = df.iloc[:, 4].mean() 265 | df.iloc[:, 4].replace(to_replace = np.nan, value = mean, inplace = True) 266 | columns_norm = [2, 4, 5, 11] 267 | mean = df.iloc[:, columns_norm].mean() 268 | std = df.iloc[:, columns_norm].std() 269 | df.iloc[:, columns_norm] = (df.iloc[:, columns_norm] - mean) / std 270 | 271 | for i in range(len(df)): 272 | app_id = self.app_id_mapping[str(df.iloc[i, 0])] 273 | feature = df.iloc[i, column_index].to_numpy() 274 | feature = feature.astype(np.float64) 275 | dic[app_id] = feature 276 | dic['feature_num'] = len(feature) 277 | return dic 278 | 279 | def read_friends(self, path): 280 | ls = [] 281 | with open(path, 'r') as f: 282 | for line in f: 283 | line = line.strip().split(',') 284 | ls.append([self.user_id_mapping[line[0]], self.user_id_mapping[line[1]]]) 285 | return torch.tensor(ls) 286 | 287 | def read_mapping(self, path): 288 | mapping = {} 289 | with open(path, 'r') as f: 290 | for line in f: 291 | line = line.strip().split(',') 292 | if line[0] not in mapping: 293 | if line[1] != '': 294 | mapping[self.app_id_mapping[line[0]]] = line[1] 295 | mapping_value2id = {} 296 | count = 0 297 | for value in mapping.values(): 298 | if value not in mapping_value2id: 299 | mapping_value2id[value] = count 300 | count += 1 301 | for key in mapping: 302 | mapping[key] = mapping_value2id[mapping[key]] 303 | return mapping 304 | --------------------------------------------------------------------------------