├── .gitignore ├── LICENSE ├── Readme.md ├── __init__.py ├── api ├── action.py ├── catalog.py ├── item.py ├── items_ranking_request.py ├── sequential_dataset.py └── user.py ├── datasets ├── __init__.py ├── amazon.py ├── beauty.py ├── bert4rec_datasets.py ├── booking.py ├── dataset_stats.py ├── dataset_utils.py ├── datasets_register.py ├── download_file.py ├── gowalla.py ├── movielens100k.py ├── movielens20m.py ├── movielens25m.py ├── movies_dataset.py ├── mts_kion.py ├── netflix.py └── yelp.py ├── docker ├── .bashrc ├── .gitignore ├── .vimrc ├── Dockerfile ├── cuda-archive-keyring.gpg ├── requirements.txt └── sources.list ├── evaluation ├── .gitignore ├── __init__.py ├── analyze_experiment_in_progress.py ├── analyze_results.py ├── clean_output.cpp ├── conf_intervals.py ├── configs │ ├── ML1M-bpr-example.py │ ├── __init__.py │ └── gsasrec │ │ ├── common_benchmark_config.py │ │ ├── gowalla_benchmark.py │ │ ├── ml1m_benchmark.py │ │ └── steam_benchmark.py ├── dataset_by_config.py ├── evaluate_recommender.py ├── evaluation_utils.py ├── experiment_to_excel.py ├── filter_cold_start.py ├── metrics │ ├── __init__.py │ ├── average_popularity_rank.py │ ├── entropy.py │ ├── highest_score.py │ ├── hit.py │ ├── map.py │ ├── metric.py │ ├── model_confidence.py │ ├── mrr.py │ ├── ndcg.py │ ├── pairwise_cos_sim.py │ ├── precision.py │ ├── recall.py │ └── sampled_proxy_metric.py ├── n_actions_for_user.py ├── run_all_b4rec_originals.sh ├── run_experiment.py ├── run_n_experiments.sh ├── samplers │ ├── pop_sampler.py │ ├── random_sampler.py │ └── sampler.py ├── split_actions.py ├── statistical_signifficance_test.py └── two_predictions_signficance_test.py ├── losses ├── __init__.py ├── bce.py ├── bpr.py ├── climf.py ├── get_loss.py ├── items_masking_loss_proxy.py ├── lambda_gamma_rank.py ├── logit_norm.py ├── loss.py ├── loss_utils.py ├── mean_ypred_loss.py ├── softmax_crossentropy.py ├── top1.py └── xendcg.py ├── recommenders ├── BERT4rec │ ├── LICENSE │ ├── Readme.md │ ├── __init__.py │ ├── gen_data_fin.py │ ├── modeling.py │ ├── optimization.py │ ├── run.py │ ├── util.py │ └── vocab.py ├── __init__.py ├── conditional_top_recommender.py ├── constant_recommender.py ├── deep_mf.py ├── duorec │ └── duorec.py ├── featurizer.py ├── filter_seen_recommender.py ├── first_order_mc.py ├── item_item.py ├── kion_challenge_featurizer.py ├── lambdamart_ensemble_recommender.py ├── lightfm.py ├── matrix_factorization.py ├── metrics │ ├── __init__.py │ ├── ndcg.py │ └── success.py ├── mlp.py ├── mlp_historical.py ├── random_recommender.py ├── recommender.py ├── sequential │ ├── __init__.py │ ├── data_generator │ │ └── data_generator.py │ ├── featurizers │ │ ├── __init__.py │ │ └── hashing_featurizer.py │ ├── history_vectorizers │ │ ├── add_mask_history_vectorizer.py │ │ ├── default_history_vectorizer.py │ │ └── history_vectorizer.py │ ├── model_trainier.py │ ├── models │ │ ├── __init__.py │ │ ├── bert4rec │ │ │ ├── __init__.py │ │ │ ├── bert4recft.py │ │ │ ├── full_bert.py │ │ │ └── special_items.py │ │ ├── caser.py │ │ ├── gru4rec.py │ │ ├── positional_encodings.py │ │ ├── sasrec │ │ │ ├── __init__.py │ │ │ ├── sasrec.py │ │ │ └── sasrec_multihead_attention.py │ │ ├── sequential_recsys_model.py │ │ └── vit4rec.py │ ├── samplers │ │ ├── __init__.py │ │ ├── idf_sampler.py │ │ ├── popularity_sampler.py │ │ ├── random_sampler.py │ │ └── sampler.py │ ├── sequential_recommender.py │ ├── sequential_recommender_config.py │ ├── target_builders │ │ ├── full_matrix_targets_builder.py │ │ ├── items_masking_target_builder.py │ │ ├── 
negative_per_positive_target.py │ │ ├── positives_only_targets_builder.py │ │ ├── positives_sequence_target_builder.py │ │ ├── sampled_matrix_target_builder.py │ │ └── target_builders.py │ └── targetsplitters │ │ ├── fair_item_masking.py │ │ ├── items_masking.py │ │ ├── last_item_splitter.py │ │ ├── random_fraction_splitter.py │ │ ├── random_splitter.py │ │ ├── recency_sequence_sampling.py │ │ ├── shifted_sequence_splitter.py │ │ └── targetsplitter.py ├── svd.py ├── top_recommender.py ├── transition_chain_recommender.py └── vanilla_bert4rec.py ├── tests ├── .gitignore ├── __init__.py ├── datasets │ ├── __init__.py │ ├── booking_dataset_reference_actions.json │ ├── mts_kion_reference_actions.json │ ├── test_beauty_dataset.py │ ├── test_bert4rec_datasets.py │ ├── test_booking_dataset.py │ ├── test_datasets_register.py │ ├── test_filter_cold_users.py │ ├── test_get_movielens.py │ ├── test_gowalla_dataset.py │ ├── test_mts_kion_dataset.py │ ├── test_netflix.py │ └── test_yelp_dataset.py ├── generate_actions.py ├── lossess │ ├── __init__.py │ ├── bce_bad_sample.py │ ├── test_bce_loss.py │ ├── test_bpr_loss.py │ ├── test_climf_loss.py │ ├── test_items_masking_proxy_loss.py │ ├── test_lambdarank_loss.py │ ├── test_lambdarank_time.py │ ├── test_logit_norm.py │ ├── test_softmax_crossentropy.py │ ├── test_top1loss.py │ └── test_xendcg_loss.py ├── metrics │ ├── __init__.py │ ├── test_map.py │ ├── test_mrr.py │ ├── test_ndcg.py │ ├── test_pairwise_cos_sim.py │ ├── test_precision.py │ └── test_proxy_metric.py ├── misc │ ├── __init__.py │ ├── test_configs.py │ ├── test_evaluate_recommender.py │ ├── test_item_id.py │ ├── test_keras_ndcg.py │ ├── test_kion_challenge_featurizer.py │ ├── test_n_actions_for_user.py │ ├── test_recommender_evaluator.py │ └── test_split_actions.py ├── ml_sequences.py └── recommenders │ ├── __init__.py │ ├── baselines │ ├── __init__.py │ ├── test_conditional_top_recommender.py │ ├── test_constnat_recommender.py │ ├── test_deepmf.py │ ├── test_filter_seen_recommender.py │ ├── test_first_order_mc_recommender.py │ ├── test_item_item_recommender.py │ ├── test_lightfm_recommender.py │ ├── test_matrix_factorization_recommender.py │ ├── test_mlp_historical.py │ ├── test_mlp_recommender.py │ ├── test_svd_recommender.py │ ├── test_top_recommender.py │ └── test_transition_chain_recommender.py │ ├── sequential │ ├── __init__.py │ ├── bert4rec │ │ ├── __init__.py │ │ ├── test_bert4rec_ft.py │ │ └── test_full_bert.py │ ├── sasrec │ │ ├── __init__.py │ │ ├── test_positional_encoding.py │ │ ├── test_sasrec_attention_map.py │ │ ├── test_sasrec_full_target.py │ │ ├── test_sasrec_model.py │ │ ├── test_sasrec_no_embedding_reuse.py │ │ ├── test_sasrec_rss.py │ │ ├── test_sasrec_save_model.py │ │ └── test_vanilla_sasrec.py │ ├── test_add_mask_vectorizer.py │ ├── test_caser_no_uid.py │ ├── test_gru_model.py │ ├── test_items_masking_target_builder.py │ └── test_target_splitters.py │ ├── test_lambdamart_ensemble_recommender.py │ └── test_vanilla_bert4rec.py ├── ui ├── config.py ├── server.py └── static │ ├── app.js │ ├── index.html │ ├── typeahead.css │ └── typeahead.js └── utils ├── generator_limit.py ├── item_id.py └── os_utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | data/ 2 | data 3 | .idea/ 4 | evaluation/results/ 5 | evaluation/results 6 | *__pycache__/ 7 | *.ipynb_checkpoints/ 8 | *.DS_Store 9 | -------------------------------------------------------------------------------- /__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/asash/gsasrec/9dd47e83b949a84f68a4616c6f779df7f17ebb26/__init__.py -------------------------------------------------------------------------------- /api/action.py: -------------------------------------------------------------------------------- 1 | import json 2 | class Action(object): 3 | def __init__(self, user_id, item_id, timestamp, data=None): 4 | if data is None: 5 | data = dict() 6 | self.user_id = user_id 7 | self.item_id = item_id 8 | self.data = data 9 | self.timestamp = timestamp 10 | 11 | def to_str(self): 12 | result = "Action(uid={}, item={}, ts={}".format( 13 | self.user_id, 14 | self.item_id, 15 | self.timestamp) 16 | if self.data != {}: 17 | result += ", data={}".format(str(self.data)) 18 | result += ")" 19 | return result 20 | 21 | def to_json(self): 22 | try: 23 | #check if data is json serializable 24 | json.dumps(self.data) 25 | data = self.data 26 | 27 | except (TypeError, ValueError): 28 | #fallback to just string representation 29 | #TODO: restore may work incorrectly with some datasets 30 | data = str(self.data) 31 | 32 | return json.dumps({ 33 | "user_id": self.user_id, 34 | "item_id": self.item_id, 35 | "data": data, 36 | "timestamp": self.timestamp 37 | }) 38 | 39 | @staticmethod 40 | def from_json(action_str): 41 | doc = json.loads(action_str) 42 | return Action(doc["user_id"], doc["item_id"], doc["timestamp"], doc["data"]) 43 | 44 | def __str__(self): 45 | return self.to_str() 46 | 47 | def __repr__(self): 48 | return self.to_str() 49 | 50 | -------------------------------------------------------------------------------- /api/catalog.py: -------------------------------------------------------------------------------- 1 | def relevancy(keyword, string): 2 | if keyword.lower() == string.lower(): 3 | return -1 4 | return keyword.lower().find(string.lower()) 5 | 6 | class Catalog(object): 7 | def __init__(self): 8 | self.items = {} 9 | 10 | def add_item(self, item): 11 | self.items[item.item_id] = item 12 | 13 | def get_item(self, item_id): 14 | return self.items[item_id] 15 | 16 | def search(self, keyword): 17 | result = [] 18 | for item in self.items.values(): 19 | if keyword.lower() in item.title.lower(): 20 | result.append(item) 21 | result.sort(key=lambda value: relevancy(keyword, value.title)) 22 | return result 23 | 24 | -------------------------------------------------------------------------------- /api/item.py: -------------------------------------------------------------------------------- 1 | class Item(object): 2 | tags = None 3 | title = None 4 | 5 | def __init__(self, item_id, cat_features=None, real_features=None): 6 | if real_features is None: 7 | real_features = [] 8 | 9 | if cat_features is None: 10 | cat_features = [] 11 | 12 | self.item_id = item_id 13 | self.cat_features = cat_features 14 | self.real_features = real_features 15 | 16 | 17 | def with_tags(self, tags): 18 | self.tags = tags 19 | return self 20 | 21 | def with_title(self, title): 22 | self.title = title 23 | return self 24 | 25 | def __str__(self): 26 | return "item id={} title={} tags={}".format(self.item_id, self.title, self.tags) 27 | 28 | def __repr__(self): 29 | return self.__str__() 30 |
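A quick round-trip check for the JSON helpers in action.py — an illustrative sketch, not a file from the repository; it assumes the aprec package root is on PYTHONPATH:

    from aprec.api.action import Action

    # serialize an action together with its extra payload, then restore it
    action = Action(user_id="u1", item_id="i42", timestamp=1700000000, data={"rating": 5.0})
    restored = Action.from_json(action.to_json())
    assert restored.item_id == "i42" and restored.timestamp == 1700000000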
-------------------------------------------------------------------------------- /api/items_ranking_request.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | 4 | class ItemsRankingRequest(object): 5 | def __init__(self, user_id, item_ids: List): 6 | self.user_id = user_id 7 | self.item_ids = item_ids 8 | 9 | def __str__(self): 10 | return f"user_id={self.user_id} item_ids=[{','.join(self.item_ids)}]" 11 | 12 | def __repr__(self): 13 | return self.__str__() 14 | -------------------------------------------------------------------------------- /api/sequential_dataset.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | from pathlib import PosixPath 3 | from typing import List 4 | 5 | import numpy as np 6 | from aprec.api.action import Action 7 | from aprec.utils.item_id import ItemId 8 | from aprec.utils.os_utils import mkdir_p 9 | 10 | 11 | class MapedSequences(object): 12 | ALL_SEQUENCES = 'all_sequences.mmap' 13 | BORDERS = 'borders.mmap' 14 | 15 | def __init__(self, directory, n_users, n_items): 16 | self.directory = directory 17 | self.is_maped = False 18 | 19 | self.sequences = None 20 | self.borders = None 21 | 22 | @staticmethod 23 | def build(user_actions, n_users, n_items, directory:PosixPath): 24 | all_sequences = [] 25 | borders = [] 26 | for i in range(n_users): 27 | user_sequence = [] 28 | for action in user_actions[i]: 29 | item = action[0] 30 | user_sequence.append(item) 31 | all_sequences += user_sequence 32 | borders.append(len(all_sequences)) 33 | all_sequences = np.array(all_sequences, dtype='int32') 34 | sequences_map = np.memmap(directory/MapedSequences.ALL_SEQUENCES, shape=all_sequences.shape, dtype='int32', mode="write") 35 | sequences_map[:] = all_sequences[:] 36 | sequences_map.flush() 37 | borders = np.array(borders, dtype='int32') 38 | borders_map = np.memmap(directory/MapedSequences.BORDERS, shape=borders.shape, dtype='int32', mode="write") 39 | borders_map[:] = borders[:] 40 | borders_map.flush() 41 | 42 | class SequentialDataset(object): 43 | def __init__(self): 44 | self.user_mapping = ItemId() 45 | self.item_mapping = ItemId() 46 | self.user_actions = defaultdict(list) 47 | self.is_sorted = True 48 | 49 | def add_action(self, action): 50 | user_id = self.user_mapping.get_id(action.user_id) 51 | item_id = self.item_mapping.get_id(action.item_id) 52 | 53 | #user_actions stores (item_id, timestamp) tuples 54 | if len(self.user_actions[user_id]) > 0 and self.user_actions[user_id][-1][1] > action.timestamp: 55 | self.is_sorted = False 56 | 57 | self.user_actions[user_id].append((item_id, action.timestamp)) 58 | 59 | def sort(self): 60 | if not self.is_sorted: 61 | for user in self.user_actions: 62 | self.user_actions[user].sort(key=lambda a: a[1]) 63 | self.is_sorted = True 64 | 65 | 66 | -------------------------------------------------------------------------------- /api/user.py: -------------------------------------------------------------------------------- 1 | class User(object): 2 | def __init__(self, user_id, cat_features=None, real_features=None): 3 | if real_features is None: 4 | real_features = dict() 5 | 6 | if cat_features is None: 7 | cat_features = dict() 8 | 9 | self.user_id = user_id 10 | self.cat_features = cat_features 11 | self.real_features = real_features 12 | -------------------------------------------------------------------------------- /datasets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asash/gsasrec/9dd47e83b949a84f68a4616c6f779df7f17ebb26/datasets/__init__.py -------------------------------------------------------------------------------- /datasets/amazon.py: -------------------------------------------------------------------------------- 1 | from aprec.api.action import
Action 2 | from aprec.datasets.download_file import download_file 3 | 4 | 5 | URLS = {"books": "http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/ratings_Books.csv"} 6 | DATA_DIR = "data/amazon" 7 | 8 | def download(category): 9 | filename = download_file(URLS[category], f"{category}.csv", DATA_DIR) 10 | return filename 11 | 12 | def get_amazon_actions(category): 13 | filename = download(category) 14 | result = [] 15 | for line in open(filename): 16 | user_id, item_id, rating, timestamp = line.strip().split(",") 17 | rating = float(rating) 18 | timestamp = int(timestamp) 19 | result.append(Action(user_id, item_id, timestamp, {"rating": rating})) 20 | return result -------------------------------------------------------------------------------- /datasets/beauty.py: -------------------------------------------------------------------------------- 1 | from aprec.api.action import Action 2 | from aprec.datasets.download_file import download_file 3 | 4 | dataset_url="http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/ratings_Beauty.csv" 5 | dataset = "ratings.csv" 6 | dir = "data/beauty" 7 | 8 | def get_beauty_dataset(): 9 | dataset_filename = download_file(dataset_url, dataset, dir) 10 | actions = [] 11 | with open(dataset_filename) as input: 12 | for line in input: 13 | user, item, rating, timestamp = line.strip().split(",") 14 | timestamp = int(timestamp) 15 | actions.append(Action(user, item, timestamp)) 16 | return actions -------------------------------------------------------------------------------- /datasets/bert4rec_datasets.py: -------------------------------------------------------------------------------- 1 | from aprec.api.action import Action 2 | from aprec.datasets.download_file import download_file 3 | 4 | BERT4REC_DATASET_URL="https://raw.githubusercontent.com/asash/BERT4rec_py3_tf2/master/BERT4rec/data/{}.txt" 5 | BERT4REC_DIR = "data/bert4rec" 6 | VALID_DATASETS={"beauty", "ml-1m", "steam"} 7 | 8 | def get_bert4rec_dataset(dataset): 9 | if dataset not in VALID_DATASETS: 10 | raise ValueError(f"unknown bert4rec dataset {dataset}") 11 | dataset_filename = download_file(BERT4REC_DATASET_URL.format(dataset), dataset + ".txt", BERT4REC_DIR) 12 | actions = [] 13 | prev_user = None 14 | current_timestamp = 0 15 | with open(dataset_filename) as input: 16 | for line in input: 17 | user, item = [str(id) for id in line.strip().split()] 18 | if user != prev_user: 19 | current_timestamp = 0 20 | prev_user = user 21 | current_timestamp += 1 22 | actions.append(Action(user, item, current_timestamp)) 23 | return actions -------------------------------------------------------------------------------- /datasets/dataset_utils.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | import gzip 3 | import logging 4 | import os 5 | import mmh3 6 | 7 | from aprec.utils.os_utils import get_dir, mkdir_p, shell 8 | 9 | 10 | 11 | def filter_popular_items(actions_generator, max_actions): 12 | actions = [] 13 | items_counter = Counter() 14 | for action in actions_generator: 15 | actions.append(action) 16 | items_counter[action.item_id] += 1 17 | popular_items = set([item_id for (item_id, cnt) in items_counter.most_common(max_actions)]) 18 | return filter(lambda action: action.item_id in popular_items, actions) 19 | 20 | def filter_cold_users(actions_generator, min_actions_per_user = 0): 21 | actions = [] 22 | user_counter = Counter() 23 | for action in actions_generator: 24 | actions.append(action) 25 | 
user_counter[action.user_id] += 1 26 | return filter(lambda action: user_counter[action.user_id] >= min_actions_per_user, actions) 27 | 28 | def take_user_fraction(actions_generator, fraction): 29 | return filter(lambda action: abs(mmh3.hash(action.user_id) / 2**31) < fraction, actions_generator) 30 | 31 | def unzip(zipped_file, unzip_dir): 32 | full_dir_name = os.path.join(get_dir(), unzip_dir) 33 | if os.path.isdir(full_dir_name): 34 | logging.info(f"{unzip_dir} already exists, skipping") 35 | else: 36 | mkdir_p(full_dir_name) 37 | shell(f"unzip -o {zipped_file} -d {full_dir_name}") 38 | return full_dir_name 39 | 40 | def gunzip(gzip_file): 41 | full_file_name = os.path.abspath(gzip_file) 42 | if not(gzip_file.endswith(".gz")): 43 | raise Exception(f"{gzip_file} is not a gzip file") 44 | unzipped_file_name = full_file_name[:-3] 45 | if os.path.isfile(unzipped_file_name): 46 | logging.info(f"{unzipped_file_name} already exists, skipping") 47 | return unzipped_file_name 48 | 49 | with gzip.open(full_file_name) as input: 50 | data = input.read() 51 | with open(unzipped_file_name, 'wb') as output: 52 | output.write(data) 53 | return unzipped_file_name 54 |
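The helpers above are plain generator filters, so they compose; a minimal sketch (not a repository file, thresholds chosen arbitrarily) using the beauty dataset loader defined earlier:

    from aprec.datasets.beauty import get_beauty_dataset
    from aprec.datasets.dataset_utils import filter_cold_users, take_user_fraction

    # keep a deterministic 10% sample of users, then drop users with fewer than 5 actions
    actions = take_user_fraction(get_beauty_dataset(), 0.1)
    warm_actions = list(filter_cold_users(actions, min_actions_per_user=5))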
-------------------------------------------------------------------------------- /datasets/download_file.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | 4 | import requests 5 | from tqdm import tqdm 6 | 7 | from aprec.utils.os_utils import mkdir_p_local, get_dir 8 | def download_file(url, filename, data_dir): 9 | mkdir_p_local(data_dir) 10 | full_filename = os.path.join(get_dir(), data_dir, filename) 11 | if not os.path.isfile(full_filename): 12 | logging.info(f"downloading {filename} file") 13 | response = requests.get(url, stream=True) 14 | with open(full_filename, 'wb') as out_file: 15 | expected_length = int(response.headers.get('content-length', 0)) 16 | downloaded_bytes = 0 17 | with tqdm(total=expected_length, ascii=True) as pbar: 18 | for chunk in response.iter_content(chunk_size=1024): 19 | out_file.write(chunk) 20 | out_file.flush() 21 | pbar.update(len(chunk)) 22 | logging.info(f"{filename} dataset downloaded") 23 | else: 24 | logging.info(f"{filename} file already exists, skipping") 25 | return full_filename -------------------------------------------------------------------------------- /datasets/gowalla.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import time 3 | import dateutil.parser 4 | 5 | from aprec.datasets.dataset_utils import gunzip 6 | from aprec.datasets.download_file import download_file 7 | from aprec.api.action import Action 8 | 9 | 10 | GOWALLA_DATASET_URL='https://snap.stanford.edu/data/loc-gowalla_totalCheckins.txt.gz' 11 | DIR="data/gowalla" 12 | GOWALLA_GZIPPED="gowalla.txt.gz" 13 | 14 | def prepare_data(): 15 | gowalla_file_zipped = download_file(GOWALLA_DATASET_URL,GOWALLA_GZIPPED, DIR) 16 | unzipped_gowalla_file = gunzip(gowalla_file_zipped) 17 | return unzipped_gowalla_file 18 | 19 | def parse_line(line): 20 | user_id, timestamp, lat, lon, item_id = line.split("\t") 21 | timestamp = time.mktime(dateutil.parser.isoparse(timestamp).timetuple()) 22 | return Action(user_id, item_id, timestamp) 23 | 24 | def get_gowalla_dataset(max_actions=None): 25 | dataset_file = prepare_data() 26 | actions = [] 27 | for line in open(dataset_file): 28 | actions.append(parse_line(line.strip())) 29 | if max_actions is not None and len(actions) >= max_actions: 30 | break 31 | return actions -------------------------------------------------------------------------------- /datasets/movielens100k.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | 4 | from aprec.utils.os_utils import get_dir, console_logging, shell 5 | from aprec.api.action import Action 6 | from aprec.datasets.download_file import download_file 7 | from requests.exceptions import ConnectionError 8 | 9 | DATASET_NAME = 'ml-100k' 10 | MOVIELENS_URL = "http://files.grouplens.org/datasets/movielens/{}.zip".format(DATASET_NAME) 11 | MOVIELENS_BACKUP_URL = "https://web.archive.org/web/20220128015818/https://files.grouplens.org/datasets/movielens/ml-100k.zip" 12 | MOVIELENS_DIR = "data/movielens100k" 13 | MOVIELENS_FILE = "movielens.zip" 14 | MOVIELENS_FILE_ABSPATH = os.path.join(get_dir(), MOVIELENS_DIR, MOVIELENS_FILE) 15 | MOVIELENS_DIR_ABSPATH = os.path.join(get_dir(), MOVIELENS_DIR) 16 | RATINGS_FILE = os.path.join(MOVIELENS_DIR_ABSPATH, 'u.data') 17 | 18 | 19 | def extract_movielens_dataset(): 20 | if os.path.isfile(RATINGS_FILE): 21 | logging.info("movielens dataset is already extracted") 22 | return 23 | shell("unzip -o {} -d {}".format(MOVIELENS_FILE_ABSPATH, MOVIELENS_DIR_ABSPATH)) 24 | dataset_dir = os.path.join(MOVIELENS_DIR_ABSPATH, DATASET_NAME) 25 | for filename in os.listdir(dataset_dir): 26 | shell("mv {} {}".format(os.path.join(dataset_dir, filename), MOVIELENS_DIR_ABSPATH)) 27 | shell("rm -rf {}".format(dataset_dir)) 28 | 29 | 30 | def prepare_data(): 31 | try: 32 | download_file(MOVIELENS_URL, MOVIELENS_FILE, MOVIELENS_DIR) 33 | except ConnectionError: 34 | download_file(MOVIELENS_BACKUP_URL, MOVIELENS_FILE, MOVIELENS_DIR) 35 | 36 | extract_movielens_dataset() 37 | 38 | 39 | def get_movielens100k_actions(min_rating=4.0): 40 | prepare_data() 41 | with open(RATINGS_FILE, 'r') as data_file: 42 | for line in data_file: 43 | user_id, movie_id, rating_str, timestamp_str = line.strip().split('\t') 44 | rating = float(rating_str) 45 | timestamp = int(timestamp_str) 46 | if rating >= min_rating: 47 | yield Action(user_id, movie_id, timestamp, {"rating": rating}) 48 | 49 | 50 | if __name__ == "__main__": 51 | console_logging() 52 | prepare_data() 53 | -------------------------------------------------------------------------------- /docker/.gitignore: -------------------------------------------------------------------------------- 1 | data/ 2 | -------------------------------------------------------------------------------- /docker/.vimrc: -------------------------------------------------------------------------------- 1 | set nu 2 | colorscheme elflord 3 | 4 | filetype plugin on 5 | set expandtab 6 | set tabstop=4 7 | set smarttab 8 | set shiftwidth=4 9 | set smartindent 10 | set nocompatible 11 | syntax on 12 | -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | from ubuntu:jammy 2 | run unset https_proxy 3 | add ./sources.list /etc/apt/sources.list 4 | add ./cuda-archive-keyring.gpg /usr/share/keyrings/cuda-archive-keyring.gpg 5 | run apt-get update 6 | ARG DEBIAN_FRONTEND=noninteractive 7 | run apt-get install -y apt-utils 8 | run apt-get install -y ca-certificates 9 | run apt-get install -y vim 10 | 11 | run echo "deb [signed-by=/usr/share/keyrings/cuda-archive-keyring.gpg] https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/ /" > 
/etc/apt/sources.list.d/cuda-ubuntu2204-x86_64.list 12 | run apt-get update 13 | 14 | 15 | run apt-get install -y wget curl vim gcc git openssl\ 16 | htop atop screen locales tmux mc sudo 17 | 18 | run apt-get install -y cuda-toolkit-11.7 19 | run apt-get install -y libcudnn8=8.5.0.96-1+cuda11.7 20 | run apt-mark hold libcudnn8 21 | 22 | #run apt-get install -y nvidia-utils-520 23 | run apt-get install -y pip 24 | run apt-get install -y expect 25 | run apt-get install -y zsh 26 | run apt-get install -y curl 27 | 28 | run locale-gen "en_US.UTF-8" 29 | run dpkg-reconfigure locales 30 | run update-locale LANG=en_US.UTF-8 LANGUAGE=en.UTF-8 31 | run mkdir -p /home/aprec/Projects/aprec 32 | run useradd aprec 33 | run chown -R aprec:aprec /home/aprec/ 34 | run usermod -aG sudo aprec 35 | run usermod --password $(openssl passwd -1 lambdarank) aprec 36 | run echo "aprec ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers 37 | user aprec 38 | 39 | SHELL ["/bin/zsh", "-c"] 40 | RUN sh -c "$(wget -O- https://github.com/deluan/zsh-in-docker/releases/download/v1.1.4/zsh-in-docker.sh)" -- \ 41 | -t robbyrussell 42 | 43 | 44 | ENV SHELL=/bin/zsh 45 | ENV NVIDIA_VISIBLE_DEVICES all 46 | ENV NVIDIA_DRIVER_CAPABILITIES compute,utility 47 | add .vimrc /home/aprec/.vimrc 48 | add .gitconfig /home/aprec/.gitconfig 49 | # 50 | ##make /bin/sh symlink to zsh instead of dash: 51 | user root 52 | RUN echo "dash dash/sh boolean false" | debconf-set-selections 53 | RUN DEBIAN_FRONTEND=noninteractive dpkg-reconfigure dash 54 | user aprec 55 | ENV ENV=/home/aprec/.profile 56 | ENV PATH=$PATH:/home/aprec/.local/bin 57 | 58 | workdir /home/aprec/Projects/ 59 | ENV PYTHONPATH=/home/aprec/Projects 60 | 61 | add requirements.txt /tmp/aprec_requirements.txt 62 | run pip3 install -r /tmp/aprec_requirements.txt 63 | 64 | cmd zsh 65 | -------------------------------------------------------------------------------- /docker/cuda-archive-keyring.gpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asash/gsasrec/9dd47e83b949a84f68a4616c6f779df7f17ebb26/docker/cuda-archive-keyring.gpg -------------------------------------------------------------------------------- /docker/requirements.txt: -------------------------------------------------------------------------------- 1 | jupyter>=1.0.0 2 | tqdm>=4.62.3 3 | requests>=2.26.0 4 | pandas>=1.5.2 5 | scipy>=1.6.0 6 | tornado>=6.1 7 | scikit-learn>=1.0 8 | lightgbm>=3.3.0 9 | mmh3>=3.0.0 10 | matplotlib>=3.6.2 11 | seaborn>=0.12.1 12 | jupyterlab>=3.2.2 13 | transformers>=4.25.1 14 | wget>=3.2 15 | pytest>=7.1.2 16 | pytest-forked>=1.4.0 17 | multiprocessing_on_dill>=3.5.0a4 18 | ujson>=5.5.0 19 | faiss-gpu>=1.7.2 20 | tensorflow-gpu>=2.11.0 21 | tensorflow-probability>=0.18.0 22 | git+https://github.com/asash/lightfm.git@main 23 | -------------------------------------------------------------------------------- /docker/sources.list: -------------------------------------------------------------------------------- 1 | deb mirror://mirrors.ubuntu.com/mirrors.txt jammy main restricted 2 | deb mirror://mirrors.ubuntu.com/mirrors.txt jammy-updates main restricted 3 | deb mirror://mirrors.ubuntu.com/mirrors.txt jammy universe 4 | deb mirror://mirrors.ubuntu.com/mirrors.txt jammy-updates universe 5 | deb mirror://mirrors.ubuntu.com/mirrors.txt jammy multiverse 6 | deb mirror://mirrors.ubuntu.com/mirrors.txt jammy-updates multiverse 7 | deb mirror://mirrors.ubuntu.com/mirrors.txt jammy-backports main restricted universe multiverse 8 | 9 | deb 
mirror://mirrors.ubuntu.com/mirrors.txt jammy-security main restricted 10 | deb mirror://mirrors.ubuntu.com/mirrors.txt jammy-security universe 11 | deb mirror://mirrors.ubuntu.com/mirrors.txt jammy-security multiverse 12 | #http://security.ubuntu.com/ubuntu/ 13 | -------------------------------------------------------------------------------- /evaluation/.gitignore: -------------------------------------------------------------------------------- 1 | booking_data/ 2 | saved/ 3 | .ipynb_checkpoints/ 4 | a.out 5 | log_tensorboard 6 | .DS_Store 7 | ._.DS_Store 8 | -------------------------------------------------------------------------------- /evaluation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asash/gsasrec/9dd47e83b949a84f68a4616c6f779df7f17ebb26/evaluation/__init__.py -------------------------------------------------------------------------------- /evaluation/analyze_results.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import json 3 | import os 4 | from copy import deepcopy 5 | import pandas as pd 6 | 7 | pd.set_option("display.max_rows", None, "display.max_columns", None) 8 | pd.set_option('display.expand_frame_repr', False) 9 | 10 | experiment_file = sys.argv[1] 11 | data = json.load(open(experiment_file)) 12 | 13 | commit_filename = os.path.join(os.path.dirname(experiment_file), "commit") 14 | if os.path.isfile(commit_filename): 15 | with open(commit_filename) as commit_file: 16 | print(commit_file.read()) 17 | 18 | i = 0 19 | for split_fraction in data: 20 | print("="*40) 21 | i += 1 22 | doc = deepcopy(split_fraction) 23 | recommenders = doc['recommenders'] 24 | del(doc['recommenders']) 25 | print("experiment_{}".format(i)) 26 | print(pd.DataFrame([doc]).T) 27 | print("\n") 28 | 29 | experiment_docs = [] 30 | for recommender_name in recommenders: 31 | recommender = recommenders[recommender_name] 32 | recommender['name'] = recommender_name 33 | del(recommender['model_metadata']) 34 | experiment_docs.append(recommender) 35 | 36 | df = pd.DataFrame(experiment_docs) 37 | df = df.sort_values("ndcg@40") 38 | df = df.set_index('name') 39 | print(df) 40 | 41 | 42 | 43 | -------------------------------------------------------------------------------- /evaluation/clean_output.cpp: -------------------------------------------------------------------------------- 1 | #include <stdio.h> 2 | #include <stdlib.h> 3 | 4 | #define BUFFERSIZE 10240 5 | 6 | void print(int * buf, int pos) { 7 | int i ; 8 | for (i = 0; i < pos; ++i) 9 | putchar(buf[i]) ; 10 | putchar('\n') ; 11 | } 12 | 13 | int main(int argc, char* argv[]) 14 | { 15 | int c ; 16 | int buf[BUFFERSIZE] ; 17 | int pos = 0 ; 18 | while((c = getchar()) != EOF) 19 | { 20 | switch (c) 21 | { 22 | case '\b': 23 | { 24 | if (pos > 0) 25 | pos-- ; 26 | break ; 27 | } 28 | case '\n': 29 | { 30 | print(buf, pos); 31 | pos = 0 ; 32 | break ; 33 | } 34 | 35 | case '\r': 36 | { 37 | print(buf, pos); 38 | pos = 0 ; 39 | break ; 40 | } 41 | 42 | default: 43 | { 44 | buf[pos++] = c ; 45 | break ; 46 | } 47 | } 48 | } 49 | return 0 ; 50 | } 51 | -------------------------------------------------------------------------------- /evaluation/conf_intervals.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import gzip 3 | import json 4 | import pandas as pd 5 | import numpy as np 6 | import scipy.stats 7 | 8 | def mean_confidence_interval(data, confidence=0.95): 9 | a = 1.0 * 
np.array(data) 10 | n = len(a) 11 | m, se = np.mean(a), scipy.stats.sem(a) 12 | h = se * scipy.stats.t.ppf((1 + confidence) / 2., n-1) 13 | return m, m-h, m+h 14 | 15 | prediction_file = sys.argv[1] 16 | data = json.load(gzip.open(prediction_file)) 17 | docs = [] 18 | for doc in data: 19 | docs.append(doc['metrics']) 20 | df = pd.DataFrame(docs) 21 | 22 | metrics = [] 23 | for metric in (df.columns): 24 | mean, conf_min, conf_max = mean_confidence_interval(df[metric]) 25 | metric_doc = {'name': metric, 'mean': mean, 'conf_min': conf_min, 'conf_max': conf_max} 26 | metrics.append(metric_doc) 27 | 28 | print(pd.DataFrame(metrics)) 29 | 30 | -------------------------------------------------------------------------------- /evaluation/configs/ML1M-bpr-example.py: -------------------------------------------------------------------------------- 1 | from aprec.recommenders.top_recommender import TopRecommender 2 | from aprec.recommenders.lightfm import LightFMRecommender 3 | from aprec.recommenders.filter_seen_recommender import FilterSeenRecommender 4 | from aprec.evaluation.samplers.pop_sampler import PopTargetItemsSampler 5 | from aprec.evaluation.metrics.mrr import MRR 6 | from aprec.evaluation.metrics.ndcg import NDCG 7 | from aprec.evaluation.split_actions import LeaveOneOut 8 | import numpy as np 9 | 10 | 11 | DATASET = "BERT4rec.ml-1m" 12 | 13 | USERS_FRACTIONS = [1] 14 | 15 | def top_recommender(): 16 | return FilterSeenRecommender(TopRecommender()) 17 | 18 | def lightfm_recommender(k, loss): 19 | return FilterSeenRecommender(LightFMRecommender(k, loss)) 20 | 21 | RECOMMENDERS = { 22 | "top_recommender": top_recommender, 23 | "MF-BPR": lambda: lightfm_recommender(30, 'bpr'), 24 | } 25 | 26 | MAX_TEST_USERS=6040 27 | 28 | METRICS = [NDCG(10), MRR()] 29 | TARGET_ITEMS_SAMPLER = PopTargetItemsSampler(101) 30 | 31 | RECOMMENDATIONS_LIMIT = 100 32 | SPLIT_STRATEGY = LeaveOneOut(MAX_TEST_USERS) 33 | -------------------------------------------------------------------------------- /evaluation/configs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asash/gsasrec/9dd47e83b949a84f68a4616c6f779df7f17ebb26/evaluation/configs/__init__.py -------------------------------------------------------------------------------- /evaluation/configs/gsasrec/gowalla_benchmark.py: -------------------------------------------------------------------------------- 1 | 2 | from aprec.evaluation.split_actions import LeaveOneOut 3 | from aprec.evaluation.configs.gsasrec.common_benchmark_config import * 4 | 5 | DATASET = "gowalla_warm5" 6 | N_VAL_USERS=512 7 | MAX_TEST_USERS=86168 8 | SPLIT_STRATEGY = LeaveOneOut(MAX_TEST_USERS) 9 | RECOMMENDERS = get_recommenders(filter_seen=False) 10 | 11 | if __name__ == "__main__": 12 | 13 | from aprec.tests.misc.test_configs import TestConfigs 14 | TestConfigs().validate_config(__file__) 15 | -------------------------------------------------------------------------------- /evaluation/configs/gsasrec/ml1m_benchmark.py: -------------------------------------------------------------------------------- 1 | from aprec.evaluation.split_actions import LeaveOneOut 2 | from aprec.evaluation.configs.gsasrec.common_benchmark_config import * 3 | 4 | DATASET = "BERT4rec.ml-1m" 5 | N_VAL_USERS=512 6 | MAX_TEST_USERS=6040 7 | SPLIT_STRATEGY = LeaveOneOut(MAX_TEST_USERS) 8 | RECOMMENDERS = get_recommenders(filter_seen=True) 9 | 10 | if __name__ == "__main__": 11 | 12 | from aprec.tests.misc.test_configs import TestConfigs 13 
| TestConfigs().validate_config(__file__) 14 | -------------------------------------------------------------------------------- /evaluation/configs/gsasrec/steam_benchmark.py: -------------------------------------------------------------------------------- 1 | from aprec.evaluation.split_actions import LeaveOneOut 2 | from aprec.evaluation.configs.gsasrec.common_benchmark_config import * 3 | 4 | DATASET = "BERT4rec.steam" 5 | N_VAL_USERS=512 6 | MAX_TEST_USERS=281428 7 | SPLIT_STRATEGY = LeaveOneOut(MAX_TEST_USERS) 8 | RECOMMENDERS = get_recommenders(filter_seen=True) 9 | 10 | if __name__ == "__main__": 11 | from aprec.tests.misc.test_configs import TestConfigs 12 | TestConfigs().validate_config(__file__) 13 | -------------------------------------------------------------------------------- /evaluation/dataset_by_config.py: -------------------------------------------------------------------------------- 1 | import importlib.util 2 | import sys 3 | 4 | spec = importlib.util.spec_from_file_location("config", sys.argv[1]) 5 | config = importlib.util.module_from_spec(spec) 6 | spec.loader.exec_module(config) 7 | sys.stdout.write(config.DATASET) -------------------------------------------------------------------------------- /evaluation/evaluation_utils.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | 3 | 4 | def group_by_user(actions): 5 | result = defaultdict(list) 6 | for action in actions: 7 | result[action.user_id].append(action) 8 | return result -------------------------------------------------------------------------------- /evaluation/filter_cold_start.py: -------------------------------------------------------------------------------- 1 | def filter_cold_start(train_actions, test_actions): 2 | train_user_ids = set() 3 | cold_start_set = set() 4 | non_cold_start = set() 5 | result = [] 6 | 7 | for action in train_actions: 8 | train_user_ids.add(action.user_id) 9 | 10 | for action in test_actions: 11 | if action.user_id in train_user_ids: 12 | non_cold_start.add(action.user_id) 13 | result.append(action) 14 | else: 15 | cold_start_set.add(action.user_id) 16 | print("number of cold start users filtered: {}".format(len(cold_start_set))) 17 | print("number of users in test set: {}".format(len(non_cold_start))) 18 | return result 19 | 20 | -------------------------------------------------------------------------------- /evaluation/metrics/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asash/gsasrec/9dd47e83b949a84f68a4616c6f779df7f17ebb26/evaluation/metrics/__init__.py -------------------------------------------------------------------------------- /evaluation/metrics/average_popularity_rank.py: -------------------------------------------------------------------------------- 1 | from .metric import Metric 2 | from collections import Counter 3 | 4 | class AveragePopularityRank(Metric): 5 | def __init__(self, k, actions): 6 | self.name = "apr@{}".format(k) 7 | self.k = k 8 | cnt = Counter() 9 | for action in actions: 10 | cnt[action.item_id] += 1 11 | 12 | self.pop_rank = {} 13 | rank = 0 14 | for item, _ in cnt.most_common(): 15 | rank += 1 16 | self.pop_rank[item] = rank 17 | 18 | 19 | def __call__(self, recommendations, actual_actions): 20 | cnt = 0 21 | s = 0 22 | for recommendation in recommendations[:self.k]: 23 | item_id = recommendation[0] 24 | if item_id in self.pop_rank: 25 | s += self.pop_rank[item_id] 26 | cnt += 1 27 | if cnt
== 0: 28 | return 0 29 | return s/cnt 30 | -------------------------------------------------------------------------------- /evaluation/metrics/entropy.py: -------------------------------------------------------------------------------- 1 | from aprec.evaluation.metrics.metric import Metric 2 | from scipy.special import softmax 3 | from scipy.stats import entropy 4 | import numpy as np 5 | 6 | def sigmoid(x): 7 | return np.exp(-np.logaddexp(0, -x)) 8 | 9 | class Entropy(Metric): 10 | def __init__(self, activation, k): 11 | self.name = f"{activation}Entropy@{k}" 12 | if activation == 'Softmax': 13 | self.activation = softmax 14 | elif activation == 'Sigmoid': 15 | self.activation = sigmoid 16 | else: 17 | raise Exception(f"unknown activation {activation}") 18 | self.k = k 19 | 20 | 21 | def __call__(self, recommendations, actual_actions): 22 | if len(recommendations) == 0: 23 | return 0 24 | scores = self.activation(np.array([rec[1] for rec in recommendations[:self.k]])) 25 | scores = scores/np.sum(scores) #normalize, so that we can treat them as probs 26 | return entropy(scores, base=2) / len(scores) -------------------------------------------------------------------------------- /evaluation/metrics/highest_score.py: -------------------------------------------------------------------------------- 1 | from aprec.evaluation.metrics.metric import Metric 2 | 3 | 4 | class HighestScore(Metric): 5 | def __init__(self): 6 | self.name = "HighestScore" 7 | 8 | def __call__(self, recommendations, actual_actions): 9 | if len(recommendations) == 0: 10 | return 0 11 | return recommendations[0][1] -------------------------------------------------------------------------------- /evaluation/metrics/hit.py: -------------------------------------------------------------------------------- 1 | from .metric import Metric 2 | 3 | class HIT(Metric): 4 | """ 5 | Short-Term Prediction Success 6 | Equals 1 if recommender system was able to predict next item in sequence, 0 otherwise. 
7 | """ 8 | def __init__(self, k): 9 | self.name = "HIT@{}".format(k) 10 | self.k = k 11 | 12 | def __call__(self, recommendations, actual_actions): 13 | if(len(recommendations) == 0): 14 | return 0 15 | action_to_check = actual_actions[0] 16 | for action in actual_actions[1:]: 17 | if action.timestamp < action_to_check.timestamp: 18 | action_to_check = action 19 | recommended = set([recommendation[0] for recommendation in recommendations[:self.k]]) 20 | return 1 if action_to_check.item_id in recommended else 0 21 | -------------------------------------------------------------------------------- /evaluation/metrics/map.py: -------------------------------------------------------------------------------- 1 | import math 2 | from .metric import Metric 3 | 4 | class MAP(Metric): 5 | def __init__(self, k): 6 | self.name = f"MAP@{k}" 7 | self.k = k 8 | 9 | def __call__(self, recommendations, actual_actions): 10 | if(len(recommendations) == 0 or len(actual_actions) == 0): 11 | return 0 12 | actual_set = set([action.item_id for action in actual_actions]) 13 | correct_predictions = 0 14 | running_sum = 0 15 | for i in range(len(recommendations[:self.k])): 16 | pos = i + 1 17 | predicted = recommendations[i][0] 18 | if predicted in actual_set: 19 | correct_predictions += 1 20 | running_sum += correct_predictions/pos 21 | pass 22 | return running_sum / len(actual_actions) 23 | 24 | 25 | -------------------------------------------------------------------------------- /evaluation/metrics/metric.py: -------------------------------------------------------------------------------- 1 | class Metric(object): 2 | less_is_better = False 3 | def __init__(self): 4 | self.name == "undefined" 5 | 6 | def get_name(self) -> str: 7 | return self.name 8 | 9 | def __call__(self, recommendations, actual): 10 | raise NotImplementedError 11 | -------------------------------------------------------------------------------- /evaluation/metrics/model_confidence.py: -------------------------------------------------------------------------------- 1 | from aprec.evaluation.metrics.metric import Metric 2 | from scipy.special import softmax 3 | import numpy as np 4 | 5 | def sigmoid(x): 6 | return np.exp(-np.logaddexp(0, -x)) 7 | 8 | class Confidence(Metric): 9 | def __init__(self, activation): 10 | self.name = f"{activation}Confidence" 11 | if activation == 'Softmax': 12 | self.activation = softmax 13 | elif activation == 'Sigmoid': 14 | self.activation = sigmoid 15 | else: 16 | raise Exception(f"unknown activation {activation}") 17 | 18 | 19 | def __call__(self, recommendations, actual_actions): 20 | if len(recommendations) == 0: 21 | return 0 22 | scores = np.array([rec[1] for rec in recommendations]) 23 | return self.activation(scores)[0] -------------------------------------------------------------------------------- /evaluation/metrics/mrr.py: -------------------------------------------------------------------------------- 1 | import math 2 | from .metric import Metric 3 | 4 | class MRR(Metric): 5 | def __init__(self): 6 | self.name = "MRR" 7 | 8 | def __call__(self, recommendations, actual_actions): 9 | if(len(recommendations) == 0): 10 | return 0 11 | actual_set = set([action.item_id for action in actual_actions]) 12 | for i in range(len(recommendations)): 13 | if recommendations[i][0] in actual_set: 14 | return 1/(i + 1) 15 | return 0 16 | -------------------------------------------------------------------------------- /evaluation/metrics/ndcg.py: 
-------------------------------------------------------------------------------- 1 | import math 2 | from .metric import Metric 3 | 4 | class NDCG(Metric): 5 | def __init__(self, k): 6 | self.name = "ndcg@{}".format(k) 7 | self.k = k 8 | 9 | def __call__(self, recommendations, actual_actions): 10 | if(len(recommendations) == 0): 11 | return 0 12 | actual_set = set([action.item_id for action in actual_actions]) 13 | recommended = [recommendation[0] for recommendation in recommendations[:self.k]] 14 | cool = set(recommended).intersection(actual_set) 15 | if len(cool) == 0: 16 | return 0 17 | ideal_rec = sorted(recommended, key = lambda x: not(x in actual_set)) 18 | return NDCG.dcg(recommended, actual_set)/NDCG.dcg(ideal_rec, actual_set) 19 | 20 | 21 | @staticmethod 22 | def dcg(id_list, relevant_id_set): 23 | result = 0.0 24 | for idx in range(len(id_list)): 25 | i = idx + 1 26 | if (id_list[idx]) in relevant_id_set: 27 | result += 1 / math.log2(i+1) 28 | return result 29 | 30 | 31 | 32 | 33 | -------------------------------------------------------------------------------- /evaluation/metrics/pairwise_cos_sim.py: -------------------------------------------------------------------------------- 1 | from collections import Counter, defaultdict 2 | import random 3 | 4 | from aprec.evaluation.metrics.metric import Metric 5 | import numpy as np 6 | from tqdm import tqdm 7 | 8 | 9 | class PairwiseCosSim(Metric): 10 | def __init__(self, actions, k): 11 | print("init pairwise_cos_sim...") 12 | self.name = "pairwise_cos_sim@{}".format(k) 13 | self.k = k 14 | self.max_actions_per_user = 500 15 | self.max_users = 500 16 | self.item_cnt = Counter() 17 | self.pair_cnt = Counter() 18 | 19 | user_sets = defaultdict(list) 20 | 21 | for action in actions: 22 | user_sets[action.user_id].append(action.item_id) 23 | 24 | for user_id in np.random.choice(list(user_sets.keys()), min(self.max_users, len(user_sets)), replace=False): 25 | random.shuffle(user_sets[user_id]) 26 | for item1 in user_sets[user_id][:self.max_actions_per_user]: 27 | self.item_cnt[item1] += 1 28 | for item2 in user_sets[user_id][:self.max_actions_per_user]: 29 | if item1 != item2: 30 | self.pair_cnt[(item1, item2)] += 1 31 | self.item_cnt = dict(self.item_cnt) 32 | self.pair_cnt = dict(self.pair_cnt) 33 | print("init done...") 34 | 35 | def __call__(self, recommendations, actual_actions): 36 | items = [recommendation[0] for recommendation in recommendations[:self.k]] 37 | pairs = 0 38 | s = 0 39 | for item1 in items: 40 | for item2 in items: 41 | if (item1 != item2): 42 | pairs += 1 43 | if (item1, item2) in self.pair_cnt: 44 | s += self.pair_cnt[(item1, item2)] ** 2 / (self.item_cnt[item1] * self.item_cnt[(item2)]) 45 | if pairs == 0: return 0 46 | return s/pairs 47 | 48 | 49 | 50 | -------------------------------------------------------------------------------- /evaluation/metrics/precision.py: -------------------------------------------------------------------------------- 1 | from .metric import Metric 2 | 3 | class Precision(Metric): 4 | def __init__(self, k): 5 | self.name = "precision@{}".format(k) 6 | self.k = k 7 | 8 | def __call__(self, recommendations, actual_actions): 9 | if len(recommendations) == 0: 10 | return 0 11 | actual_set = set([action.item_id for action in actual_actions]) 12 | recommended = set([recommendation[0] for recommendation in recommendations[:self.k]]) 13 | cool = recommended.intersection(actual_set) 14 | return len(cool) / len(recommended) 15 | 
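All of these metrics share one call convention: recommendations are (item_id, score) pairs sorted by descending score, and the ground truth is a list of Action objects. A toy sketch (illustrative only, not repository code) showing it:

    from aprec.api.action import Action
    from aprec.evaluation.metrics.ndcg import NDCG
    from aprec.evaluation.metrics.precision import Precision

    recommendations = [("a", 0.9), ("b", 0.5), ("c", 0.1)]
    actual = [Action("u1", "a", timestamp=1), Action("u1", "c", timestamp=2)]
    print(NDCG(3)(recommendations, actual))      # ~0.92: relevant "c" is ranked below irrelevant "b"
    print(Precision(2)(recommendations, actual)) # 0.5: one relevant item in the top 2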
-------------------------------------------------------------------------------- /evaluation/metrics/recall.py: -------------------------------------------------------------------------------- 1 | from .metric import Metric 2 | 3 | class Recall(Metric): 4 | def __init__(self, k): 5 | self.name = "recall@{}".format(k) 6 | self.k = k 7 | 8 | def __call__(self, recommendations, actual_actions): 9 | if len(recommendations) == 0: 10 | return 0 11 | actual_set = set([action.item_id for action in actual_actions]) 12 | recommended = set([recommendation[0] for recommendation in recommendations[:self.k]]) 13 | cool = recommended.intersection(actual_set) 14 | return len(cool) / len(actual_set) 15 | -------------------------------------------------------------------------------- /evaluation/metrics/sampled_proxy_metric.py: -------------------------------------------------------------------------------- 1 | import random 2 | from collections import Counter 3 | 4 | import numpy as np 5 | 6 | from .metric import Metric 7 | 8 | # This proxy mirrors the BERT4rec evaluation strategy, so that our models can be 9 | # compared against the numbers reported in the paper: their code randomly samples 10 | # 100 items from the full item list, adds the relevant items, and computes metrics 11 | # on that subset. For items outside the returned recommendations we assume the 12 | # score equals the minimum score of the recommended items minus a random value. 13 | 14 | 15 | class SampledProxy(Metric): 16 | def __init__(self, item_ids, probs, n_negatives, metric): 17 | self.item_ids = list(item_ids) 18 | self.n_negatives = n_negatives 19 | self.metric = metric 20 | self.name = f"{metric.name}_sampled@{self.n_negatives}" 21 | self.probs = probs 22 | 23 | def __call__(self, recommendations, actual_actions): 24 | rec_dict = {} 25 | min_score = float('inf') 26 | for item, score in recommendations: 27 | rec_dict[item] = score 28 | min_score = min(min_score, score) 29 | 30 | recs = [] 31 | recommended = set() 32 | for action in actual_actions: 33 | recs.append((action.item_id, self.get_item_score(action.item_id, min_score, rec_dict))) 34 | recommended.add(action.item_id) 35 | 36 | target_size = len(actual_actions) + self.n_negatives 37 | while(len(recommended) < target_size): 38 | item_ids = np.random.choice(self.item_ids, target_size - len(recommended), p=self.probs, replace=False) 39 | for item_id in item_ids: 40 | if item_id not in recommended: 41 | recs.append((item_id, self.get_item_score(item_id, min_score, rec_dict))) 42 | recommended.add(item_id) 43 | recs.sort(key=lambda x: -x[1]) 44 | return self.metric(recs, actual_actions) 45 | 46 | @staticmethod 47 | def all_item_ids_probs(actions): 48 | counter = Counter() 49 | cnt = 0 50 | for action in actions: 51 | counter[action.item_id] += 1 52 | cnt += 1 53 | 54 | items, probs = [], [] 55 | for item, item_cnt in counter.most_common(): 56 | items.append(item) 57 | probs.append(item_cnt / cnt) 58 | return items, probs 59 | 60 | 61 | 62 | @staticmethod 63 | def get_item_score(item_id, min_score, rec_dict): 64 | if item_id not in rec_dict: 65 | return min_score - random.random() 66 | else: 67 | return rec_dict[item_id] 68 |
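SampledProxy wraps any base metric; a sketch of typical wiring (illustrative only — here train_actions is an assumed list of Action objects, not repository code):

    from aprec.evaluation.metrics.hit import HIT
    from aprec.evaluation.metrics.sampled_proxy_metric import SampledProxy

    # train_actions: a list of Action objects (assumed to exist)
    # popularity-weighted negative sampling, matching the BERT4rec protocol
    items, probs = SampledProxy.all_item_ids_probs(train_actions)
    sampled_hit_at_10 = SampledProxy(items, probs, n_negatives=100, metric=HIT(10))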
-------------------------------------------------------------------------------- /evaluation/n_actions_for_user.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | def n_actions_for_user(actions, n): 3 | """keep only the first n actions for each user""" 4 | user_actions = defaultdict(lambda: []) 5 | for action in actions: 6 | user_actions[action.user_id].append(action) 7 | 8 | result = [] 9 | for user_id in user_actions: 10 | result += sorted(user_actions[user_id], key = lambda action: action.timestamp)[:n] 11 | 12 | return result 13 | 14 | 15 | -------------------------------------------------------------------------------- /evaluation/run_all_b4rec_originals.sh: -------------------------------------------------------------------------------- 1 | sh run_n_experiments.sh configs/bert4rec_repro_paper/ml_1m_original_200000steps.py 2 | sh run_n_experiments.sh configs/bert4rec_repro_paper/ml_1m_original_400000steps.py 3 | sh run_n_experiments.sh configs/bert4rec_repro_paper/ml_1m_original_800000steps.py 4 | sh run_n_experiments.sh configs/bert4rec_repro_paper/ml_1m_original_1600000steps.py 5 | sh run_n_experiments.sh configs/bert4rec_repro_paper/ml_1m_original_3200000steps.py 6 | sh run_n_experiments.sh configs/bert4rec_repro_paper/ml_1m_original_6400000steps.py 7 | sh run_n_experiments.sh configs/bert4rec_repro_paper/ml_1m_original_12800000steps.py 8 | -------------------------------------------------------------------------------- /evaluation/run_n_experiments.sh: -------------------------------------------------------------------------------- 1 | config=$1 2 | N=$2 3 | 4 | if [ "$CHECK_COMMIT_STATUS" != "false" ]; then 5 | if [ -n "$(git status --porcelain)" ]; then 6 | echo "There are changes in the repo. Please commit the code in order to maintain traceability of the experiments"; 7 | exit 1 8 | fi 9 | fi 10 | 11 | config_filename=$(basename -- "$config") 12 | config_id="${config_filename%.*}" 13 | 14 | 15 | 16 | date=`date +%Y_%m_%dT%H_%M_%S` 17 | experiment_id="${config_id}_${date}" 18 | dataset_id=`python3 dataset_by_config.py ${config}` 19 | echo "running experiments on dataset ${dataset_id}" 20 | root_dir=./results/$dataset_id/$experiment_id 21 | experiment_stdout=$root_dir/stdout 22 | experiment_stderr=$root_dir/stderr 23 | experiment_commit=$root_dir/commit 24 | 25 | mkdir -p $root_dir 26 | 27 | latest_experiment_link=./results/latest_experiment 28 | 29 | rm -f $latest_experiment_link 30 | ln -s `pwd`/$root_dir $latest_experiment_link 31 | 32 | echo experiment results are saved at $root_dir 33 | 34 | cp $config $root_dir 35 | 36 | for i in `seq 1 $N`; 37 | do 38 | experiment_result=$root_dir/experiment_${i}.json 39 | echo "experiment_stdout: ${experiment_stdout}" 40 | echo "experiment_stderr: ${experiment_stderr}" 41 | echo "experiment_result: ${experiment_result}" 42 | echo "experiment_commit: ${experiment_commit}" 43 | git log -1 > $experiment_commit 44 | unbuffer python3 run_experiment.py $config $experiment_result > $experiment_stdout 2> $experiment_stderr; 45 | done; 46 | -------------------------------------------------------------------------------- /evaluation/samplers/random_sampler.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from aprec.api.items_ranking_request import ItemsRankingRequest 4 | from aprec.evaluation.evaluation_utils import group_by_user 5 | from aprec.evaluation.samplers.sampler import TargetItemSampler 6 | 7 | 8 | class RandomTargetItemSampler(TargetItemSampler): 9 | def get_sampled_ranking_requests(self): 10 | all_items = set() 11 | for action in self.actions: 12 | all_items.add(action.item_id) 13 | items = list(all_items) 14 | by_user_test = group_by_user(self.test) 15 | result = [] 16 | for user_id in by_user_test: 17 | target_items = 
set(action.item_id for action in by_user_test[user_id]) 18 | while(len(target_items) < self.target_size): 19 | item_ids = np.random.choice(items, 20 | self.target_size - len(target_items),replace=False) 21 | for item_id in item_ids: 22 | if item_id not in target_items: 23 | target_items.add(item_id) 24 | result.append(ItemsRankingRequest(user_id=user_id, item_ids=list(target_items))) 25 | return result -------------------------------------------------------------------------------- /evaluation/samplers/sampler.py: -------------------------------------------------------------------------------- 1 | class TargetItemSampler(object): 2 | def __init__(self, target_size) -> None: 3 | super().__init__() 4 | self.target_size = target_size 5 | 6 | def set_actions(self, all_actions, test_actions): 7 | self.actions = all_actions 8 | self.test = test_actions 9 | 10 | 11 | def get_sampled_ranking_requests(self): 12 | raise NotImplementedError() -------------------------------------------------------------------------------- /evaluation/statistical_signifficance_test.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import json 3 | import os 4 | from argparse import ArgumentParser 5 | from collections import defaultdict 6 | from scipy.stats import ttest_ind 7 | 8 | def get_arguments(): 9 | parser = ArgumentParser() 10 | parser.add_argument("--predictions-dir", required=True) 11 | parser.add_argument("--output-file", required=True) 12 | return parser.parse_args() 13 | 14 | 15 | def process(arguments): 16 | metrics = defaultdict(lambda: defaultdict(list)) 17 | for filename in os.listdir(arguments.predictions_dir): 18 | if filename.endswith(".json.gz"): 19 | in_file = gzip.open(os.path.join(arguments.predictions_dir, filename)) 20 | recommender_name = ".".join(filename.split(".")[:-2]) 21 | elif filename.endswith(".json"): 22 | in_file = open(os.path.join(arguments.predictions_dir, filename)) 23 | recommender_name = ".".join(filename.split(".")[:-1]) 24 | else: 25 | continue 26 | for line in in_file: 27 | user_doc = json.loads(line) 28 | for metric in user_doc["metrics"]: 29 | metrics[metric][recommender_name].append(user_doc["metrics"][metric]) 30 | result = defaultdict(lambda: defaultdict(dict)) 31 | for metric in metrics: 32 | for recommender_name_1 in metrics[metric]: 33 | rec_1_sample = metrics[metric][recommender_name_1] 34 | for recommender_name_2 in metrics[metric]: 35 | rec_2_sample = metrics[metric][recommender_name_2] 36 | t, p_value = ttest_ind(rec_1_sample, rec_2_sample) 37 | result[recommender_name_1][metric][recommender_name_2] = p_value 38 | with open(arguments.output_file, 'w') as output: 39 | output.write(json.dumps(result, indent=4)) 40 | 41 | 42 | def main(): 43 | arguments = get_arguments() 44 | process(arguments) 45 | 46 | 47 | if __name__ == "__main__": 48 | main() 49 |
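Both significance scripts reduce to an unpaired t-test over per-user metric samples, with a Bonferroni correction when several metrics are compared at once; a toy sketch with fabricated numbers (not repository code):

    import numpy as np
    from scipy.stats import ttest_ind

    # per-user ndcg@10 samples for two recommenders (made up for illustration)
    model_a = np.array([0.21, 0.35, 0.10, 0.44, 0.27])
    model_b = np.array([0.25, 0.41, 0.15, 0.40, 0.33])
    t, p_value = ttest_ind(model_a, model_b)
    # comparing m metrics at once inflates false positives, so scale the p-value
    m = 4
    p_value_bonferroni = min(1.0, p_value * m)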
predictions_file_1 = args.first 16 | predictions_file_2 = args.second 17 | first_name = os.path.basename(predictions_file_1).removesuffix(".json.gz") # removesuffix, not rstrip: rstrip strips matching characters, not a suffix 18 | second_name = os.path.basename(predictions_file_2).removesuffix(".json.gz") 19 | 20 | def get_metrics(doc): 21 | result = doc['metrics'] 22 | if 'sampled_metrics' in doc: 23 | for key in doc['sampled_metrics']: 24 | result[f"sampled_{key}"] = doc['sampled_metrics'][key] 25 | return result 26 | 27 | def read_data(filename): 28 | result = [] 29 | data = json.load(gzip.open(filename)) 30 | for doc in data: 31 | metrics = get_metrics(doc) 32 | result.append(metrics) 33 | return pd.DataFrame(result) 34 | 35 | df1 = read_data(predictions_file_1) 36 | df2 = read_data(predictions_file_2) 37 | 38 | overlap_columns = set(df1.columns).intersection(set(df2.columns)) 39 | 40 | if args.metrics is not None: 41 | overlap_columns = overlap_columns.intersection(set(args.metrics.split(","))) 42 | 43 | 44 | docs = [] 45 | 46 | for column_name in overlap_columns: 47 | df1_series = df1[column_name] 48 | df2_series = df2[column_name] 49 | 50 | mean1 = df1_series.mean() 51 | mean2 = df2_series.mean() 52 | doc = {} 53 | doc["metric_name"] = column_name 54 | doc[first_name] = mean1 55 | doc[second_name] = mean2 56 | doc["difference"] = mean2 - mean1 57 | doc["difference_pct"] = (mean2 - mean1) * 100 / mean1 58 | t, pval = ttest_ind(df1_series, df2_series) 59 | doc["p_value"] = pval 60 | doc["p_value_bonferroni"] = pval * len(overlap_columns) # Bonferroni correction for testing multiple metrics 61 | docs.append(doc) 62 | 63 | result = pd.DataFrame(docs) 64 | result['significant_0.05'] = result["p_value_bonferroni"] < 0.05 65 | result['significant_0.01'] = result["p_value_bonferroni"] < 0.01 66 | result['significant_0.001'] = result["p_value_bonferroni"] < 0.001 67 | result['significant_0.0001'] = result["p_value_bonferroni"] < 0.0001 68 | 69 | with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.expand_frame_repr', False): # more options can be specified also 70 | print(result) 71 | 72 | -------------------------------------------------------------------------------- /losses/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asash/gsasrec/9dd47e83b949a84f68a4616c6f779df7f17ebb26/losses/__init__.py -------------------------------------------------------------------------------- /losses/bce.py: -------------------------------------------------------------------------------- 1 | from aprec.losses.loss import ListWiseLoss 2 | import tensorflow as tf 3 | 4 | 5 | class BCELoss(ListWiseLoss): 6 | def __init__(self, *args, **kwargs): 7 | super().__init__(**kwargs) 8 | self.__name__ = "BCE" 9 | self.less_is_better = True 10 | 11 | def calc_per_list(self, y_true_raw, y_pred): 12 | eps = tf.constant(1e-8, y_pred.dtype) 13 | y_true = tf.cast(y_true_raw, y_pred.dtype) 14 | is_target = tf.cast((y_true >= -eps), y_pred.dtype) 15 | trues = y_true*is_target 16 | pos = trues*tf.math.softplus(-y_pred) * is_target 17 | neg = (1.0 - trues)*tf.math.softplus(y_pred) * is_target 18 | num_targets = tf.reduce_sum(is_target, axis=1) 19 | ce_sum = tf.reduce_sum(pos + neg, axis=1) 20 | res_sum = tf.math.divide_no_nan(ce_sum, num_targets) 21 | return res_sum 22 | 23 | def __call__(self, y_true_raw, y_pred): 24 | y_true = tf.cast(y_true_raw, y_pred.dtype) 25 | eps = tf.constant(1e-8, y_pred.dtype) 26 | is_target = tf.cast((y_true >= -eps), y_pred.dtype) 27 | trues = y_true*is_target 28 | pos = trues*tf.math.softplus(-y_pred) * is_target 29 | neg = (1.0 - 
trues)*tf.math.softplus(y_pred) * is_target 30 | num_targets = tf.reduce_sum(is_target) 31 | ce_sum = tf.reduce_sum(pos + neg) 32 | res_sum = tf.math.divide_no_nan(ce_sum, num_targets) 33 | return res_sum -------------------------------------------------------------------------------- /losses/bpr.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | from aprec.losses.loss_utils import get_pairwise_diff_batch, get_truncated, masked_softmax 4 | from aprec.losses.loss import ListWiseLoss, Loss 5 | 6 | 7 | #BPR loss as described in the original paper. 8 | #https://arxiv.org/abs/1205.2618 9 | #This loss doesn't include the regularization term: in TensorFlow it should be done on the model side (e.g. include l2 regularization in the embeddings) 10 | #Setting softmax_weighted to True turns this loss into the BPR-max loss, as described in the GRU4Rec+ paper 11 | #https://dl.acm.org/doi/abs/10.1145/3269206.3271761 12 | class BPRLoss(ListWiseLoss): 13 | def __init__(self, num_items=None, batch_size=None, max_positives=10, pred_truncate=None, softmax_weighted=False): 14 | super().__init__(num_items, batch_size) 15 | self.max_positives = max_positives 16 | self.softmax_weighted = softmax_weighted 17 | self.pred_truncate = pred_truncate 18 | 19 | def calc_per_list(self, y_true, y_pred): 20 | top_true = tf.math.top_k(y_true, self.max_positives) 21 | pred_ordered_by_true = tf.gather(y_pred, top_true.indices, batch_dims=1) 22 | 23 | pred, true_ordered_by_pred = get_truncated(y_true, y_pred, self.pred_truncate) 24 | pred_size = tf.shape(pred)[-1] 25 | 26 | mask = tf.cast((get_pairwise_diff_batch(top_true.values, true_ordered_by_pred, self.max_positives, pred_size) > 0), tf.float32) 27 | values = get_pairwise_diff_batch(pred_ordered_by_true, pred, self.max_positives, pred_size) 28 | sigmoid = -tf.math.log_sigmoid(values) 29 | sigmoid = sigmoid * mask 30 | if self.softmax_weighted: 31 | pred_tile = tf.tile(tf.expand_dims(pred, 1), [1, self.max_positives, 1]) 32 | mask_transposed = tf.transpose(mask, perm=[0, 2, 1]) 33 | pred_softmax = tf.transpose(masked_softmax(pred_tile, mask_transposed), perm=[0, 2, 1]) 34 | sigmoid *= pred_softmax 35 | result = tf.reduce_sum(sigmoid, axis=[1, 2]) / tf.reduce_sum(mask, axis=[1, 2]) 36 | return result 37 | 38 | def __call__(self, y_true, y_pred): 39 | result = self.calc_per_list(y_true, y_pred) 40 | return tf.reduce_mean(result) 41 | 42 | -------------------------------------------------------------------------------- /losses/climf.py: -------------------------------------------------------------------------------- 1 | #CLIMF Loss Implementation 2 | #See paper: 3 | #https://dl.acm.org/doi/10.1145/2365952.2365981 4 | 5 | import tensorflow as tf 6 | 7 | from aprec.losses.loss_utils import my_map 8 | from aprec.losses.loss import Loss 9 | 10 | 11 | class CLIMFLoss(Loss): 12 | def __init__(self, num_items=None, batch_size=None, max_positives=10): 13 | super().__init__(num_items, batch_size) 14 | self.max_positives = max_positives 15 | 16 | 17 | def get_pairwise_diffs_matrix(self, x, y): 18 | a, b = tf.meshgrid(tf.transpose(y), x) 19 | return tf.subtract(b, a) 20 | 21 | def get_pairwise_diffs_matrices(self, a, b): 22 | result = my_map(self.get_pairwise_diffs_matrix, (a, b)) 23 | return result 24 | 25 | #equation (9) from the paper 26 | def __call__(self, y_true, y_pred): 27 | EPS = 1e-6 28 | top_true = tf.math.top_k(y_true, self.max_positives) 29 | true_values = top_true.values 30 | pred_ordered = 
tf.gather(y_pred, top_true.indices, batch_dims=1) 31 | values = self.get_pairwise_diffs_matrices(pred_ordered, y_pred) 32 | values_sigmoid = tf.math.sigmoid(values) 33 | tiled_values = tf.tile(true_values, [1, y_pred.shape[-1]]) 34 | mask = tf.reshape(tiled_values, (self.batch_size, self.num_items, true_values.shape[1])) 35 | mask = tf.transpose(mask, perm=[0, 2, 1]) 36 | second_climf_term = tf.math.reduce_sum(tf.math.log(1 - mask*values_sigmoid + EPS), axis=1) 37 | first_climf_term = tf.math.log_sigmoid(y_pred) 38 | result = -tf.reduce_sum(y_true*(second_climf_term + first_climf_term)) 39 | return result -------------------------------------------------------------------------------- /losses/get_loss.py: -------------------------------------------------------------------------------- 1 | from aprec.losses.bpr import BPRLoss 2 | from aprec.losses.bce import BCELoss 3 | from aprec.losses.climf import CLIMFLoss 4 | from aprec.losses.loss import ListWiseLoss, Loss 5 | from aprec.losses.softmax_crossentropy import SoftmaxCrossEntropy 6 | from aprec.losses.top1 import TOP1Loss 7 | from aprec.losses.lambda_gamma_rank import LambdaGammaRankLoss 8 | from aprec.losses.xendcg import XENDCGLoss 9 | import tensorflow as tf 10 | 11 | losses = { 12 | 'xendcg': XENDCGLoss, 13 | 'bpr': BPRLoss, 14 | 'climf': CLIMFLoss, 15 | 'bce': BCELoss, 16 | 'top1': TOP1Loss, 17 | 'lambdarank': LambdaGammaRankLoss, 18 | 'softmax_ce': SoftmaxCrossEntropy, 19 | } 20 | 21 | def get_loss(loss_name, items_num, batch_size, max_positives=40, 22 | internal_dtype=tf.float32, lambda_normalization=True, 23 | lambdarank_pred_truncate=None, 24 | lambdarank_bce_weight=0.0, 25 | ): 26 | if loss_name == 'lambdarank': 27 | return LambdaGammaRankLoss(num_items=items_num, batch_size=batch_size, ndcg_at=max_positives, 28 | dtype=internal_dtype, 29 | lambda_normalization=lambda_normalization, 30 | pred_truncate_at=lambdarank_pred_truncate, 31 | bce_grad_weight=lambdarank_bce_weight) 32 | return losses[loss_name](num_items=items_num, batch_size=batch_size) 33 | 34 | 35 | listwise_losses = { 36 | 'softmax_ce': SoftmaxCrossEntropy, 37 | 'lambdarank': LambdaGammaRankLoss, 38 | 'bce': BCELoss, 39 | } 40 | 41 | def listwise_loss_from_config(loss_name, loss_params) -> ListWiseLoss: 42 | return listwise_losses[loss_name](**loss_params) # look up in the listwise-only registry 43 | 44 | -------------------------------------------------------------------------------- /losses/items_masking_loss_proxy.py: -------------------------------------------------------------------------------- 1 | from aprec.losses.loss import ListWiseLoss, Loss 2 | import tensorflow as tf 3 | 4 | 5 | class ItemsMaksingLossProxy(Loss): 6 | def __init__(self, listwise_loss: ListWiseLoss, negatives_per_positve, sequence_length, num_items=None, batch_size=None, add_positive = True): 7 | super().__init__(num_items, batch_size) 8 | self.listwise_loss = listwise_loss 9 | self.negatives_per_positive = negatives_per_positve 10 | self.sequence_length = sequence_length 11 | if add_positive: 12 | self.listwise_loss.set_num_items(negatives_per_positve + 1) 13 | else: 14 | self.listwise_loss.set_num_items(negatives_per_positve) 15 | self.less_is_better = listwise_loss.less_is_better 16 | self.__name__ = self.listwise_loss.__name__ + "_proxy" 17 | self.add_positive = add_positive 18 | 19 | def set_batch_size(self, batch_size): 20 | super().set_batch_size(batch_size) 21 | self.listwise_loss.set_batch_size(self.batch_size * self.sequence_length) 22 | 23 | def __call__(self, y_true, y_pred): 24 | n_targets = self.negatives_per_positive 25 | 
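# Each masked position gets its own candidate list: negatives_per_positive sampled
# negatives, plus the positive item when add_positive is True. The (batch, seq_len,
# n_targets) tensors are therefore flattened to (batch * seq_len) lists of length
# n_targets before delegating to the wrapped listwise loss.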
if self.add_positive: 26 | n_targets += 1 27 | ytrue_reshaped = tf.reshape(y_true, (self.batch_size * self.sequence_length, n_targets)) 28 | ypred_reshaped = tf.cast(tf.reshape(y_pred, (self.batch_size * self.sequence_length, n_targets)), 'float32') 29 | result = self.listwise_loss.loss_per_list(ytrue_reshaped, ypred_reshaped) 30 | return result 31 | 32 | -------------------------------------------------------------------------------- /losses/logit_norm.py: -------------------------------------------------------------------------------- 1 | from aprec.losses.loss import ListWiseLoss 2 | import tensorflow as tf 3 | 4 | #https://arxiv.org/abs/2205.09310 5 | class LogitNormLoss(ListWiseLoss): #used by bert 6 | def __init__(self, temperature=1, *args, **kwargs): 7 | super().__init__() 8 | self.__name__ = "LogitNormLoss" 9 | self.less_is_better = True 10 | self.temperature = temperature 11 | 12 | def calc_per_list(self, y_true, y_pred): 13 | norms = tf.expand_dims(tf.norm(y_pred, axis=-1), -1) 14 | logit_norms = tf.math.divide_no_nan(y_pred, norms)/self.temperature 15 | return tf.nn.softmax_cross_entropy_with_logits(y_true, logit_norms) 16 | -------------------------------------------------------------------------------- /losses/loss.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | class Loss(): 3 | def __init__(self, num_items=None, batch_size=None): 4 | self.num_items = num_items 5 | self.batch_size = batch_size 6 | 7 | def __call__(self, y_true, y_pred): 8 | raise NotImplementedError 9 | 10 | def set_num_items(self, num_items): 11 | self.num_items = num_items 12 | 13 | def set_batch_size(self, batch_size): 14 | self.batch_size = batch_size 15 | 16 | def get_config(self): 17 | result = {"num_items": self.num_items, "batch_size": self.batch_size} 18 | return result 19 | 20 | @classmethod 21 | def from_config(cls, config): 22 | return cls(num_items=config['num_items'], batch_size=config['batch_size']) 23 | 24 | class ListWiseLoss(Loss): 25 | @tf.custom_gradient 26 | def loss_per_list(self, y_true, y_pred, sample_weights=None): 27 | with tf.GradientTape() as g: 28 | g.watch(y_pred) 29 | ignore_mask = tf.cast(y_true == -100, y_pred.dtype) #-100 is the default ignore value 30 | use_mask = 1.0 - ignore_mask 31 | noise = ignore_mask * tf.random.uniform(y_pred.shape, 0.0, 1.0, dtype=y_pred.dtype) 32 | listwise_ytrue = use_mask * tf.cast(y_true, y_pred.dtype) + noise 33 | listwise_loss = self.calc_per_list(listwise_ytrue, y_pred) 34 | use_loss_mask = tf.squeeze(use_mask[:,:1], axis=1) 35 | average_loss = tf.reduce_sum(listwise_loss * use_loss_mask) / tf.reduce_sum(use_loss_mask) 36 | loss_grads = g.gradient(average_loss, y_pred) 37 | 38 | if sample_weights is not None: #explicit None check: truth-testing a tensor would raise 39 | weighted_mask = use_loss_mask * sample_weights[:,0] 40 | average_loss = tf.reduce_sum(listwise_loss * weighted_mask) / tf.reduce_sum(weighted_mask) 41 | 42 | def grad(dy): #ensure that we don't utilize gradients for ignored items 43 | y_true_grad = tf.zeros_like(y_true) 44 | y_pred_grad = dy * use_mask * loss_grads 45 | if sample_weights is not None: 46 | y_pred_grad = sample_weights * y_pred_grad 47 | sample_weights_grad = tf.zeros_like(sample_weights) 48 | return y_true_grad, y_pred_grad, sample_weights_grad 49 | return y_true_grad, y_pred_grad 50 | 51 | return average_loss, grad 52 | 53 | 54 | def calc_per_list(self, y_true, y_pred): 55 | raise NotImplementedError 56 | -------------------------------------------------------------------------------- /losses/loss_utils.py:
-------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | # https://stackoverflow.com/questions/37086098/does-tensorflow-map-fn-support-taking-more-than-one-tensor 4 | 5 | 6 | def my_map(fn, arrays, dtype=tf.float32): 7 | # assumes all arrays have the same leading dim 8 | indices = tf.range(tf.shape(arrays[0])[0]) 9 | out = tf.map_fn(lambda ii: fn(*[array[ii] for array in arrays]), indices, dtype=dtype) 10 | return out 11 | 12 | def get_pairwise_diff_batch(a, b, a_size, b_size): 13 | a_tile = tf.tile(tf.expand_dims(a, 1), [1, b_size, 1]) 14 | b_tile = tf.tile(tf.expand_dims(b, 2), [1, 1, a_size]) 15 | result = a_tile - b_tile 16 | return result 17 | 18 | 19 | def get_truncated(y_true, y_pred, truncate_at): 20 | if truncate_at is not None: 21 | top_pred = tf.math.top_k(y_pred, truncate_at) 22 | pred = top_pred.values 23 | true_ordered_by_pred = tf.gather(y_true, top_pred.indices, batch_dims=1) 24 | else: 25 | pred = y_pred 26 | true_ordered_by_pred = y_true 27 | return pred, true_ordered_by_pred 28 | 29 | def masked_softmax(x, mask): 30 | exp = tf.math.exp(x) * mask 31 | exp_sum = tf.expand_dims(tf.reduce_sum(exp, -1), -1) # renamed from `sum` to avoid shadowing the builtin 32 | result = tf.math.divide_no_nan(exp, exp_sum) 33 | return result 34 | -------------------------------------------------------------------------------- /losses/mean_ypred_loss.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | from aprec.losses.loss import Loss 4 | 5 | #This is a dummy loss function that does not use y_true. 6 | #It can be useful when the model itself already computes the loss. 7 | #Example: the BERT masking model 8 | 9 | class MeanPredLoss(Loss): 10 | def __init__(self, num_items=None, batch_size=None, name="mean_ypred"): 11 | super().__init__(num_items, batch_size) 12 | self.__name__ = name 13 | self.less_is_better = True 14 | 15 | def __call__(self, y_true, y_pred): 16 | result = tf.reduce_mean(y_pred) 17 | return result 18 | -------------------------------------------------------------------------------- /losses/softmax_crossentropy.py: -------------------------------------------------------------------------------- 1 | from aprec.losses.loss import ListWiseLoss 2 | import tensorflow as tf 3 | 4 | class SoftmaxCrossEntropy(ListWiseLoss): #used by bert 5 | def __init__(self, *args, **kwargs): 6 | super().__init__() 7 | self.__name__ = "SoftmaxCrossEntropy" 8 | self.less_is_better = True 9 | 10 | def calc_per_list(self, y_true, y_pred): 11 | return tf.nn.softmax_cross_entropy_with_logits(y_true, y_pred) 12 | 13 | -------------------------------------------------------------------------------- /losses/top1.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from aprec.losses.loss_utils import get_pairwise_diff_batch, get_truncated, masked_softmax 3 | 4 | from aprec.losses.loss import Loss 5 | 6 | #TOP1 loss as defined in the GRU4Rec paper https://arxiv.org/pdf/1511.06939 7 | #We assume that there is only one positive sample. 8 | #If there is more than one positive, one will be sampled randomly.
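#For a single positive with score r_i and negatives with scores r_j, the per-list
#loss is mean_j( sigmoid(r_j - r_i) + sigmoid(r_j^2) ); the second term pushes
#negative scores towards zero (a sketch of the equation from the GRU4Rec paper).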
9 | 10 | #Setting softmax_weighted to True turns this loss into the TOP1-Max loss, described in the GRU4Rec+ paper 11 | #https://dl.acm.org/doi/abs/10.1145/3269206.3271761 12 | class TOP1Loss(Loss): 13 | def __init__(self, num_items=None, batch_size=None, pred_truncate=None, softmax_weighted=False): 14 | super().__init__(num_items, batch_size) 15 | self.pred_truncate = pred_truncate 16 | self.softmax_weighted = softmax_weighted 17 | 18 | def __call__(self, y_true, y_pred): 19 | top_true = tf.math.top_k(y_true) 20 | positive_true = top_true.values 21 | positive_pred = tf.gather(y_pred, top_true.indices, batch_dims=1) 22 | pred, true_ordered_by_pred = get_truncated(y_true, y_pred, self.pred_truncate) 23 | diff = pred - positive_pred 24 | mask = tf.cast(true_ordered_by_pred < positive_true, 'float32') 25 | sigm = tf.sigmoid(diff) * mask 26 | square = tf.sigmoid(pred * pred) * mask 27 | result = (sigm + square) 28 | if self.softmax_weighted: 29 | pred_softmax = masked_softmax(pred, mask) 30 | result *= pred_softmax 31 | result_mean = tf.reduce_sum(result, axis=1) / tf.reduce_sum(mask, axis=1) 32 | return tf.reduce_mean(result_mean) 33 | -------------------------------------------------------------------------------- /losses/xendcg.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | from aprec.losses.loss import Loss 4 | 5 | 6 | class XENDCGLoss(Loss): 7 | def __init__(self, num_items=None, batch_size=None): 8 | super().__init__(num_items, batch_size) 9 | self.__name__ = 'xendcg' 10 | 11 | def __call__(self, true, pred): 12 | eps = 1e-5 13 | gamma = tf.random.uniform(shape=(self.batch_size, self.num_items)) 14 | true_transformed = (2 ** true) - gamma 15 | true_transformed_sum = tf.expand_dims(tf.math.reduce_sum(true_transformed, axis=1),1) 16 | true_probs = true_transformed / (true_transformed_sum + eps) 17 | 18 | pred_transformed = tf.exp(pred) 19 | pred_transformed_sum = tf.expand_dims(tf.math.reduce_sum(pred_transformed, axis=1),1) 20 | pred_probs = pred_transformed / (pred_transformed_sum + eps) 21 | 22 | result = -tf.math.reduce_sum(true_probs * tf.math.log(pred_probs), axis=1) 23 | return result 24 | -------------------------------------------------------------------------------- /recommenders/BERT4rec/Readme.md: -------------------------------------------------------------------------------- 1 | This code is ported from the original BERT4rec implementation.
2 | 3 | The original code is taken from here: 4 | https://github.com/FeiSun/BERT4Rec -------------------------------------------------------------------------------- /recommenders/BERT4rec/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asash/gsasrec/9dd47e83b949a84f68a4616c6f779df7f17ebb26/recommenders/BERT4rec/__init__.py -------------------------------------------------------------------------------- /recommenders/BERT4rec/util.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from collections import defaultdict 3 | 4 | 5 | def data_partition(fname): 6 | usernum = 0 7 | itemnum = 0 8 | User = defaultdict(list) 9 | user_train = {} 10 | user_test = {} 11 | # assume user/item index starting from 1 12 | f = open(fname, 'r') 13 | for line in f: 14 | u, i = line.rstrip().split(' ') 15 | u = int(u) 16 | i = int(i) 17 | usernum = max(u, usernum) 18 | itemnum = max(i, itemnum) 19 | User[u].append(i) 20 | 21 | for user in User: 22 | user_train[user] = User[user][:-1] 23 | user_test[user] = [User[user][-1]] 24 | return [user_train, user_test, usernum, itemnum] 25 | 26 | -------------------------------------------------------------------------------- /recommenders/BERT4rec/vocab.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | 3 | 4 | def convert_by_vocab(vocab, items): 5 | """Converts a sequence of [tokens|ids] using the vocab; unknown entries are skipped.""" 6 | output = [] 7 | for item in items: 8 | try: 9 | output.append(vocab[item]) 10 | except KeyError: 11 | pass 12 | return output 13 | 14 | 15 | class FreqVocab(object): 16 | """Runs end-to-end tokenization.""" 17 | 18 | def __init__(self, user_to_list): 19 | # layout of the vocabulary: 20 | # item_id based on freq 21 | # special token 22 | # user_id based on nothing 23 | self.counter = Counter( 24 | ) #sorted(self.items(), key=_itemgetter(1), reverse=True) 25 | self.user_set = set() 26 | for u, item_list in user_to_list.items(): 27 | self.counter.update(item_list) 28 | self.user_set.add(str(u)) 29 | 30 | self.user_count = len(self.user_set) 31 | self.item_count = len(self.counter.keys()) 32 | self.special_tokens = {"[pad]", "[MASK]", '[NO_USE]'} 33 | self.token_to_ids = {} # index begins from 1 34 | #first items 35 | for token, count in self.counter.most_common(): 36 | self.token_to_ids[token] = len(self.token_to_ids) + 1 37 | 38 | # then special tokens 39 | for token in self.special_tokens: 40 | self.token_to_ids[token] = len(self.token_to_ids) + 1 41 | 42 | # then user 43 | # for user in self.user_set: 44 | # self.token_to_ids[user] = len(self.token_to_ids) + 1 45 | 46 | self.id_to_tokens = {v: k for k, v in self.token_to_ids.items()} 47 | self.vocab_words = list(self.token_to_ids.keys()) 48 | 49 | def convert_tokens_to_ids(self, tokens): 50 | return convert_by_vocab(self.token_to_ids, tokens) 51 | 52 | def convert_ids_to_tokens(self, ids): 53 | return convert_by_vocab(self.id_to_tokens, ids) 54 | 55 | def get_vocab_words(self): 56 | return self.vocab_words # not in order 57 | 58 | def get_item_count(self): 59 | return self.item_count 60 | 61 | def get_user_count(self): 62 | return self.user_count 63 | 64 | def get_items(self): 65 | return list(self.counter.keys()) 66 | 67 | def get_users(self): 68 | return self.user_set 69 | 70 | def get_special_token_count(self): 71 | return len(self.special_tokens) 72 | 73 | def get_special_token(self): 74 | return 
self.special_tokens 75 | 76 | def get_vocab_size(self): 77 | return self.get_item_count() + self.get_special_token_count() + 1 #self.get_user_count() 78 | -------------------------------------------------------------------------------- /recommenders/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asash/gsasrec/9dd47e83b949a84f68a4616c6f779df7f17ebb26/recommenders/__init__.py -------------------------------------------------------------------------------- /recommenders/conditional_top_recommender.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | from aprec.api.user import User 3 | from aprec.recommenders.recommender import Recommender 4 | 5 | 6 | class ConditionalTopRecommender(Recommender): 7 | """ 8 | This recommender calculates top items based on some condition. For example, we want to recommend 9 | the most popular hotel in the city, not globally (for global top we can use @TopRecommender). 10 | """ 11 | def __init__(self, conditional_field: str): 12 | self.conditional_field: str = conditional_field 13 | self.items_counts: dict = dict() 14 | self.precalculated_top_items: dict = dict() 15 | self.user_field_values: dict = dict() 16 | 17 | def add_user(self, user: User): 18 | if self.conditional_field in user.cat_features: 19 | self.user_field_values[user.user_id] = user.cat_features[self.conditional_field] 20 | 21 | 22 | def add_action(self, action): 23 | 24 | if self.conditional_field in action.data: 25 | field_value = action.data[self.conditional_field] 26 | elif action.user_id in self.user_field_values: 27 | field_value = self.user_field_values[action.user_id] 28 | else: 29 | field_value = "N/A" 30 | if field_value not in self.items_counts: 31 | self.items_counts[field_value] = Counter() 32 | self.user_field_values[action.user_id] = field_value 33 | 34 | if action.item_id is not None: 35 | self.items_counts[field_value][action.item_id] += 1 36 | 37 | def rebuild_model(self): 38 | self.precalculated_top_items = { 39 | field_value: counter.most_common() for field_value, counter in self.items_counts.items() 40 | } 41 | 42 | def recommend(self, user_id, limit, features=None): 43 | if user_id not in self.user_field_values: 44 | field_value = "N/A" 45 | else: 46 | field_value = self.user_field_values[user_id] 47 | return self.precalculated_top_items.get(field_value, [])[:limit] 48 | 49 | def get_similar_items(self, item_id, limit): 50 | raise NotImplementedError 51 | 52 | def name(self): 53 | return "ConditionalTopItemsRecommender" 54 | -------------------------------------------------------------------------------- /recommenders/constant_recommender.py: -------------------------------------------------------------------------------- 1 | from .recommender import Recommender 2 | 3 | class ConstantRecommender(Recommender): 4 | def __init__(self, recommendations): 5 | super().__init__() 6 | self.recommendations = recommendations 7 | 8 | def name(self): 9 | return "ConstantRecommender" 10 | 11 | def add_action(self, action): 12 | pass 13 | 14 | def rebuild_model(self): 15 | pass 16 | 17 | def recommend(self, user_id, limit, features=None): 18 | return self.recommendations[:limit] 19 | 20 | def get_similar_items(self, item_id, limit): 21 | return self.recommendations[:limit] 22 | 23 | def to_str(self): 24 | raise(NotImplementedError) 25 | 26 | def from_str(self): 27 | raise(NotImplementedError) 28 | 
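A minimal usage sketch for ConstantRecommender (illustrative only; the item ids and scores below are made up, and the recommendations follow the (item_id, score) tuple convention used across the repo):

from aprec.recommenders.constant_recommender import ConstantRecommender

# always returns the same fixed list, truncated to the requested limit
recommender = ConstantRecommender([("item_1", 0.9), ("item_2", 0.8)])
print(recommender.recommend(user_id="u1", limit=1))  # [("item_1", 0.9)]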
-------------------------------------------------------------------------------- /recommenders/featurizer.py: -------------------------------------------------------------------------------- 1 | from aprec.api.action import Action 2 | from aprec.api.user import User 3 | from aprec.api.item import Item 4 | 5 | class Featurizer(object): 6 | def __init__(self): 7 | pass 8 | 9 | def add_action(self, action: Action): 10 | pass 11 | 12 | def add_user(self, user: User): 13 | pass 14 | 15 | def add_item(self, item: Item): 16 | pass 17 | 18 | def get_features(self, user_id, item_id): 19 | pass 20 | 21 | def build(self): 22 | pass 23 | -------------------------------------------------------------------------------- /recommenders/first_order_mc.py: -------------------------------------------------------------------------------- 1 | from aprec.api.action import Action 2 | from collections import Counter, defaultdict 3 | from aprec.recommenders.recommender import Recommender 4 | 5 | 6 | class FirstOrderMarkovChainRecommender(Recommender): 7 | def __init__(self, cache_items=1000): 8 | super().__init__() 9 | self.user_actions = defaultdict(list) 10 | self.cache_items = cache_items 11 | 12 | def add_action(self, action: Action): 13 | self.user_actions[action.user_id].append(action.item_id) 14 | 15 | def rebuild_model(self): 16 | self.item_pairs_counter = defaultdict(Counter) 17 | for user in self.user_actions: 18 | for i in range(1, len(self.user_actions[user])): 19 | src = self.user_actions[user][i-1] 20 | dst = self.user_actions[user][i] 21 | self.item_pairs_counter[src][dst] += 1 22 | 23 | self.cache = defaultdict(list) 24 | for item in self.item_pairs_counter: 25 | self.cache[item] = self.item_pairs_counter[item].most_common(self.cache_items) 26 | 27 | def recommend(self, user_id, limit: int, features=None): 28 | if user_id not in self.user_actions: 29 | return [] 30 | return self.cache[self.user_actions[user_id][-1]][:limit] 31 | 32 | def get_item_rankings(self): 33 | result = {} 34 | for request in self.items_ranking_requests: 35 | user_result = [] 36 | user_id = request.user_id 37 | last_item = self.user_actions[user_id][-1] 38 | scores = self.item_pairs_counter[last_item] 39 | 40 | for item_id in request.item_ids: 41 | score = scores.get(item_id, 0) 42 | user_result.append((item_id, score)) 43 | user_result.sort(key=lambda x: -x[1]) 44 | result[request.user_id] = user_result 45 | return result 46 | -------------------------------------------------------------------------------- /recommenders/metrics/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asash/gsasrec/9dd47e83b949a84f68a4616c6f779df7f17ebb26/recommenders/metrics/__init__.py -------------------------------------------------------------------------------- /recommenders/metrics/ndcg.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import tensorflow as tf 4 | 5 | from aprec.losses.loss import ListWiseLoss 6 | 7 | class KerasNDCG(ListWiseLoss): 8 | def __init__(self, k): 9 | self.k = k 10 | discounts = [] 11 | for i in range(1, k+1): 12 | discounts.append(1 / math.log2(i + 1)) 13 | self.discounts = tf.cast(tf.constant(tf.expand_dims(discounts, 1)), 'float32') 14 | self.__name__ = f"ndcg_at_{k}" 15 | self.less_is_better = False 16 | 17 | def dcg(self, scores): 18 | gain = tf.pow(2.0, scores) - 1 19 | return gain @ self.discounts 20 | 21 | def calc_per_list(self, y_true, y_pred): 22 | return self.__call__(y_true, 
y_pred) 23 | 24 | def __call__(self, y_true, y_pred): 25 | eps = 0.000001 26 | top_k = tf.nn.top_k(y_pred, self.k) 27 | gains = tf.gather(y_true, top_k.indices, batch_dims=1) 28 | dcg_val = self.dcg(tf.cast(gains, 'float32')) 29 | 30 | ideal_top_k = tf.nn.top_k(y_true, self.k) 31 | ideal_gains = tf.gather(y_true, ideal_top_k.indices, batch_dims=1) 32 | idcg_val = self.dcg(tf.cast(ideal_gains, 'float32')) 33 | return float(tf.reduce_mean(dcg_val / (idcg_val + eps))) 34 | 35 | -------------------------------------------------------------------------------- /recommenders/metrics/success.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import tensorflow.keras.backend as K 3 | 4 | class KerasSuccess(object): 5 | def __init__(self, k): 6 | self.k = k 7 | self.__name__ = f"Success_at_{k}" 8 | self.less_is_better = False 9 | 10 | def __call__(self, y_true, y_pred): 11 | top_k = tf.nn.top_k(y_pred, self.k) 12 | gains = tf.gather(y_true, top_k.indices, batch_dims=1) 13 | user_success = K.sum(gains, axis=-1) 14 | return K.mean(user_success) 15 | 16 | -------------------------------------------------------------------------------- /recommenders/random_recommender.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from aprec.recommenders.recommender import Recommender 3 | 4 | 5 | class RandomRecommender(Recommender): 6 | def __init__(self): 7 | self.items_set = set() 8 | 9 | def add_action(self, action): 10 | self.items_set.add(action.item_id) 11 | 12 | def rebuild_model(self): 13 | self.items = list(self.items_set) 14 | 15 | def recommend(self, user_id, limit, features=None): 16 | recommended_items = np.random.choice(self.items, limit, replace=False) 17 | result = [] 18 | current_score = 1.0 19 | for item in recommended_items: 20 | result.append((item, current_score)) 21 | current_score *= 0.9 22 | return result 23 | -------------------------------------------------------------------------------- /recommenders/sequential/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asash/gsasrec/9dd47e83b949a84f68a4616c6f779df7f17ebb26/recommenders/sequential/__init__.py -------------------------------------------------------------------------------- /recommenders/sequential/featurizers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asash/gsasrec/9dd47e83b949a84f68a4616c6f779df7f17ebb26/recommenders/sequential/featurizers/__init__.py -------------------------------------------------------------------------------- /recommenders/sequential/featurizers/hashing_featurizer.py: -------------------------------------------------------------------------------- 1 | import mmh3 2 | 3 | class HashingFeaturizer(object): 4 | def __init__(self, num_cat_hashes=3, cat_hashes_space=1000): 5 | self.num_cat_hashes = num_cat_hashes 6 | self.cat_hashes_space = cat_hashes_space 7 | 8 | def __call__(self, obj): 9 | result = [] 10 | if type(obj.cat_features) == dict: 11 | features = list(obj.cat_features.items()) 12 | else: 13 | features = obj.cat_features 14 | 15 | for feature in features: 16 | for hash_num in range(self.num_cat_hashes): 17 | val = f"{feature[0]}_" + str(feature[1]) + f"_hash{hash_num}" 18 | hash_val = mmh3.hash(val) % self.cat_hashes_space + 1 19 | result.append(hash_val) 20 | return result 21 | 
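A quick sketch of how HashingFeaturizer maps categorical features to hashed ids (illustrative; SimpleNamespace stands in for any object exposing a cat_features dict, such as the repo's User or Item objects):

from types import SimpleNamespace
from aprec.recommenders.sequential.featurizers.hashing_featurizer import HashingFeaturizer

featurizer = HashingFeaturizer(num_cat_hashes=2, cat_hashes_space=100)
obj = SimpleNamespace(cat_features={"country": "US"})
# one feature times two hash functions -> two ids, each in [1, 100]
print(featurizer(obj))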
-------------------------------------------------------------------------------- /recommenders/sequential/history_vectorizers/add_mask_history_vectorizer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from aprec.recommenders.sequential.history_vectorizers.history_vectorizer import HistoryVectorizer 3 | 4 | class AddMaskHistoryVectorizer(HistoryVectorizer): 5 | def __call__(self, user_actions): 6 | mask = self.padding_value + 1 7 | if len(user_actions) >= self.sequence_len - 1: 8 | return np.array([action[1] for action in user_actions[-self.sequence_len + 1:]] + [mask]) 9 | else: 10 | n_special = self.sequence_len - 1 - len(user_actions) 11 | result_list = [self.padding_value] * n_special + [action[1] for action in user_actions] + [mask] 12 | return np.array(result_list) 13 | -------------------------------------------------------------------------------- /recommenders/sequential/history_vectorizers/default_history_vectorizer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from aprec.recommenders.sequential.history_vectorizers.history_vectorizer import HistoryVectorizer 3 | 4 | class DefaultHistoryVectrizer(HistoryVectorizer): 5 | def __call__(self, user_actions): 6 | if len(user_actions) >= self.sequence_len: 7 | return np.array([action[1] for action in user_actions[-self.sequence_len:]]) 8 | else: 9 | n_special = self.sequence_len - len(user_actions) 10 | result_list = [self.padding_value] * n_special + [action[1] for action in user_actions] 11 | return np.array(result_list) 12 | 13 | 14 | -------------------------------------------------------------------------------- /recommenders/sequential/history_vectorizers/history_vectorizer.py: -------------------------------------------------------------------------------- 1 | class HistoryVectorizer(object): 2 | def __init__(self) -> None: 3 | self.sequence_len = None 4 | self.padding_value = None 5 | 6 | def set_sequence_len(self, sequence_len): 7 | self.sequence_len = sequence_len 8 | 9 | def set_padding_value(self, padding_value): 10 | self.padding_value = padding_value 11 | 12 | def __call__(self, user_actions): 13 | raise NotImplementedError 14 | -------------------------------------------------------------------------------- /recommenders/sequential/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asash/gsasrec/9dd47e83b949a84f68a4616c6f779df7f17ebb26/recommenders/sequential/models/__init__.py -------------------------------------------------------------------------------- /recommenders/sequential/models/bert4rec/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asash/gsasrec/9dd47e83b949a84f68a4616c6f779df7f17ebb26/recommenders/sequential/models/bert4rec/__init__.py -------------------------------------------------------------------------------- /recommenders/sequential/models/bert4rec/special_items.py: -------------------------------------------------------------------------------- 1 | SPECIAL_ITEMS = { 2 | "PAD" : 0, 3 | "MASK": 1, 4 | "IGNORE": 2 5 | } -------------------------------------------------------------------------------- /recommenders/sequential/models/positional_encodings.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | 4 | class 
SinePositionEncoding(tf.keras.layers.Layer): 5 | def __init__( 6 | self, 7 | seq_length, 8 | hidden_size, 9 | max_wavelength=10000, 10 | **kwargs, 11 | ): 12 | super().__init__(**kwargs) 13 | self.max_wavelength = max_wavelength 14 | self.seq_length = seq_length 15 | self.hidden_size = hidden_size 16 | 17 | def call(self, positions): 18 | seq_length = self.seq_length 19 | hidden_size = self.hidden_size 20 | position = tf.cast(tf.range(seq_length), self.compute_dtype) 21 | min_freq = tf.cast(1 / self.max_wavelength, dtype=self.compute_dtype) 22 | timescales = tf.pow( 23 | min_freq, 24 | tf.cast(2 * (tf.range(hidden_size) // 2), self.compute_dtype) 25 | / tf.cast(hidden_size, self.compute_dtype), 26 | ) 27 | angles = tf.expand_dims(position, 1) * tf.expand_dims(timescales, 0) 28 | cos_mask = tf.cast(tf.range(hidden_size) % 2, self.compute_dtype) 29 | sin_mask = 1 - cos_mask 30 | positional_encodings = ( 31 | tf.sin(angles) * sin_mask + tf.cos(angles) * cos_mask 32 | ) 33 | return tf.gather(positional_encodings, positions) 34 | 35 | 36 | class ExpPositionEncoding(tf.keras.layers.Layer): 37 | def __init__(self, seq_len, emb_size, init=3, **kwargs): 38 | super().__init__(**kwargs) 39 | self.seq_len = seq_len 40 | self.emb_size = emb_size 41 | pows_initializer = tf.random_uniform_initializer(-init, init) 42 | self.pow = tf.Variable(initial_value=pows_initializer(shape=(emb_size, )), trainable=True) 43 | 44 | 45 | def __call__(self, positions): 46 | w = tf.exp(self.pow) 47 | for i in range(len(positions.shape)): 48 | w = tf.expand_dims(w, 0) 49 | tiles = list(positions.shape) + [1] 50 | w = tf.tile(w, tiles) 51 | positions_norm = tf.cast((positions+1), 'float32')/(self.seq_len+1) 52 | pos = tf.tile(tf.expand_dims(positions_norm, -1), [1] * len(positions.shape) + [self.emb_size]) 53 | return tf.pow(pos, w) 54 | 55 | def get_pos_embedding(seq_len, emb_size, kind): 56 | if (kind == 'default') or (kind == 'learnable'): 57 | return tf.keras.layers.Embedding(seq_len, output_dim=emb_size, dtype='float32') 58 | 59 | if kind == 'exp': 60 | return ExpPositionEncoding(seq_len, emb_size) 61 | 62 | if kind == 'sin': 63 | return SinePositionEncoding(seq_len, emb_size) 64 | 65 | 66 | -------------------------------------------------------------------------------- /recommenders/sequential/models/sasrec/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asash/gsasrec/9dd47e83b949a84f68a4616c6f779df7f17ebb26/recommenders/sequential/models/sasrec/__init__.py -------------------------------------------------------------------------------- /recommenders/sequential/models/sasrec/sasrec_multihead_attention.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | # this version of multihead attention was ported from the original SASRec implementation, 4 | # as it does some non-standard transformations, including the 'causality' (future-blinding) one 5 | def multihead_attention(queries, 6 | keys, 7 | num_heads, 8 | attention_layers, 9 | causality=False): 10 | Q = attention_layers["query_proj"](queries) # (N, T_q, C) 11 | K = attention_layers["key_proj"](keys) # (N, T_k, C) 12 | V = attention_layers["val_proj"](keys) # (N, T_k, C) 13 | 14 | # Split and concat 15 | Q_ = tf.concat(tf.split(Q, num_heads, axis=2), axis=0) # (h*N, T_q, C/h) 16 | K_ = tf.concat(tf.split(K, num_heads, axis=2), axis=0) # (h*N, T_k, C/h) 17 | V_ = tf.concat(tf.split(V, num_heads, axis=2), axis=0) # (h*N, T_k, C/h) 18 | 19 | # Multiplication
20 | outputs = tf.matmul(Q_, tf.transpose(K_, [0, 2, 1])) # (h*N, T_q, T_k) 21 | 22 | # Scale 23 | outputs = outputs / (K_.get_shape().as_list()[-1] ** 0.5) 24 | 25 | # Key Masking 26 | key_masks = tf.sign(tf.reduce_sum(tf.abs(keys), axis=-1)) # (N, T_k) 27 | key_masks = tf.tile(key_masks, [num_heads, 1]) # (h*N, T_k) 28 | key_masks = tf.tile(tf.expand_dims(key_masks, 1), [1, tf.shape(queries)[1], 1]) # (h*N, T_q, T_k) 29 | 30 | paddings = tf.ones_like(outputs) * (-2 ** 32 + 1) 31 | outputs = tf.where(tf.equal(key_masks, 0), paddings, outputs) # (h*N, T_q, T_k) 32 | 33 | # Causality = Future blinding 34 | if causality: 35 | diag_vals = tf.ones_like(outputs[0, :, :]) # (T_q, T_k) 36 | tril = tf.linalg.LinearOperatorLowerTriangular(diag_vals).to_dense() # (T_q, T_k) 37 | masks = tf.tile(tf.expand_dims(tril, 0), [tf.shape(outputs)[0], 1, 1]) # (h*N, T_q, T_k) 38 | 39 | paddings = tf.ones_like(masks) * (-2 ** 32 + 1) 40 | outputs = tf.where(tf.equal(masks, 0), paddings, outputs) # (h*N, T_q, T_k) 41 | 42 | # Activation 43 | outputs = tf.nn.softmax(outputs) # (h*N, T_q, T_k) 44 | 45 | # Query Masking 46 | query_masks = tf.sign(tf.reduce_sum(tf.abs(queries), axis=-1)) # (N, T_q) 47 | query_masks = tf.tile(query_masks, [num_heads, 1]) # (h*N, T_q) 48 | query_masks = tf.tile(tf.expand_dims(query_masks, -1), [1, 1, tf.shape(keys)[1]]) # (h*N, T_q, T_k) 49 | outputs *= query_masks # broadcasting. (N, T_q, C) 50 | 51 | attention_weights = outputs 52 | 53 | # Dropouts 54 | outputs = attention_layers["dropout"](outputs) 55 | 56 | # Weighted sum 57 | outputs = tf.matmul(outputs, V_) # ( h*N, T_q, C/h) 58 | 59 | # Restore shape 60 | outputs = tf.concat(tf.split(outputs, num_heads, axis=0), axis=2) # (N, T_q, C) 61 | return outputs, attention_weights 62 | -------------------------------------------------------------------------------- /recommenders/sequential/models/sequential_recsys_model.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | from typing import List, Type 3 | import tensorflow as tf 4 | 5 | class SequentialModelConfig(object): 6 | def __init__(self): 7 | self.config = {} 8 | 9 | def as_dict(self) -> dict: 10 | return self.config 11 | 12 | def get_model_architecture(self) -> Type[SequentialRecsysModel]: 13 | raise NotImplementedError() 14 | 15 | 16 | class SequentialDataParameters(object): 17 | def __init__(self, num_users, num_items, sequence_length, batch_size) -> None: 18 | self.num_users = num_users 19 | self.num_items = num_items 20 | self.sequence_length = sequence_length 21 | self.batch_size = batch_size 22 | 23 | def as_dict(self): 24 | return self.__dict__ 25 | 26 | class SequentialRecsysModel(tf.keras.Model): 27 | @classmethod 28 | def get_model_config_class(cls) -> Type[SequentialModelConfig]: 29 | raise NotImplementedError() 30 | 31 | def __init__(self, model_parameters: SequentialModelConfig, data_parameters: SequentialDataParameters, *args, **kwargs): 32 | super().__init__(*args, **kwargs) 33 | self.model_parameters = model_parameters 34 | self.data_parameters = data_parameters 35 | 36 | def get_dummy_inputs(self) -> List[tf.Tensor]: 37 | raise NotImplementedError() 38 | 39 | def fit_biases(self, train_users): 40 | pass 41 | 42 | #write tensorboard stuff (metrics) here 43 | def log(self): 44 | pass 45 | 46 | @classmethod 47 | def from_config(cls, config: dict): 48 | data_parameters = SequentialDataParameters(**config['data_parameters']) 49 | model_parameters = 
cls.get_model_config_class()(**config['model_parameters']) 50 | model = cls(model_parameters, data_parameters) 51 | dummy_data = model.get_dummy_inputs() 52 | model(dummy_data, training=False) #dummy call to build the model 53 | return model 54 | 55 | def get_config(self): 56 | return get_config_dict(self.model_parameters, self.data_parameters) 57 | 58 | 59 | 60 | def get_sequential_model(model_config: SequentialModelConfig, data_parameters: SequentialDataParameters): 61 | config = get_config_dict(model_config, data_parameters) 62 | model_arch = model_config.get_model_architecture() 63 | return model_arch.from_config(config) 64 | 65 | def get_config_dict(model_config, data_parameters): 66 | model_config_dict = model_config.as_dict() 67 | data_parameters_dict = data_parameters.as_dict() 68 | config = {'model_parameters': model_config_dict, 'data_parameters': data_parameters_dict} 69 | return config 70 | 71 | 72 | -------------------------------------------------------------------------------- /recommenders/sequential/models/vit4rec.py: -------------------------------------------------------------------------------- 1 | from aprec.recommenders.sequential.models.sequential_recsys_model import SequentialRecsysModelBuilder 2 | from tensorflow.keras import Model 3 | from tensorflow.keras.layers import Embedding, Dense 4 | import tensorflow as tf 5 | from transformers import TFViTModel 6 | from tensorflow.keras import activations 7 | 8 | 9 | class Vit4Rec(SequentialRecsysModelBuilder): 10 | VIT_SIZE = 224 11 | def __init__(self, output_layer_activation='linear', embedding_size=None, max_history_len=VIT_SIZE): 12 | super().__init__(output_layer_activation, embedding_size, max_history_len) 13 | 14 | def get_model(self): 15 | return Vit4RecModel(self.num_items, self.output_layer_activation) 16 | 17 | 18 | 19 | class Vit4RecModel(Model): 20 | def __init__(self, n_items, activation): 21 | super().__init__() 22 | vit_image_size = Vit4Rec.VIT_SIZE 23 | VIT_EMBEDDING_SIZE=768 24 | self.embeddings_r = Embedding(n_items+1, vit_image_size) 25 | self.embeddings_g = Embedding(n_items+1, vit_image_size) 26 | self.embeddings_b = Embedding(n_items+1, vit_image_size) 27 | self.projection = Dense(VIT_EMBEDDING_SIZE, input_shape=(3 * vit_image_size,)) 28 | self.n_items = n_items 29 | self.all_items = tf.range(0, self.n_items) 30 | self.vit = TFViTModel.from_pretrained("google/vit-base-patch16-224") 31 | self.output_activation = activations.get(activation) 32 | 33 | def call(self, inputs): 34 | seqs = inputs[0] 35 | red = self.embeddings_r(seqs) 36 | green = self.embeddings_g(seqs) 37 | blue = self.embeddings_b(seqs) 38 | images = tf.tanh(tf.stack([red, green, blue], axis=1)) 39 | encoded = self.vit(images).pooler_output 40 | all_items_red = self.embeddings_r(self.all_items) 41 | all_items_green = self.embeddings_g(self.all_items) # green channel must use the green embedding table 42 | all_items_blue = self.embeddings_b(self.all_items) # blue channel must use the blue embedding table 43 | all_embs = self.projection(tf.concat([all_items_red, all_items_green, all_items_blue], axis=1)) 44 | result = self.output_activation(tf.einsum("be, ie -> bi", encoded, all_embs)) 45 | return result 46 | 47 | 48 | -------------------------------------------------------------------------------- /recommenders/sequential/samplers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asash/gsasrec/9dd47e83b949a84f68a4616c6f779df7f17ebb26/recommenders/sequential/samplers/__init__.py -------------------------------------------------------------------------------- 
/recommenders/sequential/samplers/idf_sampler.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import tensorflow_probability as tfp 4 | import tqdm 5 | from aprec.recommenders.sequential.samplers.sampler import NegativesSampler 6 | from aprec.recommenders.sequential.models.sequential_recsys_model import SequentialDataParameters 7 | 8 | 9 | class IDFSampler(NegativesSampler): 10 | def __init__(self, data_parameters: SequentialDataParameters, num_negatives: int) -> None: 11 | super().__init__(data_parameters, num_negatives) 12 | weights = tf.random.uniform((self.data_parameters.num_items,), 0, 1) 13 | self.reset_logits(weights) 14 | 15 | def reset_logits(self, weights): 16 | probs = weights/tf.reduce_sum(weights) 17 | self.logits = tf.expand_dims(tf.math.log(probs/tf.reduce_sum(probs)), 0) 18 | 19 | def fit(self, train_users): 20 | print("fitting idf negatives sampler...") 21 | item_counts = np.zeros(self.data_parameters.num_items) 22 | for user_seq in tqdm.tqdm(train_users): 23 | for timestamp, item in user_seq: 24 | item_counts[item] += 1 25 | item_counts = tf.constant(item_counts, 'float32') 26 | numerator = tf.expand_dims(tf.constant(len(train_users), 'float32'), 0) 27 | EPS = 1e-9 28 | #for items with zero interactions divide_no_nan yields 0 and the log is negative, so we take relu to clamp their weight to 0 29 | inverted_counts = tf.nn.relu(tf.math.log(tf.math.divide_no_nan(numerator, item_counts) + EPS)) 30 | self.reset_logits(inverted_counts) 31 | 32 | def __call__(self, masked_sequences, labels): 33 | negatives = tf.random.categorical(self.logits, self.data_parameters.batch_size*self.data_parameters.sequence_length*self.num_negatives) 34 | negatives = tf.reshape(negatives, (self.data_parameters.batch_size, self.data_parameters.sequence_length, self.num_negatives)) 35 | return negatives 36 | 37 | 38 | -------------------------------------------------------------------------------- /recommenders/sequential/samplers/popularity_sampler.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import tensorflow_probability as tfp 4 | import tqdm 5 | from aprec.recommenders.sequential.samplers.sampler import NegativesSampler 6 | from aprec.recommenders.sequential.models.sequential_recsys_model import SequentialDataParameters 7 | 8 | 9 | class PopularitySampler(NegativesSampler): 10 | def __init__(self, data_parameters: SequentialDataParameters, num_negatives: int) -> None: 11 | super().__init__(data_parameters, num_negatives) 12 | weights = tf.random.uniform((self.data_parameters.num_items,), 0, 1) 13 | self.reset_logits(weights) 14 | 15 | def reset_logits(self, weights): 16 | probs = weights/tf.reduce_sum(weights) 17 | self.logits = tf.expand_dims(tf.math.log(probs/tf.reduce_sum(probs)), 0) 18 | 19 | def fit(self, train_users): 20 | print("fitting popularity negatives sampler...") 21 | item_counts = np.zeros(self.data_parameters.num_items) 22 | for user_seq in tqdm.tqdm(train_users): 23 | for timestamp, item in user_seq: 24 | item_counts[item] += 1 25 | self.reset_logits(tf.convert_to_tensor(item_counts)) 26 | 27 | def __call__(self, masked_sequences, labels): 28 | negatives = tf.random.categorical(self.logits, self.data_parameters.batch_size*self.data_parameters.sequence_length*self.num_negatives) 29 | negatives = tf.reshape(negatives, (self.data_parameters.batch_size, self.data_parameters.sequence_length, self.num_negatives)) 30 | return 
negatives 31 | 32 | -------------------------------------------------------------------------------- /recommenders/sequential/samplers/random_sampler.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from aprec.recommenders.sequential.samplers.sampler import NegativesSampler 3 | from aprec.recommenders.sequential.models.sequential_recsys_model import SequentialDataParameters 4 | 5 | 6 | class RandomNegativesSampler(NegativesSampler): 7 | def __init__(self, data_parameters: SequentialDataParameters, num_negatives:int) -> None: 8 | super().__init__(data_parameters, num_negatives) 9 | 10 | def fit(self, training_sequences): 11 | pass 12 | 13 | def __call__(self, masked_sequences, labels): 14 | negatives = tf.random.uniform((self.data_parameters.batch_size, 15 | self.data_parameters.sequence_length, 16 | self.num_negatives), dtype='int64', maxval=self.data_parameters.num_items) 17 | return negatives 18 | -------------------------------------------------------------------------------- /recommenders/sequential/samplers/sampler.py: -------------------------------------------------------------------------------- 1 | from aprec.recommenders.sequential.models.sequential_recsys_model import SequentialDataParameters 2 | 3 | 4 | class NegativesSampler(object): 5 | def __init__(self, data_parameters: SequentialDataParameters, num_negatives: int) -> None: 6 | self.data_parameters = data_parameters 7 | self.num_negatives = num_negatives 8 | 9 | def fit(self, train_users): 10 | pass 11 | 12 | def __call__(self, masked_sequences, labels): 13 | raise NotImplementedError() 14 | 15 | 16 | def get_negatives_sampler(sampler_name, data_parameters, num_negatives) -> NegativesSampler: 17 | if sampler_name == "random": 18 | from aprec.recommenders.sequential.samplers.random_sampler import RandomNegativesSampler 19 | return RandomNegativesSampler(data_parameters, num_negatives) 20 | 21 | elif sampler_name == "popularity": 22 | from aprec.recommenders.sequential.samplers.popularity_sampler import PopularitySampler 23 | return PopularitySampler(data_parameters, num_negatives) 24 | 25 | elif sampler_name == "idf": 26 | from aprec.recommenders.sequential.samplers.idf_sampler import IDFSampler 27 | return IDFSampler(data_parameters, num_negatives) 28 | else: 29 | raise Exception(f"wrong negatives sampler name {sampler_name}") -------------------------------------------------------------------------------- /recommenders/sequential/target_builders/full_matrix_targets_builder.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy.sparse.csr import csr_matrix 3 | from aprec.recommenders.sequential.target_builders.target_builders import TargetBuilder 4 | 5 | 6 | class FullMatrixTargetsBuilder(TargetBuilder): 7 | def __init__(self, max_target_label=1.0, target_decay=1.0, min_target_val=0.1): 8 | self.max_target_label = max_target_label 9 | self.target_decay = target_decay 10 | self.min_target_val = min_target_val 11 | 12 | def build(self, user_targets): 13 | rows = [] 14 | cols = [] 15 | vals = [] 16 | for i in range(len(user_targets)): 17 | cur_val = self.max_target_label 18 | for action_num in range(len(user_targets[i])): 19 | action = user_targets[i][action_num] 20 | rows.append(i) 21 | cols.append(action[1]) 22 | vals.append(cur_val) 23 | cur_val *= self.target_decay 24 | if cur_val < self.min_target_val: 25 | cur_val = self.min_target_val 26 | self.target_matrix = csr_matrix((vals, (rows, 
cols)), shape=(len(user_targets), self.n_items), 27 | dtype='float32') 28 | def get_targets(self, start, end): 29 | target_inputs = [] 30 | target_outputs = np.array(self.target_matrix[start:end].todense()) 31 | return target_inputs, target_outputs -------------------------------------------------------------------------------- /recommenders/sequential/target_builders/items_masking_target_builder.py: -------------------------------------------------------------------------------- 1 | from random import Random 2 | 3 | import numpy as np 4 | from aprec.recommenders.sequential.target_builders.target_builders import TargetBuilder 5 | 6 | class ItemsMaskingTargetsBuilder(TargetBuilder): 7 | def __init__(self, random_seed=31337, 8 | relative_positions_encoding = False, 9 | ignore_value=-100): #-100 is used by default in Hugging Face's BERT implementation 10 | self.random = Random() 11 | self.random.seed(random_seed) 12 | self.targets = [] 13 | self.ignore_value = ignore_value 14 | self.relative_positions_encoding = relative_positions_encoding 15 | self.positions = [] 16 | 17 | def build(self, user_targets): 18 | targets = [] 19 | positions = [] 20 | for seq_len, user in user_targets: 21 | user_positions = [] 22 | user_target = [self.ignore_value] * self.sequence_len 23 | if self.relative_positions_encoding: 24 | split_pos = self.random.randint(self.sequence_len - seq_len, self.sequence_len - 1) 25 | else: 26 | split_pos = self.sequence_len - 1 27 | 28 | for i in range(self.sequence_len): 29 | user_positions.append(self.sequence_len - split_pos + i) 30 | 31 | positions.append(user_positions) 32 | for pos in user: 33 | user_target[pos[0]] = pos[1][1] 34 | 35 | targets.append(user_target) 36 | 37 | self.positions = np.array(positions) 38 | self.targets = np.array(targets) 39 | 40 | 41 | 42 | def get_targets(self, start, end): 43 | return [self.targets[start:end], self.positions[start:end]], self.targets[start:end] 44 | -------------------------------------------------------------------------------- /recommenders/sequential/target_builders/negative_per_positive_target.py: -------------------------------------------------------------------------------- 1 | from random import Random 2 | 3 | import numpy as np 4 | from aprec.recommenders.sequential.target_builders.target_builders import TargetBuilder 5 | 6 | 7 | class NegativePerPositiveTargetBuilder(TargetBuilder): 8 | def __init__(self, sequence_len=64, random_seed=31337): 9 | self.random = Random() 10 | self.random.seed(random_seed) 11 | self.sequence_len = sequence_len 12 | 13 | def build(self, user_targets): 14 | self.inputs = [] 15 | self.targets = [] 16 | for i in range(len(user_targets)): 17 | user_inputs = [] 18 | targets_for_user = [] 19 | seq = user_targets[i] 20 | if len(seq) < self.sequence_len: 21 | user_inputs += [[self.n_items, self.n_items]] * (self.sequence_len - len(seq)) 22 | targets_for_user += [[-1.0, -1.0]] * (self.sequence_len - len(seq)) 23 | for target in seq[-self.sequence_len:]: 24 | positive = target[1] 25 | negative = self.random.randint(0, self.n_items - 1) 26 | while negative == positive: 27 | negative = self.random.randint(0, self.n_items - 1) 28 | user_inputs.append([positive, negative]) 29 | targets_for_user.append([1.0, 0.0]) 30 | self.inputs.append(user_inputs) 31 | self.targets.append(targets_for_user) 32 | self.inputs = np.array(self.inputs) 33 | self.targets = np.array(self.targets) 34 | 35 | def get_targets(self, start, end): 36 | return [self.inputs[start:end]], self.targets[start:end] 37 | 38 | 39 | 40 | 41 | 
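A minimal sketch of the NegativePerPositiveTargetBuilder contract (illustrative values; set_n_items comes from the TargetBuilder base class shown further below):

from aprec.recommenders.sequential.target_builders.negative_per_positive_target import NegativePerPositiveTargetBuilder

builder = NegativePerPositiveTargetBuilder(sequence_len=4)
builder.set_n_items(10)
builder.build([[(0, 3), (1, 7)]])  # one user with two (timestamp, item_id) targets
inputs, targets = builder.get_targets(0, 1)
# inputs[0] has shape (1, 4, 2): per-position [positive, sampled negative] pairs,
# left-padded with [n_items, n_items]; targets has shape (1, 4, 2) with [1.0, 0.0]
# labels for real pairs and [-1.0, -1.0] for padding positions.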
-------------------------------------------------------------------------------- /recommenders/sequential/target_builders/positives_only_targets_builder.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from aprec.recommenders.sequential.target_builders.target_builders import TargetBuilder 3 | 4 | 5 | class PositvesOnlyTargetBuilder(TargetBuilder): 6 | def __init__(self, max_targets_per_user = 10): 7 | self.max_targets_per_user = max_targets_per_user 8 | 9 | def build(self, user_targets): 10 | result = [] 11 | for i in range(len(user_targets)): 12 | seq = np.array([item[1] for item in user_targets[i]]) 13 | if len(seq) > self.max_targets_per_user: 14 | seq = np.random.choice(seq, self.max_targets_per_user) 15 | if len(seq) < self.max_targets_per_user: 16 | seq = np.pad(seq, (0, self.max_targets_per_user - len(seq)), constant_values=self.n_items) 17 | result.append(seq) 18 | self.target_matrix = np.array(result) 19 | pass 20 | 21 | def get_targets(self, start, end): 22 | target_outputs = self.target_matrix[start:end] 23 | return [target_outputs], target_outputs -------------------------------------------------------------------------------- /recommenders/sequential/target_builders/positives_sequence_target_builder.py: -------------------------------------------------------------------------------- 1 | from random import Random 2 | 3 | import numpy as np 4 | from aprec.recommenders.sequential.target_builders.target_builders import TargetBuilder 5 | 6 | 7 | class PositivesSequenceTargetBuilder(TargetBuilder): 8 | def __init__(self, sequence_len=64): 9 | self.random = Random() 10 | self.sequence_len = sequence_len 11 | 12 | def build(self, user_targets): 13 | self.targets = [] 14 | for i in range(len(user_targets)): 15 | targets_for_user = [] 16 | seq = user_targets[i] 17 | if len(seq) < self.sequence_len: 18 | targets_for_user += [-100.0] * (self.sequence_len - len(seq)) 19 | for target in seq[-self.sequence_len:]: 20 | targets_for_user.append(target[1]) 21 | self.targets.append(targets_for_user) 22 | self.targets = np.array(self.targets, 'int64') 23 | 24 | def get_targets(self, start, end): 25 | return [self.targets[start:end]], self.targets[start:end] 26 | 27 | 28 | 29 | 30 | -------------------------------------------------------------------------------- /recommenders/sequential/target_builders/sampled_matrix_target_builder.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | import numpy as np 4 | from aprec.recommenders.sequential.target_builders.target_builders import TargetBuilder 5 | 6 | 7 | class SampledMatrixBuilder(TargetBuilder): 8 | def __init__(self, max_target_label=1.0, target_decay=1.0, min_target_val=0.1, n_samples=101): 9 | self.max_target_label = max_target_label 10 | self.target_decay = target_decay 11 | self.min_target_val = min_target_val 12 | self.n_samples = n_samples 13 | 14 | def build(self, user_targets): 15 | all_items = list(range(self.n_items)) 16 | self.target_matrix = [] 17 | self.target_ids = [] 18 | for i in range(len(user_targets)): 19 | targets = [] 20 | target_ids = [] 21 | sampled = set() 22 | cur_val = self.max_target_label 23 | for action_num in range(len(user_targets[i])): 24 | action = user_targets[i][action_num] 25 | targets.append(cur_val) 26 | target_ids.append(action[1]) 27 | sampled.add(action[1]) 28 | cur_val *= self.target_decay 29 | if cur_val < self.min_target_val: 30 | cur_val = self.min_target_val 31 | 
sampled.add(action[1]) 32 | while len(targets) < self.n_samples: 33 | negatives = np.random.choice(all_items, self.n_samples - len(targets)) 34 | for item_id in negatives: 35 | if item_id not in sampled: 36 | sampled.add(item_id) 37 | target_ids.append(item_id) 38 | targets.append(0.0) 39 | targets_with_ids = list(zip(targets, target_ids)) 40 | random.shuffle(targets_with_ids) 41 | targets, target_ids = zip(*targets_with_ids) 42 | self.target_matrix.append(targets) 43 | self.target_ids.append(target_ids) 44 | self.target_matrix = np.array(self.target_matrix) 45 | self.target_ids = np.array(self.target_ids) 46 | 47 | def get_targets(self, start, end): 48 | target_inputs = [self.target_ids[start:end]] 49 | target_outputs = self.target_matrix[start:end] 50 | return target_inputs, target_outputs 51 | 52 | 53 | -------------------------------------------------------------------------------- /recommenders/sequential/target_builders/target_builders.py: -------------------------------------------------------------------------------- 1 | class TargetBuilder(object): 2 | def __init__(self): 3 | pass 4 | 5 | def set_n_items(self, n): 6 | self.n_items = n 7 | 8 | def set_sequence_len(self, sequence_len): 9 | self.sequence_len = sequence_len 10 | 11 | def build(self, user_targets): 12 | raise NotImplementedError() 13 | 14 | def set_train_sequences(self, train_sequences): 15 | pass 16 | 17 | def get_targets(self, start, end): 18 | raise NotImplementedError() 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /recommenders/sequential/targetsplitters/fair_item_masking.py: -------------------------------------------------------------------------------- 1 | from typing import List #was "from ast import List", which breaks the List[int] annotation below 2 | from collections import Counter 3 | import copy 4 | from typing import Dict 5 | 6 | import numpy as np 7 | 8 | from aprec.recommenders.sequential.targetsplitters.targetsplitter import TargetSplitter 9 | 10 | class FairItemMasking(TargetSplitter): 11 | def __init__(self, masking_prob = 0.2, 12 | max_predictions_per_seq = 20, 13 | random_seed = 31337 14 | ) -> None: 15 | super().__init__() 16 | self.masking_prob = masking_prob 17 | self.max_predictions_per_seq = max_predictions_per_seq 18 | self.random = np.random.default_rng(random_seed) #assumed default: the file never defined the RNG used in split() 19 | self.force_last = False #assumed default: mask sampled positions rather than only the last item 20 | def set_item_attributes(self, item_attributes: List[int]): 21 | self.item_attributes = item_attributes 22 | self.temperature = np.ones(len(self.item_attributes)) 23 | 24 | 25 | def set_actions(self, actions): 26 | attribute_counts = Counter() 27 | for action in actions: 28 | attribute_counts[action.item_id] += 1 #was a bare expression (a no-op); counting is clearly intended 29 | self.attribute_counts = attribute_counts #assumed: keep the counts, which the original discarded 30 | 31 | 32 | 33 | def split(self, sequence): 34 | seq = sequence[-self.seqence_len: ] 35 | seq_len = len(seq) 36 | 37 | if len(seq) < self.seqence_len: 38 | seq = [(-1, self.num_items)] * (self.seqence_len - len(seq)) + seq 39 | 40 | if not self.force_last: 41 | n_masks = min(self.max_predictions_per_seq, 42 | max(1, int(round(len(sequence) * self.masking_prob)))) 43 | sample_range = list(range(len(seq) - seq_len, len(seq))) 44 | rss_vals = np.ones(len(sample_range)) #uniform probabilities; the original referenced an undefined self.recency_importance (the weighted variant lives in ItemsMasking below) 45 | rss_vals_sum = np.sum(rss_vals) 46 | probs = rss_vals / rss_vals_sum 47 | mask_positions = self.random.choice(sample_range, n_masks, p=probs, replace=False) 48 | else: 49 | n_masks = 1 50 | mask_positions = [len(seq) - 1] 51 | train = copy.deepcopy(seq) 52 | labels = [] 53 | mask_token = self.num_items + 1 #self.num_items is used for padding 54 | for position in mask_positions: 55 | labels.append((position, seq[position])) 56 | train[position] = (train[position][0],
mask_token) 57 | return train, (seq_len, labels) 58 | -------------------------------------------------------------------------------- /recommenders/sequential/targetsplitters/items_masking.py: -------------------------------------------------------------------------------- 1 | import copy 2 | 3 | import numpy as np 4 | 5 | from aprec.recommenders.sequential.targetsplitters.targetsplitter import TargetSplitter 6 | 7 | class ItemsMasking(TargetSplitter): 8 | def __init__(self, masking_prob = 0.2, 9 | max_predictions_per_seq = 20, 10 | random_seed = 31337, 11 | force_last=False, 12 | recency_importance = lambda n, k: 1, 13 | tuning_samples_prob = 0.0 14 | ) -> None: 15 | super().__init__() 16 | self.masking_prob = masking_prob 17 | self.max_predictions_per_seq = max_predictions_per_seq 18 | self.random = np.random.Generator(np.random.PCG64(np.random.SeedSequence(random_seed))) 19 | self.tuning_random = np.random.Generator(np.random.PCG64(np.random.SeedSequence(random_seed+1))) 20 | self.force_last = force_last 21 | self.recency_importance = recency_importance 22 | self.tuning_samples_prob = tuning_samples_prob 23 | 24 | def split(self, sequence): 25 | seq = sequence[-self.seqence_len: ] 26 | seq_len = len(seq) 27 | 28 | if len(seq) < self.seqence_len: 29 | seq = [(-1, self.num_items)] * (self.seqence_len - len(seq)) + seq 30 | 31 | if not self.force_last and self.tuning_random.random() > self.tuning_samples_prob: 32 | n_masks = min(self.max_predictions_per_seq, 33 | max(1, int(round(len(sequence) * self.masking_prob)))) 34 | sample_range = list(range(len(seq) - seq_len, len(seq))) 35 | rss_vals = np.array([self.recency_importance(self.seqence_len, pos) for pos in sample_range]) 36 | rss_vals_sum = np.sum(rss_vals) 37 | probs = rss_vals / rss_vals_sum 38 | mask_positions = self.random.choice(sample_range, n_masks, p=probs, replace=False) 39 | else: 40 | n_masks = 1 41 | mask_positions = [len(seq) - 1] 42 | train = copy.deepcopy(seq) 43 | labels = [] 44 | mask_token = self.num_items + 1 #self.num_items is used for padding 45 | for position in mask_positions: 46 | labels.append((position, seq[position])) 47 | train[position] = (train[position][0], mask_token) 48 | return train, (seq_len, labels) 49 | -------------------------------------------------------------------------------- /recommenders/sequential/targetsplitters/last_item_splitter.py: -------------------------------------------------------------------------------- 1 | import copy 2 | from aprec.recommenders.sequential.targetsplitters.targetsplitter import TargetSplitter 3 | 4 | 5 | class SequenceContinuation(TargetSplitter): 6 | def __init__(self, add_cls=False) -> None: 7 | super().__init__() 8 | self.add_cls = add_cls 9 | 10 | def split(self, sequence, max_targets=1): 11 | if len(sequence) == 0: 12 | return [], [] 13 | train = sequence[:-max_targets] 14 | 15 | target = sequence[-max_targets:] 16 | 17 | if self.add_cls: 18 | cls_token = self.num_items + 1 #self.num_items is used for padding 19 | for t in target: 20 | cls = (t[0], cls_token) 21 | train.append(cls) 22 | return train, target -------------------------------------------------------------------------------- /recommenders/sequential/targetsplitters/random_fraction_splitter.py: -------------------------------------------------------------------------------- 1 | from random import Random 2 | from aprec.recommenders.sequential.targetsplitters.targetsplitter import TargetSplitter 3 | 4 | 5 | class RandomFractionSplitter(TargetSplitter): 6 | def __init__(self, min_targets=1, 
random_seed=31337) -> None: 7 | super().__init__() 8 | self.min_targets = min_targets 9 | self.random = Random(random_seed) 10 | 11 | def split(self, sequence): 12 | if len(sequence) == 0: 13 | return [], [] 14 | target_actions = self.random.randint(1, max(len(sequence) - 1, 1)) 15 | train_actions = len(sequence) - target_actions 16 | return sequence[:train_actions], sequence[-target_actions:] 17 | 18 | -------------------------------------------------------------------------------- /recommenders/sequential/targetsplitters/random_splitter.py: -------------------------------------------------------------------------------- 1 | import random 2 | from aprec.recommenders.sequential.targetsplitters.targetsplitter import TargetSplitter 3 | 4 | 5 | class RandomSplitter(TargetSplitter): 6 | def __init__(self, seed=31337, target_chance = 0.25) -> None: 7 | self.random = random.Random() 8 | self.random.seed(seed) 9 | self.target_chance = target_chance 10 | super().__init__() 11 | 12 | def split(self, sequence): 13 | input = [] 14 | target = [] 15 | for item in sequence: 16 | if self.random.random() < self.target_chance: 17 | target.append(item) 18 | else: 19 | input.append(item) 20 | return input, target 21 | -------------------------------------------------------------------------------- /recommenders/sequential/targetsplitters/recency_sequence_sampling.py: -------------------------------------------------------------------------------- 1 | import math 2 | from aprec.recommenders.sequential.targetsplitters.targetsplitter import TargetSplitter 3 | import numpy as np 4 | 5 | def exponential_importance(p):  6 | return lambda n, k: p**(n - k) 7 | 8 | def linear_importance(a=1, b=1): 9 | return lambda n, k: a*k+b 10 | 11 | def pow_importance(p, c=0): 12 | def func(n, k): 13 | return math.pow((k+1)/(n+1), math.exp(p)) + c 14 | return func 15 | 16 | class RecencySequenceSampling(TargetSplitter): 17 | #recency_importance is a function that defines the chances of the k-th element 18 | #to be sampled as a positive in a sequence of length n 19 | 20 | def __init__(self, max_pct, recency_importance=exponential_importance(0.8), seed=31337, add_cls = False) -> None: 21 | super().__init__() 22 | self.max_pct = max_pct 23 | self.recency_importance = recency_importance 24 | self.random = np.random.default_rng(seed=seed) 25 | self.add_cls = add_cls 26 | 27 | 28 | def split(self, sequence): 29 | if len(sequence) == 0: 30 | return [], [] 31 | target = set() 32 | cnt = max(1, int(len(sequence)*self.max_pct)) 33 | f = lambda j: self.recency_importance(len(sequence), j) 34 | f_vals = np.array([f(i) for i in range(len(sequence))]) 35 | f_sum = sum(f_vals) 36 | sampled_idx = set(self.random.choice(range(len(sequence)), cnt, p=f_vals/f_sum, replace=True)) 37 | input = list() 38 | for i in range(len(sequence)): 39 | if i not in sampled_idx: 40 | input.append(sequence[i]) 41 | else: 42 | target.add(sequence[i]) 43 | 44 | if self.add_cls: 45 | if len(input) > 0: 46 | last_input_timestamp = input[-1][0] 47 | else: 48 | last_input_timestamp = 1 49 | cls_token = self.num_items + 1 #self.num_items is used for padding 50 | input.append((last_input_timestamp + 1, cls_token)) 51 | return input, list(target) 52 | 53 | 54 | -------------------------------------------------------------------------------- /recommenders/sequential/targetsplitters/shifted_sequence_splitter.py: -------------------------------------------------------------------------------- 1 | from random import Random 2 | from
aprec.recommenders.sequential.targetsplitters.targetsplitter import TargetSplitter 3 | 4 | 5 | class ShiftedSequenceSplitter(TargetSplitter): 6 | def __init__(self) -> None: 7 | super().__init__() 8 | 9 | def split(self, sequence): 10 | train = sequence[-self.seqence_len - 1: -1] 11 | label = sequence[-len(train):] 12 | return train, label 13 | 14 | -------------------------------------------------------------------------------- /recommenders/sequential/targetsplitters/targetsplitter.py: -------------------------------------------------------------------------------- 1 | class TargetSplitter(object): 2 | def __init__(self) -> None: 3 | self.num_items = None 4 | self.seqence_len = None 5 | 6 | def split(self, sequence): 7 | raise NotImplementedError() 8 | 9 | def set_num_items(self, num_items): 10 | self.num_items = num_items 11 | 12 | def set_actions(self, actions): 13 | pass #most target splitters do not require actions beforehand. 14 | 15 | def set_sequence_len(self, sequence_len): 16 | self.seqence_len = sequence_len 17 | -------------------------------------------------------------------------------- /recommenders/top_recommender.py: -------------------------------------------------------------------------------- 1 | from aprec.recommenders.recommender import Recommender 2 | from collections import Counter 3 | 4 | class TopRecommender(Recommender): 5 | def __init__(self, recency=1.0): #recency controls which fraction of the most recent actions is counted 6 | super().__init__() 7 | self.items_counter = Counter() 8 | self.item_scores = {} 9 | self.actions = [] 10 | self.recency = recency 11 | 12 | def add_action(self, action): 13 | self.actions.append(action) 14 | 15 | def rebuild_model(self): 16 | self.actions.sort(key=lambda x: x.timestamp) 17 | n_actions = int(len(self.actions) * self.recency) 18 | for action in self.actions[-n_actions:]: 19 | self.items_counter[action.item_id] += 1 20 | self.actions = [] 21 | self.most_common = self.items_counter.most_common() 22 | for item, score in self.most_common: 23 | self.item_scores[item] = score 24 | 25 | def recommend(self, user_id, limit, features=None): 26 | return self.most_common[:limit] 27 | 28 | def get_metadata(self): 29 | return {"top 20 items": self.most_common[:20]} 30 | 31 | 32 | def get_similar_items(self, item_id, limit): 33 | return self.most_common[:limit] 34 | 35 | def name(self): 36 | return "TopItemsRecommender" 37 | 38 | def get_item_rankings(self): 39 | result = {} 40 | for request in self.items_ranking_requests: 41 | request_result = [] 42 | for item_id in request.item_ids: 43 | score = self.item_scores.get(item_id, 0) 44 | request_result.append((item_id, score)) 45 | request_result.sort(key=lambda x: -x[1]) 46 | result[request.user_id] = request_result 47 | return result 48 | 49 | 50 | 51 | -------------------------------------------------------------------------------- /tests/.gitignore: -------------------------------------------------------------------------------- 1 | log_tensorboard/ 2 | saved/ 3 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | #physical_devices = tf.config.list_physical_devices('GPU') 4 | #tf.config.experimental.set_memory_growth(physical_devices[0], True) 5 | -------------------------------------------------------------------------------- /tests/datasets/__init__.py:
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/asash/gsasrec/9dd47e83b949a84f68a4616c6f779df7f17ebb26/tests/datasets/__init__.py -------------------------------------------------------------------------------- /tests/datasets/booking_dataset_reference_actions.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "user_id": "1000027_1", 4 | "item_id": "8183", 5 | "data": { 6 | "user_id": "1000027", 7 | "device_class": "desktop", 8 | "affiliate_id": "7168", 9 | "hotel_country": "Gondal", 10 | "booker_country": "Elbonia", 11 | "checkin_date": 1471046400.0, 12 | "checkout_date": 1471132800.0, 13 | "is_control": false 14 | }, 15 | "timestamp": 1471046400.0 16 | }, 17 | { 18 | "user_id": "1000027_1", 19 | "item_id": "15626", 20 | "data": { 21 | "user_id": "1000027", 22 | "device_class": "desktop", 23 | "affiliate_id": "7168", 24 | "hotel_country": "Gondal", 25 | "booker_country": "Elbonia", 26 | "checkin_date": 1471132800.0, 27 | "checkout_date": 1471305600.0, 28 | "is_control": false 29 | }, 30 | "timestamp": 1471132800.0 31 | }, 32 | { 33 | "user_id": "1000066_2", 34 | "item_id": "56430", 35 | "data": { 36 | "user_id": "1000066", 37 | "device_class": "desktop", 38 | "affiliate_id": "9924", 39 | "hotel_country": "Urkesh", 40 | "booker_country": "Gondal", 41 | "checkin_date": 1469059200.0, 42 | "checkout_date": 1469232000.0, 43 | "is_control": true 44 | }, 45 | "timestamp": 1469059200.0 46 | }, 47 | { 48 | "user_id": "1000066_2", 49 | "item_id": "41971", 50 | "data": { 51 | "user_id": "1000066", 52 | "device_class": "desktop", 53 | "affiliate_id": "9924", 54 | "hotel_country": "Urkesh", 55 | "booker_country": "Gondal", 56 | "checkin_date": 1469232000.0, 57 | "checkout_date": 1469404800.0, 58 | "is_control": true 59 | }, 60 | "timestamp": 1469232000.0 61 | } 62 | ] 63 | -------------------------------------------------------------------------------- /tests/datasets/mts_kion_reference_actions.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "user_id": "176549", 4 | "item_id": "9506", 5 | "data": {}, 6 | "timestamp": 1620691200 7 | }, 8 | { 9 | "user_id": "699317", 10 | "item_id": "1659", 11 | "data": {}, 12 | "timestamp": 1622246400 13 | }, 14 | { 15 | "user_id": "656683", 16 | "item_id": "7107", 17 | "data": {}, 18 | "timestamp": 1620518400 19 | }, 20 | { 21 | "user_id": "864613", 22 | "item_id": "7638", 23 | "data": {}, 24 | "timestamp": 1625443200 25 | }, 26 | { 27 | "user_id": "964868", 28 | "item_id": "9506", 29 | "data": {}, 30 | "timestamp": 1619740800 31 | }, 32 | { 33 | "user_id": "1032142", 34 | "item_id": "6686", 35 | "data": {}, 36 | "timestamp": 1620864000 37 | }, 38 | { 39 | "user_id": "1016458", 40 | "item_id": "354", 41 | "data": {}, 42 | "timestamp": 1628899200 43 | }, 44 | { 45 | "user_id": "884009", 46 | "item_id": "693", 47 | "data": {}, 48 | "timestamp": 1628035200 49 | }, 50 | { 51 | "user_id": "648682", 52 | "item_id": "1449", 53 | "data": {}, 54 | "timestamp": 1623542400 55 | }, 56 | { 57 | "user_id": "203219", 58 | "item_id": "13582", 59 | "data": {}, 60 | "timestamp": 1629590400 61 | } 62 | ] -------------------------------------------------------------------------------- /tests/datasets/test_beauty_dataset.py: -------------------------------------------------------------------------------- 1 | 2 | from unittest import TestCase 3 | import unittest 4 | 5 | 6 | class TestBeautyDataset(TestCase): 7 | def 
test_beauty_dataset(self): 8 | from aprec.datasets.dataset_stats import dataset_stats 9 | from aprec.datasets.beauty import get_beauty_dataset 10 | from aprec.datasets.dataset_utils import filter_cold_users 11 | 12 | dataset = filter_cold_users(get_beauty_dataset(), 5) 13 | result = dataset_stats(dataset, metrics=['num_users', 'num_items', 'num_interactions']) 14 | print(result) 15 | 16 | if __name__ == "__main__": 17 | unittest.main() -------------------------------------------------------------------------------- /tests/datasets/test_bert4rec_datasets.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | class TestBert4recDatasets(unittest.TestCase): 4 | def test_bert4rec_dataset(self): 5 | import json 6 | from aprec.datasets.dataset_stats import dataset_stats 7 | from aprec.datasets.bert4rec_datasets import get_bert4rec_dataset 8 | 9 | for dataset_name in ["beauty", "steam", "ml-1m"]: 10 | print(f"analyzing dataset {dataset_name}") 11 | dataset = get_bert4rec_dataset(dataset_name) 12 | stats = dataset_stats(dataset, metrics=['num_users', 'num_items', 'num_interactions']) 13 | print(json.dumps(stats, indent=4)) 14 | 15 | if __name__ == "__main__": 16 | unittest.main() -------------------------------------------------------------------------------- /tests/datasets/test_booking_dataset.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from unittest import TestCase 3 | 4 | class TestBookingDataset(TestCase): 5 | def test_booking_download(self): 6 | from aprec.datasets.booking import download_booking_train, download_booking_test 7 | from aprec.utils.os_utils import file_md5 8 | 9 | #download train file 10 | result_file = download_booking_train() 11 | booking_file_md5 = file_md5(result_file) 12 | self.assertEqual(booking_file_md5, "4f343b12d76b28ec0f1899e4083a72a8") 13 | 14 | #download test file 15 | result_file = download_booking_test() 16 | booking_file_md5 = file_md5(result_file) 17 | self.assertEqual(booking_file_md5, "2d068bea795cc4b798422ad1d80bd0c4") 18 | 19 | def test_booking_dataset(self): 20 | import os.path 21 | 22 | from aprec.datasets.booking import get_booking_dataset 23 | import json 24 | 25 | local_path = os.path.abspath(os.path.dirname(__file__)) 26 | with open(os.path.join(local_path, "booking_dataset_reference_actions.json")) as input: 27 | reference_actions = json.load(input) 28 | actions_dataset = get_booking_dataset(max_actions_per_file=2, unix_timestamps=True)[0] 29 | actions = [json.loads(action.to_json()) for action in actions_dataset] 30 | self.assertEqual(actions, reference_actions) 31 | 32 | 33 | 34 | if __name__ == "__main__": 35 | unittest.main() 36 | 37 | -------------------------------------------------------------------------------- /tests/datasets/test_datasets_register.py: -------------------------------------------------------------------------------- 1 | from aprec.datasets.datasets_register import DatasetsRegister 2 | import unittest 3 | 4 | class TestDatasetsRegister(unittest.TestCase): 5 | def test_register(self): 6 | register = DatasetsRegister() 7 | dataset = register["ml-100k"]() 8 | self.assertEqual(len(dataset), 100000) 9 | 10 | if __name__ == "__main__": 11 | unittest.main() -------------------------------------------------------------------------------- /tests/datasets/test_filter_cold_users.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | class
TestFilterColdUsers(unittest.TestCase): 4 | def test_filter_cold_users(self): 5 | from aprec.api.action import Action 6 | from aprec.datasets.dataset_utils import filter_cold_users 7 | actions = [Action(item_id=1, user_id=1, timestamp=1), 8 | Action(item_id=2, user_id=1, timestamp=2), 9 | Action(item_id=1, user_id=2, timestamp=1)] 10 | result = list(filter_cold_users(actions, 2)) 11 | self.assertEqual(str(result), "[Action(uid=1, item=1, ts=1), Action(uid=1, item=2, ts=2)]") 12 | 13 | if __name__ == "__main__": 14 | unittest.main() -------------------------------------------------------------------------------- /tests/datasets/test_gowalla_dataset.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | reference_actions = [{'user_id': '0', 'item_id': '22847', 'data': {}, 'timestamp': 1287532527.0}, 4 | {'user_id': '0', 'item_id': '420315', 'data': {}, 'timestamp': 1287440263.0}, 5 | {'user_id': '0', 'item_id': '316637', 'data': {}, 'timestamp': 1287358923.0}, 6 | {'user_id': '0', 'item_id': '16516', 'data': {}, 'timestamp': 1287343565.0}, 7 | {'user_id': '0', 'item_id': '5535878', 'data': {}, 'timestamp': 1287255042.0}] 8 | 9 | class TestGowallaDataset(unittest.TestCase): 10 | def test_gowalla_dataset(self): 11 | import json 12 | from aprec.datasets.gowalla import get_gowalla_dataset 13 | actions = [json.loads(action.to_json()) for action in get_gowalla_dataset(5)] 14 | self.assertEqual(actions, reference_actions) -------------------------------------------------------------------------------- /tests/datasets/test_mts_kion_dataset.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | class TestMtsKionDataset(unittest.TestCase): 4 | def test_get_mts_kion(self): 5 | import os 6 | import json 7 | from aprec.datasets.mts_kion import get_mts_kion_dataset 8 | 9 | local_path = os.path.abspath(os.path.dirname(__file__)) 10 | with open(os.path.join(local_path, "mts_kion_reference_actions.json")) as reference_file: 11 | reference_data = json.load(reference_file) 12 | data = [json.loads(action.to_json()) for action in get_mts_kion_dataset(max_actions=10)] 13 | self.assertEqual(reference_data, data) 14 | 15 | 16 | def test_get_submission_user_ids(self): 17 | from aprec.datasets.mts_kion import get_submission_user_ids 18 | submission_users = get_submission_user_ids() 19 | self.assertEqual(submission_users[:10], 20 | ['3', '11', '29', '30', '33', '39', '46', '47', '51', '61']) 21 | 22 | def test_get_users(self): 23 | from aprec.datasets.mts_kion import get_users 24 | 25 | users = get_users() 26 | pass 27 | 28 | def test_get_items(self): 29 | from aprec.datasets.mts_kion import get_items 30 | 31 | items = get_items() 32 | self.assertEqual(items[0].cat_features[:2], [('content_type', 'film'), ('age_rating', '16.0')]) 33 | self.assertEqual(len(items), 15963) 34 | 35 | 36 | 37 | 38 | if __name__ == '__main__': 39 | unittest.main() 40 | -------------------------------------------------------------------------------- /tests/datasets/test_netflix.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | import unittest 3 | from aprec.datasets.datasets_register import DatasetsRegister 4 | 5 | 6 | class TestNetflixDataset(TestCase): 7 | def test_netflix(self): 8 | dataset = DatasetsRegister()["netflix_fraction_0.001"]() 9 | self.assertEqual(len(dataset), 97030) 10 | 11 | if __name__ == "__main__": 12 | unittest.main() 13 |
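The dataset tests above share one regression pattern: load the dataset, serialize each Action via to_json(), and compare the parsed dicts against a committed reference JSON. Below is a minimal sketch of that round trip; it is not repository code and assumes only the Action constructor and the four serialized fields (user_id, item_id, data, timestamp) visible in the reference files above.

# Hypothetical round-trip sketch (not repository code).
import json
import unittest

from aprec.api.action import Action

class TestActionJsonRoundTrip(unittest.TestCase):
    def test_round_trip(self):
        action = Action(user_id='42', item_id='7', timestamp=1000, data={'source': 'test'})
        parsed = json.loads(action.to_json())
        self.assertEqual(parsed['user_id'], '42')
        self.assertEqual(parsed['item_id'], '7')
        self.assertEqual(parsed['timestamp'], 1000)
        self.assertEqual(parsed['data'], {'source': 'test'})

if __name__ == "__main__":
    unittest.main()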
-------------------------------------------------------------------------------- /tests/datasets/test_yelp_dataset.py: -------------------------------------------------------------------------------- 1 | import json 2 | import unittest 3 | 4 | reference_actions = [{'user_id': 430450, 'item_id': 91854, 'data': {}, 'timestamp': 1108524202}, 5 | {'user_id': 430450, 'item_id': 137692, 'data': {}, 'timestamp': 1108524579}, 6 | {'user_id': 430450, 'item_id': 105383, 'data': {}, 'timestamp': 1108526786}, 7 | {'user_id': 6662, 'item_id': 30082, 'data': {}, 'timestamp': 1109696237}, 8 | {'user_id': 6662, 'item_id': 105487, 'data': {}, 'timestamp': 1109696377}, 9 | {'user_id': 6662, 'item_id': 75311, 'data': {}, 'timestamp': 1109697913}, 10 | {'user_id': 6662, 'item_id': 124, 'data': {}, 'timestamp': 1109699235}, 11 | {'user_id': 6662, 'item_id': 15422, 'data': {}, 'timestamp': 1109699966}, 12 | {'user_id': 6662, 'item_id': 76445, 'data': {}, 'timestamp': 1109705615}, 13 | {'user_id': 6662, 'item_id': 76263, 'data': {}, 'timestamp': 1109705744}] 14 | 15 | class TestYelpDataset(unittest.TestCase): 16 | def test_yelp_dataset(self): 17 | from aprec.datasets.yelp import get_yelp_dataset 18 | dataset = [json.loads(action.to_json()) for action in get_yelp_dataset(max_actions=10)] 19 | self.assertEqual(reference_actions, dataset) 20 | 21 | if __name__ == "__main__": 22 | unittest.main() -------------------------------------------------------------------------------- /tests/generate_actions.py: -------------------------------------------------------------------------------- 1 | def generate_actions(n): 2 | from math import sin, cos 3 | from aprec.api.action import Action 4 | max_users = n / 3 5 | max_timestamp = n / 2 6 | result = [] 7 | actions_set = set() 8 | i = 0 9 | while len(result) < n: 10 | user_id = int((sin(i) + 1)/2 * max_users) 11 | item_id = int((cos(i) + 1)/2 * max_users) 12 | timestamp = int((sin(i) ** 2) * max_timestamp) 13 | if ((user_id, item_id) not in actions_set): 14 | actions_set.add((user_id, item_id)) 15 | result.append(Action(user_id, item_id, timestamp)) 16 | i += 1 17 | return result 18 | -------------------------------------------------------------------------------- /tests/lossess/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asash/gsasrec/9dd47e83b949a84f68a4616c6f779df7f17ebb26/tests/lossess/__init__.py -------------------------------------------------------------------------------- /tests/lossess/test_bce_loss.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | class TestBCELoss(unittest.TestCase): 4 | def test_bce_loss(self): 5 | from tensorflow.keras.losses import BinaryCrossentropy 6 | import tensorflow as tf 7 | from aprec.tests.lossess.bce_bad_sample import y_true as bad_y_true 8 | from aprec.tests.lossess.bce_bad_sample import y_pred as bad_y_pred 9 | from aprec.losses.bce import BCELoss 10 | loss = float(BCELoss()(bad_y_true, bad_y_pred)) 11 | print(loss) 12 | 13 | y_true = tf.constant([-1., -1, -1, -1]) 14 | y_pred = [-50.0, -50, -50, -50] 15 | loss = float(BCELoss()(y_true, y_pred)) 16 | self.assertAlmostEqual(loss, 0.0) 17 | 18 | y_true = tf.constant([1, 0, 1, 0]) 19 | y_pred = [0.1, 0.2, 0.3, 0.4] 20 | loss = float(BCELoss()(y_true, y_pred)) 21 | keras_loss = float(BinaryCrossentropy(from_logits=True)(y_true, y_pred)) 22 | self.assertAlmostEqual(loss, keras_loss, 5) 23 | 24 | y_true = tf.constant([1., 0, 1, 0]) 25 | y_pred = 
[-50.0, -50, -50, -50] 26 | loss = float(BCELoss()(y_true, y_pred)) 27 | keras_loss = float(BinaryCrossentropy(from_logits=True)(y_true, y_pred)) 28 | self.assertAlmostEqual(loss, 18.420679092407227) 29 | pass 30 | 31 | 32 | if __name__ == "__main__": 33 | unittest.main() 34 | 35 | -------------------------------------------------------------------------------- /tests/lossess/test_climf_loss.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | 4 | 5 | class TestCLIMFLoss(unittest.TestCase): 6 | def test_climf_loss(self): 7 | from aprec.losses.climf import CLIMFLoss 8 | import tensorflow.keras.backend as K 9 | climf_loss = CLIMFLoss(4, 2, 3) 10 | val = climf_loss(K.constant([[0, 0, 1, 1], 11 | [0, 0, 1, 1]]), 12 | K.constant([[0.1, 0.3, 1, 0], [0, 0, 1, 1]])) 13 | self.assertAlmostEqual(float(val), 7.418338775634766, places=4) 14 | climf_loss = CLIMFLoss(4, 1, 3) 15 | poor_pred_loss = climf_loss(K.constant([[0, 0, 1, 1]]), K.constant([[1, 0.5, 0, 0]])) 16 | avg_pred_loss = climf_loss(K.constant([[0, 0, 1, 1]]), K.constant([[0.1, 0.3, 1, 0]])) 17 | good_pred_loss = climf_loss(K.constant([[0, 0, 1, 1]]), K.constant([[0, 0, 1, 1]])) 18 | assert (poor_pred_loss > avg_pred_loss) 19 | assert (good_pred_loss < avg_pred_loss) 20 | 21 | if __name__ == "__main__": 22 | unittest.main() 23 | -------------------------------------------------------------------------------- /tests/lossess/test_items_masking_proxy_loss.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 3 | 4 | class TestItemsMaskingProxyLoss(unittest.TestCase): 5 | def test_items_masking_proxy_loss(self): 6 | from aprec.losses.bpr import BPRLoss 7 | from aprec.losses.bce import BCELoss 8 | from aprec.losses.items_masking_loss_proxy import ItemsMaksingLossProxy 9 | proxy_loss = ItemsMaksingLossProxy(BCELoss(), 2, 4) 10 | proxy_loss.set_batch_size(2) 11 | proxy_loss.set_num_items(10) 12 | 13 | ytrue = np.array([ 14 | [ 15 | [-100, -100, -100], 16 | [1, 0, 0], 17 | [-100, -100, -100], 18 | [1, 0, 0] 19 | ], 20 | [ 21 | [1, 0, 0], 22 | [-100, -100, -100], 23 | [-100, -100, -100], 24 | [-100, -100, -100] 25 | 26 | ] 27 | ]) 28 | np.random.seed(31337) 29 | ypred = np.random.rand(2, 4, 3) 30 | result = proxy_loss(ytrue, ypred) 31 | print(result) 32 | 33 | 34 | if __name__ == "__main__": 35 | unittest.main() -------------------------------------------------------------------------------- /tests/lossess/test_lambdarank_time.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | class TestLambdaranTime(unittest.TestCase): 4 | def test_get_lambdas(self): 5 | import random 6 | import numpy as np 7 | from aprec.losses.lambda_gamma_rank import LambdaGammaRankLoss 8 | import tensorflow as tf 9 | from tqdm import tqdm 10 | 11 | 12 | random.seed(31337) 13 | np.random.seed(31337) 14 | batch_size = 128 15 | dataset_size = 128 * 1024 16 | positives_per_sample = 100 17 | n_items = 50000 18 | pred_truncate_at = 500 19 | 20 | y_true = np.zeros((batch_size, n_items)) 21 | for sample_num in range(batch_size): 22 | positives = np.random.choice((range(n_items)), positives_per_sample, replace=False) 23 | for positive in positives: 24 | y_true[sample_num][positive] = random.random() 25 | y_true = tf.constant(y_true) 26 | 27 | loss = LambdaGammaRankLoss(n_items, batch_size, 1, ndcg_at=40, dtype=tf.float32, 28 | pred_truncate_at=pred_truncate_at, bce_grad_weight=0.1) 29 | for i 
in tqdm(range(dataset_size // batch_size)): 30 | y_pred = tf.random.uniform((batch_size, n_items)) 31 | #tf.keras.losses.binary_crossentropy(y_true, y_pred) 32 | loss.get_lambdas(y_true, y_pred) 33 | 34 | 35 | if __name__ == "__main__": 36 | unittest.main() 37 | -------------------------------------------------------------------------------- /tests/lossess/test_logit_norm.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from aprec.losses.logit_norm import LogitNormLoss 3 | import os 4 | import tensorflow as tf 5 | 6 | class LogitNormsTest(unittest.TestCase): 7 | def setUp(self) -> None: 8 | os.environ['CUDA_VISIBLE_DEVICES'] = '-1' 9 | 10 | def test_logitnorm_loss(self): 11 | loss = LogitNormLoss() 12 | y_true = tf.constant([[0, 1, 0, 0], [1, 0, 0, 0.]]) 13 | y_pred = tf.constant([[0.2, -1, 5, 7], [2, 1, 1, 1.]]) 14 | expected = [1.8970, 1.1170] 15 | result = loss.calc_per_list(y_true, y_pred).numpy() 16 | self.assertAlmostEqual(expected[0], result[0], places=3) 17 | self.assertAlmostEqual(expected[1], result[1], places=3) 18 | 19 | loss = LogitNormLoss(2) 20 | result = loss.calc_per_list(y_true, y_pred).numpy() 21 | self.assertAlmostEqual(result[1], 1.2480, 4) 22 | 23 | if __name__ == "__main__": 24 | unittest.main() -------------------------------------------------------------------------------- /tests/lossess/test_softmax_crossentropy.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from aprec.losses.softmax_crossentropy import SoftmaxCrossEntropy 3 | from transformers.modeling_tf_utils import TFCausalLanguageModelingLoss 4 | import os 5 | import tensorflow as tf 6 | 7 | class SoftmaxCrossentropyLossTest(unittest.TestCase): 8 | def setUp(self) -> None: 9 | os.environ['CUDA_VISIBLE_DEVICES'] = '-1' 10 | 11 | def test_nll_loss0(self): 12 | y_true_sparse = tf.constant([[1]]) 13 | y_true = [0, 1, 0, 0] 14 | y_pred = tf.constant([[0.1, 0.2, 0.3, 0.4]]) 15 | class HFTLossConfig(object): 16 | tf_legacy_loss = False 17 | 18 | hft_transformers_loss = TFCausalLanguageModelingLoss() 19 | hft_transformers_loss.config = HFTLossConfig() 20 | hft_loss = hft_transformers_loss.hf_compute_loss(y_true_sparse, y_pred).numpy()[0] 21 | our_loss = SoftmaxCrossEntropy().calc_per_list(y_true, y_pred).numpy()[0] 22 | self.assertEqual(hft_loss, our_loss) 23 | 24 | y_true_sparse = tf.constant([[1, -100, 2]]) 25 | y_true = tf.constant([[0, 1, 0, 0], [-100, -100, -100, -100], [0, 0, 1, 0]]) 26 | y_pred = tf.constant([[0.1, 0.2, 0.3, 0.4], [0.0, 0.0, 0.0, 1.0], [0.8, 0.0, 0.2, 0.0]]) 27 | hft_loss = hft_transformers_loss.hf_compute_loss(y_true_sparse, y_pred) 28 | our_loss = SoftmaxCrossEntropy().loss_per_list(y_true, y_pred) 29 | self.assertEqual(our_loss, hft_loss) 30 | 31 | 32 | if __name__ == "__main__": 33 | unittest.main() -------------------------------------------------------------------------------- /tests/lossess/test_xendcg_loss.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from aprec.losses.xendcg import XENDCGLoss 3 | 4 | 5 | class TestXENDCGLoss(unittest.TestCase): 6 | def test_xendcg(self): 7 | import tensorflow as tf 8 | true = tf.constant([[0., 1., 0.]]) 9 | pred = tf.constant([[0., 0.5, 0]]) 10 | xendcg = XENDCGLoss(true.shape[1], true.shape[0]) 11 | result = xendcg(true, pred) 12 | print(result) 13 | 14 | def test_model_xendcg(self): 15 | import tensorflow as tf 16 | from tensorflow.keras.models import Sequential 17 |
from tensorflow.keras.layers import Dense 18 | X = tf.constant([[0., 0], [1, 0]]) 19 | Y = tf.constant([[1., 0], [0, 1]]) 20 | model = Sequential() 21 | model.add(Dense(2, activation='sigmoid')) 22 | model.add(Dense(2, activation='sigmoid')) 23 | model.add(Dense(2, activation='sigmoid')) 24 | model.add(Dense(2, activation='linear')) 25 | xendcg = XENDCGLoss(X.shape[1], X.shape[0]) 26 | model.compile(optimizer='adam', loss=xendcg) 27 | 28 | model.fit(X, Y, epochs=2000, verbose=False) 29 | result = model.predict(X) 30 | tf.print(result) 31 | assert (result[0, 0] > result[0, 1]) 32 | assert (result[1, 0] < result[1, 1]) 33 | 34 | if __name__ == "__main__": 35 | unittest.main() 36 | -------------------------------------------------------------------------------- /tests/metrics/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asash/gsasrec/9dd47e83b949a84f68a4616c6f779df7f17ebb26/tests/metrics/__init__.py -------------------------------------------------------------------------------- /tests/metrics/test_map.py: -------------------------------------------------------------------------------- 1 | 2 | import unittest 3 | 4 | class TestMAP(unittest.TestCase): 5 | def test_map(self): 6 | from aprec.evaluation.metrics.map import MAP 7 | from aprec.api.action import Action 8 | recommended = [(6, 0.9), (3, 0.85), (5, 0.71), (0, 0.63), (4, 0.47), (2, 0.36), (1, 0.24), (7, 0.16)] 9 | actual = [Action(user_id = 1, item_id = 6, timestamp=1), 10 | Action(user_id = 1, item_id = 5, timestamp=2), 11 | Action(user_id = 1, item_id = 0, timestamp=3), 12 | Action(user_id = 1, item_id = 2, timestamp=4), 13 | ] 14 | map_metric = MAP(8) 15 | self.assertEqual(map_metric(recommended, actual), 0.7708333333333333) 16 | 17 | if __name__ == "__main__": 18 | unittest.main() 19 | -------------------------------------------------------------------------------- /tests/metrics/test_mrr.py: -------------------------------------------------------------------------------- 1 | 2 | import unittest 3 | 4 | class TestMRR(unittest.TestCase): 5 | def test_mrr(self): 6 | from aprec.evaluation.metrics.mrr import MRR 7 | from aprec.api.action import Action 8 | recommended = [(1, 2), (2, 1), (3, 0.5)] 9 | actual = [Action(user_id = 1, item_id = 4, timestamp=1), 10 | Action(user_id = 1, item_id = 3, timestamp=2)] 11 | mrr = MRR() 12 | self.assertEqual(mrr(recommended, actual), 1/3) 13 | 14 | if __name__ == "__main__": 15 | unittest.main() 16 | -------------------------------------------------------------------------------- /tests/metrics/test_ndcg.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | class TestNDCG(unittest.TestCase): 4 | def test_ndcg(self): 5 | from aprec.evaluation.metrics.ndcg import NDCG 6 | from aprec.api.action import Action 7 | recommended = [(1, 2), (2, 1), (3, 0.5)] 8 | actual = [Action(user_id = 1, item_id = 4, timestamp=1), 9 | Action(user_id = 1, item_id = 3, timestamp=2)] 10 | ndcg = NDCG(3) 11 | self.assertEqual(ndcg(recommended, actual), 0.5) 12 | 13 | if __name__ == "__main__": 14 | unittest.main() 15 | -------------------------------------------------------------------------------- /tests/metrics/test_pairwise_cos_sim.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | class TestPairwiseCosSim(unittest.TestCase): 4 | def test_pairwise_cos_sim(self): 5 | from aprec.evaluation.metrics.pairwise_cos_sim import PairwiseCosSim 6 |
from aprec.api.action import Action 7 | actions = [Action(user_id=1, item_id=1, timestamp=1), 8 | Action(user_id=1, item_id=3, timestamp=2), 9 | 10 | Action(user_id=2, item_id=1, timestamp=2), 11 | Action(user_id=2, item_id=2, timestamp=2), 12 | Action(user_id=2, item_id=3, timestamp=2)] 13 | 14 | pairwise_cos_sim = PairwiseCosSim(actions, 10) 15 | 16 | recommended = [(1, 2), (2, 1), (3, 0.5)] 17 | actual = [Action(user_id = 1, item_id = 1, timestamp=1), 18 | Action(user_id = 1, item_id = 2, timestamp=2)] 19 | self.assertEqual(pairwise_cos_sim(recommended, actual), 2/3) 20 | 21 | if __name__ == "__main__": 22 | unittest.main() 23 | -------------------------------------------------------------------------------- /tests/metrics/test_precision.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | class TestPrecision(unittest.TestCase): 4 | def test_precision(self): 5 | from aprec.evaluation.metrics.precision import Precision 6 | from aprec.api.action import Action 7 | 8 | recommended = [(1, 2), (2, 1), (3, 0.5)] 9 | actual = [Action(user_id = 1, item_id = 1, timestamp=1), 10 | Action(user_id = 1, item_id = 3, timestamp=2)] 11 | precision_1 = Precision(1) 12 | precision_2 = Precision(2) 13 | precision_3 = Precision(3) 14 | self.assertEqual(precision_1(recommended, actual), 1) 15 | self.assertEqual(precision_2(recommended, actual), 0.5) 16 | self.assertEqual(precision_3(recommended, actual), 2/3) 17 | 18 | if __name__ == "__main__": 19 | unittest.main() 20 | -------------------------------------------------------------------------------- /tests/metrics/test_proxy_metric.py: -------------------------------------------------------------------------------- 1 | import random 2 | import unittest 3 | 4 | import numpy as np 5 | 6 | from aprec.api.action import Action 7 | from aprec.evaluation.metrics.sampled_proxy_metric import SampledProxy 8 | from aprec.evaluation.metrics.precision import Precision 9 | 10 | 11 | class TestSampledProxyMetric(unittest.TestCase): 12 | def test_proxy_precision(self): 13 | recommended = [(1, 2), (2, 1), (3, 0.5)] 14 | actual = [Action(user_id = 1, item_id = 1, timestamp=1), 15 | Action(user_id = 1, item_id = 3, timestamp=2)] 16 | all_item_ids = [1, 2, 3, 4, 5, 6] 17 | random.seed(31337) 18 | np.random.seed(31337) 19 | metric = SampledProxy(all_item_ids, [1./6] * 6, 2, Precision(3)) 20 | self.assertAlmostEqual(metric(recommended, actual), 2./3) 21 | 22 | if __name__ == "__main__": 23 | unittest.main() 24 | 25 | -------------------------------------------------------------------------------- /tests/misc/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asash/gsasrec/9dd47e83b949a84f68a4616c6f779df7f17ebb26/tests/misc/__init__.py -------------------------------------------------------------------------------- /tests/misc/test_evaluate_recommender.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | class TestEvaluateRecommender(unittest.TestCase): 4 | def test_evaluate(self): 5 | import tempfile 6 | from aprec.datasets.movielens20m import get_movielens20m_actions 7 | from aprec.recommenders.filter_seen_recommender import FilterSeenRecommender 8 | from aprec.recommenders.top_recommender import TopRecommender 9 | from aprec.utils.generator_limit import generator_limit 10 | from aprec.evaluation.split_actions import TemporalGlobal 11 | from aprec.evaluation.n_actions_for_user import
n_actions_for_user 12 | from aprec.evaluation.evaluate_recommender import evaluate_recommender 13 | from aprec.evaluation.metrics.precision import Precision 14 | from aprec.evaluation.metrics.recall import Recall 15 | 16 | recommender = FilterSeenRecommender(TopRecommender()) 17 | actions = generator_limit(get_movielens20m_actions(), 10000) 18 | split_actions = TemporalGlobal((70, 30)) 19 | train, test = split_actions(actions) 20 | test = n_actions_for_user(test, 1) 21 | for action in train: 22 | recommender.add_action(action) 23 | recommender.rebuild_model() 24 | metrics = [Precision(1), Recall(1), Precision(5), Recall(5), Precision(10), Recall(10)] 25 | output_dir = tempfile.mkdtemp() 26 | result = evaluate_recommender(recommender, test, metrics, output_dir, "top_recommender") 27 | reference_result = {'precision@1': 0.0, 'recall@1': 0.0, 28 | 'precision@5': 0.00425531914893617, 'recall@5': 0.02127659574468085, 29 | 'precision@10': 0.002127659574468085, 'recall@10': 0.02127659574468085} 30 | self.assertEqual(reference_result, result) 31 | 32 | 33 | if __name__ == "__main__": 34 | unittest.main() 35 | 36 | 37 | -------------------------------------------------------------------------------- /tests/misc/test_item_id.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import tempfile 3 | class TestItemId(unittest.TestCase): 4 | def test_get_id(self): 5 | from aprec.utils.item_id import ItemId 6 | items_dict = ItemId() 7 | self.assertEqual(items_dict.get_id("aaa"), 0) 8 | self.assertEqual(items_dict.get_id("bbb"), 1) 9 | self.assertEqual(items_dict.get_id("ccc"), 2) 10 | self.assertEqual(items_dict.get_id("ddd"), 3) 11 | self.assertEqual(items_dict.get_id("aaa"), 0) 12 | self.assertEqual(items_dict.get_id("ccc"), 2) 13 | self.assertEqual(items_dict.reverse_id(2), "ccc") 14 | self.assertTrue(items_dict.has_id(2)) 15 | self.assertFalse(items_dict.has_id(4)) 16 | self.assertTrue(items_dict.has_item("aaa")) 17 | self.assertFalse(items_dict.has_item("fff")) 18 | self.assertRaises(KeyError, items_dict.reverse_id, 4) 19 | with tempfile.NamedTemporaryFile(suffix="_dict.txt") as tmp: 20 | items_dict.save(tmp.name) 21 | new_dict = ItemId.load(tmp.name) 22 | 23 | self.assertEqual(items_dict.straight, new_dict.straight) 24 | self.assertEqual(items_dict.reverse, new_dict.reverse) 25 | self.assertEqual(new_dict.get_id("aaa"), 0) 26 | self.assertEqual(new_dict.get_id("bbb"), 1) 27 | self.assertEqual(new_dict.get_id("ccc"), 2) 28 | self.assertEqual(new_dict.get_id("ddd"), 3) 29 | self.assertEqual(new_dict.get_id("aaa"), 0) 30 | self.assertEqual(new_dict.get_id("ccc"), 2) 31 | 32 | 33 | 34 | if __name__ == "__main__": 35 | unittest.main() 36 | -------------------------------------------------------------------------------- /tests/misc/test_keras_ndcg.py: -------------------------------------------------------------------------------- 1 | from aprec.recommenders.metrics.ndcg import KerasNDCG 2 | import tensorflow.keras.backend as K 3 | import tensorflow as tf 4 | 5 | import numpy as np 6 | import unittest 7 | 8 | class TestKerasNDCG(unittest.TestCase): 9 | def setUp(cls): 10 | tf.keras.backend.clear_session() 11 | 12 | def tearDown(cls): 13 | tf.keras.backend.clear_session() 14 | 15 | 16 | def test_keras_ndcg(self): 17 | EPS=1e-5 18 | y_true = K.constant(np.array([[0, 1, 0], [1, 1, 0]])) 19 | y_pred = K.constant(np.array([[0.1, 0.2, 0.3], [0.6, 0.5, 0.4]])) 20 | keras_ndcg = KerasNDCG(2) 21 | res = keras_ndcg(y_true, y_pred) 22 | assert abs(res - 
K.constant(0.815464854)) < EPS 23 | 24 | if __name__ == "__main__": 25 | unittest.main() 26 | -------------------------------------------------------------------------------- /tests/misc/test_kion_challenge_featurizer.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | class TestKionChallengeFeaturizer(unittest.TestCase): 3 | def test_kion_challenge_featurizer(self): 4 | from aprec.recommenders.kion_challenge_featurizer import KionChallengeFeaturizer 5 | from aprec.datasets.mts_kion import get_users 6 | from aprec.datasets.mts_kion import get_items 7 | from aprec.datasets.mts_kion import get_mts_kion_dataset 8 | 9 | 10 | featurizer = KionChallengeFeaturizer() 11 | for user in get_users(): 12 | featurizer.add_user(user) 13 | for item in get_items(): 14 | featurizer.add_item(item) 15 | for action in get_mts_kion_dataset(20000): 16 | featurizer.add_action(action) 17 | featurizer.build() 18 | candidates = ['7638', '6686', '9506'] 19 | features = featurizer.get_features('176549', candidates) 20 | self.assertEqual(len(features), len(candidates)) 21 | for i in range(len(candidates)): 22 | self.assertEqual(len(features[i]), len(featurizer.feature_names)) 23 | pass 24 | 25 | -------------------------------------------------------------------------------- /tests/misc/test_n_actions_for_user.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | REFERENCE_1_ACTION =\ 3 | """Action(uid=0, item=0, ts=2) 4 | Action(uid=1, item=3, ts=0) 5 | Action(uid=2, item=0, ts=0) 6 | Action(uid=3, item=2, ts=3)""" 7 | 8 | 9 | REFERENCE_2_ACTION =\ 10 | """Action(uid=0, item=0, ts=2) 11 | Action(uid=0, item=2, ts=4) 12 | Action(uid=1, item=0, ts=0) 13 | Action(uid=1, item=3, ts=0) 14 | Action(uid=2, item=0, ts=0) 15 | Action(uid=2, item=2, ts=2) 16 | Action(uid=3, item=0, ts=4) 17 | Action(uid=3, item=2, ts=3)""" 18 | 19 | def sorted_actions_str(actions): 20 | return "\n".join(sorted([str(action) for action in actions])) 21 | 22 | class TestNActionsForUser(unittest.TestCase): 23 | def test_n_actions_for_user(self): 24 | from aprec.tests.generate_actions import generate_actions 25 | from aprec.evaluation.n_actions_for_user import n_actions_for_user 26 | 27 | 28 | actions = generate_actions(10) 29 | actions_1 = n_actions_for_user(actions, 1) 30 | actions_2 = n_actions_for_user(actions, 2) 31 | self.assertEqual(sorted_actions_str(actions_1), REFERENCE_1_ACTION) 32 | self.assertEqual(sorted_actions_str(actions_2), REFERENCE_2_ACTION) 33 | 34 | 35 | if __name__ == "__main__": 36 | unittest.main() 37 | -------------------------------------------------------------------------------- /tests/misc/test_recommender_evaluator.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | 4 | class TestRecommenderEvaluator(unittest.TestCase): 5 | def test_recommender_evaluator(self): 6 | from aprec.datasets.movielens20m import get_movielens20m_actions 7 | from aprec.evaluation.samplers.pop_sampler import PopTargetItemsSampler 8 | from aprec.utils.generator_limit import generator_limit 9 | from aprec.evaluation.split_actions import LeaveOneOut 10 | from aprec.evaluation.metrics.precision import Precision 11 | from aprec.recommenders.top_recommender import TopRecommender 12 | from aprec.evaluation.evaluate_recommender import RecommendersEvaluator 13 | import tempfile 14 | 15 | 16 | actions = [action for action in generator_limit(get_movielens20m_actions(), 100000)] 17 |
recommenders= {"top_recommender": TopRecommender} 18 | 19 | data_splitter = LeaveOneOut(max_test_users=128) 20 | metrics = [Precision(5)] 21 | out_dir = tempfile.mkdtemp() 22 | n_val_users=10 23 | recommendations_limit = 10 24 | target_items_sampler = PopTargetItemsSampler(20) 25 | evaluator = RecommendersEvaluator(actions, recommenders, metrics, 26 | out_dir, data_splitter, n_val_users, 27 | recommendations_limit, 28 | target_items_sampler=target_items_sampler) 29 | result = evaluator()['recommenders']['top_recommender'] 30 | 31 | 32 | del(result["model_build_time"]) 33 | del(result["model_inference_time"]) 34 | del(result["minutes_to_converge"]) 35 | del(result["model_metadata"]["tensorboard_dir"]) 36 | 37 | self.assertEqual(result, 38 | {'precision@5': 0.0078125, 'sampled_metrics': {'precision@5': 0.039062500000000014}, 39 | 'model_metadata': {"top 20 items": [("318", 556), ("296", 523), ("356", 501), ("593", 493), 40 | ("260", 425), ("50", 410), ("527", 407), ("2571", 403), 41 | ("110", 372), ("1196", 356), ("457", 355), ("1198", 355), 42 | ("2858", 349), ("589", 341), ("608", 339), ("1210", 338), 43 | ("1", 334), ("858", 334), ("47", 324), ("2959", 321)]}}) 44 | 45 | if __name__ == "__main__": 46 | unittest.main() 47 | -------------------------------------------------------------------------------- /tests/misc/test_split_actions.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from aprec.evaluation.split_actions import TemporalGlobal, RandomSplit 3 | from aprec.datasets.movielens20m import get_movielens20m_actions 4 | from aprec.tests.generate_actions import generate_actions 5 | from aprec.utils.generator_limit import generator_limit 6 | from aprec.evaluation.evaluate_recommender import group_by_user 7 | 8 | 9 | class TestSplitActions(unittest.TestCase): 10 | 11 | def test_split_actions(self): 12 | actions = generate_actions(100) 13 | split_actions = TemporalGlobal((7, 1, 2)) 14 | splitted = split_actions(actions) 15 | self.assertEqual(len(splitted), 3) 16 | self.assertEqual(len(splitted[0]), 70) 17 | self.assertEqual(len(splitted[1]), 10) 18 | self.assertEqual(len(splitted[2]), 20) 19 | assert(times_func(splitted[0], max) <= times_func(splitted[1], min)) 20 | assert(times_func(splitted[1], max) <= times_func(splitted[2], min)) 21 | self.assertEqual(set(actions), set(splitted[0] + splitted[1] + splitted[2])) 22 | 23 | def test_random_split(self): 24 | user_ids = set() 25 | actions = [] 26 | for action in generator_limit(get_movielens20m_actions(), 10000): 27 | actions.append(action) 28 | user_ids.add(action.user_id) 29 | random_split = RandomSplit(0.5, 10) 30 | train, test = random_split(actions) 31 | train_users = group_by_user(train) 32 | test_users = group_by_user(test) 33 | self.assertEqual(len(test_users), 10) 34 | for user in test_users: 35 | self.assertTrue(abs(len(test_users[user]) - len(train_users[user])) <= 1) 36 | test_items = set([action.item_id for action in test_users[user]]) 37 | train_items = set([action.item_id for action in train_users[user]]) 38 | self.assertEqual(len(train_items.intersection(test_items)), 0) 39 | 40 | 41 | 42 | def times_func(actions, func): 43 | return func([action.timestamp for action in actions]) 44 | 45 | if __name__ == "__main__": 46 | unittest.main() 47 | -------------------------------------------------------------------------------- /tests/ml_sequences.py: -------------------------------------------------------------------------------- 1 | def ml_sequences(n_actions): 2 | from 
aprec.utils.generator_limit import generator_limit 3 | from aprec.datasets.movielens20m import get_movielens20m_actions 4 | from aprec.utils.item_id import ItemId 5 | from collections import defaultdict 6 | sequences_dict = defaultdict(list) 7 | actions = [action for action in generator_limit(get_movielens20m_actions(), n_actions)] 8 | actions.sort(key = lambda action: action.timestamp) 9 | item_ids = ItemId() 10 | for action in actions: 11 | sequences_dict[action.user_id].append((action.timestamp, item_ids.get_id(action.item_id))) 12 | sequences = list(sequences_dict.values()) 13 | return sequences, item_ids 14 | 15 | 16 | -------------------------------------------------------------------------------- /tests/recommenders/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asash/gsasrec/9dd47e83b949a84f68a4616c6f779df7f17ebb26/tests/recommenders/__init__.py -------------------------------------------------------------------------------- /tests/recommenders/baselines/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asash/gsasrec/9dd47e83b949a84f68a4616c6f779df7f17ebb26/tests/recommenders/baselines/__init__.py -------------------------------------------------------------------------------- /tests/recommenders/baselines/test_conditional_top_recommender.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | class TestConditionalTopRecommender(unittest.TestCase): 4 | def test_conditional_top_recommender(self): 5 | from typing import List 6 | from aprec.api.action import Action 7 | from aprec.recommenders.conditional_top_recommender import ConditionalTopRecommender 8 | 9 | recommender = ConditionalTopRecommender(conditional_field='country_id') 10 | actions: List[Action] = [ 11 | Action(user_id=0, item_id=0, timestamp=0, data={'country_id': 100}), 12 | Action(user_id=0, item_id=0, timestamp=10, data={'country_id': 100}), 13 | Action(user_id=0, item_id=1, timestamp=20, data={'country_id': 100}), 14 | ] 15 | for action in actions: 16 | recommender.add_action(action) 17 | recommender.rebuild_model() 18 | recommendations = recommender.recommend(0, 1) 19 | self.assertEqual(recommendations, [(0, 2)]) 20 | 21 | if __name__ == "__main__": 22 | unittest.main() 23 | -------------------------------------------------------------------------------- /tests/recommenders/baselines/test_constnat_recommender.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | class TestConstantRecommender(unittest.TestCase): 4 | def test_constant_recommender(self): 5 | from aprec.recommenders.constant_recommender import ConstantRecommender 6 | constant_recommender = ConstantRecommender(((1, 1),(2, 0.5), (3, 0.4))) 7 | self.assertEqual(constant_recommender.recommend(1, 2), ((1, 1), (2, 0.5))) 8 | 9 | if __name__ == "__main__": 10 | unittest.main() 11 | -------------------------------------------------------------------------------- /tests/recommenders/baselines/test_deepmf.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | USER_ID = '120' 4 | 5 | class TestDeepMF(unittest.TestCase): 6 | def test_deepmf_recommender(self): 7 | from aprec.recommenders.deep_mf import DeepMFRecommender 8 | from aprec.recommenders.filter_seen_recommender import FilterSeenRecommender 9 | from aprec.datasets.movielens20m 
import get_movielens20m_actions 10 | from aprec.utils.generator_limit import generator_limit 11 | 12 | deepmf_recommender = DeepMFRecommender(100, 1000, steps=20) 13 | recommender = FilterSeenRecommender(deepmf_recommender) 14 | for action in generator_limit(get_movielens20m_actions(), 10000): 15 | recommender.add_action(action) 16 | recommender.rebuild_model() 17 | recs = recommender.recommend(USER_ID, 10) 18 | print(recs) 19 | 20 | 21 | 22 | if __name__ == "__main__": 23 | unittest.main() 24 | -------------------------------------------------------------------------------- /tests/recommenders/baselines/test_filter_seen_recommender.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import unittest 4 | 5 | class TestFilterSeenRecommender(unittest.TestCase): 6 | def test_constant_recommender(self): 7 | from aprec.recommenders.constant_recommender import ConstantRecommender 8 | from aprec.recommenders.filter_seen_recommender import FilterSeenRecommender 9 | from aprec.api.action import Action 10 | 11 | constant_recommender = ConstantRecommender(((1, 1), (2, 0.5), (3, 0.4))) 12 | recommender = FilterSeenRecommender(constant_recommender) 13 | recommender.add_action(Action(user_id=1, item_id=2, timestamp=1)) 14 | self.assertEqual(recommender.recommend(1, 2), [(1, 1), (3, 0.4)]) 15 | 16 | def test_filter_seen_sampled_rankings(self): 17 | from aprec.datasets.movielens20m import get_movielens20m_actions 18 | from aprec.recommenders.top_recommender import TopRecommender 19 | from aprec.utils.generator_limit import generator_limit 20 | from aprec.api.items_ranking_request import ItemsRankingRequest 21 | from aprec.recommenders.filter_seen_recommender import FilterSeenRecommender 22 | 23 | recommender = FilterSeenRecommender(TopRecommender()) 24 | ranking_request = ItemsRankingRequest(user_id='1', item_ids=['1196', '589']) 25 | recommender.add_test_items_ranking_request(ranking_request) 26 | for action in generator_limit(get_movielens20m_actions(), 1000): 27 | recommender.add_action(action) 28 | recommender.rebuild_model() 29 | recommendations = recommender.get_item_rankings() 30 | self.assertEqual(recommendations, {'1': [('589', 9), ('1196', -float('inf'))]}) 31 | 32 | if __name__ == "__main__": 33 | unittest.main() -------------------------------------------------------------------------------- /tests/recommenders/baselines/test_first_order_mc_recommender.py: -------------------------------------------------------------------------------- 1 | 2 | import unittest 3 | 4 | USER_ID = '120' 5 | 6 | class TestFirstOrderMCRecommender(unittest.TestCase): 7 | def test_first_order_mc_recommender(self): 8 | from aprec.recommenders.filter_seen_recommender import FilterSeenRecommender 9 | from aprec.recommenders.first_order_mc import FirstOrderMarkovChainRecommender 10 | from aprec.datasets.movielens20m import get_movielens20m_actions 11 | from aprec.utils.generator_limit import generator_limit 12 | recommender = FilterSeenRecommender(FirstOrderMarkovChainRecommender()) 13 | for action in generator_limit(get_movielens20m_actions(), 100000): 14 | recommender.add_action(action) 15 | recommender.rebuild_model() 16 | recs = recommender.recommend(USER_ID, 10) 17 | print(recs) 18 | 19 | def test_sampled_rankings(self): 20 | from aprec.api.items_ranking_request import ItemsRankingRequest 21 | from aprec.recommenders.filter_seen_recommender import FilterSeenRecommender 22 | from aprec.recommenders.first_order_mc import FirstOrderMarkovChainRecommender 23 | from
aprec.datasets.movielens20m import get_movielens20m_actions 24 | from aprec.utils.generator_limit import generator_limit 25 | 26 | recommender = FilterSeenRecommender(FirstOrderMarkovChainRecommender()) 27 | for action in generator_limit(get_movielens20m_actions(), 100000): 28 | recommender.add_action(action) 29 | ranking_request = ItemsRankingRequest('120', ['608', '294', '648']) 30 | recommender.add_test_items_ranking_request(ranking_request) 31 | recommender.rebuild_model() 32 | sampled_scores = recommender.get_item_rankings() 33 | self.assertEqual(len(sampled_scores), 1) 34 | predicted_scores = sampled_scores['120'] 35 | unseen_item = '294' 36 | for item, score in predicted_scores: 37 | if item == unseen_item: 38 | self.assertEqual(score, 0) 39 | else: 40 | self.assertGreater(score, 0) 41 | 42 | 43 | 44 | if __name__ == "__main__": 45 | unittest.main() 46 | -------------------------------------------------------------------------------- /tests/recommenders/baselines/test_item_item_recommender.py: -------------------------------------------------------------------------------- 1 | 2 | import unittest 3 | 4 | USER_ID = '120' 5 | 6 | REFERENCE_COLD_START = [('296', 62), ('318', 62), ('356', 60), 7 | ('593', 48), ('260', 44), ('50', 43), ('527', 43), ('608', 42), ('47', 41), ('480', 40)] 8 | 9 | REFERENCE_USER_RECOMMENDATIONS = [('276', 0.5), ('450', 0.5), ('296', 0.48612153038259565), ('292', 0.47265625), 10 | ('361', 0.4444444444444444), ('225', 0.4375), 11 | ('593', 0.436046511627907), ('474', 0.4166666666666667), 12 | ('1089', 0.38813151563753007), ('588', 0.3820662768031189)] 13 | 14 | class TestItemItemRecommender(unittest.TestCase): 15 | def compare_recommendations(self, rec1, rec2): 16 | print(rec1, rec2) 17 | self.assertEqual(len(rec1), len(rec2)) 18 | for i in range(len(rec1)): 19 | self.assertEqual(rec1[i][0], rec2[i][0]) 20 | self.assertAlmostEqual(rec1[i][1], rec2[i][1]) 21 | 22 | def test_item_item_recommender(self): 23 | from aprec.recommenders.item_item import ItemItemRecommender 24 | from aprec.recommenders.filter_seen_recommender import FilterSeenRecommender 25 | from aprec.datasets.movielens20m import get_movielens20m_actions, get_movies_catalog 26 | from aprec.utils.generator_limit import generator_limit 27 | from aprec.api.action import Action 28 | item_item_recommender = ItemItemRecommender() 29 | recommender = FilterSeenRecommender(item_item_recommender) 30 | catalog = get_movies_catalog() 31 | 32 | for action in generator_limit(get_movielens20m_actions(), 10000): 33 | recommender.add_action(action) 34 | recommender.rebuild_model() 35 | recs_cold_start = recommender.recommend(12341324, 10) 36 | self.compare_recommendations(recs_cold_start, REFERENCE_COLD_START) 37 | recs = recommender.recommend(USER_ID, 10) 38 | self.compare_recommendations(recs, REFERENCE_USER_RECOMMENDATIONS) 39 | 40 | actions = [Action('1', 1, 1), 41 | Action('1', 2, 2), 42 | Action('2', 2, 1), 43 | Action('2', 3, 1)] 44 | recommender = ItemItemRecommender() 45 | for action in actions: 46 | recommender.add_action(action) 47 | recommender.rebuild_model() 48 | 49 | 50 | 51 | 52 | 53 | 54 | if __name__ == "__main__": 55 | unittest.main() 56 | 57 | -------------------------------------------------------------------------------- /tests/recommenders/baselines/test_lightfm_recommender.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | class TestLightFMRecommender(unittest.TestCase): 4 | def test_lightfm_recommender(self): 5 | USER_ID = '120' 6 
| from aprec.recommenders.filter_seen_recommender import FilterSeenRecommender 7 | from aprec.recommenders.lightfm import LightFMRecommender 8 | from aprec.datasets.movielens20m import get_movielens20m_actions 9 | from aprec.utils.generator_limit import generator_limit 10 | 11 | lightfm_recommender = LightFMRecommender(30, 'bpr') 12 | recommender = FilterSeenRecommender(lightfm_recommender) 13 | for action in generator_limit(get_movielens20m_actions(), 10000): 14 | recommender.add_action(action) 15 | recommender.rebuild_model() 16 | recs = recommender.recommend(USER_ID, 10) 17 | print(recs) 18 | 19 | 20 | 21 | if __name__ == "__main__": 22 | unittest.main() 23 | -------------------------------------------------------------------------------- /tests/recommenders/baselines/test_matrix_factorization_recommender.py: -------------------------------------------------------------------------------- 1 | from aprec.losses import top1 2 | from aprec.recommenders.matrix_factorization import MatrixFactorizationRecommender 3 | from aprec.recommenders.filter_seen_recommender import FilterSeenRecommender 4 | from aprec.datasets.movielens20m import get_movielens20m_actions 5 | from aprec.utils.generator_limit import generator_limit 6 | import tensorflow as tf 7 | import unittest 8 | 9 | USER_ID = '120' 10 | 11 | class TestMatrixFactorizationRecommender(unittest.TestCase): 12 | def setUp(self): 13 | tf.keras.backend.clear_session() 14 | 15 | def tearDown(self): 16 | tf.keras.backend.clear_session() 17 | 18 | 19 | 20 | def test_matrix_factorization_recommender(self): 21 | losses = ['bce', 'bpr', 'lambdarank', 'xendcg', 'climf', 'top1'] 22 | for loss in losses: 23 | print(f"testing matrix factorization model with {loss} loss") 24 | matrix_factorization_recommender = MatrixFactorizationRecommender(32, 5, loss, batch_size=10) 25 | recommender = FilterSeenRecommender(matrix_factorization_recommender) 26 | for action in generator_limit(get_movielens20m_actions(), 10000): 27 | recommender.add_action(action) 28 | recommender.rebuild_model() 29 | recs = recommender.recommend(USER_ID, 10) 30 | print(recs) 31 | 32 | def test_recommend_batch(self): 33 | matrix_factorization_recommender = MatrixFactorizationRecommender(32, 5, 'bce', batch_size=10) 34 | recommender = FilterSeenRecommender(matrix_factorization_recommender) 35 | user_ids = set() 36 | for action in generator_limit(get_movielens20m_actions(), 10000): 37 | recommender.add_action(action) 38 | user_ids.add(action.user_id) 39 | recommender.rebuild_model() 40 | requests = [(user_id, None) for user_id in ['142', '111', '57', '37', '136', '88']] 41 | batch_recommendations = recommender.recommend_batch(requests, 10) 42 | print(batch_recommendations) 43 | 44 | 45 | if __name__ == "__main__": 46 | unittest.main() 47 | -------------------------------------------------------------------------------- /tests/recommenders/baselines/test_mlp_historical.py: -------------------------------------------------------------------------------- 1 | from aprec.recommenders.mlp_historical import GreedyMLPHistorical 2 | from aprec.recommenders.filter_seen_recommender import FilterSeenRecommender 3 | from aprec.datasets.movielens20m import get_movielens20m_actions 4 | from aprec.utils.generator_limit import generator_limit 5 | import tensorflow as tf 6 | import unittest 7 | 8 | USER_ID = '120' 9 | 10 | class TestMLPRecommender(unittest.TestCase): 11 | def setUp(self): 12 | tf.keras.backend.clear_session() 13 | 14 | def tearDown(self): 15 | tf.keras.backend.clear_session() 16 |
17 | 18 | def test_mlp_recommender(self): 19 | mlp_recommender = GreedyMLPHistorical(train_epochs=10, n_val_users=10, batch_size=5) 20 | recommender = FilterSeenRecommender(mlp_recommender) 21 | for action in generator_limit(get_movielens20m_actions(), 10000): 22 | recommender.add_action(action) 23 | recommender.rebuild_model() 24 | recs = recommender.recommend(USER_ID, 10) 25 | print(recs) 26 | 27 | 28 | 29 | if __name__ == "__main__": 30 | unittest.main() 31 | -------------------------------------------------------------------------------- /tests/recommenders/baselines/test_mlp_recommender.py: -------------------------------------------------------------------------------- 1 | from aprec.recommenders.mlp import GreedyMLP 2 | from aprec.recommenders.filter_seen_recommender import FilterSeenRecommender 3 | from aprec.datasets.movielens20m import get_movielens20m_actions, get_movies_catalog 4 | from aprec.utils.generator_limit import generator_limit 5 | import tensorflow as tf 6 | import unittest 7 | 8 | USER_ID = '120' 9 | 10 | class TestMLPRecommender(unittest.TestCase): 11 | def setUp(cls): 12 | tf.keras.backend.clear_session() 13 | 14 | def tearDown(cls): 15 | tf.keras.backend.clear_session() 16 | 17 | 18 | def test_mlp_recommender(self): 19 | mlp_recommender = GreedyMLP(train_epochs=10) 20 | recommender = FilterSeenRecommender(mlp_recommender) 21 | for action in generator_limit(get_movielens20m_actions(), 10000): 22 | recommender.add_action(action) 23 | recommender.rebuild_model() 24 | recs = recommender.recommend(USER_ID, 10) 25 | print(recs) 26 | 27 | 28 | 29 | if __name__ == "__main__": 30 | unittest.main() 31 | 32 | -------------------------------------------------------------------------------- /tests/recommenders/baselines/test_svd_recommender.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | USER_ID = '120' 4 | 5 | REFERENCE_COLD_START = [('318', 0.6019900660660039), ('296', 0.5928136146373703), ('356', 0.5671645460239426), 6 | ('593', 0.494680602882191), ('50', 0.46695169879496523), ('47', 0.46184204110408533), 7 | ('527', 0.4398795906398074), ('260', 0.43692734916941883), 8 | ('1', 0.4210339121252358), ('589', 0.4195799728444275)] 9 | 10 | REFERENCE_USER_RECOMMENDATIONS = [('296', 0.5097028006608604), 11 | ('457', 0.46596785899698745), 12 | ('110', 0.46393997126655373), 13 | ('380', 0.4291430391625074), 14 | ('593', 0.4159414958428441), 15 | ('1', 0.398391005348504), 16 | ('1210', 0.35877141070731267), 17 | ('260', 0.35489876705579815), 18 | ('292', 0.34561595303551884), 19 | ('733', 0.34348521664244525)] 20 | class TestSvdRecommender(unittest.TestCase): 21 | def compare_recommendations(self, rec1, rec2): 22 | self.assertEqual(len(rec1), len(rec2)) 23 | for i in range(len(rec1)): 24 | self.assertEqual(rec1[i][0], rec2[i][0]) 25 | self.assertAlmostEqual(rec1[i][1], rec2[i][1]) 26 | 27 | def test_svd_recommender(self): 28 | from aprec.recommenders.svd import SvdRecommender 29 | from aprec.recommenders.filter_seen_recommender import FilterSeenRecommender 30 | from aprec.datasets.movielens20m import get_movielens20m_actions 31 | from aprec.utils.generator_limit import generator_limit 32 | from aprec.api.action import Action 33 | 34 | svd_recommender = SvdRecommender(10, random_seed=31337) 35 | recommender = FilterSeenRecommender(svd_recommender) 36 | for action in generator_limit(get_movielens20m_actions(), 10000): 37 | recommender.add_action(action) 38 | recommender.rebuild_model() 39 | 
self.compare_recommendations(recommender.recommend(12341324, 10), REFERENCE_COLD_START) 40 | recs = recommender.recommend(USER_ID, 10) 41 | self.compare_recommendations(recs, REFERENCE_USER_RECOMMENDATIONS) 42 | 43 | actions = [Action('1', 1, 1), 44 | Action('1', 2, 2), 45 | Action('2', 2, 1), 46 | Action('2', 3, 1)] 47 | recommender = SvdRecommender(2, random_seed=31337) 48 | for action in actions: 49 | recommender.add_action(action) 50 | recommender.rebuild_model() 51 | 52 | 53 | 54 | 55 | 56 | 57 | if __name__ == "__main__": 58 | unittest.main() 59 | 60 | -------------------------------------------------------------------------------- /tests/recommenders/baselines/test_top_recommender.py: -------------------------------------------------------------------------------- 1 | from aprec.datasets.movielens20m import get_movielens20m_actions, get_movies_catalog 2 | from aprec.recommenders.top_recommender import TopRecommender 3 | from aprec.utils.generator_limit import generator_limit 4 | 5 | import unittest 6 | class TestTopRecommender(unittest.TestCase): 7 | def test_top_recommender(self): 8 | recommender = TopRecommender() 9 | catalog = get_movies_catalog() 10 | for action in generator_limit(get_movielens20m_actions(), 1000): 11 | recommender.add_action(action) 12 | recommender.rebuild_model() 13 | recommendations = recommender.recommend(1, 5) 14 | self.assertEqual(recommendations, [('260', 10), ('589', 9), ('1', 8), ('356', 8), ('480', 8)]) 15 | 16 | def test_recent_top(self): 17 | recommender = TopRecommender(recency=0.5) 18 | for action in generator_limit(get_movielens20m_actions(), 1000): 19 | recommender.add_action(action) 20 | recommender.rebuild_model() 21 | recommendations = recommender.recommend(1, 5) 22 | self.assertEqual(recommendations, [('2959', 3), ('2762', 3), ('1196', 3), ('260', 3), ('587', 2)]) 23 | 24 | print(recommendations) 25 | 26 | 27 | def test_top_recommender_ranking_request(self): 28 | from aprec.api.items_ranking_request import ItemsRankingRequest 29 | recommender = TopRecommender() 30 | ranking_request = ItemsRankingRequest(user_id='1', item_ids=['1196', '589']) 31 | recommender.add_test_items_ranking_request(ranking_request) 32 | actions = list(generator_limit(get_movielens20m_actions(), 1000)) 33 | for action in actions: 34 | recommender.add_action(action) 35 | recommender.rebuild_model() 36 | recommendations = recommender.get_item_rankings() 37 | self.assertEqual(recommendations, {'1': [('589', 9), ('1196', 8)]}) 38 | 39 | if __name__ == "__main__": 40 | unittest.main() 41 | -------------------------------------------------------------------------------- /tests/recommenders/baselines/test_transition_chain_recommender.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | class TestTransitionsChainRecommender(unittest.TestCase): 4 | def test_transitions_chain_recommender(self): 5 | from typing import List 6 | from aprec.api.action import Action 7 | from aprec.recommenders.transition_chain_recommender import TransitionsChainRecommender 8 | recommender = TransitionsChainRecommender() 9 | actions: List[Action] = [ 10 | Action(user_id=0, item_id=0, timestamp=0, data={'utrip_id': 100}), 11 | Action(user_id=0, item_id=1, timestamp=10, data={'utrip_id': 100}), 12 | Action(user_id=0, item_id=2, timestamp=20, data={'utrip_id': 100}), 13 | 14 | Action(user_id=2, item_id=3, timestamp=0, data={'utrip_id': 200}), 15 | Action(user_id=2, item_id=1, timestamp=10, data={'utrip_id': 200}), 16 | Action(user_id=2,
item_id=2, timestamp=20, data={'utrip_id': 200}), 17 | 18 | Action(user_id=3, item_id=1, timestamp=10, data={'utrip_id': 2000}), 19 | Action(user_id=3, item_id=3, timestamp=20, data={'utrip_id': 2000}), 20 | 21 | Action(user_id=4, item_id=1, timestamp=0, data={'utrip_id': 300}), 22 | ] 23 | for action in actions: 24 | recommender.add_action(action) 25 | recommender.rebuild_model() 26 | recommendations = recommender.recommend(4, 2) 27 | self.assertEqual(recommendations, [(2, 2), (3, 1)]) 28 | 29 | if __name__ == "__main__": 30 | unittest.main() 31 | -------------------------------------------------------------------------------- /tests/recommenders/sequential/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asash/gsasrec/9dd47e83b949a84f68a4616c6f779df7f17ebb26/tests/recommenders/sequential/__init__.py -------------------------------------------------------------------------------- /tests/recommenders/sequential/bert4rec/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asash/gsasrec/9dd47e83b949a84f68a4616c6f779df7f17ebb26/tests/recommenders/sequential/bert4rec/__init__.py -------------------------------------------------------------------------------- /tests/recommenders/sequential/sasrec/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asash/gsasrec/9dd47e83b949a84f68a4616c6f779df7f17ebb26/tests/recommenders/sequential/sasrec/__init__.py -------------------------------------------------------------------------------- /tests/recommenders/sequential/sasrec/test_positional_encoding.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 3 | 4 | class TestSinEmbedding(unittest.TestCase): 5 | def test_embedding(self): 6 | from aprec.recommenders.sequential.models.sasrec.sasrec import ExpPositionEncoding, SinePositionEncoding 7 | sinEncoder = SinePositionEncoding(50, 64) 8 | input = np.array([[0, 1, 2, 3],[1,2,3,4]]) 9 | encoded = sinEncoder(input) 10 | self.assertEqual(encoded.shape, (2, 4, 64)) 11 | 12 | expEncoder = ExpPositionEncoding(50, 64) 13 | input = np.array([[0, 1, 2, 3],[1,2,3,4]]) 14 | encoded = expEncoder(input) 15 | self.assertEqual(encoded.shape, (2, 4, 64)) 16 | 17 | if __name__== "__main__": 18 | unittest.main() -------------------------------------------------------------------------------- /tests/recommenders/sequential/sasrec/test_sasrec_attention_map.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | 4 | 5 | class TestSasrecModel(unittest.TestCase): 6 | def test_sasrec_model(self): 7 | from aprec.recommenders.sequential.models.sasrec.sasrec import SASRecConfig 8 | from aprec.recommenders.sequential.sequential_recommender import SequentialRecommender 9 | from aprec.recommenders.sequential.sequential_recommender_config import SequentialRecommenderConfig 10 | from aprec.recommenders.sequential.target_builders.positives_only_targets_builder import PositvesOnlyTargetBuilder 11 | from aprec.recommenders.sequential.targetsplitters.last_item_splitter import SequenceContinuation 12 | from aprec.datasets.movielens20m import get_movielens20m_actions 13 | from aprec.utils.generator_limit import generator_limit 14 | 15 | sasrec_config = SASRecConfig(embedding_size=32) 16 | recommender_config = 
SequentialRecommenderConfig(sasrec_config, train_epochs=10000, early_stop_epochs=50000, 17 | batch_size=5, 18 | training_time_limit=5, 19 | max_batches_per_epoch=100, 20 | sequence_splitter=SequenceContinuation, 21 | sequence_length=5, 22 | targets_builder=PositvesOnlyTargetBuilder, 23 | use_keras_training=True) 24 | 25 | recommender = SequentialRecommender(recommender_config) 26 | val_users = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10'] 27 | recommender.set_val_users(val_users) 28 | for action in generator_limit(get_movielens20m_actions(), 10000): 29 | recommender.add_action(action) 30 | recommender.rebuild_model() 31 | input_seq = recommender.get_model_inputs('120')[0] 32 | seq, attn = recommender.model.get_seq_embedding(input_seq) 33 | print(attn[0]) 34 | 35 | 36 | if __name__ == "__main__": 37 | unittest.main() 38 | -------------------------------------------------------------------------------- /tests/recommenders/sequential/sasrec/test_sasrec_no_embedding_reuse.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | 4 | class TestSasrecNoEmbeddingReuse(unittest.TestCase): 5 | def test_sasrec_model_no_reuse(self): 6 | from aprec.recommenders.sequential.targetsplitters.last_item_splitter import SequenceContinuation 7 | from aprec.recommenders.filter_seen_recommender import FilterSeenRecommender 8 | from aprec.datasets.movielens20m import get_movielens20m_actions 9 | from aprec.utils.generator_limit import generator_limit 10 | from aprec.recommenders.sequential.models.sasrec.sasrec import SASRecConfig 11 | from aprec.recommenders.sequential.sequential_recommender import SequentialRecommender 12 | from aprec.recommenders.sequential.sequential_recommender_config import SequentialRecommenderConfig 13 | from aprec.recommenders.sequential.target_builders.positives_only_targets_builder import PositvesOnlyTargetBuilder 14 | val_users = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10'] 15 | sasrec_config = SASRecConfig(embedding_size=32, reuse_item_embeddings=False) 16 | recommender_config = SequentialRecommenderConfig(sasrec_config, train_epochs=10000, early_stop_epochs=50000, 17 | batch_size=5, 18 | training_time_limit=5, 19 | use_keras_training=True, 20 | max_batches_per_epoch=100, 21 | sequence_splitter=SequenceContinuation, 22 | sequence_length=5, 23 | targets_builder=PositvesOnlyTargetBuilder, 24 | ) 25 | 26 | 27 | recommender = SequentialRecommender(recommender_config) 28 | recommender.set_val_users(val_users) 29 | recommender = FilterSeenRecommender(recommender) 30 | for action in generator_limit(get_movielens20m_actions(), 10000): 31 | recommender.add_action(action) 32 | recommender.rebuild_model() 33 | USER_ID='120' 34 | recs = recommender.recommend(USER_ID, 10) 35 | print(recs) 36 | 37 | if __name__ == "__main__": 38 | unittest.main() -------------------------------------------------------------------------------- /tests/recommenders/sequential/sasrec/test_vanilla_sasrec.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | 4 | from aprec.recommenders.sequential.target_builders.positives_sequence_target_builder import PositivesSequenceTargetBuilder 5 | 6 | class TestVanillaSasrec(unittest.TestCase): 7 | def setUp(self): 8 | os.environ['CUDA_VISIBLE_DEVICES'] = '-1' 9 | 10 | 11 | def test_vanilla_sasrec(self): 12 | from aprec.recommenders.sequential.sequential_recommender import SequentialRecommender 13 | from aprec.datasets.movielens20m import 
get_movielens20m_actions, get_movies_catalog 14 | from aprec.losses.bce import BCELoss 15 | from aprec.recommenders.sequential.target_builders.negative_per_positive_target import NegativePerPositiveTargetBuilder 16 | from aprec.recommenders.sequential.targetsplitters.shifted_sequence_splitter import ShiftedSequenceSplitter 17 | from aprec.recommenders.filter_seen_recommender import FilterSeenRecommender 18 | from aprec.recommenders.sequential.models.sasrec.sasrec import SASRecConfig 19 | from aprec.recommenders.sequential.sequential_recommender_config import SequentialRecommenderConfig 20 | from aprec.utils.generator_limit import generator_limit 21 | 22 | USER_ID = '120' 23 | val_users = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10'] 24 | model_config = SASRecConfig(embedding_size=32, vanilla=True, vanilla_num_negatives=5, vanilla_bce_t=1) 25 | 26 | recommender_config = SequentialRecommenderConfig(model_config, train_epochs=10000, early_stop_epochs=50000, 27 | batch_size=5, 28 | training_time_limit=5, 29 | sequence_splitter=ShiftedSequenceSplitter, 30 | targets_builder=PositivesSequenceTargetBuilder, 31 | use_keras_training=False 32 | ) 33 | 34 | recommender = SequentialRecommender(recommender_config) 35 | 36 | recommender.set_val_users(val_users) 37 | recommender = FilterSeenRecommender(recommender) 38 | for action in generator_limit(get_movielens20m_actions(), 10000): 39 | recommender.add_action(action) 40 | recommender.rebuild_model() 41 | recs = recommender.recommend(USER_ID, 10) 42 | catalog = get_movies_catalog() 43 | for rec in recs: 44 | print(catalog.get_item(rec[0]), "\t", rec[1]) 45 | 46 | if __name__ == "__main__": 47 | unittest.main() -------------------------------------------------------------------------------- /tests/recommenders/sequential/test_add_mask_vectorizer.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | class TestAddMaskHistoryVectorizer(unittest.TestCase): 4 | def test_add_mask(self): 5 | from aprec.recommenders.sequential.history_vectorizers.add_mask_history_vectorizer import AddMaskHistoryVectorizer 6 | seq = [(0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (5, 5)] 7 | vectorizer = AddMaskHistoryVectorizer() 8 | vectorizer.set_sequence_len(4) 9 | vectorizer.set_padding_value(7) 10 | vectorized = vectorizer(seq) 11 | self.assertEqual(len(vectorized), 4) 12 | self.assertEqual(list(vectorized), [3, 4, 5, 8]) 13 | 14 | seq = [(3, 3), (4, 4), (5, 5)] 15 | vectorizer = AddMaskHistoryVectorizer() 16 | vectorizer.set_sequence_len(4) 17 | vectorizer.set_padding_value(7) 18 | vectorized = vectorizer(seq) 19 | self.assertEqual(len(vectorized), 4) 20 | self.assertEqual(list(vectorized), [3, 4, 5, 8]) 21 | 22 | 23 | seq = [(4, 4), (5, 5)] 24 | vectorizer = AddMaskHistoryVectorizer() 25 | vectorizer.set_sequence_len(4) 26 | vectorizer.set_padding_value(7) 27 | vectorized = vectorizer(seq) 28 | self.assertEqual(len(vectorized), 4) 29 | self.assertEqual(list(vectorized), [7, 4, 5, 8]) 30 | 31 | if __name__ == "__main__": 32 | unittest.main() 33 | -------------------------------------------------------------------------------- /tests/recommenders/sequential/test_caser_no_uid.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | class TestCaserNoUid(unittest.TestCase): 4 | def test_caser_model_no_uid(self): 5 | from aprec.losses.bce import BCELoss 6 | from aprec.recommenders.sequential.sequential_recommender import SequentialRecommender 7 | from 
aprec.recommenders.sequential.sequential_recommender_config import SequentialRecommenderConfig 8 | from aprec.recommenders.sequential.models.caser import CaserConfig 9 | from aprec.recommenders.filter_seen_recommender import FilterSeenRecommender 10 | from aprec.datasets.movielens20m import get_movielens20m_actions 11 | from aprec.utils.generator_limit import generator_limit 12 | 13 | 14 | val_users = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10'] 15 | caser_config = CaserConfig() 16 | recommender_config = SequentialRecommenderConfig(caser_config, train_epochs=10, 17 | early_stop_epochs=5, batch_size=5, 18 | training_time_limit=10, 19 | loss=BCELoss(), 20 | sequence_length=5, 21 | use_keras_training=True 22 | ) 23 | recommender = SequentialRecommender(recommender_config) 24 | recommender.set_val_users(val_users) 25 | recommender = FilterSeenRecommender(recommender) 26 | for action in generator_limit(get_movielens20m_actions(), 10000): 27 | recommender.add_action(action) 28 | recommender.rebuild_model() 29 | USER_ID = '120' 30 | recs = recommender.recommend(USER_ID, 10) 31 | print(recs) 32 | 33 | if __name__ == '__main__': 34 | unittest.main() 35 | 36 | -------------------------------------------------------------------------------- /tests/recommenders/sequential/test_gru_model.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | class TestGRUModel(unittest.TestCase): 4 | def test_gru_model(self): 5 | from aprec.losses.bce import BCELoss 6 | from aprec.recommenders.sequential.sequential_recommender_config import SequentialRecommenderConfig 7 | 8 | from aprec.recommenders.sequential.models.gru4rec import GRU4RecConfig 9 | from aprec.recommenders.sequential.sequential_recommender import SequentialRecommender 10 | from aprec.recommenders.filter_seen_recommender import FilterSeenRecommender 11 | from aprec.datasets.movielens20m import get_movielens20m_actions 12 | from aprec.utils.generator_limit import generator_limit 13 | USER_ID ='120' 14 | 15 | val_users = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10'] 16 | model_config = GRU4RecConfig() 17 | recommender_config = SequentialRecommenderConfig(model_config, train_epochs=10, early_stop_epochs=5, 18 | batch_size=5, training_time_limit=10, loss=BCELoss(), 19 | sequence_length=10) 20 | recommender = SequentialRecommender(recommender_config) 21 | recommender.set_val_users(val_users) 22 | recommender = FilterSeenRecommender(recommender) 23 | for action in generator_limit(get_movielens20m_actions(), 10000): 24 | recommender.add_action(action) 25 | recommender.rebuild_model() 26 | recs = recommender.recommend(USER_ID, 10) 27 | print(recs) 28 | 29 | if __name__ == "__main__": 30 | unittest.main() 31 | 32 | -------------------------------------------------------------------------------- /tests/recommenders/sequential/test_items_masking_target_builder.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 3 | 4 | 5 | class TestItemsMaskingTargetBuilder(unittest.TestCase): 6 | def test_target_builder(self): 7 | from aprec.recommenders.sequential.target_builders.items_masking_target_builder import ItemsMaskingTargetsBuilder 8 | targets_builder = ItemsMaskingTargetsBuilder(relative_positions_encoding=False) 9 | targets_builder.set_sequence_len(5) 10 | targets_builder.set_n_items(10) 11 | targets_builder.build([(4, [(1, (1, 3)), (3, (3, 5))]), (3, [(1, (1, 6))])]) 12 | expected_targets = np.array([[-100, 3, -100, 5, 
-100], [-100, 6, -100, -100, -100]]) 13 | expected_positions = np.array([[1, 2, 3, 4, 5], [1, 2, 3, 4, 5]]) 14 | extra_inputs, target = targets_builder.get_targets(0, 2) 15 | self.assertEqual(len(extra_inputs), 2) # extra inputs are (labels, positions) 16 | 17 | self.assertTrue(np.all(expected_targets == target)) 18 | self.assertTrue(np.all(extra_inputs[0] == target)) 19 | self.assertTrue(np.all(extra_inputs[1] == expected_positions)) 20 | 21 | 22 | 23 | if __name__ == "__main__": 24 | unittest.main() 25 | 26 | -------------------------------------------------------------------------------- /tests/recommenders/test_lambdamart_ensemble_recommender.py: -------------------------------------------------------------------------------- 1 | 2 | from tempfile import NamedTemporaryFile 3 | import unittest 4 | 5 | def train_model(): 6 | import json 7 | import os 8 | from aprec.datasets.movielens20m import get_movielens20m_actions 9 | from aprec.recommenders.top_recommender import TopRecommender 10 | from aprec.recommenders.svd import SvdRecommender 11 | from aprec.recommenders.lambdamart_ensemble_recommender import LambdaMARTEnsembleRecommender 12 | from aprec.recommenders.filter_seen_recommender import FilterSeenRecommender 13 | import tempfile 14 | from aprec.utils.generator_limit import generator_limit 15 | import pandas as pd 16 | 17 | tempdir = tempfile.mkdtemp("lambdamart_recommender_test") 18 | candidates_selection = FilterSeenRecommender(TopRecommender()) 19 | other_recommenders = { 20 | "svd_recommender": SvdRecommender(128) 21 | } 22 | recommender = LambdaMARTEnsembleRecommender( 23 | candidates_selection_recommender=candidates_selection, 24 | other_recommenders=other_recommenders, 25 | n_ensemble_users=200, 26 | n_ensemble_val_users=20, 27 | log_dir=tempdir 28 | ) 29 | 30 | USER_ID = '120' 31 | 32 | for action in generator_limit(get_movielens20m_actions(), 100000): 33 | recommender.add_action(action) 34 | recommender.rebuild_model() 35 | recs = recommender.recommend(USER_ID, 10) 36 | recs = recommender.recommend('121', 10) 37 | print(recs) 38 | print(json.dumps(recommender.get_metadata())) 39 | train_csv = pd.read_csv(os.path.join(tempdir, 'ensemble_train.csv.gz'), compression='gzip', delimiter=';') 40 | val_csv = pd.read_csv(os.path.join(tempdir, 'ensemble_train.csv.gz'), compression='gzip', delimiter=';') 41 | return recommender 42 | 43 | def train_and_save(tempdir): 44 | recommender = train_model() 45 | 46 | 47 | 48 | class TestLambdaMartEnsembleRecommender(unittest.TestCase): 49 | def test_lambdamart_ensemble_recommender(self): 50 | recommender = train_model() 51 | with NamedTemporaryFile() as tmp: 52 | recommender.save(tmp.name) 53 | 54 | 55 | 56 | if __name__ == "__main__": 57 | unittest.main() 58 | 59 | 60 | 61 | -------------------------------------------------------------------------------- /tests/recommenders/test_vanilla_bert4rec.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | 4 | def get_actions(): 5 | from aprec.utils.generator_limit import generator_limit 6 | from aprec.datasets.movielens20m import get_movielens20m_actions 7 | return [action for action in generator_limit(get_movielens20m_actions(), 100000)] 8 | 9 | def get_recommender_and_add_actions(): 10 | recommender = get_recommender() 11 | for action in get_actions(): 12 | recommender.add_action(action) 13 | return recommender 14 | 15 | def get_recommender(): 16 | from aprec.recommenders.vanilla_bert4rec import VanillaBERT4Rec 17 |
return VanillaBERT4Rec(training_time_limit=5) 18 | 19 | class TestVanillaBert4rec(unittest.TestCase): 20 | def test_vanilla_bert4rec(self): 21 | recommender = get_recommender_and_add_actions() 22 | recommender.rebuild_model() 23 | print(recommender.recommend('120', 10)) 24 | recs = recommender.recommend('cold-start-user', 10) 25 | self.assertEqual(recs, []) 26 | 27 | def test_sampled_rankings(self): 28 | from aprec.api.items_ranking_request import ItemsRankingRequest 29 | recommender = get_recommender_and_add_actions() 30 | predict_items = ['260', '294', '296'] 31 | ranking_request = ItemsRankingRequest('120', ['260', '294', '296']) 32 | recommender.add_test_items_ranking_request(ranking_request) 33 | recommender.rebuild_model() 34 | sampled_scores = recommender.get_item_rankings() 35 | self.assertEqual(len(sampled_scores), 1) 36 | predicted_scores = sampled_scores['120'] 37 | unseen_item = '294' 38 | for item, score in predicted_scores: 39 | self.assertTrue(item in predict_items) 40 | if item == unseen_item: 41 | self.assertEqual(score, -float('inf')) 42 | else: 43 | self.assertGreater(score, -float('inf')) 44 | self.assertLess(score, float('inf')) 45 | 46 | 47 | 48 | 49 | if __name__ == "__main__": 50 | unittest.main() 51 | -------------------------------------------------------------------------------- /ui/config.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | from aprec.datasets.movielens20m import get_movies_catalog, get_movielens20m_actions 3 | from aprec.recommenders.mlp_historical import GreedyMLPHistorical 4 | from aprec.recommenders.filter_seen_recommender import FilterSeenRecommender 5 | import sys 6 | 7 | CATALOG = get_movies_catalog() 8 | 9 | actions = get_movielens20m_actions(1.0) 10 | recommender = FilterSeenRecommender(GreedyMLPHistorical(train_epochs=300)) 11 | 12 | cnt = 0 13 | for action in tqdm(actions, ascii=True): 14 | recommender.add_action(action) 15 | cnt += 1 16 | 17 | sys.stderr.write("building model...") 18 | 19 | recommender.rebuild_model() 20 | 21 | sys.stderr.write("ready.") 22 | 23 | RECOMMENDER = recommender 24 | -------------------------------------------------------------------------------- /ui/server.py: -------------------------------------------------------------------------------- 1 | import tornado.ioloop 2 | import tornado.web 3 | import json 4 | import os 5 | 6 | from aprec.ui.config import CATALOG, RECOMMENDER 7 | 8 | 9 | class SearchHandler(tornado.web.RequestHandler): 10 | def get(self): 11 | keyword = self.request.arguments.get("keyword")[0].decode("utf-8") 12 | items = CATALOG.search(keyword) 13 | result = [] 14 | for item in items: 15 | result.append("[{}] {}".format(item.item_id, item.title)) 16 | self.set_header('Content-Type', 'application/json') 17 | self.write(json.dumps(result, indent=4)) 18 | 19 | 20 | class RecommenderHandler(tornado.web.RequestHandler): 21 | def post(self): 22 | history_raw = [item.decode("utf-8") for item in self.request.arguments['history[]']] 23 | history_item_ids = [item.split("]")[0].strip("[") for item in history_raw] 24 | recommendations = RECOMMENDER.recommend_by_items(history_item_ids, 10) 25 | result = [] 26 | for item in recommendations: 27 | result.append("[{}] {}".format(item[0], CATALOG.get_item(item[0]).title)) 28 | self.set_header('Content-Type', 'application/json') 29 | self.write(json.dumps(result, indent=4)) 30 | 31 | 32 | def make_app(): 33 | current_dir = os.path.dirname(__file__) 34 | static_dir = os.path.join(current_dir, 
"static") 35 | print(static_dir) 36 | return tornado.web.Application([ 37 | (r"/search", SearchHandler), 38 | (r"/recommend", RecommenderHandler), 39 | (r"/(.*)", tornado.web.StaticFileHandler, {"path": static_dir, "default_filename": "index.html"}) 40 | ]) 41 | 42 | 43 | if __name__ == "__main__": 44 | app = make_app() 45 | app.listen(31337) 46 | tornado.ioloop.IOLoop.current().start() 47 | -------------------------------------------------------------------------------- /ui/static/app.js: -------------------------------------------------------------------------------- 1 | function add_item_to_history(item){ 2 | $('#history').append('
<li>' + item + "</li>"); 3 | } 4 | 5 | function update_recommendations() { 6 | var last_movies = []; 7 | var history_elems = $('#history>li'); 8 | for (var i = 0; i < history_elems.length; ++i){ 9 | last_movies.push(history_elems[i].textContent); 10 | } 11 | $.post("/recommend", {"history": last_movies}).done(function(response){ 12 | $("#recommendations").empty(); 13 | for (var i = 0; i < response.length; ++i){ 14 | $("#recommendations").append('
<li>' + response[i] + "</li>"); 15 | } 16 | }); 17 | } 18 | 19 | $(document).ready(function() { 20 | // Defining the local dataset 21 | var movies = new Bloodhound({ 22 | datumTokenizer: Bloodhound.tokenizers.whitespace, 23 | queryTokenizer: Bloodhound.tokenizers.whitespace, 24 | remote: { 25 | url: '/search?keyword=%QUERY', 26 | wildcard: '%QUERY' 27 | } 28 | }); 29 | 30 | // Initializing the typeahead 31 | $('#search').typeahead({ 32 | hint: true, 33 | highlight: true, /* Enable substring highlighting */ 34 | minLength: 1 /* Specify minimum characters required for showing suggestions */ 35 | }, 36 | { 37 | name: 'moviesSearch', 38 | source: movies 39 | }); 40 | 41 | 42 | $('#search').bind('typeahead:selected', function(obj, datum, name) { 43 | add_item_to_history(datum); 44 | $('#search').typeahead('val', ''); 45 | update_recommendations(); 46 | }); 47 | // all custom jQuery will go here 48 | }); -------------------------------------------------------------------------------- /ui/static/index.html: -------------------------------------------------------------------------------- 1 | <!-- Markup garbled during extraction (tags were stripped; only text content survived). Recoverable content: page title "Item-Item recommender demo page"; page text "Aprec recommender demo page", "Based on MovieLens 20M Dataset", "Add few movies into liked list to get personal recommendations"; a typeahead search input (id "search"), a "Your history" list (id "history"), and a "Recommended Movies" list (id "recommendations"), all wired up by app.js above. -->
-------------------------------------------------------------------------------- /utils/generator_limit.py: -------------------------------------------------------------------------------- 1 | def generator_limit(generator, n): 2 | """Yield at most the first n items produced by generator.""" 3 | count = 0 4 | for item in generator: 5 | if count >= n: 6 | break 7 | yield item 8 | count += 1 9 | 10 | -------------------------------------------------------------------------------- /utils/item_id.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | 3 | 4 | 5 | class ItemId(object): 6 | def __init__(self): 7 | self.straight = {} 8 | self.reverse = {} 9 | self.get_count = Counter() 10 | 11 | def size(self): 12 | return len(self.straight) 13 | 14 | def get_id(self, item_id): 15 | if item_id not in self.straight: 16 | self.straight[item_id] = len(self.straight) 17 | self.reverse[self.straight[item_id]] = item_id 18 | self.get_count[item_id] += 1 19 | return self.straight[item_id] 20 | 21 | def has_id(self, id): 22 | return id in self.reverse 23 | 24 | def has_item(self, item_id): 25 | return item_id in self.straight 26 | 27 | def reverse_id(self, id): 28 | return self.reverse[id] 29 | 30 | def save(self, file_name): 31 | with open(file_name, "w") as output: 32 | for item in self.straight: 33 | output.write(f"{item} {self.straight[item]}\n") 34 | 35 | @staticmethod 36 | def load(file_name): # note: only the id mapping is persisted, so get_count starts empty after load 37 | straight, reverse = {}, {} 38 | for line in open(file_name): 39 | external, internal = line.rstrip().split(" ") 40 | internal = int(internal) 41 | straight[external] = internal 42 | reverse[internal] = external 43 | result = ItemId() 44 | result.straight = straight 45 | result.reverse = reverse 46 | return result -------------------------------------------------------------------------------- /utils/os_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | import subprocess 4 | import shlex 5 | import logging 6 | import hashlib 7 | 8 | def get_dir(): 9 | utils_dirname = os.path.dirname(os.path.abspath(__file__)) 10 | lib_dirname = os.path.abspath(os.path.join(utils_dirname, "..")) 11 | return lib_dirname 12 | 13 | def recursive_listdir(dir_name): 14 | result = [] 15 | for name in os.listdir(dir_name): 16 | full_name = os.path.join(dir_name, name) 17 | if os.path.isdir(full_name): 18 | result += recursive_listdir(full_name) 19 | else: 20 | result.append(full_name) 21 | return result 22 | 23 | def shell(cmd): 24 | logging.info("running shell command: \n {}".format(cmd)) 25 | subprocess.check_call(shlex.split(cmd)) 26 | 27 | def mkdir_p(dir_path): 28 | shell("mkdir -p {}".format(dir_path)) 29 | return Path(dir_path) 30 | 31 | def mkdir_p_local(relative_dir_path): 32 | """Create a folder inside the library directory if it does not already exist.""" 33 | local_dir = get_dir() 34 | abspath = os.path.join(local_dir, relative_dir_path) 35 | mkdir_p(abspath) 36 | return abspath 37 | 38 | 39 | def file_md5(fname): 40 | hash_md5 = hashlib.md5() 41 | with open(fname, "rb") as f: 42 | for chunk in iter(lambda: f.read(4096), b""): 43 | hash_md5.update(chunk) 44 | return hash_md5.hexdigest() 45 | 46 | def console_logging(): 47 | logging.basicConfig(level=logging.DEBUG, format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s') 48 | --------------------------------------------------------------------------------
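
Appendix: a minimal usage sketch (not part of the repository) showing how generator_limit and ItemId from the utils above fit together. FakeAction and fake_actions() are hypothetical stand-ins for a real action source such as get_movielens20m_actions(); everything else uses only the calls defined in utils/generator_limit.py and utils/item_id.py.

import os
import tempfile

from aprec.utils.generator_limit import generator_limit
from aprec.utils.item_id import ItemId


class FakeAction:
    """Hypothetical minimal action object; only item_id is used here."""
    def __init__(self, item_id):
        self.item_id = item_id


def fake_actions():
    # hypothetical stand-in for a real source such as get_movielens20m_actions()
    for raw_id in ["318", "296", "318", "356"]:
        yield FakeAction(raw_id)


item_ids = ItemId()
for action in generator_limit(fake_actions(), 3):  # consume at most the first 3 actions
    item_ids.get_id(action.item_id)  # assigns dense internal ids in first-seen order

assert item_ids.size() == 2             # only "318" and "296" were seen within the limit
assert item_ids.reverse_id(0) == "318"  # internal id 0 maps back to the first external id

# The mapping round-trips through a plain "external internal" text file;
# note that the get_count statistics are not persisted by save()/load().
path = os.path.join(tempfile.mkdtemp(), "mapping.txt")
item_ids.save(path)
restored = ItemId.load(path)
assert restored.get_id("296") == item_ids.get_id("296")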