├── .gitignore ├── LICENSE ├── README.md ├── examples └── 00_quick_start │ ├── CLSR │ └── taobao-clsr-debug │ │ ├── README.md │ │ └── model.tar.gz │ └── sequential.py ├── reco_utils ├── README.md ├── __init__.py ├── common │ ├── __init__.py │ ├── constants.py │ ├── general_utils.py │ ├── gpu_utils.py │ ├── notebook_memory_management.py │ ├── notebook_utils.py │ ├── plot.py │ ├── python_utils.py │ ├── spark_utils.py │ ├── tf_utils.py │ └── timer.py ├── dataset │ ├── __init__.py │ ├── blob_utils.py │ ├── covid_utils.py │ ├── download_utils.py │ ├── pandas_df_utils.py │ ├── python_splitters.py │ ├── sequential_reviews.py │ ├── spark_splitters.py │ ├── sparse.py │ ├── split_utils.py │ └── wikidata.py ├── evaluation │ ├── __init__.py │ ├── python_evaluation.py │ └── spark_evaluation.py └── recommender │ ├── __init__.py │ └── deeprec │ ├── DataModel │ └── ImplicitCF.py │ ├── __init__.py │ ├── config │ ├── asvd.yaml │ ├── caser.yaml │ ├── clsr.yaml │ ├── dien.yaml │ ├── din.yaml │ ├── gru4rec.yaml │ ├── lgn.yaml │ ├── lightgcn.yaml │ ├── ncf.yaml │ ├── nextitnet.yaml │ └── sli_rec.yaml │ ├── deeprec_utils.py │ ├── io │ ├── __init__.py │ ├── dkn_iterator.py │ ├── iterator.py │ ├── nextitnet_iterator.py │ └── sequential_iterator.py │ └── models │ ├── __init__.py │ ├── base_model.py │ └── sequential │ ├── asvd.py │ ├── caser.py │ ├── clsr.py │ ├── dien.py │ ├── din.py │ ├── gru4rec.py │ ├── lgn.py │ ├── ncf.py │ ├── nextitnet.py │ ├── rnn_cell_implement.py │ ├── rnn_dien.py │ ├── sequential_base_model.py │ └── sli_rec.py └── tests ├── __init__.py └── resources └── deeprec └── sequential └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | *.pretrain* 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Locust files: 73 | locustfile.py 74 | 75 | # Jupyter Notebook 76 | .ipynb_checkpoints 77 | 78 | # pyenv 79 | .python-version 80 | 81 | # celery beat schedule file 82 | celerybeat-schedule 83 | 84 | # SageMath parsed files 85 | *.sage.py 86 | 87 | # Environments 88 | .env 89 | .venv 90 | env/ 91 | venv/ 92 | ENV/ 93 | env.bak/ 94 | venv.bak/ 95 | 96 | # Spyder project settings 97 | .spyderproject 98 | .spyproject 99 | 100 | # Rope project settings 101 | .ropeproject 102 | 103 | # mkdocs documentation 104 | /site 105 | 106 | # mypy 107 | .mypy_cache/ 108 | 109 | # Tensorflow 110 | *model_checkpoints 111 | **/outputs 112 | 113 | # Azure ML 114 | config.json 115 | aml_config/ 116 | aml_scripts/ 117 | aml_data/ 118 | 119 | # Spark 120 | spark-warehouse/ 121 | 122 | ########################## 123 | .DS_Store 124 | .~* 125 | Untitled*.ipynb 126 | *-Copy*.ipynb 127 | ~$* 128 | output.ipynb 129 | conda*.yaml 130 | reco_*.yaml 131 | .idea/ 132 | *.npz 133 | *.data 134 | *.dat 135 | *.csv 136 | *.zip 137 | *.7z 138 | .vscode/ 139 | u.item 140 | ml-100k/ 141 | ml-10M100K/ 142 | ml-1m/ 143 | ml-20m/ 144 | *.jar 145 | *.item 146 | *.pkl 147 | *.txt 148 | *.pdf 149 | .pretrain 150 | *.npy 151 | *.ckpt* 152 | *.png 153 | *.jpg 154 | *.jpeg 155 | *.gif 156 | *.model 157 | *.mml 158 | nohup.out 159 | *.vec 160 | *.tsv 161 | *.sh 162 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 FIB LAB, Tsinghua University 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CLSR: Disentangling Long and Short-Term Interests for Recommendation 2 | 3 | This is the official implementation of our WWW'22 paper: 4 | 5 | Yu Zheng, Chen Gao, Jianxin Chang, Yanan Niu, Yang Song, Depeng Jin, Yong Li, **Disentangling Long and Short-Term Interests for Recommendation**, In Proceedings of the Web Conference 2022. 6 | 7 | The code has been tested on a Linux machine with TensorFlow 1.15.2 and Python 3.6.8. 8 | 9 | Please cite our paper if you use this repository. 10 | ``` 11 | @inproceedings{zheng2022disentangling, 12 | title={Disentangling Long and Short-Term Interests for Recommendation}, 13 | author={Zheng, Yu and Gao, Chen and Chang, Jianxin and Niu, Yanan and Song, Yang and Jin, Depeng and Li, Yong}, 14 | booktitle={Proceedings of the ACM Web Conference 2022}, 15 | pages={2256--2267}, 16 | year={2022} 17 | } 18 | ``` 19 | 20 | ## Data Pre-processing 21 | 22 | 23 | Run the script `reco_utils/dataset/sequential_reviews.py` to generate the data for training and evaluation. 24 | 25 | Details of the data are available at [Data](./tests/resources/deeprec/sequential/README.md). 26 | 27 | 28 | 29 | 30 | ## Model Training 31 | 32 | Use the following commands to train a CLSR model on the `Taobao` dataset: 33 | 34 | ``` 35 | cd ./examples/00_quick_start/ 36 | python sequential.py --dataset taobao 37 | ``` 38 | 39 | or on the `Kuaishou` dataset: 40 | 41 | ``` 42 | cd ./examples/00_quick_start/ 43 | python sequential.py --dataset kuaishou 44 | ``` 45 | 46 | 47 | ## Pretrained Model Evaluation 48 | 49 | We provide a pretrained model for the `Taobao` dataset at [Model](./examples/00_quick_start/CLSR/taobao-clsr-debug/README.md). 50 | 51 | ``` 52 | cd ./examples/00_quick_start/ 53 | python sequential.py --dataset taobao --only_test 54 | ``` 55 | 56 | The performance of the provided pretrained model is as follows: 57 | | AUC | GAUC | MRR | NDCG@2 | 58 | | ---- | ---- | ---- | ---- | 59 | | 0.8954 | 0.8936 | 0.4384 | 0.3807 | 60 | 61 | 62 | ## Note 63 | 64 | The implementation is based on *[Microsoft Recommenders](https://github.com/microsoft/recommenders)*. 65 | -------------------------------------------------------------------------------- /examples/00_quick_start/CLSR/taobao-clsr-debug/README.md: -------------------------------------------------------------------------------- 1 | # Pretrained Model 2 | 3 | Decompress `model.tar.gz` in the current directory. 4 | ``` 5 | tar -xzf model.tar.gz 6 | ``` -------------------------------------------------------------------------------- /examples/00_quick_start/CLSR/taobao-clsr-debug/model.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tsinghua-fib-lab/CLSR/92c6c7e069374fa2d7217f3a0987f64ce47e7cad/examples/00_quick_start/CLSR/taobao-clsr-debug/model.tar.gz -------------------------------------------------------------------------------- /reco_utils/README.md: -------------------------------------------------------------------------------- 1 | # Recommender Utilities 2 | 3 | This package (reco_utils) contains functions to simplify common tasks used when developing and evaluating recommender systems. A short description of the sub-modules is provided below. For more details about what functions are available and how to use them, please review the doc-strings provided with the code.
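As a quick illustration of how these utilities fit together, the sketch below builds a toy interaction log and splits it chronologically per user. This is only a minimal, hypothetical example: it assumes the default `userID`/`itemID`/`rating`/`timestamp` column names defined in `reco_utils.common.constants` and uses `python_chrono_split` from `reco_utils.dataset.python_splitters`.

```python
import pandas as pd

from reco_utils.dataset.python_splitters import python_chrono_split

# Toy interaction log using the default column names
data = pd.DataFrame({
    "userID": [1, 1, 1, 1, 2, 2, 2, 2],
    "itemID": [10, 11, 12, 13, 10, 12, 14, 15],
    "rating": [4.0, 3.0, 5.0, 2.0, 5.0, 4.0, 3.0, 4.0],
    "timestamp": [1, 2, 3, 4, 1, 2, 3, 4],
})

# Hold out the most recent 25% of each user's interactions for testing
train, test = python_chrono_split(data, ratio=0.75)
print(len(train), len(test))  # 6 and 2 with this toy log (3 train + 1 test row per user)
```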
4 | 5 | See the [online documentation](https://readthedocs.org/projects/microsoft-recommenders/). 6 | 7 | ## [AzureML](azureml) 8 | 9 | The AzureML submodule contains utilities to train, tune and operationalize recommendation systems at scale using AzureML. 10 | 11 | ## [Common](common) 12 | 13 | This submodule contains high-level utilities for defining constants used in most algorithms as well as helper functions for managing aspects of different frameworks: gpu, spark, jupyter notebook. 14 | 15 | ## [Dataset](dataset) 16 | 17 | Dataset includes helper functions for interacting with Azure Cosmos databases, pulling different datasets and formatting them appropriately as well as utilities for splitting data for training / testing. 18 | 19 | ### Data Loading 20 | 21 | There are dataloaders for several datasets. For example, the movielens module will allow you to load a dataframe in pandas or spark formats from the MovieLens dataset, with sizes of 100k, 1M, 10M, or 20M to test algorithms and evaluate performance benchmarks. 22 | 23 | ```python 24 | df = movielens.load_pandas_df(size="100k") 25 | ``` 26 | 27 | ### Splitting Techniques 28 | 29 | Currently three methods are available for splitting datasets. All of them support splitting by user or item and filtering out minimal samples (for instance users that have not rated enough item, or items that have not been rated by enough users). 30 | 31 | - Random: this is the basic approach where entries are randomly assigned to each group based on the ratio desired 32 | - Chronological: this uses provided timestamps to order the data and selects a cut-off time that will split the desired ratio of data to train before that time and test after that time 33 | - Stratified: this is similar to random sampling, but the splits are stratified, for example if the datasets are split by user, the splitting approach will attempt to maintain the same set of items used in both training and test splits. The converse is true if splitting by item. 34 | 35 | ## [Evaluation](evaluation) 36 | 37 | The evaluation submodule includes functionality for performing hyperparameter sweeps as well as calculating common recommender metrics directly in python or in a Spark environment using pyspark. 38 | 39 | Currently available metrics include: 40 | 41 | - Root Mean Squared Error 42 | - Mean Absolute Error 43 | - R2 44 | - Explained Variance 45 | - Precision at K 46 | - Recall at K 47 | - Normalized Discounted Cumulative Gain at K 48 | - Mean Average Precision at K 49 | - Area Under Curve 50 | - Logistic Loss 51 | 52 | ## [Recommender](recommender) 53 | 54 | The recommender submodule contains implementations of various algorithms that can be used in addition to external packages to evaluate and develop new recommender system approaches. A description of all the algorithms can be found on [this table](../README.md#algorithms). Next a list of the algorithm utilities: 55 | 56 | * Cornac 57 | * DeepRec (includes xDeepFM and DKN) 58 | * FastAI 59 | * LightGBM 60 | * NCF 61 | * NewsRec (includes LSTUR, NAML NPA and NRMS) 62 | * RBM 63 | * RLRMC 64 | * SAR 65 | * Surprise 66 | * Vowpal Wabbit (VW) 67 | * Wide&Deep 68 | 69 | ## [Tuning](tuning) 70 | 71 | This submodule contains utilities for performing hyperparameter tuning. 72 | -------------------------------------------------------------------------------- /reco_utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 
2 | # Licensed under the MIT License. 3 | 4 | __title__ = "Microsoft Recommenders" 5 | __version__ = "2020.8" 6 | __author__ = "RecoDev Team at Microsoft" 7 | __license__ = "MIT" 8 | __copyright__ = "Copyright 2018-present Microsoft Corporation" 9 | 10 | # Synonyms 11 | TITLE = __title__ 12 | VERSION = __version__ 13 | AUTHOR = __author__ 14 | LICENSE = __license__ 15 | COPYRIGHT = __copyright__ 16 | -------------------------------------------------------------------------------- /reco_utils/common/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tsinghua-fib-lab/CLSR/92c6c7e069374fa2d7217f3a0987f64ce47e7cad/reco_utils/common/__init__.py -------------------------------------------------------------------------------- /reco_utils/common/constants.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | # Default column names 5 | DEFAULT_USER_COL = "userID" 6 | DEFAULT_ITEM_COL = "itemID" 7 | DEFAULT_RATING_COL = "rating" 8 | DEFAULT_LABEL_COL = "label" 9 | DEFAULT_TIMESTAMP_COL = "timestamp" 10 | DEFAULT_PREDICTION_COL = "prediction" 11 | COL_DICT = { 12 | "col_user": DEFAULT_USER_COL, 13 | "col_item": DEFAULT_ITEM_COL, 14 | "col_rating": DEFAULT_RATING_COL, 15 | "col_prediction": DEFAULT_PREDICTION_COL, 16 | } 17 | 18 | # Filtering variables 19 | DEFAULT_K = 10 20 | DEFAULT_THRESHOLD = 10 21 | 22 | # Other 23 | SEED = 42 24 | -------------------------------------------------------------------------------- /reco_utils/common/general_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | import os 5 | import psutil 6 | 7 | 8 | def invert_dictionary(dictionary): 9 | """Invert a dictionary 10 | 11 | .. note:: 12 | 13 | If the dictionary has unique keys and unique values, the inversion would be perfect. However, if there are 14 | repeated values, the inversion can take different keys 15 | 16 | Args: 17 | dictionary (dict): A dictionary 18 | 19 | Returns: 20 | dict: inverted dictionary 21 | """ 22 | return {v: k for k, v in dictionary.items()} 23 | 24 | 25 | def get_physical_memory(): 26 | """Get the physical memory in GBs. 27 | 28 | Returns: 29 | float: Physical memory in GBs. 30 | """ 31 | return psutil.virtual_memory()[0] / 1073741824 32 | 33 | 34 | def get_number_processors(): 35 | """Get the number of processors in a CPU. 36 | 37 | Returns: 38 | int: Number of processors. 39 | """ 40 | try: 41 | num = os.cpu_count() 42 | except Exception: 43 | import multiprocessing # force exception in case multiprocessing is not installed 44 | 45 | num = multiprocessing.cpu_count() 46 | return num 47 | -------------------------------------------------------------------------------- /reco_utils/common/gpu_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | import sys 5 | import os 6 | import glob 7 | from numba import cuda 8 | from numba.cuda.cudadrv.error import CudaSupportError 9 | 10 | 11 | DEFAULT_CUDA_PATH_LINUX = "/usr/local/cuda/version.txt" 12 | 13 | 14 | def get_number_gpus(): 15 | """Get the number of GPUs in the system. 16 | 17 | Returns: 18 | int: Number of GPUs. 
19 | """ 20 | try: 21 | return len(cuda.gpus) 22 | except CudaSupportError: 23 | return 0 24 | 25 | 26 | def get_gpu_info(): 27 | """Get information of GPUs. 28 | 29 | Returns: 30 | list: List of gpu information dictionary as with device_name, total_memory (in Mb) and free_memory (in Mb). 31 | Returns an empty list if there is no cuda device available. 32 | """ 33 | gpus = [] 34 | try: 35 | for gpu in cuda.gpus: 36 | with gpu: 37 | meminfo = cuda.current_context().get_memory_info() 38 | g = { 39 | "device_name": gpu.name.decode("ASCII"), 40 | "total_memory": meminfo[1] / 1048576, # Mb 41 | "free_memory": meminfo[0] / 1048576, # Mb 42 | } 43 | gpus.append(g) 44 | except CudaSupportError: 45 | pass 46 | 47 | return gpus 48 | 49 | 50 | def clear_memory_all_gpus(): 51 | """Clear memory of all GPUs.""" 52 | try: 53 | for gpu in cuda.gpus: 54 | with gpu: 55 | cuda.current_context().deallocations.clear() 56 | except CudaSupportError: 57 | print("No CUDA available") 58 | 59 | 60 | def get_cuda_version(unix_path=DEFAULT_CUDA_PATH_LINUX): 61 | """Get CUDA version. 62 | 63 | Args: 64 | unix_path (str): Path to CUDA version file in Linux/Mac. 65 | 66 | Returns: 67 | str: Version of the library. 68 | """ 69 | if sys.platform == "win32": 70 | raise NotImplementedError("Implement this!") 71 | elif sys.platform in ["linux", "darwin"]: 72 | if os.path.isfile(unix_path): 73 | with open(unix_path, "r") as f: 74 | data = f.read().replace("\n", "") 75 | return data 76 | else: 77 | return "No CUDA in this machine" 78 | else: 79 | raise ValueError("Not in Windows, Linux or Mac") 80 | 81 | 82 | def get_cudnn_version(): 83 | """Get the CuDNN version. 84 | 85 | Returns: 86 | str: Version of the library. 87 | 88 | """ 89 | 90 | def find_cudnn_in_headers(candidates): 91 | for c in candidates: 92 | file = glob.glob(c) 93 | if file: 94 | break 95 | if file: 96 | with open(file[0], "r") as f: 97 | version = "" 98 | for line in f: 99 | if "#define CUDNN_MAJOR" in line: 100 | version = line.split()[-1] 101 | if "#define CUDNN_MINOR" in line: 102 | version += "." + line.split()[-1] 103 | if "#define CUDNN_PATCHLEVEL" in line: 104 | version += "." + line.split()[-1] 105 | if version: 106 | return version 107 | else: 108 | return "Cannot find CUDNN version" 109 | else: 110 | return "No CUDNN in this machine" 111 | 112 | if sys.platform == "win32": 113 | candidates = [ 114 | "C:\\NVIDIA\\cuda\\include\\cudnn.h", 115 | "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v*\\include\\cudnn.h", 116 | ] 117 | elif sys.platform == "linux": 118 | candidates = [ 119 | "/usr/include/x86_64-linux-gnu/cudnn_v*.h", 120 | "/usr/local/cuda/include/cudnn.h", 121 | "/usr/include/cudnn.h", 122 | ] 123 | elif sys.platform == "darwin": 124 | candidates = ["/usr/local/cuda/include/cudnn.h", "/usr/include/cudnn.h"] 125 | else: 126 | raise ValueError("Not in Windows, Linux or Mac") 127 | return find_cudnn_in_headers(candidates) 128 | -------------------------------------------------------------------------------- /reco_utils/common/notebook_memory_management.py: -------------------------------------------------------------------------------- 1 | # Original code: https://raw.githubusercontent.com/miguelgfierro/codebase/master/python/system/notebook_memory_management.py 2 | # 3 | # Profile memory usage envelope of IPython commands and report interactively. 
4 | # Usage (inside a python notebook): 5 | # from notebook_memory_management import start_watching_memory, stop_watching_memory 6 | # To start profile: 7 | # start_watching_memory() 8 | # To stop profile: 9 | # stop_watching_memory() 10 | # 11 | # Based on: https://github.com/ianozsvald/ipython_memory_usage 12 | # 13 | 14 | from __future__ import division # 1/2 == 0.5, as in Py3 15 | from __future__ import absolute_import # avoid hiding global modules with locals 16 | from __future__ import print_function # force use of print("hello") 17 | from __future__ import ( 18 | unicode_literals, 19 | ) # force unadorned strings "" to be Unicode without prepending u"" 20 | import time 21 | import memory_profiler 22 | from IPython import get_ipython 23 | import psutil 24 | import warnings 25 | 26 | 27 | # keep a global accounting for the last known memory usage 28 | # which is the reference point for the memory delta calculation 29 | previous_call_memory_usage = memory_profiler.memory_usage()[0] 30 | t1 = time.time() # will be set to current time later 31 | keep_watching = True 32 | watching_memory = True 33 | try: 34 | input_cells = get_ipython().user_ns["In"] 35 | except: 36 | warnings.warn("Not running on notebook") 37 | 38 | 39 | def start_watching_memory(): 40 | """Register memory profiling tools to IPython instance.""" 41 | global watching_memory 42 | watching_memory = True 43 | ip = get_ipython() 44 | ip.events.register("post_run_cell", watch_memory) 45 | ip.events.register("pre_run_cell", pre_run_cell) 46 | 47 | 48 | def stop_watching_memory(): 49 | """Unregister memory profiling tools from IPython instance.""" 50 | global watching_memory 51 | watching_memory = False 52 | ip = get_ipython() 53 | try: 54 | ip.events.unregister("post_run_cell", watch_memory) 55 | except ValueError: 56 | print("ERROR: problem when unregistering") 57 | pass 58 | try: 59 | ip.events.unregister("pre_run_cell", pre_run_cell) 60 | except ValueError: 61 | print("ERROR: problem when unregistering") 62 | pass 63 | 64 | 65 | def watch_memory(): 66 | # bring in the global memory usage value from the previous iteration 67 | global previous_call_memory_usage, keep_watching, watching_memory, input_cells 68 | new_memory_usage = memory_profiler.memory_usage()[0] 69 | memory_delta = new_memory_usage - previous_call_memory_usage 70 | keep_watching = False 71 | total_memory = psutil.virtual_memory()[0] / 1024 / 1024 # in Mb 72 | # calculate time delta using global t1 (from the pre-run event) and current time 73 | time_delta_secs = time.time() - t1 74 | num_commands = len(input_cells) - 1 75 | cmd = "In [{}]".format(num_commands) 76 | # convert the results into a pretty string 77 | output_template = ( 78 | "{cmd} used {memory_delta:0.4f} Mb RAM in " 79 | "{time_delta:0.2f}s, total RAM usage " 80 | "{memory_usage:0.2f} Mb, total RAM " 81 | "memory {total_memory:0.2f} Mb" 82 | ) 83 | output = output_template.format( 84 | time_delta=time_delta_secs, 85 | cmd=cmd, 86 | memory_delta=memory_delta, 87 | memory_usage=new_memory_usage, 88 | total_memory=total_memory, 89 | ) 90 | if watching_memory: 91 | print(str(output)) 92 | previous_call_memory_usage = new_memory_usage 93 | 94 | 95 | def pre_run_cell(): 96 | """Capture current time before we execute the current command""" 97 | global t1 98 | t1 = time.time() 99 | -------------------------------------------------------------------------------- /reco_utils/common/notebook_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 
Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | import os 5 | 6 | 7 | def is_jupyter(): 8 | """Check if the module is running on Jupyter notebook/console. 9 | 10 | Returns: 11 | bool: True if the module is running on Jupyter notebook or Jupyter console, 12 | False otherwise. 13 | """ 14 | try: 15 | shell_name = get_ipython().__class__.__name__ 16 | if shell_name == "ZMQInteractiveShell": 17 | return True 18 | else: 19 | return False 20 | except NameError: 21 | return False 22 | 23 | 24 | def is_databricks(): 25 | """Check if the module is running on Databricks. 26 | 27 | Returns: 28 | bool: True if the module is running on Databricks notebook, 29 | False otherwise. 30 | """ 31 | try: 32 | if os.path.realpath(".") == "/databricks/driver": 33 | return True 34 | else: 35 | return False 36 | except NameError: 37 | return False 38 | -------------------------------------------------------------------------------- /reco_utils/common/plot.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | 3 | 4 | def line_graph( 5 | values, 6 | labels, 7 | x_guides=None, 8 | x_name=None, 9 | y_name=None, 10 | x_min_max=None, 11 | y_min_max=None, 12 | legend_loc=None, 13 | subplot=None, 14 | plot_size=(5, 5), 15 | ): 16 | """Plot line graph(s). 17 | 18 | Args: 19 | values (list(list(float or tuple)) or list(float or tuple): List of graphs or a graph to plot 20 | E.g. a graph = list(y) or list((y,x)) 21 | labels (list(str) or str): List of labels or a label for graph. 22 | If labels is a string, this function assumes the values is a single graph. 23 | x_guides (list(int)): List of guidelines (a vertical dotted line) 24 | x_name (str): x axis label 25 | y_name (str): y axis label 26 | x_min_max (list or tuple): Min and max value of the x axis 27 | y_min_max (list or tuple): Min and max value of the y axis 28 | legend_loc (str): legend location 29 | subplot (list or tuple): `matplotlib.pyplot.subplot` format. E.g. to draw 1 x 2 subplot, 30 | pass `(1,2,1)` for the first subplot and `(1,2,2)` for the second subplot. 
31 | plot_size (list or tuple): Plot size (width, height) 32 | """ 33 | if subplot: 34 | # Setup figure only once 35 | if subplot[2] == 1: 36 | if plot_size: 37 | plt.figure( 38 | figsize=( 39 | plot_size[0] 40 | * subplot[1], # fig width = plot width * num columns 41 | plot_size[1] 42 | * subplot[0], # fig height = plot height * num rows 43 | ) 44 | ) 45 | plt.subplots_adjust(wspace=0.5) 46 | plt.subplot(*subplot) 47 | else: 48 | if plot_size: 49 | plt.figure(figsize=plot_size) 50 | 51 | if isinstance(labels, str): 52 | if isinstance(values[0], (int, float)): 53 | y, x = values, range(len(values)) 54 | else: 55 | y, x = zip(*values) 56 | plt.plot(x, y, label=labels, lw=1) 57 | else: 58 | assert len(values) == len(labels) 59 | for i, v in enumerate(values): 60 | if isinstance(v[0], (int, float)): 61 | y, x = v, range(len(v)) 62 | else: 63 | y, x = zip(*v) 64 | plt.plot(x, y, label=labels[i], lw=1) 65 | 66 | if x_guides: 67 | for x in x_guides: 68 | plt.axvline(x=x, color="gray", lw=1, linestyle="--") 69 | 70 | if x_name: 71 | plt.xlabel(x_name) 72 | if y_name: 73 | plt.ylabel(y_name) 74 | if x_min_max: 75 | plt.xlim(*x_min_max) 76 | if y_min_max: 77 | plt.ylim(*y_min_max) 78 | if legend_loc: 79 | plt.legend(loc=legend_loc) 80 | -------------------------------------------------------------------------------- /reco_utils/common/python_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | import logging 5 | 6 | import numpy as np 7 | from scipy import sparse 8 | 9 | 10 | logger = logging.getLogger() 11 | 12 | 13 | def exponential_decay(value, max_val, half_life): 14 | """Compute decay factor for a given value based on an exponential decay. 15 | 16 | Values greater than `max_val` will be set to 1. 17 | 18 | Args: 19 | value (numeric): value to calculate decay factor 20 | max_val (numeric): value at which decay factor will be 1 21 | half_life (numeric): value at which decay factor will be 0.5 22 | 23 | Returns: 24 | float: decay factor 25 | """ 26 | return np.minimum(1.0, np.power(0.5, (max_val - value) / half_life)) 27 | 28 | 29 | def jaccard(cooccurrence): 30 | """Helper method to calculate the Jaccard similarity of a matrix of co-occurrences. 31 | 32 | Args: 33 | cooccurrence (np.array): the symmetric matrix of co-occurrences of items. 34 | 35 | Returns: 36 | np.array: The matrix of Jaccard similarities between any two items. 37 | """ 38 | 39 | diag = cooccurrence.diagonal() 40 | diag_rows = np.expand_dims(diag, axis=0) 41 | diag_cols = np.expand_dims(diag, axis=1) 42 | 43 | with np.errstate(invalid="ignore", divide="ignore"): 44 | result = cooccurrence / (diag_rows + diag_cols - cooccurrence) 45 | 46 | return np.array(result) 47 | 48 | 49 | def lift(cooccurrence): 50 | """Helper method to calculate the Lift of a matrix of co-occurrences. 51 | 52 | Args: 53 | cooccurrence (np.array): the symmetric matrix of co-occurrences of items. 54 | 55 | Returns: 56 | np.array: The matrix of Lifts between any two items. 
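    Example (illustrative sketch; ``np`` is the ``numpy`` import at the top of this module)::

        c = np.array([[2, 1], [1, 3]])
        lift(c)
        # each entry is c[i, j] / (c[i, i] * c[j, j]), i.e. roughly
        # array([[0.5   , 0.1667],
        #        [0.1667, 0.3333]])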
57 | """ 58 | 59 | diag = cooccurrence.diagonal() 60 | diag_rows = np.expand_dims(diag, axis=0) 61 | diag_cols = np.expand_dims(diag, axis=1) 62 | 63 | with np.errstate(invalid="ignore", divide="ignore"): 64 | result = cooccurrence / (diag_rows * diag_cols) 65 | 66 | return np.array(result) 67 | 68 | 69 | def get_top_k_scored_items(scores, top_k, sort_top_k=False): 70 | """Extract top K items from a matrix of scores for each user-item pair, optionally sort results per user. 71 | 72 | Args: 73 | scores (np.array): score matrix (users x items). 74 | top_k (int): number of top items to recommend. 75 | sort_top_k (bool): flag to sort top k results. 76 | 77 | Returns: 78 | np.array, np.array: indices into score matrix for each users top items, scores corresponding to top items. 79 | """ 80 | 81 | # ensure we're working with a dense ndarray 82 | if isinstance(scores, sparse.spmatrix): 83 | scores = scores.todense() 84 | 85 | if scores.shape[1] < top_k: 86 | logger.warning( 87 | "Number of items is less than top_k, limiting top_k to number of items" 88 | ) 89 | k = min(top_k, scores.shape[1]) 90 | 91 | test_user_idx = np.arange(scores.shape[0])[:, None] 92 | 93 | # get top K items and scores 94 | # this determines the un-ordered top-k item indices for each user 95 | top_items = np.argpartition(scores, -k, axis=1)[:, -k:] 96 | top_scores = scores[test_user_idx, top_items] 97 | 98 | if sort_top_k: 99 | sort_ind = np.argsort(-top_scores) 100 | top_items = top_items[test_user_idx, sort_ind] 101 | top_scores = top_scores[test_user_idx, sort_ind] 102 | 103 | return np.array(top_items), np.array(top_scores) 104 | 105 | 106 | def binarize(a, threshold): 107 | """Binarize the values. 108 | 109 | Args: 110 | a (np.ndarray): Input array that needs to be binarized. 111 | threshold (float): Threshold below which all values are set to 0, else 1. 112 | """ 113 | return np.where( 114 | a > threshold, 115 | 1.0, 116 | 0.0 117 | ) 118 | 119 | 120 | def rescale(data, new_min=0, new_max=1, data_min=None, data_max=None): 121 | """ 122 | Rescale/normalize the data to be within the range [new_min, new_max] 123 | If data_min and data_max are explicitly provided, they will be used 124 | as the old min/max values instead of taken from the data. 125 | 126 | Note: this is same as the scipy.MinMaxScaler with the exception that we can override 127 | the min/max of the old scale. 128 | 129 | Args: 130 | data (np.array): 1d scores vector or 2d score matrix (users x items). 131 | new_min (int|float): The minimum of the newly scaled data. 132 | new_max (int|float): The maximum of the newly scaled data. 133 | data_min (None|number): The minimum of the passed data [if omitted it will be inferred]. 134 | data_max (None|number): The maximum of the passed data [if omitted it will be inferred]. 135 | 136 | Returns: 137 | np.array: The newly scaled/normalized data. 138 | """ 139 | data_min = data.min() if data_min is None else data_min 140 | data_max = data.max() if data_max is None else data_max 141 | return (data - data_min) / (data_max - data_min) * (new_max - new_min) + new_min 142 | -------------------------------------------------------------------------------- /reco_utils/common/spark_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 
3 | 4 | import os 5 | import sys 6 | 7 | 8 | try: 9 | from pyspark.sql import SparkSession 10 | except ImportError: 11 | pass # skip this import if we are in pure python environment 12 | 13 | 14 | def start_or_get_spark( 15 | app_name="Sample", 16 | url="local[*]", 17 | memory="10g", 18 | config=None, 19 | packages=None, 20 | jars=None, 21 | repository=None, 22 | ): 23 | """Start Spark if not started 24 | 25 | Args: 26 | app_name (str): Set name of the application 27 | url (str): URL for spark master 28 | memory (str): Size of memory for spark driver 29 | config (dict): dictionary of configuration options 30 | packages (list): list of packages to install 31 | jars (list): list of jar files to add 32 | repository (str): The maven repository 33 | 34 | Returns: 35 | obj: Spark context. 36 | """ 37 | 38 | submit_args = "" 39 | if packages is not None: 40 | submit_args = "--packages {} ".format(",".join(packages)) 41 | if jars is not None: 42 | submit_args += "--jars {} ".format(",".join(jars)) 43 | if repository is not None: 44 | submit_args += "--repositories {}".format(repository) 45 | if submit_args: 46 | os.environ["PYSPARK_SUBMIT_ARGS"] = "{} pyspark-shell".format(submit_args) 47 | 48 | spark_opts = [ 49 | 'SparkSession.builder.appName("{}")'.format(app_name), 50 | 'master("{}")'.format(url), 51 | ] 52 | 53 | if config is not None: 54 | for key, raw_value in config.items(): 55 | value = ( 56 | '"{}"'.format(raw_value) if isinstance(raw_value, str) else raw_value 57 | ) 58 | spark_opts.append('config("{key}", {value})'.format(key=key, value=value)) 59 | 60 | if config is None or "spark.driver.memory" not in config: 61 | spark_opts.append('config("spark.driver.memory", "{}")'.format(memory)) 62 | 63 | spark_opts.append("getOrCreate()") 64 | return eval(".".join(spark_opts)) 65 | -------------------------------------------------------------------------------- /reco_utils/common/tf_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | import itertools 5 | import numpy as np 6 | import pandas as pd 7 | import tensorflow as tf 8 | 9 | MODEL_DIR = "model_checkpoints" 10 | 11 | 12 | OPTIMIZERS = dict( 13 | adadelta=tf.train.AdadeltaOptimizer, 14 | adagrad=tf.train.AdagradOptimizer, 15 | adam=tf.train.AdamOptimizer, 16 | ftrl=tf.train.FtrlOptimizer, 17 | momentum=tf.train.MomentumOptimizer, 18 | rmsprop=tf.train.RMSPropOptimizer, 19 | sgd=tf.train.GradientDescentOptimizer, 20 | ) 21 | 22 | 23 | def pandas_input_fn_for_saved_model(df, feat_name_type): 24 | """Pandas input function for TensorFlow SavedModel. 25 | 26 | Args: 27 | df (pd.DataFrame): Data containing features. 28 | feat_name_type (dict): Feature name and type spec. E.g. 
29 | `{'userID': int, 'itemID': int, 'rating': float}` 30 | 31 | Returns: 32 | func: Input function 33 | 34 | """ 35 | for feat_type in feat_name_type.values(): 36 | assert feat_type in (int, float, list) 37 | 38 | def input_fn(): 39 | examples = [None] * len(df) 40 | for i, sample in df.iterrows(): 41 | ex = tf.train.Example() 42 | for feat_name, feat_type in feat_name_type.items(): 43 | feat = ex.features.feature[feat_name] 44 | if feat_type == int: 45 | feat.int64_list.value.extend([sample[feat_name]]) 46 | elif feat_type == float: 47 | feat.float_list.value.extend([sample[feat_name]]) 48 | elif feat_type == list: 49 | feat.float_list.value.extend(sample[feat_name]) 50 | examples[i] = ex.SerializeToString() 51 | return {"inputs": tf.constant(examples)} 52 | 53 | return input_fn 54 | 55 | 56 | def pandas_input_fn( 57 | df, y_col=None, batch_size=128, num_epochs=1, shuffle=False, seed=None 58 | ): 59 | """Pandas input function for TensorFlow high-level API Estimator. 60 | This function returns a `tf.data.Dataset` function. 61 | 62 | .. note:: 63 | 64 | `tf.estimator.inputs.pandas_input_fn` cannot handle array/list column properly. 65 | For more information, see https://www.tensorflow.org/api_docs/python/tf/estimator/inputs/numpy_input_fn 66 | 67 | Args: 68 | df (pd.DataFrame): Data containing features. 69 | y_col (str): Label column name if df has it. 70 | batch_size (int): Batch size for the input function. 71 | num_epochs (int): Number of epochs to iterate over data. If None will run forever. 72 | shuffle (bool): If True, shuffles the data queue. 73 | seed (int): Random seed for shuffle. 74 | 75 | Returns: 76 | tf.data.Dataset function 77 | """ 78 | 79 | X_df = df.copy() 80 | if y_col is not None: 81 | y = X_df.pop(y_col).values 82 | else: 83 | y = None 84 | 85 | X = {} 86 | for col in X_df.columns: 87 | values = X_df[col].values 88 | if isinstance(values[0], (list, np.ndarray)): 89 | values = np.array([l for l in values], dtype=np.float32) 90 | X[col] = values 91 | 92 | return lambda: _dataset( 93 | x=X, 94 | y=y, 95 | batch_size=batch_size, 96 | num_epochs=num_epochs, 97 | shuffle=shuffle, 98 | seed=seed, 99 | ) 100 | 101 | 102 | def _dataset(x, y=None, batch_size=128, num_epochs=1, shuffle=False, seed=None): 103 | if y is None: 104 | dataset = tf.data.Dataset.from_tensor_slices(x) 105 | else: 106 | dataset = tf.data.Dataset.from_tensor_slices((x, y)) 107 | 108 | if shuffle: 109 | dataset = dataset.shuffle( 110 | 1000, seed=seed, reshuffle_each_iteration=True # buffer size = 1000 111 | ) 112 | elif seed is not None: 113 | import warnings 114 | 115 | warnings.warn("Seed was set but `shuffle=False`. Seed will be ignored.") 116 | 117 | return dataset.repeat(num_epochs).batch(batch_size) 118 | 119 | 120 | def build_optimizer(name, lr=0.001, **kwargs): 121 | """Get an optimizer for TensorFlow high-level API Estimator. 122 | 123 | Args: 124 | name (str): Optimizer name. 
Note, to use 'Momentum', should specify 125 | lr (float): Learning rate 126 | kwargs: Optimizer arguments as key-value pairs 127 | 128 | Returns: 129 | tf.train.Optimizer 130 | """ 131 | name = name.lower() 132 | 133 | try: 134 | optimizer_class = OPTIMIZERS[name] 135 | except KeyError: 136 | raise KeyError("Optimizer name should be one of: {}".format(list(OPTIMIZERS))) 137 | 138 | # Set parameters 139 | params = {} 140 | if name == "ftrl": 141 | params["l1_regularization_strength"] = kwargs.get( 142 | "l1_regularization_strength", 0.0 143 | ) 144 | params["l2_regularization_strength"] = kwargs.get( 145 | "l2_regularization_strength", 0.0 146 | ) 147 | elif name == "momentum" or name == "rmsprop": 148 | params["momentum"] = kwargs.get("momentum", 0.0) 149 | 150 | return optimizer_class(learning_rate=lr, **params) 151 | 152 | 153 | def export_model(model, train_input_fn, eval_input_fn, tf_feat_cols, base_dir): 154 | """Export TensorFlow estimator (model). 155 | 156 | Args: 157 | model (tf.estimator.Estimator): Model to export. 158 | train_input_fn (function): Training input function to create data receiver spec. 159 | eval_input_fn (function): Evaluation input function to create data receiver spec. 160 | tf_feat_cols (list(tf.feature_column)): Feature columns. 161 | base_dir (str): Base directory to export the model. 162 | 163 | Returns: 164 | str: Exported model path 165 | """ 166 | tf.logging.set_verbosity(tf.logging.ERROR) 167 | train_rcvr_fn = tf.contrib.estimator.build_supervised_input_receiver_fn_from_input_fn( 168 | train_input_fn 169 | ) 170 | eval_rcvr_fn = tf.contrib.estimator.build_supervised_input_receiver_fn_from_input_fn( 171 | eval_input_fn 172 | ) 173 | serve_rcvr_fn = tf.estimator.export.build_parsing_serving_input_receiver_fn( 174 | tf.feature_column.make_parse_example_spec(tf_feat_cols) 175 | ) 176 | rcvr_fn_map = { 177 | tf.estimator.ModeKeys.TRAIN: train_rcvr_fn, 178 | tf.estimator.ModeKeys.EVAL: eval_rcvr_fn, 179 | tf.estimator.ModeKeys.PREDICT: serve_rcvr_fn, 180 | } 181 | exported_path = tf.contrib.estimator.export_all_saved_models( 182 | model, export_dir_base=base_dir, input_receiver_fn_map=rcvr_fn_map 183 | ) 184 | 185 | return exported_path.decode("utf-8") 186 | 187 | 188 | def evaluation_log_hook( 189 | estimator, 190 | logger, 191 | true_df, 192 | y_col, 193 | eval_df, 194 | every_n_iter=10000, 195 | model_dir=None, 196 | batch_size=256, 197 | eval_fns=None, 198 | **eval_kwargs 199 | ): 200 | """Evaluation log hook for TensorFlow high-level API Estimator. 201 | 202 | .. note:: 203 | 204 | Note, TensorFlow Estimator model uses the last checkpoint weights for evaluation or prediction. 205 | In order to get the most up-to-date evaluation results while training, 206 | set model's `save_checkpoints_steps` to be equal or greater than hook's `every_n_iter`. 207 | 208 | Args: 209 | estimator (tf.estimator.Estimator): Model to evaluate. 210 | logger (Logger): Custom logger to log the results. 211 | E.g., define a subclass of Logger for AzureML logging. 212 | true_df (pd.DataFrame): Ground-truth data. 213 | y_col (str): Label column name in true_df 214 | eval_df (pd.DataFrame): Evaluation data without label column. 215 | every_n_iter (int): Evaluation frequency (steps). 216 | model_dir (str): Model directory to save the summaries to. If None, does not record. 217 | batch_size (int): Number of samples fed into the model at a time. 218 | Note, the batch size doesn't affect on evaluation results. 
219 | eval_fns (iterable of functions): List of evaluation functions that have signature of 220 | (true_df, prediction_df, **eval_kwargs)->(float). If None, loss is calculated on true_df. 221 | **eval_kwargs: Evaluation function's keyword arguments. 222 | Note, prediction column name should be 'prediction' 223 | 224 | Returns: 225 | tf.train.SessionRunHook: Session run hook to evaluate the model while training. 226 | """ 227 | 228 | return _TrainLogHook( 229 | estimator, 230 | logger, 231 | true_df, 232 | y_col, 233 | eval_df, 234 | every_n_iter, 235 | model_dir, 236 | batch_size, 237 | eval_fns, 238 | **eval_kwargs 239 | ) 240 | 241 | 242 | class _TrainLogHook(tf.train.SessionRunHook): 243 | def __init__( 244 | self, 245 | estimator, 246 | logger, 247 | true_df, 248 | y_col, 249 | eval_df, 250 | every_n_iter=10000, 251 | model_dir=None, 252 | batch_size=256, 253 | eval_fns=None, 254 | **eval_kwargs 255 | ): 256 | """Evaluation log hook class""" 257 | self.model = estimator 258 | self.logger = logger 259 | self.true_df = true_df 260 | self.y_col = y_col 261 | self.eval_df = eval_df 262 | self.every_n_iter = every_n_iter 263 | self.model_dir = model_dir 264 | self.batch_size = batch_size 265 | self.eval_fns = eval_fns 266 | self.eval_kwargs = eval_kwargs 267 | 268 | self.summary_writer = None 269 | self.global_step_tensor = None 270 | self.step = 0 271 | 272 | def begin(self): 273 | if self.model_dir is not None: 274 | self.summary_writer = tf.summary.FileWriterCache.get(self.model_dir) 275 | self.global_step_tensor = tf.train.get_or_create_global_step() 276 | else: 277 | self.step = 0 278 | 279 | def before_run(self, run_context): 280 | if self.global_step_tensor is not None: 281 | requests = {"global_step": self.global_step_tensor} 282 | return tf.train.SessionRunArgs(requests) 283 | else: 284 | return None 285 | 286 | def after_run(self, run_context, run_values): 287 | if self.global_step_tensor is not None: 288 | self.step = run_values.results["global_step"] 289 | else: 290 | self.step += 1 291 | 292 | if self.step % self.every_n_iter == 0: 293 | _prev_log_level = tf.logging.get_verbosity() 294 | tf.logging.set_verbosity(tf.logging.ERROR) 295 | 296 | if self.eval_fns is None: 297 | result = self.model.evaluate( 298 | input_fn=pandas_input_fn( 299 | df=self.true_df, y_col=self.y_col, batch_size=self.batch_size 300 | ) 301 | )["average_loss"] 302 | self._log("validation_loss", result) 303 | else: 304 | predictions = list( 305 | itertools.islice( 306 | self.model.predict( 307 | input_fn=pandas_input_fn( 308 | df=self.eval_df, batch_size=self.batch_size 309 | ) 310 | ), 311 | len(self.eval_df), 312 | ) 313 | ) 314 | prediction_df = self.eval_df.copy() 315 | prediction_df["prediction"] = [p["predictions"][0] for p in predictions] 316 | for fn in self.eval_fns: 317 | result = fn(self.true_df, prediction_df, **self.eval_kwargs) 318 | self._log(fn.__name__, result) 319 | 320 | tf.logging.set_verbosity(_prev_log_level) 321 | 322 | def end(self, session): 323 | if self.summary_writer is not None: 324 | self.summary_writer.flush() 325 | 326 | def _log(self, tag, value): 327 | self.logger.log(tag, value) 328 | if self.summary_writer is not None: 329 | summary = tf.Summary(value=[tf.Summary.Value(tag=tag, simple_value=value)]) 330 | self.summary_writer.add_summary(summary, self.step) 331 | 332 | 333 | class MetricsLogger: 334 | """Metrics logger""" 335 | 336 | def __init__(self): 337 | """Initializer""" 338 | self._log = {} 339 | 340 | def log(self, metric, value): 341 | """Log metrics. 
Each metric's log will be stored in the corresponding list. 342 | 343 | Args: 344 | metric (str): Metric name. 345 | value (float): Value. 346 | """ 347 | if metric not in self._log: 348 | self._log[metric] = [] 349 | self._log[metric].append(value) 350 | 351 | def get_log(self): 352 | """Getter 353 | 354 | Returns: 355 | dict: Log metrics. 356 | """ 357 | return self._log 358 | -------------------------------------------------------------------------------- /reco_utils/common/timer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | from timeit import default_timer 5 | from datetime import timedelta 6 | 7 | 8 | class Timer(object): 9 | """Timer class. 10 | 11 | `Original code `_. 12 | 13 | Examples: 14 | >>> import time 15 | >>> t = Timer() 16 | >>> t.start() 17 | >>> time.sleep(1) 18 | >>> t.stop() 19 | >>> t.interval < 1 20 | True 21 | >>> with Timer() as t: 22 | ... time.sleep(1) 23 | >>> t.interval < 1 24 | True 25 | >>> "Time elapsed {}".format(t) #doctest: +ELLIPSIS 26 | 'Time elapsed 1...' 27 | """ 28 | 29 | def __init__(self): 30 | self._timer = default_timer 31 | self._interval = 0 32 | self.running = False 33 | 34 | def __enter__(self): 35 | self.start() 36 | return self 37 | 38 | def __exit__(self, *args): 39 | self.stop() 40 | 41 | def __str__(self): 42 | return "{:0.4f}".format(self.interval) 43 | 44 | def start(self): 45 | """Start the timer.""" 46 | self.init = self._timer() 47 | self.running = True 48 | 49 | def stop(self): 50 | """Stop the timer. Calculate the interval in seconds.""" 51 | self.end = self._timer() 52 | try: 53 | self._interval = self.end - self.init 54 | self.running = False 55 | except AttributeError: 56 | raise ValueError( 57 | "Timer has not been initialized: use start() or the contextual form with Timer() as t:" 58 | ) 59 | 60 | @property 61 | def interval(self): 62 | """Get time interval in seconds. 63 | 64 | Returns: 65 | float: Seconds. 66 | """ 67 | if self.running: 68 | raise ValueError("Timer has not been stopped, please use stop().") 69 | else: 70 | return self._interval 71 | -------------------------------------------------------------------------------- /reco_utils/dataset/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tsinghua-fib-lab/CLSR/92c6c7e069374fa2d7217f3a0987f64ce47e7cad/reco_utils/dataset/__init__.py -------------------------------------------------------------------------------- /reco_utils/dataset/blob_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | 5 | from io import StringIO 6 | import pandas as pd 7 | 8 | 9 | def load_csv_from_blob(blob_service, container_name, blob_name, **kwargs): 10 | """ Load a Pandas DataFrame from CSV in Azure Blob Storage. 11 | 12 | Args: 13 | blob_service (azure.storage.blob.BlockBlobService): Azure BlockBlobService for dataset. 14 | container_name (str): Azure storage container name. 15 | blob_name (str): Name of the blob located in the container. 16 | 17 | Returns: 18 | df (pd.DataFrame): Loaded dataframe. 
19 | """ 20 | # Read blob into memory 21 | blob = blob_service.get_blob_to_text(container_name, blob_name) 22 | 23 | # Load into dataframe 24 | df = pd.read_csv(StringIO(blob.content), **kwargs) 25 | 26 | return df 27 | -------------------------------------------------------------------------------- /reco_utils/dataset/covid_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | from reco_utils.dataset import blob_utils 5 | from azure.storage.blob import BlockBlobService 6 | from io import StringIO 7 | import pandas as pd 8 | import numpy as np 9 | import json 10 | 11 | 12 | def load_pandas_df( 13 | azure_storage_account_name="azureopendatastorage", 14 | azure_storage_sas_token="sv=2019-02-02&ss=bfqt&srt=sco&sp=rlcup&se=2025-04-14T00:21:16Z&st=2020-04-13T16:21:16Z&spr=https&sig=JgwLYbdGruHxRYTpr5dxfJqobKbhGap8WUtKFadcivQ%3D", 15 | container_name="covid19temp", 16 | metadata_filename="metadata.csv", 17 | ): 18 | """ Loads the Azure Open Research COVID-19 dataset as a pd.DataFrame. 19 | 20 | The Azure COVID-19 Open Research Dataset may be found at https://azure.microsoft.com/en-us/services/open-datasets/catalog/covid-19-open-research/ 21 | 22 | Args: 23 | azure_storage_account_name (str): Azure storage account name. 24 | azure_storage_sas_token (str): Azure storage SaS token. 25 | container_name (str): Azure storage container name. 26 | metadata_filename (str): Name of file containing top-level metadata for the dataset. 27 | 28 | Returns: 29 | metadata (pd.DataFrame): Metadata dataframe. 30 | blob_service (azure.storage.blob.BlockBlobService): Azure BlockBlobService for dataset. 31 | """ 32 | 33 | # Get metadata (may take around 1-2 min) 34 | blob_service = BlockBlobService( 35 | account_name=azure_storage_account_name, sas_token=azure_storage_sas_token 36 | ) 37 | metadata = blob_utils.load_csv_from_blob( 38 | blob_service, container_name, metadata_filename 39 | ) 40 | 41 | return metadata, blob_service 42 | 43 | 44 | def remove_duplicates(df, cols): 45 | """ Remove duplicated entries. 46 | 47 | Args: 48 | df (pd.DataFrame): Pandas dataframe. 49 | cols (list of str): Name of columns in which to look for duplicates. 50 | 51 | Returns: 52 | df (pd.DataFrame): Pandas dataframe with duplicate rows dropped. 53 | 54 | """ 55 | for col in cols: 56 | # Reset index 57 | df = df.reset_index(drop=True) 58 | 59 | # Find where the identifier variable is duplicated 60 | dup_rows = np.where(df.duplicated([col]) == True)[0] 61 | 62 | # Drop duplicated rows 63 | df = df.drop(dup_rows) 64 | 65 | return df 66 | 67 | 68 | def remove_nan(df, cols): 69 | """ Remove rows with NaN values in specified column. 70 | 71 | Args: 72 | df (pd.DataFrame): Pandas dataframe. 73 | cols (list of str): Name of columns in which to look for NaN. 74 | 75 | Returns: 76 | df (pd.DataFrame): Pandas dataframe with invalid rows dropped. 77 | 78 | """ 79 | for col in cols: 80 | # Convert any empty string cells to nan 81 | df[col].replace("", np.nan, inplace=True) 82 | 83 | # Remove NaN rows 84 | df = df[df[col].notna()] 85 | 86 | return df 87 | 88 | 89 | def clean_dataframe(df): 90 | """ Clean up the dataframe. 91 | 92 | Args: 93 | df (pd.DataFrame): Pandas dataframe. 94 | 95 | Returns: 96 | df (pd.DataFrame): Cleaned pandas dataframe. 
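    Example (illustrative sketch; chains the helpers defined in this module)::

        metadata, blob_service = load_pandas_df()
        metadata = clean_dataframe(metadata)
        # optionally filter ``metadata`` on its "license" column before pulling full text
        df_full = get_public_domain_text(metadata, blob_service, "covid19temp")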
97 | """ 98 | 99 | # Remove duplicated rows 100 | cols = ["cord_uid", "doi"] 101 | df = remove_duplicates(df, cols) 102 | 103 | # Remove rows without values in specified columns 104 | cols = ["cord_uid", "doi", "title", "license", "url"] 105 | df = remove_nan(df, cols) 106 | 107 | return df 108 | 109 | 110 | def retrieve_text(entry, blob_service, container_name): 111 | """ Retrieve body text from article of interest. 112 | 113 | Args: 114 | entry (pd.Series): A single row from the dataframe (df.iloc[n]). 115 | blob_service (azure.storage.blob.BlockBlobService): Azure BlockBlobService for dataset. 116 | container_name (str): Azure storage container name. 117 | 118 | Results: 119 | text (str): Full text of the blob as a single string. 120 | """ 121 | 122 | try: 123 | # select based on whether it's pdf or pmc_xml 124 | if entry["has_pdf_parse"] == True: 125 | blob_name = "{0}/pdf_json/{1}.json".format( 126 | entry["full_text_file"], entry["sha"] 127 | ) 128 | else: 129 | if entry["has_pmc_xml_parse"] == True: 130 | blob_name = "{0}/pmc_json/{1}.xml.json".format( 131 | entry["full_text_file"], entry["pmcid"] 132 | ) 133 | else: 134 | print("Neither PDF or PMC_XML data is available for this file") 135 | 136 | # Extract text 137 | data = json.loads( 138 | blob_service.get_blob_to_text( 139 | container_name=container_name, blob_name=blob_name 140 | ).content 141 | ) 142 | text = " ".join([paragraph["text"] for paragraph in data["body_text"]]) 143 | 144 | except: 145 | text = "" 146 | 147 | return text 148 | 149 | 150 | def get_public_domain_text(df, blob_service, container_name): 151 | """ Get all public domain text. 152 | 153 | Args: 154 | df (pd.DataFrame): Metadata dataframe for public domain text. 155 | blob_service (azure.storage.blob.BlockBlobService): Azure BlockBlobService for dataset. 156 | container_name (str): Azure storage container name. 157 | 158 | Returns: 159 | df_full (pd.DataFrame): Dataframe with select metadata and full article text. 160 | """ 161 | # reset index 162 | df = df.reset_index(drop=True) 163 | 164 | # Add in full_text 165 | df["full_text"] = df.apply( 166 | lambda row: retrieve_text(row, blob_service, container_name), axis=1 167 | ) 168 | 169 | # Remove rows with empty full_text 170 | empty_rows = np.where(df["full_text"] == "")[0] 171 | df = df.drop(empty_rows) 172 | 173 | # Only keep columns of interest 174 | df_full = df[ 175 | [ 176 | "cord_uid", 177 | "doi", 178 | "title", 179 | "publish_time", 180 | "authors", 181 | "journal", 182 | "url", 183 | "abstract", 184 | "full_text", 185 | ] 186 | ] 187 | df_full = df_full.reset_index() 188 | 189 | return df_full 190 | -------------------------------------------------------------------------------- /reco_utils/dataset/download_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | import os 5 | import logging 6 | import requests 7 | import math 8 | import zipfile 9 | from contextlib import contextmanager 10 | from tempfile import TemporaryDirectory 11 | from tqdm import tqdm 12 | 13 | log = logging.getLogger(__name__) 14 | 15 | 16 | def maybe_download(url, filename=None, work_directory=".", expected_bytes=None): 17 | """Download a file if it is not already downloaded. 18 | 19 | Args: 20 | filename (str): File name. 21 | work_directory (str): Working directory. 22 | url (str): URL of the file to download. 23 | expected_bytes (int): Expected file size in bytes. 
24 | 25 | Returns: 26 | str: File path of the file downloaded. 27 | """ 28 | if filename is None: 29 | filename = url.split("/")[-1] 30 | os.makedirs(work_directory, exist_ok=True) 31 | filepath = os.path.join(work_directory, filename) 32 | if not os.path.exists(filepath): 33 | 34 | r = requests.get(url, stream=True) 35 | total_size = int(r.headers.get("content-length", 0)) 36 | block_size = 1024 37 | num_iterables = math.ceil(total_size / block_size) 38 | 39 | with open(filepath, "wb") as file: 40 | for data in tqdm( 41 | r.iter_content(block_size), 42 | total=num_iterables, 43 | unit="KB", 44 | unit_scale=True, 45 | ): 46 | file.write(data) 47 | else: 48 | log.info("File {} already downloaded".format(filepath)) 49 | if expected_bytes is not None: 50 | statinfo = os.stat(filepath) 51 | if statinfo.st_size != expected_bytes: 52 | os.remove(filepath) 53 | raise IOError("Failed to verify {}".format(filepath)) 54 | 55 | return filepath 56 | 57 | 58 | @contextmanager 59 | def download_path(path=None): 60 | """Return a path to download data. If `path=None`, then it yields a temporal path that is eventually deleted, 61 | otherwise the real path of the input. 62 | 63 | Args: 64 | path (str): Path to download data. 65 | 66 | Returns: 67 | str: Real path where the data is stored. 68 | 69 | Examples: 70 | >>> with download_path() as path: 71 | >>> ... maybe_download(url="http://example.com/file.zip", work_directory=path) 72 | 73 | """ 74 | if path is None: 75 | tmp_dir = TemporaryDirectory() 76 | try: 77 | yield tmp_dir.name 78 | finally: 79 | tmp_dir.cleanup() 80 | else: 81 | path = os.path.realpath(path) 82 | yield path 83 | 84 | 85 | def unzip_file(zip_src, dst_dir, clean_zip_file=True): 86 | """Unzip a file 87 | 88 | Args: 89 | zip_src (str): Zip file. 90 | dst_dir (str): Destination folder. 91 | clean_zip_file (bool): Whether or not to clean the zip file. 92 | """ 93 | fz = zipfile.ZipFile(zip_src, "r") 94 | for file in fz.namelist(): 95 | fz.extract(file, dst_dir) 96 | if clean_zip_file: 97 | os.remove(zip_src) 98 | -------------------------------------------------------------------------------- /reco_utils/dataset/python_splitters.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | import numpy as np 4 | import pandas as pd 5 | from sklearn.model_selection import train_test_split as sk_split 6 | 7 | from reco_utils.common.constants import ( 8 | DEFAULT_ITEM_COL, 9 | DEFAULT_USER_COL, 10 | DEFAULT_TIMESTAMP_COL, 11 | ) 12 | from reco_utils.dataset.split_utils import ( 13 | process_split_ratio, 14 | min_rating_filter_pandas, 15 | split_pandas_data_with_ratios, 16 | ) 17 | 18 | 19 | def python_random_split(data, ratio=0.75, seed=42): 20 | """Pandas random splitter. 21 | 22 | The splitter randomly splits the input data. 23 | 24 | Args: 25 | data (pd.DataFrame): Pandas DataFrame to be split. 26 | ratio (float or list): Ratio for splitting data. If it is a single float number 27 | it splits data into two halves and the ratio argument indicates the ratio 28 | of training data set; if it is a list of float numbers, the splitter splits 29 | data into several portions corresponding to the split ratios. If a list is 30 | provided and the ratios are not summed to 1, they will be normalized. 31 | seed (int): Seed. 32 | 33 | Returns: 34 | list: Splits of the input data as pd.DataFrame. 
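    Examples (illustrative; a small toy DataFrame with the default column names):
        >>> import pandas as pd
        >>> df = pd.DataFrame({"userID": [1, 1, 2, 2], "itemID": [10, 11, 10, 12]})
        >>> train, test = python_random_split(df, ratio=0.75, seed=42)
        >>> len(train) + len(test) == len(df)
        True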
35 | """ 36 | multi_split, ratio = process_split_ratio(ratio) 37 | 38 | if multi_split: 39 | splits = split_pandas_data_with_ratios(data, ratio, shuffle=True, seed=seed) 40 | splits_new = [x.drop("split_index", axis=1) for x in splits] 41 | 42 | return splits_new 43 | else: 44 | return sk_split(data, test_size=None, train_size=ratio, random_state=seed) 45 | 46 | 47 | def _do_stratification( 48 | data, 49 | ratio=0.75, 50 | min_rating=1, 51 | filter_by="user", 52 | is_random=True, 53 | seed=42, 54 | col_user=DEFAULT_USER_COL, 55 | col_item=DEFAULT_ITEM_COL, 56 | col_timestamp=DEFAULT_TIMESTAMP_COL, 57 | ): 58 | # A few preliminary checks. 59 | if not (filter_by == "user" or filter_by == "item"): 60 | raise ValueError("filter_by should be either 'user' or 'item'.") 61 | 62 | if min_rating < 1: 63 | raise ValueError("min_rating should be integer and larger than or equal to 1.") 64 | 65 | if col_user not in data.columns: 66 | raise ValueError("Schema of data not valid. Missing User Col") 67 | 68 | if col_item not in data.columns: 69 | raise ValueError("Schema of data not valid. Missing Item Col") 70 | 71 | if not is_random: 72 | if col_timestamp not in data.columns: 73 | raise ValueError("Schema of data not valid. Missing Timestamp Col") 74 | 75 | multi_split, ratio = process_split_ratio(ratio) 76 | 77 | split_by_column = col_user if filter_by == "user" else col_item 78 | 79 | ratio = ratio if multi_split else [ratio, 1 - ratio] 80 | 81 | if min_rating > 1: 82 | data = min_rating_filter_pandas( 83 | data, 84 | min_rating=min_rating, 85 | filter_by=filter_by, 86 | col_user=col_user, 87 | col_item=col_item, 88 | ) 89 | 90 | # Split by each group and aggregate splits together. 91 | splits = [] 92 | 93 | # If it is for chronological splitting, the split will be performed in a random way. 94 | df_grouped = ( 95 | data.sort_values(col_timestamp).groupby(split_by_column) 96 | if is_random is False 97 | else data.groupby(split_by_column) 98 | ) 99 | 100 | for name, group in df_grouped: 101 | group_splits = split_pandas_data_with_ratios( 102 | df_grouped.get_group(name), ratio, shuffle=is_random, seed=seed 103 | ) 104 | 105 | # Concatenate the list of split dataframes. 106 | concat_group_splits = pd.concat(group_splits) 107 | 108 | splits.append(concat_group_splits) 109 | 110 | # Concatenate splits for all the groups together. 111 | splits_all = pd.concat(splits) 112 | 113 | # Take split by split_index 114 | splits_list = [ 115 | splits_all[splits_all["split_index"] == x].drop("split_index", axis=1) 116 | for x in range(len(ratio)) 117 | ] 118 | 119 | return splits_list 120 | 121 | 122 | def python_chrono_split( 123 | data, 124 | ratio=0.75, 125 | min_rating=1, 126 | filter_by="user", 127 | col_user=DEFAULT_USER_COL, 128 | col_item=DEFAULT_ITEM_COL, 129 | col_timestamp=DEFAULT_TIMESTAMP_COL, 130 | ): 131 | """Pandas chronological splitter. 132 | 133 | This function splits data in a chronological manner. That is, for each user / item, the 134 | split function takes proportions of ratings which is specified by the split ratio(s). 135 | The split is stratified. 136 | 137 | Args: 138 | data (pd.DataFrame): Pandas DataFrame to be split. 139 | ratio (float or list): Ratio for splitting data. If it is a single float number 140 | it splits data into two halves and the ratio argument indicates the ratio of 141 | training data set; if it is a list of float numbers, the splitter splits 142 | data into several portions corresponding to the split ratios. 
If a list is 143 | provided and the ratios are not summed to 1, they will be normalized. 144 | seed (int): Seed. 145 | min_rating (int): minimum number of ratings for user or item. 146 | filter_by (str): either "user" or "item", depending on which of the two is to 147 | filter with min_rating. 148 | col_user (str): column name of user IDs. 149 | col_item (str): column name of item IDs. 150 | col_timestamp (str): column name of timestamps. 151 | 152 | Returns: 153 | list: Splits of the input data as pd.DataFrame. 154 | """ 155 | return _do_stratification( 156 | data, 157 | ratio=ratio, 158 | min_rating=min_rating, 159 | filter_by=filter_by, 160 | col_user=col_user, 161 | col_item=col_item, 162 | col_timestamp=col_timestamp, 163 | is_random=False, 164 | ) 165 | 166 | 167 | def python_stratified_split( 168 | data, 169 | ratio=0.75, 170 | min_rating=1, 171 | filter_by="user", 172 | col_user=DEFAULT_USER_COL, 173 | col_item=DEFAULT_ITEM_COL, 174 | seed=42, 175 | ): 176 | """Pandas stratified splitter. 177 | 178 | For each user / item, the split function takes proportions of ratings which is 179 | specified by the split ratio(s). The split is stratified. 180 | 181 | Args: 182 | data (pd.DataFrame): Pandas DataFrame to be split. 183 | ratio (float or list): Ratio for splitting data. If it is a single float number 184 | it splits data into two halves and the ratio argument indicates the ratio of 185 | training data set; if it is a list of float numbers, the splitter splits 186 | data into several portions corresponding to the split ratios. If a list is 187 | provided and the ratios are not summed to 1, they will be normalized. 188 | seed (int): Seed. 189 | min_rating (int): minimum number of ratings for user or item. 190 | filter_by (str): either "user" or "item", depending on which of the two is to 191 | filter with min_rating. 192 | col_user (str): column name of user IDs. 193 | col_item (str): column name of item IDs. 194 | 195 | Returns: 196 | list: Splits of the input data as pd.DataFrame. 197 | """ 198 | return _do_stratification( 199 | data, 200 | ratio=ratio, 201 | min_rating=min_rating, 202 | filter_by=filter_by, 203 | col_user=col_user, 204 | col_item=col_item, 205 | is_random=True, 206 | seed=seed, 207 | ) 208 | 209 | 210 | def numpy_stratified_split(X, ratio=0.75, seed=42): 211 | """Split the user/item affinity matrix (sparse matrix) into train and test set matrices while maintaining 212 | local (i.e. per user) ratios. 213 | 214 | Main points : 215 | 216 | 1. In a typical recommender problem, different users rate a different number of items, 217 | and therefore the user/affinity matrix has a sparse structure with variable number 218 | of zeroes (unrated items) per row (user). Cutting a total amount of ratings will 219 | result in a non-homogeneous distribution between train and test set, i.e. some test 220 | users may have many ratings while other very little if none. 221 | 222 | 2. In an unsupervised learning problem, no explicit answer is given. For this reason 223 | the split needs to be implemented in a different way then in supervised learningself. 224 | In the latter, one typically split the dataset by rows (by examples), ending up with 225 | the same number of features but different number of examples in the train/test setself. 226 | This scheme does not work in the unsupervised case, as part of the rated items needs to 227 | be used as a test set for fixed number of users. 228 | 229 | Solution: 230 | 231 | 1. 
Instead of cutting a total percentage, for each user we cut a relative ratio of the rated 232 | items. For example, if user1 has rated 4 items and user2 10, cutting 25% will correspond to 233 | 1 and 2.6 ratings in the test set, approximated as 1 and 3 according to the round() function. 234 | In this way, the 0.75 ratio is satisfied both locally and globally, preserving the original 235 | distribution of ratings across the train and test set. 236 | 237 | 2. It is easy (and fast) to satisfy this requirements by creating the test via element subtraction 238 | from the original dataset X. We first create two copies of X; for each user we select a random 239 | sample of local size ratio (point 1) and erase the remaining ratings, obtaining in this way the 240 | train set matrix Xtst. The train set matrix is obtained in the opposite way. 241 | 242 | Args: 243 | X (np.array, int): a sparse matrix to be split 244 | ratio (float): fraction of the entire dataset to constitute the train set 245 | seed (int): random seed 246 | 247 | Returns: 248 | np.array, np.array: Xtr is the train set user/item affinity matrix. Xtst is the test set user/item affinity 249 | matrix. 250 | """ 251 | 252 | np.random.seed(seed) # set the random seed 253 | test_cut = int((1 - ratio) * 100) # percentage of ratings to go in the test set 254 | 255 | # initialize train and test set matrices 256 | Xtr = X.copy() 257 | Xtst = X.copy() 258 | 259 | # find the number of rated movies per user 260 | rated = np.sum(Xtr != 0, axis=1) 261 | 262 | # for each user, cut down a test_size% for the test set 263 | tst = np.around((rated * test_cut) / 100).astype(int) 264 | 265 | for u in range(X.shape[0]): 266 | # For each user obtain the index of rated movies 267 | idx = np.asarray(np.where(Xtr[u] != 0))[0].tolist() 268 | 269 | # extract a random subset of size n from the set of rated movies without repetition 270 | idx_tst = np.random.choice(idx, tst[u], replace=False) 271 | idx_train = list(set(idx).difference(set(idx_tst))) 272 | 273 | # change the selected rated movies to unrated in the train set 274 | Xtr[u, idx_tst] = 0 275 | # set the movies that appear already in the train set as 0 276 | Xtst[u, idx_train] = 0 277 | 278 | del idx, idx_train, idx_tst 279 | 280 | return Xtr, Xtst 281 | -------------------------------------------------------------------------------- /reco_utils/dataset/spark_splitters.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | import numpy as np 5 | 6 | try: 7 | from pyspark.sql import Window 8 | from pyspark.sql.functions import ( 9 | col, 10 | row_number, 11 | broadcast, 12 | rand, 13 | collect_list, 14 | size, 15 | ) 16 | except ImportError: 17 | pass # skip this import if we are in pure python environment 18 | 19 | from reco_utils.common.constants import ( 20 | DEFAULT_ITEM_COL, 21 | DEFAULT_USER_COL, 22 | DEFAULT_TIMESTAMP_COL, 23 | DEFAULT_RATING_COL, 24 | ) 25 | from reco_utils.dataset.split_utils import process_split_ratio, min_rating_filter_spark 26 | 27 | 28 | def spark_random_split(data, ratio=0.75, seed=42): 29 | """Spark random splitter. 30 | 31 | Randomly split the data into several splits. 32 | 33 | Args: 34 | data (spark.DataFrame): Spark DataFrame to be split. 35 | ratio (float or list): Ratio for splitting data. 
If it is a single float number 36 | it splits data into two halves and the ratio argument indicates the ratio of 37 | training data set; if it is a list of float numbers, the splitter splits 38 | data into several portions corresponding to the split ratios. If a list 39 | is provided and the ratios are not summed to 1, they will be normalized. 40 | seed (int): Seed. 41 | 42 | Returns: 43 | list: Splits of the input data as spark.DataFrame. 44 | """ 45 | multi_split, ratio = process_split_ratio(ratio) 46 | 47 | if multi_split: 48 | return data.randomSplit(ratio, seed=seed) 49 | else: 50 | return data.randomSplit([ratio, 1 - ratio], seed=seed) 51 | 52 | 53 | def spark_chrono_split( 54 | data, 55 | ratio=0.75, 56 | min_rating=1, 57 | filter_by="user", 58 | col_user=DEFAULT_USER_COL, 59 | col_item=DEFAULT_ITEM_COL, 60 | col_timestamp=DEFAULT_TIMESTAMP_COL, 61 | ): 62 | """Spark chronological splitter. 63 | 64 | This function splits data in a chronological manner. That is, for each user / item, the 65 | split function takes proportions of ratings which is specified by the split ratio(s). 66 | The split is stratified. 67 | 68 | Args: 69 | data (spark.DataFrame): Spark DataFrame to be split. 70 | ratio (float or list): Ratio for splitting data. If it is a single float number 71 | it splits data into two sets and the ratio argument indicates the ratio of 72 | training data set; if it is a list of float numbers, the splitter splits 73 | data into several portions corresponding to the split ratios. If a list is 74 | provided and the ratios are not summed to 1, they will be normalized. 75 | seed (int): Seed. 76 | min_rating (int): minimum number of ratings for user or item. 77 | filter_by (str): either "user" or "item", depending on which of the two is to filter 78 | with min_rating. 79 | col_user (str): column name of user IDs. 80 | col_item (str): column name of item IDs. 81 | col_timestamp (str): column name of timestamps. 82 | 83 | Returns: 84 | list: Splits of the input data as spark.DataFrame. 
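        Examples:
            A minimal usage sketch; ``ratings`` is assumed to be a Spark DataFrame with the
            default user, item and timestamp column names. For each user, the earliest 80% of
            interactions go to ``train`` and the most recent 20% to ``test``.

            >>> train, test = spark_chrono_split(ratings, ratio=0.8, min_rating=5, filter_by="user")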
85 | """ 86 | if not (filter_by == "user" or filter_by == "item"): 87 | raise ValueError("filter_by should be either 'user' or 'item'.") 88 | 89 | if min_rating < 1: 90 | raise ValueError("min_rating should be integer and larger than or equal to 1.") 91 | 92 | multi_split, ratio = process_split_ratio(ratio) 93 | 94 | split_by_column = col_user if filter_by == "user" else col_item 95 | 96 | if min_rating > 1: 97 | data = min_rating_filter_spark( 98 | data, 99 | min_rating=min_rating, 100 | filter_by=filter_by, 101 | col_user=col_user, 102 | col_item=col_item, 103 | ) 104 | 105 | ratio = ratio if multi_split else [ratio, 1 - ratio] 106 | ratio_index = np.cumsum(ratio) 107 | 108 | window_count = Window.partitionBy(split_by_column) 109 | window_spec = Window.partitionBy(split_by_column).orderBy(col(col_timestamp)) 110 | 111 | rating_all = data.withColumn( 112 | "count", size(collect_list(col_timestamp).over(window_count)) 113 | ) 114 | 115 | rating_rank = rating_all.withColumn( 116 | "rank", row_number().over(window_spec) / col("count") 117 | ) 118 | 119 | splits = [] 120 | for i, _ in enumerate(ratio_index): 121 | if i == 0: 122 | rating_split = rating_rank.filter(col("rank") <= ratio_index[i]) 123 | else: 124 | rating_split = rating_rank.filter( 125 | (col("rank") <= ratio_index[i]) & (col("rank") > ratio_index[i - 1]) 126 | ) 127 | 128 | splits.append(rating_split) 129 | 130 | return splits 131 | 132 | 133 | def spark_stratified_split( 134 | data, 135 | ratio=0.75, 136 | min_rating=1, 137 | filter_by="user", 138 | col_user=DEFAULT_USER_COL, 139 | col_item=DEFAULT_ITEM_COL, 140 | col_rating=DEFAULT_RATING_COL, 141 | seed=42, 142 | ): 143 | """Spark stratified splitter. 144 | 145 | For each user / item, the split function takes proportions of ratings which is 146 | specified by the split ratio(s). The split is stratified. 147 | 148 | Args: 149 | data (spark.DataFrame): Spark DataFrame to be split. 150 | ratio (float or list): Ratio for splitting data. If it is a single float number 151 | it splits data into two halves and the ratio argument indicates the ratio of 152 | training data set; if it is a list of float numbers, the splitter splits 153 | data into several portions corresponding to the split ratios. If a list is 154 | provided and the ratios are not summed to 1, they will be normalized. 155 | Earlier indexed splits will have earlier times 156 | (e.g the latest time per user or item in split[0] <= the earliest time per user or item in split[1]) 157 | seed (int): Seed. 158 | min_rating (int): minimum number of ratings for user or item. 159 | filter_by (str): either "user" or "item", depending on which of the two is to filter 160 | with min_rating. 161 | col_user (str): column name of user IDs. 162 | col_item (str): column name of item IDs. 163 | col_rating (str): column name of ratings. 164 | 165 | Returns: 166 | list: Splits of the input data as spark.DataFrame. 
167 | """ 168 | if not (filter_by == "user" or filter_by == "item"): 169 | raise ValueError("filter_by should be either 'user' or 'item'.") 170 | 171 | if min_rating < 1: 172 | raise ValueError("min_rating should be integer and larger than or equal to 1.") 173 | 174 | multi_split, ratio = process_split_ratio(ratio) 175 | 176 | split_by_column = col_user if filter_by == "user" else col_item 177 | 178 | if min_rating > 1: 179 | data = min_rating_filter_spark( 180 | data, 181 | min_rating=min_rating, 182 | filter_by=filter_by, 183 | col_user=col_user, 184 | col_item=col_item, 185 | ) 186 | 187 | ratio = ratio if multi_split else [ratio, 1 - ratio] 188 | ratio_index = np.cumsum(ratio) 189 | 190 | window_count = Window.partitionBy(split_by_column) 191 | window_spec = Window.partitionBy(split_by_column).orderBy(rand(seed=seed)) 192 | 193 | rating_all = data.withColumn( 194 | "count", size(collect_list(col_rating).over(window_count)) 195 | ) 196 | rating_rank = rating_all.withColumn( 197 | "rank", row_number().over(window_spec) / col("count") 198 | ) 199 | 200 | splits = [] 201 | for i, _ in enumerate(ratio_index): 202 | if i == 0: 203 | rating_split = rating_rank.filter(col("rank") <= ratio_index[i]) 204 | else: 205 | rating_split = rating_rank.filter( 206 | (col("rank") <= ratio_index[i]) & (col("rank") > ratio_index[i - 1]) 207 | ) 208 | 209 | splits.append(rating_split) 210 | 211 | return splits 212 | 213 | 214 | def spark_timestamp_split( 215 | data, 216 | ratio=0.75, 217 | col_user=DEFAULT_USER_COL, 218 | col_item=DEFAULT_ITEM_COL, 219 | col_timestamp=DEFAULT_TIMESTAMP_COL, 220 | ): 221 | """Spark timestamp based splitter. 222 | 223 | The splitter splits the data into sets by timestamps without stratification on either user or item. 224 | The ratios are applied on the timestamp column which is divided accordingly into several partitions. 225 | 226 | Args: 227 | data (spark.DataFrame): Spark DataFrame to be split. 228 | ratio (float or list): Ratio for splitting data. If it is a single float number 229 | it splits data into two sets and the ratio argument indicates the ratio of 230 | training data set; if it is a list of float numbers, the splitter splits 231 | data into several portions corresponding to the split ratios. If a list is 232 | provided and the ratios are not summed to 1, they will be normalized. 233 | Earlier indexed splits will have earlier times 234 | (e.g the latest time in split[0] <= the earliest time in split[1]) 235 | col_user (str): column name of user IDs. 236 | col_item (str): column name of item IDs. 237 | col_timestamp (str): column name of timestamps. Float number represented in 238 | seconds since Epoch. 239 | 240 | Returns: 241 | list: Splits of the input data as spark.DataFrame. 
242 | """ 243 | multi_split, ratio = process_split_ratio(ratio) 244 | 245 | ratio = ratio if multi_split else [ratio, 1 - ratio] 246 | ratio_index = np.cumsum(ratio) 247 | 248 | window_spec = Window.orderBy(col(col_timestamp)) 249 | rating = data.withColumn("rank", row_number().over(window_spec)) 250 | 251 | data_count = rating.count() 252 | rating_rank = rating.withColumn("rank", row_number().over(window_spec) / data_count) 253 | 254 | splits = [] 255 | for i, _ in enumerate(ratio_index): 256 | if i == 0: 257 | rating_split = rating_rank.filter(col("rank") <= ratio_index[i]).drop( 258 | "rank" 259 | ) 260 | else: 261 | rating_split = rating_rank.filter( 262 | (col("rank") <= ratio_index[i]) & (col("rank") > ratio_index[i - 1]) 263 | ).drop("rank") 264 | 265 | splits.append(rating_split) 266 | 267 | return splits 268 | -------------------------------------------------------------------------------- /reco_utils/dataset/sparse.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | import pandas as pd 5 | import numpy as np 6 | import itertools 7 | 8 | from scipy.sparse import coo_matrix 9 | import logging 10 | 11 | # import default parameters 12 | from reco_utils.common.constants import ( 13 | DEFAULT_USER_COL, 14 | DEFAULT_ITEM_COL, 15 | DEFAULT_RATING_COL, 16 | DEFAULT_TIMESTAMP_COL, 17 | DEFAULT_PREDICTION_COL, 18 | ) 19 | 20 | 21 | log = logging.getLogger(__name__) 22 | 23 | 24 | class AffinityMatrix: 25 | """Generate the user/item affinity matrix from a pandas dataframe and vice versa""" 26 | 27 | def __init__( 28 | self, 29 | DF, 30 | col_user=DEFAULT_USER_COL, 31 | col_item=DEFAULT_ITEM_COL, 32 | col_rating=DEFAULT_RATING_COL, 33 | col_pred=DEFAULT_PREDICTION_COL, 34 | save_path=None, 35 | ): 36 | """Initialize class parameters 37 | 38 | Args: 39 | DF (pd.DataFrame): a dataframe containing the data 40 | col_user (str): default name for user column 41 | col_item (str): default name for item column 42 | col_rating (str): default name for rating columns 43 | save_path (str): default path to save item/user maps 44 | 45 | """ 46 | self.df = DF # dataframe 47 | 48 | # pandas DF parameters 49 | self.col_item = col_item 50 | self.col_user = col_user 51 | self.col_rating = col_rating 52 | self.col_pred = col_pred 53 | 54 | # Options to save the model for future use 55 | self.save_path = save_path 56 | 57 | def _gen_index(self): 58 | """ 59 | Generate the user/item index: 60 | map_users, map_items: dictionaries mapping the original user/item index to matrix indices 61 | map_back_users, map_back_items: dictionaries to map back the matrix elements to the original 62 | dataframe indices 63 | 64 | Basic mechanics: 65 | As a first step we retieve the unique elements in the dataset. In this way we can take care 66 | of either completely missing rows (a user with no ratings) or completely missing columns 67 | (an item that has not being reviewed by anyone). The original indices in the dataframe are 68 | then mapped to an ordered, contiguous integer series to generate a compact matrix representation. 69 | 70 | Functions to map back to the original indices are also provided and can be saved in order to use 71 | a pretrained model. 
72 | 73 | """ 74 | # sort entries by user index 75 | self.df_ = self.df.sort_values(by=[self.col_user]) 76 | 77 | # find unique user and item index 78 | unique_users = self.df_[self.col_user].unique() 79 | unique_items = self.df_[self.col_item].unique() 80 | 81 | self.Nusers = len(unique_users) 82 | self.Nitems = len(unique_items) 83 | 84 | # create a dictionary to map unique users/items to hashed values to generate the matrix 85 | self.map_users = {x: i for i, x in enumerate(unique_users)} 86 | self.map_items = {x: i for i, x in enumerate(unique_items)} 87 | 88 | # map back functions used to get back the original dataframe 89 | self.map_back_users = {i: x for i, x in enumerate(unique_users)} 90 | self.map_back_items = {i: x for i, x in enumerate(unique_items)} 91 | 92 | self.df_.loc[:, "hashedItems"] = self.df_[self.col_item].map(self.map_items) 93 | self.df_.loc[:, "hashedUsers"] = self.df_[self.col_user].map(self.map_users) 94 | 95 | # optionally save the inverse dictionary to work with trained models 96 | if self.save_path is not None: 97 | 98 | np.save(self.save_path + "/user_dict", self.map_users) 99 | np.save(self.save_path + "/item_dict", self.map_items) 100 | 101 | np.save(self.save_path + "/user_back_dict", self.map_back_users) 102 | np.save(self.save_path + "/item_back_dict", self.map_back_items) 103 | 104 | def gen_affinity_matrix(self): 105 | """Generate the user/item affinity matrix 106 | 107 | As a first step, two new columns are added to the input DF, containing the index maps 108 | generated by the gen_index() method. The new indices, together with the ratings, are 109 | then used to generate the user/item affinity matrix using scipy's sparse matrix method 110 | coo_matrix; for reference see: 111 | https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.coo_matrix.html 112 | 113 | The input format is: `coo_matrix((data, (rows, columns)), shape=(rows, columns))` 114 | 115 | Returns: 116 | scipy.sparse.coo_matrix: user-affinity matrix of dimensions (Nusers, Nitems) in numpy format. Unrated movies 117 | are assigned a value of 0. 
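            As a small hypothetical example, three ratings (5, 3, 1) placed at user rows (0, 0, 1)
            and item columns (0, 2, 1) of a 2x3 matrix correspond to
            coo_matrix(([5, 3, 1], ([0, 0, 1], [0, 2, 1])), shape=(2, 3)), which densifies to
            [[5, 0, 3], [0, 1, 0]].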
118 | """ 119 | 120 | log.info("Generating the user/item affinity matrix...") 121 | 122 | self._gen_index() 123 | 124 | ratings = self.df_[self.col_rating] # ratings 125 | itm_id = self.df_["hashedItems"] # itm_id serving as columns 126 | usr_id = self.df_["hashedUsers"] # usr_id serving as rows 127 | 128 | # generate a sparse matrix representation using scipy's coo_matrix and convert to array format 129 | self.AM = coo_matrix( 130 | (ratings, (usr_id, itm_id)), shape=(self.Nusers, self.Nitems) 131 | ).toarray() 132 | 133 | zero = (self.AM == 0).sum() # number of unrated items 134 | total = self.AM.shape[0] * self.AM.shape[1] # number of elements in the matrix 135 | sparsness = zero / total * 100 # Percentage of zeros in the matrix 136 | 137 | log.info("Matrix generated, sparseness percentage: %d" % sparsness) 138 | 139 | return self.AM 140 | 141 | def map_back_sparse(self, X, kind): 142 | """Map back the user/affinity matrix to a pd dataframe 143 | 144 | Args: 145 | X (np.array, int32): user/item affinity matrix 146 | kind (string): specify if the output values are ratings or predictions 147 | 148 | Returns: 149 | pd.DataFrame: the generated pandas dataframe 150 | 151 | """ 152 | m, n = X.shape 153 | 154 | # 1) Create a DF from a sparse matrix 155 | # obtain the non zero items 156 | items = [np.asanyarray(np.where(X[i, :] != 0)).flatten() for i in range(m)] 157 | ratings = [X[i, items[i]] for i in range(m)] # obtain the non-zero ratings 158 | 159 | # Creates user ids following the DF format 160 | userids = [] 161 | for i in range(0, m): 162 | userids.extend([i] * len(items[i])) 163 | 164 | # Flatten the lists to follow the DF input format 165 | items = list(itertools.chain.from_iterable(items)) 166 | ratings = list(itertools.chain.from_iterable(ratings)) 167 | 168 | if kind == "ratings": 169 | col_out = self.col_rating 170 | else: 171 | col_out = self.col_pred 172 | 173 | # create a df 174 | out_df = pd.DataFrame.from_dict( 175 | {self.col_user: userids, self.col_item: items, col_out: ratings} 176 | ) 177 | 178 | # 2) map back user/item ids to their original value 179 | 180 | out_df[self.col_user] = out_df[self.col_user].map(self.map_back_users) 181 | out_df[self.col_item] = out_df[self.col_item].map(self.map_back_items) 182 | 183 | return out_df 184 | -------------------------------------------------------------------------------- /reco_utils/dataset/split_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | import pandas as pd 5 | import numpy as np 6 | import math 7 | 8 | from reco_utils.common.constants import DEFAULT_ITEM_COL, DEFAULT_USER_COL 9 | 10 | try: 11 | from pyspark.sql.functions import col, broadcast 12 | except ImportError: 13 | pass # so the environment without spark doesn't break 14 | 15 | 16 | def process_split_ratio(ratio): 17 | """Generate split ratio lists. 18 | 19 | Args: 20 | ratio (float or list): a float number that indicates split ratio or a list of float 21 | numbers that indicate split ratios (if it is a multi-split). 22 | 23 | Returns: 24 | tuple: a tuple containing 25 | bool: A boolean variable multi that indicates if the splitting is multi or single. 26 | list: A list of normalized split ratios. 
27 | """ 28 | if isinstance(ratio, float): 29 | if ratio <= 0 or ratio >= 1: 30 | raise ValueError("Split ratio has to be between 0 and 1") 31 | 32 | multi = False 33 | elif isinstance(ratio, list): 34 | if any([x <= 0 for x in ratio]): 35 | raise ValueError( 36 | "All split ratios in the ratio list should be larger than 0." 37 | ) 38 | 39 | # normalize split ratios if they are not summed to 1 40 | if math.fsum(ratio) != 1.0: 41 | ratio = [x / math.fsum(ratio) for x in ratio] 42 | 43 | multi = True 44 | else: 45 | raise TypeError("Split ratio should be either float or a list of floats.") 46 | 47 | return multi, ratio 48 | 49 | 50 | def min_rating_filter_pandas( 51 | data, 52 | min_rating=1, 53 | filter_by="user", 54 | col_user=DEFAULT_USER_COL, 55 | col_item=DEFAULT_ITEM_COL, 56 | ): 57 | """Filter rating DataFrame for each user with minimum rating. 58 | 59 | Filter rating data frame with minimum number of ratings for user/item is usually useful to 60 | generate a new data frame with warm user/item. The warmth is defined by min_rating argument. For 61 | example, a user is called warm if he has rated at least 4 items. 62 | 63 | Args: 64 | data (pd.DataFrame): DataFrame of user-item tuples. Columns of user and item 65 | should be present in the DataFrame while other columns like rating, 66 | timestamp, etc. can be optional. 67 | min_rating (int): minimum number of ratings for user or item. 68 | filter_by (str): either "user" or "item", depending on which of the two is to 69 | filter with min_rating. 70 | col_user (str): column name of user ID. 71 | col_item (str): column name of item ID. 72 | 73 | Returns: 74 | pd.DataFrame: DataFrame with at least columns of user and item that has been 75 | filtered by the given specifications. 76 | """ 77 | split_by_column, _ = _check_min_rating_filter( 78 | filter_by, min_rating, col_user, col_item 79 | ) 80 | rating_filtered = data.groupby(split_by_column).filter( 81 | lambda x: len(x) >= min_rating 82 | ) 83 | return rating_filtered 84 | 85 | 86 | def min_rating_filter_spark( 87 | data, 88 | min_rating=1, 89 | filter_by="user", 90 | col_user=DEFAULT_USER_COL, 91 | col_item=DEFAULT_ITEM_COL, 92 | ): 93 | """Filter rating DataFrame for each user with minimum rating. 94 | 95 | Filter rating data frame with minimum number of ratings for user/item is usually useful to 96 | generate a new data frame with warm user/item. The warmth is defined by min_rating argument. For 97 | example, a user is called warm if he has rated at least 4 items. 98 | 99 | Args: 100 | data (spark.DataFrame): DataFrame of user-item tuples. Columns of user and item 101 | should be present in the DataFrame while other columns like rating, 102 | timestamp, etc. can be optional. 103 | min_rating (int): minimum number of ratings for user or item. 104 | filter_by (str): either "user" or "item", depending on which of the two is to 105 | filter with min_rating. 106 | col_user (str): column name of user ID. 107 | col_item (str): column name of item ID. 108 | 109 | Returns: 110 | spark.DataFrame: DataFrame with at least columns of user and item that has been 111 | filtered by the given specifications. 
112 | """ 113 | split_by_column, split_with_column = _check_min_rating_filter( 114 | filter_by, min_rating, col_user, col_item 115 | ) 116 | rating_temp = ( 117 | data.groupBy(split_by_column) 118 | .agg({split_with_column: "count"}) 119 | .withColumnRenamed("count(" + split_with_column + ")", "n" + split_with_column) 120 | .where(col("n" + split_with_column) >= min_rating) 121 | ) 122 | 123 | rating_filtered = data.join(broadcast(rating_temp), split_by_column).drop( 124 | "n" + split_with_column 125 | ) 126 | return rating_filtered 127 | 128 | 129 | def _check_min_rating_filter(filter_by, min_rating, col_user, col_item): 130 | if not (filter_by == "user" or filter_by == "item"): 131 | raise ValueError("filter_by should be either 'user' or 'item'.") 132 | 133 | if min_rating < 1: 134 | raise ValueError("min_rating should be integer and larger than or equal to 1.") 135 | 136 | split_by_column = col_user if filter_by == "user" else col_item 137 | split_with_column = col_item if filter_by == "user" else col_user 138 | return split_by_column, split_with_column 139 | 140 | 141 | def split_pandas_data_with_ratios(data, ratios, seed=42, shuffle=False): 142 | """Helper function to split pandas DataFrame with given ratios 143 | 144 | .. note:: 145 | 146 | Implementation referenced from `this source `_. 147 | 148 | Args: 149 | data (pd.DataFrame): Pandas data frame to be split. 150 | ratios (list of floats): list of ratios for split. The ratios have to sum to 1. 151 | seed (int): random seed. 152 | shuffle (bool): whether data will be shuffled when being split. 153 | 154 | Returns: 155 | list: List of pd.DataFrame split by the given specifications. 156 | """ 157 | if math.fsum(ratios) != 1.0: 158 | raise ValueError("The ratios have to sum to 1") 159 | 160 | split_index = np.cumsum(ratios).tolist()[:-1] 161 | 162 | if shuffle: 163 | data = data.sample(frac=1, random_state=seed) 164 | 165 | splits = np.split(data, [round(x * len(data)) for x in split_index]) 166 | 167 | # Add split index (this makes splitting by group more efficient). 168 | for i in range(len(ratios)): 169 | splits[i]["split_index"] = i 170 | 171 | return splits 172 | -------------------------------------------------------------------------------- /reco_utils/dataset/wikidata.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | import pandas as pd 5 | import requests 6 | import logging 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | API_URL_WIKIPEDIA = "https://en.wikipedia.org/w/api.php" 11 | API_URL_WIKIDATA = "https://query.wikidata.org/sparql" 12 | SESSION = None 13 | 14 | 15 | def get_session(session=None): 16 | """Get session object 17 | 18 | Args: 19 | session (requests.Session): request session object 20 | 21 | Returns: 22 | requests.Session: request session object 23 | """ 24 | 25 | if session is None: 26 | global SESSION 27 | if SESSION is None: 28 | SESSION = requests.Session() 29 | session = SESSION 30 | 31 | return session 32 | 33 | 34 | def find_wikidata_id(name, limit=1, session=None): 35 | """Find the entity ID in wikidata from a title string. 36 | 37 | Args: 38 | name (str): A string with search terms (eg. "Batman (1989) film") 39 | limit (int): Number of results to return 40 | session (requests.Session): requests session to reuse connections 41 | 42 | Returns: 43 | (str): wikidata entityID corresponding to the title string. 
44 | 'entityNotFound' will be returned if no page is found 45 | """ 46 | 47 | session = get_session(session=session) 48 | 49 | params = dict( 50 | action="query", 51 | list="search", 52 | srsearch=bytes(name, encoding="utf8"), 53 | srlimit=limit, 54 | srprop="", 55 | format="json", 56 | ) 57 | 58 | try: 59 | response = session.get(API_URL_WIKIPEDIA, params=params) 60 | page_id = response.json()["query"]["search"][0]["pageid"] 61 | except Exception as e: 62 | # TODO: distinguish between connection error and entity not found 63 | logger.error("ENTITY NOT FOUND") 64 | return "entityNotFound" 65 | 66 | params = dict( 67 | action="query", 68 | prop="pageprops", 69 | ppprop="wikibase_item", 70 | pageids=[page_id], 71 | format="json", 72 | ) 73 | 74 | try: 75 | response = session.get(API_URL_WIKIPEDIA, params=params) 76 | entity_id = response.json()["query"]["pages"][str(page_id)]["pageprops"][ 77 | "wikibase_item" 78 | ] 79 | except Exception as e: 80 | # TODO: distinguish between connection error and entity not found 81 | logger.error("ENTITY NOT FOUND") 82 | return "entityNotFound" 83 | 84 | return entity_id 85 | 86 | 87 | def query_entity_links(entity_id, session=None): 88 | """Query all linked pages from a wikidata entityID 89 | 90 | Args: 91 | entity_id (str): A wikidata entity ID 92 | session (requests.Session): requests session to reuse connections 93 | 94 | Returns: 95 | (json): dictionary with linked pages. 96 | """ 97 | query = ( 98 | """ 99 | PREFIX entity: 100 | #partial results 101 | 102 | SELECT ?propUrl ?propLabel ?valUrl ?valLabel 103 | WHERE 104 | { 105 | hint:Query hint:optimizer 'None' . 106 | { BIND(entity:""" 107 | + entity_id 108 | + """ AS ?valUrl) . 109 | BIND("N/A" AS ?propUrl ) . 110 | BIND("identity"@en AS ?propLabel ) . 111 | } 112 | UNION 113 | { entity:""" 114 | + entity_id 115 | + """ ?propUrl ?valUrl . 116 | ?property ?ref ?propUrl . 117 | ?property rdf:type wikibase:Property . 118 | ?property rdfs:label ?propLabel 119 | } 120 | 121 | ?valUrl rdfs:label ?valLabel 122 | FILTER (LANG(?valLabel) = 'en') . 123 | OPTIONAL{ ?valUrl wdt:P18 ?picture .} 124 | FILTER (lang(?propLabel) = 'en' ) 125 | } 126 | ORDER BY ?propUrl ?valUrl 127 | LIMIT 500 128 | """ 129 | ) 130 | 131 | session = get_session(session=session) 132 | 133 | try: 134 | data = session.get( 135 | API_URL_WIKIDATA, params=dict(query=query, format="json") 136 | ).json() 137 | except Exception as e: 138 | logger.error("ENTITY NOT FOUND") 139 | return {} 140 | 141 | return data 142 | 143 | 144 | def read_linked_entities(data): 145 | """Obtain lists of liken entities (IDs and names) from dictionary 146 | 147 | Args: 148 | data (json): dictionary with linked pages 149 | 150 | Returns: 151 | (list): List of liked entityIDs 152 | (list): List of liked entity names 153 | """ 154 | 155 | return [ 156 | ( 157 | c.get("valUrl").get("value").replace("http://www.wikidata.org/entity/", ""), 158 | c.get("valLabel").get("value"), 159 | ) 160 | for c in data.get("results", {}).get("bindings", []) 161 | ] 162 | 163 | 164 | def query_entity_description(entity_id, session=None): 165 | """Query entity wikidata description from entityID 166 | 167 | Args: 168 | entity_id (str): A wikidata page ID. 
169 | session (requests.Session): requests session to reuse connections 170 | 171 | Returns: 172 | (str): Wikidata short description of the entityID 173 | descriptionNotFound' will be returned if no 174 | description is found 175 | """ 176 | query = ( 177 | """ 178 | PREFIX wd: 179 | PREFIX schema: 180 | 181 | SELECT ?o 182 | WHERE 183 | { 184 | wd:""" 185 | + entity_id 186 | + """ schema:description ?o. 187 | FILTER ( lang(?o) = "en" ) 188 | } 189 | """ 190 | ) 191 | 192 | session = get_session(session=session) 193 | 194 | try: 195 | r = session.get(API_URL_WIKIDATA, params=dict(query=query, format="json")) 196 | description = r.json()["results"]["bindings"][0]["o"]["value"] 197 | except Exception as e: 198 | logger.error("DESCRIPTION NOT FOUND") 199 | return "descriptionNotFound" 200 | 201 | return description 202 | 203 | 204 | def search_wikidata(names, extras=None, describe=True, verbose=False): 205 | """Create DataFrame of Wikidata search results 206 | 207 | Args: 208 | names (list[str]): list of names to search for 209 | extras (dict(str: list)): optional extra items to assign to results for corresponding name 210 | describe (bool): optional flag to include description of entity 211 | verbose (bool): optional flag to print out intermediate data 212 | 213 | Returns: 214 | pd.DataFrame: wikipedia results for all names with found entities 215 | 216 | """ 217 | 218 | results = [] 219 | for idx, name in enumerate(names): 220 | entity_id = find_wikidata_id(name) 221 | if verbose: 222 | print("name: {name}, entity_id: {id}".format(name=name, id=entity_id)) 223 | 224 | if entity_id == "entityNotFound": 225 | continue 226 | 227 | json_links = query_entity_links(entity_id) 228 | related_links = read_linked_entities(json_links) 229 | description = query_entity_description(entity_id) if describe else "" 230 | 231 | for related_entity, related_name in related_links: 232 | result = dict( 233 | name=name, 234 | original_entity=entity_id, 235 | linked_entities=related_entity, 236 | name_linked_entities=related_name, 237 | ) 238 | if describe: 239 | result["description"] = description 240 | if extras is not None: 241 | for field, lst in extras.items(): 242 | result[field] = lst[idx] 243 | results.append(result) 244 | 245 | return pd.DataFrame(results) 246 | -------------------------------------------------------------------------------- /reco_utils/evaluation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tsinghua-fib-lab/CLSR/92c6c7e069374fa2d7217f3a0987f64ce47e7cad/reco_utils/evaluation/__init__.py -------------------------------------------------------------------------------- /reco_utils/recommender/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tsinghua-fib-lab/CLSR/92c6c7e069374fa2d7217f3a0987f64ce47e7cad/reco_utils/recommender/__init__.py -------------------------------------------------------------------------------- /reco_utils/recommender/deeprec/DataModel/ImplicitCF.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 
3 | 4 | import random 5 | import numpy as np 6 | import pandas as pd 7 | import scipy.sparse as sp 8 | import time 9 | from reco_utils.common.constants import ( 10 | DEFAULT_ITEM_COL, 11 | DEFAULT_USER_COL, 12 | DEFAULT_RATING_COL, 13 | DEFAULT_PREDICTION_COL, 14 | ) 15 | 16 | 17 | class ImplicitCF(object): 18 | """Data processing class for GCN models which use implicit feedback. 19 | 20 | Initialize train and test set, create normalized adjacency matrix and sample data for training epochs. 21 | 22 | """ 23 | 24 | def __init__( 25 | self, 26 | train, 27 | test=None, 28 | adj_dir=None, 29 | col_user=DEFAULT_USER_COL, 30 | col_item=DEFAULT_ITEM_COL, 31 | col_rating=DEFAULT_RATING_COL, 32 | col_prediction=DEFAULT_PREDICTION_COL, 33 | seed=None, 34 | ): 35 | """Constructor 36 | 37 | Args: 38 | adj_dir (str): Directory to save / load adjacency matrices. If it is None, adjacency 39 | matrices will be created and will not be saved. 40 | train (pd.DataFrame): Training data with at least columns (col_user, col_item, col_rating). 41 | test (pd.DataFrame): Test data with at least columns (col_user, col_item, col_rating). 42 | test can be None, if so, we only process the training data. 43 | col_user (str): User column name. 44 | col_item (str): Item column name. 45 | col_rating (str): Rating column name. 46 | seed (int): Seed. 47 | 48 | """ 49 | self.user_idx = None 50 | self.item_idx = None 51 | self.adj_dir = adj_dir 52 | self.col_user = col_user 53 | self.col_item = col_item 54 | self.col_rating = col_rating 55 | self.col_prediction = col_prediction 56 | self.train, self.test = self._data_processing(train, test) 57 | self._init_train_data() 58 | 59 | random.seed(seed) 60 | 61 | def _data_processing(self, train, test): 62 | """Process the dataset to reindex userID and itemID and only keep records with ratings greater than 0. 63 | 64 | Args: 65 | train (pd.DataFrame): Training data with at least columns (col_user, col_item, col_rating). 66 | test (pd.DataFrame): Test data with at least columns (col_user, col_item, col_rating). 67 | test can be None, if so, we only process the training data. 68 | 69 | Returns: 70 | list: train and test pd.DataFrame Dataset, which have been reindexed and filtered. 71 | 72 | """ 73 | df = train if test is None else train.append(test) 74 | 75 | if self.user_idx is None: 76 | user_idx = df[[self.col_user]].drop_duplicates().reindex() 77 | user_idx[self.col_user + "_idx"] = np.arange(len(user_idx)) 78 | self.n_users = len(user_idx) 79 | self.user_idx = user_idx 80 | 81 | self.user2id = dict( 82 | zip(user_idx[self.col_user], user_idx[self.col_user + "_idx"]) 83 | ) 84 | self.id2user = dict( 85 | zip(user_idx[self.col_user + "_idx"], user_idx[self.col_user]) 86 | ) 87 | 88 | if self.item_idx is None: 89 | item_idx = df[[self.col_item]].drop_duplicates() 90 | item_idx[self.col_item + "_idx"] = np.arange(len(item_idx)) 91 | self.n_items = len(item_idx) 92 | self.item_idx = item_idx 93 | 94 | self.item2id = dict( 95 | zip(item_idx[self.col_item], item_idx[self.col_item + "_idx"]) 96 | ) 97 | self.id2item = dict( 98 | zip(item_idx[self.col_item + "_idx"], item_idx[self.col_item]) 99 | ) 100 | 101 | return self._reindex(train), self._reindex(test) 102 | 103 | def _reindex(self, df): 104 | """Process the dataset to reindex userID and itemID and only keep records with ratings greater than 0. 105 | 106 | Args: 107 | df (pandas.DataFrame): dataframe with at least columns (col_user, col_item, col_rating). 
108 |
109 |         Returns:
110 |             pandas.DataFrame: The reindexed and filtered dataframe.
111 |
112 |         """
113 |
114 |         if df is None:
115 |             return None
116 |
117 |         df = pd.merge(df, self.user_idx, on=self.col_user, how="left")
118 |         df = pd.merge(df, self.item_idx, on=self.col_item, how="left")
119 |
120 |         df = df[df[self.col_rating] > 0]
121 |
122 |         df_reindex = df[
123 |             [self.col_user + "_idx", self.col_item + "_idx", self.col_rating]
124 |         ]
125 |         df_reindex.columns = [self.col_user, self.col_item, self.col_rating]
126 |
127 |         return df_reindex
128 |
129 |     def _init_train_data(self):
130 |         """Record items interacted with each user in a dataframe self.interact_status, and create adjacency
131 |         matrix self.R.
132 |
133 |         """
134 |         self.interact_status = (
135 |             self.train.groupby(self.col_user)[self.col_item]
136 |             .apply(set)
137 |             .reset_index()
138 |             .rename(columns={self.col_item: self.col_item + "_interacted"})
139 |         )
140 |         self.R = sp.dok_matrix((self.n_users, self.n_items), dtype=np.float32)
141 |         self.R[list(self.train[self.col_user]), list(self.train[self.col_item])] = 1.0
142 |
143 |     def get_norm_adj_mat(self):
144 |         """Load normalized adjacency matrix if it exists, otherwise create (and save) it.
145 |
146 |         Returns:
147 |             scipy.sparse.csr_matrix: Normalized adjacency matrix.
148 |
149 |         """
150 |         try:
151 |             norm_adj_mat = sp.load_npz(self.adj_dir + "/norm_adj_mat.npz")
152 |             print("Loaded norm adj matrix.")
153 |
154 |         except Exception:
155 |             norm_adj_mat = self.create_norm_adj_mat()
156 |             if self.adj_dir is not None:
157 |                 sp.save_npz(self.adj_dir + "/norm_adj_mat.npz", norm_adj_mat)
158 |         return norm_adj_mat
159 |
160 |     def create_norm_adj_mat(self):
161 |         """Create normalized adjacency matrix.
162 |
163 |         Returns:
164 |             scipy.sparse.csr_matrix: Normalized adjacency matrix.
165 |
166 |         """
167 |         adj_mat = sp.dok_matrix(
168 |             (self.n_users + self.n_items, self.n_users + self.n_items), dtype=np.float32
169 |         )
170 |         adj_mat = adj_mat.tolil()
171 |         R = self.R.tolil()
172 |
173 |         adj_mat[: self.n_users, self.n_users :] = R
174 |         adj_mat[self.n_users :, : self.n_users] = R.T
175 |         adj_mat = adj_mat.todok()
176 |         print("Created adjacency matrix.")
177 |
178 |         rowsum = np.array(adj_mat.sum(1))
179 |         d_inv = np.power(rowsum, -0.5).flatten()
180 |         d_inv[np.isinf(d_inv)] = 0.0
181 |         d_mat_inv = sp.diags(d_inv)
182 |         norm_adj_mat = d_mat_inv.dot(adj_mat)
183 |         norm_adj_mat = norm_adj_mat.dot(d_mat_inv)
184 |         print("Normalized adjacency matrix.")
185 |
186 |         return norm_adj_mat.tocsr()
187 |
188 |     def train_loader(self, batch_size):
189 |         """Sample a batch of training data. One positive item and one negative item are sampled for each user.
190 |
191 |         Args:
192 |             batch_size (int): Batch size of users.
193 |
194 |         Returns:
195 |             np.array: Sampled users.
196 |             np.array: Sampled positive items.
197 |             np.array: Sampled negative items.
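        Examples:
            A minimal sketch; ``data`` is assumed to be an ImplicitCF instance built from a
            training dataframe. Each returned array has length ``batch_size``.

            >>> users, pos_items, neg_items = data.train_loader(batch_size=1024)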
198 | 199 | """ 200 | 201 | def sample_neg(x): 202 | while True: 203 | neg_id = random.randint(0, self.n_items - 1) 204 | if neg_id not in x: 205 | return neg_id 206 | 207 | indices = range(self.n_users) 208 | if self.n_users < batch_size: 209 | users = [random.choice(indices) for _ in range(batch_size)] 210 | else: 211 | users = random.sample(indices, batch_size) 212 | 213 | interact = self.interact_status.iloc[users] 214 | pos_items = interact[self.col_item + "_interacted"].apply( 215 | lambda x: random.choice(list(x)) 216 | ) 217 | neg_items = interact[self.col_item + "_interacted"].apply( 218 | lambda x: sample_neg(x) 219 | ) 220 | 221 | return np.array(users), np.array(pos_items), np.array(neg_items) 222 | -------------------------------------------------------------------------------- /reco_utils/recommender/deeprec/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tsinghua-fib-lab/CLSR/92c6c7e069374fa2d7217f3a0987f64ce47e7cad/reco_utils/recommender/deeprec/__init__.py -------------------------------------------------------------------------------- /reco_utils/recommender/deeprec/config/asvd.yaml: -------------------------------------------------------------------------------- 1 | #data 2 | #data format:sequential model 3 | data: 4 | user_vocab : ./tests/resources/deeprec/slirec/user_vocab.pkl # the map file of user to id 5 | item_vocab : ./tests/resources/deeprec/slirec/item_vocab.pkl # the map file of item to id 6 | cate_vocab : ./tests/resources/deeprec/slirec/category_vocab.pkl # the map file of category to id 7 | 8 | #model 9 | model: 10 | method : classification # classification or regression 11 | model_type : A2SVD 12 | layer_sizes : [100, 64] # layers' size of DNN. In this example, DNN has two layers, and each layer has 100 hidden nodes. 
13 | activation : [relu, relu] # activation function for DNN 14 | user_dropout: True 15 | dropout : [0.3, 0.3] #drop out values for DNN layer 16 | item_embedding_dim : 32 # the embedding dimension of items 17 | cate_embedding_dim : 8 # the embedding dimension of categories 18 | user_embedding_dim : 16 # the embedding dimension of users 19 | 20 | #train 21 | #init_method: normal,tnormal,uniform,he_normal,he_uniform,xavier_normal,xavier_uniform 22 | train: 23 | init_method: tnormal # method for initializing model parameters 24 | init_value : 0.01 # stddev values for initializing model parameters 25 | embed_l2 : 0.0001 # l2 regularization for embedding parameters 26 | embed_l1 : 0.0000 # l1 regularization for embedding parameters 27 | layer_l2 : 0.0001 # l2 regularization for hidden layer parameters 28 | layer_l1 : 0.0000 # l1 regularization for hidden layer parameters 29 | cross_l2 : 0.0000 # l2 regularization for cross layer parameters 30 | cross_l1 : 0.000 # l1 regularization for cross layer parameters 31 | learning_rate : 0.001 32 | loss : softmax # pointwise: log_loss, cross_entropy_loss, square_loss pairwise: softmax 33 | optimizer : lazyadam # adam, adadelta, sgd, ftrl, gd, padagrad, pgd, rmsprop, lazyadam 34 | epochs : 50 # number of epoch for training 35 | batch_size : 400 # batch size, should be constrained as an integer multiple of the number of (1 + train_num_ngs) when need_sample is True 36 | enable_BN : True # whether to use batch normalization in hidden layers 37 | EARLY_STOP : 10 # the number of epoch that controls EARLY STOPPING 38 | max_seq_length : 50 # the maximum number of records in the history 39 | need_sample: True # whether to perform dynamic negative sampling in mini-batch 40 | train_num_ngs: 4 # indicates how many negative instances followed by one positive instances if need_sample is True 41 | 42 | #show info 43 | #metric :'auc', 'logloss', 'group_auc' 44 | info: 45 | show_step : 100 # print training information after a certain number of mini-batch 46 | save_model: True # whether to save models 47 | save_epoch : 1 # if save_model is set to True, save the model every save_epoch. 48 | metrics : ['auc','logloss'] # metrics for evaluation. 49 | pairwise_metrics : ['mean_mrr', 'ndcg@2;4;6', "group_auc"] # pairwise metrics for evaluation, available when pairwise comparisons are needed 50 | MODEL_DIR : ./tests/resources/deeprec/a2svd/model/a2svd_model/ # directory of saved models. 51 | SUMMARIES_DIR : ./tests/resources/deeprec/a2svd/summary/a2svd_summary/ # directory of saved summaries. 52 | write_tfevents : True # whether to save summaries. 53 | 54 | 55 | 56 | -------------------------------------------------------------------------------- /reco_utils/recommender/deeprec/config/caser.yaml: -------------------------------------------------------------------------------- 1 | #data 2 | #data format:sequential model 3 | data: 4 | user_vocab : ./tests/resources/deeprec/sequential/yelp/user_vocab.pkl # the map file of user to id 5 | item_vocab : ./tests/resources/deeprec/sequential/yelp/item_vocab.pkl # the map file of item to id 6 | cate_vocab : ./tests/resources/deeprec/sequential/yelp/category_vocab.pkl # the map file of category to id 7 | 8 | #model 9 | model: 10 | method : classification # classification or regression 11 | model_type : Caser 12 | layer_sizes : [100, 64] # layers' size of DNN. In this example, DNN has two layers, and each layer has 100 hidden nodes. 
13 | activation : [relu, relu] # activation function for DNN 14 | user_dropout: False 15 | dropout : [0.0, 0.0] #drop out values for DNN layer 16 | embedding_dropout : 0.0 17 | item_embedding_dim : 32 # the embedding dimension of items 18 | cate_embedding_dim : 8 # the embedding dimension of categories 19 | user_embedding_dim : 16 # the embedding dimension of users 20 | 21 | #train 22 | #init_method: normal,tnormal,uniform,he_normal,he_uniform,xavier_normal,xavier_uniform 23 | train: 24 | init_method: tnormal # method for initializing model parameters 25 | init_value : 0.01 # stddev values for initializing model parameters 26 | embed_l2 : 0.0001 # l2 regularization for embedding parameters 27 | embed_l1 : 0.0000 # l1 regularization for embedding parameters 28 | layer_l2 : 0.0001 # l2 regularization for hidden layer parameters 29 | layer_l1 : 0.0000 # l1 regularization for hidden layer parameters 30 | cross_l2 : 0.0000 # l2 regularization for cross layer parameters 31 | cross_l1 : 0.000 # l1 regularization for cross layer parameters 32 | learning_rate : 0.001 33 | loss : softmax # pointwise: log_loss, cross_entropy_loss, square_loss pairwise: softmax 34 | optimizer : adam # adam, adadelta, sgd, ftrl, gd, padagrad, pgd, rmsprop, lazyadam 35 | epochs : 50 # number of epoch for training 36 | batch_size : 400 # batch size, should be constrained as an integer multiple of the number of (1 + train_num_ngs) when need_sample is True 37 | enable_BN : True # whether to use batch normalization in hidden layers 38 | EARLY_STOP : 10 # the number of epoch that controls EARLY STOPPING 39 | max_seq_length : 50 # the maximum number of records in the history sequence 40 | T : 1 # prediction shape 41 | L : 3 # history sequence that involved in convolution shape 42 | n_v : 128 # number of vertical convolution layers 43 | n_h : 128 # number of horizonal convolution layers 44 | min_seq_length : 5 # the minimum number of records in the history sequence 45 | need_sample: True # whether to perform dynamic negative sampling in mini-batch 46 | train_num_ngs: 4 # indicates how many negative instances followed by one positive instances if need_sample is True 47 | 48 | #show info 49 | #metric :'auc', 'logloss', 'group_auc' 50 | info: 51 | show_step : 100 # print training information after a certain number of mini-batch 52 | save_model: True # whether to save models 53 | save_epoch : 1 # if save_model is set to True, save the model every save_epoch. 54 | metrics : ['auc','logloss'] #metrics for evaluation. 55 | pairwise_metrics : ['mean_mrr', 'ndcg@2;4;6', 'hit@2;4;6', "group_auc"] # pairwise metrics for evaluation, available when pairwise comparisons are needed 56 | MODEL_DIR : ./tests/resources/deeprec/caser/model/caser_model/ # directory of saved models. 57 | SUMMARIES_DIR : ./tests/resources/deeprec/caser/summary/caser_summary/ # directory of saved summaries. 58 | write_tfevents : True # whether to save summaries. 
59 | 60 | 61 | 62 | -------------------------------------------------------------------------------- /reco_utils/recommender/deeprec/config/clsr.yaml: -------------------------------------------------------------------------------- 1 | #data 2 | #data format:sequential model 3 | data: 4 | user_vocab : ./tests/resources/deeprec/sequential/taobao/user_vocab.pkl # the map file of user to id 5 | item_vocab : ./tests/resources/deeprec/sequential/taobao/item_vocab.pkl # the map file of item to id 6 | cate_vocab : ./tests/resources/deeprec/sequential/taobao/category_vocab.pkl # the map file of category to id 7 | 8 | #model 9 | model: 10 | method : classification # classification or regression 11 | model_type : clsr 12 | layer_sizes : [100, 64] # layers' size of DNN. In this example, DNN has two layers, and each layer has 100 hidden nodes. 13 | att_fcn_layer_sizes : [80, 40] 14 | activation : [relu, relu] # activation function for DNN 15 | user_dropout: False 16 | dropout : [0.0, 0.0] #drop out values for DNN layer 17 | embedding_dropout : 0.0 18 | item_embedding_dim : 32 # the embedding dimension of items 19 | cate_embedding_dim : 8 # the embedding dimension of categories 20 | user_embedding_dim : 40 # the embedding dimension of users 21 | 22 | #train 23 | #init_method: normal,tnormal,uniform,he_normal,he_uniform,xavier_normal,xavier_uniform 24 | train: 25 | init_method: tnormal # method for initializing model parameters 26 | init_value : 0.01 # stddev values for initializing model parameters 27 | embed_l2 : 0.0001 # l2 regularization for embedding parameters 28 | embed_l1 : 0.0000 # l1 regularization for embedding parameters 29 | layer_l2 : 0.0001 # l2 regularization for hidden layer parameters 30 | layer_l1 : 0.0000 # l1 regularization for hidden layer parameters 31 | cross_l2 : 0.0000 # l2 regularization for cross layer parameters 32 | cross_l1 : 0.000 # l1 regularization for cross layer parameters 33 | learning_rate : 0.001 34 | loss : softmax # pointwise: log_loss, cross_entropy_loss, square_loss pairwise: softmax 35 | optimizer : adam # adam, adadelta, sgd, ftrl, gd, padagrad, pgd, rmsprop, lazyadam 36 | epochs : 50 # number of epoch for training 37 | batch_size : 400 # batch size, should be constrained as an integer multiple of the number of (1 + train_num_ngs) when need_sample is True 38 | enable_BN : True # whether to use batch normalization in hidden layers 39 | EARLY_STOP : 10 # the number of epoch that controls EARLY STOPPING 40 | max_seq_length : 50 # the maximum number of records in the history sequence 41 | hidden_size : 40 # the shape of hidden size used in RNN 42 | attention_size : 40 # the shape of attention size 43 | need_sample: True # whether to perform dynamic negative sampling in mini-batch 44 | train_num_ngs: 4 # indicates how many negative instances followed by one positive instances if need_sample is True 45 | 46 | #show info 47 | #metric :'auc', 'logloss', 'group_auc' 48 | info: 49 | show_step : 100 # print training information after a certain number of mini-batch 50 | save_model: True # whether to save modl 51 | save_epoch : 1 # if save_model is set to True, save the model every save_epoch. 52 | metrics : ['auc','logloss'] # metrics for evaluation. 53 | pairwise_metrics : ['mean_mrr', 'ndcg@2;4;6', 'hit@2;4;6', "group_auc"] # pairwise metrics for evaluation, available when pairwise comparisons are needed 54 | MODEL_DIR : ./tests/resources/deeprec/clsr/model/clsr_model/ # directory of saved models. 
55 | SUMMARIES_DIR : ./tests/resources/deeprec/clsr/summary/clsr_summary/ # directory of saved summaries. 56 | write_tfevents : True # whether to save summaries. 57 | 58 | 59 | 60 | -------------------------------------------------------------------------------- /reco_utils/recommender/deeprec/config/dien.yaml: -------------------------------------------------------------------------------- 1 | #data 2 | #data format:sequential model 3 | data: 4 | user_vocab : ./tests/resources/deeprec/slirec/user_vocab.pkl # the map file of user to id 5 | item_vocab : ./tests/resources/deeprec/slirec/item_vocab.pkl # the map file of item to id 6 | cate_vocab : ./tests/resources/deeprec/slirec/category_vocab.pkl # the map file of category to id 7 | 8 | #model 9 | model: 10 | method : classification # classification or regression 11 | model_type : DIEN 12 | layer_sizes : [100, 64] # layers' size of DNN. In this example, DNN has two layers, and each layer has 100 hidden nodes. 13 | att_fcn_layer_sizes : [80, 40] 14 | activation : [dice, dice] # activation function for DNN 15 | user_dropout: False 16 | dropout : [0.0, 0.0] #drop out values for DNN layer 17 | embedding_dropout : 0.0 18 | item_embedding_dim : 32 # the embedding dimension of items 19 | cate_embedding_dim : 8 # the embedding dimension of categories 20 | user_embedding_dim : 16 # the embedding dimension of users 21 | 22 | #train 23 | #init_method: normal,tnormal,uniform,he_normal,he_uniform,xavier_normal,xavier_uniform 24 | train: 25 | init_method: tnormal # method for initializing model parameters 26 | init_value : 0.01 # stddev values for initializing model parameters 27 | embed_l2 : 0.0001 # l2 regularization for embedding parameters 28 | embed_l1 : 0.0000 # l1 regularization for embedding parameters 29 | layer_l2 : 0.0001 # l2 regularization for hidden layer parameters 30 | layer_l1 : 0.0000 # l1 regularization for hidden layer parameters 31 | cross_l2 : 0.0000 # l2 regularization for cross layer parameters 32 | cross_l1 : 0.000 # l1 regularization for cross layer parameters 33 | learning_rate : 0.001 34 | loss : softmax # pointwise: log_loss, cross_entropy_loss, square_loss pairwise: softmax 35 | optimizer : adam # adam, adadelta, sgd, ftrl, gd, padagrad, pgd, rmsprop, lazyadam 36 | epochs : 50 # number of epoch for training 37 | batch_size : 400 # batch size, should be constrained as an integer multiple of the number of (1 + train_num_ngs) when need_sample is True 38 | enable_BN : True # whether to use batch normalization in hidden layers 39 | EARLY_STOP : 10 # the number of epoch that controls EARLY STOPPING 40 | max_seq_length : 50 # the maximum number of records in the history 41 | hidden_size : 40 # the shape of hidden size used in RNN 42 | attention_size : 40 # the shape of attention size 43 | need_sample: True # whether to perform dynamic negative sampling in mini-batch 44 | train_num_ngs: 4 # indicates how many negative instances followed by one positive instances if need_sample is True 45 | 46 | #show info 47 | #metric :'auc', 'logloss', 'group_auc' 48 | info: 49 | show_step : 100 # print training information after a certain number of mini-batch 50 | save_model: True # whether to save models 51 | save_epoch : 1 # if save_model is set to True, save the model every save_epoch. 52 | metrics : ['auc','logloss'] # metrics for evaluation. 
53 | pairwise_metrics : ['mean_mrr', 'ndcg@2;4;6', 'hit@2;4;6', "group_auc"] # pairwise metrics for evaluation, available when pairwise comparisons are needed 54 | MODEL_DIR : ./tests/resources/deeprec/a2svd/model/a2svd_model/ # directory of saved models. 55 | SUMMARIES_DIR : ./tests/resources/deeprec/a2svd/summary/a2svd_summary/ # directory of saved summaries. 56 | write_tfevents : True # whether to save summaries. 57 | 58 | 59 | 60 | -------------------------------------------------------------------------------- /reco_utils/recommender/deeprec/config/din.yaml: -------------------------------------------------------------------------------- 1 | #data 2 | #data format:sequential model 3 | data: 4 | user_vocab : ./tests/resources/deeprec/slirec/user_vocab.pkl # the map file of user to id 5 | item_vocab : ./tests/resources/deeprec/slirec/item_vocab.pkl # the map file of item to id 6 | cate_vocab : ./tests/resources/deeprec/slirec/category_vocab.pkl # the map file of category to id 7 | 8 | #model 9 | model: 10 | method : classification # classification or regression 11 | model_type : DIN 12 | layer_sizes : [100, 64] # layers' size of DNN. In this example, DNN has two layers, and each layer has 100 hidden nodes. 13 | att_fcn_layer_sizes : [80, 40] 14 | activation : [relu, relu] # activation function for DNN 15 | user_dropout: False 16 | dropout : [0.0, 0.0] #drop out values for DNN layer 17 | embedding_dropout : 0.0 18 | item_embedding_dim : 32 # the embedding dimension of items 19 | cate_embedding_dim : 8 # the embedding dimension of categories 20 | user_embedding_dim : 16 # the embedding dimension of users 21 | 22 | #train 23 | #init_method: normal,tnormal,uniform,he_normal,he_uniform,xavier_normal,xavier_uniform 24 | train: 25 | init_method: tnormal # method for initializing model parameters 26 | init_value : 0.01 # stddev values for initializing model parameters 27 | embed_l2 : 0.0001 # l2 regularization for embedding parameters 28 | embed_l1 : 0.0000 # l1 regularization for embedding parameters 29 | layer_l2 : 0.0001 # l2 regularization for hidden layer parameters 30 | layer_l1 : 0.0000 # l1 regularization for hidden layer parameters 31 | cross_l2 : 0.0000 # l2 regularization for cross layer parameters 32 | cross_l1 : 0.000 # l1 regularization for cross layer parameters 33 | learning_rate : 0.001 34 | loss : softmax # pointwise: log_loss, cross_entropy_loss, square_loss pairwise: softmax 35 | optimizer : adam # adam, adadelta, sgd, ftrl, gd, padagrad, pgd, rmsprop, lazyadam 36 | epochs : 50 # number of epoch for training 37 | batch_size : 400 # batch size, should be constrained as an integer multiple of the number of (1 + train_num_ngs) when need_sample is True 38 | enable_BN : True # whether to use batch normalization in hidden layers 39 | EARLY_STOP : 10 # the number of epoch that controls EARLY STOPPING 40 | max_seq_length : 50 # the maximum number of records in the history 41 | hidden_size : 40 # the shape of hidden size used in RNN 42 | attention_size : 40 # the shape of attention size 43 | need_sample: True # whether to perform dynamic negative sampling in mini-batch 44 | train_num_ngs: 4 # indicates how many negative instances followed by one positive instances if need_sample is True 45 | 46 | #show info 47 | #metric :'auc', 'logloss', 'group_auc' 48 | info: 49 | show_step : 100 # print training information after a certain number of mini-batch 50 | save_model: True # whether to save models 51 | save_epoch : 1 # if save_model is set to True, save the model every save_epoch. 
52 | metrics : ['auc','logloss'] # metrics for evaluation. 53 | pairwise_metrics : ['mean_mrr', 'ndcg@2;4;6', 'hit@2;4;6', "group_auc"] # pairwise metrics for evaluation, available when pairwise comparisons are needed 54 | MODEL_DIR : ./tests/resources/deeprec/a2svd/model/a2svd_model/ # directory of saved models. 55 | SUMMARIES_DIR : ./tests/resources/deeprec/a2svd/summary/a2svd_summary/ # directory of saved summaries. 56 | write_tfevents : True # whether to save summaries. 57 | 58 | 59 | 60 | -------------------------------------------------------------------------------- /reco_utils/recommender/deeprec/config/gru4rec.yaml: -------------------------------------------------------------------------------- 1 | #data 2 | #data format:sequential model 3 | data: 4 | user_vocab : ./tests/resources/deeprec/sequential/yelp/user_vocab.pkl # the map file of user to id 5 | item_vocab : ./tests/resources/deeprec/sequential/yelp/item_vocab.pkl # the map file of item to id 6 | cate_vocab : ./tests/resources/deeprec/sequential/yelp/category_vocab.pkl # the map file of category to id 7 | 8 | #model 9 | model: 10 | method : classification # classification or regression 11 | model_type : GRU4Rec 12 | layer_sizes : [100, 64] # layers' size of DNN. In this example, DNN has two layers, and each layer has 100 hidden nodes. 13 | activation : [relu, relu] # activation function for DNN 14 | user_dropout: False 15 | dropout : [0.0, 0.0] #drop out values for DNN layer 16 | embedding_dropout : 0.0 17 | item_embedding_dim : 32 # the embedding dimension of items 18 | cate_embedding_dim : 8 # the embedding dimension of categories 19 | user_embedding_dim : 16 # the embedding dimension of users 20 | 21 | #train 22 | #init_method: normal,tnormal,uniform,he_normal,he_uniform,xavier_normal,xavier_uniform 23 | train: 24 | init_method: tnormal # method for initializing model parameters 25 | init_value : 0.01 # stddev values for initializing model parameters 26 | embed_l2 : 0.0001 # l2 regularization for embedding parameters 27 | embed_l1 : 0.0000 # l1 regularization for embedding parameters 28 | layer_l2 : 0.0001 # l2 regularization for hidden layer parameters 29 | layer_l1 : 0.0000 # l1 regularization for hidden layer parameters 30 | cross_l2 : 0.0000 # l2 regularization for cross layer parameters 31 | cross_l1 : 0.000 # l1 regularization for cross layer parameters 32 | learning_rate : 0.001 33 | loss : softmax # pointwise: log_loss, cross_entropy_loss, square_loss pairwise: softmax 34 | optimizer : adam # adam, adadelta, sgd, ftrl, gd, padagrad, pgd, rmsprop, lazyadam 35 | epochs : 50 # number of epoch for training 36 | batch_size : 400 # batch size, should be constrained as an integer multiple of the number of (1 + train_num_ngs) when need_sample is True 37 | enable_BN : True # whether to use batch normalization in hidden layers 38 | EARLY_STOP : 10 # the number of epoch that controls EARLY STOPPING 39 | max_seq_length : 50 # the maximum number of records in the history sequence 40 | hidden_size : 40 # the shape of hidden size used in RNN 41 | need_sample: True # whether to perform dynamic negative sampling in mini-batch 42 | train_num_ngs: 4 # indicates how many negative instances followed by one positive instances if need_sample is True 43 | 44 | #show info 45 | #metric :'auc', 'logloss', 'group_auc' 46 | info: 47 | show_step : 100 # print training information after a certain number of mini-batch 48 | save_model: True # whether to save models 49 | save_epoch : 1 # if save_model is set to True, save the model every 
save_epoch. 50 | metrics : ['auc','logloss'] # metrics for evaluation. 51 | pairwise_metrics : ['mean_mrr', 'ndcg@2;4;6', 'hit@2;4;6', "group_auc"] # pairwise metrics for evaluation, available when pairwise comparisons are needed 52 | MODEL_DIR : ./tests/resources/deeprec/gru4rec/model/gru4rec_model/ # directory of saved models. 53 | SUMMARIES_DIR : ./tests/resources/deeprec/gru4rec/summary/gru4rec_summary/ # directory of saved summaries. 54 | write_tfevents : True # whether to save summaries. 55 | 56 | 57 | 58 | -------------------------------------------------------------------------------- /reco_utils/recommender/deeprec/config/lgn.yaml: -------------------------------------------------------------------------------- 1 | #data 2 | #data format:sequential model 3 | data: 4 | user_vocab : ./tests/resources/deeprec/slirec/user_vocab.pkl # the map file of user to id 5 | item_vocab : ./tests/resources/deeprec/slirec/item_vocab.pkl # the map file of item to id 6 | cate_vocab : ./tests/resources/deeprec/slirec/category_vocab.pkl # the map file of category to id 7 | 8 | #model 9 | model: 10 | method : classification # classification or regression 11 | model_type : LGN 12 | layer_sizes : [100, 64] # layers' size of DNN. In this example, DNN has two layers, and each layer has 100 hidden nodes. 13 | att_fcn_layer_sizes : [80, 40] 14 | ncf_layer_sizes : [80, 40] 15 | activation : [relu, relu] # activation function for DNN 16 | user_dropout: False 17 | dropout : [0.0, 0.0] #drop out values for DNN layer 18 | embedding_dropout : 0.0 19 | item_embedding_dim : 32 # the embedding dimension of items 20 | cate_embedding_dim : 8 # the embedding dimension of categories 21 | user_embedding_dim : 40 # the embedding dimension of users 22 | 23 | #train 24 | #init_method: normal,tnormal,uniform,he_normal,he_uniform,xavier_normal,xavier_uniform 25 | train: 26 | init_method: tnormal # method for initializing model parameters 27 | init_value : 0.01 # stddev values for initializing model parameters 28 | embed_l2 : 0.0001 # l2 regularization for embedding parameters 29 | embed_l1 : 0.0000 # l1 regularization for embedding parameters 30 | layer_l2 : 0.0001 # l2 regularization for hidden layer parameters 31 | layer_l1 : 0.0000 # l1 regularization for hidden layer parameters 32 | cross_l2 : 0.0000 # l2 regularization for cross layer parameters 33 | cross_l1 : 0.000 # l1 regularization for cross layer parameters 34 | learning_rate : 0.001 35 | loss : softmax # pointwise: log_loss, cross_entropy_loss, square_loss pairwise: softmax 36 | optimizer : adam # adam, adadelta, sgd, ftrl, gd, padagrad, pgd, rmsprop, lazyadam 37 | epochs : 50 # number of epoch for training 38 | batch_size : 400 # batch size, should be constrained as an integer multiple of the number of (1 + train_num_ngs) when need_sample is True 39 | enable_BN : True # whether to use batch normalization in hidden layers 40 | EARLY_STOP : 10 # the number of epoch that controls EARLY STOPPING 41 | max_seq_length : 50 # the maximum number of records in the history 42 | hidden_size : 40 # the shape of hidden size used in RNN 43 | attention_size : 40 # the shape of attention size 44 | need_sample: True # whether to perform dynamic negative sampling in mini-batch 45 | train_num_ngs: 4 # indicates how many negative instances followed by one positive instances if need_sample is True 46 | 47 | #show info 48 | #metric :'auc', 'logloss', 'group_auc' 49 | info: 50 | show_step : 100 # print training information after a certain number of mini-batch 51 | save_model: True # 
whether to save models 52 | save_epoch : 1 # if save_model is set to True, save the model every save_epoch. 53 | metrics : ['auc','logloss'] # metrics for evaluation. 54 | pairwise_metrics : ['mean_mrr', 'ndcg@2;4;6', 'hit@2;4;6', "group_auc"] # pairwise metrics for evaluation, available when pairwise comparisons are needed 55 | MODEL_DIR : ./tests/resources/deeprec/a2svd/model/a2svd_model/ # directory of saved models. 56 | SUMMARIES_DIR : ./tests/resources/deeprec/a2svd/summary/a2svd_summary/ # directory of saved summaries. 57 | write_tfevents : True # whether to save summaries. 58 | 59 | 60 | 61 | -------------------------------------------------------------------------------- /reco_utils/recommender/deeprec/config/lightgcn.yaml: -------------------------------------------------------------------------------- 1 | #model 2 | model: 3 | model_type : "lightgcn" 4 | embed_size : 64 # the embedding dimension of users and items 5 | n_layers : 3 # number of layers of the model 6 | 7 | #train 8 | train: 9 | batch_size : 1024 10 | decay : 0.0001 # l2 regularization for embedding parameters 11 | epochs : 1000 # number of epochs for training 12 | learning_rate : 0.001 13 | eval_epoch : -1 # if it is not -1, evaluate the model every eval_epoch; -1 means that evaluation will not be performed during training 14 | top_k : 20 # number of items to recommend when calculating evaluation metrics 15 | 16 | #show info 17 | #metric : "recall", "ndcg", "precision", "map" 18 | info: 19 | save_model : False # whether to save model 20 | save_epoch : 100 # if save_model is set to True, save the model every save_epoch 21 | metrics : ["recall", "ndcg", "precision", "map"] # metrics for evaluation 22 | MODEL_DIR : ./tests/resources/deeprec/lightgcn/model/lightgcn_model/ # directory of saved models 23 | -------------------------------------------------------------------------------- /reco_utils/recommender/deeprec/config/ncf.yaml: -------------------------------------------------------------------------------- 1 | #data 2 | #data format:sequential model 3 | data: 4 | user_vocab : ./tests/resources/deeprec/slirec/user_vocab.pkl # the map file of user to id 5 | item_vocab : ./tests/resources/deeprec/slirec/item_vocab.pkl # the map file of item to id 6 | cate_vocab : ./tests/resources/deeprec/slirec/category_vocab.pkl # the map file of category to id 7 | 8 | #model 9 | model: 10 | method : classification # classification or regression 11 | model_type : NCF 12 | layer_sizes : [100, 64] # layers' size of DNN. In this example, DNN has two layers, and each layer has 100 hidden nodes. 
13 | att_fcn_layer_sizes : [80, 40] 14 | ncf_layer_sizes : [80, 40] 15 | activation : [relu, relu] # activation function for DNN 16 | user_dropout: False 17 | dropout : [0.0, 0.0] #drop out values for DNN layer 18 | embedding_dropout : 0.0 19 | item_embedding_dim : 32 # the embedding dimension of items 20 | cate_embedding_dim : 8 # the embedding dimension of categories 21 | user_embedding_dim : 40 # the embedding dimension of users 22 | 23 | #train 24 | #init_method: normal,tnormal,uniform,he_normal,he_uniform,xavier_normal,xavier_uniform 25 | train: 26 | init_method: tnormal # method for initializing model parameters 27 | init_value : 0.01 # stddev values for initializing model parameters 28 | embed_l2 : 0.0001 # l2 regularization for embedding parameters 29 | embed_l1 : 0.0000 # l1 regularization for embedding parameters 30 | layer_l2 : 0.0001 # l2 regularization for hidden layer parameters 31 | layer_l1 : 0.0000 # l1 regularization for hidden layer parameters 32 | cross_l2 : 0.0000 # l2 regularization for cross layer parameters 33 | cross_l1 : 0.000 # l1 regularization for cross layer parameters 34 | learning_rate : 0.001 35 | loss : softmax # pointwise: log_loss, cross_entropy_loss, square_loss pairwise: softmax 36 | optimizer : adam # adam, adadelta, sgd, ftrl, gd, padagrad, pgd, rmsprop, lazyadam 37 | epochs : 50 # number of epoch for training 38 | batch_size : 400 # batch size, should be constrained as an integer multiple of the number of (1 + train_num_ngs) when need_sample is True 39 | enable_BN : True # whether to use batch normalization in hidden layers 40 | EARLY_STOP : 10 # the number of epoch that controls EARLY STOPPING 41 | max_seq_length : 50 # the maximum number of records in the history 42 | hidden_size : 40 # the shape of hidden size used in RNN 43 | attention_size : 40 # the shape of attention size 44 | need_sample: True # whether to perform dynamic negative sampling in mini-batch 45 | train_num_ngs: 4 # indicates how many negative instances followed by one positive instances if need_sample is True 46 | 47 | #show info 48 | #metric :'auc', 'logloss', 'group_auc' 49 | info: 50 | show_step : 100 # print training information after a certain number of mini-batch 51 | save_model: True # whether to save models 52 | save_epoch : 1 # if save_model is set to True, save the model every save_epoch. 53 | metrics : ['auc','logloss'] # metrics for evaluation. 54 | pairwise_metrics : ['mean_mrr', 'ndcg@2;4;6', 'hit@2;4;6', "group_auc"] # pairwise metrics for evaluation, available when pairwise comparisons are needed 55 | MODEL_DIR : ./tests/resources/deeprec/a2svd/model/a2svd_model/ # directory of saved models. 56 | SUMMARIES_DIR : ./tests/resources/deeprec/a2svd/summary/a2svd_summary/ # directory of saved summaries. 57 | write_tfevents : True # whether to save summaries. 
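# Note on the batch_size / train_num_ngs interplay above: when need_sample is True, each
# positive instance is grouped with train_num_ngs sampled negatives, so batch_size must be
# an integer multiple of (1 + train_num_ngs). With the values in this file, 400 % (1 + 4) == 0,
# i.e. every mini-batch holds 80 groups of one positive followed by four negatives.
# A quick sanity check in plain Python (illustrative only):
#
#   batch_size, train_num_ngs = 400, 4
#   assert batch_size % (1 + train_num_ngs) == 0, "batch_size must be a multiple of (1 + train_num_ngs)"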
58 | 59 | 60 | 61 | -------------------------------------------------------------------------------- /reco_utils/recommender/deeprec/config/nextitnet.yaml: -------------------------------------------------------------------------------- 1 | #data 2 | #data format:sequential model 3 | data: 4 | user_vocab : ./tests/resources/deeprec/slirec/user_vocab.pkl # the map file of user to id 5 | item_vocab : ./tests/resources/deeprec/slirec/item_vocab.pkl # the map file of item to id 6 | cate_vocab : ./tests/resources/deeprec/slirec/category_vocab.pkl # the map file of category to id 7 | 8 | #model 9 | model: 10 | method : classification # classification or regression 11 | model_type : NextItNet 12 | layer_sizes : [100, 64] # layers' size of DNN. In this example, DNN has two layers, and each layer has 100 hidden nodes. 13 | activation : [relu, relu] # activation function for DNN 14 | user_dropout: True 15 | dropout : [0.3, 0.3] #drop out values for DNN layer 16 | item_embedding_dim : 32 # the embedding dimension of items 17 | cate_embedding_dim : 8 # the embedding dimension of categories 18 | user_embedding_dim : 16 # the embedding dimension of users 19 | 20 | #train 21 | #init_method: normal,tnormal,uniform,he_normal,he_uniform,xavier_normal,xavier_uniform 22 | train: 23 | init_method: tnormal # method for initializing model parameters 24 | init_value : 0.01 # stddev values for initializing model parameters 25 | embed_l2 : 0.0001 # l2 regularization for embedding parameters 26 | embed_l1 : 0.0000 # l1 regularization for embedding parameters 27 | layer_l2 : 0.0001 # l2 regularization for hidden layer parameters 28 | layer_l1 : 0.0000 # l1 regularization for hidden layer parameters 29 | cross_l2 : 0.0000 # l2 regularization for cross layer parameters 30 | cross_l1 : 0.000 # l1 regularization for cross layer parameters 31 | learning_rate : 0.001 32 | loss : softmax # pointwise: log_loss, cross_entropy_loss, square_loss pairwise: softmax 33 | optimizer : lazyadam # adam, adadelta, sgd, ftrl, gd, padagrad, pgd, rmsprop, lazyadam 34 | epochs : 50 # number of epoch for training 35 | batch_size : 400 # batch size, should be constrained as an integer multiple of the number of (1 + train_num_ngs) when need_sample is True 36 | enable_BN : True # whether to use batch normalization in hidden layers 37 | EARLY_STOP : 10 # the number of epoch that controls EARLY STOPPING 38 | max_seq_length : 50 # the maximum number of records in the history sequence 39 | need_sample : True # whether to perform dynamic negative sampling in mini-batch 40 | train_num_ngs : 4 # indicates how many negative instances followed by one positive instances if need_sample is True 41 | 42 | min_seq_length : 3 # the minimum number of records in the history sequence 43 | dilations : [1, 2, 4, 1, 2, 4] # dilations in each delated CNN layer 44 | kernel_size : 3 # kernel size in each delated CNN layer 45 | 46 | #show info 47 | #metric :'auc', 'logloss', 'group_auc' 48 | info: 49 | show_step : 100 # print training information after a certain number of mini-batch 50 | save_model: True # whether to save models 51 | save_epoch : 1 # if save_model is set to True, save the model every save_epoch. 52 | metrics : ['auc','logloss'] # metrics for evaluation. 53 | pairwise_metrics : ['mean_mrr', 'ndcg@2;4;6', "group_auc"] # pairwise metrics for evaluation, available when pairwise comparisons are needed 54 | MODEL_DIR : ./tests/resources/deeprec/nextitnet/model/nextitnet_model/ # directory of saved models. 
55 | SUMMARIES_DIR : ./tests/resources/deeprec/nextitnet/summary/nextitnet_summary/ # directory of saved summaries. 56 | write_tfevents : True # whether to save summaries. 57 | 58 | 59 | 60 | -------------------------------------------------------------------------------- /reco_utils/recommender/deeprec/config/sli_rec.yaml: -------------------------------------------------------------------------------- 1 | #data 2 | #data format:sequential model 3 | data: 4 | user_vocab : ./tests/resources/deeprec/sequential/yelp/user_vocab.pkl # the map file of user to id 5 | item_vocab : ./tests/resources/deeprec/sequential/yelp/item_vocab.pkl # the map file of item to id 6 | cate_vocab : ./tests/resources/deeprec/sequential/yelp/category_vocab.pkl # the map file of category to id 7 | 8 | #model 9 | model: 10 | method : classification # classification or regression 11 | model_type : sli_rec 12 | layer_sizes : [100, 64] # layers' size of DNN. In this example, DNN has two layers, and each layer has 100 hidden nodes. 13 | att_fcn_layer_sizes : [80, 40] 14 | activation : [relu, relu] # activation function for DNN 15 | user_dropout: False 16 | dropout : [0.0, 0.0] #drop out values for DNN layer 17 | embedding_dropout : 0.0 18 | item_embedding_dim : 32 # the embedding dimension of items 19 | cate_embedding_dim : 8 # the embedding dimension of categories 20 | user_embedding_dim : 16 # the embedding dimension of users 21 | 22 | #train 23 | #init_method: normal,tnormal,uniform,he_normal,he_uniform,xavier_normal,xavier_uniform 24 | train: 25 | init_method: tnormal # method for initializing model parameters 26 | init_value : 0.01 # stddev values for initializing model parameters 27 | embed_l2 : 0.0001 # l2 regularization for embedding parameters 28 | embed_l1 : 0.0000 # l1 regularization for embedding parameters 29 | layer_l2 : 0.0001 # l2 regularization for hidden layer parameters 30 | layer_l1 : 0.0000 # l1 regularization for hidden layer parameters 31 | cross_l2 : 0.0000 # l2 regularization for cross layer parameters 32 | cross_l1 : 0.000 # l1 regularization for cross layer parameters 33 | learning_rate : 0.001 34 | loss : softmax # pointwise: log_loss, cross_entropy_loss, square_loss pairwise: softmax 35 | optimizer : adam # adam, adadelta, sgd, ftrl, gd, padagrad, pgd, rmsprop, lazyadam 36 | epochs : 50 # number of epoch for training 37 | batch_size : 400 # batch size, should be constrained as an integer multiple of the number of (1 + train_num_ngs) when need_sample is True 38 | enable_BN : True # whether to use batch normalization in hidden layers 39 | EARLY_STOP : 10 # the number of epoch that controls EARLY STOPPING 40 | max_seq_length : 50 # the maximum number of records in the history sequence 41 | hidden_size : 40 # the shape of hidden size used in RNN 42 | attention_size : 40 # the shape of attention size 43 | need_sample: True # whether to perform dynamic negative sampling in mini-batch 44 | train_num_ngs: 4 # indicates how many negative instances followed by one positive instances if need_sample is True 45 | 46 | #show info 47 | #metric :'auc', 'logloss', 'group_auc' 48 | info: 49 | show_step : 100 # print training information after a certain number of mini-batch 50 | save_model: True # whether to save modl 51 | save_epoch : 1 # if save_model is set to True, save the model every save_epoch. 52 | metrics : ['auc','logloss'] # metrics for evaluation. 
53 |     pairwise_metrics : ['mean_mrr', 'ndcg@2;4;6', 'hit@2;4;6', "group_auc"] # pairwise metrics for evaluation, available when pairwise comparisons are needed
54 |     MODEL_DIR : ./tests/resources/deeprec/slirec/model/slirec_model/ # directory of saved models.
55 |     SUMMARIES_DIR : ./tests/resources/deeprec/slirec/summary/slirec_summary/ # directory of saved summaries.
56 |     write_tfevents : True # whether to save summaries.
57 | 
58 | 
59 | 
60 | 
--------------------------------------------------------------------------------
/reco_utils/recommender/deeprec/io/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tsinghua-fib-lab/CLSR/92c6c7e069374fa2d7217f3a0987f64ce47e7cad/reco_utils/recommender/deeprec/io/__init__.py
--------------------------------------------------------------------------------
/reco_utils/recommender/deeprec/io/dkn_iterator.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation. All rights reserved.
2 | # Licensed under the MIT License.
3 | 
4 | import tensorflow as tf
5 | import numpy as np
6 | 
7 | from reco_utils.recommender.deeprec.io.iterator import BaseIterator
8 | 
9 | 
10 | __all__ = ["DKNTextIterator"]
11 | 
12 | 
13 | class DKNTextIterator(BaseIterator):
14 |     """Data loader for the DKN model.
15 |     DKN requires a special type of data format, where each instance contains a label, the candidate news article,
16 |     and the user's clicked news articles. Articles are represented by title words and title entities. Words and entities
17 |     are aligned.
18 | 
19 |     The iterator will not load the whole data into memory. Instead, it loads data into memory
20 |     per mini-batch, so that large files can be used as input data.
21 |     """
22 | 
23 |     def __init__(self, hparams, graph, col_spliter=" ", ID_spliter="%"):
24 |         """Initialize an iterator. Create necessary placeholders for the model.
25 | 
26 |         Args:
27 |             hparams (obj): Global hyper-parameters. Some key settings such as #_feature and #_field are there.
28 |             graph (obj): the running graph. All created placeholders will be added to this graph.
29 |             col_spliter (str): column splitter in one line.
30 |             ID_spliter (str): ID splitter in one line.
31 | """ 32 | self.col_spliter = col_spliter 33 | self.ID_spliter = ID_spliter 34 | self.batch_size = hparams.batch_size 35 | self.doc_size = hparams.doc_size 36 | self.history_size = hparams.history_size 37 | 38 | self.graph = graph 39 | with self.graph.as_default(): 40 | self.labels = tf.compat.v1.placeholder(tf.float32, [None, 1], name="label") 41 | self.candidate_news_index_batch = tf.compat.v1.placeholder( 42 | tf.int64, [self.batch_size, self.doc_size], name="candidate_news_index" 43 | ) 44 | self.click_news_index_batch = tf.compat.v1.placeholder( 45 | tf.int64, 46 | [self.batch_size, self.history_size, self.doc_size], 47 | name="click_news_index", 48 | ) 49 | self.candidate_news_entity_index_batch = tf.compat.v1.placeholder( 50 | tf.int64, 51 | [self.batch_size, self.doc_size], 52 | name="candidate_news_entity_index", 53 | ) 54 | self.click_news_entity_index_batch = tf.compat.v1.placeholder( 55 | tf.int64, 56 | [self.batch_size, self.history_size, self.doc_size], 57 | name="click_news_entity_index", 58 | ) 59 | self.news_word_index = {} 60 | self.news_entity_index = {} 61 | with tf.io.gfile.GFile(hparams.news_feature_file, "r") as rd: 62 | for line in rd: 63 | newsid, word_index, entity_index = line.strip().split(col_spliter) 64 | self.news_word_index[newsid] = [ 65 | int(item) for item in word_index.split(",") 66 | ] 67 | self.news_entity_index[newsid] = [ 68 | int(item) for item in entity_index.split(",") 69 | ] 70 | self.user_history = {} 71 | with tf.io.gfile.GFile(hparams.user_history_file, "r") as rd: 72 | for line in rd: 73 | if len(line.strip().split(col_spliter)) == 1: 74 | userid = line.strip() 75 | user_history = [] 76 | else: 77 | userid, user_history_string = line.strip().split(col_spliter) 78 | user_history = user_history_string.split(",") 79 | click_news_index = [] 80 | click_news_entity_index = [] 81 | if len(user_history) > self.history_size: 82 | user_history = user_history[-self.history_size :] 83 | for newsid in user_history: 84 | click_news_index.append(self.news_word_index[newsid]) 85 | click_news_entity_index.append(self.news_entity_index[newsid]) 86 | for i in range(self.history_size - len(user_history)): 87 | click_news_index.append(np.zeros(self.doc_size)) 88 | click_news_entity_index.append(np.zeros(self.doc_size)) 89 | self.user_history[userid] = (click_news_index, click_news_entity_index) 90 | 91 | def parser_one_line(self, line): 92 | """Parse one string line into feature values. 
93 | 94 | Args: 95 | line (str): a string indicating one instance 96 | 97 | Returns: 98 | list: Parsed results including label, candidate_news_index, candidate_news_val, click_news_index, click_news_val, 99 | candidate_news_entity_index, click_news_entity_index, impression_id 100 | 101 | """ 102 | impression_id = 0 103 | words = line.strip().split(self.ID_spliter) 104 | if len(words) == 2: 105 | impression_id = words[1].strip() 106 | 107 | cols = words[0].strip().split(self.col_spliter) 108 | label = float(cols[0]) 109 | 110 | userid = cols[1] 111 | candidate_news = cols[2] 112 | 113 | candidate_news_index = self.news_word_index[candidate_news] 114 | candidate_news_entity_index = self.news_entity_index[candidate_news] 115 | click_news_index = self.user_history[userid][0] 116 | click_news_entity_index = self.user_history[userid][1] 117 | 118 | return ( 119 | label, 120 | candidate_news_index, 121 | click_news_index, 122 | candidate_news_entity_index, 123 | click_news_entity_index, 124 | impression_id, 125 | ) 126 | 127 | def load_data_from_file(self, infile): 128 | """Read and parse data from a file. 129 | 130 | Args: 131 | infile (str): text input file. Each line in this file is an instance. 132 | 133 | Returns: 134 | obj: An iterator that will yields parsed results, in the format of graph feed_dict. 135 | List: impression id list 136 | Int: size of the data in a batch 137 | """ 138 | candidate_news_index_batch = [] 139 | click_news_index_batch = [] 140 | candidate_news_entity_index_batch = [] 141 | click_news_entity_index_batch = [] 142 | label_list = [] 143 | impression_id_list = [] 144 | cnt = 0 145 | 146 | with tf.io.gfile.GFile(infile, "r") as rd: 147 | for line in rd: 148 | ( 149 | label, 150 | candidate_news_index, 151 | click_news_index, 152 | candidate_news_entity_index, 153 | click_news_entity_index, 154 | impression_id, 155 | ) = self.parser_one_line(line) 156 | 157 | candidate_news_index_batch.append(candidate_news_index) 158 | click_news_index_batch.append(click_news_index) 159 | candidate_news_entity_index_batch.append(candidate_news_entity_index) 160 | click_news_entity_index_batch.append(click_news_entity_index) 161 | label_list.append(label) 162 | impression_id_list.append(impression_id) 163 | 164 | cnt += 1 165 | if cnt >= self.batch_size: 166 | res = self._convert_data( 167 | label_list, 168 | candidate_news_index_batch, 169 | click_news_index_batch, 170 | candidate_news_entity_index_batch, 171 | click_news_entity_index_batch, 172 | impression_id_list, 173 | ) 174 | data_size = self.batch_size 175 | yield self.gen_feed_dict(res), impression_id_list, data_size 176 | candidate_news_index_batch = [] 177 | click_news_index_batch = [] 178 | candidate_news_entity_index_batch = [] 179 | click_news_entity_index_batch = [] 180 | label_list = [] 181 | impression_id_list = [] 182 | cnt = 0 183 | if cnt > 0: 184 | data_size = cnt 185 | while cnt < self.batch_size: 186 | candidate_news_index_batch.append( 187 | candidate_news_index_batch[cnt % data_size] 188 | ) 189 | click_news_index_batch.append( 190 | click_news_index_batch[cnt % data_size] 191 | ) 192 | candidate_news_entity_index_batch.append( 193 | candidate_news_entity_index_batch[cnt % data_size] 194 | ) 195 | click_news_entity_index_batch.append( 196 | click_news_entity_index_batch[cnt % data_size] 197 | ) 198 | label_list.append(label_list[cnt % data_size]) 199 | impression_id_list.append(impression_id_list[cnt % data_size]) 200 | cnt += 1 201 | res = self._convert_data( 202 | label_list, 203 | candidate_news_index_batch, 
204 | click_news_index_batch, 205 | candidate_news_entity_index_batch, 206 | click_news_entity_index_batch, 207 | impression_id_list, 208 | ) 209 | yield self.gen_feed_dict(res), impression_id_list, data_size 210 | 211 | def load_infer_data_from_file(self, infile): 212 | """Read and parse data from a file for infer document embedding. 213 | 214 | Args: 215 | infile (str): text input file. Each line in this file is an instance. 216 | 217 | Returns: 218 | obj: An iterator that will yields parsed results, in the format of graph feed_dict. 219 | List: news id list 220 | Int: size of the data in a batch 221 | """ 222 | newsid_list = [] 223 | candidate_news_index_batch = [] 224 | candidate_news_entity_index_batch = [] 225 | cnt = 0 226 | with tf.io.gfile.GFile(infile, "r") as rd: 227 | for line in rd: 228 | newsid, word_index, entity_index = line.strip().split(" ") 229 | newsid_list.append(newsid) 230 | candidate_news_index = [] 231 | candidate_news_entity_index = [] 232 | for item in word_index.split(","): 233 | candidate_news_index.append(int(item)) 234 | for item in entity_index.split(","): 235 | candidate_news_entity_index.append(int(item)) 236 | 237 | candidate_news_index_batch.append(candidate_news_index) 238 | candidate_news_entity_index_batch.append(candidate_news_entity_index) 239 | 240 | cnt += 1 241 | if cnt >= self.batch_size: 242 | res = self._convert_infer_data( 243 | candidate_news_index_batch, candidate_news_entity_index_batch 244 | ) 245 | data_size = self.batch_size 246 | yield self.gen_infer_feed_dict(res), newsid_list, data_size 247 | candidate_news_index_batch = [] 248 | candidate_news_entity_index_batch = [] 249 | newsid_list = [] 250 | cnt = 0 251 | 252 | if cnt > 0: 253 | data_size = cnt 254 | while cnt < self.batch_size: 255 | candidate_news_index_batch.append( 256 | candidate_news_index_batch[cnt % data_size] 257 | ) 258 | candidate_news_entity_index_batch.append( 259 | candidate_news_entity_index_batch[cnt % data_size] 260 | ) 261 | cnt += 1 262 | res = self._convert_infer_data( 263 | candidate_news_index_batch, candidate_news_entity_index_batch 264 | ) 265 | yield self.gen_infer_feed_dict(res), newsid_list, data_size 266 | 267 | def _convert_data( 268 | self, 269 | label_list, 270 | candidate_news_index_batch, 271 | click_news_index_batch, 272 | candidate_news_entity_index_batch, 273 | click_news_entity_index_batch, 274 | impression_id_list, 275 | ): 276 | """Convert data into numpy arrays that are good for further model operation. 277 | 278 | Args: 279 | label_list (list): a list of ground-truth labels. 280 | candidate_news_index_batch (list): the candidate news article's words indices 281 | click_news_index_batch (list): words indices for user's clicked news articles 282 | candidate_news_entity_index_batch (list): the candidate news article's entities indices 283 | click_news_entity_index_batch (list): the user's clicked news article's entities indices 284 | impression_id_list (list) : the session's impression indices 285 | 286 | Returns: 287 | dict: A dictionary, contains multiple numpy arrays that are convenient for further operation. 
288 | """ 289 | res = {} 290 | res["labels"] = np.asarray([[label] for label in label_list], dtype=np.float32) 291 | res["candidate_news_index_batch"] = np.asarray( 292 | candidate_news_index_batch, dtype=np.int64 293 | ) 294 | res["click_news_index_batch"] = np.asarray( 295 | click_news_index_batch, dtype=np.int64 296 | ) 297 | res["candidate_news_entity_index_batch"] = np.asarray( 298 | candidate_news_entity_index_batch, dtype=np.int64 299 | ) 300 | res["click_news_entity_index_batch"] = np.asarray( 301 | click_news_entity_index_batch, dtype=np.int64 302 | ) 303 | res["impression_id"] = np.asarray(impression_id_list, dtype=np.int64) 304 | return res 305 | 306 | def _convert_infer_data( 307 | self, candidate_news_index_batch, candidate_news_entity_index_batch 308 | ): 309 | """Convert data into numpy arrays that are good for further model operation. 310 | 311 | Args: 312 | candidate_news_index_batch (list): the candidate news article's words indices 313 | candidate_news_entity_index_batch (list): the candidate news article's entities indices 314 | Returns: 315 | dict: A dictionary, contains multiple numpy arrays that are convenient for further operation. 316 | """ 317 | res = {} 318 | res["candidate_news_index_batch"] = np.asarray( 319 | candidate_news_index_batch, dtype=np.int64 320 | ) 321 | res["candidate_news_entity_index_batch"] = np.asarray( 322 | candidate_news_entity_index_batch, dtype=np.int64 323 | ) 324 | return res 325 | 326 | def gen_feed_dict(self, data_dict): 327 | """Construct a dictionary that maps graph elements to values. 328 | 329 | Args: 330 | data_dict (dict): a dictionary that maps string name to numpy arrays. 331 | 332 | Returns: 333 | dict: a dictionary that maps graph elements to numpy arrays. 334 | 335 | """ 336 | feed_dict = { 337 | self.labels: data_dict["labels"].reshape([-1, 1]), 338 | self.candidate_news_index_batch: data_dict[ 339 | "candidate_news_index_batch" 340 | ].reshape([self.batch_size, self.doc_size]), 341 | self.click_news_index_batch: data_dict["click_news_index_batch"].reshape( 342 | [self.batch_size, self.history_size, self.doc_size] 343 | ), 344 | self.candidate_news_entity_index_batch: data_dict[ 345 | "candidate_news_entity_index_batch" 346 | ].reshape([-1, self.doc_size]), 347 | self.click_news_entity_index_batch: data_dict[ 348 | "click_news_entity_index_batch" 349 | ].reshape([-1, self.history_size, self.doc_size]), 350 | } 351 | return feed_dict 352 | 353 | def gen_infer_feed_dict(self, data_dict): 354 | """Construct a dictionary that maps graph elements to values. 355 | 356 | Args: 357 | data_dict (dict): a dictionary that maps string name to numpy arrays. 358 | 359 | Returns: 360 | dict: a dictionary that maps graph elements to numpy arrays. 361 | 362 | """ 363 | feed_dict = { 364 | self.candidate_news_index_batch: data_dict[ 365 | "candidate_news_index_batch" 366 | ].reshape([self.batch_size, self.doc_size]), 367 | self.candidate_news_entity_index_batch: data_dict[ 368 | "candidate_news_entity_index_batch" 369 | ].reshape([-1, self.doc_size]), 370 | } 371 | return feed_dict 372 | 373 | -------------------------------------------------------------------------------- /reco_utils/recommender/deeprec/io/iterator.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 
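# Input format note for the FFMTextIterator defined later in this file: judging from
# parser_one_line(), each text line is "<label> <field>:<feature>:<value> ..." with 1-based
# field and feature indices (shifted to 0-based while parsing), optionally followed by
# "%<impression_id>". A hypothetical example line and its parsed form (illustrative only):
#
#   1 1:13:1.0 2:87:1.0 3:542:0.5%2001
#
#   label         -> 1.0
#   features      -> [[0, 12, 1.0], [1, 86, 1.0], [2, 541, 0.5]]  # [field_idx, feature_idx, value]
#   impression_id -> "2001"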
3 | 4 | import numpy as np 5 | import tensorflow as tf 6 | import abc 7 | 8 | 9 | class BaseIterator(object): 10 | @abc.abstractmethod 11 | def parser_one_line(self, line): 12 | pass 13 | 14 | @abc.abstractmethod 15 | def load_data_from_file(self, infile): 16 | pass 17 | 18 | @abc.abstractmethod 19 | def _convert_data(self, labels, features): 20 | pass 21 | 22 | @abc.abstractmethod 23 | def gen_feed_dict(self, data_dict): 24 | pass 25 | 26 | 27 | class FFMTextIterator(BaseIterator): 28 | """Data loader for FFM format based models, such as xDeepFM. 29 | Iterator will not load the whole data into memory. Instead, it loads data into memory 30 | per mini-batch, so that large files can be used as input data. 31 | """ 32 | 33 | def __init__(self, hparams, graph, col_spliter=" ", ID_spliter="%"): 34 | """Initialize an iterator. Create necessary placeholders for the model. 35 | 36 | Args: 37 | hparams (obj): Global hyper-parameters. Some key settings such as #_feature and #_field are there. 38 | graph (obj): the running graph. All created placeholder will be added to this graph. 39 | col_spliter (str): column splitter in one line. 40 | ID_spliter (str): ID splitter in one line. 41 | """ 42 | self.feature_cnt = hparams.FEATURE_COUNT 43 | self.field_cnt = hparams.FIELD_COUNT 44 | self.col_spliter = col_spliter 45 | self.ID_spliter = ID_spliter 46 | self.batch_size = hparams.batch_size 47 | 48 | self.graph = graph 49 | with self.graph.as_default(): 50 | self.labels = tf.placeholder(tf.float32, [None, 1], name="label") 51 | self.fm_feat_indices = tf.placeholder( 52 | tf.int64, [None, 2], name="fm_feat_indices" 53 | ) 54 | self.fm_feat_values = tf.placeholder( 55 | tf.float32, [None], name="fm_feat_values" 56 | ) 57 | self.fm_feat_shape = tf.placeholder(tf.int64, [None], name="fm_feat_shape") 58 | self.dnn_feat_indices = tf.placeholder( 59 | tf.int64, [None, 2], name="dnn_feat_indices" 60 | ) 61 | self.dnn_feat_values = tf.placeholder( 62 | tf.int64, [None], name="dnn_feat_values" 63 | ) 64 | self.dnn_feat_weights = tf.placeholder( 65 | tf.float32, [None], name="dnn_feat_weights" 66 | ) 67 | self.dnn_feat_shape = tf.placeholder( 68 | tf.int64, [None], name="dnn_feat_shape" 69 | ) 70 | 71 | def parser_one_line(self, line): 72 | """Parse one string line into feature values. 73 | 74 | Args: 75 | line (str): a string indicating one instance 76 | 77 | Returns: 78 | list: Parsed results,including label, features and impression_id 79 | 80 | """ 81 | impression_id = 0 82 | words = line.strip().split(self.ID_spliter) 83 | if len(words) == 2: 84 | impression_id = words[1].strip() 85 | 86 | cols = words[0].strip().split(self.col_spliter) 87 | 88 | label = float(cols[0]) 89 | 90 | features = [] 91 | for word in cols[1:]: 92 | if not word.strip(): 93 | continue 94 | tokens = word.split(":") 95 | features.append([int(tokens[0]) - 1, int(tokens[1]) - 1, float(tokens[2])]) 96 | 97 | return label, features, impression_id 98 | 99 | def load_data_from_file(self, infile): 100 | """Read and parse data from a file. 101 | 102 | Args: 103 | infile (str): text input file. Each line in this file is an instance. 104 | 105 | Returns: 106 | obj: An iterator that will yields parsed results, in the format of graph feed_dict. 
107 | """ 108 | label_list = [] 109 | features_list = [] 110 | impression_id_list = [] 111 | cnt = 0 112 | 113 | with tf.gfile.GFile(infile, "r") as rd: 114 | for line in rd: 115 | label, features, impression_id = self.parser_one_line(line) 116 | 117 | features_list.append(features) 118 | label_list.append(label) 119 | impression_id_list.append(impression_id) 120 | 121 | cnt += 1 122 | if cnt == self.batch_size: 123 | res = self._convert_data(label_list, features_list) 124 | yield self.gen_feed_dict(res), impression_id_list, self.batch_size 125 | label_list = [] 126 | features_list = [] 127 | impression_id_list = [] 128 | cnt = 0 129 | if cnt > 0: 130 | res = self._convert_data(label_list, features_list) 131 | yield self.gen_feed_dict(res), impression_id_list, cnt 132 | 133 | def _convert_data(self, labels, features): 134 | """Convert data into numpy arrays that are good for further operation. 135 | 136 | Args: 137 | labels (list): a list of ground-truth labels. 138 | features (list): a 3-dimensional list, carrying a list (batch_size) of feature array, 139 | where each feature array is a list of [field_idx, feature_idx, feature_value] tuple. 140 | 141 | Returns: 142 | dict: A dictionary, contains multiple numpy arrays that are convenient for further operation. 143 | """ 144 | dim = self.feature_cnt 145 | FIELD_COUNT = self.field_cnt 146 | instance_cnt = len(labels) 147 | 148 | fm_feat_indices = [] 149 | fm_feat_values = [] 150 | fm_feat_shape = [instance_cnt, dim] 151 | 152 | dnn_feat_indices = [] 153 | dnn_feat_values = [] 154 | dnn_feat_weights = [] 155 | dnn_feat_shape = [instance_cnt * FIELD_COUNT, -1] 156 | 157 | for i in range(instance_cnt): 158 | m = len(features[i]) 159 | dnn_feat_dic = {} 160 | for j in range(m): 161 | fm_feat_indices.append([i, features[i][j][1]]) 162 | fm_feat_values.append(features[i][j][2]) 163 | if features[i][j][0] not in dnn_feat_dic: 164 | dnn_feat_dic[features[i][j][0]] = 0 165 | else: 166 | dnn_feat_dic[features[i][j][0]] += 1 167 | dnn_feat_indices.append( 168 | [ 169 | i * FIELD_COUNT + features[i][j][0], 170 | dnn_feat_dic[features[i][j][0]], 171 | ] 172 | ) 173 | dnn_feat_values.append(features[i][j][1]) 174 | dnn_feat_weights.append(features[i][j][2]) 175 | if dnn_feat_shape[1] < dnn_feat_dic[features[i][j][0]]: 176 | dnn_feat_shape[1] = dnn_feat_dic[features[i][j][0]] 177 | dnn_feat_shape[1] += 1 178 | 179 | sorted_index = sorted( 180 | range(len(dnn_feat_indices)), 181 | key=lambda k: (dnn_feat_indices[k][0], dnn_feat_indices[k][1]), 182 | ) 183 | 184 | res = {} 185 | res["fm_feat_indices"] = np.asarray(fm_feat_indices, dtype=np.int64) 186 | res["fm_feat_values"] = np.asarray(fm_feat_values, dtype=np.float32) 187 | res["fm_feat_shape"] = np.asarray(fm_feat_shape, dtype=np.int64) 188 | res["labels"] = np.asarray([[label] for label in labels], dtype=np.float32) 189 | 190 | res["dnn_feat_indices"] = np.asarray(dnn_feat_indices, dtype=np.int64)[ 191 | sorted_index 192 | ] 193 | res["dnn_feat_values"] = np.asarray(dnn_feat_values, dtype=np.int64)[ 194 | sorted_index 195 | ] 196 | res["dnn_feat_weights"] = np.asarray(dnn_feat_weights, dtype=np.float32)[ 197 | sorted_index 198 | ] 199 | res["dnn_feat_shape"] = np.asarray(dnn_feat_shape, dtype=np.int64) 200 | return res 201 | 202 | def gen_feed_dict(self, data_dict): 203 | """Construct a dictionary that maps graph elements to values. 204 | Args: 205 | data_dict (dict): a dictionary that maps string name to numpy arrays. 
206 | 207 | Returns: 208 | dict: a dictionary that maps graph elements to numpy arrays. 209 | 210 | """ 211 | feed_dict = { 212 | self.labels: data_dict["labels"], 213 | self.fm_feat_indices: data_dict["fm_feat_indices"], 214 | self.fm_feat_values: data_dict["fm_feat_values"], 215 | self.fm_feat_shape: data_dict["fm_feat_shape"], 216 | self.dnn_feat_indices: data_dict["dnn_feat_indices"], 217 | self.dnn_feat_values: data_dict["dnn_feat_values"], 218 | self.dnn_feat_weights: data_dict["dnn_feat_weights"], 219 | self.dnn_feat_shape: data_dict["dnn_feat_shape"], 220 | } 221 | return feed_dict 222 | -------------------------------------------------------------------------------- /reco_utils/recommender/deeprec/io/nextitnet_iterator.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | import tensorflow as tf 5 | import numpy as np 6 | import json 7 | import pickle as pkl 8 | import random 9 | import os 10 | import time 11 | 12 | from reco_utils.recommender.deeprec.io.sequential_iterator import SequentialIterator 13 | from reco_utils.recommender.deeprec.deeprec_utils import load_dict 14 | 15 | __all__ = ["NextItNetIterator"] 16 | 17 | 18 | class NextItNetIterator(SequentialIterator): 19 | """Data loader for the NextItNet model. 20 | NextItNet requires a special type of data format. In training stage, each instance will produce (sequence_length * train_num_ngs) target items and labels, to let NextItNet output predictions of every item in a sequence except only of the last item. 21 | """ 22 | 23 | def __init__(self, hparams, graph, col_spliter="\t"): 24 | """Initialize an iterator. Create necessary placeholders for the model. 25 | * different from sequential iterator 26 | 27 | Args: 28 | hparams (obj): Global hyper-parameters. Some key settings such as #_feature and #_field are there. 29 | graph (obj): the running graph. All created placeholder will be added to this graph. 30 | col_spliter (str): column spliter in one line. 
31 | """ 32 | self.col_spliter = col_spliter 33 | 34 | self.userdict, self.itemdict, self.catedict = ( 35 | load_dict(hparams.user_vocab), 36 | load_dict(hparams.item_vocab), 37 | load_dict(hparams.cate_vocab), 38 | ) 39 | 40 | self.max_seq_length = hparams.max_seq_length 41 | self.batch_size = hparams.batch_size 42 | self.iter_data = dict() 43 | 44 | self.graph = graph 45 | with self.graph.as_default(): 46 | self.labels = tf.placeholder(tf.float32, [None, None], name="label") 47 | self.users = tf.placeholder(tf.int32, [None], name="users") 48 | self.items = tf.placeholder(tf.int32, [None, None], name="items") 49 | self.cates = tf.placeholder(tf.int32, [None, None], name="cates") 50 | self.item_history = tf.placeholder( 51 | tf.int32, [None, self.max_seq_length], name="item_history" 52 | ) 53 | self.item_cate_history = tf.placeholder( 54 | tf.int32, [None, self.max_seq_length], name="item_cate_history" 55 | ) 56 | self.mask = tf.placeholder( 57 | tf.int32, [None, self.max_seq_length], name="mask" 58 | ) 59 | self.time = tf.placeholder(tf.float32, [None], name="time") 60 | self.time_diff = tf.placeholder( 61 | tf.float32, [None, self.max_seq_length], name="time_diff" 62 | ) 63 | self.time_from_first_action = tf.placeholder( 64 | tf.float32, [None, self.max_seq_length], name="time_from_first_action" 65 | ) 66 | self.time_to_now = tf.placeholder( 67 | tf.float32, [None, self.max_seq_length], name="time_to_now" 68 | ) 69 | 70 | def _convert_data( 71 | self, 72 | label_list, 73 | user_list, 74 | item_list, 75 | item_cate_list, 76 | item_history_batch, 77 | item_cate_history_batch, 78 | time_list, 79 | time_diff_list, 80 | time_from_first_action_list, 81 | time_to_now_list, 82 | batch_num_ngs, 83 | ): 84 | """Convert data into numpy arrays that are good for further model operation. 85 | * different from sequential_iterator 86 | 87 | Args: 88 | label_list (list): a list of ground-truth labels. 89 | user_list (list): a list of user indexes. 90 | item_list (list): a list of item indexes. 91 | item_cate_list (list): a list of category indexes. 92 | item_history_batch (list): a list of item history indexes. 93 | item_cate_history_batch (list): a list of category history indexes. 94 | time_list (list): a list of current timestamp. 95 | time_diff_list (list): a list of timestamp between each sequential opertions. 96 | time_from_first_action_list (list): a list of timestamp from the first opertion. 97 | time_to_now_list (list): a list of timestamp to the current time. 98 | batch_num_ngs (int): The number of negative sampling while training in mini-batch. 99 | 100 | Returns: 101 | dict: A dictionary, contains multiple numpy arrays that are convenient for further operation. 
102 | """ 103 | if batch_num_ngs: 104 | instance_cnt = len(label_list) 105 | if instance_cnt < 5: 106 | return 107 | 108 | label_list_all = [] 109 | item_list_all = [] 110 | item_cate_list_all = [] 111 | user_list_all = np.asarray( 112 | [[user] * (batch_num_ngs + 1) for user in user_list], dtype=np.int32 113 | ).flatten() 114 | time_list_all = np.asarray( 115 | [[t] * (batch_num_ngs + 1) for t in time_list], dtype=np.float32 116 | ).flatten() 117 | 118 | history_lengths = [len(item_history_batch[i]) for i in range(instance_cnt)] 119 | max_seq_length_batch = self.max_seq_length 120 | item_history_batch_all = np.zeros( 121 | (instance_cnt * (batch_num_ngs + 1), max_seq_length_batch), 122 | dtype=np.int32, 123 | ) 124 | item_cate_history_batch_all = np.zeros( 125 | (instance_cnt * (batch_num_ngs + 1), max_seq_length_batch), 126 | dtype=np.int32, 127 | ) 128 | time_diff_batch = np.zeros( 129 | (instance_cnt * (batch_num_ngs + 1), max_seq_length_batch), 130 | dtype=np.float32, 131 | ) 132 | time_from_first_action_batch = np.zeros( 133 | (instance_cnt * (batch_num_ngs + 1), max_seq_length_batch), 134 | dtype=np.float32, 135 | ) 136 | time_to_now_batch = np.zeros( 137 | (instance_cnt * (batch_num_ngs + 1), max_seq_length_batch), 138 | dtype=np.float32, 139 | ) 140 | mask = np.zeros( 141 | (instance_cnt * (1 + batch_num_ngs), max_seq_length_batch), 142 | dtype=np.float32, 143 | ) 144 | 145 | for i in range(instance_cnt): 146 | this_length = min(history_lengths[i], max_seq_length_batch) 147 | for index in range(batch_num_ngs + 1): 148 | item_history_batch_all[ 149 | i * (batch_num_ngs + 1) + index, -this_length: 150 | ] = np.asarray(item_history_batch[i][-this_length:], dtype=np.int32) 151 | item_cate_history_batch_all[ 152 | i * (batch_num_ngs + 1) + index, -this_length: 153 | ] = np.asarray( 154 | item_cate_history_batch[i][-this_length:], dtype=np.int32 155 | ) 156 | mask[i * (batch_num_ngs + 1) + index, -this_length:] = 1.0 157 | time_diff_batch[ 158 | i * (batch_num_ngs + 1) + index, -this_length: 159 | ] = np.asarray(time_diff_list[i][-this_length:], dtype=np.float32) 160 | time_from_first_action_batch[ 161 | i * (batch_num_ngs + 1) + index, -this_length: 162 | ] = np.asarray( 163 | time_from_first_action_list[i][-this_length:], dtype=np.float32 164 | ) 165 | time_to_now_batch[ 166 | i * (batch_num_ngs + 1) + index, -this_length: 167 | ] = np.asarray(time_to_now_list[i][-this_length:], dtype=np.float32) 168 | 169 | for i in range(instance_cnt): 170 | positive_item = [ 171 | *item_history_batch_all[i * (batch_num_ngs + 1)][1:], 172 | item_list[i], 173 | ] 174 | positive_item_cate = [ 175 | *item_cate_history_batch_all[i * (batch_num_ngs + 1)][1:], 176 | item_cate_list[i], 177 | ] 178 | label_list_all.append([1] * max_seq_length_batch) 179 | item_list_all.append(positive_item) 180 | item_cate_list_all.append(positive_item_cate) 181 | 182 | count = 0 183 | while count < batch_num_ngs: 184 | negative_item_list = [] 185 | negative_item_cate_list = [] 186 | count_inner = 1 187 | while count_inner <= max_seq_length_batch: 188 | random_value = random.randint(0, instance_cnt - 1) 189 | negative_item = item_list[random_value] 190 | if negative_item == positive_item[count_inner - 1]: 191 | continue 192 | negative_item_list.append(negative_item) 193 | negative_item_cate_list.append(item_cate_list[random_value]) 194 | count_inner += 1 195 | 196 | label_list_all.append([0] * max_seq_length_batch) 197 | item_list_all.append(negative_item_list) 198 | item_cate_list_all.append(negative_item_cate_list) 199 | 
count += 1 200 | 201 | res = {} 202 | res["labels"] = np.asarray( 203 | label_list_all, dtype=np.float32 204 | ) # .reshape(-1,1) 205 | res["users"] = user_list_all 206 | res["items"] = np.asarray(item_list_all, dtype=np.int32) 207 | res["cates"] = np.asarray(item_cate_list_all, dtype=np.int32) 208 | res["item_history"] = item_history_batch_all 209 | res["item_cate_history"] = item_cate_history_batch_all 210 | res["mask"] = mask 211 | res["time"] = time_list_all 212 | res["time_diff"] = time_diff_batch 213 | res["time_from_first_action"] = time_from_first_action_batch 214 | res["time_to_now"] = time_to_now_batch 215 | 216 | return res 217 | 218 | else: 219 | instance_cnt = len(label_list) 220 | history_lengths = [len(item_history_batch[i]) for i in range(instance_cnt)] 221 | max_seq_length_batch = self.max_seq_length 222 | item_history_batch_all = np.zeros( 223 | (instance_cnt, max_seq_length_batch), dtype=np.int32 224 | ) 225 | item_cate_history_batch_all = np.zeros( 226 | (instance_cnt, max_seq_length_batch), dtype=np.int32 227 | ) 228 | time_diff_batch = np.zeros( 229 | (instance_cnt, max_seq_length_batch), dtype=np.float32 230 | ) 231 | time_from_first_action_batch = np.zeros( 232 | (instance_cnt, max_seq_length_batch), dtype=np.float32 233 | ) 234 | time_to_now_batch = np.zeros( 235 | (instance_cnt, max_seq_length_batch), dtype=np.float32 236 | ) 237 | mask = np.zeros((instance_cnt, max_seq_length_batch), dtype=np.float32) 238 | 239 | for i in range(instance_cnt): 240 | this_length = min(history_lengths[i], max_seq_length_batch) 241 | item_history_batch_all[i, -this_length:] = item_history_batch[i][ 242 | -this_length: 243 | ] 244 | item_cate_history_batch_all[i, -this_length:] = item_cate_history_batch[ 245 | i 246 | ][-this_length:] 247 | mask[i, -this_length:] = 1.0 248 | time_diff_batch[i, -this_length:] = time_diff_list[i][-this_length:] 249 | time_from_first_action_batch[ 250 | i, -this_length: 251 | ] = time_from_first_action_list[i][-this_length:] 252 | time_to_now_batch[i, -this_length:] = time_to_now_list[i][-this_length:] 253 | 254 | res = {} 255 | res["labels"] = np.asarray(label_list, dtype=np.float32).reshape([-1, 1]) 256 | res["users"] = np.asarray(user_list, dtype=np.float32) 257 | res["items"] = np.asarray(item_list, dtype=np.int32).reshape([-1, 1]) 258 | res["cates"] = np.asarray(item_cate_list, dtype=np.int32).reshape([-1, 1]) 259 | res["item_history"] = item_history_batch_all 260 | res["item_cate_history"] = item_cate_history_batch_all 261 | res["mask"] = mask 262 | res["time"] = np.asarray(time_list, dtype=np.float32) 263 | res["time_diff"] = time_diff_batch 264 | res["time_from_first_action"] = time_from_first_action_batch 265 | res["time_to_now"] = time_to_now_batch 266 | return res 267 | -------------------------------------------------------------------------------- /reco_utils/recommender/deeprec/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tsinghua-fib-lab/CLSR/92c6c7e069374fa2d7217f3a0987f64ce47e7cad/reco_utils/recommender/deeprec/models/__init__.py -------------------------------------------------------------------------------- /reco_utils/recommender/deeprec/models/sequential/asvd.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 
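# Reading aid for the A2SVDModel defined below: _build_seq_graph() concatenates the item and
# category history embeddings, applies a learned attention layer over the sequence, and sums
# the attended vectors before concatenating the result with the target item embedding. The
# pooling step roughly corresponds to this NumPy sketch (illustrative only; the real weights
# come from the learned _attention() layer, here they are a softmax over random stand-in logits):
#
#   import numpy as np
#   hist = np.random.randn(50, 40)        # [max_seq_length, item_dim + cate_dim], per the YAML defaults
#   logits = np.random.randn(50)          # stand-in for the learned attention logits
#   weights = np.exp(logits - logits.max()) / np.exp(logits - logits.max()).sum()
#   asvd_output = (weights[:, None] * hist).sum(axis=0)   # [item_dim + cate_dim]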
3 | 4 | import tensorflow as tf 5 | from reco_utils.recommender.deeprec.models.sequential.sequential_base_model import ( 6 | SequentialBaseModel, 7 | ) 8 | 9 | __all__ = ["A2SVDModel"] 10 | 11 | 12 | class A2SVDModel(SequentialBaseModel): 13 | """A2SVD Model (Attentive Asynchronous Singular Value Decomposition) 14 | 15 | It extends ASVD with an attention module. 16 | 17 | ASVD: Y. Koren, "Factorization Meets the Neighborhood: a Multifaceted Collaborative 18 | Filtering Model", in Proceedings of the 14th ACM SIGKDD international conference on 19 | Knowledge discovery and data mining, pages 426–434, ACM, 2008. 20 | 21 | A2SVD: Z. Yu, J. Lian, A. Mahmoody, G. Liu and X. Xie, "Adaptive User Modeling with 22 | Long and Short-Term Preferences for Personailzed Recommendation", in Proceedings of 23 | the 28th International Joint Conferences on Artificial Intelligence, IJCAI’19, 24 | Pages 4213-4219, AAAI Press, 2019. 25 | """ 26 | 27 | def _build_seq_graph(self): 28 | """The main function to create A2SVD model. 29 | 30 | Returns: 31 | obj:the output of A2SVD section. 32 | """ 33 | hparams = self.hparams 34 | with tf.variable_scope("a2svd"): 35 | hist_input = tf.concat( 36 | [self.item_history_embedding, self.cate_history_embedding], 2 37 | ) 38 | with tf.variable_scope("Attention_layer"): 39 | att_outputs1 = self._attention(hist_input, hparams.attention_size) 40 | asvd_output = tf.reduce_sum(att_outputs1, 1) 41 | tf.summary.histogram("a2svd_output", asvd_output) 42 | model_output = tf.concat([asvd_output, self.target_item_embedding], 1) 43 | self.model_output = model_output 44 | tf.summary.histogram("model_output", model_output) 45 | return model_output 46 | -------------------------------------------------------------------------------- /reco_utils/recommender/deeprec/models/sequential/caser.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | import tensorflow as tf 5 | from reco_utils.recommender.deeprec.models.sequential.sequential_base_model import ( 6 | SequentialBaseModel, 7 | ) 8 | 9 | __all__ = ["CaserModel"] 10 | 11 | 12 | class CaserModel(SequentialBaseModel): 13 | """Caser Model 14 | 15 | J. Tang and K. Wang, "Personalized top-n sequential recommendation via convolutional 16 | sequence embedding", in Proceedings of the Eleventh ACM International Conference on 17 | Web Search and Data Mining, ACM, 2018. 18 | """ 19 | 20 | def __init__(self, hparams, iterator_creator): 21 | """Initialization of variables for caser 22 | 23 | Args: 24 | hparams (obj): A tf.contrib.training.HParams object, hold the entire set of hyperparameters. 25 | iterator_creator (obj): An iterator to load the data. 26 | """ 27 | self.hparams = hparams 28 | self.L = hparams.L # history sequence that involved in convolution shape 29 | self.T = hparams.T # prediction shape 30 | self.n_v = hparams.n_v # number of vertical convolution layers 31 | self.n_h = hparams.n_h # number of horizonal convolution layers 32 | self.lengths = [ 33 | i + 1 for i in range(self.L) 34 | ] # horizonal convolution filter shape 35 | super().__init__(hparams, iterator_creator) 36 | 37 | def _build_seq_graph(self): 38 | """The main function to create caser model. 39 | 40 | Returns: 41 | obj:the output of caser section. 
42 | """ 43 | with tf.variable_scope("caser"): 44 | cnn_output = self._caser_cnn() 45 | model_output = tf.concat([cnn_output, self.target_item_embedding], 1) 46 | tf.summary.histogram("model_output", model_output) 47 | return model_output 48 | 49 | def _add_cnn(self, hist_matrix, vertical_dim, scope): 50 | """The main function to use CNN at both vertical and horizonal aspects. 51 | 52 | Args: 53 | hist_matrix (obj): The output of history sequential embeddings 54 | vertical_dim (int): The shape of embeddings of input 55 | scope (obj): The scope of CNN input. 56 | 57 | Returns: 58 | obj:the output of CNN layers. 59 | """ 60 | with tf.variable_scope(scope): 61 | with tf.variable_scope("vertical"): 62 | embedding_T = tf.transpose(hist_matrix, [0, 2, 1]) 63 | out_v = self._build_cnn(embedding_T, self.n_v, vertical_dim) 64 | out_v = tf.layers.flatten(out_v) 65 | with tf.variable_scope("horizonal"): 66 | out_hs = [] 67 | for h in self.lengths: 68 | conv_out = self._build_cnn(hist_matrix, self.n_h, h) 69 | max_pool_out = tf.reduce_max( 70 | conv_out, reduction_indices=[1], name="max_pool_{0}".format(h) 71 | ) 72 | out_hs.append(max_pool_out) 73 | out_h = tf.concat(out_hs, 1) 74 | return tf.concat([out_v, out_h], 1) 75 | 76 | def _caser_cnn(self): 77 | """The main function to use CNN at both item and category aspects. 78 | 79 | Returns: 80 | obj:the concatenated output of two parts of item and catrgory. 81 | """ 82 | item_out = self._add_cnn( 83 | self.item_history_embedding, self.item_embedding_dim, "item" 84 | ) 85 | tf.summary.histogram("item_out", item_out) 86 | cate_out = self._add_cnn( 87 | self.cate_history_embedding, self.cate_embedding_dim, "cate" 88 | ) 89 | tf.summary.histogram("cate_out", cate_out) 90 | cnn_output = tf.concat([item_out, cate_out], 1) 91 | tf.summary.histogram("cnn_output", cnn_output) 92 | return cnn_output 93 | 94 | def _build_cnn(self, history_matrix, nums, shape): 95 | """Call a CNN layer. 96 | 97 | Returns: 98 | obj:the output of cnn section. 99 | """ 100 | return tf.layers.conv1d( 101 | history_matrix, 102 | nums, 103 | shape, 104 | activation=tf.nn.relu, 105 | name="conv_" + str(shape), 106 | ) 107 | -------------------------------------------------------------------------------- /reco_utils/recommender/deeprec/models/sequential/dien.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | import tensorflow as tf 5 | from reco_utils.recommender.deeprec.models.sequential.sli_rec import ( 6 | SLI_RECModel, 7 | ) 8 | from tensorflow.contrib.rnn import GRUCell 9 | from reco_utils.recommender.deeprec.models.sequential.rnn_cell_implement import ( 10 | VecAttGRUCell, 11 | ) 12 | from reco_utils.recommender.deeprec.deeprec_utils import load_dict 13 | 14 | from reco_utils.recommender.deeprec.models.sequential.rnn_dien import dynamic_rnn 15 | 16 | __all__ = ["DIENModel"] 17 | 18 | 19 | class DIENModel(SLI_RECModel): 20 | 21 | def _build_seq_graph(self): 22 | """The main function to create din model. 23 | 24 | Returns: 25 | obj:the output of din section. 
26 | """ 27 | hparams = self.hparams 28 | with tf.name_scope('dien'): 29 | hist_input = tf.concat( 30 | [self.item_history_embedding, self.cate_history_embedding], 2 31 | ) 32 | self.mask = self.iterator.mask 33 | self.real_mask = tf.cast(self.mask, tf.float32) 34 | self.hist_embedding_sum = tf.reduce_sum(hist_input*tf.expand_dims(self.real_mask, -1), 1) 35 | with tf.name_scope('rnn_1'): 36 | self.mask = self.iterator.mask 37 | self.sequence_length = tf.reduce_sum(self.mask, 1) 38 | rnn_outputs, _ = dynamic_rnn( 39 | GRUCell(hparams.hidden_size), 40 | inputs=hist_input, 41 | sequence_length=self.sequence_length, 42 | dtype=tf.float32, 43 | scope="gru1" 44 | ) 45 | tf.summary.histogram('GRU_outputs', rnn_outputs) 46 | 47 | # Attention layer 48 | with tf.name_scope('Attention_layer_1'): 49 | _, alphas = self._attention_fcn(self.target_item_embedding, rnn_outputs, return_alpha=True) 50 | 51 | with tf.name_scope('rnn_2'): 52 | _, final_state = dynamic_rnn( 53 | VecAttGRUCell(hparams.hidden_size), 54 | inputs=rnn_outputs, 55 | att_scores = tf.expand_dims(alphas, -1), 56 | sequence_length=self.sequence_length, 57 | dtype=tf.float32, 58 | scope="gru2" 59 | ) 60 | tf.summary.histogram('GRU2_Final_State', final_state) 61 | 62 | model_output = tf.concat([self.target_item_embedding, final_state, self.hist_embedding_sum, self.target_item_embedding*self.hist_embedding_sum], 1) 63 | tf.summary.histogram("model_output", model_output) 64 | return model_output 65 | -------------------------------------------------------------------------------- /reco_utils/recommender/deeprec/models/sequential/din.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | import tensorflow as tf 5 | from reco_utils.recommender.deeprec.models.sequential.sli_rec import ( 6 | SLI_RECModel, 7 | ) 8 | from tensorflow.nn import dynamic_rnn 9 | from reco_utils.recommender.deeprec.deeprec_utils import load_dict 10 | 11 | __all__ = ["DINModel"] 12 | 13 | 14 | class DINModel(SLI_RECModel): 15 | 16 | def _build_seq_graph(self): 17 | """The main function to create din model. 18 | 19 | Returns: 20 | obj:the output of din section. 21 | """ 22 | with tf.name_scope('din'): 23 | hist_input = tf.concat( 24 | [self.item_history_embedding, self.cate_history_embedding], 2 25 | ) 26 | self.mask = self.iterator.mask 27 | self.real_mask = tf.cast(self.mask, tf.float32) 28 | self.hist_embedding_sum = tf.reduce_sum(hist_input*tf.expand_dims(self.real_mask, -1), 1) 29 | attention_output = self._attention_fcn(self.target_item_embedding, hist_input) 30 | att_fea = tf.reduce_sum(attention_output, 1) 31 | tf.summary.histogram('att_fea', att_fea) 32 | model_output = tf.concat([self.target_item_embedding, self.hist_embedding_sum, att_fea], -1) 33 | tf.summary.histogram("model_output", model_output) 34 | return model_output 35 | -------------------------------------------------------------------------------- /reco_utils/recommender/deeprec/models/sequential/gru4rec.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 
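# ---------------------------------------------------------------------------
# Illustrative sketch (editorial addition, not part of gru4rec.py): DIN and
# DIEN above both pool the click history with target-aware attention and mask
# padded positions by replacing their logits with a large negative constant
# before the softmax (see `_attention_fcn` in sli_rec.py, which they inherit).
# The NumPy-only function below shows that masked pooling in isolation; the
# shapes and random values are made-up assumptions.
import numpy as np

def masked_attention_pool(history, scores, mask):
    """history: (T, D) step embeddings, scores: (T,) raw attention logits,
    mask: (T,) with 1.0 for real clicks and 0.0 for left padding."""
    neg_inf = -(2.0 ** 32) + 1.0                       # same constant used by the model
    masked = np.where(mask > 0, scores, neg_inf)       # padded logits become ~ -inf
    weights = np.exp(masked - masked.max())
    weights = weights / weights.sum()                  # softmax over unmasked positions only
    return (history * weights[:, None]).sum(axis=0)    # attention-weighted sum, shape (D,)

rng = np.random.default_rng(0)
hist = rng.normal(size=(5, 8))                         # 5 history steps, embedding dim 8
pooled = masked_attention_pool(hist, rng.normal(size=5), np.array([0, 0, 1, 1, 1.0]))
# ---------------------------------------------------------------------------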
3 | 4 | import tensorflow as tf 5 | from reco_utils.recommender.deeprec.models.sequential.sequential_base_model import ( 6 | SequentialBaseModel, 7 | ) 8 | from tensorflow.contrib.rnn import GRUCell, LSTMCell 9 | from tensorflow.nn import dynamic_rnn 10 | 11 | __all__ = ["GRU4RecModel"] 12 | 13 | 14 | class GRU4RecModel(SequentialBaseModel): 15 | """GRU4Rec Model 16 | 17 | B. Hidasi, A. Karatzoglou, L. Baltrunas, D. Tikk, "Session-based Recommendations 18 | with Recurrent Neural Networks", ICLR (Poster), 2016. 19 | """ 20 | 21 | def _build_seq_graph(self): 22 | """The main function to create GRU4Rec model. 23 | 24 | Returns: 25 | obj:the output of GRU4Rec section. 26 | """ 27 | with tf.variable_scope("gru4rec"): 28 | # final_state = self._build_lstm() 29 | final_state = self._build_gru() 30 | model_output = tf.concat([final_state, self.target_item_embedding], 1) 31 | tf.summary.histogram("model_output", model_output) 32 | return model_output 33 | 34 | def _build_lstm(self): 35 | """Apply an LSTM for modeling. 36 | 37 | Returns: 38 | obj: The output of LSTM section. 39 | """ 40 | with tf.name_scope("lstm"): 41 | self.mask = self.iterator.mask 42 | self.sequence_length = tf.reduce_sum(self.mask, 1) 43 | self.history_embedding = tf.concat( 44 | [self.item_history_embedding, self.cate_history_embedding], 2 45 | ) 46 | rnn_outputs, final_state = dynamic_rnn( 47 | LSTMCell(self.hidden_size), 48 | inputs=self.history_embedding, 49 | sequence_length=self.sequence_length, 50 | dtype=tf.float32, 51 | scope="lstm", 52 | ) 53 | tf.summary.histogram("LSTM_outputs", rnn_outputs) 54 | return final_state[1] 55 | 56 | def _build_gru(self): 57 | """Apply a GRU for modeling. 58 | 59 | Returns: 60 | obj: The output of GRU section. 61 | """ 62 | with tf.name_scope("gru"): 63 | self.mask = self.iterator.mask 64 | self.sequence_length = tf.reduce_sum(self.mask, 1) 65 | self.history_embedding = tf.concat( 66 | [self.item_history_embedding, self.cate_history_embedding], 2 67 | ) 68 | rnn_outputs, final_state = dynamic_rnn( 69 | GRUCell(self.hidden_size), 70 | inputs=self.history_embedding, 71 | sequence_length=self.sequence_length, 72 | dtype=tf.float32, 73 | scope="gru", 74 | ) 75 | tf.summary.histogram("GRU_outputs", rnn_outputs) 76 | return final_state 77 | -------------------------------------------------------------------------------- /reco_utils/recommender/deeprec/models/sequential/ncf.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | import tensorflow as tf 5 | from reco_utils.recommender.deeprec.models.sequential.sequential_base_model import ( 6 | SequentialBaseModel, 7 | ) 8 | from reco_utils.recommender.deeprec.deeprec_utils import load_dict 9 | 10 | __all__ = ["NCFModel"] 11 | 12 | 13 | class NCFModel(SequentialBaseModel): 14 | 15 | def _build_embedding(self): 16 | """The field embedding layer. 
Initialization of embedding variables.""" 17 | super(NCFModel, self)._build_embedding() 18 | hparams = self.hparams 19 | self.user_vocab_length = len(load_dict(hparams.user_vocab)) 20 | self.user_embedding_dim = hparams.user_embedding_dim 21 | 22 | with tf.variable_scope("embedding", initializer=self.initializer): 23 | self.user_gmf_lookup = tf.get_variable( 24 | name="user_gmf_embedding", 25 | shape=[self.user_vocab_length, self.user_embedding_dim], 26 | dtype=tf.float32, 27 | ) 28 | self.user_mlp_lookup = tf.get_variable( 29 | name="user_mlp_embedding", 30 | shape=[self.user_vocab_length, self.user_embedding_dim], 31 | dtype=tf.float32, 32 | ) 33 | self.item_gmf_lookup = tf.get_variable( 34 | name="item_gmf_embedding", 35 | shape=[self.item_vocab_length, self.user_embedding_dim], 36 | dtype=tf.float32, 37 | ) 38 | self.item_mlp_lookup = tf.get_variable( 39 | name="item_mlp_embedding", 40 | shape=[self.item_vocab_length, self.user_embedding_dim], 41 | dtype=tf.float32, 42 | ) 43 | 44 | def _lookup_from_embedding(self): 45 | """Lookup from embedding variables. A dropout layer follows lookup operations. 46 | """ 47 | super(NCFModel, self)._lookup_from_embedding() 48 | 49 | self.user_gmf_embedding = tf.nn.embedding_lookup( 50 | self.user_gmf_lookup, self.iterator.users 51 | ) 52 | tf.summary.histogram("user_gmf_embedding_output", self.user_gmf_embedding) 53 | 54 | self.user_mlp_embedding = tf.nn.embedding_lookup( 55 | self.user_mlp_lookup, self.iterator.users 56 | ) 57 | tf.summary.histogram("user_mlp_embedding_output", self.user_mlp_embedding) 58 | 59 | self.item_gmf_embedding = tf.nn.embedding_lookup( 60 | self.item_gmf_lookup, self.iterator.items 61 | ) 62 | tf.summary.histogram("item_gmf_embedding_output", self.item_gmf_embedding) 63 | 64 | self.item_mlp_embedding = tf.nn.embedding_lookup( 65 | self.item_mlp_lookup, self.iterator.items 66 | ) 67 | tf.summary.histogram("item_short_embedding_output", self.item_mlp_embedding) 68 | 69 | def _build_seq_graph(self): 70 | """The main function to create din model. 71 | 72 | Returns: 73 | obj:the output of din section. 74 | """ 75 | with tf.name_scope('ncf'): 76 | self.gmf = self.user_gmf_embedding * self.item_gmf_embedding 77 | self.mlp = tf.concat([self.user_mlp_embedding, self.item_mlp_embedding], -1) 78 | for layer_size in self.hparams.ncf_layer_sizes: 79 | self.mlp = tf.contrib.layers.fully_connected( 80 | self.mlp, 81 | num_outputs=layer_size, 82 | activation_fn=tf.nn.relu, 83 | weights_initializer=tf.contrib.layers.xavier_initializer( 84 | seed=self.seed 85 | ), 86 | ) 87 | model_output = tf.concat([self.gmf, self.mlp], -1) 88 | tf.summary.histogram("model_output", model_output) 89 | return model_output 90 | 91 | def _fcn_net(self, model_output, layer_sizes, scope): 92 | 93 | output = tf.contrib.layers.fully_connected( 94 | model_output, 95 | num_outputs=1, 96 | activation_fn=None, 97 | biases_initializer=None, 98 | weights_initializer=tf.contrib.layers.xavier_initializer( 99 | seed=self.seed 100 | ), 101 | ) 102 | self.logit = tf.sigmoid(output) 103 | return output 104 | -------------------------------------------------------------------------------- /reco_utils/recommender/deeprec/models/sequential/nextitnet.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 
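# ---------------------------------------------------------------------------
# Illustrative sketch (editorial addition, not part of nextitnet.py):
# NCFModel above scores a user/item pair by combining a GMF branch (the
# element-wise product of user and item embeddings) with an MLP branch over
# their concatenation, then feeding the concatenated result to a single
# bias-free linear unit followed by a sigmoid.  The NumPy toy below mirrors
# that flow; the layer sizes and random weights are made-up assumptions.
import numpy as np

def ncf_score(user_emb, item_emb, mlp_weights, out_weight):
    gmf = user_emb * item_emb                           # GMF branch
    mlp = np.concatenate([user_emb, item_emb])          # MLP branch input
    for w in mlp_weights:                               # ReLU fully-connected stack
        mlp = np.maximum(mlp @ w, 0.0)
    model_output = np.concatenate([gmf, mlp])
    logit = model_output @ out_weight                   # single linear output unit, no bias
    return 1.0 / (1.0 + np.exp(-logit))                 # sigmoid -> predicted preference

rng = np.random.default_rng(0)
d = 8                                                   # toy embedding dimension
mlp_ws = [rng.normal(size=(2 * d, 16)), rng.normal(size=(16, 8))]
score = ncf_score(rng.normal(size=d), rng.normal(size=d), mlp_ws, rng.normal(size=d + 8))
# ---------------------------------------------------------------------------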
3 | 4 | import tensorflow as tf 5 | from reco_utils.recommender.deeprec.models.sequential.sequential_base_model import ( 6 | SequentialBaseModel, 7 | ) 8 | 9 | __all__ = ["NextItNetModel"] 10 | 11 | 12 | class NextItNetModel(SequentialBaseModel): 13 | """NextItNet Model 14 | 15 | Yuan, Fajie, et al. "A Simple Convolutional Generative Network 16 | for Next Item Recommendation." web search and data mining (2019): 582-590. 17 | 18 | It requires strong sequence with dataset. 19 | """ 20 | 21 | def _build_seq_graph(self): 22 | """The main function to create nextitnet model. 23 | 24 | Returns: 25 | obj:the output of nextitnet section. 26 | """ 27 | hparams = self.hparams 28 | is_training = tf.equal(self.is_train_stage, True) 29 | item_history_embedding = tf.cond( 30 | is_training, 31 | lambda: self.item_history_embedding[:: self.hparams.train_num_ngs + 1], 32 | lambda: self.item_history_embedding, 33 | ) 34 | cate_history_embedding = tf.cond( 35 | is_training, 36 | lambda: self.cate_history_embedding[:: self.hparams.train_num_ngs + 1], 37 | lambda: self.cate_history_embedding, 38 | ) 39 | 40 | with tf.variable_scope("nextitnet", reuse=tf.AUTO_REUSE): 41 | 42 | dilate_input = tf.concat( 43 | [item_history_embedding, cate_history_embedding], 2 44 | ) 45 | 46 | for layer_id, dilation in enumerate(hparams.dilations): 47 | dilate_input = tf.cond( 48 | is_training, 49 | lambda: self._nextitnet_residual_block_one( 50 | dilate_input, 51 | dilation, 52 | layer_id, 53 | dilate_input.get_shape()[-1], 54 | hparams.kernel_size, 55 | causal=True, 56 | train=True, 57 | ), 58 | lambda: self._nextitnet_residual_block_one( 59 | dilate_input, 60 | dilation, 61 | layer_id, 62 | dilate_input.get_shape()[-1], 63 | hparams.kernel_size, 64 | causal=True, 65 | train=False, 66 | ), 67 | ) 68 | 69 | self.dilate_input = dilate_input 70 | model_output = tf.cond( 71 | is_training, self._training_output, self._normal_output 72 | ) 73 | 74 | return model_output 75 | 76 | def _training_output(self): 77 | model_output = tf.repeat( 78 | self.dilate_input, self.hparams.train_num_ngs + 1, axis=0 79 | ) 80 | model_output = tf.concat([model_output, self.target_item_embedding], -1) 81 | model_output = tf.reshape( 82 | model_output, 83 | ( 84 | -1, 85 | self.hparams.train_num_ngs + 1, 86 | self.hparams.max_seq_length, 87 | model_output.get_shape()[-1], 88 | ), 89 | ) 90 | model_output = tf.transpose(model_output, [0, 2, 1, 3]) 91 | model_output = tf.reshape(model_output, (-1, model_output.get_shape()[-1])) 92 | return model_output 93 | 94 | def _normal_output(self): 95 | model_output = self.dilate_input[:, -1, :] 96 | model_output = tf.concat( 97 | [model_output, self.target_item_embedding[:, -1, :]], -1 98 | ) 99 | return model_output 100 | 101 | def _nextitnet_residual_block_one( 102 | self, 103 | input_, 104 | dilation, 105 | layer_id, 106 | residual_channels, 107 | kernel_size, 108 | causal=True, 109 | train=True, 110 | ): 111 | """The main function to use dilated CNN and residual network at sequence data 112 | 113 | Args: 114 | input_ (obj): The output of history sequential embeddings 115 | dilation (int): The dilation number of CNN layer 116 | layer_id (str): String value of layer ID, 0, 1, 2... 117 | residual_channels (int): Embedding size of input sequence 118 | kernel_size (int): Kernel size of CNN mask 119 | causal (bool): Whether to pad in front of the sequence or to pad surroundingly 120 | train (bool): is in training stage 121 | 122 | Returns: 123 | obj:the output of residual layers. 
124 | """ 125 | resblock_type = "decoder" 126 | resblock_name = "nextitnet_residual_block_one_{}_layer_{}_{}".format( 127 | resblock_type, layer_id, dilation 128 | ) 129 | with tf.variable_scope(resblock_name): 130 | input_ln = self._layer_norm(input_, name="layer_norm1", trainable=train) 131 | relu1 = tf.nn.relu(input_ln) 132 | conv1 = self._conv1d( 133 | relu1, int(0.5 * int(residual_channels)), name="conv1d_1" 134 | ) 135 | conv1 = self._layer_norm(conv1, name="layer_norm2", trainable=train) 136 | relu2 = tf.nn.relu(conv1) 137 | 138 | dilated_conv = self._conv1d( 139 | relu2, 140 | int(0.5 * int(residual_channels)), 141 | dilation, 142 | kernel_size, 143 | causal=causal, 144 | name="dilated_conv", 145 | ) 146 | 147 | dilated_conv = self._layer_norm( 148 | dilated_conv, name="layer_norm3", trainable=train 149 | ) 150 | relu3 = tf.nn.relu(dilated_conv) 151 | conv2 = self._conv1d(relu3, residual_channels, name="conv1d_2") 152 | return input_ + conv2 153 | 154 | def _conv1d( 155 | self, 156 | input_, 157 | output_channels, 158 | dilation=1, 159 | kernel_size=1, 160 | causal=False, 161 | name="dilated_conv", 162 | ): 163 | """Call a dilated CNN layer 164 | 165 | Returns: 166 | obj:the output of dilated CNN layers. 167 | """ 168 | with tf.variable_scope(name): 169 | weight = tf.get_variable( 170 | "weight", 171 | [1, kernel_size, input_.get_shape()[-1], output_channels], 172 | initializer=tf.truncated_normal_initializer(stddev=0.02, seed=1), 173 | ) 174 | bias = tf.get_variable( 175 | "bias", [output_channels], initializer=tf.constant_initializer(0.0) 176 | ) 177 | 178 | if causal: 179 | padding = [[0, 0], [(kernel_size - 1) * dilation, 0], [0, 0]] 180 | padded = tf.pad(input_, padding) 181 | input_expanded = tf.expand_dims(padded, dim=1) 182 | out = ( 183 | tf.nn.atrous_conv2d( 184 | input_expanded, weight, rate=dilation, padding="VALID" 185 | ) 186 | + bias 187 | ) 188 | else: 189 | input_expanded = tf.expand_dims(input_, dim=1) 190 | out = ( 191 | tf.nn.conv2d( 192 | input_expanded, weight, strides=[1, 1, 1, 1], padding="SAME" 193 | ) 194 | + bias 195 | ) 196 | 197 | return tf.squeeze(out, [1]) 198 | 199 | # tf.contrib.layers.layer_norm 200 | def _layer_norm(self, x, name, epsilon=1e-8, trainable=True): 201 | """Call a layer normalization 202 | 203 | Returns: 204 | obj: Normalized data 205 | """ 206 | with tf.variable_scope(name): 207 | shape = x.get_shape() 208 | beta = tf.get_variable( 209 | "beta", 210 | [int(shape[-1])], 211 | initializer=tf.constant_initializer(0), 212 | trainable=trainable, 213 | ) 214 | gamma = tf.get_variable( 215 | "gamma", 216 | [int(shape[-1])], 217 | initializer=tf.constant_initializer(1), 218 | trainable=trainable, 219 | ) 220 | 221 | mean, variance = tf.nn.moments(x, axes=[len(shape) - 1], keep_dims=True) 222 | 223 | x = (x - mean) / tf.sqrt(variance + epsilon) 224 | 225 | return gamma * x + beta 226 | -------------------------------------------------------------------------------- /reco_utils/recommender/deeprec/models/sequential/sli_rec.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 
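# ---------------------------------------------------------------------------
# Illustrative sketch (editorial addition, not part of sli_rec.py): the
# `causal=True` branch of NextItNet's `_conv1d` above left-pads the sequence
# with (kernel_size - 1) * dilation zeros so every output step depends only on
# the current and earlier inputs.  The single-channel NumPy version below
# shows the same receptive-field pattern; the kernel values and the
# "kernel[k] weights the sample k * dilation steps back" convention are
# illustrative assumptions.
import numpy as np

def causal_dilated_conv1d(x, kernel, dilation):
    """x: (T,) input sequence, kernel: (K,) filter. Output keeps length T and
    out[t] uses only x[t], x[t - dilation], ..., x[t - (K - 1) * dilation]."""
    K = len(kernel)
    pad = (K - 1) * dilation
    x_padded = np.concatenate([np.zeros(pad, dtype=x.dtype), x])    # left padding only
    out = np.zeros_like(x, dtype=np.float64)
    for t in range(len(x)):
        taps = x_padded[t + pad - np.arange(K) * dilation]          # current + past taps
        out[t] = np.dot(taps, kernel)
    return out

y = causal_dilated_conv1d(np.arange(1.0, 9.0), np.array([0.5, 0.25, 0.25]), dilation=2)
# ---------------------------------------------------------------------------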
3 | 4 | import tensorflow as tf 5 | from reco_utils.recommender.deeprec.models.sequential.sequential_base_model import ( 6 | SequentialBaseModel, 7 | ) 8 | from tensorflow.nn import dynamic_rnn 9 | from reco_utils.recommender.deeprec.models.sequential.rnn_cell_implement import ( 10 | Time4LSTMCell, 11 | ) 12 | 13 | __all__ = ["SLI_RECModel"] 14 | 15 | 16 | class SLI_RECModel(SequentialBaseModel): 17 | """SLI Rec model 18 | 19 | Z. Yu, J. Lian, A. Mahmoody, G. Liu and X. Xie, "Adaptive User Modeling with 20 | Long and Short-Term Preferences for Personailzed Recommendation", in Proceedings of 21 | the 28th International Joint Conferences on Artificial Intelligence, IJCAI’19, 22 | Pages 4213-4219, AAAI Press, 2019. 23 | """ 24 | 25 | def _build_seq_graph(self): 26 | """The main function to create sli_rec model. 27 | 28 | Returns: 29 | obj:the output of sli_rec section. 30 | """ 31 | hparams = self.hparams 32 | with tf.variable_scope("sli_rec"): 33 | hist_input = tf.concat( 34 | [self.item_history_embedding, self.cate_history_embedding], 2 35 | ) 36 | self.mask = self.iterator.mask 37 | self.sequence_length = tf.reduce_sum(self.mask, 1) 38 | 39 | with tf.variable_scope("long_term_asvd"): 40 | att_outputs1 = self._attention(hist_input, hparams.attention_size) 41 | att_fea1 = tf.reduce_sum(att_outputs1, 1) 42 | tf.summary.histogram("att_fea1", att_fea1) 43 | 44 | item_history_embedding_new = tf.concat( 45 | [ 46 | self.item_history_embedding, 47 | tf.expand_dims(self.iterator.time_from_first_action, -1), 48 | ], 49 | -1, 50 | ) 51 | item_history_embedding_new = tf.concat( 52 | [ 53 | item_history_embedding_new, 54 | tf.expand_dims(self.iterator.time_to_now, -1), 55 | ], 56 | -1, 57 | ) 58 | with tf.variable_scope("rnn"): 59 | rnn_outputs, final_state = dynamic_rnn( 60 | Time4LSTMCell(hparams.hidden_size), 61 | inputs=item_history_embedding_new, 62 | sequence_length=self.sequence_length, 63 | dtype=tf.float32, 64 | scope="time4lstm", 65 | ) 66 | tf.summary.histogram("LSTM_outputs", rnn_outputs) 67 | 68 | with tf.variable_scope("attention_fcn"): 69 | att_outputs2 = self._attention_fcn( 70 | self.target_item_embedding, rnn_outputs 71 | ) 72 | att_fea2 = tf.reduce_sum(att_outputs2, 1) 73 | tf.summary.histogram("att_fea2", att_fea2) 74 | 75 | # ensemble 76 | with tf.name_scope("alpha"): 77 | 78 | if not hparams.manual_alpha: 79 | concat_all = tf.concat( 80 | [ 81 | self.target_item_embedding, 82 | att_fea1, 83 | att_fea2, 84 | tf.expand_dims(self.iterator.time_to_now[:, -1], -1), 85 | ], 86 | 1, 87 | ) 88 | last_hidden_nn_layer = concat_all 89 | alpha_logit = self._fcn_net( 90 | last_hidden_nn_layer, hparams.att_fcn_layer_sizes, scope="fcn_alpha" 91 | ) 92 | self.alpha_output = tf.sigmoid(alpha_logit) 93 | user_embed = att_fea1 * self.alpha_output + att_fea2 * (1.0 - self.alpha_output) 94 | tf.summary.histogram("alpha", self.alpha_output) 95 | error_with_category = self.alpha_output - self.iterator.attn_labels 96 | tf.summary.histogram("error_with_category", error_with_category) 97 | squared_error_with_category = tf.math.sqrt(tf.math.squared_difference(tf.reshape(self.alpha_output, [-1]), tf.reshape(self.iterator.attn_labels, [-1]))) 98 | tf.summary.histogram("squared_error_with_category", squared_error_with_category) 99 | else: 100 | self.alpha_output = tf.constant([[hparams.manual_alpha_value]]) 101 | user_embed = att_fea1 * hparams.manual_alpha_value + att_fea2 * (1.0 - hparams.manual_alpha_value) 102 | model_output = tf.concat([user_embed, self.target_item_embedding], 1) 103 | 
tf.summary.histogram("model_output", model_output) 104 | return model_output 105 | 106 | def _attention_fcn(self, query, user_embedding, return_alpha=False): 107 | """Apply attention by fully connected layers. 108 | 109 | Args: 110 | query (obj): The embedding of target item which is regarded as a query in attention operations. 111 | user_embedding (obj): The output of RNN layers which is regarded as user modeling. 112 | 113 | Returns: 114 | obj: Weighted sum of user modeling. 115 | """ 116 | hparams = self.hparams 117 | with tf.variable_scope("attention_fcn"): 118 | query_size = query.shape[1].value 119 | boolean_mask = tf.equal(self.mask, tf.ones_like(self.mask)) 120 | 121 | attention_mat = tf.get_variable( 122 | name="attention_mat", 123 | shape=[user_embedding.shape.as_list()[-1], query_size], 124 | initializer=self.initializer, 125 | ) 126 | att_inputs = tf.tensordot(user_embedding, attention_mat, [[2], [0]]) 127 | 128 | queries = tf.reshape( 129 | tf.tile(query, [1, att_inputs.shape[1].value]), tf.shape(att_inputs) 130 | ) 131 | last_hidden_nn_layer = tf.concat( 132 | [att_inputs, queries, att_inputs - queries, att_inputs * queries], -1 133 | ) 134 | att_fnc_output = self._fcn_net( 135 | last_hidden_nn_layer, hparams.att_fcn_layer_sizes, scope="att_fcn" 136 | ) 137 | att_fnc_output = tf.squeeze(att_fnc_output, -1) 138 | mask_paddings = tf.ones_like(att_fnc_output) * (-(2 ** 32) + 1) 139 | att_weights = tf.nn.softmax( 140 | tf.where(boolean_mask, att_fnc_output, mask_paddings), 141 | name="att_weights", 142 | ) 143 | output = user_embedding * tf.expand_dims(att_weights, -1) 144 | if not return_alpha: 145 | return output 146 | else: 147 | return output, att_weights 148 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tsinghua-fib-lab/CLSR/92c6c7e069374fa2d7217f3a0987f64ce47e7cad/tests/__init__.py -------------------------------------------------------------------------------- /tests/resources/deeprec/sequential/README.md: -------------------------------------------------------------------------------- 1 | # Dataset 2 | 3 | ## Taobao dataset 4 | 5 | The data files are available at [Google Drive](https://drive.google.com/file/d/1oZJPdPtgT-yTVheY3CiK_7DXyURaOo26/view?usp=sharing). 6 | 7 | ## Kuaishou dataset 8 | 9 | The data files will be made public after a process of internal review by the company. Please use the Taobao dataset for now. 10 | --------------------------------------------------------------------------------
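A closing note on the SLI-Rec ensemble above: `_build_seq_graph` in sli_rec.py fuses the long-term interest vector (`att_fea1`, from the ASVD-style attention) with the short-term one (`att_fea2`, from the Time4LSTM attention) through a gate `alpha`, which is either predicted by a small FCN and squashed with a sigmoid or fixed to `hparams.manual_alpha_value`. The NumPy sketch below reproduces only that fusion step; the vectors and the logit value are made-up assumptions, not values taken from the repository.

import numpy as np

def fuse_interests(long_term, short_term, alpha_logit=None, manual_alpha=None):
    """Return alpha * long_term + (1 - alpha) * short_term, mirroring the
    learned-alpha and manual-alpha branches of SLI_RECModel."""
    if manual_alpha is not None:
        alpha = manual_alpha                           # fixed-gate branch
    else:
        alpha = 1.0 / (1.0 + np.exp(-alpha_logit))     # sigmoid over the learned logit
    return alpha * long_term + (1.0 - alpha) * short_term

long_vec = np.array([0.2, 0.8, 0.1])                   # stands in for att_fea1
short_vec = np.array([0.5, 0.1, 0.9])                  # stands in for att_fea2
user_embed_learned = fuse_interests(long_vec, short_vec, alpha_logit=0.3)
user_embed_manual = fuse_interests(long_vec, short_vec, manual_alpha=0.5)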