├── .gitattributes ├── .gitignore ├── COPYING ├── README.md ├── README.rst ├── _config.yml ├── caserec ├── __init__.py ├── clustering │ ├── __init__.py │ ├── kmedoids.py │ └── paco.py ├── evaluation │ ├── __init__.py │ ├── base_evaluation.py │ ├── item_recomendation_functions.py │ ├── item_recommendation.py │ ├── rating_prediction.py │ └── statistical_analysis.py ├── recommenders │ ├── __init__.py │ ├── item_recommendation │ │ ├── __init__.py │ │ ├── base_item_recommendation.py │ │ ├── bprmf.py │ │ ├── content_based.py │ │ ├── ensemble_average.py │ │ ├── ensemble_bpr.py │ │ ├── group_based_recommender.py │ │ ├── item_attribute_knn.py │ │ ├── itemknn.py │ │ ├── most_popular.py │ │ ├── paco_recommender.py │ │ ├── random_rec.py │ │ ├── user_attribute_knn.py │ │ └── userknn.py │ └── rating_prediction │ │ ├── __init__.py │ │ ├── __pycache__ │ │ ├── __init__.cpython-37.pyc │ │ ├── base_rating_prediction.cpython-37.pyc │ │ └── nnmf.cpython-37.pyc │ │ ├── base_knn.py │ │ ├── base_nsvd1.py │ │ ├── base_rating_prediction.py │ │ ├── corec.py │ │ ├── gsvdplusplus.py │ │ ├── item_attribute_knn.py │ │ ├── item_msmf.py │ │ ├── item_nsvd1.py │ │ ├── itemknn.py │ │ ├── matrixfactorization.py │ │ ├── most_popular.py │ │ ├── nnmf.py │ │ ├── random_rec.py │ │ ├── svd.py │ │ ├── svdplusplus.py │ │ ├── user_attribute_knn.py │ │ ├── user_nsvd1.py │ │ └── userknn.py └── utils │ ├── __init__.py │ ├── cross_validation.py │ ├── extra_functions.py │ ├── process_data.py │ └── split_database.py ├── examples ├── __init__.py ├── ranking_content_based.py ├── ranking_knn.py ├── ranking_mp.py ├── ranking_others.py ├── ranking_rating_based_algorithm.py ├── rating_prediction_knn.py └── rating_prediction_mf.py ├── requirements.txt ├── setup.cfg └── setup.py /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | 4 | # Custom for Visual Studio 5 | *.cs diff=csharp 6 | 7 | # Standard to msysgit 8 | *.doc diff=astextplain 9 | *.DOC diff=astextplain 10 | *.docx diff=astextplain 11 | *.DOCX diff=astextplain 12 | *.dot diff=astextplain 13 | *.DOT diff=astextplain 14 | *.pdf diff=astextplain 15 | *.PDF diff=astextplain 16 | *.rtf diff=astextplain 17 | *.RTF diff=astextplain 18 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Temporary and binary files 2 | *~ 3 | *.py[cod] 4 | *.so 5 | *.cfg 6 | !.isort.cfg 7 | !setup.cfg 8 | *.orig 9 | *.log 10 | *.pot 11 | __pycache__/* 12 | .cache/* 13 | .*.swp 14 | */.ipynb_checkpoints/* 15 | *.csv 16 | logs/ 17 | data/ 18 | *.zip 19 | 20 | # Project files 21 | .ropeproject 22 | .project 23 | .pydevproject 24 | .settings 25 | .idea 26 | tags 27 | 28 | # Package files 29 | *.egg 30 | *.eggs/ 31 | .installed.cfg 32 | *.egg-info 33 | 34 | # Unittest and coverage 35 | htmlcov/* 36 | .coverage 37 | .tox 38 | junit.xml 39 | coverage.xml 40 | .pytest_cache/ 41 | 42 | # Build and docs folder/files 43 | build/* 44 | dist/* 45 | sdist/* 46 | docs/api/* 47 | docs/_rst/* 48 | docs/_build/* 49 | cover/* 50 | MANIFEST 51 | 52 | # Per-project virtualenvs 53 | .venv*/ 54 | 55 | # Ignoring jupyter notebooks 56 | *.ipynb 57 | 58 | # Ignoring models 59 | *.hdf5 60 | 61 | # Ignoring audio files 62 | *.wav 63 | *.mp3 64 | 65 | # Ignoring pickle files 66 | *.pk 67 | 68 | # Ignoring bin and arpa files 69 | *.bin 70 | *.arpa 71 | 
-------------------------------------------------------------------------------- /COPYING: -------------------------------------------------------------------------------- 1 | © 2019. Case Recommender All Rights Reserved 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Case Recommender - A Python Framework for RecSys 2 | 3 | [![PyPI version](https://badge.fury.io/py/CaseRecommender.svg)](https://badge.fury.io/py/CaseRecommender) 4 | [![Python 3.6](https://img.shields.io/badge/python-3.6-blue.svg)](https://www.python.org/downloads/release/python-360/) 5 | [![GitHub license](https://img.shields.io/github/license/caserec/CaseRecommender.svg)](https://github.com/caserec/CaseRecommender/blob/master/COPYING) 6 | 7 | Case Recommender is a Python implementation of a number of popular recommendation algorithms for both implicit and explicit feedback. The framework aims to provide a rich set of components from which you can construct a customized recommender system from a set of algorithms. Case Recommender has different types of item recommendation and rating prediction approaches, as well as different validation and evaluation metrics. 
8 | 9 | # Algorithms 10 | 11 | Item Recommendation: 12 | 13 | - BPRMF 14 | 15 | - ItemKNN 16 | 17 | - Item Attribute KNN 18 | 19 | - UserKNN 20 | 21 | - User Attribute KNN 22 | 23 | - Group-based (Clustering-based algorithm) 24 | 25 | - Paco Recommender (Co-Clustering-based algorithm) 26 | 27 | - Most Popular 28 | 29 | - Random 30 | 31 | - Content Based 32 | 33 | Rating Prediction: 34 | 35 | - Matrix Factorization (with and without baseline) 36 | 37 | - Non-negative Matrix Factorization 38 | 39 | - SVD 40 | 41 | - SVD++ 42 | 43 | - ItemKNN 44 | 45 | - Item Attribute KNN 46 | 47 | - UserKNN 48 | 49 | - User Attribute KNN 50 | 51 | - Item NSVD1 (with and without Batch) 52 | 53 | - User NSVD1 (with and without Batch) 54 | 55 | - Most Popular 56 | 57 | - Random 58 | 59 | - gSVD++ 60 | 61 | - Item-MSMF 62 | 63 | - (E) CoRec 64 | 65 | Clustering: 66 | 67 | - PaCo: EntroPy Anomalies in Co-Clustering 68 | 69 | - k-medoids 70 | 71 | # Evaluation and Validation Metrics 72 | 73 | - All-but-one Protocol 74 | 75 | - k-fold Cross-Validation 76 | 77 | - Item Recommendation: Precision, Recall, NDCG and MAP 78 | 79 | - Rating Prediction: MAE and RMSE 80 | 81 | - Statistical Analysis (T-test and Wilcoxon) 82 | 83 | # Requirements 84 | 85 | - Python >= 3 86 | - scipy 87 | - numpy 88 | - pandas 89 | - scikit-learn 90 | 91 | For Linux and macOS use: 92 | 93 | $ pip install -r requirements.txt 94 | 95 | For Windows, prebuilt packages are available at: 96 | 97 | http://www.lfd.uci.edu/~gohlke/pythonlibs/ 98 | 99 | # Installation 100 | 101 | Case Recommender can be installed using pip: 102 | 103 | $ pip install caserecommender 104 | 105 | If you want to run the latest version of the code, you can install from git: 106 | 107 | $ pip install -U git+git://github.com/caserec/CaseRecommender.git 108 | 109 | # Quick Start and Guide 110 | 111 | For more information about Case Recommender and its documentation, visit the Case Recommender [Wiki](https://github.com/caserec/CaseRecommender/wiki). If you have not used Case Recommender before, do check out the Getting Started guide. 112 | 113 | # Usage 114 | 115 | Divide Database (k-fold Cross-Validation) 116 | 117 | >> from caserec.utils.split_database import SplitDatabase 118 | >> SplitDatabase(input_file=dataset, dir_folds=dir_path, n_splits=10).k_fold_cross_validation() 119 | 120 | Run Item Recommendation Algorithm (E.g: ItemKNN) 121 | 122 | >> from caserec.recommenders.item_recommendation.itemknn import ItemKNN 123 | >> ItemKNN(train_file, test_file).compute() 124 | 125 | Run Rating Prediction Algorithm (E.g: ItemKNN) 126 | 127 | >> from caserec.recommenders.rating_prediction.itemknn import ItemKNN 128 | >> ItemKNN(train_file, test_file).compute() 129 | 130 | Evaluate Ranking (Prec@N, Recall@N, NDCG@N, MAP@N and overall MAP) 131 | 132 | >> from caserec.evaluation.item_recommendation import ItemRecommendationEvaluation 133 | >> ItemRecommendationEvaluation().evaluate_with_files(predictions_file, test_file) 134 | 135 | Evaluate Rating Predictions (MAE and RMSE) 136 | 137 | >> from caserec.evaluation.rating_prediction import RatingPredictionEvaluation 138 | >> RatingPredictionEvaluation().evaluate_with_files(predictions_file, test_file) 139 | 140 | # Input 141 | 142 | The input files have to be placed in the corresponding subdirectory and must be in CSV format with at least 143 | 3 columns (user, item, feedback). Example: user_1,item_1,feedback 144 | 145 | # Cite us 146 | 147 | If you use Case Recommender in a scientific publication, we would appreciate citations of our paper where this framework was first mentioned and used. 
148 | 149 | To cite Case Recommender use: Arthur da Costa, Eduardo Fressato, Fernando Neto, Marcelo Manzato, and Ricardo Campello. 2018. Case Recommender: A Flexible and Extensible Python Framework for Recommender Systems. In Proceedings of the 12th ACM Conference on Recommender Systems (RecSys '18). ACM, New York, NY, USA, 494-495. DOI: https://doi.org/10.1145/3240323.3241611. 150 | 151 | For TeX/LaTeX (BibTeX): 152 | 153 | @inproceedings{daCosta:2018:CRF:3240323.3241611, 154 | author = {da Costa, Arthur and Fressato, Eduardo and Neto, Fernando and Manzato, Marcelo and Campello, Ricardo}, 155 | title = {Case Recommender: A Flexible and Extensible Python Framework for Recommender Systems}, 156 | booktitle = {Proceedings of the 12th ACM Conference on Recommender Systems}, 157 | series = {RecSys '18}, 158 | year = {2018}, 159 | isbn = {978-1-4503-5901-6}, 160 | location = {Vancouver, British Columbia, Canada}, 161 | pages = {494--495}, 162 | numpages = {2}, 163 | url = {http://doi.acm.org/10.1145/3240323.3241611}, 164 | doi = {10.1145/3240323.3241611}, 165 | acmid = {3241611}, 166 | publisher = {ACM}, 167 | address = {New York, NY, USA}, 168 | keywords = {framework, python, recommender systems}, 169 | } 170 | 171 | # Help CaseRecommender 172 | 173 | To contribute to the project, follow these steps: 174 | 175 | - Fork CaseRecommender 176 | 177 | - Create a topic branch - git checkout -b my_branch 178 | 179 | - Make your alterations and commit 180 | 181 | - Push to your branch - git push origin my_branch 182 | 183 | - Create a Pull Request from your branch. 184 | 185 | - You just contributed to the CaseRecommender project! 186 | 187 | For bugs or feedback use this link: https://github.com/caserec/CaseRecommender/issues 188 | 189 | # License (MIT) 190 | 191 | © 2019. Case Recommender All Rights Reserved 192 | 193 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated 194 | documentation files (the "Software"), to deal in the Software without restriction, including without limitation the 195 | rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to 196 | permit persons to whom the Software is furnished to do so, subject to the following conditions: 197 | 198 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of 199 | the Software. 200 | 201 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO 202 | THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 203 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 204 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 205 | IN THE SOFTWARE. 206 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | Case Recommender - A Python Framework for RecSys 2 | =================================================== 3 | 4 | Case Recommender is a Python implementation of a number of popular recommendation algorithms for both implicit and 5 | explicit feedback. The framework aims to provide a rich set of components from which you can construct a customized 6 | recommender system from a set of algorithms. 
Case Recommender has different types of item recommendation and rating 7 | prediction approaches, as well as different validation and evaluation metrics. 8 | 9 | Algorithms 10 | ^^^^^^^^^^^^ 11 | 12 | Item Recommendation: 13 | 14 | - BPRMF 15 | 16 | - ItemKNN 17 | 18 | - Item Attribute KNN 19 | 20 | - UserKNN 21 | 22 | - User Attribute KNN 23 | 24 | - Group-based (Clustering-based algorithm) 25 | 26 | - Paco Recommender (Co-Clustering-based algorithm) 27 | 28 | - Most Popular 29 | 30 | - Random 31 | 32 | - Content Based 33 | 34 | Rating Prediction: 35 | 36 | - Matrix Factorization (with and without baseline) 37 | 38 | - SVD 39 | 40 | - Non-negative Matrix Factorization 41 | 42 | - SVD++ 43 | 44 | - ItemKNN 45 | 46 | - Item Attribute KNN 47 | 48 | - UserKNN 49 | 50 | - User Attribute KNN 51 | 52 | - Item NSVD1 (with and without Batch) 53 | 54 | - User NSVD1 (with and without Batch) 55 | 56 | - Most Popular 57 | 58 | - Random 59 | 60 | - gSVD++ 61 | 62 | - Item-MSMF 63 | 64 | - (E)CoRec 65 | 66 | Clustering: 67 | 68 | - PaCo: EntroPy Anomalies in Co-Clustering 69 | 70 | - k-medoids 71 | 72 | Evaluation and Validation Metrics 73 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 74 | 75 | - All-but-one Protocol 76 | 77 | - k-fold Cross-Validation 78 | 79 | - Item Recommendation: Precision, Recall, NDCG and MAP 80 | 81 | - Rating Prediction: MAE and RMSE 82 | 83 | - Statistical Analysis (T-test and Wilcoxon) 84 | 85 | Requirements 86 | ^^^^^^^^^^^^^ 87 | 88 | - Python >= 3 89 | - scipy 90 | - numpy 91 | - pandas 92 | - scikit-learn 93 | 94 | For Linux, Windows and macOS use: 95 | 96 | $ pip install -r requirements.txt 97 | 98 | For help with Windows libraries, see: 99 | 100 | http://www.lfd.uci.edu/~gohlke/pythonlibs/ 101 | 102 | Quick Start and Guide 103 | ^^^^^^^^^^^^^^^^^^^^^^ 104 | 105 | For more information about Case Recommender and its documentation, 106 | visit the Case Recommender 107 | `Wiki <https://github.com/caserec/CaseRecommender/wiki>`_. If you have not used Case Recommender before, do check out the Getting Started guide. 108 | 109 | 110 | Installation 111 | ^^^^^^^^^^^^^ 112 | 113 | Case Recommender can be installed using pip: 114 | 115 | $ pip install caserecommender 116 | 117 | If you want to run the latest version of the code, you can install from git: 118 | 119 | $ pip install -U git+git://github.com/caserec/CaseRecommender.git 120 | 121 | More Details 122 | ^^^^^^^^^^^^^ 123 | 124 | `https://github.com/caserec/CaseRecommender <https://github.com/caserec/CaseRecommender>`_ 125 | 126 | 127 | License (MIT) 128 | ^^^^^^^^^^^^^^ 129 | 130 | © 2019. Case Recommender All Rights Reserved 131 | 132 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated 133 | documentation files (the "Software"), to deal in the Software without restriction, including without limitation the 134 | rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to 135 | permit persons to whom the Software is furnished to do so, subject to the following conditions: 136 | 137 | The above copyright notice and this permission notice shall be included in all copies or substantial portions 138 | of the Software. 139 | 140 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED 141 | TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL 142 | THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 143 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 144 | DEALINGS IN THE SOFTWARE. 145 | -------------------------------------------------------------------------------- /_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-cayman -------------------------------------------------------------------------------- /caserec/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/caserec/CaseRecommender/779bc5dd91ff704ce60c0f3fafd07e2eba689dd7/caserec/__init__.py -------------------------------------------------------------------------------- /caserec/clustering/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Arthur' 2 | -------------------------------------------------------------------------------- /caserec/clustering/kmedoids.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """" 3 | K-medoids Clustering Algorithm 4 | [Clustering Algorithm] 5 | 6 | Literature: 7 | H.S. Park, C.H. Jun: 8 | A simple and fast algorithm for K-medoids clustering 9 | Expert Systems with Applications, 36, (2) (2009), 3336–3341. 10 | 11 | """ 12 | 13 | # © 2019. Case Recommender (MIT License) 14 | 15 | import numpy as np 16 | 17 | __author__ = 'Arthur Fortes ' 18 | 19 | 20 | def kmedoids(distance_matrix, k, max_interactions=10000, random_seed=None): 21 | """ 22 | k-medoids 23 | 24 | Usage:: 25 | 26 | >> sm, c = kmedoids(distance_matrix, k=3) 27 | 28 | The k-medoids algorithm is a clustering algorithm related to the k-means algorithm and the medoidshift algorithm. 29 | Both the k-means and k-medoids algorithms are partitional (breaking the dataset up into groups) and both attempt to 30 | minimize the distance between points labeled to be in a cluster and a point designated as the center of that 31 | cluster. In contrast to the k-means algorithm, k-medoids chooses datapoints as centers (medoids or exemplars) 32 | and works with a generalization of the Manhattan norm to define distance between datapoints instead of the 33 | squared Euclidean distance. This method was proposed in 1987 [1] for work with the L1 norm and other distances. 34 | 35 | k-medoids is a classical partitioning technique of clustering that clusters the data set of n objects into k 36 | clusters, with k known a priori. A useful tool for determining k is the silhouette. It is more robust to noise and outliers 37 | as compared to k-means because it minimizes a sum of pairwise dissimilarities instead of a sum of squared 38 | Euclidean distances. 39 | 40 | A medoid can be defined as the object of a cluster whose average dissimilarity to all the objects in the cluster 41 | is minimal, i.e., it is the most centrally located point in the cluster. 
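    A fuller usage sketch (the data and distance metric below are illustrative, not part of the API)::

        >> import numpy as np
        >> from scipy.spatial.distance import pdist, squareform
        >> from caserec.clustering.kmedoids import kmedoids
        >> points = np.random.rand(20, 5)                              # 20 instances, 5 features
        >> distance_matrix = squareform(pdist(points, 'euclidean'))    # symmetric pairwise distances
        >> medoids, clusters = kmedoids(distance_matrix, k=3, random_seed=42)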
42 | 43 | :param distance_matrix: Matrix with distances between the instances 44 | :type distance_matrix: matrix 45 | 46 | :param k: Number of groups to be generated 47 | :type k: int 48 | 49 | :param max_interactions: Maximum number of iterations to converge 50 | :type max_interactions: int, default 10000 51 | 52 | :param random_seed: Random seed for reproducibility 53 | :type random_seed: int, default None 54 | 55 | :return: Medoid indices and a dict mapping each cluster label to the indices of its instances 56 | 57 | """ 58 | 59 | # Set seed in random 60 | if random_seed is not None: 61 | np.random.seed(random_seed) 62 | 63 | # determine dimensions of distance matrix 64 | row, col = distance_matrix.shape 65 | 66 | if k > col: 67 | raise Exception("Error:: Too many medoids") 68 | 69 | # randomly initialize an array of k-medoid indices 70 | support_matrix = np.arange(col) 71 | np.random.shuffle(support_matrix) 72 | support_matrix = np.sort(support_matrix[:k]) 73 | 74 | # create a copy of the array of medoid indices 75 | new_support_matrix = np.copy(support_matrix) 76 | 77 | # initialize a dictionary to represent clusters 78 | clusters = {} 79 | 80 | for _ in range(max_interactions): 81 | # determine clusters, i.e. arrays of data indices 82 | j_vector = np.argmin(distance_matrix[:, support_matrix], axis=1) 83 | for label in range(k): 84 | clusters[label] = np.where(j_vector == label)[0] 85 | 86 | # update cluster medoids 87 | for label in range(k): 88 | j_vector = np.mean(distance_matrix[np.ix_( 89 | clusters[label], clusters[label])], axis=1) 90 | try: 91 | j = np.argmin(j_vector) 92 | new_support_matrix[label] = clusters[label][j] 93 | except ValueError: 94 | pass 95 | new_support_matrix.sort() # sort in place (a bare np.sort(...) call would discard its result) 96 | 97 | # check for convergence 98 | if np.array_equal(support_matrix, new_support_matrix): 99 | break 100 | support_matrix = np.copy(new_support_matrix) 101 | 102 | else: 103 | # for-else: only runs if the loop exhausts max_interactions without converging (no break); 104 | # final update of cluster memberships 104 | j_vector = np.argmin(distance_matrix[:, support_matrix], axis=1) 105 | for label in range(k): 106 | clusters[label] = np.where(j_vector == label)[0] 107 | 108 | remove_keys = set() 109 | for key in clusters: 110 | if len(clusters[key]) == 0: 111 | remove_keys.add(key) 112 | 113 | if remove_keys: 114 | for key in remove_keys: 115 | clusters.pop(key, None) 116 | 117 | # return results 118 | return support_matrix, clusters 119 | -------------------------------------------------------------------------------- /caserec/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Arthur' 2 | -------------------------------------------------------------------------------- /caserec/evaluation/base_evaluation.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """" 3 | This class is the base for evaluation strategies 4 | 5 | Types of evaluation: 6 | - Simple: Evaluation with traditional strategy 7 | - All-but-one Protocol: Considers only one pair (u, i) from the test set to evaluate the ranking 8 | 9 | """ 10 | 11 | # © 2019. 
Case Recommender (MIT License) 12 | 13 | from collections import defaultdict 14 | 15 | from caserec.utils.process_data import ReadFile 16 | 17 | __author__ = 'Arthur Fortes ' 18 | 19 | 20 | class BaseEvaluation(object): 21 | def __init__(self, sep='\t', metrics=None, all_but_one_eval=False, verbose=True, as_table=False, table_sep='\t', save_eval_file=None): 22 | """ 23 | Base class for evaluation strategies 24 | 25 | :param sep: Delimiter for input files 26 | :type sep: str, default '\t' 27 | 28 | :param metrics: List of evaluation metrics 29 | :type metrics: list, default None 30 | 31 | :param all_but_one_eval: If True, considers only one pair (u, i) from the test set to evaluate the ranking 32 | :type all_but_one_eval: bool, default False 33 | 34 | :param verbose: Print the evaluation results 35 | :type verbose: bool, default True 36 | 37 | :param as_table: Print the evaluation results as table (only work with verbose=True) 38 | :type as_table: bool, default False 39 | 40 | :param table_sep: Delimiter for print results (only work with verbose=True and as_table=True) 41 | :type table_sep: str, default '\t' 42 | 
        :param save_eval_file: Optional file in which evaluation results can be saved (not used yet)
        :type save_eval_file: str, default None

43 | """ 44 | self.sep = sep 45 | self.all_but_one_eval = all_but_one_eval 46 | self.metrics = metrics 47 | self.verbose = verbose 48 | self.as_table = as_table 49 | self.table_sep = table_sep 
        self.save_eval_file = save_eval_file
50 | 51 | def evaluate(self, predictions, test_set): 52 | """ 53 | Method to be implemented for each strategy using their respective metrics. 54 | Use read() in ReadFile to transform your file in a dict 55 | 56 | :param predictions: Dictionary with ranking information 57 | :type predictions: dict 58 | 59 | :param test_set: Dictionary with test set information. 60 | :type test_set: dict 61 | 62 | """ 63 | raise NotImplementedError 64 | 65 | def evaluate_with_files(self, prediction_file, test_file): 66 | """ 67 | Method to evaluate predictions using files 68 | 69 | :param prediction_file: Predictions file with at least 2 columns for item recommendation 70 | (eg. user item [score (optional)]) and 3 columns for rating prediction (eg. user item rating) 71 | :type prediction_file: str 72 | 73 | :param test_file: Test file 74 | :type test_file: str 75 | 76 | :return: Dictionary with all evaluation metrics and results 77 | :rtype: dict 78 | 79 | """ 80 | 81 | predict = ReadFile(prediction_file, sep=self.sep).read() 82 | test_set = ReadFile(test_file, sep=self.sep).read() 83 | 84 | return self.evaluate(predict['feedback'], test_set) 85 | 86 | def evaluate_recommender(self, predictions, test_set): 87 | """ 88 | Method to evaluate recommender results. This method should be called by item recommender algorithms 89 | 90 | :param predictions: List with recommender output. e.g. [[user, item, score], [user, item2, score] ...] 91 | :type predictions: list 92 | 93 | :param test_set: Dictionary with test set information. 94 | :type test_set: dict 95 | 96 | :return: Dictionary with all evaluation metrics and results 97 | :rtype: dict 98 | 99 | """ 100 | 101 | predictions_dict = {} 102 | 103 | for sample in predictions: 104 | predictions_dict.setdefault(sample[0], {}).update({sample[1]: sample[2]}) 105 | 106 | return self.evaluate(predictions_dict, test_set) 107 | 108 | def evaluate_folds(self, folds_dir, predictions_file_name, test_file_name, k_folds=10): 109 | """ 110 | Evaluate ranking in a set of folds. Fold directories must be named with integers starting at 0, e.g. 111 | a directory '/home/user/folds' containing folds 0, 1, ..., 9. 
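        Example layout for k_folds=10 (file names here are illustrative)::

            /home/user/folds/0/rank.dat
            /home/user/folds/0/test.dat
            ...
            /home/user/folds/9/rank.dat
            /home/user/folds/9/test.dat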
112 | 113 | :param folds_dir: Directory of folds 114 | :type folds_dir: str 115 | 116 | :param k_folds: Number of folds 117 | :type k_folds: int, default 10 118 | 119 | :param predictions_file_name: Name of the ranking file 120 | :type predictions_file_name: str 121 | 122 | :param test_file_name: Name of the test file 123 | :type test_file_name: str 124 | 125 | :return: Dictionary with all evaluation metrics and results 126 | :rtype: dict 127 | 128 | """ 129 | 130 | folds_results = defaultdict() 131 | 132 | for fold in range(k_folds): 133 | predictions_file = folds_dir + str(fold) + '/' + predictions_file_name 134 | test_file = folds_dir + str(fold) + '/' + test_file_name 135 | 136 | for key, value in self.evaluate_with_files(predictions_file, test_file).items(): 137 | folds_results[key] = folds_results.get(key, 0) + value 138 | 139 | folds_results = {k: round(v / k_folds, 6) for k, v in folds_results.items()} 140 | 141 | if self.verbose: 142 | self.print_results(folds_results) 143 | 144 | return folds_results 145 | 146 | def print_results(self, evaluation_results, save_eval_file=None): 147 | """ 148 | Method to print the results 149 | 150 | :param evaluation_results: Dictionary with results. e.g. {metric: value} 151 | :type evaluation_results: dict 152 | 153 | """ 154 | 155 | if self.as_table: 156 | header = '' 157 | values = '' 158 | for metric in self.metrics: 159 | header += metric.upper() + self.table_sep 160 | values += str(evaluation_results[metric.upper()]) + self.table_sep 161 | print(header) 162 | print(values) 163 | 164 | else: 165 | evaluation = 'Eval:: ' 166 | for metrics in self.metrics: 167 | evaluation += metrics.upper() + ': ' + str(evaluation_results[metrics.upper()]) + ' ' 168 | print(evaluation) 169 | -------------------------------------------------------------------------------- /caserec/evaluation/item_recomendation_functions.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """" 3 | These functions are responsible for evaluating item recommendation algorithms (rankings). 4 | 5 | They are used by evaluation/item_recommendation.py 6 | 7 | """ 8 | 9 | # © 2019. Case Recommender (MIT License) 10 | 11 | import numpy as np 12 | 13 | __author__ = 'Arthur Fortes ' 14 | 15 | 16 | def precision_at_k(ranking, k): 17 | """ 18 | Score is precision @ k 19 | Relevance is binary (nonzero is relevant). 20 | 21 | :param ranking: Relevance scores (list or numpy) in rank order (first element is the first item) 22 | :type ranking: list, np.array 23 | 24 | :param k: Number of top positions to consider 25 | :type k: int 26 | 27 | :return: Precision @ k 28 | :rtype: float 29 | 30 | """ 31 | 32 | assert k >= 1 33 | ranking = np.asarray(ranking)[:k] != 0 34 | if ranking.size != k: 35 | raise ValueError('Relevance score length < k') 36 | return np.mean(ranking) 37 | 38 | 39 | def average_precision(ranking): 40 | """ 41 | Score is average precision (area under PR curve). Relevance is binary (nonzero is relevant). 42 | 43 | :param ranking: Relevance scores (list or numpy) in rank order (first element is the first item) 44 | :type ranking: list, np.array 45 | 46 | :return: Average precision 47 | :rtype: float 48 | 49 | """ 50 | 51 | ranking = np.asarray(ranking) != 0 52 | out = [precision_at_k(ranking, k + 1) for k in range(ranking.size) if ranking[k]] 53 | if not out: 54 | return 0. 55 | return np.mean(out) 56 | 57 | 58 | def mean_average_precision(ranking): 59 | """ 60 | Score is mean average precision. Relevance is binary (nonzero is relevant). 
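    A small worked example, following precision_at_k() and average_precision() above
    (one relevance vector per user)::

        >> mean_average_precision([[1, 0, 1], [0, 1, 0]])
        # user 1: AP = (1/1 + 2/3) / 2 = 5/6; user 2: AP = 1/2
        # MAP = (5/6 + 1/2) / 2 = 2/3 ≈ 0.6667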
61 | 62 | :param ranking: List of relevance-score vectors (one per user), each in rank order (first element is the top item) 63 | :type ranking: list, np.array 64 | 65 | :return: Mean average precision 66 | :rtype: float 67 | """ 68 | 69 | return np.mean([average_precision(r) for r in ranking]) 70 | 71 | 72 | def ndcg_at_k(ranking): 73 | """ 74 | Score is normalized discounted cumulative gain (ndcg). Relevance is positive real values. Binary relevance 75 | can also be used, as in the previous methods. 76 | 77 | :param ranking: Ranking to evaluate in DCG format, e.g. [0, 0, 1], where 1 marks a relevant item 78 | :type ranking: list 79 | 80 | :return: Normalized discounted cumulative gain 81 | :rtype: float 82 | 83 | """ 84 | 85 | ranking = np.asfarray(ranking) 86 | r_ideal = np.asfarray(sorted(ranking, reverse=True)) 87 | dcg_ideal = r_ideal[0] + np.sum(r_ideal[1:] / np.log2(np.arange(2, r_ideal.size + 1))) 88 | dcg_ranking = ranking[0] + np.sum(ranking[1:] / np.log2(np.arange(2, ranking.size + 1))) 89 | 90 | return dcg_ranking / dcg_ideal 91 | -------------------------------------------------------------------------------- /caserec/evaluation/item_recommendation.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """" 3 | This class is responsible for evaluate item recommendation algorithms (rankings). 4 | 5 | This file contains item recommendation evaluation metrics: 6 | - Mean average precision - MAP 7 | - Precision 8 | - Recall 9 | - Normalized Discounted Cumulative Gain - NDCG 10 | 11 | Types of evaluation: 12 | - Simple: Evaluation with traditional strategy 13 | - All-but-one Protocol: Considers only one pair (u, i) from the test set to evaluate the ranking 14 | 15 | """ 16 | 17 | # © 2019. Case Recommender (MIT License) 18 | 19 | import numpy as np 20 | import random 21 | 22 | from caserec.evaluation.base_evaluation import BaseEvaluation 23 | from caserec.evaluation.item_recomendation_functions import precision_at_k, mean_average_precision, ndcg_at_k 24 | 25 | __author__ = 'Arthur Fortes ' 26 | 27 | 28 | class ItemRecommendationEvaluation(BaseEvaluation): 29 | def __init__(self, sep='\t', n_ranks=list([1, 3, 5, 10]), 30 | metrics=list(['PREC', 'RECALL', 'MAP', 'NDCG']), all_but_one_eval=False, 31 | verbose=True, as_table=False, table_sep='\t'): 32 | """ 33 | Class to evaluate predictions in a item recommendation (ranking) scenario 34 | 35 | :param sep: Delimiter for input files 36 | :type sep: str, default '\t' 37 | 38 | :param n_ranks: List of positions to evaluate the ranking 39 | :type n_ranks: list, default [1, 3, 5, 10] 40 | 41 | :param metrics: List of evaluation metrics 42 | :type metrics: list, default ('PREC', 'RECALL', 'MAP', 'NDCG') 43 | 44 | :param all_but_one_eval: If True, considers only one pair (u, i) from the test set to evaluate the ranking 45 | :type all_but_one_eval: bool, default False 46 | 47 | :param verbose: Print the evaluation results 48 | :type verbose: bool, default True 49 | 50 | :param as_table: Print the evaluation results as table (only work with verbose=True) 51 | :type as_table: bool, default False 52 | 53 | :param table_sep: Delimiter for print results (only work with verbose=True and as_table=True) 54 | :type table_sep: str, default '\t' 55 | 56 | """ 57 | 58 | if isinstance(metrics, list): 59 | metrics = [m + '@' + str(n) for m in metrics for n in n_ranks] 60 | super(ItemRecommendationEvaluation, self).__init__(sep=sep, metrics=metrics, all_but_one_eval=all_but_one_eval, 61 | verbose=verbose, as_table=as_table, table_sep=table_sep) 62 | 63 | 
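        # n_ranks is kept so evaluate() can cut each ranking at every position n
        # (the metric names above were already expanded to 'PREC@1', ..., 'NDCG@10')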
self.n_ranks = n_ranks 64 | 65 | def evaluate(self, predictions, test_set): 66 | """ 67 | Method to calculate all the metrics for item recommendation scenario using dictionaries of ranking 68 | and test set. Use read() in ReadFile to transform your file in a dict 69 | 70 | :param predictions: Dictionary with ranking information 71 | :type predictions: dict 72 | 73 | :param test_set: Dictionary with test set information. 74 | :type test_set: dict 75 | 76 | :return: Dictionary with all evaluation metrics and results 77 | :rtype: dict 78 | 79 | """ 80 | 81 | eval_results = {} 82 | num_user = len(test_set['users']) 83 | partial_map_all = None 84 | 85 | if self.all_but_one_eval: 86 | for user in test_set['users']: 87 | # select a random item 88 | test_set['items_seen_by_user'][user] = [random.choice(test_set['items_seen_by_user'].get(user, [-1]))] 89 | 90 | for i, n in enumerate(self.n_ranks): 91 | if n < 1: 92 | raise ValueError('Error: N must be >= 1.') 93 | 94 | partial_precision = list() 95 | partial_recall = list() 96 | partial_ndcg = list() 97 | partial_map = list() 98 | 99 | for user in test_set['users']: 100 | hit_cont = 0 101 | # Generate user intersection list between the recommended items and test. 102 | list_feedback = set(list(predictions.get(user, []))[:n]) 103 | intersection = list(list_feedback.intersection(test_set['items_seen_by_user'].get(user, []))) 104 | 105 | if len(intersection) > 0: 106 | ig_ranking = np.zeros(n) 107 | for item in intersection: 108 | hit_cont += 1 109 | ig_ranking[list(predictions[user]).index(item)] = 1 110 | 111 | partial_precision.append(precision_at_k([ig_ranking], n)) 112 | partial_recall.append((float(len(intersection)) / float(len(test_set['items_seen_by_user'][user])))) 113 | partial_map.append(mean_average_precision([ig_ranking])) 114 | partial_ndcg.append(ndcg_at_k(list(ig_ranking))) 115 | 116 | partial_map_all = partial_map 117 | 118 | # create a dictionary with final results 119 | eval_results.update({ 120 | 'PREC@' + str(n): round(sum(partial_precision) / float(num_user), 6), 121 | 'RECALL@' + str(n): round(sum(partial_recall) / float(num_user), 6), 122 | 'NDCG@' + str(n): round(sum(partial_ndcg) / float(num_user), 6), 123 | 'MAP@' + str(n): round(sum(partial_map) / float(num_user), 6), 124 | 'MAP': round(sum(partial_map_all) / float(num_user), 6) 125 | 126 | }) 127 | 128 | # if (self.save_eval_file is not None): 129 | # # Saving evaluations to a file 130 | # from caserec.utils.process_data import WriteFile 131 | 132 | # WriteFile(output_file=save_eval_file, data=) 133 | 134 | if self.verbose: 135 | self.print_results(eval_results) 136 | 137 | return eval_results 138 | -------------------------------------------------------------------------------- /caserec/evaluation/rating_prediction.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """" 3 | This class is responsible for evaluating rating prediction algorithms. 
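For the n (user, item) pairs present in both the predictions and the test set, the metrics computed below are:

    MAE  = (1/n) * sum(|r_pred - r_true|)
    RMSE = sqrt((1/n) * sum((r_pred - r_true)^2))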
4 | 5 | This file contains rating prediction evaluation metrics: 6 | - Mean Absolute Error - MAE 7 | - Root Mean Squared Error - RMSE 8 | 9 | Types of evaluation: 10 | - Simple: Evaluation with traditional strategy 11 | - All-but-one Protocol: Considers only one pair (u, i) from the test set to evaluate the predictions 12 | 13 | """ 14 | 15 | from sklearn.metrics import mean_absolute_error, mean_squared_error 16 | import numpy as np 17 | import random 18 | 19 | from caserec.evaluation.item_recommendation import ItemRecommendationEvaluation 20 | from caserec.evaluation.base_evaluation import BaseEvaluation 21 | 22 | __author__ = 'Arthur Fortes ' 23 | 24 | 25 | class RatingPredictionEvaluation(BaseEvaluation): 26 | def __init__(self, sep='\t', metrics=list(['MAE', 'RMSE']), all_but_one_eval=False, verbose=True, as_table=False, 27 | table_sep='\t', as_rank=False, n_rank=(5, 10)): 28 | """ 29 | Class to evaluate predictions in a rating prediction scenario 30 | 31 | :param sep: Delimiter for input files 32 | :type sep: str, default '\t' 33 | 34 | :param metrics: List of evaluation metrics 35 | :type metrics: list, default ('MAE', 'RMSE') 36 | 37 | :param all_but_one_eval: If True, considers only one pair (u, i) from the test set to evaluate the ranking 38 | :type all_but_one_eval: bool, default False 39 | 40 | :param verbose: Print the evaluation results 41 | :type verbose: bool, default True 42 | 43 | :param as_table: Print the evaluation results as table (only work with verbose=True) 44 | :type as_table: bool, default False 45 | 46 | :param table_sep: Delimiter for print results (only work with verbose=True and as_table=True) 47 | :type table_sep: str, default '\t' 48 | 49 | :param as_rank: If True, evaluate as a ranking task (ratings above 3 are treated as relevant). 50 | :type as_rank: bool, default False 51 | 
        :param n_rank: List of positions to evaluate the ranking (only used when as_rank=True)
        :type n_rank: tuple, default (5, 10)

52 | """ 53 | 54 | super(RatingPredictionEvaluation, self).__init__(sep=sep, metrics=metrics, all_but_one_eval=all_but_one_eval, 55 | verbose=verbose, as_table=as_table, table_sep=table_sep) 56 | self.as_rank = as_rank 57 | self.n_rank = n_rank 58 | 59 | def evaluate(self, predictions, test_set): 60 | """ 61 | Method to calculate all the metrics for the rating prediction scenario using dictionaries of predictions 62 | and test set. Use read() in ReadFile to transform your prediction and test files in a dict 63 | 64 | :param predictions: Dict of predictions 65 | :type predictions: dict 66 | 67 | :param test_set: Dictionary with test set information. 
68 | :type test_set: dict 69 | 70 | :return: Dictionary with all evaluation metrics and results 71 | :rtype: dict 72 | 73 | """ 74 | 75 | eval_results = {} 76 | predictions_list = [] 77 | test_list = [] 78 | 79 | if not self.as_rank: 80 | # Create All but one set, selecting only one sample from the test set for each user 81 | if self.all_but_one_eval: 82 | for user in test_set['users']: 83 | # select a random item (random.choice needs a sequence, and the feedback entry is a dict) 84 | item = random.choice(list(test_set['feedback'][user])) 85 | test_set['feedback'][user] = {item: test_set['feedback'][user][item]} 86 | 87 | for user in predictions: 88 | for item in predictions[user]: 89 | rui_predict = predictions[user][item] 90 | rui_test = test_set["feedback"].get(user, {}).get(item, np.nan) 91 | if not np.isnan(rui_test): 92 | predictions_list.append(rui_predict) 93 | test_list.append(float(rui_test)) 94 | 95 | eval_results.update({ 96 | 'MAE': round(mean_absolute_error(test_list, predictions_list), 6), 97 | 'RMSE': round(np.sqrt(mean_squared_error(test_list, predictions_list)), 6) 98 | }) 99 | 100 | if self.verbose: 101 | self.print_results(eval_results) 102 | 103 | else: 104 | new_predict_set = [] 105 | new_test_set = {} 106 | 107 | for user in predictions: 108 | partial_predictions = [] 109 | for item in predictions[user]: 110 | # ratings greater than 3 are treated as relevant 111 | if predictions[user][item] > 3: 112 | partial_predictions.append([user, item, predictions[user][item]]) 113 | 114 | if test_set["feedback"].get(user, {}).get(item, 0) > 3: 115 | new_test_set.setdefault(user, []).append(item) 116 | 117 | partial_predictions = sorted(partial_predictions, key=lambda x: -x[2]) 118 | new_predict_set += partial_predictions 119 | 120 | # the dict references itself so it matches the structure ItemRecommendationEvaluation expects 120 | new_test_set['items_seen_by_user'] = new_test_set 121 | new_test_set['users'] = test_set['users'] 122 | 123 | eval_results = ItemRecommendationEvaluation(n_ranks=self.n_rank, 124 | all_but_one_eval=self.all_but_one_eval, 125 | metrics=self.metrics).evaluate_recommender(new_predict_set, new_test_set) 126 | 127 | return eval_results 128 | -------------------------------------------------------------------------------- /caserec/evaluation/statistical_analysis.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """ 3 | This class contains statistical functions for recommender systems. 4 | 5 | - T-test 6 | - Wilcoxon 7 | 8 | """ 9 | 10 | # © 2019. Case Recommender (MIT License) 11 | 12 | from scipy.stats import ttest_ind, ranksums 13 | import numpy as np 14 | 15 | __author__ = 'Arthur Fortes ' 16 | 17 | 18 | class StatisticalAnalysis(object): 19 | def __init__(self, sample1, sample2, method='ttest'): 20 | """ 21 | Class for statistical analysis. It compares two lists of per-fold results and generates a statistical analysis. 
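        A minimal usage sketch (the fold results below are illustrative)::

            >> map_alg1 = [0.42, 0.45, 0.43, 0.44, 0.41]
            >> map_alg2 = [0.39, 0.40, 0.38, 0.41, 0.37]
            >> StatisticalAnalysis(map_alg1, map_alg2, method='ttest').execute()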
22 | 23 | :param sample1: List of results of a recommender 1 in K folds (list with len K) 24 | :type sample1: list 25 | 26 | :param sample2: List of results of a recommender 2 in K folds (list with len K) 27 | :type sample2: list 28 | 29 | :param method: Statistical test to apply: 'ttest' or 'wilcoxon' 30 | :type method: str, default 'ttest' 31 | 32 | """ 33 | self.sample1 = np.array(sample1) 34 | self.sample2 = np.array(sample2) 35 | self.method = method 36 | 37 | def general_analysis(self): 38 | """ 39 | Analyzing the difference 40 | 41 | Computes the difference between the two samples and applies common measures: the sum of absolute differences 42 | (SAD), the sum of squared differences (SSD) and the correlation coefficient. 43 | """ 44 | 45 | print("=== Information About Samples ===") 46 | print("Standard Deviation Sample1: " + str(np.std(self.sample1))) 47 | print("Standard Deviation Sample2: " + str(np.std(self.sample2)) + "\n") 48 | print("=== Analyzing the Difference Between Samples ===") 49 | print("SAD:" + str(np.sum(np.abs(self.sample1 - self.sample2)))) 50 | print("SSD:" + str(np.sum(np.square(self.sample1 - self.sample2)))) 51 | print("Correlation:" + str(np.corrcoef(np.array((self.sample1, self.sample2)))[0, 1]) + "\n") 52 | 53 | def ttest(self): 54 | """ 55 | Student's t-test 56 | 57 | Calculates the T-test for the means of TWO INDEPENDENT samples of scores. 58 | 59 | This is a two-sided test for the null hypothesis that 2 independent samples have identical 60 | average (expected) values 61 | 62 | This test assumes that the populations have identical variances. 63 | """ 64 | 65 | t, p = ttest_ind(self.sample1, self.sample2) 66 | print("=== T- Student Analysis ===") 67 | print("The calculated t-statistic: " + str(t)) 68 | print("The two-tailed p-value: " + str(p) + "\n") 69 | 70 | def wilcoxon(self): 71 | """ 72 | Wilcoxon 73 | 74 | The Wilcoxon rank-sum test tests the null hypothesis that two sets of measurements are drawn from 75 | the same distribution. It is a non-parametric alternative to the two-sample t-test. (Note: scipy's 76 | ranksums, used here, is the rank-sum test for independent samples, not the paired signed-rank test.) 77 | """ 78 | 79 | t, p = ranksums(self.sample1, self.sample2) 80 | print("=== Wilcoxon Analysis ===") 81 | print("The calculated test statistic: " + str(t)) 82 | print("The two-tailed p-value: " + str(p) + "\n") 83 | 84 | def execute(self): 85 | self.general_analysis() 86 | if self.method.lower() == "wilcoxon": 87 | self.wilcoxon() 88 | elif self.method.lower() == "ttest": 89 | self.ttest() 90 | else: 91 | print("Error: Method Invalid!") 92 | -------------------------------------------------------------------------------- /caserec/recommenders/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/caserec/CaseRecommender/779bc5dd91ff704ce60c0f3fafd07e2eba689dd7/caserec/recommenders/__init__.py -------------------------------------------------------------------------------- /caserec/recommenders/item_recommendation/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Arthur' 2 | -------------------------------------------------------------------------------- /caserec/recommenders/item_recommendation/base_item_recommendation.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """" 3 | This class is the base for item recommendation algorithms. 4 | 5 | """ 6 | 7 | # © 2019. 
Case Recommender (MIT License) 8 | 9 | from scipy.spatial.distance import squareform, pdist 10 | import numpy as np 11 | 12 | from caserec.evaluation.item_recommendation import ItemRecommendationEvaluation 13 | from caserec.utils.extra_functions import print_header 14 | from caserec.utils.process_data import ReadFile, WriteFile 15 | 16 | __author__ = 'Arthur Fortes ' 17 | 18 | 19 | class BaseItemRecommendation(object): 20 | def __init__(self, train_file, test_file, output_file=None, as_binary=False, rank_length=10, 21 | similarity_metric="cosine", sep='\t', output_sep='\t'): 22 | """ 23 | This class is the base for all item recommendation algorithms. It implements / adds common 24 | methods and attributes for ranking approaches. 25 | 26 | :param train_file: File which contains the train set. This file needs to have at least 3 columns 27 | (user item feedback_value). 28 | :type train_file: str 29 | 30 | :param test_file: File which contains the test set. This file needs to have at least 3 columns 31 | (user item feedback_value). 32 | :type test_file: str, default None 33 | 34 | :param output_file: File with dir to write the final predictions 35 | :type output_file: str, default None 36 | 37 | :param similarity_metric: Pairwise metric to compute the similarity between users or items 38 | :type similarity_metric: str, default cosine 39 | 40 | :param rank_length: Size of the rank that must be generated by the predictions of the recommender algorithm 41 | :type rank_length: int, default 10 42 | 43 | :param as_binary: If True, the explicit feedback will be transformed to binary 44 | :type as_binary: bool, default False 45 | 46 | :param sep: Delimiter for input files 47 | :type sep: str, default '\t' 48 | 49 | :param output_sep: Delimiter for output file 50 | :type output_sep: str, default '\t' 51 | 52 | """ 53 | 54 | self.train_file = train_file 55 | self.test_file = test_file 56 | self.as_binary = as_binary 57 | self.similarity_metric = similarity_metric 58 | self.output_file = output_file 59 | self.rank_length = rank_length 60 | self.sep = sep 61 | self.output_sep = output_sep 62 | 63 | # internal vars 64 | self.item_to_item_id = {} 65 | self.item_id_to_item = {} 66 | self.user_to_user_id = {} 67 | self.user_id_to_user = {} 68 | self.train_set = None 69 | self.test_set = None 70 | self.users = None 71 | self.items = None 72 | self.matrix = None 73 | self.evaluation_results = None 74 | self.recommender_name = None 75 | self.extra_info_header = None 76 | self.ranking = [] 77 | 78 | def read_files(self): 79 | """ 80 | Method to initialize the recommender algorithm: reads the train (and optional test) files and builds the user/item id mappings. 
81 | 82 | """ 83 | self.train_set = ReadFile(self.train_file, sep=self.sep, as_binary=self.as_binary).read() 84 | 85 | if self.test_file is not None: 86 | self.test_set = ReadFile(self.test_file, sep=self.sep).read() 87 | self.users = sorted(set(list(self.train_set['users']) + list(self.test_set['users']))) 88 | self.items = sorted(set(list(self.train_set['items']) + list(self.test_set['items']))) 89 | else: 90 | self.users = self.train_set['users'] 91 | self.items = self.train_set['items'] 92 | 93 | for i, item in enumerate(self.items): 94 | self.item_to_item_id.update({item: i}) 95 | self.item_id_to_item.update({i: item}) 96 | for u, user in enumerate(self.users): 97 | self.user_to_user_id.update({user: u}) 98 | self.user_id_to_user.update({u: user}) 99 | 100 | def create_matrix(self): 101 | """ 102 | Method to create a feedback matrix 103 | 104 | """ 105 | 106 | self.matrix = np.zeros((len(self.users), len(self.items))) 107 | 108 | for user in self.train_set['users']: 109 | for item in self.train_set['feedback'][user]: 110 | self.matrix[self.user_to_user_id[user]][self.item_to_item_id[item]] = \ 111 | self.train_set['feedback'][user][item] 112 | 113 | def compute_similarity(self, transpose=False): 114 | """ 115 | Method to compute a similarity matrix from the original feedback matrix 116 | 117 | :param transpose: If True, calculate the similarity in the transposed matrix 118 | :type transpose: bool, default False 119 | 120 | """ 121 | 122 | # Calculate distance matrix 123 | if transpose: 124 | similarity_matrix = np.float32(squareform(pdist(self.matrix.T, self.similarity_metric))) 125 | else: 126 | similarity_matrix = np.float32(squareform(pdist(self.matrix, self.similarity_metric))) 127 | 128 | # Remove NaNs 129 | similarity_matrix[np.isnan(similarity_matrix)] = 1.0 130 | # transform distances into similarities: values in the matrix range from 0 to 1 
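        # similarity = (max_d - distance) / max_d, so a distance of 0 maps to similarity 1
        # and the largest observed distance maps to similarity 0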
131 | similarity_matrix = (similarity_matrix.max() - similarity_matrix) / similarity_matrix.max() 132 | 133 | return similarity_matrix 134 | 135 | def evaluate(self, metrics, verbose=True, as_table=False, table_sep='\t', n_ranks=None): 136 | """ 137 | Method to evaluate the final ranking 138 | 139 | :param metrics: List of evaluation metrics 140 | :type metrics: list, default ('Prec', 'Recall', 'MAP', 'NDCG') 141 | 142 | :param verbose: Print the evaluation results 143 | :type verbose: bool, default True 144 | 145 | :param as_table: Print the evaluation results as table 146 | :type as_table: bool, default False 147 | 148 | :param table_sep: Delimiter for print results (only work with verbose=True and as_table=True) 149 | :type table_sep: str, default '\t' 150 | 151 | :param n_ranks: List of positions to evaluate the ranking 152 | :type n_ranks: list, None 153 | 154 | """ 155 | 156 | self.evaluation_results = {} 157 | 158 | if metrics is None: 159 | metrics = list(['PREC', 'RECALL', 'MAP', 'NDCG']) 160 | 161 | if n_ranks is None: 162 | n_ranks = list([1, 3, 5, 10]) 163 | 164 | results = ItemRecommendationEvaluation(verbose=verbose, as_table=as_table, table_sep=table_sep, 165 | metrics=metrics, n_ranks=n_ranks) 166 | 167 | self.evaluation_results = results.evaluate_recommender(predictions=self.ranking, test_set=self.test_set) 168 | 169 | def write_ranking(self): 170 | """ 171 | Method to write final ranking 172 | 173 | """ 174 | 175 | if self.output_file is not None: 176 | WriteFile(self.output_file, data=self.ranking, sep=self.output_sep).write() # use the output delimiter, not the input one 177 | 178 | def compute(self, verbose=True): 179 | """ 180 | Method to run the recommender algorithm 181 | 182 | :param verbose: Print the information about recommender 183 | :type verbose: bool, default True 184 | 185 | """ 186 | 187 | # read files 188 | self.read_files() 189 | 190 | # initialize empty ranking (Don't remove: important to Cross Validation) 191 | self.ranking = [] 192 | 193 | if verbose: 194 | test_info = None 195 | 196 | main_info = { 197 | 'title': 'Item Recommendation > ' + self.recommender_name, 198 | 'n_users': len(self.train_set['users']), 199 | 'n_items': len(self.train_set['items']), 200 | 'n_interactions': self.train_set['number_interactions'], 201 | 'sparsity': self.train_set['sparsity'] 202 | } 203 | 204 | if self.test_file is not None: 205 | test_info = { 206 | 'n_users': len(self.test_set['users']), 207 | 'n_items': len(self.test_set['items']), 208 | 'n_interactions': self.test_set['number_interactions'], 209 | 'sparsity': self.test_set['sparsity'] 210 | } 211 | 212 | print_header(main_info, test_info) 213 | -------------------------------------------------------------------------------- /caserec/recommenders/item_recommendation/content_based.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """" 3 | Content Based Recommender. 4 | 5 | Literature: 6 | Guangyuan Piao and John G. Breslin. 2016. Measuring semantic distance for linked open data-enabled recommender 7 | systems. In Proceedings of the 31st Annual ACM Symposium on Applied Computing (SAC '16). ACM, New York, NY, USA, 8 | 315-320. DOI: https://doi.org/10.1145/2851613.2851839 9 | 10 | """ 11 | 12 | # © 2019. 
Case Recommender (MIT License) 13 | 14 | import numpy as np 15 | 16 | from caserec.recommenders.item_recommendation.base_item_recommendation import BaseItemRecommendation 17 | from caserec.utils.process_data import ReadFile 18 | from caserec.utils.extra_functions import timed 19 | 20 | __author__ = 'Eduardo Fressato ' 21 | 22 | 23 | class ContentBased(BaseItemRecommendation): 24 | def __init__(self, train_file=None, test_file=None, output_file=None, similarity_file=None, similarity_sep='\t', 25 | rank_length=10, as_binary=True, sep='\t', output_sep='\t'): 26 | 27 | """ 28 | Content Based Recommender for Item Recommendation 29 | 30 | Usage:: 31 | 32 | >> ContentBased(train, test, similarity_file=similarity_file).compute() 33 | 34 | :param train_file: File which contains the train set. This file needs to have at least 3 columns 35 | (user item feedback_value). 36 | :type train_file: str 37 | 38 | :param test_file: File which contains the test set. This file needs to have at least 3 columns 39 | (user item feedback_value). 40 | :type test_file: str, default None 41 | 42 | :param output_file: File with dir to write the final predictions 43 | :type output_file: str, default None 44 | 45 | :param similarity_file: File which contains the similarity set. This file needs to have at least 3 columns 46 | (item item similarity). 47 | :type similarity_file: str, default None 48 | 49 | :param rank_length: Size of the rank that must be generated by the predictions of the recommender algorithm 50 | :type rank_length: int, default 10 51 | 
        :param as_binary: If True, the explicit feedback will be transformed to binary
        :type as_binary: bool, default True

52 | :param similarity_sep: Delimiter for similarity or metadata file 53 | :type similarity_sep: str, default '\t' 54 | 55 | :param sep: Delimiter for input files 56 | :type sep: str, default '\t' 57 | 58 | :param output_sep: Delimiter for output file 59 | :type output_sep: str, default '\t' 60 | 61 | """ 62 | 63 | super(ContentBased, self).__init__(train_file=train_file, test_file=test_file, output_file=output_file, 64 | as_binary=as_binary, rank_length=rank_length, sep=sep, output_sep=output_sep) 65 | 66 | self.recommender_name = 'Content Based Algorithm' 67 | 68 | self.similarity_file = similarity_file 69 | self.similarity_sep = similarity_sep 70 | self.si_matrix = None 71 | self.similar_items = None 72 | 73 | self.users_profile = None 74 | 75 | def init_model(self): 76 | """ 77 | Method to initialize the model: create and read the item-item similarity matrix from the similarity file. 
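        A sketch of the expected similarity file layout (tab-separated; item ids and values are illustrative)::

            item_1	item_2	0.83
            item_1	item_3	0.40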
79 | 80 | """ 81 | if self.similarity_file is not None: 82 | similarity = ReadFile(self.similarity_file, sep=self.similarity_sep, as_binary=False 83 | ).read_metadata_or_similarity() 84 | 85 | self.si_matrix = np.zeros((len(self.items), len(self.items))) 86 | 87 | # Fill similarity matrix 88 | for i in similarity['col_1']: 89 | for i_j in similarity['dict'][i]: 90 | self.si_matrix[self.item_to_item_id[i], self.item_to_item_id[int(i_j)]] = similarity['dict'][i][i_j] 91 | 92 | # Remove NaNs 93 | self.si_matrix[np.isnan(self.si_matrix)] = 0.0 94 | 95 | else: 96 | raise ValueError("This algorithm needs a similarity matrix file!") 97 | 98 | def create_user_profile(self): 99 | """Build each user's profile as the set of items seen in the train set.""" 99 | self.users_profile = self.train_set['items_seen_by_user'] 100 | 101 | def predict(self): 102 | for u in self.train_set['users']: 103 | self.ranking += self.predict_user_rank(u) 104 | 105 | def predict_user_rank(self, user): 106 | unseen_items = set(self.items).difference(self.users_profile[user]) 107 | 108 | list_scores = [] 109 | for i in unseen_items: 110 | list_scores.append(self.predict_item_score(user, i)) 111 | 112 | return sorted(list_scores, key=lambda x: -x[2])[:self.rank_length] 113 | 114 | def predict_item_score(self, user, item): 115 | sum_sim = 0 116 | for i in self.users_profile[user]: 117 | sum_sim += self.si_matrix[self.item_to_item_id[item]][self.item_to_item_id[i]] 118 | 119 | return [user, item, sum_sim / len(self.users_profile[user])] 120 | 121 | def compute(self, verbose=True, metrics=None, verbose_evaluation=True, as_table=False, table_sep='\t', n_ranks=None): 122 | """ 123 | Extends compute method from BaseItemRecommendation. Method to run recommender algorithm 124 | 125 | :param verbose: Print recommender and database information 126 | :type verbose: bool, default True 127 | 128 | :param metrics: List of evaluation metrics 129 | :type metrics: list, default None 130 | 131 | :param verbose_evaluation: Print the evaluation results 132 | :type verbose_evaluation: bool, default True 133 | 134 | :param as_table: Print the evaluation results as table 135 | :type as_table: bool, default False 136 | 137 | :param table_sep: Delimiter for print results (only work with verbose=True and as_table=True) 138 | :type table_sep: str, default '\t' 139 | 140 | :param n_ranks: List of positions to evaluate the ranking 141 | :type n_ranks: list, None 142 | 143 | """ 144 | 145 | super(ContentBased, self).compute(verbose=verbose) 146 | 147 | if verbose: 148 | print("training_time:: %4f sec" % timed(self.init_model)) 149 | if self.extra_info_header is not None: 150 | print(self.extra_info_header) 151 | 152 | self.create_user_profile() 153 | print("prediction_time:: %4f sec" % timed(self.predict)) 154 | print('\n') 155 | else: 156 | self.init_model() 157 | self.create_user_profile() 158 | self.predict() 159 | 160 | self.write_ranking() 161 | 162 | if self.test_file is not None: 163 | self.evaluate(metrics, verbose_evaluation, as_table=as_table, table_sep=table_sep, n_ranks=n_ranks) 164 | -------------------------------------------------------------------------------- /caserec/recommenders/item_recommendation/ensemble_average.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """" 3 | Ensemble Average 4 | [Item Recommendation (Ranking)] 5 | 6 | Literature: 7 | Arthur Fortes da Costa and Marcelo G. Manzato: 8 | Multimodal Interactions in Recommender Systems: An Ensembling Approach 9 | BRACIS 2014. 
10 | https://ieeexplore.ieee.org/document/6984809/ 11 | 12 | """ 13 | 14 | # © 2019. Case Recommender (MIT License) 15 | 16 | from caserec.recommenders.item_recommendation.base_item_recommendation import BaseItemRecommendation 17 | 18 | __author__ = 'Arthur Fortes ' 19 | 20 | 21 | class EnsembleAverage(BaseItemRecommendation): 22 | """ 23 | Code being refactored, returns in the next version. 24 | 25 | """ 26 | raise NotImplementedError 27 | -------------------------------------------------------------------------------- /caserec/recommenders/item_recommendation/ensemble_bpr.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """" 3 | Ensemble BPR Learning 4 | [Item Recommendation (Ranking)] 5 | 6 | Literature: 7 | Arthur Fortes da Costa and Marcelo G. Manzato: 8 | Ensemble Learning in Recommender Systems: Combining Multiple User Interactions for Ranking Personalization. 9 | WebMedia 2014. 10 | https://dl.acm.org/citation.cfm?id=2664556 11 | 12 | """ 13 | 14 | # © 2019. Case Recommender (MIT License) 15 | 16 | from caserec.recommenders.item_recommendation.base_item_recommendation import BaseItemRecommendation 17 | 18 | __author__ = 'Arthur Fortes ' 19 | 20 | 21 | class EnsembleBPRLearning(BaseItemRecommendation): 22 | """ 23 | Code being refactored, returns in the next version. 24 | 25 | """ 26 | raise NotImplementedError 27 | -------------------------------------------------------------------------------- /caserec/recommenders/item_recommendation/item_attribute_knn.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """" 3 | Item Based Collaborative Filtering Recommender with Attributes (Item Attribute KNN) 4 | [Item Recommendation (Ranking)] 5 | 6 | Its philosophy is as follows: in order to determine the rating of User u on item m, we can find other movies that 7 | are similar to item m, and based on User u's ratings on those similar movies we infer his rating on item m. 8 | However, instead of traditional ItemKNN, this approach uses a metadata or pre-computed similarity matrix. 9 | 10 | """ 11 | 12 | # © 2019. Case Recommender (MIT License) 13 | 14 | from collections import defaultdict 15 | import numpy as np 16 | 17 | from caserec.recommenders.item_recommendation.itemknn import ItemKNN 18 | from caserec.utils.process_data import ReadFile 19 | 20 | __author__ = 'Arthur Fortes ' 21 | 22 | 23 | class ItemAttributeKNN(ItemKNN): 24 | def __init__(self, train_file=None, test_file=None, output_file=None, metadata_file=None, similarity_file=None, 25 | k_neighbors=30, rank_length=10, as_binary=False, as_similar_first=True, metadata_as_binary=False, 26 | metadata_similarity_sep='\t', similarity_metric="cosine", sep='\t', output_sep='\t'): 27 | """ 28 | Item Attribute KNN for Item Recommendation 29 | 30 | This algorithm predicts a rank for each user based on the similar items that he/she consumed, 31 | using a metadata or similarity pre-computed file 32 | 33 | Usage:: 34 | 35 | >> ItemAttributeKNN(train, test, similarity_file=sim_matrix, as_similar_first=True).compute() 36 | >> ItemAttributeKNN(train, test, metadata_file=metadata, as_similar_first=True).compute() 37 | 38 | :param train_file: File which contains the train set. This file needs to have at least 3 columns 39 | (user item feedback_value). 40 | :type train_file: str 41 | 42 | :param test_file: File which contains the test set. This file needs to have at least 3 columns 43 | (user item feedback_value).
44 | :type test_file: str, default None 45 | 46 | :param output_file: File with dir to write the final predictions 47 | :type output_file: str, default None 48 | 49 | :param metadata_file: File which contains the metadata set. This file needs to have at least 2 columns 50 | (item metadata). 51 | :type metadata_file: str, default None 52 | 53 | :param similarity_file: File which contains the similarity set. This file needs to have at least 3 columns 54 | (item item similarity). 55 | :type similarity_file: str, default None 56 | 57 | :param k_neighbors: Number of neighbors to use. If None, k_neighbors = int(sqrt(n_items)) 58 | :type k_neighbors: int, default 30 59 | 60 | :param rank_length: Size of the rank that must be generated by the predictions of the recommender algorithm 61 | :type rank_length: int, default 10 62 | 63 | :param as_binary: If True, the explicit feedback will be transformed to binary 64 | :type as_binary: bool, default False 65 | 66 | :param as_similar_first: If True, for each unknown item, which will be predicted, we first look for its k 67 | most similar items and then take the intersection with the items that 68 | the user has seen. 69 | :type as_similar_first: bool, default True 70 | 71 | :param metadata_as_binary: If True, the explicit value will be transformed to binary 72 | :type metadata_as_binary: bool, default False 73 | 74 | :param metadata_similarity_sep: Delimiter for similarity or metadata file 75 | :type metadata_similarity_sep: str, default '\t' 76 | 77 | :param similarity_metric: Pairwise metric to compute the similarity between the items. Reference about 78 | distances: http://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.spatial.distance.pdist.html 79 | :type similarity_metric: str, default cosine 80 | 81 | :param sep: Delimiter for input files 82 | :type sep: str, default '\t' 83 | 84 | :param output_sep: Delimiter for output file 85 | :type output_sep: str, default '\t' 86 | """ 87 | super(ItemAttributeKNN, self).__init__(train_file=train_file, test_file=test_file, output_file=output_file, 88 | k_neighbors=k_neighbors, rank_length=rank_length, as_binary=as_binary, 89 | as_similar_first=as_similar_first, similarity_metric=similarity_metric, 90 | sep=sep, output_sep=output_sep) 91 | 92 | self.recommender_name = 'Item Attribute KNN Algorithm' 93 | 94 | self.metadata_file = metadata_file 95 | self.similarity_file = similarity_file 96 | self.metadata_as_binary = metadata_as_binary 97 | self.metadata_similarity_sep = metadata_similarity_sep 98 | 99 | def init_model(self): 100 | """ 101 | Method to fit the model. Create and calculate a similarity matrix by metadata file or a pre-computed similarity matrix.
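For illustration only, a metadata file holds one tab-separated (item, metadata) pair per line; the ids and labels below are hypothetical:

    10	action
    10	thriller
    20	comedy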
102 | 103 | 104 | """ 105 | 106 | self.similar_items = defaultdict(list) 107 | 108 | # Set the value for k 109 | if self.k_neighbors is None: 110 | self.k_neighbors = int(np.sqrt(len(self.items))) 111 | 112 | if self.metadata_file is not None: 113 | metadata = ReadFile(self.metadata_file, sep=self.metadata_similarity_sep, as_binary=self.metadata_as_binary 114 | ).read_metadata_or_similarity() 115 | 116 | self.matrix = np.zeros((len(self.items), len(metadata['col_2']))) 117 | 118 | meta_to_meta_id = {} 119 | for m, data in enumerate(metadata['col_2']): 120 | meta_to_meta_id[data] = m 121 | 122 | for item in metadata['col_1']: 123 | for m in metadata['dict'][item]: 124 | self.matrix[self.item_to_item_id[item], meta_to_meta_id[m]] = metadata['dict'][item][m] 125 | 126 | # create header info for metadata 127 | sparsity = (1 - (metadata['number_interactions'] / (len(metadata['col_1']) * len(metadata['col_2'])))) * 100 128 | 129 | self.extra_info_header = ">> metadata:: %d items and %d metadata (%d interactions) | sparsity:: %.2f%%" % \ 130 | (len(metadata['col_1']), len(metadata['col_2']), metadata['number_interactions'], 131 | sparsity) 132 | 133 | # Create similarity matrix based on metadata or similarity file. Transpose=False, because it is an 134 | # item x metadata matrix 135 | self.si_matrix = self.compute_similarity(transpose=False) 136 | 137 | elif self.similarity_file is not None: 138 | similarity = ReadFile(self.similarity_file, sep=self.metadata_similarity_sep, as_binary=False 139 | ).read_metadata_or_similarity() 140 | 141 | self.si_matrix = np.zeros((len(self.items), len(self.items))) 142 | 143 | # Fill similarity matrix 144 | for i in similarity['col_1']: 145 | for i_j in similarity['dict'][i]: 146 | self.si_matrix[self.item_to_item_id[i], self.item_to_item_id[int(i_j)]] = similarity['dict'][i][i_j] 147 | 148 | # Remove NaNs 149 | self.si_matrix[np.isnan(self.si_matrix)] = 0.0 150 | 151 | else: 152 | raise ValueError("This algorithm needs a similarity matrix or a metadata file!") 153 | 154 | # Create original matrix user x item for prediction process 155 | self.create_matrix() 156 | 157 | for i_id, item in enumerate(self.items): 158 | self.similar_items[i_id] = sorted(range(len(self.si_matrix[i_id])), 159 | key=lambda k: -self.si_matrix[i_id][k])[1:self.k_neighbors + 1] 160 | -------------------------------------------------------------------------------- /caserec/recommenders/item_recommendation/itemknn.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """" 3 | Item Based Collaborative Filtering Recommender (Item KNN) 4 | [Item Recommendation (Ranking)] 5 | 6 | Item KNN predicts a user's ranking based on similar items that he/she has accessed. 7 | 8 | """ 9 | 10 | # © 2019. Case Recommender (MIT License)
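# Scoring sketch (hypothetical items a and b): for a user who has seen items a and b,
# an unseen item i receives the score si_matrix[i][a] + si_matrix[i][b], i.e. the sum
# of its similarities to the user's seen items (restricted to the k nearest neighbors,
# depending on as_similar_first).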
11 | 12 | from collections import defaultdict 13 | import numpy as np 14 | 15 | from caserec.recommenders.item_recommendation.base_item_recommendation import BaseItemRecommendation 16 | from caserec.utils.extra_functions import timed 17 | 18 | __author__ = 'Arthur Fortes ' 19 | 20 | 21 | class ItemKNN(BaseItemRecommendation): 22 | def __init__(self, train_file=None, test_file=None, output_file=None, similarity_metric="cosine", k_neighbors=None, 23 | rank_length=10, as_binary=False, as_similar_first=True, sep='\t', output_sep='\t'): 24 | 25 | """ 26 | Item KNN for Item Recommendation 27 | 28 | This algorithm predicts a rank for each user based on the similar items that he/she consumed. 29 | 30 | Usage:: 31 | 32 | >> ItemKNN(train, test, as_similar_first=True).compute() 33 | >> ItemKNN(train, test, ranking_file, as_binary=True).compute() 34 | 35 | :param train_file: File which contains the train set. This file needs to have at least 3 columns 36 | (user item feedback_value). 37 | :type train_file: str 38 | 39 | :param test_file: File which contains the test set. This file needs to have at least 3 columns 40 | (user item feedback_value). 41 | :type test_file: str, default None 42 | 43 | :param output_file: File with dir to write the final predictions 44 | :type output_file: str, default None 45 | 46 | :param similarity_metric: Pairwise metric to compute the similarity between the items. Reference about 47 | distances: http://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.spatial.distance.pdist.html 48 | :type similarity_metric: str, default cosine 49 | 50 | :param k_neighbors: Number of neighbors to use. If None, k_neighbors = int(sqrt(n_items)) 51 | :type k_neighbors: int, default None 52 | 53 | :param rank_length: Size of the rank that must be generated by the predictions of the recommender algorithm 54 | :type rank_length: int, default 10 55 | 56 | :param as_binary: If True, the explicit feedback will be transformed to binary 57 | :type as_binary: bool, default False 58 | 59 | :param as_similar_first: If True, for each unknown item, which will be predicted, we first look for its k 60 | most similar items and then take the intersection with the items that 61 | the user has seen. 62 | :type as_similar_first: bool, default True 63 | 64 | :param sep: Delimiter for input files 65 | :type sep: str, default '\t' 66 | 67 | :param output_sep: Delimiter for output file 68 | :type output_sep: str, default '\t' 69 | 70 | """ 71 | 72 | super(ItemKNN, self).__init__(train_file=train_file, test_file=test_file, output_file=output_file, 73 | as_binary=as_binary, rank_length=rank_length, similarity_metric=similarity_metric, 74 | sep=sep, output_sep=output_sep) 75 | 76 | self.recommender_name = 'ItemKNN Algorithm' 77 | 78 | self.as_similar_first = as_similar_first 79 | self.k_neighbors = k_neighbors 80 | 81 | # internal vars 82 | self.si_matrix = None 83 | self.similar_items = None 84 | 85 | def init_model(self): 86 | """ 87 | Method to initialize the model.
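As a sketch with hypothetical values: if si_matrix[0] = [1.0, 0.3, 0.8, 0.1] and k_neighbors = 2, then similar_items[0] = [2, 1], the indices of the two items most similar to item 0 (item 0 itself is skipped).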
88 | 89 | """ 90 | self.similar_items = defaultdict(list) 91 | 92 | # Set the value for k 93 | if self.k_neighbors is None: 94 | self.k_neighbors = int(np.sqrt(len(self.items))) 95 | 96 | self.create_matrix() 97 | self.si_matrix = self.compute_similarity(transpose=True) 98 | 99 | for i_id, item in enumerate(self.items): 100 | self.similar_items[i_id] = sorted(range(len(self.si_matrix[i_id])), 101 | key=lambda k: -self.si_matrix[i_id][k])[1:self.k_neighbors + 1] 102 | 103 | def predict(self): 104 | """ 105 | This method predicts a rank for each user. 106 | 107 | """ 108 | 109 | for u_id, user in enumerate(self.users): 110 | if len(self.train_set['feedback'].get(user, [])) != 0: 111 | if self.as_similar_first: 112 | self.ranking += self.predict_similar_first_scores(user, u_id) 113 | else: 114 | self.ranking += self.predict_scores(user, u_id) 115 | 116 | else: 117 | # Implement cold start user 118 | pass 119 | 120 | def predict_scores(self, user, user_id): 121 | partial_predictions = [] 122 | # Selects items that user has not interacted with. 123 | u_list = list(np.flatnonzero(self.matrix[user_id] == 0)) 124 | seen_items_id = np.flatnonzero(self.matrix[user_id]) 125 | 126 | # predict score for item_i 127 | for i_id in u_list: 128 | sim_sum = sorted(np.take(self.si_matrix[i_id], seen_items_id), key=lambda x: -x) 129 | partial_predictions.append((user, self.items[i_id], sum(sim_sum[:self.k_neighbors]))) 130 | 131 | return sorted(partial_predictions, key=lambda x: -x[2])[:self.rank_length] 132 | 133 | def predict_similar_first_scores(self, user, user_id): 134 | """ 135 | In this implementation, for each unknown item, which will be 136 | predicted, we first look for its k most similar items and then take the intersection with the seen items of 137 | the user. Finally, the score of the unknown item will be the sum of its similarities to those neighbors, 138 | taking into account only the items that the user has seen. 139 | 140 | """ 141 | 142 | predictions = [] 143 | 144 | # Selects items that user has not interacted with. 145 | u_list = list(np.flatnonzero(self.matrix[user_id] == 0)) 146 | seen_items_id = np.flatnonzero(self.matrix[user_id]) 147 | 148 | # predict score for item_i 149 | for i_id in u_list: 150 | # s_id = list(filter(set(self.similar_items[i]).__contains__, seen_items_id)) 151 | s_id = list(set(self.similar_items[i_id]).intersection(seen_items_id)) 152 | sim_sum = np.take(self.si_matrix[i_id], s_id) 153 | predictions.append((user, self.items[i_id], sum(sim_sum))) 154 | 155 | return sorted(predictions, key=lambda x: -x[2])[:self.rank_length] 156 | 157 | def compute(self, verbose=True, metrics=None, verbose_evaluation=True, as_table=False, table_sep='\t', n_ranks=None): 158 | """ 159 | Extends compute method from BaseItemRecommendation. Method to run the recommender algorithm.
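For instance (illustrative arguments; 'PREC', 'RECALL', 'MAP' and 'NDCG' are the ranking metrics used elsewhere in the framework)::

    >> ItemKNN(train, test).compute(metrics=['PREC', 'RECALL', 'MAP', 'NDCG'], n_ranks=[1, 5, 10])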
160 | 161 | :param verbose: Print recommender and database information 162 | :type verbose: bool, default True 163 | 164 | :param metrics: List of evaluation metrics 165 | :type metrics: list, default None 166 | 167 | :param verbose_evaluation: Print the evaluation results 168 | :type verbose_evaluation: bool, default True 169 | 170 | :param as_table: Print the evaluation results as table 171 | :type as_table: bool, default False 172 | 173 | :param table_sep: Delimiter for print results (only work with verbose=True and as_table=True) 174 | :type table_sep: str, default '\t' 175 | 176 | :param n_ranks: List of positions to evaluate the ranking 177 | :type n_ranks: list, default None 178 | 179 | """ 180 | 181 | super(ItemKNN, self).compute(verbose=verbose) 182 | 183 | if verbose: 184 | print("training_time:: %4f sec" % timed(self.init_model)) 185 | if self.extra_info_header is not None: 186 | print(self.extra_info_header) 187 | print("prediction_time:: %4f sec" % timed(self.predict)) 188 | print('\n') 189 | 190 | else: 191 | self.init_model() 192 | self.predict() 193 | 194 | self.write_ranking() 195 | 196 | if self.test_file is not None: 197 | self.evaluate(metrics, verbose_evaluation, as_table=as_table, table_sep=table_sep, n_ranks=n_ranks) 198 | -------------------------------------------------------------------------------- /caserec/recommenders/item_recommendation/most_popular.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """" 3 | Most Popular Collaborative Filtering Recommender 4 | [Item Recommendation (Ranking)] 5 | 6 | Most Popular predicts a user's ranking based on the popularity of users and items. 7 | 8 | """ 9 | 10 | # © 2019. Case Recommender (MIT License) 11 | 12 | from caserec.recommenders.item_recommendation.base_item_recommendation import BaseItemRecommendation 13 | from caserec.utils.extra_functions import timed 14 | 15 | __author__ = 'Arthur Fortes ' 16 | 17 | 18 | class MostPopular(BaseItemRecommendation): 19 | def __init__(self, train_file=None, test_file=None, output_file=None, as_binary=False, rank_length=10, sep='\t', 20 | output_sep='\t'): 21 | """ 22 | Most Popular for Item Recommendation 23 | 24 | This algorithm predicts a rank for each user using the count of feedback interactions of users and items 25 | 26 | Usage:: 27 | 28 | >> MostPopular(train, test).compute() 29 | >> MostPopular(train, test, as_binary=True).compute() 30 | 31 | :param train_file: File which contains the train set. This file needs to have at least 3 columns 32 | (user item feedback_value). 33 | :type train_file: str 34 | 35 | :param test_file: File which contains the test set. This file needs to have at least 3 columns 36 | (user item feedback_value).
37 | :type test_file: str, default None 38 | 39 | :param output_file: File with dir to write the final predictions 40 | :type output_file: str, default None 41 | 42 | :param rank_length: Size of the rank that must be generated by the predictions of the recommender algorithm 43 | :type rank_length: int, default 10 44 | 45 | :param as_binary: If True, the explicit feedback will be transformed to binary 46 | :type as_binary: bool, default False 47 | 48 | :param sep: Delimiter for input files 49 | :type sep: str, default '\t' 50 | 51 | :param output_sep: Delimiter for output file 52 | :type output_sep: str, default '\t' 53 | 54 | """ 55 | 56 | super(MostPopular, self).__init__(train_file=train_file, test_file=test_file, output_file=output_file, 57 | as_binary=as_binary, rank_length=rank_length, sep=sep, output_sep=output_sep) 58 | 59 | self.recommender_name = 'Most Popular' 60 | 61 | def predict(self): 62 | """ 63 | This method predicts the final result, building a rank for each user in the train set. 64 | 65 | """ 66 | 67 | for user in set(self.users): 68 | predictions = list() 69 | 70 | for item in self.train_set['items_unobserved'].get(user, []): 71 | 72 | if self.as_binary: 73 | predictions.append((user, item, len(self.train_set['users_viewed_item'][item]))) 74 | else: 75 | count_value = 0 76 | for user_v in self.train_set['users_viewed_item'][item]: 77 | count_value += self.train_set['feedback'][user_v][item] 78 | predictions.append((user, item, count_value)) 79 | 80 | predictions = sorted(predictions, key=lambda x: -x[2]) 81 | self.ranking += predictions[:self.rank_length] 82 | 83 | def compute(self, verbose=True, metrics=None, verbose_evaluation=True, as_table=False, table_sep='\t'): 84 | """ 85 | Extends compute method from BaseItemRecommendation. Method to run the recommender algorithm 86 | 87 | :param verbose: Print recommender and database information 88 | :type verbose: bool, default True 89 | 90 | :param metrics: List of evaluation measures 91 | :type metrics: list, default None 92 | 93 | :param verbose_evaluation: Print the evaluation results 94 | :type verbose_evaluation: bool, default True 95 | 96 | :param as_table: Print the evaluation results as table 97 | :type as_table: bool, default False 98 | 99 | :param table_sep: Delimiter for print results (only work with verbose=True and as_table=True) 100 | :type table_sep: str, default '\t' 101 | 102 | """ 103 | 104 | super(MostPopular, self).compute(verbose=verbose) 105 | 106 | if verbose: 107 | print("prediction_time:: %4f sec" % timed(self.predict)) 108 | print('\n') 109 | 110 | else: 111 | self.predict() 112 | 113 | self.write_ranking() 114 | 115 | if self.test_file is not None: 116 | self.evaluate(metrics, verbose_evaluation, as_table=as_table, table_sep=table_sep) 117 | -------------------------------------------------------------------------------- /caserec/recommenders/item_recommendation/paco_recommender.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | PaCo Recommender Algorithm 4 | [Co-Clustering Algorithm] 5 | 6 | Literature: 7 | Michail Vlachos, Francesco Fusco, Charalambos Mavroforakis, Anastasios Kyrillidis, and 8 | Vassilios G. Vassiliadis: 9 | Improving Co-Cluster Quality with Application to Product Recommendations. 2014.
10 | http://dl.acm.org/citation.cfm?id=2661980 11 | 12 | """ 13 | 14 | from collections import defaultdict 15 | import numpy as np 16 | 17 | from caserec.clustering.paco import PaCo 18 | from caserec.evaluation.item_recommendation import ItemRecommendationEvaluation 19 | from caserec.utils.process_data import ReadFile 20 | 21 | __author__ = 'Arthur Fortes ' 22 | 23 | 24 | class PaCoRecommender(object): 25 | def __init__(self, train_file, test_file=None, output_file=None, k_row=None, l_col=None, 26 | density_low=0.01, as_binary=True, min_density=0.3): 27 | 28 | """ 29 | PaCo for Item Recommendation 30 | 31 | This algorithm predicts a rank for each user using a co-clustering algorithm 32 | 33 | Usage:: 34 | 35 | >> PaCoRecommender(train, test).compute() 36 | >> PaCoRecommender(train, test, as_binary=True).compute() 37 | 38 | :param train_file: File which contains the train set. This file needs to have at least 3 columns 39 | (user item feedback_value). 40 | :type train_file: str 41 | 42 | :param test_file: File which contains the test set. This file needs to have at least 3 columns 43 | (user item feedback_value). 44 | :type test_file: str, default None 45 | 46 | :param output_file: File with dir to write the final predictions 47 | :type output_file: str, default None 48 | 49 | :param k_row: Number of clusters generated by k-means in rows 50 | :type k_row: int, default None 51 | 52 | :param l_col: Number of clusters generated by k-means in columns 53 | :type l_col: int, default None 54 | 55 | :param density_low: Threshold to change the density matrix values 56 | :type density_low: float, default 0.01 57 | 58 | :param as_binary: If True, the explicit feedback will be transformed to binary 59 | :type as_binary: bool, default True 60 | 61 | :param min_density: Considers bi-clusters down to this minimum density 62 | :type min_density: float, default 0.3 63 | 64 | """ 65 | self.recommender_name = 'PaCo Recommender Algorithm' 66 | 67 | self.train_file = train_file 68 | self.test_file = test_file 69 | if test_file is not None: 70 | self.test_set = ReadFile(test_file).read() 71 | self.train_set = ReadFile(train_file, as_binary=as_binary).read() 72 | self.output_file = output_file 73 | self.k_row = k_row 74 | self.l_col = l_col 75 | self.density_low = density_low 76 | self.min_density = min_density 77 | 78 | self.users = self.train_set['users'] 79 | self.items = self.train_set['items'] 80 | 81 | self.item_to_item_id = {} 82 | self.item_id_to_item = {} 83 | self.user_to_user_id = {} 84 | self.user_id_to_user = {} 85 | 86 | for i, item in enumerate(self.items): 87 | self.item_to_item_id.update({item: i}) 88 | self.item_id_to_item.update({i: item}) 89 | for u, user in enumerate(self.users): 90 | self.user_to_user_id.update({user: u}) 91 | self.user_id_to_user.update({u: user}) 92 | 93 | self.predictions = [] 94 | self.uns_items = defaultdict() 95 | self.co_clustering = None 96 | 97 | def run_co_clustering(self): 98 | self.co_clustering = PaCo(self.train_file, k_row=self.k_row, l_col=self.l_col, density_low=self.density_low) 99 | self.co_clustering.fit() 100 | if len(self.co_clustering.density) == 1: 101 | raise ValueError('Error: Co-clustering generated only 1 bi-cluster!') 102 | 103 | def recommender(self): 104 | for n, k in enumerate(self.co_clustering.list_row): 105 | cols = self.co_clustering.density[n].argsort() 106 | cols = np.array(cols).ravel()[::-1] 107 | 108 | for u_idx in k: 109 | user = self.user_id_to_user[u_idx] 110 | unseen_items = set() 111 | for l in cols: 112 | if self.co_clustering.density[n,
l] != 0 and self.co_clustering.density[n, l] != 1 and \ 113 | self.co_clustering.density[n, l] >= self.min_density: 114 | for i_idx in self.co_clustering.list_col[l]: 115 | item = self.item_id_to_item[i_idx] 116 | if self.train_set['feedback'][user].get(item, -1) == -1: 117 | unseen_items.add(item) 118 | 119 | self.uns_items[user] = unseen_items 120 | 121 | for user in self.train_set['users']: 122 | ranking = [] 123 | for item in self.uns_items[user]: 124 | rui = len(self.train_set['users_viewed_item'][item]) 125 | ranking.append((user, item, rui)) 126 | self.predictions += sorted(ranking, key=lambda x: -x[2])[:10] 127 | 128 | if self.output_file is not None: 129 | with open(self.output_file, 'w') as fw: 130 | for sample in self.predictions: 131 | fw.write("%d\t%d\t%f\n" % (sample[0], sample[1], sample[2])) 132 | 133 | def compute(self, verbose=True, metrics=list(['PREC', 'RECALL', 'MAP', 'NDCG']), verbose_evaluation=True, 134 | as_table=False, table_sep='\t'): 135 | 136 | if verbose: 137 | print("[Case Recommender: Item Recommendation > %s]\n" % self.recommender_name) 138 | 139 | self.run_co_clustering() 140 | self.recommender() 141 | 142 | if self.test_file is not None: 143 | ItemRecommendationEvaluation(metrics=metrics, as_table=as_table, table_sep=table_sep, 144 | verbose=verbose_evaluation).evaluate_recommender(self.predictions, 145 | self.test_set) 146 | -------------------------------------------------------------------------------- /caserec/recommenders/item_recommendation/random_rec.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """" 3 | Random Collaborative Filtering Recommender 4 | [Item Recommendation (Ranking)] 5 | 6 | Random predicts a user's ranking based on random scores. 7 | 8 | """ 9 | 10 | # © 2019. Case Recommender (MIT License) 11 | 12 | import random 13 | 14 | from caserec.recommenders.item_recommendation.base_item_recommendation import BaseItemRecommendation 15 | from caserec.utils.extra_functions import timed 16 | 17 | __author__ = 'Arthur Fortes ' 18 | 19 | 20 | class RandomRec(BaseItemRecommendation): 21 | def __init__(self, train_file=None, test_file=None, output_file=None, rank_length=10, sep='\t', output_sep='\t'): 22 | """ 23 | Random Recommender for Item Recommendation 24 | 25 | This algorithm predicts a random rank for each user 26 | 27 | Usage:: 28 | 29 | >> RandomRec(train).compute() 30 | >> RandomRec(train, test, ranking).compute() 31 | 32 | :param train_file: File which contains the train set. This file needs to have at least 3 columns 33 | (user item feedback_value). 34 | :type train_file: str 35 | 36 | :param test_file: File which contains the test set. This file needs to have at least 3 columns 37 | (user item feedback_value).
38 | :type test_file: str, default None 39 | 40 | :param output_file: File with dir to write the final predictions 41 | :type output_file: str, default None 42 | 43 | :param rank_length: Size of the rank that must be generated by the predictions of the recommender algorithm 44 | :type rank_length: int, default 10 45 | 46 | :param sep: Delimiter for input files 47 | :type sep: str, default '\t' 48 | 49 | :param output_sep: Delimiter for output file 50 | :type output_sep: str, default '\t' 51 | 52 | """ 53 | 54 | super(RandomRec, self).__init__(train_file=train_file, test_file=test_file, output_file=output_file, 55 | rank_length=rank_length, sep=sep, output_sep=output_sep) 56 | 57 | self.recommender_name = 'Random Recommender' 58 | 59 | def predict(self): 60 | """ 61 | Method to predict a rank for each user. 62 | 63 | For each (user, item) pair outside the train set, predict a random score. 64 | 65 | """ 66 | 67 | for user in set(self.users): 68 | predictions = list() 69 | for item in self.train_set['items_unobserved'].get(user, []): 70 | predictions.append((user, item, random.uniform(0, 1))) 71 | predictions = sorted(predictions, key=lambda x: -x[2]) 72 | self.ranking += predictions[:self.rank_length] 73 | 74 | def compute(self, verbose=True, metrics=None, verbose_evaluation=True, as_table=False, table_sep='\t'): 75 | """ 76 | Extends compute method from BaseItemRecommendation. Method to run the recommender algorithm 77 | 78 | :param verbose: Print recommender and database information 79 | :type verbose: bool, default True 80 | 81 | :param metrics: List of evaluation metrics 82 | :type metrics: list, default None 83 | 84 | :param verbose_evaluation: Print the evaluation results 85 | :type verbose_evaluation: bool, default True 86 | 87 | :param as_table: Print the evaluation results as table 88 | :type as_table: bool, default False 89 | 90 | :param table_sep: Delimiter for print results (only work with verbose=True and as_table=True) 91 | :type table_sep: str, default '\t' 92 | 93 | """ 94 | 95 | super(RandomRec, self).compute(verbose=verbose) 96 | 97 | if verbose: 98 | print("prediction_time:: %4f sec" % timed(self.predict)) 99 | print('\n') 100 | 101 | else: 102 | self.predict() 103 | 104 | self.write_ranking() 105 | 106 | if self.test_file is not None: 107 | self.evaluate(metrics, verbose_evaluation, as_table=as_table, table_sep=table_sep) 108 | -------------------------------------------------------------------------------- /caserec/recommenders/item_recommendation/user_attribute_knn.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """" 3 | User Based Collaborative Filtering Recommender with Attributes (User Attribute KNN) 4 | [Item Recommendation (Ranking)] 5 | 6 | User-Attribute-kNN predicts a user's ranking according to how similar users rated the same item. The algorithm 7 | matches similar users based on the similarity of their attribute scores. However, instead of traditional UserKNN, 8 | this approach uses a pre-computed similarity matrix based on metadata. 9 | 10 | 11 | """ 12 | 13 | # © 2019. Case Recommender (MIT License)
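# Illustrative metadata_file layout for this recommender (tab-separated; the user
# ids and attribute labels below are hypothetical):
#     1	age_25
#     1	gender_male
#     2	age_40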
14 | 15 | import numpy as np 16 | 17 | from caserec.recommenders.item_recommendation.userknn import UserKNN 18 | from caserec.utils.process_data import ReadFile 19 | 20 | __author__ = 'Arthur Fortes ' 21 | 22 | 23 | class UserAttributeKNN(UserKNN): 24 | def __init__(self, train_file=None, test_file=None, output_file=None, metadata_file=None, similarity_file=None, 25 | k_neighbors=30, rank_length=10, as_binary=False, as_similar_first=True, metadata_as_binary=False, 26 | metadata_similarity_sep='\t', similarity_metric="cosine", sep='\t', output_sep='\t'): 27 | """ 28 | User Attribute KNN for Item Recommendation 29 | 30 | This algorithm predicts a rank for each user based on the similar items that his neighbors 31 | (similar users) consumed, using a metadata or similarity pre-computed file 32 | 33 | Usage:: 34 | 35 | >> UserAttributeKNN(train, test, similarity_file=sim_matrix, as_similar_first=True).compute() 36 | >> UserAttributeKNN(train, test, metadata_file=metadata, as_similar_first=True).compute() 37 | 38 | :param train_file: File which contains the train set. This file needs to have at least 3 columns 39 | (user item feedback_value). 40 | :type train_file: str 41 | 42 | :param test_file: File which contains the test set. This file needs to have at least 3 columns 43 | (user item feedback_value). 44 | :type test_file: str, default None 45 | 46 | :param output_file: File with dir to write the final predictions 47 | :type output_file: str, default None 48 | 49 | :param metadata_file: File which contains the metadata set. This file needs to have at least 2 columns 50 | (user metadata). 51 | :type metadata_file: str, default None 52 | 53 | :param similarity_file: File which contains the similarity set. This file needs to have at least 3 columns 54 | (user user similarity). 55 | :type similarity_file: str, default None 56 | 57 | :param k_neighbors: Number of neighbors to use. If None, k_neighbors = int(sqrt(n_users)) 58 | :type k_neighbors: int, default 30 59 | 60 | :param rank_length: Size of the rank that must be generated by the predictions of the recommender algorithm 61 | :type rank_length: int, default 10 62 | 63 | :param as_binary: If True, the explicit feedback will be transformed to binary 64 | :type as_binary: bool, default False 65 | 66 | :param as_similar_first: If True, for each unknown item, which will be predicted, we first look for its k 67 | most similar users and then take the intersection with the users that 68 | have seen that item.
69 | :type as_similar_first: bool, default True 70 | 71 | :param metadata_as_binary: If True, the explicit value will be transformed to binary 72 | :type metadata_as_binary: bool, default False 73 | 74 | :param metadata_similarity_sep: Delimiter for similarity or metadata file 75 | :type metadata_similarity_sep: str, default '\t' 76 | 77 | :param similarity_metric: Pairwise metric to compute the similarity between the users 78 | :type similarity_metric: str, default cosine 79 | 80 | :param sep: Delimiter for input files 81 | :type sep: str, default '\t' 82 | 83 | :param output_sep: Delimiter for output file 84 | :type output_sep: str, default '\t' 85 | """ 86 | super(UserAttributeKNN, self).__init__(train_file=train_file, test_file=test_file, output_file=output_file, 87 | k_neighbors=k_neighbors, rank_length=rank_length, as_binary=as_binary, 88 | as_similar_first=as_similar_first, similarity_metric=similarity_metric, 89 | sep=sep, output_sep=output_sep) 90 | 91 | self.recommender_name = 'User Attribute KNN Algorithm' 92 | 93 | self.metadata_file = metadata_file 94 | self.similarity_file = similarity_file 95 | self.metadata_as_binary = metadata_as_binary 96 | self.metadata_similarity_sep = metadata_similarity_sep 97 | 98 | def init_model(self): 99 | """ 100 | Method to fit the model. Create and calculate a similarity matrix by metadata file or a pre-computed similarity 101 | matrix 102 | 103 | """ 104 | 105 | self.users_id_viewed_item = {} 106 | 107 | for item in self.items: 108 | for user in self.train_set['users_viewed_item'].get(item, []): 109 | self.users_id_viewed_item.setdefault(item, []).append(self.user_to_user_id[user]) 110 | 111 | # Set the value for k 112 | if self.k_neighbors is None: 113 | self.k_neighbors = int(np.sqrt(len(self.users))) 114 | 115 | if self.metadata_file is not None: 116 | metadata = ReadFile(self.metadata_file, sep=self.metadata_similarity_sep, as_binary=self.metadata_as_binary 117 | ).read_metadata_or_similarity() 118 | 119 | self.matrix = np.zeros((len(self.users), len(metadata['col_2']))) 120 | 121 | meta_to_meta_id = {} 122 | 123 | for m, data in enumerate(metadata['col_2']): 124 | meta_to_meta_id[data] = m 125 | 126 | for user_m in metadata['col_1']: 127 | for m1 in metadata['dict'][user_m]: 128 | try: 129 | self.matrix[self.user_to_user_id[user_m], meta_to_meta_id[m1]] = metadata['dict'][user_m][m1] 130 | except KeyError: 131 | pass 132 | 133 | # create header info for metadata 134 | sparsity = (1 - (metadata['number_interactions'] / (len(metadata['col_1']) * len(metadata['col_2'])))) * 100 135 | 136 | self.extra_info_header = ">> metadata:: %d users and %d metadata (%d interactions) | sparsity:: %.2f%%" % \ 137 | (len(metadata['col_1']), len(metadata['col_2']), metadata['number_interactions'], 138 | sparsity) 139 | 140 | # Create similarity matrix based on metadata or similarity file 141 | self.su_matrix = self.compute_similarity(transpose=False) 142 | 143 | elif self.similarity_file is not None: 144 | similarity = ReadFile(self.similarity_file, sep=self.metadata_similarity_sep, as_binary=False 145 | ).read_metadata_or_similarity() 146 | 147 | self.su_matrix = np.zeros((len(self.users), len(self.users))) 148 | 149 | # Fill similarity matrix 150 | for u in similarity['col_1']: 151 | for u_j in similarity['dict'][u]: 152 | self.su_matrix[self.user_to_user_id[u], self.user_to_user_id[int(u_j)]] = similarity['dict'][u][u_j] 153 | 154 | # Remove NaNs 155 | self.su_matrix[np.isnan(self.su_matrix)] = 0.0 156 | 157 | else: 158 | raise ValueError("This algorithm needs a similarity matrix or a metadata file!") 159 | 160
| # Create original matrix user x item for prediction process 161 | self.create_matrix() 162 | -------------------------------------------------------------------------------- /caserec/recommenders/item_recommendation/userknn.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """" 3 | User Based Collaborative Filtering Recommender (User KNN) 4 | [Item Recommendation (Ranking)] 5 | 6 | User KNN predicts a user's ranking based on similar users' behavior. 7 | 8 | """ 9 | 10 | # © 2019. Case Recommender (MIT License) 11 | 12 | import numpy as np 13 | 14 | from caserec.recommenders.item_recommendation.base_item_recommendation import BaseItemRecommendation 15 | from caserec.utils.extra_functions import timed 16 | 17 | __author__ = 'Arthur Fortes ' 18 | 19 | 20 | class UserKNN(BaseItemRecommendation): 21 | def __init__(self, train_file=None, test_file=None, output_file=None, similarity_metric="cosine", k_neighbors=None, 22 | rank_length=10, as_binary=False, as_similar_first=True, sep='\t', output_sep='\t'): 23 | """ 24 | User KNN for Item Recommendation 25 | 26 | This algorithm predicts a rank for each user based on the similar items that his neighbors 27 | (similar users) consumed. 28 | 29 | Usage:: 30 | 31 | >> UserKNN(train, test, as_similar_first=True).compute() 32 | >> UserKNN(train, test, ranking_file, as_binary=True).compute() 33 | 34 | :param train_file: File which contains the train set. This file needs to have at least 3 columns 35 | (user item feedback_value). 36 | :type train_file: str 37 | 38 | :param test_file: File which contains the test set. This file needs to have at least 3 columns 39 | (user item feedback_value). 40 | :type test_file: str, default None 41 | 42 | :param output_file: File with dir to write the final predictions 43 | :type output_file: str, default None 44 | 45 | :param similarity_metric: Pairwise metric to compute the similarity between the users. Reference about 46 | distances: http://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.spatial.distance.pdist.html 47 | :type similarity_metric: str, default cosine 48 | 49 | :param k_neighbors: Number of neighbors to use. If None, k_neighbors = int(sqrt(n_users)) 50 | :type k_neighbors: int, default None 51 | 52 | :param rank_length: Size of the rank that must be generated by the predictions of the recommender algorithm 53 | :type rank_length: int, default 10 54 | 55 | :param as_binary: If True, the explicit feedback will be transformed to binary 56 | :type as_binary: bool, default False 57 | 58 | :param as_similar_first: If True, for each unknown item, which will be predicted, we first look for its k 59 | most similar users and then take the intersection with the users that 60 | have seen that item.
61 | :type as_similar_first: bool, default True 62 | 63 | :param sep: Delimiter for input files 64 | :type sep: str, default '\t' 65 | 66 | :param output_sep: Delimiter for output file 67 | :type output_sep: str, default '\t' 68 | 69 | """ 70 | 71 | super(UserKNN, self).__init__(train_file=train_file, test_file=test_file, output_file=output_file, 72 | as_binary=as_binary, rank_length=rank_length, similarity_metric=similarity_metric, 73 | sep=sep, output_sep=output_sep) 74 | 75 | self.recommender_name = 'UserKNN Algorithm' 76 | 77 | self.as_similar_first = as_similar_first 78 | self.k_neighbors = k_neighbors 79 | 80 | # internal vars 81 | self.su_matrix = None 82 | self.users_id_viewed_item = None 83 | 84 | def init_model(self): 85 | """ 86 | Method to initialize the model. Create and calculate a similarity matrix 87 | 88 | """ 89 | self.users_id_viewed_item = {} 90 | 91 | self.create_matrix() 92 | self.su_matrix = self.compute_similarity(transpose=False) 93 | 94 | # Set the value for k 95 | if self.k_neighbors is None: 96 | self.k_neighbors = int(np.sqrt(len(self.users))) 97 | 98 | for item in self.items: 99 | for user in self.train_set['users_viewed_item'].get(item, []): 100 | self.users_id_viewed_item.setdefault(item, []).append(self.user_to_user_id[user]) 101 | 102 | def predict(self): 103 | """ 104 | Method to predict a rank for each user. 105 | 106 | """ 107 | 108 | for u_id, user in enumerate(self.users): 109 | if len(self.train_set['feedback'].get(user, [])) != 0: 110 | u_list = list(np.flatnonzero(self.matrix[u_id] == 0)) 111 | 112 | if self.as_similar_first: 113 | self.ranking += self.predict_similar_first_scores(user, u_id, u_list) 114 | else: 115 | self.ranking += self.predict_scores(user, u_id, u_list) 116 | else: 117 | # Implement cold start user 118 | pass 119 | 120 | def predict_scores(self, user, user_id, unpredicted_items): 121 | """ 122 | Method to predict a rank for each user. In this implementation, for each unknown item, 123 | which will be predicted, we first look for users that have seen that item and calculate the similarity between 124 | them and the user. Then we sort these similarities and keep the k largest. Finally, the score of the 125 | unknown item will be the sum of the similarities. 126 | 127 | """ 128 | 129 | predictions = [] 130 | for item_id in unpredicted_items: 131 | item = self.items[item_id] 132 | sim_sum = [] 133 | for user_v in self.users_id_viewed_item.get(item, []): 134 | sim_sum.append(self.su_matrix[user_id, user_v]) 135 | sim_sum = sorted(sim_sum, reverse=True) 136 | 137 | predictions.append((user, item, sum(sim_sum[:self.k_neighbors]))) 138 | 139 | return sorted(predictions, key=lambda x: -x[2])[:self.rank_length] 140 | 141 | def predict_similar_first_scores(self, user, user_id, unpredicted_items): 142 | """ 143 | Method to predict a rank for each user. In this implementation, for each unknown item, which will be 144 | predicted, we first look for its k most similar users and then take the intersection with the users that 145 | have seen that item. Finally, the score of the unknown item will be the sum of the similarities. 146 | 147 | """ 148 | 149 | predictions = [] 150 | 151 | # Select user neighbors by sorting the user similarity vector, returning a list of indices in decreasing order of similarity 152 | neighbors = sorted(range(len(self.su_matrix[user_id])), key=lambda m: -self.su_matrix[user_id][m]) 153 | 154 | for item_id in unpredicted_items: 155 | item = self.items[item_id] 156 | # Intersection between
the neighbors closest to the user and the users who accessed the unknown item. 157 | common_users = list(set(self.users_id_viewed_item.get(item, [])). 158 | intersection(neighbors[1:self.k_neighbors])) 159 | 160 | sim_sum = 0 161 | for user_v in common_users: 162 | sim_sum += self.su_matrix[user_id, user_v] 163 | 164 | predictions.append((user, item, sim_sum)) 165 | 166 | return sorted(predictions, key=lambda x: -x[2])[:self.rank_length] 167 | 168 | def compute(self, verbose=True, metrics=None, verbose_evaluation=True, 169 | as_table=False, table_sep='\t', n_ranks=None): 170 | """ 171 | Extends compute method from BaseItemRecommendation. Method to run recommender algorithm 172 | 173 | :param verbose: Print recommender and database information 174 | :type verbose: bool, default True 175 | 176 | :param metrics: List of evaluation metrics 177 | :type metrics: list, default None 178 | 179 | :param verbose_evaluation: Print the evaluation results 180 | :type verbose_evaluation: bool, default True 181 | 182 | :param as_table: Print the evaluation results as table 183 | :type as_table: bool, default False 184 | 185 | :param table_sep: Delimiter for print results (only work with verbose=True and as_table=True) 186 | :type table_sep: str, default '\t' 187 | 188 | """ 189 | 190 | super(UserKNN, self).compute(verbose=verbose) 191 | 192 | if verbose: 193 | print("training_time:: %4f sec" % timed(self.init_model)) 194 | if self.extra_info_header is not None: 195 | print(self.extra_info_header) 196 | print("prediction_time:: %4f sec" % timed(self.predict)) 197 | 198 | print('\n') 199 | 200 | else: 201 | # Execute all in silence without prints 202 | self.extra_info_header = None 203 | self.init_model() 204 | self.predict() 205 | 206 | self.write_ranking() 207 | 208 | if self.test_file is not None: 209 | self.evaluate(metrics, verbose_evaluation, as_table=as_table, table_sep=table_sep, n_ranks=n_ranks) 210 | -------------------------------------------------------------------------------- /caserec/recommenders/rating_prediction/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Arthur' 2 | -------------------------------------------------------------------------------- /caserec/recommenders/rating_prediction/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/caserec/CaseRecommender/779bc5dd91ff704ce60c0f3fafd07e2eba689dd7/caserec/recommenders/rating_prediction/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /caserec/recommenders/rating_prediction/__pycache__/base_rating_prediction.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/caserec/CaseRecommender/779bc5dd91ff704ce60c0f3fafd07e2eba689dd7/caserec/recommenders/rating_prediction/__pycache__/base_rating_prediction.cpython-37.pyc -------------------------------------------------------------------------------- /caserec/recommenders/rating_prediction/__pycache__/nnmf.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/caserec/CaseRecommender/779bc5dd91ff704ce60c0f3fafd07e2eba689dd7/caserec/recommenders/rating_prediction/__pycache__/nnmf.cpython-37.pyc -------------------------------------------------------------------------------- 
/caserec/recommenders/rating_prediction/base_knn.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """ 3 | This file is base for neighborhood-based algorithms 4 | 5 | Used by: ItemKNN, Item Attribute KNN, UserKNN and User Attribute KNN 6 | 7 | """ 8 | 9 | # © 2019. Case Recommender (MIT License) 10 | 11 | from caserec.recommenders.rating_prediction.base_rating_prediction import BaseRatingPrediction 12 | 13 | __author__ = 'Arthur Fortes ' 14 | 15 | 16 | class BaseKNN(BaseRatingPrediction): 17 | def __init__(self, train_file, test_file, output_file=None, reg_bi=10, reg_bu=15, similarity_metric='cosine', 18 | sep='\t', output_sep='\t'): 19 | """ 20 | This class is base for all neighborhood-based algorithms. 21 | 22 | :param train_file: File which contains the train set. This file needs to have at least 3 columns 23 | (user item feedback_value). 24 | :type train_file: str 25 | 26 | :param test_file: File which contains the test set. This file needs to have at least 3 columns 27 | (user item feedback_value). 28 | :type test_file: str, default None 29 | 30 | :param output_file: File with dir to write the final predictions 31 | :type output_file: str, default None 32 | 33 | :param reg_bi: Regularization factor for items 34 | :type reg_bi: int, default 10 35 | 36 | :param reg_bu: Regularization factor for users 37 | :type reg_bu: int, default 15 38 | 39 | :param similarity_metric: Pairwise metric to compute the similarity between users or items 40 | :type similarity_metric: str, default cosine 41 | 42 | :param sep: Delimiter for input files 43 | :type sep: str, default '\t' 44 | 45 | :param output_sep: Delimiter for output file 46 | :type output_sep: str, default '\t' 47 | 48 | """ 49 | super(BaseKNN, self).__init__(train_file=train_file, test_file=test_file, output_file=output_file, 50 | similarity_metric=similarity_metric, sep=sep, output_sep=output_sep) 51 | 52 | self.reg_bi = reg_bi 53 | self.reg_bu = reg_bu 54 | 55 | # internal vars 56 | self.number_users = None 57 | self.number_items = None 58 | self.bu = {} 59 | self.bi = {} 60 | self.bui = {} 61 | 62 | def init_model(self): 63 | """ 64 | Method to treat and initialize the model.
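As a sketch with illustrative numbers for the baselines trained later by train_baselines: bui = mean_value + bu + bi, so with a global mean of 3.5, bu[user] = 0.2 and bi[item] = -0.1, the baseline prediction is bui[user][item] = 3.6.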
Create a matrix user x item 65 | 66 | """ 67 | 68 | self.number_users = len(self.users) 69 | self.number_items = len(self.items) 70 | 71 | self.create_matrix() 72 | 73 | def train_baselines(self): 74 | """ 75 | Method to train baselines for each pair user, item 76 | 77 | """ 78 | 79 | self.bu = {} 80 | self.bi = {} 81 | self.bui = {} 82 | 83 | for i in range(10): 84 | self.compute_bi() 85 | self.compute_bu() 86 | self.compute_bui() 87 | 88 | def compute_bi(self): 89 | """ 90 | Method to compute bi values 91 | 92 | bi = (rui - mi - bu) / (regBi + number of interactions) 93 | 94 | """ 95 | 96 | self.bi = dict() 97 | 98 | for item in self.items: 99 | count = 0 100 | 101 | for user in self.train_set['users_viewed_item'].get(item, []): 102 | self.bi[item] = self.bi.get(item, 0) + float(self.train_set['feedback'][user].get(item, 0)) - \ 103 | self.train_set['mean_value'] - self.bu.get(user, 0) 104 | count += 1 105 | 106 | if count > 1: 107 | self.bi[item] = float(self.bi[item]) / float(self.reg_bi + count) 108 | elif count == 0: 109 | self.bi[item] = self.train_set['mean_value'] 110 | 111 | def compute_bu(self): 112 | """ 113 | Method to compute bu values 114 | 115 | bu = (rui - mi - bi) / (regBu + number of interactions) 116 | 117 | """ 118 | 119 | self.bu = dict() 120 | for user in self.users: 121 | count = 0 122 | 123 | for item in self.train_set['items_seen_by_user'].get(user, []): 124 | self.bu[user] = self.bu.get(user, 0) + float(self.train_set['feedback'][user].get(item, 0)) - \ 125 | self.train_set['mean_value'] - self.bi.get(item, 0) 126 | count += 1 127 | 128 | if count > 1: 129 | self.bu[user] = float(self.bu[user]) / float(self.reg_bu + count) 130 | elif count == 0: 131 | self.bu[user] = self.train_set['mean_value'] 132 | 133 | def compute_bui(self): 134 | """ 135 | Method to compute bui values 136 | 137 | bui = mi + bu + bi 138 | """ 139 | 140 | for user in self.users: 141 | for item in self.items: 142 | self.bui.setdefault(user, {}).update( 143 | {item: self.train_set['mean_value'] + self.bu.get(user, 0) + self.bi.get(item, 0)}) 144 | 145 | del self.bu 146 | del self.bi 147 | -------------------------------------------------------------------------------- /caserec/recommenders/rating_prediction/base_nsvd1.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """ 3 | This class is base for NSVD1 algorithms. 4 | 5 | Used by: ItemNSVD1, and UserNSVD1 6 | 7 | Literature: 8 | István Pilászy and Domonkos Tikk: 9 | Recommending new movies: even a few ratings are more valuable than metadata 10 | RecSys 2009 11 | https://dl.acm.org/citation.cfm?id=1639731 12 | 13 | """ 14 | 15 | # © 2019. Case Recommender (MIT License) 16 | 17 | import numpy as np 18 | 19 | from caserec.recommenders.rating_prediction.base_rating_prediction import BaseRatingPrediction 20 | 21 | __author__ = 'Arthur Fortes ' 22 | 23 | 24 | class BaseNSVD1(BaseRatingPrediction): 25 | def __init__(self, train_file, test_file, output_file=None, factors=10, init_mean=0, init_stdev=0.1, 26 | sep='\t', output_sep='\t', random_seed=None): 27 | """ 28 | This class is base for all NSVD1 algorithms. 29 | 30 | :param train_file: File which contains the train set. This file needs to have at least 3 columns 31 | (user item feedback_value). 32 | :type train_file: str 33 | 34 | :param test_file: File which contains the test set. This file needs to have at least 3 columns 35 | (user item feedback_value). 
36 | :type test_file: str, default None 37 | 38 | :param output_file: File with dir to write the final predictions 39 | :type output_file: str, default None 40 | 41 | :param factors: Number of latent factors per user/item 42 | :type factors: int, default 10 43 | 44 | :param init_mean: Mean of the normal distribution used to initialize the latent factors 45 | :type init_mean: float, default 0 46 | 47 | :param init_stdev: Standard deviation of the normal distribution used to initialize the latent factors 48 | :type init_stdev: float, default 0.1 49 | 50 | :param sep: Delimiter for input files 51 | :type sep: str, default '\t' 52 | 53 | :param output_sep: Delimiter for output file 54 | :type output_sep: str, default '\t' 55 | 56 | :param random_seed: Number of seed. Lock random numbers for reproducibility of experiments. 57 | :type random_seed: int, default None 58 | 59 | """ 60 | super(BaseNSVD1, self).__init__(train_file=train_file, test_file=test_file, output_file=output_file, sep=sep, 61 | output_sep=output_sep) 62 | 63 | self.factors = factors 64 | self.init_mean = init_mean 65 | self.init_stdev = init_stdev 66 | 67 | if random_seed is not None: 68 | np.random.seed(random_seed) 69 | 70 | # internal vars 71 | self.number_users = len(self.users) 72 | self.number_items = len(self.items) 73 | self.item_to_item_id = {} 74 | self.item_id_to_item = {} 75 | self.user_to_user_id = {} 76 | self.user_id_to_user = {} 77 | self.x = None 78 | self.p = None 79 | self.q = None 80 | self.w = None 81 | self.b = None 82 | self.c = None 83 | self.metadata = None 84 | self.number_metadata = None 85 | 86 | self.last_rmse = 0 87 | self.predictions = [] 88 | 89 | def init_model(self): 90 | """ 91 | Method to treat and initialize the model 92 | 93 | """ 94 | 95 | # Map items and users with their respective ids and update unobserved items with test set samples 96 | for i, item in enumerate(self.items): 97 | self.item_to_item_id.update({item: i}) 98 | self.item_id_to_item.update({i: item}) 99 | for u, user in enumerate(self.users): 100 | self.user_to_user_id.update({user: u}) 101 | self.user_id_to_user.update({u: user}) 102 | 103 | def create_factors(self): 104 | self.b = np.random.normal(self.init_mean, self.init_stdev, self.number_users) 105 | self.c = np.random.normal(self.init_mean, self.init_stdev, self.number_items) 106 | self.p = np.random.normal(self.init_mean, self.init_stdev, (self.number_users, self.factors)) 107 | self.q = np.random.normal(self.init_mean, self.init_stdev, (self.number_items, self.factors)) 108 | self.w = np.random.normal(self.init_mean, self.init_stdev, (self.number_metadata, self.factors)) 109 | 110 | def _predict(self, user, item, cond=True): 111 | rui = self.b[user] + self.c[item] + np.dot(self.p[user], self.q[item]) 112 | 113 | if cond: 114 | if rui > self.train_set["max_value"]: 115 | rui = self.train_set["max_value"] 116 | if rui < self.train_set["min_value"]: 117 | rui = self.train_set["min_value"] 118 | 119 | return rui 120 | 121 | def predict(self): 122 | """ 123 | This method computes a final rating for unknown pairs (user, item) 124 | 125 | """ 126 | 127 | if self.test_file is not None: 128 | for user in self.test_set['users']: 129 | for item in self.test_set['feedback'][user]: 130 | rui = self._predict(self.user_to_user_id[user], self.item_to_item_id[item]) 131 | self.predictions.append((user, item, rui)) 132 | else: 133 | raise NotImplementedError 134 | --------------------------------------------------------------------------------
/caserec/recommenders/rating_prediction/base_rating_prediction.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """" 3 | This class is base for rating prediction algorithms. 4 | 5 | """ 6 | 7 | # © 2019. Case Recommender (MIT License) 8 | 9 | from scipy.spatial.distance import squareform, pdist 10 | import numpy as np 11 | 12 | 13 | from caserec.evaluation.rating_prediction import RatingPredictionEvaluation 14 | from caserec.utils.extra_functions import print_header 15 | from caserec.utils.process_data import ReadFile, WriteFile 16 | 17 | __author__ = 'Arthur Fortes ' 18 | 19 | 20 | class BaseRatingPrediction(object): 21 | def __init__(self, train_file, test_file, output_file=None, similarity_metric='cosine', sep='\t', 22 | output_sep='\t'): 23 | """ 24 | This class is base for all rating prediction algorithms. It implements and adds 25 | common methods and attributes shared by rating prediction approaches. 26 | 27 | :param train_file: File which contains the train set. This file needs to have at least 3 columns 28 | (user item feedback_value). 29 | :type train_file: str 30 | 31 | :param test_file: File which contains the test set. This file needs to have at least 3 columns 32 | (user item feedback_value). 33 | :type test_file: str, default None 34 | 35 | :param output_file: File with dir to write the final predictions 36 | :type output_file: str, default None 37 | 38 | :param similarity_metric: Pairwise metric to compute the similarity between users or items 39 | :type similarity_metric: str, default cosine 40 | 41 | :param sep: Delimiter for input files 42 | :type sep: str, default '\t' 43 | 44 | :param output_sep: Delimiter for output file 45 | :type output_sep: str, default '\t' 46 | 47 | """ 48 | 49 | self.train_file = train_file 50 | self.test_file = test_file 51 | self.similarity_metric = similarity_metric 52 | self.output_file = output_file 53 | self.sep = sep 54 | self.output_sep = output_sep 55 | 56 | # internal vars 57 | self.item_to_item_id = {} 58 | self.item_id_to_item = {} 59 | self.user_to_user_id = {} 60 | self.user_id_to_user = {} 61 | self.train_set = None 62 | self.test_set = None 63 | self.users = None 64 | self.items = None 65 | self.matrix = None 66 | self.evaluation_results = None 67 | self.recommender_name = None 68 | self.extra_info_header = None 69 | self.predictions = [] 70 | 71 | def read_files(self): 72 | """ 73 | Method to initialize recommender algorithm.
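The input files are plain text with one (user, item, feedback_value) triple per line, tab-separated by default; the values below are illustrative:

    1	10	4.0
    1	20	3.5
    2	10	5.0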
74 | 75 | """ 76 | 77 | # Getting train_set as a dict_file = {'feedback': dict_feedback, 'users': list_users, 'items': list_items, 78 | # 'sparsity': sparsity, 'number_interactions': number_interactions, 'users_viewed_item': users_viewed_item, 'items_unobserved': items_unobserved, 79 | # 'items_seen_by_user': items_seen_by_user, 'mean_value': mean_value, 'max_value': max(list_feedback), 'min_value': min(list_feedback)} 80 | self.train_set = ReadFile(self.train_file, sep=self.sep).read() 81 | 82 | if self.test_file is not None: 83 | self.test_set = ReadFile(self.test_file, sep=self.sep).read() 84 | 85 | # Combining users/items from train and test set 86 | self.users = sorted(set(list(self.train_set['users']) + list(self.test_set['users']))) 87 | self.items = sorted(set(list(self.train_set['items']) + list(self.test_set['items']))) 88 | else: 89 | self.users = self.train_set['users'] 90 | self.items = self.train_set['items'] 91 | 92 | for i, item in enumerate(self.items): 93 | self.item_to_item_id.update({item: i}) 94 | self.item_id_to_item.update({i: item}) 95 | for u, user in enumerate(self.users): 96 | self.user_to_user_id.update({user: u}) 97 | self.user_id_to_user.update({u: user}) 98 | 99 | def create_matrix(self): 100 | """ 101 | Method to create a feedback matrix having users as rows and items as columns 102 | 103 | """ 104 | 105 | self.matrix = np.zeros((len(self.users), len(self.items))) 106 | 107 | for user in self.train_set['users']: 108 | for item in self.train_set['feedback'][user]: 109 | self.matrix[self.user_to_user_id[user]][self.item_to_item_id[item]] = \ 110 | self.train_set['feedback'][user][item] 111 | 112 | def compute_similarity(self, transpose=False): 113 | """ 114 | Method to compute a similarity matrix from original df_matrix 115 | 116 | :param transpose: If True, calculate the similarity in a transpose matrix 117 | :type transpose: bool, default False 118 | 119 | """ 120 | 121 | # Calculate distance matrix 122 | if transpose: 123 | similarity_matrix = np.float32(squareform(pdist(self.matrix.T, self.similarity_metric))) 124 | else: 125 | similarity_matrix = np.float32(squareform(pdist(self.matrix, self.similarity_metric))) 126 | 127 | # Remove NaNs 128 | similarity_matrix[np.isnan(similarity_matrix)] = 1.0 129 | # transform distances in similarities. 
Values in matrix range from 0-1 130 | similarity_matrix = (similarity_matrix.max() - similarity_matrix) / similarity_matrix.max() 131 | 132 | return similarity_matrix 133 | 134 | def evaluate(self, metrics, verbose=True, as_table=False, table_sep='\t'): 135 | """ 136 | Method to evaluate the final ranking 137 | 138 | :param metrics: List of evaluation metrics 139 | :type metrics: list, default ('MAE', 'RMSE') 140 | 141 | :param verbose: Print the evaluation results 142 | :type verbose: bool, default True 143 | 144 | :param as_table: Print the evaluation results as table 145 | :type as_table: bool, default False 146 | 147 | :param table_sep: Delimiter for print results (only work with verbose=True and as_table=True) 148 | :type table_sep: str, default '\t' 149 | 150 | """ 151 | 152 | self.evaluation_results = {} 153 | 154 | if metrics is None: 155 | metrics = list(['MAE', 'RMSE']) 156 | 157 | results = RatingPredictionEvaluation(verbose=verbose, as_table=as_table, table_sep=table_sep, metrics=metrics 158 | ).evaluate_recommender(predictions=self.predictions, 159 | test_set=self.test_set) 160 | 161 | for metric in metrics: 162 | self.evaluation_results[metric.upper()] = results[metric.upper()] 163 | 164 | def write_predictions(self): 165 | """ 166 | Method to write final ranking 167 | 168 | """ 169 | 170 | if self.output_file is not None: 171 | WriteFile(self.output_file, data=self.predictions, sep=self.sep).write() 172 | 173 | def compute(self, verbose=True): 174 | """ 175 | Method to run the recommender algorithm 176 | 177 | :param verbose: Print the information about recommender 178 | :type verbose: bool, default True 179 | 180 | """ 181 | 182 | # read files 183 | self.read_files() 184 | 185 | # initialize empty predictions (Don't remove: important to Cross Validation) 186 | self.predictions = [] 187 | 188 | if verbose: 189 | test_info = None 190 | 191 | main_info = { 192 | 'title': 'Rating Prediction > ' + self.recommender_name, 193 | 'n_users': len(self.train_set['users']), 194 | 'n_items': len(self.train_set['items']), 195 | 'n_interactions': self.train_set['number_interactions'], 196 | 'sparsity': self.train_set['sparsity'] 197 | } 198 | 199 | if self.test_file is not None: 200 | test_info = { 201 | 'n_users': len(self.test_set['users']), 202 | 'n_items': len(self.test_set['items']), 203 | 'n_interactions': self.test_set['number_interactions'], 204 | 'sparsity': self.test_set['sparsity'] 205 | } 206 | 207 | print_header(main_info, test_info) 208 | -------------------------------------------------------------------------------- /caserec/recommenders/rating_prediction/item_attribute_knn.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """" 3 | Item Based Collaborative Filtering Recommender with Attributes (Item Attribute KNN) 4 | [Rating Prediction] 5 | 6 | Its philosophy is as follows: in order to determine the rating of User u on item m, we can find other movies that 7 | are similar to item m, and based on User u’s ratings on those similar movies we infer his rating on item m. 8 | However, instead of traditional ItemKNN, this approach uses a metadata or pre-computed similarity matrix. 9 | 10 | """ 11 | 12 | # © 2019. 
Case Recommender (MIT License) 13 | 14 | from collections import defaultdict 15 | import numpy as np 16 | 17 | from caserec.recommenders.rating_prediction.itemknn import ItemKNN 18 | from caserec.utils.process_data import ReadFile 19 | 20 | __author__ = 'Arthur Fortes ' 21 | 22 | 23 | class ItemAttributeKNN(ItemKNN): 24 | def __init__(self, train_file=None, test_file=None, output_file=None, metadata_file=None, similarity_file=None, 25 | k_neighbors=30, as_similar_first=True, metadata_as_binary=False, metadata_similarity_sep='\t', 26 | similarity_metric="cosine", sep='\t', output_sep='\t'): 27 | """ 28 | Item Attribute KNN for Rating Prediction 29 | 30 | This algorithm predicts a rating for each pair (user, item) based on the similar items that the user consumed, 31 | using a metadata or pre-computed similarity file 32 | 33 | Usage:: 34 | 35 | >> ItemAttributeKNN(train, test, similarity_file=sim_matrix, as_similar_first=True).compute() 36 | >> ItemAttributeKNN(train, test, metadata_file=metadata, as_similar_first=False).compute() 37 | 38 | :param train_file: File which contains the train set. This file needs to have at least 3 columns 39 | (user item feedback_value). 40 | :type train_file: str, default None 41 | 42 | :param test_file: File which contains the test set. This file needs to have at least 3 columns 43 | (user item feedback_value). 44 | :type test_file: str, default None 45 | 46 | :param output_file: File with dir to write the final predictions 47 | :type output_file: str, default None 48 | 49 | :param metadata_file: File which contains the metadata set. This file needs to have at least 2 columns 50 | (item metadata). 51 | :type metadata_file: str, default None 52 | 53 | :param similarity_file: File which contains the similarity set. This file needs to have at least 3 columns 54 | (item item similarity). 55 | :type similarity_file: str, default None 56 | 57 | :param k_neighbors: Number of neighbors to use. If None, k_neighbors = int(sqrt(n_items)) 58 | :type k_neighbors: int, default 30 59 | 60 | :param as_similar_first: If True, for each unknown item, which will be predicted, we first look for its k 61 | most similar items and then take the intersection with the items that 62 | the user has seen. 63 | :type as_similar_first: bool, default True 64 | 65 | :param metadata_as_binary: If True, the explicit value will be transformed to binary 66 | :type metadata_as_binary: bool, default False 67 | 68 | :param metadata_similarity_sep: Delimiter for similarity or metadata file 69 | :type metadata_similarity_sep: str, default '\t' 70 | 71 | :param similarity_metric: Pairwise metric to compute the similarity between the items.
Reference about 72 | distances: http://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.spatial.distance.pdist.html 73 | :type similarity_metric: str, default cosine 74 | 75 | :param sep: Delimiter for input files file 76 | :type sep: str, default '\t' 77 | 78 | :param output_sep: Delimiter for output file 79 | :type output_sep: str, default '\t' 80 | 81 | """ 82 | 83 | super(ItemAttributeKNN, self).__init__(train_file=train_file, test_file=test_file, output_file=output_file, 84 | k_neighbors=k_neighbors, as_similar_first=as_similar_first, sep=sep, 85 | output_sep=output_sep, similarity_metric=similarity_metric) 86 | 87 | self.recommender_name = 'Item Attribute KNN Algorithm' 88 | 89 | self.metadata_file = metadata_file 90 | self.similarity_file = similarity_file 91 | self.metadata_as_binary = metadata_as_binary 92 | self.metadata_similarity_sep = metadata_similarity_sep 93 | 94 | def init_model(self): 95 | """ 96 | Method to fit the model. Create and calculate a similarity matrix by metadata file or a pre-computed similarity 97 | matrix 98 | 99 | """ 100 | 101 | self.similar_items = defaultdict(list) 102 | 103 | # Set the value for k 104 | if self.k_neighbors is None: 105 | self.k_neighbors = int(np.sqrt(len(self.items))) 106 | 107 | if self.metadata_file is not None: 108 | metadata = ReadFile(self.metadata_file, sep=self.metadata_similarity_sep, as_binary=self.metadata_as_binary 109 | ).read_metadata_or_similarity() 110 | 111 | self.matrix = np.zeros((len(self.items), len(metadata['col_2']))) 112 | 113 | meta_to_meta_id = {} 114 | for m, data in enumerate(metadata['col_2']): 115 | meta_to_meta_id[data] = m 116 | 117 | for item in metadata['col_1']: 118 | for m in metadata['dict'][item]: 119 | self.matrix[self.item_to_item_id[item], meta_to_meta_id[m]] = metadata['dict'][item][m] 120 | 121 | # create header info for metadata 122 | sparsity = (1 - (metadata['number_interactions'] / (len(metadata['col_1']) * len(metadata['col_2'])))) * 100 123 | 124 | self.extra_info_header = ">> metadata:: %d items and %d metadata (%d interactions) | sparsity:: %.2f%%" % \ 125 | (len(metadata['col_1']), len(metadata['col_2']), metadata['number_interactions'], 126 | sparsity) 127 | 128 | # Create similarity matrix based on metadata or similarity file. 
Transpose=False, because it is an 129 | # item x metadata matrix 130 | self.si_matrix = self.compute_similarity(transpose=False) 131 | 132 | elif self.similarity_file is not None: 133 | similarity = ReadFile(self.similarity_file, sep=self.metadata_similarity_sep, as_binary=False 134 | ).read_metadata_or_similarity() 135 | 136 | self.si_matrix = np.zeros((len(self.items), len(self.items))) 137 | 138 | # Fill similarity matrix 139 | for i in similarity['col_1']: 140 | for i_j in similarity['dict'][i]: 141 | self.si_matrix[self.item_to_item_id[i], self.item_to_item_id[int(i_j)]] = similarity['dict'][i][i_j] 142 | 143 | # Remove NaNs 144 | self.si_matrix[np.isnan(self.si_matrix)] = 0.0 145 | 146 | else: 147 | raise ValueError("This algorithm needs a similarity matrix or a metadata file!") 148 | 149 | # Create original matrix user x item for prediction process 150 | self.create_matrix() 151 | 152 | for i_id, item in enumerate(self.items): 153 | self.similar_items[i_id] = sorted(range(len(self.si_matrix[i_id])), 154 | key=lambda k: -self.si_matrix[i_id][k])[1:self.k_neighbors + 1] 155 | -------------------------------------------------------------------------------- /caserec/recommenders/rating_prediction/item_msmf.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """ 3 | Item-MSMF: Items Most Similar based on Matrix Factorization 4 | [Rating Prediction] 5 | 6 | Literature: 7 | 2018 Brazilian Conference on Intelligent Systems (BRACIS). 8 | Link soon. 9 | 10 | """ 11 | 12 | import numpy as np 13 | 14 | from caserec.recommenders.rating_prediction.matrixfactorization import MatrixFactorization 15 | from caserec.utils.extra_functions import timed 16 | 17 | __author__ = 'Eduardo Fressato ' 18 | 19 | 20 | class ItemMSMF(MatrixFactorization): 21 | def __init__(self, train_file=None, test_file=None, output_file=None, similarity_file=None, neighbors=20, 22 | factors=10, learn_rate=0.01, epochs=30, delta=0.015, init_mean=0.1, init_stdev=0.1, baseline=True, 23 | bias_learn_rate=0.005, delta_bias=0.002, stop_criteria=0.009, sep='\t', output_sep='\t', 24 | similarity_sep='\t', random_seed=None, verbose=True): 25 | 26 | super(ItemMSMF, self).__init__(train_file=train_file, test_file=test_file, output_file=output_file, sep=sep, 27 | learn_rate=learn_rate, factors=factors, epochs=epochs, delta=delta, 28 | init_mean=init_mean, init_stdev=init_stdev, baseline=baseline, 29 | bias_learn_rate=bias_learn_rate, delta_bias=delta_bias, 30 | stop_criteria=stop_criteria, output_sep=output_sep, random_seed=random_seed) 31 | 32 | """ 33 | Item-MSMF: Items Most Similar based on Matrix Factorization 34 | 35 | Item-MSMF is a recommender technique based on matrix factorization that incorporates item 36 | similarities calculated from metadata. It addresses the item 37 | cold-start problem through a shared latent factor vector representation of similar items, built from items 38 | that have enough interactions with users. In this way, the latent factor vectors of new items, which are not 39 | accurate in terms of rating prediction, are replaced with a weighted average of the latent factor 40 | vectors of the most similar items. 41 | 42 | Usage:: 43 | 44 | >> ItemMSMF(train, test, similarity_file, neighbors).compute() 45 | 46 | :param train_file: File which contains the train set. This file needs to have at least 3 columns 47 | (user item feedback_value).
48 | :type train_file: str 49 | 50 | :param test_file: File which contains the test set. This file needs to have at least 3 columns 51 | (user item feedback_value). 52 | :type test_file: str, default None 53 | 54 | :param output_file: File with dir to write the final predictions 55 | :type output_file: str, default None 56 | 57 | :param similarity_file: File which contains the similarity of items. This file needs to have at least 3 columns 58 | (item item similarity). 59 | :type similarity_file: str, default None 60 | 61 | :param neighbors: Number of items that replace the new item vector 62 | :type neighbors: int, default 20 63 | 64 | :param factors: Number of latent factors per user/item 65 | :type factors: int, default 10 66 | 67 | :param learn_rate: Learning rate (alpha) 68 | :type learn_rate: float, default 0.05 69 | 70 | :param epochs: Number of epochs over the training data 71 | :type epochs: int, default 30 72 | 73 | :param delta: Regularization value 74 | :type delta: float, default 0.015 75 | 76 | :param init_mean: Mean of the normal distribution used to initialize the latent factors 77 | :type init_mean: float, default 0 78 | 79 | :param init_stdev: Standard deviation of the normal distribution used to initialize the latent factors 80 | :type init_stdev: float, default 0.1 81 | 82 | :param bias_learn_rate: Learning rate for baselines 83 | :type bias_learn_rate: float, default 0.005 84 | 85 | :param delta_bias: Regularization value for baselines 86 | :type delta_bias: float, default 0.002 87 | 88 | :param stop_criteria: Difference between errors for stopping criteria 89 | :type stop_criteria: float, default 0.009 90 | 91 | :param random_seed: Number of seed. Lock random numbers for reproducibility of experiments. 92 | :type random_seed: int, default None 93 | 94 | :param verbose: Print information 95 | :type verbose: bool, default True 96 | 97 | """ 98 | 99 | self.recommender_name = 'Item-MSMF for Cold Start' 100 | self.verbose = verbose 101 | 102 | self.similarity_file = similarity_file 103 | self.similarity_sep = similarity_sep 104 | self.si_matrix = None 105 | self.new_items = set() 106 | self.k = neighbors 107 | 108 | def init_model(self): 109 | super(ItemMSMF, self).init_model() 110 | if self.verbose: 111 | print("\nread_similarity_matrix_time:: %4f sec" % timed(self.fill_similarity_matrix)) 112 | else: 113 | self.fill_similarity_matrix() 114 | 115 | def fill_similarity_matrix(self): 116 | self.si_matrix = np.zeros((len(self.items), len(self.items))) 117 | items_sim = set() 118 | 119 | with open(self.similarity_file, "r", encoding='utf-8') as infile: 120 | items = set(self.items) 121 | for line in infile: 122 | if line.strip(): 123 | inline = line.split(self.similarity_sep) 124 | item_a, item_b, sim = int(inline[0]), int(inline[1]), float(inline[2].rstrip()) 125 | 126 | if item_a in items and item_b in items: 127 | map_a = self.item_to_item_id[item_a] 128 | map_b = self.item_to_item_id[item_b] 129 | items_sim.add(item_a) 130 | items_sim.add(item_b) 131 | self.si_matrix[map_a][map_b] = sim 132 | self.si_matrix[map_b][map_a] = sim 133 | 134 | if self.verbose: 135 | print("Number of item in similarity file:", len(items_sim)) 136 | del items_sim 137 | 138 | def search_new_items(self): 139 | for i in self.test_set['items']: 140 | if i not in self.train_set['items']: 141 | self.new_items.add(i) 142 | 143 | def search_similar_items(self, item): 144 | item_index = self.item_to_item_id[item] 145 | count = 0 146 | list_items = [] 147 | list_similar = 
sorted(enumerate(self.si_matrix[item_index]), key=lambda x: -x[1]) 148 | 149 | for i, sim in list_similar: 150 | if i != item_index: 151 | if self.item_id_to_item[i] in self.train_set['items']: 152 | list_items.append((self.item_id_to_item[i], sim)) 153 | count += 1 154 | if count == self.k: 155 | return list_items 156 | return list_items  # Fallback: return whatever was found when fewer than self.k similar items exist in train 157 | def replace_vector_new_item(self): 158 | 159 | for item in self.new_items: 160 | list_items = self.search_similar_items(item) 161 | 162 | q_i = self.q[self.item_to_item_id[list_items[0][0]]].copy() * list_items[0][1] 163 | b_i = self.bi[self.item_to_item_id[list_items[0][0]]].copy() * list_items[0][1] 164 | sum_sim = list_items[0][1] 165 | 166 | for item_j, sim in list_items[1:]: 167 | q_i += self.q[self.item_to_item_id[item_j]].copy() * sim 168 | b_i += self.bi[self.item_to_item_id[item_j]].copy() * sim 169 | sum_sim += sim 170 | 171 | if sum_sim > 0: 172 | q_i = q_i / sum_sim 173 | b_i = b_i / sum_sim 174 | 175 | self.q[self.item_to_item_id[item]] = q_i.copy() 176 | if self.baseline: 177 | self.bi[self.item_to_item_id[item]] = b_i.copy() 178 | 179 | def compute(self, verbose=True, metrics=None, verbose_evaluation=True, as_table=False, table_sep='\t'): 180 | 181 | if verbose: 182 | super(MatrixFactorization, self).compute(verbose=verbose) 183 | self.init_model() 184 | 185 | print("training_time:: %4f sec" % timed(self.fit)) 186 | if self.extra_info_header is not None: 187 | print(self.extra_info_header) 188 | 189 | search_time = timed(self.search_new_items) 190 | replace_time = timed(self.replace_vector_new_item) 191 | prediction_time = timed(self.predict) 192 | 193 | print("search_new_items_time:: %4f sec" % search_time) 194 | print("vectors_replacement_time:: %4f sec" % replace_time) 195 | print("prediction_time:: %4f sec" % prediction_time) 196 | print("total_prediction_time:: %4f sec" % (search_time + replace_time + prediction_time)) 197 | print("\n") 198 | 199 | else: 200 | # Execute all in silence without prints 201 | super(MatrixFactorization, self).compute(verbose=verbose) 202 | self.init_model() 203 | self.fit() 204 | self.search_new_items() 205 | self.replace_vector_new_item() 206 | self.predict() 207 | 208 | self.write_predictions() 209 | 210 | if self.test_file is not None: 211 | return self.evaluate(metrics, verbose_evaluation, as_table=as_table, table_sep=table_sep) 212 | -------------------------------------------------------------------------------- /caserec/recommenders/rating_prediction/matrixfactorization.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """ 3 | Matrix Factorization Collaborative Filtering Recommender 4 | [Rating Prediction] 5 | 6 | Literature: 7 | Koren, Yehuda and Bell, Robert and Volinsky, Chris: 8 | Matrix Factorization Techniques for Recommender Systems 9 | IEEE Computer, 2009. 10 | http://dl.acm.org/citation.cfm?id=1608614 11 | 12 | """ 13 | 14 | # © 2019.
Case Recommender (MIT License) 15 | 16 | import numpy as np 17 | 18 | from caserec.recommenders.rating_prediction.base_rating_prediction import BaseRatingPrediction 19 | from caserec.utils.extra_functions import timed 20 | 21 | __author__ = 'Arthur Fortes ' 22 | 23 | 24 | class MatrixFactorization(BaseRatingPrediction): 25 | def __init__(self, train_file=None, test_file=None, output_file=None, factors=10, learn_rate=0.01, epochs=30, 26 | delta=0.015, init_mean=0.1, init_stdev=0.1, baseline=False, bias_learn_rate=0.005, delta_bias=0.002, 27 | stop_criteria=0.009, sep='\t', output_sep='\t', random_seed=None): 28 | """ 29 | Matrix Factorization for rating prediction 30 | 31 | Matrix factorization models map both users and items to a joint latent factor space of dimensionality f, 32 | such that user-item interactions are modeled as inner products in that space. 33 | 34 | Usage:: 35 | 36 | >> MatrixFactorization(train, test).compute() 37 | 38 | :param train_file: File which contains the train set. This file needs to have at least 3 columns 39 | (user item feedback_value). 40 | :type train_file: str 41 | 42 | :param test_file: File which contains the test set. This file needs to have at least 3 columns 43 | (user item feedback_value). 44 | :type test_file: str, default None 45 | 46 | :param output_file: File with dir to write the final predictions 47 | :type output_file: str, default None 48 | 49 | :param factors: Number of latent factors per user/item 50 | :type factors: int, default 10 51 | 52 | :param learn_rate: Learning rate (alpha) 53 | :type learn_rate: float, default 0.01 54 | 55 | :param epochs: Number of epochs over the training data 56 | :type epochs: int, default 30 57 | 58 | :param delta: Regularization value 59 | :type delta: float, default 0.015 60 | 61 | :param init_mean: Mean of the normal distribution used to initialize the latent factors 62 | :type init_mean: float, default 0.1 63 | 64 | :param init_stdev: Standard deviation of the normal distribution used to initialize the latent factors 65 | :type init_stdev: float, default 0.1 66 | 67 | :param baseline: Use the train data to build baselines (SVD Algorithm); else: Use only the mean 68 | :type baseline: bool, default False 69 | 70 | :param bias_learn_rate: Learning rate for baselines 71 | :type bias_learn_rate: float, default 0.005 72 | 73 | :param delta_bias: Regularization value for baselines 74 | :type delta_bias: float, default 0.002 75 | 76 | :param stop_criteria: Difference between errors for stopping criteria 77 | :type stop_criteria: float, default 0.009 78 | 79 | :param sep: Delimiter for input files 80 | :type sep: str, default '\t' 81 | 82 | :param output_sep: Delimiter for output file 83 | :type output_sep: str, default '\t' 84 | 85 | :param random_seed: Random seed. Locks random numbers for reproducibility of experiments.
86 | :type random_seed: int, default None 87 | 88 | """ 89 | super(MatrixFactorization, self).__init__(train_file=train_file, test_file=test_file, output_file=output_file, 90 | sep=sep, output_sep=output_sep) 91 | 92 | self.recommender_name = 'Matrix Factorization' 93 | 94 | self.epochs = epochs 95 | self.learn_rate = learn_rate 96 | self.delta = delta 97 | self.factors = factors 98 | self.init_mean = init_mean 99 | self.init_stdev = init_stdev 100 | self.baseline = baseline 101 | self.bias_learn_rate = bias_learn_rate 102 | self.delta_bias = delta_bias 103 | self.stop_criteria = stop_criteria 104 | 105 | if random_seed is not None: 106 | np.random.seed(random_seed) 107 | 108 | # internal vars 109 | self.feedback_triples = None 110 | self.p = None 111 | self.q = None 112 | self.bu = None 113 | self.bi = None 114 | 115 | def init_model(self): 116 | """ 117 | Method to treat and initialize the model 118 | 119 | """ 120 | self.feedback_triples = [] 121 | 122 | # Map interaction with ids 123 | for user in self.train_set['feedback']: 124 | for item in self.train_set['feedback'][user]: 125 | self.feedback_triples.append((self.user_to_user_id[user], self.item_to_item_id[item], 126 | self.train_set['feedback'][user][item])) 127 | 128 | # Initialize factors 129 | self.create_factors() 130 | 131 | def fit(self): 132 | """ 133 | This method performs iterations of stochastic gradient ascent over the training data. 134 | 135 | """ 136 | 137 | rmse_old = .0 138 | 139 | for epoch in range(self.epochs): 140 | 141 | error_final = .0 142 | 143 | for user, item, feedback in self.feedback_triples: 144 | 145 | eui = feedback - self._predict_score(user, item, False) 146 | error_final += (eui ** 2.0) 147 | 148 | # Adjust the factors 149 | u_f = self.p[user] 150 | i_f = self.q[item] 151 | 152 | # Compute factor updates 153 | delta_u = np.subtract(np.multiply(eui, i_f), np.multiply(self.delta, u_f)) 154 | delta_i = np.subtract(np.multiply(eui, u_f), np.multiply(self.delta, i_f)) 155 | 156 | # apply updates 157 | self.p[user] += np.multiply(self.learn_rate, delta_u) 158 | self.q[item] += np.multiply(self.learn_rate, delta_i) 159 | 160 | if self.baseline: 161 | self.bu[user] += self.bias_learn_rate * (eui - self.delta_bias * self.bu[user]) 162 | self.bi[item] += self.bias_learn_rate * (eui - self.delta_bias * self.bi[item]) 163 | 164 | rmse_new = np.sqrt(error_final / self.train_set["number_interactions"]) 165 | if np.fabs(rmse_new - rmse_old) <= self.stop_criteria: 166 | break 167 | else: 168 | rmse_old = rmse_new 169 | 170 | def create_factors(self): 171 | """ 172 | This method create factors for users, items and bias 173 | 174 | """ 175 | 176 | self.p = np.random.normal(self.init_mean, self.init_stdev, (len(self.users), self.factors)) 177 | self.q = np.random.normal(self.init_mean, self.init_stdev, (len(self.items), self.factors)) 178 | 179 | if self.baseline: 180 | self.bu = np.zeros(len(self.users), np.double) 181 | self.bi = np.zeros(len(self.items), np.double) 182 | 183 | def _predict_score(self, u, i, cond=True): 184 | """ 185 | Method to predict a single score for a pair (user, item) 186 | 187 | :param u: User ID 188 | :type u: int 189 | 190 | :param i: Item ID 191 | :type i: int 192 | 193 | :param cond: Use max and min values of train set to limit score 194 | :type cond: bool, default True 195 | 196 | :return: Score generate for pair (user, item) 197 | :rtype: float 198 | 199 | """ 200 | 201 | if self.baseline: 202 | rui = self.train_set["mean_value"] + self.bu[u] + self.bi[i] + np.dot(self.p[u], 
self.q[i]) 203 | else: 204 | rui = self.train_set['mean_value'] + np.dot(self.p[u], self.q[i]) 205 | 206 | if cond: 207 | if rui > self.train_set["max_value"]: 208 | rui = self.train_set["max_value"] 209 | elif rui < self.train_set["min_value"]: 210 | rui = self.train_set["min_value"] 211 | 212 | return rui 213 | 214 | def predict(self): 215 | """ 216 | This method computes a final rating for unknown pairs (user, item) 217 | 218 | """ 219 | 220 | if self.test_file is not None: 221 | for user in self.test_set['users']: 222 | for item in self.test_set['feedback'][user]: 223 | self.predictions.append((user, item, self._predict_score(self.user_to_user_id[user], 224 | self.item_to_item_id[item], True))) 225 | else: 226 | raise NotImplementedError 227 | 228 | def compute(self, verbose=True, metrics=None, verbose_evaluation=True, as_table=False, table_sep='\t'): 229 | """ 230 | Extends compute method from BaseRatingPrediction. Method to run recommender algorithm 231 | 232 | :param verbose: Print recommender and database information 233 | :type verbose: bool, default True 234 | 235 | :param metrics: List of evaluation measures 236 | :type metrics: list, default None 237 | 238 | :param verbose_evaluation: Print the evaluation results 239 | :type verbose_evaluation: bool, default True 240 | 241 | :param as_table: Print the evaluation results as table 242 | :type as_table: bool, default False 243 | 244 | :param table_sep: Delimiter for print results (only work with verbose=True and as_table=True) 245 | :type table_sep: str, default '\t' 246 | 247 | """ 248 | 249 | super(MatrixFactorization, self).compute(verbose=verbose) 250 | 251 | if verbose: 252 | self.init_model() 253 | print("training_time:: %4f sec" % timed(self.fit)) 254 | if self.extra_info_header is not None: 255 | print(self.extra_info_header) 256 | 257 | print("prediction_time:: %4f sec" % timed(self.predict)) 258 | 259 | print('\n') 260 | 261 | else: 262 | # Execute all in silence without prints 263 | self.init_model() 264 | self.fit() 265 | self.predict() 266 | 267 | self.write_predictions() 268 | 269 | if self.test_file is not None: 270 | self.evaluate(metrics, verbose_evaluation, as_table=as_table, table_sep=table_sep) 271 | -------------------------------------------------------------------------------- /caserec/recommenders/rating_prediction/most_popular.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """ 3 | Most Popular Collaborative Filtering Recommender 4 | [Rating Prediction] 5 | 6 | Most Popular predicts ratings for unobserved items for each user based on popularity of user and items. 7 | 8 | """ 9 | 10 | # © 2019. Case Recommender (MIT License) 11 | 12 | from caserec.recommenders.rating_prediction.base_rating_prediction import BaseRatingPrediction 13 | from caserec.utils.extra_functions import timed 14 | import numpy as np 15 | 16 | __author__ = 'Arthur Fortes ' 17 | 18 | 19 | class MostPopular(BaseRatingPrediction): 20 | def __init__(self, train_file=None, test_file=None, output_file=None, sep='\t', output_sep='\t'): 21 | """ 22 | Most Popular for Rating Prediction 23 | 24 | This algorithm predicts ratings for each user using the feedback counts of users and items 25 | 26 | Usage:: 27 | 28 | >> MostPopular(train, test).compute() 29 | 30 | :param train_file: File which contains the train set. This file needs to have at least 3 columns 31 | (user item feedback_value). 32 | :type train_file: str 33 | 34 | :param test_file: File which contains the test set.
This file needs to have at least 3 columns 35 | (user item feedback_value). 36 | :type test_file: str, default None 37 | 38 | :param output_file: File with dir to write the final predictions 39 | :type output_file: str, default None 40 | 41 | :param sep: Delimiter for input files 42 | :type sep: str, default '\t' 43 | 44 | :param output_sep: Delimiter for output file 45 | :type output_sep: str, default '\t' 46 | 47 | """ 48 | 49 | super(MostPopular, self).__init__(train_file=train_file, test_file=test_file, output_file=output_file, 50 | sep=sep, output_sep=output_sep) 51 | 52 | self.recommender_name = 'Most Popular' 53 | 54 | def predict(self): 55 | """ 56 | This method predicts the final results, building a rating prediction for each test pair (user, item). 57 | 58 | """ 59 | 60 | if self.test_file is not None: 61 | for user in self.test_set['users']: 62 | for item in self.test_set['feedback'][user]: 63 | 64 | count_value = 0 65 | feedback_value = 0 66 | 67 | for user_v in self.train_set['users_viewed_item'].get(item, []): 68 | feedback_value += self.train_set['feedback'][user_v][item] 69 | count_value += 1 70 | 71 | if feedback_value == 0: 72 | try: 73 | feedback_value = np.mean(list(self.train_set['feedback'][user].values())) 74 | except KeyError: 75 | feedback_value = self.train_set['mean_value'] 76 | else: 77 | feedback_value /= count_value 78 | 79 | self.predictions.append((user, item, feedback_value)) 80 | else: 81 | raise NotImplementedError 82 | 83 | def compute(self, verbose=True, metrics=None, verbose_evaluation=True, as_table=False, table_sep='\t'): 84 | """ 85 | Extends compute method from BaseRatingPrediction. Method to run recommender algorithm 86 | 87 | :param verbose: Print recommender and database information 88 | :type verbose: bool, default True 89 | 90 | :param metrics: List of evaluation measures 91 | :type metrics: list, default None 92 | 93 | :param verbose_evaluation: Print the evaluation results 94 | :type verbose_evaluation: bool, default True 95 | 96 | :param as_table: Print the evaluation results as table 97 | :type as_table: bool, default False 98 | 99 | :param table_sep: Delimiter for print results (only work with verbose=True and as_table=True) 100 | :type table_sep: str, default '\t' 101 | 102 | """ 103 | 104 | super(MostPopular, self).compute(verbose=verbose) 105 | 106 | if verbose: 107 | print("prediction_time:: %4f sec" % timed(self.predict)) 108 | print('\n') 109 | 110 | else: 111 | self.predict() 112 | 113 | self.write_predictions() 114 | 115 | if self.test_file is not None: 116 | self.evaluate(metrics, verbose_evaluation, as_table=as_table, table_sep=table_sep) 117 | -------------------------------------------------------------------------------- /caserec/recommenders/rating_prediction/nnmf.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """ 3 | Non-negative Matrix Factorization 4 | [Rating Prediction] 5 | 6 | Literature: 7 | Badrul Sarwar , George Karypis , Joseph Konstan , John Riedl: 8 | Incremental Singular Value Decomposition Algorithms for Highly Scalable Recommender Systems 9 | Fifth International Conference on Computer and Information Science 2002. 10 | http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.3.7894 11 | 12 | """ 13 | 14 | # © 2019.
Case Recommender (MIT License) 15 | 16 | import numpy as np 17 | from sklearn.decomposition import NMF 18 | 19 | from caserec.recommenders.rating_prediction.base_rating_prediction import BaseRatingPrediction 20 | from caserec.utils.extra_functions import timed 21 | 22 | __author__ = 'Joao Felipe Guedes ' 23 | 24 | 25 | class NNMF(BaseRatingPrediction): 26 | def __init__(self, train_file=None, test_file=None, output_file=None, factors=10, sep='\t', output_sep='\t', 27 | random_seed=None): 28 | """ 29 | Non-negative Matrix Factorization for rating prediction 30 | 31 | Matrix factorization models map both users and items to a joint latent factor space of dimensionality f, 32 | such that user-item interactions are modeled as inner products in that space. 33 | 34 | Usage:: 35 | 36 | >> NNMF(train, test).compute() 37 | 38 | :param train_file: File which contains the train set. This file needs to have at least 3 columns 39 | (user item feedback_value). 40 | :type train_file: str 41 | 42 | :param test_file: File which contains the test set. This file needs to have at least 3 columns 43 | (user item feedback_value). 44 | :type test_file: str, default None 45 | 46 | :param output_file: File with dir to write the final predictions 47 | :type output_file: str, default None 48 | 49 | :param factors: Number of latent factors per user/item 50 | :type factors: int, default 10 51 | 52 | :param sep: Delimiter for input files 53 | :type sep: str, default '\t' 54 | 55 | :param output_sep: Delimiter for output file 56 | :type output_sep: str, default '\t' 57 | 58 | :param random_seed: Random seed. Locks random numbers for reproducibility of experiments. 59 | :type random_seed: int, default None 60 | 61 | """ 62 | super(NNMF, self).__init__(train_file=train_file, test_file=test_file, output_file=output_file, sep=sep, 63 | output_sep=output_sep) 64 | 65 | self.recommender_name = 'NNMF' 66 | self.factors = factors 67 | 68 | if random_seed is not None: 69 | np.random.seed(random_seed) 70 | 71 | # internal vars 72 | self.feedback_triples = None 73 | self.prediction_matrix = None 74 | 75 | def init_model(self): 76 | """ 77 | Method to treat and initialize the model 78 | 79 | """ 80 | 81 | self.feedback_triples = [] 82 | 83 | # Map interaction with ids 84 | for user in self.train_set['feedback']: 85 | for item in self.train_set['feedback'][user]: 86 | self.feedback_triples.append((self.user_to_user_id[user], self.item_to_item_id[item], 87 | self.train_set['feedback'][user][item])) 88 | 89 | self.create_matrix() 90 | 91 | def fit(self): 92 | """ 93 | This method performs Non-negative matrix factorization over the training data.
94 | 95 | """ 96 | 97 | model = NMF(n_components=self.factors, init='random', random_state=0) 98 | 99 | P = model.fit_transform(self.matrix) 100 | 101 | Q = model.components_ 102 | 103 | self.prediction_matrix = np.dot(P, Q) 104 | 105 | def predict_score(self, u, i, cond=True): 106 | """ 107 | Method to predict a single score for a pair (user, item) 108 | 109 | :param u: User ID 110 | :type u: int 111 | 112 | :param i: Item ID 113 | :type i: int 114 | 115 | :param cond: Use max and min values of train set to limit score 116 | :type cond: bool, default True 117 | 118 | :return: Score generate for pair (user, item) 119 | :rtype: float 120 | 121 | """ 122 | 123 | rui = self.train_set["mean_value"] + self.prediction_matrix[u][i] 124 | 125 | if cond: 126 | if rui > self.train_set["max_value"]: 127 | rui = self.train_set["max_value"] 128 | elif rui < self.train_set["min_value"]: 129 | rui = self.train_set["min_value"] 130 | 131 | return rui 132 | 133 | def predict(self): 134 | """ 135 | This method computes a final rating for unknown pairs (user, item) 136 | 137 | """ 138 | 139 | if self.test_file is not None: 140 | for user in self.test_set['users']: 141 | for item in self.test_set['feedback'][user]: 142 | self.predictions.append((user, item, self.predict_score(self.user_to_user_id[user], 143 | self.item_to_item_id[item], True))) 144 | else: 145 | raise NotImplemented 146 | 147 | def compute(self, verbose=True, metrics=None, verbose_evaluation=True, as_table=False, table_sep='\t'): 148 | """ 149 | Extends compute method from BaseRatingPrediction. Method to run recommender algorithm 150 | 151 | :param verbose: Print recommender and database information 152 | :type verbose: bool, default True 153 | 154 | :param metrics: List of evaluation measures 155 | :type metrics: list, default None 156 | 157 | :param verbose_evaluation: Print the evaluation results 158 | :type verbose_evaluation: bool, default True 159 | 160 | :param as_table: Print the evaluation results as table 161 | :type as_table: bool, default False 162 | 163 | :param table_sep: Delimiter for print results (only work with verbose=True and as_table=True) 164 | :type table_sep: str, default '\t' 165 | 166 | """ 167 | 168 | super(NNMF, self).compute(verbose=verbose) 169 | 170 | if verbose: 171 | self.init_model() 172 | print("training_time:: %4f sec" % timed(self.fit)) 173 | if self.extra_info_header is not None: 174 | print(self.extra_info_header) 175 | 176 | print("prediction_time:: %4f sec" % timed(self.predict)) 177 | 178 | print('\n') 179 | 180 | else: 181 | # Execute all in silence without prints 182 | self.init_model() 183 | self.fit() 184 | self.predict() 185 | 186 | self.write_predictions() 187 | 188 | if self.test_file is not None: 189 | self.evaluate(metrics, verbose_evaluation, as_table=as_table, table_sep=table_sep) 190 | 191 | 192 | 193 | -------------------------------------------------------------------------------- /caserec/recommenders/rating_prediction/random_rec.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """" 3 | Random Collaborative Filtering Recommender 4 | [Rating Prediction (Rating)] 5 | 6 | Random predicts a user’s ratings based on random distributions of rates. 7 | 8 | """ 9 | 10 | # © 2019. Case Recommender (MIT License) 11 | 12 | import numpy as np 13 | 14 | from caserec.recommenders.rating_prediction.base_rating_prediction import BaseRatingPrediction 15 | from caserec.utils.extra_functions import timed 16 | 17 | __author__ = 'Fernando S. 
de Aguiar Neto ' 18 | 19 | 20 | class RandomRec(BaseRatingPrediction): 21 | def __init__(self, train_file, test_file, uniform=True, output_file=None, sep='\t', output_sep='\t', 22 | random_seed=None): 23 | """ 24 | Random recommendation for Rating Prediction 25 | 26 | This algorithm predicts ratings for each user-item 27 | 28 | Usage:: 29 | 30 | >> RandomRec(train, test).compute() 31 | 32 | :param train_file: File which contains the train set. This file needs to have at least 3 columns 33 | (user item feedback_value). 34 | :type train_file: str 35 | 36 | :param test_file: File which contains the test set. This file needs to have at least 3 columns 37 | (user item feedback_value). 38 | :type test_file: str, default None 39 | 40 | :param uniform: Indicates whether the ratings are drawn from a uniform sample or not 41 | if False, the ratings are drawn from a normal distribution with the same mean and standard deviation 42 | as the feedback provided in train 43 | :type uniform: bool, default True 44 | 45 | :param output_file: File with dir to write the final predictions 46 | :type output_file: str, default None 47 | 48 | :param sep: Delimiter for input files 49 | :type sep: str, default '\t' 50 | 51 | :param output_sep: Delimiter for output file 52 | :type output_sep: str, default '\t' 53 | 54 | :param random_seed: Number of seed. Lock random numbers for reproducibility of experiments. 55 | :type random_seed: int, default None 56 | 57 | """ 58 | 59 | super(RandomRec, self).__init__(train_file=train_file, test_file=test_file, output_file=output_file, 60 | sep=sep, output_sep=output_sep) 61 | 62 | if random_seed is not None: 63 | np.random.seed(random_seed) 64 | 65 | self.uniform = uniform 66 | 67 | self.recommender_name = 'Random Recommender' 68 | 69 | def predict(self): 70 | if not self.uniform: 71 | feedbacks = [] 72 | for user in self.train_set["users"]: 73 | for item in self.train_set['items_seen_by_user'][user]: 74 | feedbacks.append(self.train_set['feedback'][user][item]) 75 | 76 | std = np.std(feedbacks) 77 | 78 | if self.test_file is not None: 79 | for user in self.test_set['users']: 80 | for item in self.test_set['feedback'][user]: 81 | if self.uniform: 82 | feedback_value = np.random.uniform(self.train_set['min_value'], self.train_set['max_value']) 83 | else: 84 | feedback_value = np.random.normal(self.train_set['mean_value'], std) 85 | 86 | self.predictions.append((user, item, feedback_value)) 87 | else: 88 | raise NotImplemented 89 | 90 | def compute(self, verbose=True, metrics=None, verbose_evaluation=True, as_table=False, table_sep='\t'): 91 | """ 92 | Extends compute method from BaseRatingPrediction. 
Method to run recommender algorithm 93 | 94 | :param verbose: Print recommender and database information 95 | :type verbose: bool, default True 96 | 97 | :param metrics: List of evaluation measures 98 | :type metrics: list, default None 99 | 100 | :param verbose_evaluation: Print the evaluation results 101 | :type verbose_evaluation: bool, default True 102 | 103 | :param as_table: Print the evaluation results as table 104 | :type as_table: bool, default False 105 | 106 | :param table_sep: Delimiter for print results (only work with verbose=True and as_table=True) 107 | :type table_sep: str, default '\t' 108 | 109 | """ 110 | 111 | super(RandomRec, self).compute(verbose=verbose) 112 | 113 | if verbose: 114 | print("prediction_time:: %4f sec" % timed(self.predict)) 115 | print('\n') 116 | 117 | else: 118 | self.predict() 119 | 120 | self.write_predictions() 121 | 122 | if self.test_file is not None: 123 | self.evaluate(metrics, verbose_evaluation, as_table=as_table, table_sep=table_sep) 124 | -------------------------------------------------------------------------------- /caserec/recommenders/rating_prediction/svd.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """ 3 | Singular Value Decomposition Based Collaborative Filtering Recommender 4 | [Rating Prediction] 5 | 6 | Literature: 7 | Badrul Sarwar , George Karypis , Joseph Konstan , John Riedl: 8 | Incremental Singular Value Decomposition Algorithms for Highly Scalable Recommender Systems 9 | Fifth International Conference on Computer and Information Science 2002. 10 | http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.3.7894 11 | 12 | """ 13 | 14 | # © 2019. Case Recommender (MIT License) 15 | 16 | import numpy as np 17 | from scipy.sparse.linalg import svds 18 | 19 | from caserec.recommenders.rating_prediction.base_rating_prediction import BaseRatingPrediction 20 | from caserec.utils.extra_functions import timed 21 | 22 | __author__ = 'Arthur Fortes ' 23 | 24 | 25 | class SVD(BaseRatingPrediction): 26 | def __init__(self, train_file=None, test_file=None, output_file=None, factors=10, sep='\t', output_sep='\t', 27 | random_seed=None): 28 | """ 29 | Singular Value Decomposition (SVD) for rating prediction 30 | 31 | Matrix factorization models map both users and items to a joint latent factor space of dimensionality f, 32 | such that user-item interactions are modeled as inner products in that space. 33 | 34 | Usage:: 35 | 36 | >> SVD(train, test).compute() 37 | 38 | :param train_file: File which contains the train set. This file needs to have at least 3 columns 39 | (user item feedback_value). 40 | :type train_file: str 41 | 42 | :param test_file: File which contains the test set. This file needs to have at least 3 columns 43 | (user item feedback_value). 44 | :type test_file: str, default None 45 | 46 | :param output_file: File with dir to write the final predictions 47 | :type output_file: str, default None 48 | 49 | :param factors: Number of latent factors per user/item 50 | :type factors: int, default 10 51 | 52 | :param sep: Delimiter for input files 53 | :type sep: str, default '\t' 54 | 55 | :param output_sep: Delimiter for output file 56 | :type output_sep: str, default '\t' 57 | 58 | :param random_seed: Random seed. Locks random numbers for reproducibility of experiments.
59 | :type random_seed: int, default None 60 | 61 | """ 62 | super(SVD, self).__init__(train_file=train_file, test_file=test_file, output_file=output_file, sep=sep, 63 | output_sep=output_sep) 64 | 65 | self.recommender_name = 'SVD' 66 | self.factors = factors 67 | 68 | if random_seed is not None: 69 | np.random.seed(random_seed) 70 | 71 | # internal vars 72 | self.feedback_triples = None 73 | self.prediction_matrix = None 74 | 75 | def init_model(self): 76 | """ 77 | Method to treat and initialize the model 78 | 79 | """ 80 | 81 | self.feedback_triples = [] 82 | 83 | # Map interaction with ids 84 | for user in self.train_set['feedback']: 85 | for item in self.train_set['feedback'][user]: 86 | self.feedback_triples.append((self.user_to_user_id[user], self.item_to_item_id[item], 87 | self.train_set['feedback'][user][item])) 88 | 89 | self.create_matrix() 90 | 91 | def fit(self): 92 | """ 93 | This method performs Singular Value Decomposition over the training data. 94 | 95 | """ 96 | 97 | u, s, vt = svds(self.matrix, k=self.factors) 98 | s_diagonal_matrix = np.diag(s) 99 | self.prediction_matrix = np.dot(np.dot(u, s_diagonal_matrix), vt) 100 | 101 | def predict_score(self, u, i, cond=True): 102 | """ 103 | Method to predict a single score for a pair (user, item) 104 | 105 | :param u: User ID 106 | :type u: int 107 | 108 | :param i: Item ID 109 | :type i: int 110 | 111 | :param cond: Use max and min values of train set to limit score 112 | :type cond: bool, default True 113 | 114 | :return: Score generate for pair (user, item) 115 | :rtype: float 116 | 117 | """ 118 | 119 | rui = self.train_set["mean_value"] + self.prediction_matrix[u][i] 120 | 121 | if cond: 122 | if rui > self.train_set["max_value"]: 123 | rui = self.train_set["max_value"] 124 | elif rui < self.train_set["min_value"]: 125 | rui = self.train_set["min_value"] 126 | 127 | return rui 128 | 129 | def predict(self): 130 | """ 131 | This method computes a final rating for unknown pairs (user, item) 132 | 133 | """ 134 | 135 | if self.test_file is not None: 136 | for user in self.test_set['users']: 137 | for item in self.test_set['feedback'][user]: 138 | self.predictions.append((user, item, self.predict_score(self.user_to_user_id[user], 139 | self.item_to_item_id[item], True))) 140 | else: 141 | raise NotImplemented 142 | 143 | def compute(self, verbose=True, metrics=None, verbose_evaluation=True, as_table=False, table_sep='\t'): 144 | """ 145 | Extends compute method from BaseRatingPrediction. 
Method to run recommender algorithm 146 | 147 | :param verbose: Print recommender and database information 148 | :type verbose: bool, default True 149 | 150 | :param metrics: List of evaluation measures 151 | :type metrics: list, default None 152 | 153 | :param verbose_evaluation: Print the evaluation results 154 | :type verbose_evaluation: bool, default True 155 | 156 | :param as_table: Print the evaluation results as table 157 | :type as_table: bool, default False 158 | 159 | :param table_sep: Delimiter for print results (only work with verbose=True and as_table=True) 160 | :type table_sep: str, default '\t' 161 | 162 | """ 163 | 164 | super(SVD, self).compute(verbose=verbose) 165 | 166 | if verbose: 167 | self.init_model() 168 | print("training_time:: %4f sec" % timed(self.fit)) 169 | if self.extra_info_header is not None: 170 | print(self.extra_info_header) 171 | 172 | print("prediction_time:: %4f sec" % timed(self.predict)) 173 | 174 | print('\n') 175 | 176 | else: 177 | # Execute all in silence without prints 178 | self.init_model() 179 | self.fit() 180 | self.predict() 181 | 182 | self.write_predictions() 183 | 184 | if self.test_file is not None: 185 | self.evaluate(metrics, verbose_evaluation, as_table=as_table, table_sep=table_sep) 186 | -------------------------------------------------------------------------------- /caserec/recommenders/rating_prediction/svdplusplus.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """ 3 | SVD++ Based Collaborative Filtering Recommender 4 | [Rating Prediction] 5 | 6 | Literature: 7 | Yehuda Koren: 8 | Factorization meets the neighborhood: a multifaceted collaborative filtering model 9 | KDD 2008 10 | http://portal.acm.org/citation.cfm?id=1401890.1401944 11 | 12 | """ 13 | 14 | # © 2019. Case Recommender (MIT License) 15 | 16 | import numpy as np 17 | 18 | from caserec.recommenders.rating_prediction.matrixfactorization import MatrixFactorization 19 | 20 | __author__ = 'Arthur Fortes ' 21 | 22 | 23 | class SVDPlusPlus(MatrixFactorization): 24 | def __init__(self, train_file=None, test_file=None, output_file=None, factors=10, learn_rate=0.01, epochs=10, 25 | delta=0.015, init_mean=0.1, init_stdev=0.1, bias_learn_rate=0.005, delta_bias=0.002, 26 | stop_criteria=0.009, sep='\t', output_sep='\t', random_seed=None, update_delta=False): 27 | """ 28 | SVD++ for rating prediction 29 | 30 | The SVD++ algorithm, an extension of SVD taking into account implicit ratings. Just as for SVD, the parameters 31 | are learned using a SGD on the regularized squared error objective. 32 | 33 | Usage:: 34 | 35 | >> SVDPlusPlus(train, test).compute() 36 | 37 | :param train_file: File which contains the train set. This file needs to have at least 3 columns 38 | (user item feedback_value). 39 | :type train_file: str 40 | 41 | :param test_file: File which contains the test set. This file needs to have at least 3 columns 42 | (user item feedback_value). 
43 | :type test_file: str, default None 44 | 45 | :param output_file: File with dir to write the final predictions 46 | :type output_file: str, default None 47 | 48 | :param factors: Number of latent factors per user/item 49 | :type factors: int, default 10 50 | 51 | :param learn_rate: Learning rate (alpha) 52 | :type learn_rate: float, default 0.01 53 | 54 | :param epochs: Number of epochs over the training data 55 | :type epochs: int, default 10 56 | 57 | :param delta: Regularization value 58 | :type delta: float, default 0.015 59 | 60 | :param init_mean: Mean of the normal distribution used to initialize the latent factors 61 | :type init_mean: float, default 0.1 62 | 63 | :param init_stdev: Standard deviation of the normal distribution used to initialize the latent factors 64 | :type init_stdev: float, default 0.1 65 | 66 | :param bias_learn_rate: Learning rate for baselines 67 | :type bias_learn_rate: float, default 0.005 68 | 69 | :param delta_bias: Regularization value for baselines 70 | :type delta_bias: float, default 0.002 71 | 72 | :param stop_criteria: Difference between errors for stopping criteria 73 | :type stop_criteria: float, default 0.009 74 | 75 | :param sep: Delimiter for input files 76 | :type sep: str, default '\t' 77 | 78 | :param output_sep: Delimiter for output file 79 | :type output_sep: str, default '\t' 80 | 81 | :param random_seed: Random seed. Locks random numbers for reproducibility of experiments. 82 | :type random_seed: int, default None 83 | 84 | """ 85 | 86 | super(SVDPlusPlus, self).__init__(train_file=train_file, test_file=test_file, output_file=output_file, 87 | factors=factors, learn_rate=learn_rate, epochs=epochs, delta=delta, 88 | init_mean=init_mean, init_stdev=init_stdev, baseline=True, 89 | bias_learn_rate=bias_learn_rate, delta_bias=delta_bias, 90 | stop_criteria=stop_criteria, sep=sep, output_sep=output_sep, 91 | random_seed=random_seed) 92 | 93 | self.recommender_name = 'SVDPlusPlus' 94 | self.update_delta = update_delta 95 | 96 | self.y = None 97 | self.n_u = None 98 | self.items_id_seen_by_user = None 99 | 100 | def init_model(self): 101 | """ 102 | Method to treat and initialize the model. Extends init_model from MatrixFactorization 103 | 104 | """ 105 | 106 | super(SVDPlusPlus, self).init_model() 107 | 108 | self.n_u = {} 109 | self.items_id_seen_by_user = {} 110 | 111 | for user in self.train_set['users']: 112 | for item in self.train_set['items_seen_by_user'][user]: 113 | self.items_id_seen_by_user.setdefault(self.user_to_user_id[user], []).append(self.item_to_item_id[item]) 114 | # sqrt(|N(u)|), used later as the divisor that scales the implicit feedback 115 | self.n_u[self.user_to_user_id[user]] = np.sqrt(len(self.train_set['items_seen_by_user'][user])) 116 | 117 | def fit(self): 118 | """ 119 | This method performs iterations of stochastic gradient descent over the training data.
120 | 121 | """ 122 | 123 | rmse_old = .0 124 | for epoch in range(self.epochs): 125 | error_final = .0 126 | 127 | for user, item, feedback in self.feedback_triples: 128 | pu = self.p[user] + self.y_sum_rows(user) 129 | 130 | # Calculate error 131 | eui = feedback - self._predict_svd_plus_plus_score(user, item, pu, False) 132 | error_final += (eui ** 2.0) 133 | 134 | # update bu and bi 135 | self.bu[user] += self.bias_learn_rate * (eui - self.delta_bias * self.bu[user]) 136 | self.bi[item] += self.bias_learn_rate * (eui - self.delta_bias * self.bi[item]) 137 | 138 | # Adjust the factors 139 | norm_eui = eui / self.n_u[user] 140 | 141 | i_f = self.q[item] 142 | 143 | # Compute factor updates 144 | delta_u = np.subtract(np.multiply(eui, i_f), np.multiply(self.delta, self.p[user])) 145 | self.p[user] += np.multiply(self.learn_rate, delta_u) 146 | 147 | delta_i = np.subtract(np.multiply(eui, pu), np.multiply(self.delta, i_f)) 148 | self.q[item] += np.multiply(self.learn_rate, delta_i) 149 | 150 | # update y (implicit factor) 151 | common_update = norm_eui * i_f 152 | 153 | for j in self.items_id_seen_by_user[user]: 154 | delta_y = np.subtract(common_update, self.delta * self.y[j]) 155 | self.y[j] += self.learn_rate * delta_y 156 | 157 | rmse_new = np.sqrt(error_final / self.train_set["number_interactions"]) 158 | 159 | if np.fabs(rmse_new - rmse_old) <= self.stop_criteria: 160 | break 161 | else: 162 | rmse_old = rmse_new 163 | 164 | def create_factors(self): 165 | """ 166 | This method extends create_factors from Matrix Factorization, adding y factors 167 | 168 | """ 169 | 170 | super(SVDPlusPlus, self).create_factors() 171 | self.y = np.random.normal(self.init_mean, self.init_stdev, (len(self.items), self.factors)) 172 | 173 | def _predict_svd_plus_plus_score(self, u, i, pu, cond=True): 174 | """ 175 | 176 | :param u: User ID (from self.items) 177 | :type u: int 178 | 179 | :param i: Item ID (from self.items) 180 | :type i: int 181 | 182 | :param pu: User updated vector (pu * y) 183 | :type pu: list or np.array 184 | 185 | :param cond: Use max and min values of train set to limit score 186 | :type cond: bool, default True 187 | 188 | :return: prediction for user u and item i 189 | :rtype: float 190 | 191 | """ 192 | rui = self.train_set["mean_value"] + self.bu[u] + self.bi[i] + np.dot(pu, self.q[i]) 193 | 194 | if cond: 195 | if rui > self.train_set["max_value"]: 196 | rui = self.train_set["max_value"] 197 | elif rui < self.train_set["min_value"]: 198 | rui = self.train_set["min_value"] 199 | return rui 200 | 201 | def y_sum_rows(self, user): 202 | """ 203 | Incorporating implicit feedback in the SVD: Sum (j E N(u)) Yj 204 | 205 | :param user: User ID 206 | :type user: int 207 | 208 | :return: Sum of y vectors for seen items of user 209 | 210 | """ 211 | 212 | sum_imp = np.zeros(self.factors) 213 | for ui in self.items_id_seen_by_user[user]: 214 | sum_imp += self.y[ui] 215 | return sum_imp / self.n_u[user] 216 | 217 | def predict(self): 218 | """ 219 | This method computes a final rating for unknown pairs (user, item) 220 | 221 | """ 222 | 223 | if self.test_file is not None: 224 | for user in self.test_set['users']: 225 | pu = self.p[self.user_to_user_id[user]] + self.y_sum_rows(self.user_to_user_id[user]) 226 | 227 | for item in self.test_set['feedback'][user]: 228 | self.predictions.append( 229 | (user, item, self._predict_svd_plus_plus_score(self.user_to_user_id[user], 230 | self.item_to_item_id[item], pu, True))) 231 | else: 232 | raise NotImplemented 233 | 
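[Editor's aside] The SVD++ scoring rule implemented by _predict_svd_plus_plus_score and y_sum_rows above is r_ui = mu + b_u + b_i + q_i . (p_u + |N(u)|^(-1/2) * sum of y_j over j in N(u)). A minimal, standalone sketch with toy values (illustration only, not library code):

import numpy as np

mu = 3.5                                   # toy global mean rating
b_u, b_i = 0.10, -0.05                     # toy user and item biases
p_u = np.array([0.20, -0.10])              # explicit user factors
q_i = np.array([0.30, 0.40])               # item factors
y = np.array([[0.05, 0.00],                # implicit factors of the two items seen by the user
              [0.10, -0.20]])

n_u = np.sqrt(len(y))                      # sqrt(|N(u)|), the scaling term stored in self.n_u
pu = p_u + y.sum(axis=0) / n_u             # user vector enriched with implicit feedback (y_sum_rows)
rui = mu + b_u + b_i + np.dot(pu, q_i)
print(float(np.clip(rui, 1.0, 5.0)))       # clipped to the train-set rating range, as cond=True does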
-------------------------------------------------------------------------------- /caserec/recommenders/rating_prediction/user_attribute_knn.py: --------------------------------------------------------------------------------
1 | # coding=utf-8
2 | """
3 | User Based Collaborative Filtering Recommender with Attributes (User Attribute KNN)
4 | [Rating Prediction]
5 |
6 | User-Attribute-kNN predicts a user's rating according to how similar users rated the same item. The algorithm
7 | matches similar users based on the similarity of their attribute scores. Unlike the traditional UserKNN,
8 | this approach uses a similarity matrix pre-computed from metadata.
9 |
10 |
11 | """
12 |
13 | # © 2019. Case Recommender (MIT License)
14 |
15 | import numpy as np
16 |
17 | from caserec.recommenders.rating_prediction.userknn import UserKNN
18 | from caserec.utils.process_data import ReadFile
19 |
20 | __author__ = 'Arthur Fortes <fortes.arthur@gmail.com>'
21 |
22 |
23 | class UserAttributeKNN(UserKNN):
24 | def __init__(self, train_file=None, test_file=None, output_file=None, metadata_file=None, similarity_file=None,
25 | k_neighbors=30, as_similar_first=True, metadata_as_binary=False,
26 | metadata_similarity_sep='\t', similarity_metric="cosine", sep='\t', output_sep='\t'):
27 | """
28 | User Attribute KNN for Rating Prediction
29 |
30 | This algorithm predicts a rating for each pair (user, item) based on the items that the user's neighbors
31 | (similar users) consumed, using a metadata file or a pre-computed similarity file
32 |
33 | Usage::
34 |
35 | >> UserAttributeKNN(train, test, similarity_file=sim_matrix, as_similar_first=True).compute()
36 | >> UserAttributeKNN(train, test, metadata_file=metadata, as_similar_first=True).compute()
37 |
38 | :param train_file: File which contains the train set. This file needs to have at least 3 columns
39 | (user item feedback_value).
40 | :type train_file: str
41 |
42 | :param test_file: File which contains the test set. This file needs to have at least 3 columns
43 | (user item feedback_value).
44 | :type test_file: str, default None
45 |
46 | :param output_file: File (with its directory path) to write the final predictions
47 | :type output_file: str, default None
48 |
49 | :param metadata_file: File which contains the metadata set. This file needs to have at least 2 columns
50 | (user metadata).
51 | :type metadata_file: str, default None
52 |
53 | :param similarity_file: File which contains the similarity set. This file needs to have at least 3 columns
54 | (user user similarity).
55 | :type similarity_file: str, default None
56 |
57 | :param k_neighbors: Number of neighbors to use. If None, k_neighbors = int(sqrt(n_users))
58 | :type k_neighbors: int, default 30
59 |
60 | :param as_similar_first: If True, for each unknown item to be predicted, we first look for its k
61 | most similar users and then take the intersection with the users that
62 | have seen that item.
:type as_similar_first: bool, default True
64 |
65 | :param metadata_as_binary: If True, the explicit values will be transformed to binary
66 | :type metadata_as_binary: bool, default False
67 |
68 | :param metadata_similarity_sep: Delimiter for similarity or metadata file
69 | :type metadata_similarity_sep: str, default '\t'
70 |
71 | :param similarity_metric: Metric used to compute the user similarity matrix
72 | :type similarity_metric: str, default cosine
73 |
74 | :param sep: Delimiter for input files
75 | :type sep: str, default '\t'
76 |
77 | :param output_sep: Delimiter for output file
78 | :type output_sep: str, default '\t'
79 | """
80 | super(UserAttributeKNN, self).__init__(train_file=train_file, test_file=test_file, output_file=output_file,
81 | k_neighbors=k_neighbors, as_similar_first=as_similar_first,
82 | similarity_metric=similarity_metric, sep=sep, output_sep=output_sep)
83 |
84 | self.recommender_name = 'User Attribute KNN Algorithm'
85 |
86 | self.metadata_file = metadata_file
87 | self.similarity_file = similarity_file
88 | self.metadata_as_binary = metadata_as_binary
89 | self.metadata_similarity_sep = metadata_similarity_sep
90 |
91 | def init_model(self):
92 | """
93 | Method to initialize the model. Creates and calculates the similarity matrix from a metadata file, or loads
94 | a pre-computed similarity matrix
95 |
96 | """
97 | self.users_id_viewed_item = {}
98 |
99 | # Set the value for k
100 | if self.k_neighbors is None:
101 | self.k_neighbors = int(np.sqrt(len(self.users)))
102 |
103 | for item in self.items:
104 | for user in self.train_set['users_viewed_item'].get(item, []):
105 | self.users_id_viewed_item.setdefault(item, []).append(self.user_to_user_id[user])
106 |
107 | if self.metadata_file is not None:
108 | metadata = ReadFile(self.metadata_file, sep=self.metadata_similarity_sep, as_binary=self.metadata_as_binary
109 | ).read_metadata_or_similarity()
110 |
111 | self.matrix = np.zeros((len(self.users), len(metadata['col_2'])))
112 |
113 | meta_to_meta_id = {}
114 | for m, data in enumerate(metadata['col_2']):
115 | meta_to_meta_id[data] = m
116 |
117 | for user_m in metadata['col_1']:
118 | for m1 in metadata['dict'][user_m]:
119 | try:
120 | self.matrix[self.user_to_user_id[user_m], meta_to_meta_id[m1]] = metadata['dict'][user_m][m1]
121 | except KeyError:
122 | pass
123 |
124 | # create header info for metadata
125 | sparsity = (1 - (metadata['number_interactions'] / (len(metadata['col_1']) * len(metadata['col_2'])))) * 100
126 |
127 | self.extra_info_header = ">> metadata:: %d users and %d metadata (%d interactions) | sparsity:: %.2f%%" % \
128 | (len(metadata['col_1']), len(metadata['col_2']), metadata['number_interactions'],
129 | sparsity)
130 |
131 | # Create similarity matrix based on metadata or similarity file
132 | self.su_matrix = self.compute_similarity(transpose=False)
133 |
134 | elif self.similarity_file is not None:
135 | similarity = ReadFile(self.similarity_file, sep=self.metadata_similarity_sep, as_binary=False
136 | ).read_metadata_or_similarity()
137 |
138 | self.su_matrix = np.zeros((len(self.users), len(self.users)))
139 |
140 | # Fill similarity matrix
141 | for u in similarity['col_1']:
142 | for u_j in similarity['dict'][u]:
143 | self.su_matrix[self.user_to_user_id[u], self.user_to_user_id[int(u_j)]] = similarity['dict'][u][u_j]
144 |
145 | # Remove NaNs
146 | self.su_matrix[np.isnan(self.su_matrix)] = 0.0
147 |
148 | else:
149 | raise ValueError("This algorithm needs a similarity matrix or a metadata file!")
150 |
151 | # Create original matrix user x item for prediction process
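# (create_matrix itself is inherited from the KNN base implementation)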
152 | self.create_matrix()
153 |
-------------------------------------------------------------------------------- /caserec/utils/__init__.py: --------------------------------------------------------------------------------
1 | __author__ = 'Arthur'
2 |
-------------------------------------------------------------------------------- /caserec/utils/cross_validation.py: --------------------------------------------------------------------------------
1 | # coding=utf-8
2 | """
3 | Cross Validation for Recommender Algorithms
4 |
5 | """
6 |
7 | # © 2019. Case Recommender (MIT License)
8 |
9 | from collections import defaultdict
10 | import numpy as np
11 | import shutil
12 |
13 | from caserec.utils.split_database import SplitDatabase
14 |
15 | __author__ = 'Arthur Fortes <fortes.arthur@gmail.com>'
16 |
17 |
18 | class CrossValidation(object):
19 | def __init__(self, input_file, recommender, dir_folds, k_folds=10, header=None, sep='\t', write_predictions=False,
20 | write_sep='\t', recommender_verbose=False, evaluation_in_fold_verbose=True, metrics=None,
21 | as_table=False, table_sep='\t', del_folds=False, random_seed=None):
22 | """
23 | Cross Validation
24 |
25 | This strategy is responsible for dividing the database into k folds, where each fold contains a train and a
26 | test set. It is also responsible for running and evaluating the recommender in each fold and for calculating
27 | the mean and the standard deviation of the results.
28 |
29 | Usage:
30 | >> rec = MostPopular(as_binary=True)
31 | >> CrossValidation(db, rec, fold_d, evaluation_in_fold_verbose=False).compute()
32 |
33 | :param input_file: Database file
34 | :type input_file: str
35 |
36 | :param recommender: Initialized recommender algorithm, e.g.: MostPopular(as_binary=True)
37 | :type recommender: class
38 |
39 | :param dir_folds: Directory to write folds (train and test files)
40 | :type dir_folds: str
41 |
42 | :param k_folds: How many folds to divide the database into
43 | :type k_folds: int, default 10
44 |
45 | :param header: Skip header line
46 | :type header: int, default None
47 |
48 | :param sep: Delimiter for input files
49 | :type sep: str, default '\t'
50 |
51 | :param write_predictions: Write the recommender predictions in each fold
52 | :type write_predictions: bool, default False
53 |
54 | :param write_sep: Delimiter for output files
55 | :type write_sep: str, default '\t'
56 |
57 | :param recommender_verbose: Print header of recommender in each fold
58 | :type recommender_verbose: bool, default False
59 |
60 | :param evaluation_in_fold_verbose: Print evaluation of recommender in each fold
61 | :type evaluation_in_fold_verbose: bool, default True
62 |
63 | :param metrics: List of evaluation metrics
64 | :type metrics: list, default None
65 |
66 | :param as_table: Print the evaluation results as table
67 | :type as_table: bool, default False
68 |
69 | :param table_sep: Delimiter for printed results (only works with verbose=True and as_table=True)
70 | :type table_sep: str, default '\t'
71 |
72 | :param del_folds: Delete folds after evaluation
73 | :type del_folds: bool, default False
74 |
75 | :param random_seed: Random seed
76 | :type random_seed: int, default None
77 |
78 | """
79 |
80 | self.input_file = input_file
81 | self.recommender = recommender
82 | self.dir_folds = dir_folds
83 | self.k_folds = k_folds
84 | self.header = header
85 | self.sep = sep
86 | self.write_predictions = write_predictions
87 | self.write_sep = write_sep
88 | self.recommender_verbose = recommender_verbose
89 | self.evaluation_in_fold_verbose = evaluation_in_fold_verbose
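# Note: when metrics is None, execute_algorithm() fills it in with the metric
# names found in the recommender's evaluation results of the first fold.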
90 | self.metrics = metrics
91 | self.as_table = as_table
92 | self.table_sep = table_sep
93 | self.del_folds = del_folds
94 | self.random_seed = random_seed
95 |
96 | # internal vars
97 | self.folds_results = defaultdict(list)
98 |
99 | def generate_folds(self):
100 | """
101 | Method to generate folds with k-fold cross-validation
102 |
103 | """
104 |
105 | SplitDatabase(input_file=self.input_file, n_splits=self.k_folds, dir_folds=self.dir_folds,
106 | sep_read=self.sep, header=self.header).k_fold_cross_validation(random_state=self.random_seed)
107 |
108 | def execute_algorithm(self):
109 | """
110 | Method to run the recommender algorithm in the k folds
111 |
112 | """
113 |
114 | for k in range(self.k_folds):
115 | train_file = self.dir_folds + 'folds/%d/train.dat' % k
116 | test_file = self.dir_folds + 'folds/%d/test.dat' % k
117 |
118 | self.recommender.train_file = train_file
119 | self.recommender.test_file = test_file
120 |
121 | if self.write_predictions:
122 | output_file = self.dir_folds + 'folds/%d/output.dat' % k
123 | self.recommender.output_file = output_file
124 |
125 | self.recommender.compute(verbose=self.recommender_verbose,
126 | verbose_evaluation=self.evaluation_in_fold_verbose, metrics=self.metrics)
127 |
128 | if self.metrics is None:
129 | self.metrics = self.recommender.evaluation_results.keys()
130 |
131 | for metric in self.metrics:
132 | self.folds_results[metric.upper()].append(self.recommender.evaluation_results[metric.upper()])
133 |
134 | def evaluate(self, verbose=True):
135 | """
136 | Method to evaluate the fold results and generate the mean and the standard deviation
137 |
138 | :param verbose: If True, print evaluation results
139 | :type verbose: bool, default True
140 |
141 | """
142 |
143 | mean_dict = defaultdict(dict)
144 | std_dict = defaultdict(dict)
145 |
146 | for metric in self.metrics:
147 | mean_dict[metric.upper()] = np.mean(self.folds_results[metric.upper()])
148 | std_dict[metric.upper()] = np.std(self.folds_results[metric.upper()])
149 |
150 | if verbose:
151 | if self.as_table:
152 | header = ''
153 | values_mean = ''
154 | values_std = ''
155 | for metric in self.metrics:
156 | header += metric.upper() + self.table_sep
157 | values_mean += str(round(mean_dict[metric.upper()], 6)) + self.table_sep
158 | values_std += str(round(std_dict[metric.upper()], 6)) + self.table_sep
159 | print('Metric%s%s' % (self.table_sep, header))
160 | print('Mean%s%s' % (self.table_sep, values_mean))
161 | print('STD%s%s' % (self.table_sep, values_std))
162 | else:
163 | evaluation_mean = 'Mean:: '
164 | evaluation_std = 'STD:: '
165 | for metric in self.metrics:
166 | evaluation_mean += "%s: %.6f " % (metric.upper(), mean_dict[metric.upper()])
167 | evaluation_std += "%s: %.6f " % (metric.upper(), std_dict[metric.upper()])
168 | print(evaluation_mean)
169 | print(evaluation_std)
170 |
171 | def erase_folds(self):
172 | """
173 | Method to delete folds after evaluation
174 |
175 | """
176 |
177 | folds = self.dir_folds + 'folds/'
178 | shutil.rmtree(folds)
179 |
180 | def compute(self, verbose=True):
181 | """
182 | Method to run the cross validation
183 |
184 | :param verbose: If True, print header
185 | :type verbose: bool, default True
186 |
187 | """
188 |
189 | if verbose:
190 |
191 | print("[Case Recommender: Cross Validation]\n")
192 | print("Database:: %s \nRecommender Algorithm:: %s | K Folds: %d\n" % (self.input_file,
193 | self.recommender.recommender_name,
194 | self.k_folds))
195 |
196 | self.generate_folds()
197 | self.execute_algorithm()
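# Aggregate the per-fold results into their mean and standard deviation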
198 | self.evaluate(verbose)
199 |
200 | if self.del_folds:
201 | self.erase_folds()
202 |
-------------------------------------------------------------------------------- /caserec/utils/extra_functions.py: --------------------------------------------------------------------------------
1 | # coding=utf-8
2 | """
3 | This file has some auxiliary functions for Case Recommender. Methods:
4 | - check_error_file: check if a file exists
5 | - check_len_lists: check if two lists have the same length
6 | - timed: measure the execution time of a function
7 | - print_header: print header in the algorithms
8 |
9 | """
10 |
11 | # © 2019. Case Recommender (MIT License)
12 |
13 | import sys
14 | import time
15 |
16 | __author__ = 'Arthur Fortes <fortes.arthur@gmail.com>'
17 |
18 |
19 | def check_error_file(file_check):
20 | """
21 | Function to check if a file exists
22 |
23 | :param file_check: File to check
24 | :type file_check: str
25 |
26 | """
27 |
28 | try:
29 | open(file_check).close()  # open and immediately close; raises if the path is invalid
30 | except TypeError:
31 | raise TypeError("File cannot be empty or file is invalid: " + str(file_check))
32 |
33 |
34 | def check_len_lists(list1, list2):
35 | """
36 | Function to check if two lists have the same length
37 |
38 | :param list1: First list
39 | :type list1: list
40 |
41 | :param list2: Second list
42 | :type list2: list
43 |
44 | """
45 |
46 | if len(list1) != len(list2):
47 | print("Error: Number of files in train list and rank list must be equal!")
48 | sys.exit()
49 |
50 |
51 | def timed(f):
52 | """
53 | Function to calculate the execution time of a function
54 |
55 | :param f: Function name without ()
56 | :type f: callable
57 |
58 | :return: Time of execution
59 | :rtype: float
60 |
61 | """
62 | start = time.time()
63 | f()
64 | elapsed = time.time() - start
65 | return elapsed
66 |
67 |
68 | def print_header(header_info, test_info=None):
69 | """
70 | Function to print the header with information of the files
71 |
72 | :param header_info: Dictionary with information about dataset or train file
73 | :type header_info: dict
74 |
75 | :param test_info: Dictionary with information about test file
76 | :type test_info: dict
77 |
78 | """
79 |
80 | print("[Case Recommender: %s]\n" % header_info['title'])
81 | print("train data:: %d users and %d items (%d interactions) | sparsity:: %.2f%%" %
82 | (header_info['n_users'], header_info['n_items'], header_info['n_interactions'], header_info['sparsity']))
83 |
84 | if test_info is not None:
85 | print("test data:: %d users and %d items (%d interactions) | sparsity:: %.2f%%\n" %
86 | (test_info['n_users'], test_info['n_items'], test_info['n_interactions'], test_info['sparsity']))
87 |
88 |
89 | class ComputeBui(object):
90 | """
91 | Compute baselines based on training information considering information about users and items
92 |
93 | """
94 | def __init__(self, training_set):
95 | """
96 |
97 | :param training_set: Dictionary returned by ReadFile with method read()
98 | :type training_set: dict
99 | """
100 | self.training_set = training_set
101 | self.bu = dict()
102 | self.bi = dict()
103 | self.bui = dict()
104 |
105 | def train_baselines(self):
106 | for i in range(10):
107 | self.compute_bi()
108 | self.compute_bu()
109 | self.compute_bui()
110 |
111 | def compute_bi(self):
112 | # bi = (r_ui - mean - bu) / (reg_bi + number of interactions)
113 | self.bi = dict()
114 |
115 | for item in self.training_set['items']:
116 | cont = 0
117 | for user in self.training_set['users_viewed_item'][item]:
118 | self.bi[item] = self.bi.get(item, 0) + float(self.training_set['feedback'][user][item]) - \
119 | self.training_set['mean_value'] - self.bu.get(user, 0)
120 | cont += 1
121 | if cont > 1:
122 | self.bi[item] = float(self.bi[item]) / float(10 + cont)
123 |
124 | def compute_bu(self):
125 | # bu = (r_ui - mean - bi) / (reg_bu + number of interactions)
126 | self.bu = dict()
127 | for user in self.training_set['users']:
128 | cont = 0
129 | for item in self.training_set['items_seen_by_user'][user]:
130 | self.bu[user] = self.bu.get(user, 0) + float(self.training_set['feedback'][user][item]) - \
131 | self.training_set['mean_value'] - self.bi.get(item, 0)
132 | cont += 1
133 | if cont > 1:
134 | self.bu[user] = float(self.bu[user]) / float(15 + cont)
135 |
136 | def compute_bui(self):
137 | # bui = mean + bu + bi
138 | for user in self.training_set['users']:
139 | for item in self.training_set['items']:
140 | try:
141 | self.bui.setdefault(user, {}).update(
142 | {item: self.training_set['mean_value'] + self.bu[user] + self.bi[item]})
143 | except KeyError:
144 | self.bui.setdefault(user, {}).update({item: self.training_set['mean_value']})
145 |
146 | def execute(self):
147 | self.train_baselines()
148 | return self.bui
149 |
-------------------------------------------------------------------------------- /caserec/utils/split_database.py: --------------------------------------------------------------------------------
1 | # coding=utf-8
2 | """
3 | This class is responsible for dividing databases into k folds with two strategies:
4 | k-fold cross-validation or ShuffleSplit
5 |
6 | """
7 |
8 | # © 2019. Case Recommender (MIT License)
9 |
10 |
11 | from sklearn.model_selection import KFold, ShuffleSplit
12 | import os
13 |
14 | from caserec.utils.process_data import ReadFile, WriteFile
15 |
16 | __author__ = 'Arthur Fortes <fortes.arthur@gmail.com>'
17 |
18 |
19 | class SplitDatabase(ReadFile):
20 | def __init__(self, input_file, dir_folds=None, n_splits=10, sep_read='\t', sep_write='\t', header=None,
21 | names=None, as_binary=False, binary_col=None, write_mode='w'):
22 | """
23 | Given a database, this class is responsible for creating training and test sets
24 | for k folds with well-known strategies:
25 |
26 | - k-fold cross-validation
27 | - ShuffleSplit
28 |
29 | Usage:
30 |
31 | >> SplitDatabase(input_file=database, dir_folds=dir_path, n_splits=10).k_fold_cross_validation()
32 | >> SplitDatabase(input_file=database, dir_folds=dir_path, n_splits=10).shuffle_split(test_size=0.3)
33 | # To use only one fold, you should use only shuffle_split. k_fold_cross_validation works only with
34 | # n_splits >= 2:
35 | >> SplitDatabase(input_file=database, dir_folds=dir_path, n_splits=1).shuffle_split(test_size=0.1)
36 |
37 | :param input_file: Input file with at least 2 columns.
38 | :type input_file: str
39 |
40 | :param dir_folds: Directory to write folds (train and test files)
41 | :type dir_folds: str
42 |
43 | :param n_splits: How many folds to divide the database into
44 | :type n_splits: int, default 10
45 |
46 | :param sep_read: Delimiter for input files
47 | :type sep_read: str, default '\t'
48 |
49 | :param sep_write: Delimiter for output files
50 | :type sep_write: str, default '\t'
51 |
52 | :param header: Skip header line (only works with method: read_with_pandas)
53 | :type header: int, default None
54 |
55 | :param names: Name of columns (only works with method: read_with_pandas)
56 | :type names: str, default None
57 |
58 | :param as_binary: If True, the explicit feedback will be transformed to binary
59 | :type as_binary: bool, default False
60 |
61 | :param binary_col: Index of columns to read as binary (only works with method: read_with_pandas)
62 | :type binary_col: int, default None
63 |
64 | :param write_mode: Mode used to write the files ('w' overwrites, 'a' appends)
65 | :type write_mode: str, default 'w'
66 |
67 | """
68 |
69 | super(SplitDatabase, self).__init__(input_file, sep=sep_read, header=header, names=names, as_binary=as_binary,
70 | binary_col=binary_col)
71 |
72 | self.dir_folds = dir_folds
73 | self.n_splits = n_splits
74 | self.sep_write = sep_write
75 | self.write_mode = write_mode
76 | self.df = self.read_with_pandas()
77 |
78 | if self.dir_folds is not None:
79 | self.create_folds()
80 |
81 | def create_folds(self):
82 | self.dir_folds += "folds/"
83 | if not os.path.exists(self.dir_folds):
84 | os.mkdir(self.dir_folds)
85 |
86 | for n in range(self.n_splits):
87 | if not os.path.exists(self.dir_folds + str(n)):
88 | os.mkdir(self.dir_folds + str(n))
89 |
90 | def write_files(self, trained_model):
91 | fold = 0
92 | for train_index, test_index in trained_model:
93 | if self.dir_folds is not None:
94 | train_file = self.dir_folds + str(fold) + '/train.dat'
95 | test_file = self.dir_folds + str(fold) + '/test.dat'
96 |
97 | df_train = self.df.iloc[train_index]
98 | df_test = self.df.iloc[test_index]
99 |
100 | WriteFile(train_file, sep=self.sep_write, mode=self.write_mode
101 | ).write_with_pandas(df_train.sort_values(by=[0, 1]))
102 | WriteFile(test_file, sep=self.sep_write, mode=self.write_mode
103 | ).write_with_pandas(df_test.sort_values(by=[0, 1]))
104 |
105 | fold += 1
106 |
107 | def k_fold_cross_validation(self, shuffle=True, random_state=None):
108 | """
109 | k-fold cross-validation
110 |
111 | In k-fold cross-validation, the original sample is randomly partitioned into
112 | k equal-sized subsamples. Of the k subsamples, a single subsample is retained as
113 | the validation data for testing the model, and the remaining k − 1 subsamples are
114 | used as training data. The cross-validation process is then repeated k times (the folds),
115 | with each of the k subsamples used exactly once as the validation data.
116 |
117 | The k results from the folds can then be averaged (or otherwise combined) to produce a
118 | single estimation.
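For example, with n_splits=10, each fold keeps roughly 90% of the interactions for training and holds out the remaining 10% for testing.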
Reference: https://en.wikipedia.org/wiki/Cross-validation_(statistics)
119 |
120 | :param shuffle: Whether to shuffle the data before splitting it into folds
121 | :type shuffle: bool, default True
122 |
123 | :param random_state: Random seed for reproducibility
124 | :type random_state: int, default None
125 |
126 | :return: None. When dir_folds is set, train and test files are written for each fold.
127 | """
128 |
129 | kfold = KFold(n_splits=self.n_splits, shuffle=shuffle, random_state=random_state)
130 | trained_model = list(kfold.split(self.df))
131 |
132 | if self.dir_folds is not None:
133 | self.write_files(trained_model)
134 |
135 | def shuffle_split(self, test_size=0.1, random_state=None):
136 | """
137 | Shuffle Split
138 |
139 | Random permutation cross-validator
140 |
141 | Yields indices to split data into training and test sets.
142 |
143 | Note: contrary to other cross-validation strategies, random splits do not guarantee that
144 | all folds will be different, although this is still very likely for sizeable databases.
145 |
146 | :param test_size: Proportion of the dataset to include in the test split
147 | :type test_size: float, default 0.1
148 |
149 | :param random_state: Random seed for reproducibility
150 | :type random_state: int, default None
151 |
152 | :return: None. When dir_folds is set, train and test files are written for each split.
153 | """
154 | ss = ShuffleSplit(n_splits=self.n_splits, test_size=test_size, random_state=random_state)
155 | trained_model = list(ss.split(self.df))
156 |
157 | if self.dir_folds is not None:
158 | self.write_files(trained_model)
159 |
-------------------------------------------------------------------------------- /examples/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/caserec/CaseRecommender/779bc5dd91ff704ce60c0f3fafd07e2eba689dd7/examples/__init__.py
-------------------------------------------------------------------------------- /examples/ranking_content_based.py: --------------------------------------------------------------------------------
1 | from caserec.recommenders.item_recommendation.content_based import ContentBased
2 | from caserec.recommenders.item_recommendation.item_attribute_knn import ItemAttributeKNN
3 |
4 | train = '../../datasets/ml-100k/folds/0/train.dat'
5 | test = '../../datasets/ml-100k/folds/0/test.dat'
6 | rank_cb = '../../datasets/ml-100k/folds/0/rank_cb.dat'
7 | rank_attr = '../../datasets/ml-100k/folds/0/rank_attr.dat'
8 | similarity = '../../datasets/ml-100k/folds/0/vsm.dat'
9 | top_n = 10
10 | metrics = ('PREC', 'RECALL', 'NDCG', 'MAP')
11 |
12 | ItemAttributeKNN(train, test, similarity_file=similarity, output_file=rank_attr, rank_length=50).\
13 | compute(metrics=metrics, n_ranks=[10, 20, 50])
14 | ContentBased(train, test, similarity_file=similarity, output_file=rank_cb, rank_length=50).\
15 | compute(metrics=metrics, n_ranks=[10, 20, 50])
16 |
-------------------------------------------------------------------------------- /examples/ranking_knn.py: --------------------------------------------------------------------------------
1 | """
2 | Running KNN Recommenders [Item Recommendation]
3 |
4 | - Cross Validation
5 | - Simple
6 |
7 | """
8 |
9 | from caserec.recommenders.item_recommendation.user_attribute_knn import UserAttributeKNN
10 | from caserec.recommenders.item_recommendation.item_attribute_knn import ItemAttributeKNN
11 | from caserec.recommenders.item_recommendation.itemknn import ItemKNN
12 | from caserec.recommenders.item_recommendation.userknn import UserKNN
13 | from caserec.utils.cross_validation import CrossValidation
14 |
15 | db = '../../datasets/ml-100k/u.data'
16 | folds_path = '../../datasets/ml-100k/'
17 |
18 | metadata_item = '../../datasets/ml-100k/db_item_subject.dat'
19 | sm_item = '../../datasets/ml-100k/sim_item.dat'
20 | metadata_user = '../../datasets/ml-100k/metadata_user.dat'
21 | sm_user = '../../datasets/ml-100k/sim_user.dat'
22 |
23 | tr = '../../datasets/ml-100k/folds/0/train.dat'
24 | te = '../../datasets/ml-100k/folds/0/test.dat'
25 |
26 | """
27 |
28 | UserKNN
29 |
30 | """
31 |
32 | # # Cross Validation
33 | # recommender = UserKNN(as_binary=True)
34 | #
35 | # CrossValidation(input_file=db, recommender=recommender, dir_folds=folds_path, header=1, k_folds=5).compute()
36 | #
37 | # # Simple
38 | # UserKNN(tr, te, as_binary=True).compute()
39 | UserAttributeKNN(tr, te, metadata_file=metadata_user).compute()
40 | # UserAttributeKNN(tr, te, similarity_file=sm_user).compute()
41 |
42 | """
43 |
44 | ItemKNN
45 |
46 | """
47 |
48 | # # Cross Validation
49 | # recommender = ItemKNN(as_binary=True)
50 | #
51 | # CrossValidation(input_file=db, recommender=recommender, dir_folds=folds_path, header=1, k_folds=5).compute()
52 | #
53 | # # Simple
54 | # ItemKNN(tr, te, as_binary=True).compute()
55 | # ItemAttributeKNN(tr, te, metadata_file=metadata_item).compute()
56 | # ItemAttributeKNN(tr, te, similarity_file=sm_item).compute()
57 |
-------------------------------------------------------------------------------- /examples/ranking_mp.py: --------------------------------------------------------------------------------
1 | """
2 | Running Most Popular Recommender [Item Recommendation]
3 |
4 | - Cross Validation
5 | - Simple
6 |
7 | """
8 |
9 | from caserec.recommenders.item_recommendation.most_popular import MostPopular
10 | from caserec.utils.cross_validation import CrossValidation
11 |
12 | db = '../../datasets/ml-100k/u.data'
13 | folds_path = '../../datasets/ml-100k/'
14 |
15 | tr = '../../datasets/ml-100k/folds/0/train.dat'
16 | te = '../../datasets/ml-100k/folds/0/test.dat'
17 |
18 | # Cross Validation
19 | recommender = MostPopular(as_binary=True)
20 |
21 | CrossValidation(input_file=db, recommender=recommender, dir_folds=folds_path, header=1, k_folds=5).compute()
22 |
23 | # Simple
24 | MostPopular(tr, te, as_binary=True).compute()
25 |
-------------------------------------------------------------------------------- /examples/ranking_others.py: --------------------------------------------------------------------------------
1 | """
2 |
3 | Running item recommendation algorithms
4 |
5 | """
6 | from caserec.recommenders.item_recommendation.bprmf import BprMF
7 |
8 | tr = '../../datasets/ml-100k/folds/0/train.dat'
9 | te = '../../datasets/ml-100k/folds/0/test.dat'
10 |
11 |
12 | BprMF(tr, te, batch_size=30).compute()
13 |
-------------------------------------------------------------------------------- /examples/ranking_rating_based_algorithm.py: --------------------------------------------------------------------------------
1 | """
2 | Running Precision and Recall metrics on rating-based algorithms
3 |
4 | """
5 |
6 | from caserec.recommenders.rating_prediction.matrixfactorization import MatrixFactorization
7 | from caserec.recommenders.rating_prediction.nnmf import NNMF
8 | from caserec.utils.process_data import ReadFile
9 | from caserec.evaluation.rating_prediction import RatingPredictionEvaluation
10 |
11 | tr = '../../datasets/ml-100k/folds/0/train.dat'
12 | te = '../../datasets/ml-100k/folds/0/test.dat'
13 |
14 | # File where the model's predictions will be saved
15 | predictions_output_filepath = './predictions_output.dat'
16 |
17 | # Creating the model and computing the train / test sets
18 | # model = MatrixFactorization(tr, te, output_file = predictions_output_filepath)
19 | model = NNMF(tr, te, output_file = 
predictions_output_filepath) 20 | 21 | model.compute(verbose=False) 22 | 23 | # Using ReadFile class to read predictions from file 24 | reader = ReadFile(input_file=predictions_output_filepath) 25 | predictions = reader.read() 26 | 27 | # Creating evaluator with item-recommendation parameters 28 | evaluator = RatingPredictionEvaluation(sep = '\t', n_rank = [10], as_rank = True, metrics = ['PREC']) 29 | 30 | # Getting evaluation 31 | item_rec_metrics = evaluator.evaluate(predictions['feedback'], model.test_set) 32 | 33 | print ('\nItem Recommendation Metrics:\n', item_rec_metrics) 34 | 35 | model.predict() 36 | 37 | print ('\nOriginal Rating Prediction Metrics:\n', model.evaluation_results) -------------------------------------------------------------------------------- /examples/rating_prediction_knn.py: -------------------------------------------------------------------------------- 1 | """ 2 | Running KNN Recommenders [Rating Prediction] 3 | 4 | - Cross Validation 5 | - Simple 6 | 7 | """ 8 | 9 | from caserec.recommenders.rating_prediction.user_attribute_knn import UserAttributeKNN 10 | from caserec.recommenders.rating_prediction.item_attribute_knn import ItemAttributeKNN 11 | from caserec.recommenders.rating_prediction.itemknn import ItemKNN 12 | from caserec.recommenders.rating_prediction.userknn import UserKNN 13 | from caserec.utils.cross_validation import CrossValidation 14 | 15 | db = '../../datasets/ml-100k/u.data' 16 | folds_path = '../../datasets/ml-100k/' 17 | 18 | metadata_item = '../datasets/ml-100k/db_item_subject.dat' 19 | sm_item = '../datasets/ml-100k/sim_item.dat' 20 | metadata_user = '../datasets/ml-100k/metadata_user.dat' 21 | sm_user = '../datasets/ml-100k/sim_user.dat' 22 | 23 | tr = '../datasets/ml-100k/folds/0/train.dat' 24 | te = '../datasets/ml-100k/folds/0/test.dat' 25 | 26 | """ 27 | 28 | UserKNN 29 | 30 | """ 31 | 32 | # # Cross Validation 33 | # recommender = UserKNN() 34 | # 35 | # CrossValidation(input_file=db, recommender=recommender, dir_folds=folds_path, header=1, k_folds=5).compute() 36 | # 37 | # # # Simple 38 | # UserKNN(tr, te).compute() 39 | # # UserAttributeKNN(tr, te, metadata_file=metadata_user).compute() 40 | # # UserAttributeKNN(tr, te, similarity_file=sm_user).compute() 41 | 42 | """ 43 | 44 | ItemKNN 45 | 46 | """ 47 | 48 | # # Cross Validation 49 | recommender = ItemKNN() 50 | 51 | CrossValidation(input_file=db, recommender=recommender, dir_folds=folds_path, header=1, k_folds=5).compute() 52 | # 53 | # # Simple 54 | # ItemKNN(tr, te).compute() 55 | # ItemAttributeKNN(tr, te, metadata_file=metadata_item).compute() 56 | # ItemAttributeKNN(tr, te, similarity_file=sm_item).compute() 57 | -------------------------------------------------------------------------------- /examples/rating_prediction_mf.py: -------------------------------------------------------------------------------- 1 | """ 2 | Running MF / SVD Recommenders [Rating Prediction] 3 | 4 | - Cross Validation 5 | - Simple 6 | 7 | """ 8 | 9 | from caserec.recommenders.rating_prediction.svdplusplus import SVDPlusPlus 10 | from caserec.recommenders.rating_prediction.nnmf import NNMF 11 | from caserec.recommenders.rating_prediction.matrixfactorization import MatrixFactorization 12 | from caserec.utils.cross_validation import CrossValidation 13 | 14 | db = '../../datasets/ml-100k/u.data' 15 | folds_path = '../../datasets/ml-100k/' 16 | 17 | metadata_item = '../../datasets/ml-100k/db_item_subject.dat' 18 | sm_item = '../../datasets/ml-100k/sim_item.dat' 19 | metadata_user = 
'../../datasets/ml-100k/metadata_user.dat'
20 | sm_user = '../../datasets/ml-100k/sim_user.dat'
21 |
22 | tr = '../../datasets/ml-100k/folds/0/train.dat'
23 | te = '../../datasets/ml-100k/folds/0/test.dat'
24 |
25 | """
26 |
27 | MatrixFactorization / SVD++ / NNMF
28 |
29 | """
30 |
31 | # Cross Validation
32 | # recommender = MatrixFactorization()
33 |
34 | # CrossValidation(input_file=db, recommender=recommender, dir_folds=folds_path, header=1, k_folds=5).compute()
35 |
36 | # # Simple
37 | # MatrixFactorization(tr, te).compute()
38 | # SVDPlusPlus(tr, te).compute()
39 |
40 | NNMF(tr, te, factors = 20).compute()
-------------------------------------------------------------------------------- /requirements.txt: --------------------------------------------------------------------------------
1 | # requirements
2 | pandas
3 | scikit-learn
4 | scipy
5 | numpy
6 |
-------------------------------------------------------------------------------- /setup.cfg: --------------------------------------------------------------------------------
1 | [metadata]
2 | description-file = README.md
3 |
4 | [bdist_wheel]
5 | universal=1
6 |
-------------------------------------------------------------------------------- /setup.py: --------------------------------------------------------------------------------
1 | """
2 | Setup for Case Recommender
3 |
4 | """
5 |
6 | # © 2019. Case Recommender (MIT License)
7 |
8 | from distutils.core import setup
9 | from setuptools import find_packages
10 | from os import path
11 |
12 | here = path.abspath(path.dirname(__file__))
13 |
14 | __author__ = 'Arthur Fortes <fortes.arthur@gmail.com>'
15 |
16 | # Get the long description from the README file
17 | with open(path.join(here, 'README.rst'), encoding='utf-8') as f:
18 | long_description = f.read()
19 |
20 | # Get requirements
21 | REQUIRED_PACKAGES = [
22 | 'numpy',
23 | 'scipy',
24 | 'scikit-learn',
25 | 'pandas'
26 | ]
27 |
28 | setup(
29 | name='CaseRecommender',
30 | packages=find_packages(),
31 | version='1.1.1',
32 | license='MIT License',
33 | description='A recommender systems framework for Python',
34 | long_description=long_description,
35 | install_requires=REQUIRED_PACKAGES,
36 |
37 | author='Arthur Fortes',
38 | author_email='fortes.arthur@gmail.com',
39 |
40 | url='https://github.com/caserec/CaseRecommender',
41 | download_url='https://github.com/caserec/CaseRecommender/archive/master.zip',
42 |
43 | keywords=['recommender systems', 'framework', 'collaborative filtering', 'content-based filtering',
44 | 'recommendation'],
45 |
46 | classifiers=[
47 | # Indicate who your project is intended for
48 | 'Intended Audience :: Developers',
49 | 'Topic :: Software Development :: Build Tools',
50 |
51 | 'License :: OSI Approved :: MIT License',
52 |
53 | 'Programming Language :: Python :: 3',
54 | 'Programming Language :: Python :: 3.4',
55 | 'Programming Language :: Python :: 3.5',
56 | 'Programming Language :: Python :: 3.6',
57 | ],
58 | )
59 |
--------------------------------------------------------------------------------
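Editorial closing note: after installation (pip install CaseRecommender, matching the name declared in setup() above), a minimal smoke test is to import representative recommenders from both families; the import paths below are taken verbatim from the example scripts in this repository.

    # Editorial sketch, not part of the repository: check that representative
    # recommenders from both families import cleanly after installation.
    from caserec.recommenders.rating_prediction.svdplusplus import SVDPlusPlus
    from caserec.recommenders.item_recommendation.most_popular import MostPopular

    print(SVDPlusPlus.__name__, MostPopular.__name__)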