├── .gitattributes ├── .gitignore ├── COPYING ├── README.md ├── README.rst ├── _config.yml ├── caserec ├── __init__.py ├── clustering │ ├── __init__.py │ ├── kmedoids.py │ └── paco.py ├── evaluation │ ├── __init__.py │ ├── base_evaluation.py │ ├── item_recomendation_functions.py │ ├── item_recommendation.py │ ├── rating_prediction.py │ └── statistical_analysis.py ├── recommenders │ ├── __init__.py │ ├── item_recommendation │ │ ├── __init__.py │ │ ├── base_item_recommendation.py │ │ ├── bprmf.py │ │ ├── content_based.py │ │ ├── ensemble_average.py │ │ ├── ensemble_bpr.py │ │ ├── group_based_recommender.py │ │ ├── item_attribute_knn.py │ │ ├── itemknn.py │ │ ├── most_popular.py │ │ ├── paco_recommender.py │ │ ├── random_rec.py │ │ ├── user_attribute_knn.py │ │ └── userknn.py │ └── rating_prediction │ │ ├── __init__.py │ │ ├── __pycache__ │ │ ├── __init__.cpython-37.pyc │ │ ├── base_rating_prediction.cpython-37.pyc │ │ └── nnmf.cpython-37.pyc │ │ ├── base_knn.py │ │ ├── base_nsvd1.py │ │ ├── base_rating_prediction.py │ │ ├── corec.py │ │ ├── gsvdplusplus.py │ │ ├── item_attribute_knn.py │ │ ├── item_msmf.py │ │ ├── item_nsvd1.py │ │ ├── itemknn.py │ │ ├── matrixfactorization.py │ │ ├── most_popular.py │ │ ├── nnmf.py │ │ ├── random_rec.py │ │ ├── svd.py │ │ ├── svdplusplus.py │ │ ├── user_attribute_knn.py │ │ ├── user_nsvd1.py │ │ └── userknn.py └── utils │ ├── __init__.py │ ├── cross_validation.py │ ├── extra_functions.py │ ├── process_data.py │ └── split_database.py ├── examples ├── __init__.py ├── ranking_content_based.py ├── ranking_knn.py ├── ranking_mp.py ├── ranking_others.py ├── ranking_rating_based_algorithm.py ├── rating_prediction_knn.py └── rating_prediction_mf.py ├── requirements.txt ├── setup.cfg └── setup.py /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | 4 | # Custom for Visual Studio 5 | *.cs diff=csharp 6 | 7 | # Standard to msysgit 8 | *.doc diff=astextplain 9 | *.DOC diff=astextplain 10 | *.docx diff=astextplain 11 | *.DOCX diff=astextplain 12 | *.dot diff=astextplain 13 | *.DOT diff=astextplain 14 | *.pdf diff=astextplain 15 | *.PDF diff=astextplain 16 | *.rtf diff=astextplain 17 | *.RTF diff=astextplain 18 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Temporary and binary files 2 | *~ 3 | *.py[cod] 4 | *.so 5 | *.cfg 6 | !.isort.cfg 7 | !setup.cfg 8 | *.orig 9 | *.log 10 | *.pot 11 | __pycache__/* 12 | .cache/* 13 | .*.swp 14 | */.ipynb_checkpoints/* 15 | *.csv 16 | logs/ 17 | data/ 18 | *.zip 19 | 20 | # Project files 21 | .ropeproject 22 | .project 23 | .pydevproject 24 | .settings 25 | .idea 26 | tags 27 | 28 | # Package files 29 | *.egg 30 | *.eggs/ 31 | .installed.cfg 32 | *.egg-info 33 | 34 | # Unittest and coverage 35 | htmlcov/* 36 | .coverage 37 | .tox 38 | junit.xml 39 | coverage.xml 40 | .pytest_cache/ 41 | 42 | # Build and docs folder/files 43 | build/* 44 | dist/* 45 | sdist/* 46 | docs/api/* 47 | docs/_rst/* 48 | docs/_build/* 49 | cover/* 50 | MANIFEST 51 | 52 | # Per-project virtualenvs 53 | .venv*/ 54 | 55 | # Ignoring jupyter notebooks 56 | *.ipynb 57 | 58 | # Ignoring models 59 | *.hdf5 60 | 61 | # Ignoring audio files 62 | *.wav 63 | *.mp3 64 | 65 | # Ignoring pickle files 66 | *.pk 67 | 68 | # Ignoring bin and arpa files 69 | *.bin 70 | *.arpa 71 | 
-------------------------------------------------------------------------------- /COPYING: -------------------------------------------------------------------------------- 1 | © 2019. Case Recommender All Rights Reserved 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Case Recommender - A Python Framework for RecSys 2 | 3 | [![PyPI version](https://badge.fury.io/py/CaseRecommender.svg)](https://badge.fury.io/py/CaseRecommender) 4 | [![Python 3.6](https://img.shields.io/badge/python-3.6-blue.svg)](https://www.python.org/downloads/release/python-360/) 5 | [![GitHub license](https://img.shields.io/github/license/caserec/CaseRecommender.svg)](https://github.com/caserec/CaseRecommender/blob/master/COPYING) 6 | 7 | Case Recommender is a Python implementation of a number of popular recommendation algorithms for both implicit and explicit feedback. The framework aims to provide a rich set of components from which you can construct a customized recommender system from a set of algorithms. Case Recommender has different types of item recommendation and rating prediction approaches, as well as different validation and evaluation metrics. 
8 | 9 | # Algorithms 10 | 11 | Item Recommendation: 12 | 13 | - BPRMF 14 | 15 | - ItemKNN 16 | 17 | - Item Attribute KNN 18 | 19 | - UserKNN 20 | 21 | - User Attribute KNN 22 | 23 | - Group-based (Clustering-based algorithm) 24 | 25 | - Paco Recommender (Co-Clustering-based algorithm) 26 | 27 | - Most Popular 28 | 29 | - Random 30 | 31 | - Content Based 32 | 33 | Rating Prediction: 34 | 35 | - Matrix Factorization (with and without baseline) 36 | 37 | - Non-negative Matrix Factorization 38 | 39 | - SVD 40 | 41 | - SVD++ 42 | 43 | - ItemKNN 44 | 45 | - Item Attribute KNN 46 | 47 | - UserKNN 48 | 49 | - User Attribute KNN 50 | 51 | - Item NSVD1 (with and without Batch) 52 | 53 | - User NSVD1 (with and without Batch) 54 | 55 | - Most Popular 56 | 57 | - Random 58 | 59 | - gSVD++ 60 | 61 | - Item-MSMF 62 | 63 | - (E) CoRec 64 | 65 | Clustering: 66 | 67 | - PaCo: EntroPy Anomalies in Co-Clustering 68 | 69 | - k-medoids 70 | 71 | # Evaluation and Validation Metrics 72 | 73 | - All-but-one Protocol 74 | 75 | - k-fold Cross-Validation 76 | 77 | - Item Recommendation: Precision, Recall, NDCG and MAP 78 | 79 | - Rating Prediction: MAE and RMSE 80 | 81 | - Statistical Analysis (T-test and Wilcoxon) 82 | 83 | # Requirements 84 | 85 | - Python >= 3 86 | - scipy 87 | - numpy 88 | - pandas 89 | - scikit-learn 90 | 91 | For Linux and macOS use: 92 | 93 | $ pip install -r requirements.txt 94 | 95 | For Windows, prebuilt packages are available at: 96 | 97 | http://www.lfd.uci.edu/~gohlke/pythonlibs/ 98 | 99 | # Installation 100 | 101 | Case Recommender can be installed using pip: 102 | 103 | $ pip install caserecommender 104 | 105 | If you want to run the latest version of the code, you can install from git: 106 | 107 | $ pip install -U git+git://github.com/caserec/CaseRecommender.git 108 | 109 | # Quick Start and Guide 110 | 111 | For more information about Case Recommender and its documentation, visit the Case Recommender [Wiki](https://github.com/caserec/CaseRecommender/wiki). If you have not used Case Recommender before, do check out the Getting Started guide. 112 | 113 | # Usage 114 | 115 | Divide Database (k-fold Cross-Validation) 116 | 117 | >> from caserec.utils.split_database import SplitDatabase 118 | >> SplitDatabase(input_file=dataset, dir_folds=dir_path, n_splits=10).k_fold_cross_validation() 119 | 120 | Run Item Recommendation Algorithm (E.g: ItemKNN) 121 | 122 | >> from caserec.recommenders.item_recommendation.itemknn import ItemKNN 123 | >> ItemKNN(train_file, test_file).compute() 124 | 125 | Run Rating Prediction Algorithm (E.g: ItemKNN) 126 | 127 | >> from caserec.recommenders.rating_prediction.itemknn import ItemKNN 128 | >> ItemKNN(train_file, test_file).compute() 129 | 130 | Evaluate Ranking (Prec@N, Recall@N, NDCG@N, MAP@N and overall MAP) 131 | 132 | >> from caserec.evaluation.item_recommendation import ItemRecommendationEvaluation 133 | >> ItemRecommendationEvaluation().evaluate_with_files(predictions_file, test_file) 134 | 135 | Evaluate Rating Predictions (MAE and RMSE) 136 | 137 | >> from caserec.evaluation.rating_prediction import RatingPredictionEvaluation 138 | >> RatingPredictionEvaluation().evaluate_with_files(predictions_file, test_file) 139 | 140 | # Input 141 | 142 | The input files have to be placed in the corresponding subdirectory and must be in CSV format with at least 143 | 3 columns (user, item, feedback). Example: user_1,item_1,feedback 144 | 145 | # Cite us 146 | 147 | If you use Case Recommender in a scientific publication, we would appreciate citations of our paper where this framework was first mentioned and used. 
148 | 149 | To cite Case Recommender use: Arthur da Costa, Eduardo Fressato, Fernando Neto, Marcelo Manzato, and Ricardo Campello. 2018. Case Recommender: A Flexible and Extensible Python Framework for Recommender Systems. In Proceedings of the 12th ACM Conference on Recommender Systems (RecSys '18). ACM, New York, NY, USA, 494-495. DOI: https://doi.org/10.1145/3240323.3241611. 150 | 151 | For TeX/LaTeX (BibTeX): 152 | 153 | @inproceedings{daCosta:2018:CRF:3240323.3241611, 154 | author = {da Costa, Arthur and Fressato, Eduardo and Neto, Fernando and Manzato, Marcelo and Campello, Ricardo}, 155 | title = {Case Recommender: A Flexible and Extensible Python Framework for Recommender Systems}, 156 | booktitle = {Proceedings of the 12th ACM Conference on Recommender Systems}, 157 | series = {RecSys '18}, 158 | year = {2018}, 159 | isbn = {978-1-4503-5901-6}, 160 | location = {Vancouver, British Columbia, Canada}, 161 | pages = {494--495}, 162 | numpages = {2}, 163 | url = {http://doi.acm.org/10.1145/3240323.3241611}, 164 | doi = {10.1145/3240323.3241611}, 165 | acmid = {3241611}, 166 | publisher = {ACM}, 167 | address = {New York, NY, USA}, 168 | keywords = {framework, python, recommender systems}, 169 | } 170 | 171 | # Help CaseRecommender 172 | 173 | To contribute to the project, follow these steps: 174 | 175 | - Fork CaseRecommender 176 | 177 | - Create a topic branch - git checkout -b my_branch 178 | 179 | - Make your alterations and commit 180 | 181 | - Push to your branch - git push origin my_branch 182 | 183 | - Create a Pull Request from your branch. 184 | 185 | - You just contributed to the CaseRecommender project! 186 | 187 | For bugs or feedback use this link: https://github.com/caserec/CaseRecommender/issues 188 | 189 | # License (MIT) 190 | 191 | © 2019. Case Recommender All Rights Reserved 192 | 193 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated 194 | documentation files (the "Software"), to deal in the Software without restriction, including without limitation the 195 | rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to 196 | permit persons to whom the Software is furnished to do so, subject to the following conditions: 197 | 198 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of 199 | the Software. 200 | 201 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO 202 | THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 203 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 204 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 205 | IN THE SOFTWARE. 206 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | Case Recommender - A Python Framework for RecSys 2 | =================================================== 3 | 4 | Case Recommender is a Python implementation of a number of popular recommendation algorithms for both implicit and 5 | explicit feedback. The framework aims to provide a rich set of components from which you can construct a customized 6 | recommender system from a set of algorithms. 
Case Recommender has different types of item recommendation and rating 7 | prediction approaches, as well as different validation and evaluation metrics. 8 | 9 | Algorithms 10 | ^^^^^^^^^^^^ 11 | 12 | Item Recommendation: 13 | 14 | - BPRMF 15 | 16 | - ItemKNN 17 | 18 | - Item Attribute KNN 19 | 20 | - UserKNN 21 | 22 | - User Attribute KNN 23 | 24 | - Group-based (Clustering-based algorithm) 25 | 26 | - Paco Recommender (Co-Clustering-based algorithm) 27 | 28 | - Most Popular 29 | 30 | - Random 31 | 32 | - Content Based 33 | 34 | Rating Prediction: 35 | 36 | - Matrix Factorization (with and without baseline) 37 | 38 | - SVD 39 | 40 | - Non-negative Matrix Factorization 41 | 42 | - SVD++ 43 | 44 | - ItemKNN 45 | 46 | - Item Attribute KNN 47 | 48 | - UserKNN 49 | 50 | - User Attribute KNN 51 | 52 | - Item NSVD1 (with and without Batch) 53 | 54 | - User NSVD1 (with and without Batch) 55 | 56 | - Most Popular 57 | 58 | - Random 59 | 60 | - gSVD++ 61 | 62 | - Item-MSMF 63 | 64 | - (E)CoRec 65 | 66 | Clustering: 67 | 68 | - PaCo: EntroPy Anomalies in Co-Clustering 69 | 70 | - k-medoids 71 | 72 | Evaluation and Validation Metrics 73 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 74 | 75 | - All-but-one Protocol 76 | 77 | - k-fold Cross-Validation 78 | 79 | - Item Recommendation: Precision, Recall, NDCG and MAP 80 | 81 | - Rating Prediction: MAE and RMSE 82 | 83 | - Statistical Analysis (T-test and Wilcoxon) 84 | 85 | Requirements 86 | ^^^^^^^^^^^^^ 87 | 88 | - Python >= 3 89 | - scipy 90 | - numpy 91 | - pandas 92 | - scikit-learn 93 | 94 | For Linux, Windows and macOS use: 95 | 96 | $ pip install -r requirements.txt 97 | 98 | For help with Windows libraries, see: 99 | 100 | http://www.lfd.uci.edu/~gohlke/pythonlibs/ 101 | 102 | Quick Start and Guide 103 | ^^^^^^^^^^^^^^^^^^^^^^ 104 | 105 | For more information about Case Recommender and its documentation, 106 | visit the Case Recommender 107 | `Wiki <https://github.com/caserec/CaseRecommender/wiki>`_. If you have not used Case Recommender before, do check out the Getting Started guide. 108 | 109 | 110 | Installation 111 | ^^^^^^^^^^^^^ 112 | 113 | Case Recommender can be installed using pip: 114 | 115 | $ pip install caserecommender 116 | 117 | If you want to run the latest version of the code, you can install from git: 118 | 119 | $ pip install -U git+git://github.com/caserec/CaseRecommender.git 120 | 121 | More Details 122 | ^^^^^^^^^^^^^ 123 | 124 | `https://github.com/caserec/CaseRecommender <https://github.com/caserec/CaseRecommender>`_ 125 | 126 | 127 | License (MIT) 128 | ^^^^^^^^^^^^^^ 129 | 130 | © 2019. Case Recommender All Rights Reserved 131 | 132 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated 133 | documentation files (the "Software"), to deal in the Software without restriction, including without limitation the 134 | rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to 135 | permit persons to whom the Software is furnished to do so, subject to the following conditions: 136 | 137 | The above copyright notice and this permission notice shall be included in all copies or substantial portions 138 | of the Software. 139 | 140 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED 141 | TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL 142 | THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 143 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 144 | DEALINGS IN THE SOFTWARE. 145 | -------------------------------------------------------------------------------- /_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-cayman -------------------------------------------------------------------------------- /caserec/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/caserec/CaseRecommender/779bc5dd91ff704ce60c0f3fafd07e2eba689dd7/caserec/__init__.py -------------------------------------------------------------------------------- /caserec/clustering/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Arthur' 2 | -------------------------------------------------------------------------------- /caserec/clustering/kmedoids.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """" 3 | K-medoids Clustering Algorithm 4 | [Clustering Algorithm] 5 | 6 | Literature: 7 | H.S. Park, C.H. Jun: 8 | A simple and fast algorithm for K-medoids clustering 9 | Expert Systems with Applications, 36, (2) (2009), 3336–3341. 10 | 11 | """ 12 | 13 | # © 2019. Case Recommender (MIT License) 14 | 15 | import numpy as np 16 | 17 | __author__ = 'Arthur Fortes ' 18 | 19 | 20 | def kmedoids(distance_matrix, k, max_interactions=10000, random_seed=None): 21 | """ 22 | k-medoids 23 | 24 | Usage:: 25 | 26 | >> sm, c = kmedoids(distance_matrix, k=3) 27 | 28 | The k-medoids algorithm is a clustering algorithm related to the k-means algorithm and the medoidshift algorithm. 29 | Both the k-means and k-medoids algorithms are partitional (breaking the dataset up into groups) and both attempt to 30 | minimize the distance between points labeled to be in a cluster and a point designated as the center of that 31 | cluster. In contrast to the k-means algorithm, k-medoids chooses datapoints as centers (medoids or exemplars) 32 | and works with a generalization of the Manhattan norm to define distance between datapoints instead of the 33 | squared Euclidean distance. This method was proposed in 1987 [1] for work with the L1 norm and other distances. 34 | 35 | k-medoids is a classical partitioning technique of clustering that clusters the data set of n objects into k 36 | clusters, with k known a priori. A useful tool for determining k is the silhouette. It is more robust to noise and outliers 37 | as compared to k-means because it minimizes a sum of pairwise dissimilarities instead of a sum of squared 38 | Euclidean distances. 39 | 40 | A medoid can be defined as the object of a cluster whose average dissimilarity to all the objects in the cluster 41 | is minimal, i.e., it is the most centrally located point in the cluster. 
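    A fuller usage sketch (the data and distance metric below are illustrative, not part of the API)::

        >> import numpy as np
        >> from scipy.spatial.distance import pdist, squareform
        >> from caserec.clustering.kmedoids import kmedoids
        >> points = np.random.rand(20, 5)                              # 20 instances, 5 features
        >> distance_matrix = squareform(pdist(points, 'euclidean'))    # symmetric pairwise distances
        >> medoids, clusters = kmedoids(distance_matrix, k=3, random_seed=42)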
42 | 43 | :param distance_matrix: Matrix with distances between the instances 44 | :type distance_matrix: matrix 45 | 46 | :param k: Number of groups to be generated 47 | :type k: int 48 | 49 | :param max_interactions: Maximum number of iterations to converge 50 | :type max_interactions: int, default 10000 51 | 52 | :param random_seed: Random seed for reproducibility 53 | :type random_seed: int, default None 54 | 55 | :return: Medoid indices and a dict mapping each cluster label to the indices of its instances 56 | 57 | """ 58 | 59 | # Set seed in random 60 | if random_seed is not None: 61 | np.random.seed(random_seed) 62 | 63 | # determine dimensions of distance matrix 64 | row, col = distance_matrix.shape 65 | 66 | if k > col: 67 | raise Exception("Error:: Too many medoids") 68 | 69 | # randomly initialize an array of k-medoid indices 70 | support_matrix = np.arange(col) 71 | np.random.shuffle(support_matrix) 72 | support_matrix = np.sort(support_matrix[:k]) 73 | 74 | # create a copy of the array of medoid indices 75 | new_support_matrix = np.copy(support_matrix) 76 | 77 | # initialize a dictionary to represent clusters 78 | clusters = {} 79 | 80 | for _ in range(max_interactions): 81 | # determine clusters, i.e. arrays of data indices 82 | j_vector = np.argmin(distance_matrix[:, support_matrix], axis=1) 83 | for label in range(k): 84 | clusters[label] = np.where(j_vector == label)[0] 85 | 86 | # update cluster medoids 87 | for label in range(k): 88 | j_vector = np.mean(distance_matrix[np.ix_( 89 | clusters[label], clusters[label])], axis=1) 90 | try: 91 | j = np.argmin(j_vector) 92 | new_support_matrix[label] = clusters[label][j] 93 | except ValueError: 94 | pass 95 | new_support_matrix.sort() # sort in place (a bare np.sort(...) call would discard its result) 96 | 97 | # check for convergence 98 | if np.array_equal(support_matrix, new_support_matrix): 99 | break 100 | support_matrix = np.copy(new_support_matrix) 101 | 102 | else: 103 | # for-else: only runs if the loop exhausts max_interactions without converging (no break); 104 | # final update of cluster memberships 104 | j_vector = np.argmin(distance_matrix[:, support_matrix], axis=1) 105 | for label in range(k): 106 | clusters[label] = np.where(j_vector == label)[0] 107 | 108 | remove_keys = set() 109 | for key in clusters: 110 | if len(clusters[key]) == 0: 111 | remove_keys.add(key) 112 | 113 | if remove_keys: 114 | for key in remove_keys: 115 | clusters.pop(key, None) 116 | 117 | # return results 118 | return support_matrix, clusters 119 | -------------------------------------------------------------------------------- /caserec/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Arthur' 2 | -------------------------------------------------------------------------------- /caserec/evaluation/base_evaluation.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """" 3 | This class is the base for evaluation strategies 4 | 5 | Types of evaluation: 6 | - Simple: Evaluation with traditional strategy 7 | - All-but-one Protocol: Considers only one pair (u, i) from the test set to evaluate the ranking 8 | 9 | """ 10 | 11 | # © 2019. 
Case Recommender (MIT License) 12 | 13 | from collections import defaultdict 14 | 15 | from caserec.utils.process_data import ReadFile 16 | 17 | __author__ = 'Arthur Fortes ' 18 | 19 | 20 | class BaseEvaluation(object): 21 | def __init__(self, sep='\t', metrics=None, all_but_one_eval=False, verbose=True, as_table=False, table_sep='\t', save_eval_file=None): 22 | """ 23 | Base class for evaluation strategies 24 | 25 | :param sep: Delimiter for input files 26 | :type sep: str, default '\t' 27 | 28 | :param metrics: List of evaluation metrics 29 | :type metrics: list, default None 30 | 31 | :param all_but_one_eval: If True, considers only one pair (u, i) from the test set to evaluate the ranking 32 | :type all_but_one_eval: bool, default False 33 | 34 | :param verbose: Print the evaluation results 35 | :type verbose: bool, default True 36 | 37 | :param as_table: Print the evaluation results as table (only work with verbose=True) 38 | :type as_table: bool, default False 39 | 40 | :param table_sep: Delimiter for print results (only work with verbose=True and as_table=True) 41 | :type table_sep: str, default '\t' 42 | 
        :param save_eval_file: Optional file in which evaluation results can be saved (not used yet)
        :type save_eval_file: str, default None

43 | """ 44 | self.sep = sep 45 | self.all_but_one_eval = all_but_one_eval 46 | self.metrics = metrics 47 | self.verbose = verbose 48 | self.as_table = as_table 49 | self.table_sep = table_sep 
        self.save_eval_file = save_eval_file
50 | 51 | def evaluate(self, predictions, test_set): 52 | """ 53 | Method to be implemented for each strategy using their respective metrics. 54 | Use read() in ReadFile to transform your file in a dict 55 | 56 | :param predictions: Dictionary with ranking information 57 | :type predictions: dict 58 | 59 | :param test_set: Dictionary with test set information. 60 | :type test_set: dict 61 | 62 | """ 63 | raise NotImplementedError 64 | 65 | def evaluate_with_files(self, prediction_file, test_file): 66 | """ 67 | Method to evaluate predictions using files 68 | 69 | :param prediction_file: Predictions file with at least 2 columns for item recommendation 70 | (eg. user item [score (optional)]) and 3 columns for rating prediction (eg. user item rating) 71 | :type prediction_file: str 72 | 73 | :param test_file: Test file 74 | :type test_file: str 75 | 76 | :return: Dictionary with all evaluation metrics and results 77 | :rtype: dict 78 | 79 | """ 80 | 81 | predict = ReadFile(prediction_file, sep=self.sep).read() 82 | test_set = ReadFile(test_file, sep=self.sep).read() 83 | 84 | return self.evaluate(predict['feedback'], test_set) 85 | 86 | def evaluate_recommender(self, predictions, test_set): 87 | """ 88 | Method to evaluate recommender results. This method should be called by item recommender algorithms 89 | 90 | :param predictions: List with recommender output. e.g. [[user, item, score], [user, item2, score] ...] 91 | :type predictions: list 92 | 93 | :param test_set: Dictionary with test set information. 94 | :type test_set: dict 95 | 96 | :return: Dictionary with all evaluation metrics and results 97 | :rtype: dict 98 | 99 | """ 100 | 101 | predictions_dict = {} 102 | 103 | for sample in predictions: 104 | predictions_dict.setdefault(sample[0], {}).update({sample[1]: sample[2]}) 105 | 106 | return self.evaluate(predictions_dict, test_set) 107 | 108 | def evaluate_folds(self, folds_dir, predictions_file_name, test_file_name, k_folds=10): 109 | """ 110 | Evaluate ranking in a set of folds. Fold directories must be named with integers starting at 0, e.g. 111 | a directory '/home/user/folds' containing folds 0, 1, ..., 9. 
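        Example layout for k_folds=10 (file names here are illustrative)::

            /home/user/folds/0/rank.dat
            /home/user/folds/0/test.dat
            ...
            /home/user/folds/9/rank.dat
            /home/user/folds/9/test.dat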
112 | 113 | :param folds_dir: Directory of folds 114 | :type folds_dir: str 115 | 116 | :param k_folds: Number of folds 117 | :type k_folds: int, default 10 118 | 119 | :param predictions_file_name: Name of the ranking file 120 | :type predictions_file_name: str 121 | 122 | :param test_file_name: Name of the test file 123 | :type test_file_name: str 124 | 125 | :return: Dictionary with all evaluation metrics and results 126 | :rtype: dict 127 | 128 | """ 129 | 130 | folds_results = defaultdict() 131 | 132 | for fold in range(k_folds): 133 | predictions_file = folds_dir + str(fold) + '/' + predictions_file_name 134 | test_file = folds_dir + str(fold) + '/' + test_file_name 135 | 136 | for key, value in self.evaluate_with_files(predictions_file, test_file).items(): 137 | folds_results[key] = folds_results.get(key, 0) + value 138 | 139 | folds_results = {k: round(v / k_folds, 6) for k, v in folds_results.items()} 140 | 141 | if self.verbose: 142 | self.print_results(folds_results) 143 | 144 | return folds_results 145 | 146 | def print_results(self, evaluation_results, save_eval_file=None): 147 | """ 148 | Method to print the results 149 | 150 | :param evaluation_results: Dictionary with results. e.g. {metric: value} 151 | :type evaluation_results: dict 152 | 153 | """ 154 | 155 | if self.as_table: 156 | header = '' 157 | values = '' 158 | for metric in self.metrics: 159 | header += metric.upper() + self.table_sep 160 | values += str(evaluation_results[metric.upper()]) + self.table_sep 161 | print(header) 162 | print(values) 163 | 164 | else: 165 | evaluation = 'Eval:: ' 166 | for metrics in self.metrics: 167 | evaluation += metrics.upper() + ': ' + str(evaluation_results[metrics.upper()]) + ' ' 168 | print(evaluation) 169 | -------------------------------------------------------------------------------- /caserec/evaluation/item_recomendation_functions.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """" 3 | These functions are responsible for evaluating item recommendation algorithms (rankings). 4 | 5 | They are used by evaluation/item_recommendation.py 6 | 7 | """ 8 | 9 | # © 2019. Case Recommender (MIT License) 10 | 11 | import numpy as np 12 | 13 | __author__ = 'Arthur Fortes ' 14 | 15 | 16 | def precision_at_k(ranking, k): 17 | """ 18 | Score is precision @ k 19 | Relevance is binary (nonzero is relevant). 20 | 21 | :param ranking: Relevance scores (list or numpy) in rank order (first element is the first item) 22 | :type ranking: list, np.array 23 | 24 | :param k: Number of top positions to consider 25 | :type k: int 26 | 27 | :return: Precision @ k 28 | :rtype: float 29 | 30 | """ 31 | 32 | assert k >= 1 33 | ranking = np.asarray(ranking)[:k] != 0 34 | if ranking.size != k: 35 | raise ValueError('Relevance score length < k') 36 | return np.mean(ranking) 37 | 38 | 39 | def average_precision(ranking): 40 | """ 41 | Score is average precision (area under PR curve). Relevance is binary (nonzero is relevant). 42 | 43 | :param ranking: Relevance scores (list or numpy) in rank order (first element is the first item) 44 | :type ranking: list, np.array 45 | 46 | :return: Average precision 47 | :rtype: float 48 | 49 | """ 50 | 51 | ranking = np.asarray(ranking) != 0 52 | out = [precision_at_k(ranking, k + 1) for k in range(ranking.size) if ranking[k]] 53 | if not out: 54 | return 0. 55 | return np.mean(out) 56 | 57 | 58 | def mean_average_precision(ranking): 59 | """ 60 | Score is mean average precision. Relevance is binary (nonzero is relevant). 
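    A small worked example, following precision_at_k() and average_precision() above
    (one relevance vector per user)::

        >> mean_average_precision([[1, 0, 1], [0, 1, 0]])
        # user 1: AP = (1/1 + 2/3) / 2 = 5/6; user 2: AP = 1/2
        # MAP = (5/6 + 1/2) / 2 = 2/3 ≈ 0.6667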
61 | 62 | :param ranking: List of relevance-score vectors (one per user), each in rank order (first element is the top item) 63 | :type ranking: list, np.array 64 | 65 | :return: Mean average precision 66 | :rtype: float 67 | """ 68 | 69 | return np.mean([average_precision(r) for r in ranking]) 70 | 71 | 72 | def ndcg_at_k(ranking): 73 | """ 74 | Score is normalized discounted cumulative gain (ndcg). Relevance is positive real values. Binary relevance 75 | can also be used, as in the previous methods. 76 | 77 | :param ranking: Ranking to evaluate in DCG format, e.g. [0, 0, 1], where 1 marks a relevant item 78 | :type ranking: list 79 | 80 | :return: Normalized discounted cumulative gain 81 | :rtype: float 82 | 83 | """ 84 | 85 | ranking = np.asfarray(ranking) 86 | r_ideal = np.asfarray(sorted(ranking, reverse=True)) 87 | dcg_ideal = r_ideal[0] + np.sum(r_ideal[1:] / np.log2(np.arange(2, r_ideal.size + 1))) 88 | dcg_ranking = ranking[0] + np.sum(ranking[1:] / np.log2(np.arange(2, ranking.size + 1))) 89 | 90 | return dcg_ranking / dcg_ideal 91 | -------------------------------------------------------------------------------- /caserec/evaluation/item_recommendation.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """" 3 | This class is responsible for evaluate item recommendation algorithms (rankings). 4 | 5 | This file contains item recommendation evaluation metrics: 6 | - Mean average precision - MAP 7 | - Precision 8 | - Recall 9 | - Normalized Discounted Cumulative Gain - NDCG 10 | 11 | Types of evaluation: 12 | - Simple: Evaluation with traditional strategy 13 | - All-but-one Protocol: Considers only one pair (u, i) from the test set to evaluate the ranking 14 | 15 | """ 16 | 17 | # © 2019. Case Recommender (MIT License) 18 | 19 | import numpy as np 20 | import random 21 | 22 | from caserec.evaluation.base_evaluation import BaseEvaluation 23 | from caserec.evaluation.item_recomendation_functions import precision_at_k, mean_average_precision, ndcg_at_k 24 | 25 | __author__ = 'Arthur Fortes ' 26 | 27 | 28 | class ItemRecommendationEvaluation(BaseEvaluation): 29 | def __init__(self, sep='\t', n_ranks=list([1, 3, 5, 10]), 30 | metrics=list(['PREC', 'RECALL', 'MAP', 'NDCG']), all_but_one_eval=False, 31 | verbose=True, as_table=False, table_sep='\t'): 32 | """ 33 | Class to evaluate predictions in a item recommendation (ranking) scenario 34 | 35 | :param sep: Delimiter for input files 36 | :type sep: str, default '\t' 37 | 38 | :param n_ranks: List of positions to evaluate the ranking 39 | :type n_ranks: list, default [1, 3, 5, 10] 40 | 41 | :param metrics: List of evaluation metrics 42 | :type metrics: list, default ('PREC', 'RECALL', 'MAP', 'NDCG') 43 | 44 | :param all_but_one_eval: If True, considers only one pair (u, i) from the test set to evaluate the ranking 45 | :type all_but_one_eval: bool, default False 46 | 47 | :param verbose: Print the evaluation results 48 | :type verbose: bool, default True 49 | 50 | :param as_table: Print the evaluation results as table (only work with verbose=True) 51 | :type as_table: bool, default False 52 | 53 | :param table_sep: Delimiter for print results (only work with verbose=True and as_table=True) 54 | :type table_sep: str, default '\t' 55 | 56 | """ 57 | 58 | if isinstance(metrics, list): 59 | metrics = [m + '@' + str(n) for m in metrics for n in n_ranks] 60 | super(ItemRecommendationEvaluation, self).__init__(sep=sep, metrics=metrics, all_but_one_eval=all_but_one_eval, 61 | verbose=verbose, as_table=as_table, table_sep=table_sep) 62 | 63 | 
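        # n_ranks is kept so evaluate() can cut each ranking at every position n
        # (the metric names above were already expanded to 'PREC@1', ..., 'NDCG@10')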
self.n_ranks = n_ranks 64 | 65 | def evaluate(self, predictions, test_set): 66 | """ 67 | Method to calculate all the metrics for item recommendation scenario using dictionaries of ranking 68 | and test set. Use read() in ReadFile to transform your file in a dict 69 | 70 | :param predictions: Dictionary with ranking information 71 | :type predictions: dict 72 | 73 | :param test_set: Dictionary with test set information. 74 | :type test_set: dict 75 | 76 | :return: Dictionary with all evaluation metrics and results 77 | :rtype: dict 78 | 79 | """ 80 | 81 | eval_results = {} 82 | num_user = len(test_set['users']) 83 | partial_map_all = None 84 | 85 | if self.all_but_one_eval: 86 | for user in test_set['users']: 87 | # select a random item 88 | test_set['items_seen_by_user'][user] = [random.choice(test_set['items_seen_by_user'].get(user, [-1]))] 89 | 90 | for i, n in enumerate(self.n_ranks): 91 | if n < 1: 92 | raise ValueError('Error: N must be >= 1.') 93 | 94 | partial_precision = list() 95 | partial_recall = list() 96 | partial_ndcg = list() 97 | partial_map = list() 98 | 99 | for user in test_set['users']: 100 | hit_cont = 0 101 | # Generate user intersection list between the recommended items and test. 102 | list_feedback = set(list(predictions.get(user, []))[:n]) 103 | intersection = list(list_feedback.intersection(test_set['items_seen_by_user'].get(user, []))) 104 | 105 | if len(intersection) > 0: 106 | ig_ranking = np.zeros(n) 107 | for item in intersection: 108 | hit_cont += 1 109 | ig_ranking[list(predictions[user]).index(item)] = 1 110 | 111 | partial_precision.append(precision_at_k([ig_ranking], n)) 112 | partial_recall.append((float(len(intersection)) / float(len(test_set['items_seen_by_user'][user])))) 113 | partial_map.append(mean_average_precision([ig_ranking])) 114 | partial_ndcg.append(ndcg_at_k(list(ig_ranking))) 115 | 116 | partial_map_all = partial_map 117 | 118 | # create a dictionary with final results 119 | eval_results.update({ 120 | 'PREC@' + str(n): round(sum(partial_precision) / float(num_user), 6), 121 | 'RECALL@' + str(n): round(sum(partial_recall) / float(num_user), 6), 122 | 'NDCG@' + str(n): round(sum(partial_ndcg) / float(num_user), 6), 123 | 'MAP@' + str(n): round(sum(partial_map) / float(num_user), 6), 124 | 'MAP': round(sum(partial_map_all) / float(num_user), 6) 125 | 126 | }) 127 | 128 | # if (self.save_eval_file is not None): 129 | # # Saving evaluations to a file 130 | # from caserec.utils.process_data import WriteFile 131 | 132 | # WriteFile(output_file=save_eval_file, data=) 133 | 134 | if self.verbose: 135 | self.print_results(eval_results) 136 | 137 | return eval_results 138 | -------------------------------------------------------------------------------- /caserec/evaluation/rating_prediction.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """" 3 | This class is responsible for evaluating rating prediction algorithms. 
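For the n (user, item) pairs present in both the predictions and the test set, the metrics computed below are:

    MAE  = (1/n) * sum(|r_pred - r_true|)
    RMSE = sqrt((1/n) * sum((r_pred - r_true)^2))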
4 | 5 | This file contains rating prediction evaluation metrics: 6 | - Mean Absolute Error - MAE 7 | - Root Mean Squared Error - RMSE 8 | 9 | Types of evaluation: 10 | - Simple: Evaluation with traditional strategy 11 | - All-but-one Protocol: Considers only one pair (u, i) from the test set to evaluate the predictions 12 | 13 | """ 14 | 15 | from sklearn.metrics import mean_absolute_error, mean_squared_error 16 | import numpy as np 17 | import random 18 | 19 | from caserec.evaluation.item_recommendation import ItemRecommendationEvaluation 20 | from caserec.evaluation.base_evaluation import BaseEvaluation 21 | 22 | __author__ = 'Arthur Fortes ' 23 | 24 | 25 | class RatingPredictionEvaluation(BaseEvaluation): 26 | def __init__(self, sep='\t', metrics=list(['MAE', 'RMSE']), all_but_one_eval=False, verbose=True, as_table=False, 27 | table_sep='\t', as_rank=False, n_rank=(5, 10)): 28 | """ 29 | Class to evaluate predictions in a rating prediction scenario 30 | 31 | :param sep: Delimiter for input files 32 | :type sep: str, default '\t' 33 | 34 | :param metrics: List of evaluation metrics 35 | :type metrics: list, default ('MAE', 'RMSE') 36 | 37 | :param all_but_one_eval: If True, considers only one pair (u, i) from the test set to evaluate the ranking 38 | :type all_but_one_eval: bool, default False 39 | 40 | :param verbose: Print the evaluation results 41 | :type verbose: bool, default True 42 | 43 | :param as_table: Print the evaluation results as table (only work with verbose=True) 44 | :type as_table: bool, default False 45 | 46 | :param table_sep: Delimiter for print results (only work with verbose=True and as_table=True) 47 | :type table_sep: str, default '\t' 48 | 49 | :param as_rank: If True, evaluate as a ranking task (ratings above 3 are treated as relevant). 50 | :type as_rank: bool, default False 51 | 
        :param n_rank: List of positions to evaluate the ranking (only used when as_rank=True)
        :type n_rank: tuple, default (5, 10)

52 | """ 53 | 54 | super(RatingPredictionEvaluation, self).__init__(sep=sep, metrics=metrics, all_but_one_eval=all_but_one_eval, 55 | verbose=verbose, as_table=as_table, table_sep=table_sep) 56 | self.as_rank = as_rank 57 | self.n_rank = n_rank 58 | 59 | def evaluate(self, predictions, test_set): 60 | """ 61 | Method to calculate all the metrics for the rating prediction scenario using dictionaries of predictions 62 | and test set. Use read() in ReadFile to transform your prediction and test files in a dict 63 | 64 | :param predictions: Dict of predictions 65 | :type predictions: dict 66 | 67 | :param test_set: Dictionary with test set information. 
68 | :type test_set: dict 69 | 70 | :return: Dictionary with all evaluation metrics and results 71 | :rtype: dict 72 | 73 | """ 74 | 75 | eval_results = {} 76 | predictions_list = [] 77 | test_list = [] 78 | 79 | if not self.as_rank: 80 | # Create All but one set, selecting only one sample from the test set for each user 81 | if self.all_but_one_eval: 82 | for user in test_set['users']: 83 | # select a random item (random.choice needs a sequence, and the feedback entry is a dict) 84 | item = random.choice(list(test_set['feedback'][user])) 85 | test_set['feedback'][user] = {item: test_set['feedback'][user][item]} 86 | 87 | for user in predictions: 88 | for item in predictions[user]: 89 | rui_predict = predictions[user][item] 90 | rui_test = test_set["feedback"].get(user, {}).get(item, np.nan) 91 | if not np.isnan(rui_test): 92 | predictions_list.append(rui_predict) 93 | test_list.append(float(rui_test)) 94 | 95 | eval_results.update({ 96 | 'MAE': round(mean_absolute_error(test_list, predictions_list), 6), 97 | 'RMSE': round(np.sqrt(mean_squared_error(test_list, predictions_list)), 6) 98 | }) 99 | 100 | if self.verbose: 101 | self.print_results(eval_results) 102 | 103 | else: 104 | new_predict_set = [] 105 | new_test_set = {} 106 | 107 | for user in predictions: 108 | partial_predictions = [] 109 | for item in predictions[user]: 110 | # ratings greater than 3 are treated as relevant 111 | if predictions[user][item] > 3: 112 | partial_predictions.append([user, item, predictions[user][item]]) 113 | 114 | if test_set["feedback"].get(user, {}).get(item, 0) > 3: 115 | new_test_set.setdefault(user, []).append(item) 116 | 117 | partial_predictions = sorted(partial_predictions, key=lambda x: -x[2]) 118 | new_predict_set += partial_predictions 119 | 120 | # the dict references itself so it matches the structure ItemRecommendationEvaluation expects 120 | new_test_set['items_seen_by_user'] = new_test_set 121 | new_test_set['users'] = test_set['users'] 122 | 123 | eval_results = ItemRecommendationEvaluation(n_ranks=self.n_rank, 124 | all_but_one_eval=self.all_but_one_eval, 125 | metrics=self.metrics).evaluate_recommender(new_predict_set, new_test_set) 126 | 127 | return eval_results 128 | -------------------------------------------------------------------------------- /caserec/evaluation/statistical_analysis.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """ 3 | This class contains statistical functions for recommender systems. 4 | 5 | - T-test 6 | - Wilcoxon 7 | 8 | """ 9 | 10 | # © 2019. Case Recommender (MIT License) 11 | 12 | from scipy.stats import ttest_ind, ranksums 13 | import numpy as np 14 | 15 | __author__ = 'Arthur Fortes ' 16 | 17 | 18 | class StatisticalAnalysis(object): 19 | def __init__(self, sample1, sample2, method='ttest'): 20 | """ 21 | Class for statistical analysis. It compares two lists of per-fold results and generates a statistical analysis. 
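        A minimal usage sketch (the fold results below are illustrative)::

            >> map_alg1 = [0.42, 0.45, 0.43, 0.44, 0.41]
            >> map_alg2 = [0.39, 0.40, 0.38, 0.41, 0.37]
            >> StatisticalAnalysis(map_alg1, map_alg2, method='ttest').execute()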
22 | 23 | :param sample1: List of results of a recommender 1 in K folds (list with len K) 24 | :type sample1: list 25 | 26 | :param sample2: List of results of a recommender 2 in K folds (list with len K) 27 | :type sample2: list 28 | 29 | :param method: Statistical test to apply: 'ttest' or 'wilcoxon' 30 | :type method: str, default 'ttest' 31 | 32 | """ 33 | self.sample1 = np.array(sample1) 34 | self.sample2 = np.array(sample2) 35 | self.method = method 36 | 37 | def general_analysis(self): 38 | """ 39 | Analyzing the difference 40 | 41 | Computes the difference between the two samples and applies common measures: the sum of absolute differences 42 | (SAD), the sum of squared differences (SSD) and the correlation coefficient. 43 | """ 44 | 45 | print("=== Information About Samples ===") 46 | print("Standard Deviation Sample1: " + str(np.std(self.sample1))) 47 | print("Standard Deviation Sample2: " + str(np.std(self.sample2)) + "\n") 48 | print("=== Analyzing the Difference Between Samples ===") 49 | print("SAD:" + str(np.sum(np.abs(self.sample1 - self.sample2)))) 50 | print("SSD:" + str(np.sum(np.square(self.sample1 - self.sample2)))) 51 | print("Correlation:" + str(np.corrcoef(np.array((self.sample1, self.sample2)))[0, 1]) + "\n") 52 | 53 | def ttest(self): 54 | """ 55 | Student's t-test 56 | 57 | Calculates the T-test for the means of TWO INDEPENDENT samples of scores. 58 | 59 | This is a two-sided test for the null hypothesis that 2 independent samples have identical 60 | average (expected) values 61 | 62 | This test assumes that the populations have identical variances. 63 | """ 64 | 65 | t, p = ttest_ind(self.sample1, self.sample2) 66 | print("=== T- Student Analysis ===") 67 | print("The calculated t-statistic: " + str(t)) 68 | print("The two-tailed p-value: " + str(p) + "\n") 69 | 70 | def wilcoxon(self): 71 | """ 72 | Wilcoxon 73 | 74 | The Wilcoxon rank-sum test tests the null hypothesis that two sets of measurements are drawn from 75 | the same distribution. It is a non-parametric alternative to the two-sample t-test. (Note: scipy's 76 | ranksums, used here, is the rank-sum test for independent samples, not the paired signed-rank test.) 77 | """ 78 | 79 | t, p = ranksums(self.sample1, self.sample2) 80 | print("=== Wilcoxon Analysis ===") 81 | print("The calculated test statistic: " + str(t)) 82 | print("The two-tailed p-value: " + str(p) + "\n") 83 | 84 | def execute(self): 85 | self.general_analysis() 86 | if self.method.lower() == "wilcoxon": 87 | self.wilcoxon() 88 | elif self.method.lower() == "ttest": 89 | self.ttest() 90 | else: 91 | print("Error: Method Invalid!") 92 | -------------------------------------------------------------------------------- /caserec/recommenders/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/caserec/CaseRecommender/779bc5dd91ff704ce60c0f3fafd07e2eba689dd7/caserec/recommenders/__init__.py -------------------------------------------------------------------------------- /caserec/recommenders/item_recommendation/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Arthur' 2 | -------------------------------------------------------------------------------- /caserec/recommenders/item_recommendation/base_item_recommendation.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """" 3 | This class is the base for item recommendation algorithms. 4 | 5 | """ 6 | 7 | # © 2019. 
Case Recommender (MIT License) 8 | 9 | from scipy.spatial.distance import squareform, pdist 10 | import numpy as np 11 | 12 | from caserec.evaluation.item_recommendation import ItemRecommendationEvaluation 13 | from caserec.utils.extra_functions import print_header 14 | from caserec.utils.process_data import ReadFile, WriteFile 15 | 16 | __author__ = 'Arthur Fortes ' 17 | 18 | 19 | class BaseItemRecommendation(object): 20 | def __init__(self, train_file, test_file, output_file=None, as_binary=False, rank_length=10, 21 | similarity_metric="cosine", sep='\t', output_sep='\t'): 22 | """ 23 | This class is the base for all item recommendation algorithms. It implements / adds common 24 | methods and attributes for ranking approaches. 25 | 26 | :param train_file: File which contains the train set. This file needs to have at least 3 columns 27 | (user item feedback_value). 28 | :type train_file: str 29 | 30 | :param test_file: File which contains the test set. This file needs to have at least 3 columns 31 | (user item feedback_value). 32 | :type test_file: str, default None 33 | 34 | :param output_file: File with dir to write the final predictions 35 | :type output_file: str, default None 36 | 37 | :param similarity_metric: Pairwise metric to compute the similarity between users or items 38 | :type similarity_metric: str, default cosine 39 | 40 | :param rank_length: Size of the rank that must be generated by the predictions of the recommender algorithm 41 | :type rank_length: int, default 10 42 | 43 | :param as_binary: If True, the explicit feedback will be transformed to binary 44 | :type as_binary: bool, default False 45 | 46 | :param sep: Delimiter for input files 47 | :type sep: str, default '\t' 48 | 49 | :param output_sep: Delimiter for output file 50 | :type output_sep: str, default '\t' 51 | 52 | """ 53 | 54 | self.train_file = train_file 55 | self.test_file = test_file 56 | self.as_binary = as_binary 57 | self.similarity_metric = similarity_metric 58 | self.output_file = output_file 59 | self.rank_length = rank_length 60 | self.sep = sep 61 | self.output_sep = output_sep 62 | 63 | # internal vars 64 | self.item_to_item_id = {} 65 | self.item_id_to_item = {} 66 | self.user_to_user_id = {} 67 | self.user_id_to_user = {} 68 | self.train_set = None 69 | self.test_set = None 70 | self.users = None 71 | self.items = None 72 | self.matrix = None 73 | self.evaluation_results = None 74 | self.recommender_name = None 75 | self.extra_info_header = None 76 | self.ranking = [] 77 | 78 | def read_files(self): 79 | """ 80 | Method to initialize the recommender algorithm: reads the train (and optional test) files and builds the user/item id mappings. 
81 | 82 | """ 83 | self.train_set = ReadFile(self.train_file, sep=self.sep, as_binary=self.as_binary).read() 84 | 85 | if self.test_file is not None: 86 | self.test_set = ReadFile(self.test_file, sep=self.sep).read() 87 | self.users = sorted(set(list(self.train_set['users']) + list(self.test_set['users']))) 88 | self.items = sorted(set(list(self.train_set['items']) + list(self.test_set['items']))) 89 | else: 90 | self.users = self.train_set['users'] 91 | self.items = self.train_set['items'] 92 | 93 | for i, item in enumerate(self.items): 94 | self.item_to_item_id.update({item: i}) 95 | self.item_id_to_item.update({i: item}) 96 | for u, user in enumerate(self.users): 97 | self.user_to_user_id.update({user: u}) 98 | self.user_id_to_user.update({u: user}) 99 | 100 | def create_matrix(self): 101 | """ 102 | Method to create a feedback matrix 103 | 104 | """ 105 | 106 | self.matrix = np.zeros((len(self.users), len(self.items))) 107 | 108 | for user in self.train_set['users']: 109 | for item in self.train_set['feedback'][user]: 110 | self.matrix[self.user_to_user_id[user]][self.item_to_item_id[item]] = \ 111 | self.train_set['feedback'][user][item] 112 | 113 | def compute_similarity(self, transpose=False): 114 | """ 115 | Method to compute a similarity matrix from the original feedback matrix 116 | 117 | :param transpose: If True, calculate the similarity in the transposed matrix 118 | :type transpose: bool, default False 119 | 120 | """ 121 | 122 | # Calculate distance matrix 123 | if transpose: 124 | similarity_matrix = np.float32(squareform(pdist(self.matrix.T, self.similarity_metric))) 125 | else: 126 | similarity_matrix = np.float32(squareform(pdist(self.matrix, self.similarity_metric))) 127 | 128 | # Remove NaNs 129 | similarity_matrix[np.isnan(similarity_matrix)] = 1.0 130 | # transform distances into similarities: values in the matrix range from 0 to 1 
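        # similarity = (max_d - distance) / max_d, so a distance of 0 maps to similarity 1
        # and the largest observed distance maps to similarity 0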
131 | similarity_matrix = (similarity_matrix.max() - similarity_matrix) / similarity_matrix.max() 132 | 133 | return similarity_matrix 134 | 135 | def evaluate(self, metrics, verbose=True, as_table=False, table_sep='\t', n_ranks=None): 136 | """ 137 | Method to evaluate the final ranking 138 | 139 | :param metrics: List of evaluation metrics 140 | :type metrics: list, default ('Prec', 'Recall', 'MAP', 'NDCG') 141 | 142 | :param verbose: Print the evaluation results 143 | :type verbose: bool, default True 144 | 145 | :param as_table: Print the evaluation results as table 146 | :type as_table: bool, default False 147 | 148 | :param table_sep: Delimiter for print results (only work with verbose=True and as_table=True) 149 | :type table_sep: str, default '\t' 150 | 151 | :param n_ranks: List of positions to evaluate the ranking 152 | :type n_ranks: list, None 153 | 154 | """ 155 | 156 | self.evaluation_results = {} 157 | 158 | if metrics is None: 159 | metrics = list(['PREC', 'RECALL', 'MAP', 'NDCG']) 160 | 161 | if n_ranks is None: 162 | n_ranks = list([1, 3, 5, 10]) 163 | 164 | results = ItemRecommendationEvaluation(verbose=verbose, as_table=as_table, table_sep=table_sep, 165 | metrics=metrics, n_ranks=n_ranks) 166 | 167 | self.evaluation_results = results.evaluate_recommender(predictions=self.ranking, test_set=self.test_set) 168 | 169 | def write_ranking(self): 170 | """ 171 | Method to write final ranking 172 | 173 | """ 174 | 175 | if self.output_file is not None: 176 | WriteFile(self.output_file, data=self.ranking, sep=self.output_sep).write() # use the output delimiter, not the input one 177 | 178 | def compute(self, verbose=True): 179 | """ 180 | Method to run the recommender algorithm 181 | 182 | :param verbose: Print the information about recommender 183 | :type verbose: bool, default True 184 | 185 | """ 186 | 187 | # read files 188 | self.read_files() 189 | 190 | # initialize empty ranking (Don't remove: important to Cross Validation) 191 | self.ranking = [] 192 | 193 | if verbose: 194 | test_info = None 195 | 196 | main_info = { 197 | 'title': 'Item Recommendation > ' + self.recommender_name, 198 | 'n_users': len(self.train_set['users']), 199 | 'n_items': len(self.train_set['items']), 200 | 'n_interactions': self.train_set['number_interactions'], 201 | 'sparsity': self.train_set['sparsity'] 202 | } 203 | 204 | if self.test_file is not None: 205 | test_info = { 206 | 'n_users': len(self.test_set['users']), 207 | 'n_items': len(self.test_set['items']), 208 | 'n_interactions': self.test_set['number_interactions'], 209 | 'sparsity': self.test_set['sparsity'] 210 | } 211 | 212 | print_header(main_info, test_info) 213 | -------------------------------------------------------------------------------- /caserec/recommenders/item_recommendation/content_based.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """" 3 | Content Based Recommender. 4 | 5 | Literature: 6 | Guangyuan Piao and John G. Breslin. 2016. Measuring semantic distance for linked open data-enabled recommender 7 | systems. In Proceedings of the 31st Annual ACM Symposium on Applied Computing (SAC '16). ACM, New York, NY, USA, 8 | 315-320. DOI: https://doi.org/10.1145/2851613.2851839 9 | 10 | """ 11 | 12 | # © 2019. 
Case Recommender (MIT License) 13 | 14 | import numpy as np 15 | 16 | from caserec.recommenders.item_recommendation.base_item_recommendation import BaseItemRecommendation 17 | from caserec.utils.process_data import ReadFile 18 | from caserec.utils.extra_functions import timed 19 | 20 | __author__ = 'Eduardo Fressato ' 21 | 22 | 23 | class ContentBased(BaseItemRecommendation): 24 | def __init__(self, train_file=None, test_file=None, output_file=None, similarity_file=None, similarity_sep='\t', 25 | rank_length=10, as_binary=True, sep='\t', output_sep='\t'): 26 | 27 | """ 28 | Content Based Recommender for Item Recommendation 29 | 30 | Usage:: 31 | 32 | >> ContentBased(train, test, similarity_file=similarity_file).compute() 33 | 34 | :param train_file: File which contains the train set. This file needs to have at least 3 columns 35 | (user item feedback_value). 36 | :type train_file: str 37 | 38 | :param test_file: File which contains the test set. This file needs to have at least 3 columns 39 | (user item feedback_value). 40 | :type test_file: str, default None 41 | 42 | :param output_file: File with dir to write the final predictions 43 | :type output_file: str, default None 44 | 45 | :param similarity_file: File which contains the similarity set. This file needs to have at least 3 columns 46 | (item item similarity). 47 | :type similarity_file: str, default None 48 | 49 | :param rank_length: Size of the rank that must be generated by the predictions of the recommender algorithm 50 | :type rank_length: int, default 10 51 | 
        :param as_binary: If True, the explicit feedback will be transformed to binary
        :type as_binary: bool, default True

52 | :param similarity_sep: Delimiter for similarity or metadata file 53 | :type similarity_sep: str, default '\t' 54 | 55 | :param sep: Delimiter for input files 56 | :type sep: str, default '\t' 57 | 58 | :param output_sep: Delimiter for output file 59 | :type output_sep: str, default '\t' 60 | 61 | """ 62 | 63 | super(ContentBased, self).__init__(train_file=train_file, test_file=test_file, output_file=output_file, 64 | as_binary=as_binary, rank_length=rank_length, sep=sep, output_sep=output_sep) 65 | 66 | self.recommender_name = 'Content Based Algorithm' 67 | 68 | self.similarity_file = similarity_file 69 | self.similarity_sep = similarity_sep 70 | self.si_matrix = None 71 | self.similar_items = None 72 | 73 | self.users_profile = None 74 | 75 | def init_model(self): 76 | """ 77 | Method to initialize the model: create and read the item-item similarity matrix from the similarity file. 
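        A sketch of the expected similarity file layout (tab-separated; item ids and values are illustrative)::

            item_1	item_2	0.83
            item_1	item_3	0.40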
79 | 80 | """ 81 | if self.similarity_file is not None: 82 | similarity = ReadFile(self.similarity_file, sep=self.similarity_sep, as_binary=False 83 | ).read_metadata_or_similarity() 84 | 85 | self.si_matrix = np.zeros((len(self.items), len(self.items))) 86 | 87 | # Fill similarity matrix 88 | for i in similarity['col_1']: 89 | for i_j in similarity['dict'][i]: 90 | self.si_matrix[self.item_to_item_id[i], self.item_to_item_id[int(i_j)]] = similarity['dict'][i][i_j] 91 | 92 | # Remove NaNs 93 | self.si_matrix[np.isnan(self.si_matrix)] = 0.0 94 | 95 | else: 96 | raise ValueError("This algorithm needs a similarity matrix file!") 97 | 98 | def create_user_profile(self): 99 | """Build each user's profile as the set of items seen in the train set.""" 99 | self.users_profile = self.train_set['items_seen_by_user'] 100 | 101 | def predict(self): 102 | for u in self.train_set['users']: 103 | self.ranking += self.predict_user_rank(u) 104 | 105 | def predict_user_rank(self, user): 106 | unseen_items = set(self.items).difference(self.users_profile[user]) 107 | 108 | list_scores = [] 109 | for i in unseen_items: 110 | list_scores.append(self.predict_item_score(user, i)) 111 | 112 | return sorted(list_scores, key=lambda x: -x[2])[:self.rank_length] 113 | 114 | def predict_item_score(self, user, item): 115 | sum_sim = 0 116 | for i in self.users_profile[user]: 117 | sum_sim += self.si_matrix[self.item_to_item_id[item]][self.item_to_item_id[i]] 118 | 119 | return [user, item, sum_sim / len(self.users_profile[user])] 120 | 121 | def compute(self, verbose=True, metrics=None, verbose_evaluation=True, as_table=False, table_sep='\t', n_ranks=None): 122 | """ 123 | Extends compute method from BaseItemRecommendation. Method to run recommender algorithm 124 | 125 | :param verbose: Print recommender and database information 126 | :type verbose: bool, default True 127 | 128 | :param metrics: List of evaluation metrics 129 | :type metrics: list, default None 130 | 131 | :param verbose_evaluation: Print the evaluation results 132 | :type verbose_evaluation: bool, default True 133 | 134 | :param as_table: Print the evaluation results as table 135 | :type as_table: bool, default False 136 | 137 | :param table_sep: Delimiter for print results (only work with verbose=True and as_table=True) 138 | :type table_sep: str, default '\t' 139 | 140 | :param n_ranks: List of positions to evaluate the ranking 141 | :type n_ranks: list, None 142 | 143 | """ 144 | 145 | super(ContentBased, self).compute(verbose=verbose) 146 | 147 | if verbose: 148 | print("training_time:: %4f sec" % timed(self.init_model)) 149 | if self.extra_info_header is not None: 150 | print(self.extra_info_header) 151 | 152 | self.create_user_profile() 153 | print("prediction_time:: %4f sec" % timed(self.predict)) 154 | print('\n') 155 | else: 156 | self.init_model() 157 | self.create_user_profile() 158 | self.predict() 159 | 160 | self.write_ranking() 161 | 162 | if self.test_file is not None: 163 | self.evaluate(metrics, verbose_evaluation, as_table=as_table, table_sep=table_sep, n_ranks=n_ranks) 164 | -------------------------------------------------------------------------------- /caserec/recommenders/item_recommendation/ensemble_average.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """" 3 | Ensemble Average 4 | [Item Recommendation (Ranking)] 5 | 6 | Literature: 7 | Arthur Fortes da Costa and Marcelo G. Manzato: 8 | Multimodal Interactions in Recommender Systems: An Ensembling Approach 9 | BRACIS 2014. 
10 | https://ieeexplore.ieee.org/document/6984809/ 11 | 12 | """ 13 | 14 | # © 2019. Case Recommender (MIT License) 15 | 16 | from caserec.recommenders.item_recommendation.base_item_recommendation import BaseItemRecommendation 17 | 18 | __author__ = 'Arthur Fortes ' 19 | 20 | 21 | class EnsembleAverage(BaseItemRecommendation): 22 | """ 23 | Code being refactored, returns in the next version. 24 | 25 | """ 26 | raise NotImplementedError 27 | -------------------------------------------------------------------------------- /caserec/recommenders/item_recommendation/ensemble_bpr.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """" 3 | Ensemble BPR Learning 4 | [Item Recommendation (Ranking)] 5 | 6 | Literature: 7 | Arthur Fortes da Costa and Marcelo G. Manzato: 8 | Ensemble Learning in Recommender Systems: Combining Multiple User Interactions for Ranking Personalization. 9 | WebMedia 2014. 10 | https://dl.acm.org/citation.cfm?id=2664556 11 | 12 | """ 13 | 14 | # © 2019. Case Recommender (MIT License) 15 | 16 | from caserec.recommenders.item_recommendation.base_item_recommendation import BaseItemRecommendation 17 | 18 | __author__ = 'Arthur Fortes ' 19 | 20 | 21 | class EnsembleBPRLearning(BaseItemRecommendation): 22 | """ 23 | Code being refactored, returns in the next version. 24 | 25 | """ 26 | raise NotImplementedError 27 | -------------------------------------------------------------------------------- /caserec/recommenders/item_recommendation/item_attribute_knn.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """" 3 | Item Based Collaborative Filtering Recommender with Attributes (Item Attribute KNN) 4 | [Item Recommendation (Ranking)] 5 | 6 | Its philosophy is as follows: in order to determine the rating of User u on item m, we can find other movies that 7 | are similar to item m, and based on User u's ratings on those similar movies we infer his rating on item m. 8 | However, instead of traditional ItemKNN, this approach uses a metadata or pre-computed similarity matrix. 9 | 10 | """ 11 | 12 | # © 2019. Case Recommender (MIT License) 13 | 14 | from collections import defaultdict 15 | import numpy as np 16 | 17 | from caserec.recommenders.item_recommendation.itemknn import ItemKNN 18 | from caserec.utils.process_data import ReadFile 19 | 20 | __author__ = 'Arthur Fortes ' 21 | 22 | 23 | class ItemAttributeKNN(ItemKNN): 24 | def __init__(self, train_file=None, test_file=None, output_file=None, metadata_file=None, similarity_file=None, 25 | k_neighbors=30, rank_length=10, as_binary=False, as_similar_first=True, metadata_as_binary=False, 26 | metadata_similarity_sep='\t', similarity_metric="cosine", sep='\t', output_sep='\t'): 27 | """ 28 | Item Attribute KNN for Item Recommendation 29 | 30 | This algorithm predicts a rank for each user based on the similar items that he/she consumed, 31 | using a metadata or similarity pre-computed file 32 | 33 | Usage:: 34 | 35 | >> ItemAttributeKNN(train, test, similarity_file=sim_matrix, as_similar_first=True).compute() 36 | >> ItemAttributeKNN(train, test, metadata_file=metadata, as_similar_first=True).compute() 37 | 38 | :param train_file: File which contains the train set. This file needs to have at least 3 columns 39 | (user item feedback_value). 40 | :type train_file: str 41 | 42 | :param test_file: File which contains the test set. This file needs to have at least 3 columns 43 | (user item feedback_value).
44 | :type test_file: str, default None 45 | 46 | :param output_file: File with dir to write the final predictions 47 | :type output_file: str, default None 48 | 49 | :param metadata_file: File which contains the metadata set. This file needs to have at least 2 columns 50 | (item metadata). 51 | :type metadata_file: str, default None 52 | 53 | :param similarity_file: File which contains the similarity set. This file needs to have at least 3 columns 54 | (item item similarity). 55 | :type similarity_file: str, default None 56 | 57 | :param k_neighbors: Number of neighbors to use. If None, k_neighbors = int(sqrt(n_items)) 58 | :type k_neighbors: int, default 30 59 | 60 | :param rank_length: Size of the rank that must be generated by the predictions of the recommender algorithm 61 | :type rank_length: int, default 10 62 | 63 | :param as_binary: If True, the explicit feedback will be transformed to binary 64 | :type as_binary: bool, default False 65 | 66 | :param as_similar_first: If True, for each unknown item, which will be predicted, we first look for its k 67 | most similar items and then take the intersection with the items that 68 | the user has seen. 69 | :type as_similar_first: bool, default True 70 | 71 | :param metadata_as_binary: If True, the explicit value will be transformed to binary 72 | :type metadata_as_binary: bool, default False 73 | 74 | :param metadata_similarity_sep: Delimiter for similarity or metadata file 75 | :type metadata_similarity_sep: str, default '\t' 76 | 77 | :param similarity_metric: Pairwise metric to compute the similarity between the items. Reference about 78 | distances: http://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.spatial.distance.pdist.html 79 | :type similarity_metric: str, default cosine 80 | 81 | :param sep: Delimiter for input files 82 | :type sep: str, default '\t' 83 | 84 | :param output_sep: Delimiter for output file 85 | :type output_sep: str, default '\t' 86 | """ 87 | super(ItemAttributeKNN, self).__init__(train_file=train_file, test_file=test_file, output_file=output_file, 88 | k_neighbors=k_neighbors, rank_length=rank_length, as_binary=as_binary, 89 | as_similar_first=as_similar_first, similarity_metric=similarity_metric, 90 | sep=sep, output_sep=output_sep) 91 | 92 | self.recommender_name = 'Item Attribute KNN Algorithm' 93 | 94 | self.metadata_file = metadata_file 95 | self.similarity_file = similarity_file 96 | self.metadata_as_binary = metadata_as_binary 97 | self.metadata_similarity_sep = metadata_similarity_sep 98 | 99 | def init_model(self): 100 | """ 101 | Method to fit the model. Create and calculate a similarity matrix by metadata file or a pre-computed similarity matrix.
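For illustration only, a metadata file holds one tab-separated (item, metadata) pair per line; the ids and labels below are hypothetical:

    10	action
    10	thriller
    20	comedy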
102 | 103 | 104 | """ 105 | 106 | self.similar_items = defaultdict(list) 107 | 108 | # Set the value for k 109 | if self.k_neighbors is None: 110 | self.k_neighbors = int(np.sqrt(len(self.items))) 111 | 112 | if self.metadata_file is not None: 113 | metadata = ReadFile(self.metadata_file, sep=self.metadata_similarity_sep, as_binary=self.metadata_as_binary 114 | ).read_metadata_or_similarity() 115 | 116 | self.matrix = np.zeros((len(self.items), len(metadata['col_2']))) 117 | 118 | meta_to_meta_id = {} 119 | for m, data in enumerate(metadata['col_2']): 120 | meta_to_meta_id[data] = m 121 | 122 | for item in metadata['col_1']: 123 | for m in metadata['dict'][item]: 124 | self.matrix[self.item_to_item_id[item], meta_to_meta_id[m]] = metadata['dict'][item][m] 125 | 126 | # create header info for metadata 127 | sparsity = (1 - (metadata['number_interactions'] / (len(metadata['col_1']) * len(metadata['col_2'])))) * 100 128 | 129 | self.extra_info_header = ">> metadata:: %d items and %d metadata (%d interactions) | sparsity:: %.2f%%" % \ 130 | (len(metadata['col_1']), len(metadata['col_2']), metadata['number_interactions'], 131 | sparsity) 132 | 133 | # Create similarity matrix based on metadata or similarity file. Transpose=False, because it is an 134 | # item x metadata matrix 135 | self.si_matrix = self.compute_similarity(transpose=False) 136 | 137 | elif self.similarity_file is not None: 138 | similarity = ReadFile(self.similarity_file, sep=self.metadata_similarity_sep, as_binary=False 139 | ).read_metadata_or_similarity() 140 | 141 | self.si_matrix = np.zeros((len(self.items), len(self.items))) 142 | 143 | # Fill similarity matrix 144 | for i in similarity['col_1']: 145 | for i_j in similarity['dict'][i]: 146 | self.si_matrix[self.item_to_item_id[i], self.item_to_item_id[int(i_j)]] = similarity['dict'][i][i_j] 147 | 148 | # Remove NaNs 149 | self.si_matrix[np.isnan(self.si_matrix)] = 0.0 150 | 151 | else: 152 | raise ValueError("This algorithm needs a similarity matrix or a metadata file!") 153 | 154 | # Create original matrix user x item for prediction process 155 | self.create_matrix() 156 | 157 | for i_id, item in enumerate(self.items): 158 | self.similar_items[i_id] = sorted(range(len(self.si_matrix[i_id])), 159 | key=lambda k: -self.si_matrix[i_id][k])[1:self.k_neighbors + 1] 160 | -------------------------------------------------------------------------------- /caserec/recommenders/item_recommendation/itemknn.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """" 3 | Item Based Collaborative Filtering Recommender (Item KNN) 4 | [Item Recommendation (Ranking)] 5 | 6 | Item KNN predicts a user's ranking based on similar items that he/she has accessed. 7 | 8 | """ 9 | 10 | # © 2019. Case Recommender (MIT License)
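# Scoring sketch (hypothetical items a and b): for a user who has seen items a and b,
# an unseen item i receives the score si_matrix[i][a] + si_matrix[i][b], i.e. the sum
# of its similarities to the user's seen items (restricted to the k nearest neighbors,
# depending on as_similar_first).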
11 | 12 | from collections import defaultdict 13 | import numpy as np 14 | 15 | from caserec.recommenders.item_recommendation.base_item_recommendation import BaseItemRecommendation 16 | from caserec.utils.extra_functions import timed 17 | 18 | __author__ = 'Arthur Fortes ' 19 | 20 | 21 | class ItemKNN(BaseItemRecommendation): 22 | def __init__(self, train_file=None, test_file=None, output_file=None, similarity_metric="cosine", k_neighbors=None, 23 | rank_length=10, as_binary=False, as_similar_first=True, sep='\t', output_sep='\t'): 24 | 25 | """ 26 | Item KNN for Item Recommendation 27 | 28 | This algorithm predicts a rank for each user based on the similar items that he/she consumed. 29 | 30 | Usage:: 31 | 32 | >> ItemKNN(train, test, as_similar_first=True).compute() 33 | >> ItemKNN(train, test, ranking_file, as_binary=True).compute() 34 | 35 | :param train_file: File which contains the train set. This file needs to have at least 3 columns 36 | (user item feedback_value). 37 | :type train_file: str 38 | 39 | :param test_file: File which contains the test set. This file needs to have at least 3 columns 40 | (user item feedback_value). 41 | :type test_file: str, default None 42 | 43 | :param output_file: File with dir to write the final predictions 44 | :type output_file: str, default None 45 | 46 | :param similarity_metric: Pairwise metric to compute the similarity between the items. Reference about 47 | distances: http://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.spatial.distance.pdist.html 48 | :type similarity_metric: str, default cosine 49 | 50 | :param k_neighbors: Number of neighbors to use. If None, k_neighbors = int(sqrt(n_items)) 51 | :type k_neighbors: int, default None 52 | 53 | :param rank_length: Size of the rank that must be generated by the predictions of the recommender algorithm 54 | :type rank_length: int, default 10 55 | 56 | :param as_binary: If True, the explicit feedback will be transformed to binary 57 | :type as_binary: bool, default False 58 | 59 | :param as_similar_first: If True, for each unknown item, which will be predicted, we first look for its k 60 | most similar items and then take the intersection with the items that 61 | the user has seen. 62 | :type as_similar_first: bool, default True 63 | 64 | :param sep: Delimiter for input files 65 | :type sep: str, default '\t' 66 | 67 | :param output_sep: Delimiter for output file 68 | :type output_sep: str, default '\t' 69 | 70 | """ 71 | 72 | super(ItemKNN, self).__init__(train_file=train_file, test_file=test_file, output_file=output_file, 73 | as_binary=as_binary, rank_length=rank_length, similarity_metric=similarity_metric, 74 | sep=sep, output_sep=output_sep) 75 | 76 | self.recommender_name = 'ItemKNN Algorithm' 77 | 78 | self.as_similar_first = as_similar_first 79 | self.k_neighbors = k_neighbors 80 | 81 | # internal vars 82 | self.si_matrix = None 83 | self.similar_items = None 84 | 85 | def init_model(self): 86 | """ 87 | Method to initialize the model.
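As a sketch with hypothetical values: if si_matrix[0] = [1.0, 0.3, 0.8, 0.1] and k_neighbors = 2, then similar_items[0] = [2, 1], the indices of the two items most similar to item 0 (item 0 itself is skipped).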
88 | 89 | """ 90 | self.similar_items = defaultdict(list) 91 | 92 | # Set the value for k 93 | if self.k_neighbors is None: 94 | self.k_neighbors = int(np.sqrt(len(self.items))) 95 | 96 | self.create_matrix() 97 | self.si_matrix = self.compute_similarity(transpose=True) 98 | 99 | for i_id, item in enumerate(self.items): 100 | self.similar_items[i_id] = sorted(range(len(self.si_matrix[i_id])), 101 | key=lambda k: -self.si_matrix[i_id][k])[1:self.k_neighbors + 1] 102 | 103 | def predict(self): 104 | """ 105 | This method predicts a rank for each user. 106 | 107 | """ 108 | 109 | for u_id, user in enumerate(self.users): 110 | if len(self.train_set['feedback'].get(user, [])) != 0: 111 | if self.as_similar_first: 112 | self.ranking += self.predict_similar_first_scores(user, u_id) 113 | else: 114 | self.ranking += self.predict_scores(user, u_id) 115 | 116 | else: 117 | # Implement cold start user 118 | pass 119 | 120 | def predict_scores(self, user, user_id): 121 | partial_predictions = [] 122 | # Selects items that user has not interacted with. 123 | u_list = list(np.flatnonzero(self.matrix[user_id] == 0)) 124 | seen_items_id = np.flatnonzero(self.matrix[user_id]) 125 | 126 | # predict score for item_i 127 | for i_id in u_list: 128 | sim_sum = sorted(np.take(self.si_matrix[i_id], seen_items_id), key=lambda x: -x) 129 | partial_predictions.append((user, self.items[i_id], sum(sim_sum[:self.k_neighbors]))) 130 | 131 | return sorted(partial_predictions, key=lambda x: -x[2])[:self.rank_length] 132 | 133 | def predict_similar_first_scores(self, user, user_id): 134 | """ 135 | In this implementation, for each unknown item, which will be 136 | predicted, we first look for its k most similar items and then take the intersection with the seen items of 137 | the user. Finally, the score of the unknown item will be the sum of its similarities to those neighbors, 138 | taking into account only the items that the user has seen. 139 | 140 | """ 141 | 142 | predictions = [] 143 | 144 | # Selects items that user has not interacted with. 145 | u_list = list(np.flatnonzero(self.matrix[user_id] == 0)) 146 | seen_items_id = np.flatnonzero(self.matrix[user_id]) 147 | 148 | # predict score for item_i 149 | for i_id in u_list: 150 | # s_id = list(filter(set(self.similar_items[i]).__contains__, seen_items_id)) 151 | s_id = list(set(self.similar_items[i_id]).intersection(seen_items_id)) 152 | sim_sum = np.take(self.si_matrix[i_id], s_id) 153 | predictions.append((user, self.items[i_id], sum(sim_sum))) 154 | 155 | return sorted(predictions, key=lambda x: -x[2])[:self.rank_length] 156 | 157 | def compute(self, verbose=True, metrics=None, verbose_evaluation=True, as_table=False, table_sep='\t', n_ranks=None): 158 | """ 159 | Extends compute method from BaseItemRecommendation. Method to run the recommender algorithm.
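For instance (illustrative arguments; 'PREC', 'RECALL', 'MAP' and 'NDCG' are the ranking metrics used elsewhere in the framework)::

    >> ItemKNN(train, test).compute(metrics=['PREC', 'RECALL', 'MAP', 'NDCG'], n_ranks=[1, 5, 10])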
160 | 161 | :param verbose: Print recommender and database information 162 | :type verbose: bool, default True 163 | 164 | :param metrics: List of evaluation metrics 165 | :type metrics: list, default None 166 | 167 | :param verbose_evaluation: Print the evaluation results 168 | :type verbose_evaluation: bool, default True 169 | 170 | :param as_table: Print the evaluation results as table 171 | :type as_table: bool, default False 172 | 173 | :param table_sep: Delimiter for print results (only work with verbose=True and as_table=True) 174 | :type table_sep: str, default '\t' 175 | 176 | :param n_ranks: List of positions to evaluate the ranking 177 | :type n_ranks: list, default None 178 | 179 | """ 180 | 181 | super(ItemKNN, self).compute(verbose=verbose) 182 | 183 | if verbose: 184 | print("training_time:: %4f sec" % timed(self.init_model)) 185 | if self.extra_info_header is not None: 186 | print(self.extra_info_header) 187 | print("prediction_time:: %4f sec" % timed(self.predict)) 188 | print('\n') 189 | 190 | else: 191 | self.init_model() 192 | self.predict() 193 | 194 | self.write_ranking() 195 | 196 | if self.test_file is not None: 197 | self.evaluate(metrics, verbose_evaluation, as_table=as_table, table_sep=table_sep, n_ranks=n_ranks) 198 | -------------------------------------------------------------------------------- /caserec/recommenders/item_recommendation/most_popular.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """" 3 | Most Popular Collaborative Filtering Recommender 4 | [Item Recommendation (Ranking)] 5 | 6 | Most Popular predicts a user's ranking based on the popularity of users and items. 7 | 8 | """ 9 | 10 | # © 2019. Case Recommender (MIT License) 11 | 12 | from caserec.recommenders.item_recommendation.base_item_recommendation import BaseItemRecommendation 13 | from caserec.utils.extra_functions import timed 14 | 15 | __author__ = 'Arthur Fortes ' 16 | 17 | 18 | class MostPopular(BaseItemRecommendation): 19 | def __init__(self, train_file=None, test_file=None, output_file=None, as_binary=False, rank_length=10, sep='\t', 20 | output_sep='\t'): 21 | """ 22 | Most Popular for Item Recommendation 23 | 24 | This algorithm predicts a rank for each user using the count of feedback interactions of users and items 25 | 26 | Usage:: 27 | 28 | >> MostPopular(train, test).compute() 29 | >> MostPopular(train, test, as_binary=True).compute() 30 | 31 | :param train_file: File which contains the train set. This file needs to have at least 3 columns 32 | (user item feedback_value). 33 | :type train_file: str 34 | 35 | :param test_file: File which contains the test set. This file needs to have at least 3 columns 36 | (user item feedback_value).
37 | :type test_file: str, default None 38 | 39 | :param output_file: File with dir to write the final predictions 40 | :type output_file: str, default None 41 | 42 | :param rank_length: Size of the rank that must be generated by the predictions of the recommender algorithm 43 | :type rank_length: int, default 10 44 | 45 | :param as_binary: If True, the explicit feedback will be transformed to binary 46 | :type as_binary: bool, default False 47 | 48 | :param sep: Delimiter for input files 49 | :type sep: str, default '\t' 50 | 51 | :param output_sep: Delimiter for output file 52 | :type output_sep: str, default '\t' 53 | 54 | """ 55 | 56 | super(MostPopular, self).__init__(train_file=train_file, test_file=test_file, output_file=output_file, 57 | as_binary=as_binary, rank_length=rank_length, sep=sep, output_sep=output_sep) 58 | 59 | self.recommender_name = 'Most Popular' 60 | 61 | def predict(self): 62 | """ 63 | This method predicts the final result, building a rank for each user in the train set. 64 | 65 | """ 66 | 67 | for user in set(self.users): 68 | predictions = list() 69 | 70 | for item in self.train_set['items_unobserved'].get(user, []): 71 | 72 | if self.as_binary: 73 | predictions.append((user, item, len(self.train_set['users_viewed_item'][item]))) 74 | else: 75 | count_value = 0 76 | for user_v in self.train_set['users_viewed_item'][item]: 77 | count_value += self.train_set['feedback'][user_v][item] 78 | predictions.append((user, item, count_value)) 79 | 80 | predictions = sorted(predictions, key=lambda x: -x[2]) 81 | self.ranking += predictions[:self.rank_length] 82 | 83 | def compute(self, verbose=True, metrics=None, verbose_evaluation=True, as_table=False, table_sep='\t'): 84 | """ 85 | Extends compute method from BaseItemRecommendation. Method to run the recommender algorithm 86 | 87 | :param verbose: Print recommender and database information 88 | :type verbose: bool, default True 89 | 90 | :param metrics: List of evaluation measures 91 | :type metrics: list, default None 92 | 93 | :param verbose_evaluation: Print the evaluation results 94 | :type verbose_evaluation: bool, default True 95 | 96 | :param as_table: Print the evaluation results as table 97 | :type as_table: bool, default False 98 | 99 | :param table_sep: Delimiter for print results (only work with verbose=True and as_table=True) 100 | :type table_sep: str, default '\t' 101 | 102 | """ 103 | 104 | super(MostPopular, self).compute(verbose=verbose) 105 | 106 | if verbose: 107 | print("prediction_time:: %4f sec" % timed(self.predict)) 108 | print('\n') 109 | 110 | else: 111 | self.predict() 112 | 113 | self.write_ranking() 114 | 115 | if self.test_file is not None: 116 | self.evaluate(metrics, verbose_evaluation, as_table=as_table, table_sep=table_sep) 117 | -------------------------------------------------------------------------------- /caserec/recommenders/item_recommendation/paco_recommender.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | PaCo Recommender Algorithm 4 | [Co-Clustering Algorithm] 5 | 6 | Literature: 7 | Michail Vlachos, Francesco Fusco, Charalambos Mavroforakis, Anastasios Kyrillidis, and 8 | Vassilios G. Vassiliadis: 9 | Improving Co-Cluster Quality with Application to Product Recommendations. 2014.
10 | http://dl.acm.org/citation.cfm?id=2661980 11 | 12 | """ 13 | 14 | from collections import defaultdict 15 | import numpy as np 16 | 17 | from caserec.clustering.paco import PaCo 18 | from caserec.evaluation.item_recommendation import ItemRecommendationEvaluation 19 | from caserec.utils.process_data import ReadFile 20 | 21 | __author__ = 'Arthur Fortes ' 22 | 23 | 24 | class PaCoRecommender(object): 25 | def __init__(self, train_file, test_file=None, output_file=None, k_row=None, l_col=None, 26 | density_low=0.01, as_binary=True, min_density=0.3): 27 | 28 | """ 29 | PaCo for Item Recommendation 30 | 31 | This algorithm predicts a rank for each user using a co-clustering algorithm 32 | 33 | Usage:: 34 | 35 | >> PaCoRecommender(train, test).compute() 36 | >> PaCoRecommender(train, test, as_binary=True).compute() 37 | 38 | :param train_file: File which contains the train set. This file needs to have at least 3 columns 39 | (user item feedback_value). 40 | :type train_file: str 41 | 42 | :param test_file: File which contains the test set. This file needs to have at least 3 columns 43 | (user item feedback_value). 44 | :type test_file: str, default None 45 | 46 | :param output_file: File with dir to write the final predictions 47 | :type output_file: str, default None 48 | 49 | :param k_row: Number of clusters generated by k-means in rows 50 | :type k_row: int, default None 51 | 52 | :param l_col: Number of clusters generated by k-means in columns 53 | :type l_col: int, default None 54 | 55 | :param density_low: Threshold to change the density matrix values 56 | :type density_low: float, default 0.01 57 | 58 | :param as_binary: If True, the explicit feedback will be transformed to binary 59 | :type as_binary: bool, default True 60 | 61 | :param min_density: Considers bi-clusters down to this minimum density 62 | :type min_density: float, default 0.3 63 | 64 | """ 65 | self.recommender_name = 'PaCo Recommender Algorithm' 66 | 67 | self.train_file = train_file 68 | self.test_file = test_file 69 | if test_file is not None: 70 | self.test_set = ReadFile(test_file).read() 71 | self.train_set = ReadFile(train_file, as_binary=as_binary).read() 72 | self.output_file = output_file 73 | self.k_row = k_row 74 | self.l_col = l_col 75 | self.density_low = density_low 76 | self.min_density = min_density 77 | 78 | self.users = self.train_set['users'] 79 | self.items = self.train_set['items'] 80 | 81 | self.item_to_item_id = {} 82 | self.item_id_to_item = {} 83 | self.user_to_user_id = {} 84 | self.user_id_to_user = {} 85 | 86 | for i, item in enumerate(self.items): 87 | self.item_to_item_id.update({item: i}) 88 | self.item_id_to_item.update({i: item}) 89 | for u, user in enumerate(self.users): 90 | self.user_to_user_id.update({user: u}) 91 | self.user_id_to_user.update({u: user}) 92 | 93 | self.predictions = [] 94 | self.uns_items = defaultdict() 95 | self.co_clustering = None 96 | 97 | def run_co_clustering(self): 98 | self.co_clustering = PaCo(self.train_file, k_row=self.k_row, l_col=self.l_col, density_low=self.density_low) 99 | self.co_clustering.fit() 100 | if len(self.co_clustering.density) == 1: 101 | raise ValueError('Error: Co-clustering generated only 1 bi-cluster!') 102 | 103 | def recommender(self): 104 | for n, k in enumerate(self.co_clustering.list_row): 105 | cols = self.co_clustering.density[n].argsort() 106 | cols = np.array(cols).ravel()[::-1] 107 | 108 | for u_idx in k: 109 | user = self.user_id_to_user[u_idx] 110 | unseen_items = set() 111 | for l in cols: 112 | if self.co_clustering.density[n,
l] != 0 and self.co_clustering.density[n, l] != 1 and \ 113 | self.co_clustering.density[n, l] >= self.min_density: 114 | for i_idx in self.co_clustering.list_col[l]: 115 | item = self.item_id_to_item[i_idx] 116 | if self.train_set['feedback'][user].get(item, -1) == -1: 117 | unseen_items.add(item) 118 | 119 | self.uns_items[user] = unseen_items 120 | 121 | for user in self.train_set['users']: 122 | ranking = [] 123 | for item in self.uns_items[user]: 124 | rui = len(self.train_set['users_viewed_item'][item]) 125 | ranking.append((user, item, rui)) 126 | self.predictions += sorted(ranking, key=lambda x: -x[2])[:10] 127 | 128 | if self.output_file is not None: 129 | with open(self.output_file, 'w') as fw: 130 | for sample in self.predictions: 131 | fw.write("%d\t%d\t%f\n" % (sample[0], sample[1], sample[2])) 132 | 133 | def compute(self, verbose=True, metrics=list(['PREC', 'RECALL', 'MAP', 'NDCG']), verbose_evaluation=True, 134 | as_table=False, table_sep='\t'): 135 | 136 | if verbose: 137 | print("[Case Recommender: Item Recommendation > %s]\n" % self.recommender_name) 138 | 139 | self.run_co_clustering() 140 | self.recommender() 141 | 142 | if self.test_file is not None: 143 | ItemRecommendationEvaluation(metrics=metrics, as_table=as_table, table_sep=table_sep, 144 | verbose=verbose_evaluation).evaluate_recommender(self.predictions, 145 | self.test_set) 146 | -------------------------------------------------------------------------------- /caserec/recommenders/item_recommendation/random_rec.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """" 3 | Random Collaborative Filtering Recommender 4 | [Item Recommendation (Ranking)] 5 | 6 | Random predicts a user's ranking based on random scores. 7 | 8 | """ 9 | 10 | # © 2019. Case Recommender (MIT License) 11 | 12 | import random 13 | 14 | from caserec.recommenders.item_recommendation.base_item_recommendation import BaseItemRecommendation 15 | from caserec.utils.extra_functions import timed 16 | 17 | __author__ = 'Arthur Fortes ' 18 | 19 | 20 | class RandomRec(BaseItemRecommendation): 21 | def __init__(self, train_file=None, test_file=None, output_file=None, rank_length=10, sep='\t', output_sep='\t'): 22 | """ 23 | Random Recommender for Item Recommendation 24 | 25 | This algorithm predicts a random rank for each user 26 | 27 | Usage:: 28 | 29 | >> RandomRec(train).compute() 30 | >> RandomRec(train, test, ranking).compute() 31 | 32 | :param train_file: File which contains the train set. This file needs to have at least 3 columns 33 | (user item feedback_value). 34 | :type train_file: str 35 | 36 | :param test_file: File which contains the test set. This file needs to have at least 3 columns 37 | (user item feedback_value).
38 | :type test_file: str, default None 39 | 40 | :param output_file: File with dir to write the final predictions 41 | :type output_file: str, default None 42 | 43 | :param rank_length: Size of the rank that must be generated by the predictions of the recommender algorithm 44 | :type rank_length: int, default 10 45 | 46 | :param sep: Delimiter for input files 47 | :type sep: str, default '\t' 48 | 49 | :param output_sep: Delimiter for output file 50 | :type output_sep: str, default '\t' 51 | 52 | """ 53 | 54 | super(RandomRec, self).__init__(train_file=train_file, test_file=test_file, output_file=output_file, 55 | rank_length=rank_length, sep=sep, output_sep=output_sep) 56 | 57 | self.recommender_name = 'Random Recommender' 58 | 59 | def predict(self): 60 | """ 61 | Method to predict a rank for each user. 62 | 63 | For each (user, item) pair outside the train set, predict a random score. 64 | 65 | """ 66 | 67 | for user in set(self.users): 68 | predictions = list() 69 | for item in self.train_set['items_unobserved'].get(user, []): 70 | predictions.append((user, item, random.uniform(0, 1))) 71 | predictions = sorted(predictions, key=lambda x: -x[2]) 72 | self.ranking += predictions[:self.rank_length] 73 | 74 | def compute(self, verbose=True, metrics=None, verbose_evaluation=True, as_table=False, table_sep='\t'): 75 | """ 76 | Extends compute method from BaseItemRecommendation. Method to run the recommender algorithm 77 | 78 | :param verbose: Print recommender and database information 79 | :type verbose: bool, default True 80 | 81 | :param metrics: List of evaluation metrics 82 | :type metrics: list, default None 83 | 84 | :param verbose_evaluation: Print the evaluation results 85 | :type verbose_evaluation: bool, default True 86 | 87 | :param as_table: Print the evaluation results as table 88 | :type as_table: bool, default False 89 | 90 | :param table_sep: Delimiter for print results (only work with verbose=True and as_table=True) 91 | :type table_sep: str, default '\t' 92 | 93 | """ 94 | 95 | super(RandomRec, self).compute(verbose=verbose) 96 | 97 | if verbose: 98 | print("prediction_time:: %4f sec" % timed(self.predict)) 99 | print('\n') 100 | 101 | else: 102 | self.predict() 103 | 104 | self.write_ranking() 105 | 106 | if self.test_file is not None: 107 | self.evaluate(metrics, verbose_evaluation, as_table=as_table, table_sep=table_sep) 108 | -------------------------------------------------------------------------------- /caserec/recommenders/item_recommendation/user_attribute_knn.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """" 3 | User Based Collaborative Filtering Recommender with Attributes (User Attribute KNN) 4 | [Item Recommendation (Ranking)] 5 | 6 | User-Attribute-kNN predicts a user's ranking according to how similar users rated the same item. The algorithm 7 | matches similar users based on the similarity of their attribute scores. However, instead of traditional UserKNN, 8 | this approach uses a pre-computed similarity matrix based on metadata. 9 | 10 | 11 | """ 12 | 13 | # © 2019. Case Recommender (MIT License)
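# Illustrative metadata_file layout for this recommender (tab-separated; the user
# ids and attribute labels below are hypothetical):
#     1	age_25
#     1	gender_male
#     2	age_40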
14 | 15 | import numpy as np 16 | 17 | from caserec.recommenders.item_recommendation.userknn import UserKNN 18 | from caserec.utils.process_data import ReadFile 19 | 20 | __author__ = 'Arthur Fortes ' 21 | 22 | 23 | class UserAttributeKNN(UserKNN): 24 | def __init__(self, train_file=None, test_file=None, output_file=None, metadata_file=None, similarity_file=None, 25 | k_neighbors=30, rank_length=10, as_binary=False, as_similar_first=True, metadata_as_binary=False, 26 | metadata_similarity_sep='\t', similarity_metric="cosine", sep='\t', output_sep='\t'): 27 | """ 28 | User Attribute KNN for Item Recommendation 29 | 30 | This algorithm predicts a rank for each user based on the similar items that his neighbors 31 | (similar users) consumed, using a metadata or similarity pre-computed file 32 | 33 | Usage:: 34 | 35 | >> UserAttributeKNN(train, test, similarity_file=sim_matrix, as_similar_first=True).compute() 36 | >> UserAttributeKNN(train, test, metadata_file=metadata, as_similar_first=True).compute() 37 | 38 | :param train_file: File which contains the train set. This file needs to have at least 3 columns 39 | (user item feedback_value). 40 | :type train_file: str 41 | 42 | :param test_file: File which contains the test set. This file needs to have at least 3 columns 43 | (user item feedback_value). 44 | :type test_file: str, default None 45 | 46 | :param output_file: File with dir to write the final predictions 47 | :type output_file: str, default None 48 | 49 | :param metadata_file: File which contains the metadata set. This file needs to have at least 2 columns 50 | (user metadata). 51 | :type metadata_file: str, default None 52 | 53 | :param similarity_file: File which contains the similarity set. This file needs to have at least 3 columns 54 | (user user similarity). 55 | :type similarity_file: str, default None 56 | 57 | :param k_neighbors: Number of neighbors to use. If None, k_neighbors = int(sqrt(n_users)) 58 | :type k_neighbors: int, default 30 59 | 60 | :param rank_length: Size of the rank that must be generated by the predictions of the recommender algorithm 61 | :type rank_length: int, default 10 62 | 63 | :param as_binary: If True, the explicit feedback will be transformed to binary 64 | :type as_binary: bool, default False 65 | 66 | :param as_similar_first: If True, for each unknown item, which will be predicted, we first look for its k 67 | most similar users and then take the intersection with the users that 68 | have seen that item.
69 | :type as_similar_first: bool, default True 70 | 71 | :param metadata_as_binary: If True, the explicit value will be transformed to binary 72 | :type metadata_as_binary: bool, default False 73 | 74 | :param metadata_similarity_sep: Delimiter for similarity or metadata file 75 | :type metadata_similarity_sep: str, default '\t' 76 | 77 | :param similarity_metric: Pairwise metric to compute the similarity between the users 78 | :type similarity_metric: str, default cosine 79 | 80 | :param sep: Delimiter for input files 81 | :type sep: str, default '\t' 82 | 83 | :param output_sep: Delimiter for output file 84 | :type output_sep: str, default '\t' 85 | """ 86 | super(UserAttributeKNN, self).__init__(train_file=train_file, test_file=test_file, output_file=output_file, 87 | k_neighbors=k_neighbors, rank_length=rank_length, as_binary=as_binary, 88 | as_similar_first=as_similar_first, similarity_metric=similarity_metric, 89 | sep=sep, output_sep=output_sep) 90 | 91 | self.recommender_name = 'User Attribute KNN Algorithm' 92 | 93 | self.metadata_file = metadata_file 94 | self.similarity_file = similarity_file 95 | self.metadata_as_binary = metadata_as_binary 96 | self.metadata_similarity_sep = metadata_similarity_sep 97 | 98 | def init_model(self): 99 | """ 100 | Method to fit the model. Create and calculate a similarity matrix by metadata file or a pre-computed similarity 101 | matrix 102 | 103 | """ 104 | 105 | self.users_id_viewed_item = {} 106 | 107 | for item in self.items: 108 | for user in self.train_set['users_viewed_item'].get(item, []): 109 | self.users_id_viewed_item.setdefault(item, []).append(self.user_to_user_id[user]) 110 | 111 | # Set the value for k 112 | if self.k_neighbors is None: 113 | self.k_neighbors = int(np.sqrt(len(self.users))) 114 | 115 | if self.metadata_file is not None: 116 | metadata = ReadFile(self.metadata_file, sep=self.metadata_similarity_sep, as_binary=self.metadata_as_binary 117 | ).read_metadata_or_similarity() 118 | 119 | self.matrix = np.zeros((len(self.users), len(metadata['col_2']))) 120 | 121 | meta_to_meta_id = {} 122 | 123 | for m, data in enumerate(metadata['col_2']): 124 | meta_to_meta_id[data] = m 125 | 126 | for user_m in metadata['col_1']: 127 | for m1 in metadata['dict'][user_m]: 128 | try: 129 | self.matrix[self.user_to_user_id[user_m], meta_to_meta_id[m1]] = metadata['dict'][user_m][m1] 130 | except KeyError: 131 | pass 132 | 133 | # create header info for metadata 134 | sparsity = (1 - (metadata['number_interactions'] / (len(metadata['col_1']) * len(metadata['col_2'])))) * 100 135 | 136 | self.extra_info_header = ">> metadata:: %d users and %d metadata (%d interactions) | sparsity:: %.2f%%" % \ 137 | (len(metadata['col_1']), len(metadata['col_2']), metadata['number_interactions'], 138 | sparsity) 139 | 140 | # Create similarity matrix based on metadata or similarity file 141 | self.su_matrix = self.compute_similarity(transpose=False) 142 | 143 | elif self.similarity_file is not None: 144 | similarity = ReadFile(self.similarity_file, sep=self.metadata_similarity_sep, as_binary=False 145 | ).read_metadata_or_similarity() 146 | 147 | self.su_matrix = np.zeros((len(self.users), len(self.users))) 148 | 149 | # Fill similarity matrix 150 | for u in similarity['col_1']: 151 | for u_j in similarity['dict'][u]: 152 | self.su_matrix[self.user_to_user_id[u], self.user_to_user_id[int(u_j)]] = similarity['dict'][u][u_j] 153 | 154 | # Remove NaNs 155 | self.su_matrix[np.isnan(self.su_matrix)] = 0.0 156 | 157 | else: 158 | raise ValueError("This algorithm needs a similarity matrix or a metadata file!") 159 | 160
| # Create original matrix user x item for prediction process 161 | self.create_matrix() 162 | -------------------------------------------------------------------------------- /caserec/recommenders/item_recommendation/userknn.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """" 3 | User Based Collaborative Filtering Recommender (User KNN) 4 | [Item Recommendation (Ranking)] 5 | 6 | User KNN predicts a user's ranking based on similar users' behavior. 7 | 8 | """ 9 | 10 | # © 2019. Case Recommender (MIT License) 11 | 12 | import numpy as np 13 | 14 | from caserec.recommenders.item_recommendation.base_item_recommendation import BaseItemRecommendation 15 | from caserec.utils.extra_functions import timed 16 | 17 | __author__ = 'Arthur Fortes ' 18 | 19 | 20 | class UserKNN(BaseItemRecommendation): 21 | def __init__(self, train_file=None, test_file=None, output_file=None, similarity_metric="cosine", k_neighbors=None, 22 | rank_length=10, as_binary=False, as_similar_first=True, sep='\t', output_sep='\t'): 23 | """ 24 | User KNN for Item Recommendation 25 | 26 | This algorithm predicts a rank for each user based on the similar items that his neighbors 27 | (similar users) consumed. 28 | 29 | Usage:: 30 | 31 | >> UserKNN(train, test, as_similar_first=True).compute() 32 | >> UserKNN(train, test, ranking_file, as_binary=True).compute() 33 | 34 | :param train_file: File which contains the train set. This file needs to have at least 3 columns 35 | (user item feedback_value). 36 | :type train_file: str 37 | 38 | :param test_file: File which contains the test set. This file needs to have at least 3 columns 39 | (user item feedback_value). 40 | :type test_file: str, default None 41 | 42 | :param output_file: File with dir to write the final predictions 43 | :type output_file: str, default None 44 | 45 | :param similarity_metric: Pairwise metric to compute the similarity between the users. Reference about 46 | distances: http://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.spatial.distance.pdist.html 47 | :type similarity_metric: str, default cosine 48 | 49 | :param k_neighbors: Number of neighbors to use. If None, k_neighbors = int(sqrt(n_users)) 50 | :type k_neighbors: int, default None 51 | 52 | :param rank_length: Size of the rank that must be generated by the predictions of the recommender algorithm 53 | :type rank_length: int, default 10 54 | 55 | :param as_binary: If True, the explicit feedback will be transformed to binary 56 | :type as_binary: bool, default False 57 | 58 | :param as_similar_first: If True, for each unknown item, which will be predicted, we first look for its k 59 | most similar users and then take the intersection with the users that 60 | have seen that item.
61 | :type as_similar_first: bool, default True 62 | 63 | :param sep: Delimiter for input files 64 | :type sep: str, default '\t' 65 | 66 | :param output_sep: Delimiter for output file 67 | :type output_sep: str, default '\t' 68 | 69 | """ 70 | 71 | super(UserKNN, self).__init__(train_file=train_file, test_file=test_file, output_file=output_file, 72 | as_binary=as_binary, rank_length=rank_length, similarity_metric=similarity_metric, 73 | sep=sep, output_sep=output_sep) 74 | 75 | self.recommender_name = 'UserKNN Algorithm' 76 | 77 | self.as_similar_first = as_similar_first 78 | self.k_neighbors = k_neighbors 79 | 80 | # internal vars 81 | self.su_matrix = None 82 | self.users_id_viewed_item = None 83 | 84 | def init_model(self): 85 | """ 86 | Method to initialize the model. Create and calculate a similarity matrix 87 | 88 | """ 89 | self.users_id_viewed_item = {} 90 | 91 | self.create_matrix() 92 | self.su_matrix = self.compute_similarity(transpose=False) 93 | 94 | # Set the value for k 95 | if self.k_neighbors is None: 96 | self.k_neighbors = int(np.sqrt(len(self.users))) 97 | 98 | for item in self.items: 99 | for user in self.train_set['users_viewed_item'].get(item, []): 100 | self.users_id_viewed_item.setdefault(item, []).append(self.user_to_user_id[user]) 101 | 102 | def predict(self): 103 | """ 104 | Method to predict a rank for each user. 105 | 106 | """ 107 | 108 | for u_id, user in enumerate(self.users): 109 | if len(self.train_set['feedback'].get(user, [])) != 0: 110 | u_list = list(np.flatnonzero(self.matrix[u_id] == 0)) 111 | 112 | if self.as_similar_first: 113 | self.ranking += self.predict_similar_first_scores(user, u_id, u_list) 114 | else: 115 | self.ranking += self.predict_scores(user, u_id, u_list) 116 | else: 117 | # Implement cold start user 118 | pass 119 | 120 | def predict_scores(self, user, user_id, unpredicted_items): 121 | """ 122 | Method to predict a rank for each user. In this implementation, for each unknown item, 123 | which will be predicted, we first look for users that have seen that item and calculate the similarity between 124 | them and the user. Then we sort these similarities and keep the k largest. Finally, the score of the 125 | unknown item will be the sum of the similarities. 126 | 127 | """ 128 | 129 | predictions = [] 130 | for item_id in unpredicted_items: 131 | item = self.items[item_id] 132 | sim_sum = [] 133 | for user_v in self.users_id_viewed_item.get(item, []): 134 | sim_sum.append(self.su_matrix[user_id, user_v]) 135 | sim_sum = sorted(sim_sum, reverse=True) 136 | 137 | predictions.append((user, item, sum(sim_sum[:self.k_neighbors]))) 138 | 139 | return sorted(predictions, key=lambda x: -x[2])[:self.rank_length] 140 | 141 | def predict_similar_first_scores(self, user, user_id, unpredicted_items): 142 | """ 143 | Method to predict a rank for each user. In this implementation, for each unknown item, which will be 144 | predicted, we first look for its k most similar users and then take the intersection with the users that 145 | have seen that item. Finally, the score of the unknown item will be the sum of the similarities. 146 | 147 | """ 148 | 149 | predictions = [] 150 | 151 | # Select user neighbors by sorting the user similarity vector, returning a list of indices in decreasing order of similarity 152 | neighbors = sorted(range(len(self.su_matrix[user_id])), key=lambda m: -self.su_matrix[user_id][m]) 153 | 154 | for item_id in unpredicted_items: 155 | item = self.items[item_id] 156 | # Intersection between
the neighbors closest to the user and the users who accessed the unknown item. 157 | common_users = list(set(self.users_id_viewed_item.get(item, [])). 158 | intersection(neighbors[1:self.k_neighbors])) 159 | 160 | sim_sum = 0 161 | for user_v in common_users: 162 | sim_sum += self.su_matrix[user_id, user_v] 163 | 164 | predictions.append((user, item, sim_sum)) 165 | 166 | return sorted(predictions, key=lambda x: -x[2])[:self.rank_length] 167 | 168 | def compute(self, verbose=True, metrics=None, verbose_evaluation=True, 169 | as_table=False, table_sep='\t', n_ranks=None): 170 | """ 171 | Extends compute method from BaseItemRecommendation. Method to run recommender algorithm 172 | 173 | :param verbose: Print recommender and database information 174 | :type verbose: bool, default True 175 | 176 | :param metrics: List of evaluation metrics 177 | :type metrics: list, default None 178 | 179 | :param verbose_evaluation: Print the evaluation results 180 | :type verbose_evaluation: bool, default True 181 | 182 | :param as_table: Print the evaluation results as table 183 | :type as_table: bool, default False 184 | 185 | :param table_sep: Delimiter for print results (only work with verbose=True and as_table=True) 186 | :type table_sep: str, default '\t' 187 | 188 | """ 189 | 190 | super(UserKNN, self).compute(verbose=verbose) 191 | 192 | if verbose: 193 | print("training_time:: %4f sec" % timed(self.init_model)) 194 | if self.extra_info_header is not None: 195 | print(self.extra_info_header) 196 | print("prediction_time:: %4f sec" % timed(self.predict)) 197 | 198 | print('\n') 199 | 200 | else: 201 | # Execute all in silence without prints 202 | self.extra_info_header = None 203 | self.init_model() 204 | self.predict() 205 | 206 | self.write_ranking() 207 | 208 | if self.test_file is not None: 209 | self.evaluate(metrics, verbose_evaluation, as_table=as_table, table_sep=table_sep, n_ranks=n_ranks) 210 | -------------------------------------------------------------------------------- /caserec/recommenders/rating_prediction/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Arthur' 2 | -------------------------------------------------------------------------------- /caserec/recommenders/rating_prediction/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/caserec/CaseRecommender/779bc5dd91ff704ce60c0f3fafd07e2eba689dd7/caserec/recommenders/rating_prediction/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /caserec/recommenders/rating_prediction/__pycache__/base_rating_prediction.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/caserec/CaseRecommender/779bc5dd91ff704ce60c0f3fafd07e2eba689dd7/caserec/recommenders/rating_prediction/__pycache__/base_rating_prediction.cpython-37.pyc -------------------------------------------------------------------------------- /caserec/recommenders/rating_prediction/__pycache__/nnmf.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/caserec/CaseRecommender/779bc5dd91ff704ce60c0f3fafd07e2eba689dd7/caserec/recommenders/rating_prediction/__pycache__/nnmf.cpython-37.pyc -------------------------------------------------------------------------------- 
/caserec/recommenders/rating_prediction/base_knn.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """ 3 | This file is base for neighborhood-based algorithms 4 | 5 | Used by: ItemKNN, Item Attribute KNN, UserKNN and User Attribute KNN 6 | 7 | """ 8 | 9 | # © 2019. Case Recommender (MIT License) 10 | 11 | from caserec.recommenders.rating_prediction.base_rating_prediction import BaseRatingPrediction 12 | 13 | __author__ = 'Arthur Fortes ' 14 | 15 | 16 | class BaseKNN(BaseRatingPrediction): 17 | def __init__(self, train_file, test_file, output_file=None, reg_bi=10, reg_bu=15, similarity_metric='cosine', 18 | sep='\t', output_sep='\t'): 19 | """ 20 | This class is base for all neighborhood-based algorithms. 21 | 22 | :param train_file: File which contains the train set. This file needs to have at least 3 columns 23 | (user item feedback_value). 24 | :type train_file: str 25 | 26 | :param test_file: File which contains the test set. This file needs to have at least 3 columns 27 | (user item feedback_value). 28 | :type test_file: str, default None 29 | 30 | :param output_file: File with dir to write the final predictions 31 | :type output_file: str, default None 32 | 33 | :param reg_bi: Regularization factor for items 34 | :type reg_bi: int, default 10 35 | 36 | :param reg_bu: Regularization factor for users 37 | :type reg_bu: int, default 15 38 | 39 | :param similarity_metric: Pairwise metric to compute the similarity between users or items 40 | :type similarity_metric: str, default cosine 41 | 42 | :param sep: Delimiter for input files 43 | :type sep: str, default '\t' 44 | 45 | :param output_sep: Delimiter for output file 46 | :type output_sep: str, default '\t' 47 | 48 | """ 49 | super(BaseKNN, self).__init__(train_file=train_file, test_file=test_file, output_file=output_file, 50 | similarity_metric=similarity_metric, sep=sep, output_sep=output_sep) 51 | 52 | self.reg_bi = reg_bi 53 | self.reg_bu = reg_bu 54 | 55 | # internal vars 56 | self.number_users = None 57 | self.number_items = None 58 | self.bu = {} 59 | self.bi = {} 60 | self.bui = {} 61 | 62 | def init_model(self): 63 | """ 64 | Method to treat and initialize the model.
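As a sketch with illustrative numbers for the baselines trained later by train_baselines: bui = mean_value + bu + bi, so with a global mean of 3.5, bu[user] = 0.2 and bi[item] = -0.1, the baseline prediction is bui[user][item] = 3.6.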
Create a matrix user x item 65 | 66 | """ 67 | 68 | self.number_users = len(self.users) 69 | self.number_items = len(self.items) 70 | 71 | self.create_matrix() 72 | 73 | def train_baselines(self): 74 | """ 75 | Method to train baselines for each pair user, item 76 | 77 | """ 78 | 79 | self.bu = {} 80 | self.bi = {} 81 | self.bui = {} 82 | 83 | for i in range(10): 84 | self.compute_bi() 85 | self.compute_bu() 86 | self.compute_bui() 87 | 88 | def compute_bi(self): 89 | """ 90 | Method to compute bi values 91 | 92 | bi = (rui - mi - bu) / (regBi + number of interactions) 93 | 94 | """ 95 | 96 | self.bi = dict() 97 | 98 | for item in self.items: 99 | count = 0 100 | 101 | for user in self.train_set['users_viewed_item'].get(item, []): 102 | self.bi[item] = self.bi.get(item, 0) + float(self.train_set['feedback'][user].get(item, 0)) - \ 103 | self.train_set['mean_value'] - self.bu.get(user, 0) 104 | count += 1 105 | 106 | if count > 1: 107 | self.bi[item] = float(self.bi[item]) / float(self.reg_bi + count) 108 | elif count == 0: 109 | self.bi[item] = self.train_set['mean_value'] 110 | 111 | def compute_bu(self): 112 | """ 113 | Method to compute bu values 114 | 115 | bu = (rui - mi - bi) / (regBu + number of interactions) 116 | 117 | """ 118 | 119 | self.bu = dict() 120 | for user in self.users: 121 | count = 0 122 | 123 | for item in self.train_set['items_seen_by_user'].get(user, []): 124 | self.bu[user] = self.bu.get(user, 0) + float(self.train_set['feedback'][user].get(item, 0)) - \ 125 | self.train_set['mean_value'] - self.bi.get(item, 0) 126 | count += 1 127 | 128 | if count > 1: 129 | self.bu[user] = float(self.bu[user]) / float(self.reg_bu + count) 130 | elif count == 0: 131 | self.bu[user] = self.train_set['mean_value'] 132 | 133 | def compute_bui(self): 134 | """ 135 | Method to compute bui values 136 | 137 | bui = mi + bu + bi 138 | """ 139 | 140 | for user in self.users: 141 | for item in self.items: 142 | self.bui.setdefault(user, {}).update( 143 | {item: self.train_set['mean_value'] + self.bu.get(user, 0) + self.bi.get(item, 0)}) 144 | 145 | del self.bu 146 | del self.bi 147 | -------------------------------------------------------------------------------- /caserec/recommenders/rating_prediction/base_nsvd1.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """ 3 | This class is base for NSVD1 algorithms. 4 | 5 | Used by: ItemNSVD1, and UserNSVD1 6 | 7 | Literature: 8 | István Pilászy and Domonkos Tikk: 9 | Recommending new movies: even a few ratings are more valuable than metadata 10 | RecSys 2009 11 | https://dl.acm.org/citation.cfm?id=1639731 12 | 13 | """ 14 | 15 | # © 2019. Case Recommender (MIT License) 16 | 17 | import numpy as np 18 | 19 | from caserec.recommenders.rating_prediction.base_rating_prediction import BaseRatingPrediction 20 | 21 | __author__ = 'Arthur Fortes ' 22 | 23 | 24 | class BaseNSVD1(BaseRatingPrediction): 25 | def __init__(self, train_file, test_file, output_file=None, factors=10, init_mean=0, init_stdev=0.1, 26 | sep='\t', output_sep='\t', random_seed=None): 27 | """ 28 | This class is base for all NSVD1 algorithms. 29 | 30 | :param train_file: File which contains the train set. This file needs to have at least 3 columns 31 | (user item feedback_value). 32 | :type train_file: str 33 | 34 | :param test_file: File which contains the test set. This file needs to have at least 3 columns 35 | (user item feedback_value). 
36 | :type test_file: str, default None 37 | 38 | :param output_file: File with dir to write the final predictions 39 | :type output_file: str, default None 40 | 41 | :param factors: Number of latent factors per user/item 42 | :type factors: int, default 10 43 | 44 | :param init_mean: Mean of the normal distribution used to initialize the latent factors 45 | :type init_mean: float, default 0 46 | 47 | :param init_stdev: Standard deviation of the normal distribution used to initialize the latent factors 48 | :type init_stdev: float, default 0.1 49 | 50 | :param sep: Delimiter for input files 51 | :type sep: str, default '\t' 52 | 53 | :param output_sep: Delimiter for output file 54 | :type output_sep: str, default '\t' 55 | 56 | :param random_seed: Number of seed. Lock random numbers for reproducibility of experiments. 57 | :type random_seed: int, default None 58 | 59 | """ 60 | super(BaseNSVD1, self).__init__(train_file=train_file, test_file=test_file, output_file=output_file, sep=sep, 61 | output_sep=output_sep) 62 | 63 | self.factors = factors 64 | self.init_mean = init_mean 65 | self.init_stdev = init_stdev 66 | 67 | if random_seed is not None: 68 | np.random.seed(random_seed) 69 | 70 | # internal vars 71 | self.number_users = len(self.users) 72 | self.number_items = len(self.items) 73 | self.item_to_item_id = {} 74 | self.item_id_to_item = {} 75 | self.user_to_user_id = {} 76 | self.user_id_to_user = {} 77 | self.x = None 78 | self.p = None 79 | self.q = None 80 | self.w = None 81 | self.b = None 82 | self.c = None 83 | self.metadata = None 84 | self.number_metadata = None 85 | 86 | self.last_rmse = 0 87 | self.predictions = [] 88 | 89 | def init_model(self): 90 | """ 91 | Method to treat and initialize the model 92 | 93 | """ 94 | 95 | # Map items and users with their respective ids and update unobserved items with test set samples 96 | for i, item in enumerate(self.items): 97 | self.item_to_item_id.update({item: i}) 98 | self.item_id_to_item.update({i: item}) 99 | for u, user in enumerate(self.users): 100 | self.user_to_user_id.update({user: u}) 101 | self.user_id_to_user.update({u: user}) 102 | 103 | def create_factors(self): 104 | self.b = np.random.normal(self.init_mean, self.init_stdev, self.number_users) 105 | self.c = np.random.normal(self.init_mean, self.init_stdev, self.number_items) 106 | self.p = np.random.normal(self.init_mean, self.init_stdev, (self.number_users, self.factors)) 107 | self.q = np.random.normal(self.init_mean, self.init_stdev, (self.number_items, self.factors)) 108 | self.w = np.random.normal(self.init_mean, self.init_stdev, (self.number_metadata, self.factors)) 109 | 110 | def _predict(self, user, item, cond=True): 111 | rui = self.b[user] + self.c[item] + np.dot(self.p[user], self.q[item]) 112 | 113 | if cond: 114 | if rui > self.train_set["max_value"]: 115 | rui = self.train_set["max_value"] 116 | if rui < self.train_set["min_value"]: 117 | rui = self.train_set["min_value"] 118 | 119 | return rui 120 | 121 | def predict(self): 122 | """ 123 | This method computes a final rating for unknown pairs (user, item) 124 | 125 | """ 126 | 127 | if self.test_file is not None: 128 | for user in self.test_set['users']: 129 | for item in self.test_set['feedback'][user]: 130 | rui = self._predict(self.user_to_user_id[user], self.item_to_item_id[item]) 131 | self.predictions.append((user, item, rui)) 132 | else: 133 | raise NotImplementedError 134 | --------------------------------------------------------------------------------
/caserec/recommenders/rating_prediction/base_rating_prediction.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """" 3 | This class is base for rating prediction algorithms. 4 | 5 | """ 6 | 7 | # © 2019. Case Recommender (MIT License) 8 | 9 | from scipy.spatial.distance import squareform, pdist 10 | import numpy as np 11 | 12 | 13 | from caserec.evaluation.rating_prediction import RatingPredictionEvaluation 14 | from caserec.utils.extra_functions import print_header 15 | from caserec.utils.process_data import ReadFile, WriteFile 16 | 17 | __author__ = 'Arthur Fortes ' 18 | 19 | 20 | class BaseRatingPrediction(object): 21 | def __init__(self, train_file, test_file, output_file=None, similarity_metric='cosine', sep='\t', 22 | output_sep='\t'): 23 | """ 24 | This class is base for all rating prediction algorithms. It implements and adds 25 | common methods and attributes shared by rating prediction approaches. 26 | 27 | :param train_file: File which contains the train set. This file needs to have at least 3 columns 28 | (user item feedback_value). 29 | :type train_file: str 30 | 31 | :param test_file: File which contains the test set. This file needs to have at least 3 columns 32 | (user item feedback_value). 33 | :type test_file: str, default None 34 | 35 | :param output_file: File with dir to write the final predictions 36 | :type output_file: str, default None 37 | 38 | :param similarity_metric: Pairwise metric to compute the similarity between users or items 39 | :type similarity_metric: str, default cosine 40 | 41 | :param sep: Delimiter for input files 42 | :type sep: str, default '\t' 43 | 44 | :param output_sep: Delimiter for output file 45 | :type output_sep: str, default '\t' 46 | 47 | """ 48 | 49 | self.train_file = train_file 50 | self.test_file = test_file 51 | self.similarity_metric = similarity_metric 52 | self.output_file = output_file 53 | self.sep = sep 54 | self.output_sep = output_sep 55 | 56 | # internal vars 57 | self.item_to_item_id = {} 58 | self.item_id_to_item = {} 59 | self.user_to_user_id = {} 60 | self.user_id_to_user = {} 61 | self.train_set = None 62 | self.test_set = None 63 | self.users = None 64 | self.items = None 65 | self.matrix = None 66 | self.evaluation_results = None 67 | self.recommender_name = None 68 | self.extra_info_header = None 69 | self.predictions = [] 70 | 71 | def read_files(self): 72 | """ 73 | Method to initialize recommender algorithm.
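The input files are plain text with one (user, item, feedback_value) triple per line, tab-separated by default; the values below are illustrative:

    1	10	4.0
    1	20	3.5
    2	10	5.0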
74 | 75 | """ 76 | 77 | # Getting train_set as a dict_file = {'feedback': dict_feedback, 'users': list_users, 'items': list_items, 78 | # 'sparsity': sparsity, 'number_interactions': number_interactions, 'users_viewed_item': users_viewed_item, 'items_unobserved': items_unobserved, 79 | # 'items_seen_by_user': items_seen_by_user, 'mean_value': mean_value, 'max_value': max(list_feedback), 'min_value': min(list_feedback)} 80 | self.train_set = ReadFile(self.train_file, sep=self.sep).read() 81 | 82 | if self.test_file is not None: 83 | self.test_set = ReadFile(self.test_file, sep=self.sep).read() 84 | 85 | # Combining users/items from train and test set 86 | self.users = sorted(set(list(self.train_set['users']) + list(self.test_set['users']))) 87 | self.items = sorted(set(list(self.train_set['items']) + list(self.test_set['items']))) 88 | else: 89 | self.users = self.train_set['users'] 90 | self.items = self.train_set['items'] 91 | 92 | for i, item in enumerate(self.items): 93 | self.item_to_item_id.update({item: i}) 94 | self.item_id_to_item.update({i: item}) 95 | for u, user in enumerate(self.users): 96 | self.user_to_user_id.update({user: u}) 97 | self.user_id_to_user.update({u: user}) 98 | 99 | def create_matrix(self): 100 | """ 101 | Method to create a feedback matrix having users as rows and items as columns 102 | 103 | """ 104 | 105 | self.matrix = np.zeros((len(self.users), len(self.items))) 106 | 107 | for user in self.train_set['users']: 108 | for item in self.train_set['feedback'][user]: 109 | self.matrix[self.user_to_user_id[user]][self.item_to_item_id[item]] = \ 110 | self.train_set['feedback'][user][item] 111 | 112 | def compute_similarity(self, transpose=False): 113 | """ 114 | Method to compute a similarity matrix from original df_matrix 115 | 116 | :param transpose: If True, calculate the similarity in a transpose matrix 117 | :type transpose: bool, default False 118 | 119 | """ 120 | 121 | # Calculate distance matrix 122 | if transpose: 123 | similarity_matrix = np.float32(squareform(pdist(self.matrix.T, self.similarity_metric))) 124 | else: 125 | similarity_matrix = np.float32(squareform(pdist(self.matrix, self.similarity_metric))) 126 | 127 | # Remove NaNs 128 | similarity_matrix[np.isnan(similarity_matrix)] = 1.0 129 | # transform distances in similarities. 
Values in matrix range from 0-1 130 | similarity_matrix = (similarity_matrix.max() - similarity_matrix) / similarity_matrix.max() 131 | 132 | return similarity_matrix 133 | 134 | def evaluate(self, metrics, verbose=True, as_table=False, table_sep='\t'): 135 | """ 136 | Method to evaluate the final ranking 137 | 138 | :param metrics: List of evaluation metrics 139 | :type metrics: list, default ('MAE', 'RMSE') 140 | 141 | :param verbose: Print the evaluation results 142 | :type verbose: bool, default True 143 | 144 | :param as_table: Print the evaluation results as table 145 | :type as_table: bool, default False 146 | 147 | :param table_sep: Delimiter for print results (only work with verbose=True and as_table=True) 148 | :type table_sep: str, default '\t' 149 | 150 | """ 151 | 152 | self.evaluation_results = {} 153 | 154 | if metrics is None: 155 | metrics = list(['MAE', 'RMSE']) 156 | 157 | results = RatingPredictionEvaluation(verbose=verbose, as_table=as_table, table_sep=table_sep, metrics=metrics 158 | ).evaluate_recommender(predictions=self.predictions, 159 | test_set=self.test_set) 160 | 161 | for metric in metrics: 162 | self.evaluation_results[metric.upper()] = results[metric.upper()] 163 | 164 | def write_predictions(self): 165 | """ 166 | Method to write final ranking 167 | 168 | """ 169 | 170 | if self.output_file is not None: 171 | WriteFile(self.output_file, data=self.predictions, sep=self.sep).write() 172 | 173 | def compute(self, verbose=True): 174 | """ 175 | Method to run the recommender algorithm 176 | 177 | :param verbose: Print the information about recommender 178 | :type verbose: bool, default True 179 | 180 | """ 181 | 182 | # read files 183 | self.read_files() 184 | 185 | # initialize empty predictions (Don't remove: important to Cross Validation) 186 | self.predictions = [] 187 | 188 | if verbose: 189 | test_info = None 190 | 191 | main_info = { 192 | 'title': 'Rating Prediction > ' + self.recommender_name, 193 | 'n_users': len(self.train_set['users']), 194 | 'n_items': len(self.train_set['items']), 195 | 'n_interactions': self.train_set['number_interactions'], 196 | 'sparsity': self.train_set['sparsity'] 197 | } 198 | 199 | if self.test_file is not None: 200 | test_info = { 201 | 'n_users': len(self.test_set['users']), 202 | 'n_items': len(self.test_set['items']), 203 | 'n_interactions': self.test_set['number_interactions'], 204 | 'sparsity': self.test_set['sparsity'] 205 | } 206 | 207 | print_header(main_info, test_info) 208 | -------------------------------------------------------------------------------- /caserec/recommenders/rating_prediction/item_attribute_knn.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """" 3 | Item Based Collaborative Filtering Recommender with Attributes (Item Attribute KNN) 4 | [Rating Prediction] 5 | 6 | Its philosophy is as follows: in order to determine the rating of User u on item m, we can find other movies that 7 | are similar to item m, and based on User u’s ratings on those similar movies we infer his rating on item m. 8 | However, instead of traditional ItemKNN, this approach uses a metadata or pre-computed similarity matrix. 9 | 10 | """ 11 | 12 | # © 2019. 
Case Recommender (MIT License) 13 | 14 | from collections import defaultdict 15 | import numpy as np 16 | 17 | from caserec.recommenders.rating_prediction.itemknn import ItemKNN 18 | from caserec.utils.process_data import ReadFile 19 | 20 | __author__ = 'Arthur Fortes ' 21 | 22 | 23 | class ItemAttributeKNN(ItemKNN): 24 | def __init__(self, train_file=None, test_file=None, output_file=None, metadata_file=None, similarity_file=None, 25 | k_neighbors=30, as_similar_first=True, metadata_as_binary=False, metadata_similarity_sep='\t', 26 | similarity_metric="cosine", sep='\t', output_sep='\t'): 27 | """ 28 | Item Attribute KNN for Rating Prediction 29 | 30 | This algorithm predicts a rating for each pair (user, item) based on the similar items that the user consumed, 31 | using a metadata or pre-computed similarity file 32 | 33 | Usage:: 34 | 35 | >> ItemAttributeKNN(train, test, similarity_file=sim_matrix, as_similar_first=True).compute() 36 | >> ItemAttributeKNN(train, test, metadata_file=metadata, as_similar_first=False).compute() 37 | 38 | :param train_file: File which contains the train set. This file needs to have at least 3 columns 39 | (user item feedback_value). 40 | :type train_file: str, default None 41 | 42 | :param test_file: File which contains the test set. This file needs to have at least 3 columns 43 | (user item feedback_value). 44 | :type test_file: str, default None 45 | 46 | :param output_file: File with dir to write the final predictions 47 | :type output_file: str, default None 48 | 49 | :param metadata_file: File which contains the metadata set. This file needs to have at least 2 columns 50 | (item metadata). 51 | :type metadata_file: str, default None 52 | 53 | :param similarity_file: File which contains the similarity set. This file needs to have at least 3 columns 54 | (item item similarity). 55 | :type similarity_file: str, default None 56 | 57 | :param k_neighbors: Number of neighbors to use. If None, k_neighbors = int(sqrt(n_items)) 58 | :type k_neighbors: int, default 30 59 | 60 | :param as_similar_first: If True, for each unknown item, which will be predicted, we first look for its k 61 | most similar items and then take the intersection with the items that 62 | the user has seen. 63 | :type as_similar_first: bool, default True 64 | 65 | :param metadata_as_binary: If True, the explicit value will be transformed to binary 66 | :type metadata_as_binary: bool, default False 67 | 68 | :param metadata_similarity_sep: Delimiter for similarity or metadata file 69 | :type metadata_similarity_sep: str, default '\t' 70 | 71 | :param similarity_metric: Pairwise metric to compute the similarity between the items.
Reference about 72 | distances: http://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.spatial.distance.pdist.html 73 | :type similarity_metric: str, default cosine 74 | 75 | :param sep: Delimiter for input files file 76 | :type sep: str, default '\t' 77 | 78 | :param output_sep: Delimiter for output file 79 | :type output_sep: str, default '\t' 80 | 81 | """ 82 | 83 | super(ItemAttributeKNN, self).__init__(train_file=train_file, test_file=test_file, output_file=output_file, 84 | k_neighbors=k_neighbors, as_similar_first=as_similar_first, sep=sep, 85 | output_sep=output_sep, similarity_metric=similarity_metric) 86 | 87 | self.recommender_name = 'Item Attribute KNN Algorithm' 88 | 89 | self.metadata_file = metadata_file 90 | self.similarity_file = similarity_file 91 | self.metadata_as_binary = metadata_as_binary 92 | self.metadata_similarity_sep = metadata_similarity_sep 93 | 94 | def init_model(self): 95 | """ 96 | Method to fit the model. Create and calculate a similarity matrix by metadata file or a pre-computed similarity 97 | matrix 98 | 99 | """ 100 | 101 | self.similar_items = defaultdict(list) 102 | 103 | # Set the value for k 104 | if self.k_neighbors is None: 105 | self.k_neighbors = int(np.sqrt(len(self.items))) 106 | 107 | if self.metadata_file is not None: 108 | metadata = ReadFile(self.metadata_file, sep=self.metadata_similarity_sep, as_binary=self.metadata_as_binary 109 | ).read_metadata_or_similarity() 110 | 111 | self.matrix = np.zeros((len(self.items), len(metadata['col_2']))) 112 | 113 | meta_to_meta_id = {} 114 | for m, data in enumerate(metadata['col_2']): 115 | meta_to_meta_id[data] = m 116 | 117 | for item in metadata['col_1']: 118 | for m in metadata['dict'][item]: 119 | self.matrix[self.item_to_item_id[item], meta_to_meta_id[m]] = metadata['dict'][item][m] 120 | 121 | # create header info for metadata 122 | sparsity = (1 - (metadata['number_interactions'] / (len(metadata['col_1']) * len(metadata['col_2'])))) * 100 123 | 124 | self.extra_info_header = ">> metadata:: %d items and %d metadata (%d interactions) | sparsity:: %.2f%%" % \ 125 | (len(metadata['col_1']), len(metadata['col_2']), metadata['number_interactions'], 126 | sparsity) 127 | 128 | # Create similarity matrix based on metadata or similarity file. 
Transpose=False, because it is an 129 | # item x metadata matrix 130 | self.si_matrix = self.compute_similarity(transpose=False) 131 | 132 | elif self.similarity_file is not None: 133 | similarity = ReadFile(self.similarity_file, sep=self.metadata_similarity_sep, as_binary=False 134 | ).read_metadata_or_similarity() 135 | 136 | self.si_matrix = np.zeros((len(self.items), len(self.items))) 137 | 138 | # Fill similarity matrix 139 | for i in similarity['col_1']: 140 | for i_j in similarity['dict'][i]: 141 | self.si_matrix[self.item_to_item_id[i], self.item_to_item_id[int(i_j)]] = similarity['dict'][i][i_j] 142 | 143 | # Remove NaNs 144 | self.si_matrix[np.isnan(self.si_matrix)] = 0.0 145 | 146 | else: 147 | raise ValueError("This algorithm needs a similarity matrix or a metadata file!") 148 | 149 | # Create original matrix user x item for prediction process 150 | self.create_matrix() 151 | 152 | for i_id, item in enumerate(self.items): 153 | self.similar_items[i_id] = sorted(range(len(self.si_matrix[i_id])), 154 | key=lambda k: -self.si_matrix[i_id][k])[1:self.k_neighbors + 1] 155 | -------------------------------------------------------------------------------- /caserec/recommenders/rating_prediction/item_msmf.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """ 3 | Item-MSMF: Items Most Similar based on Matrix Factorization 4 | [Rating Prediction] 5 | 6 | Literature: 7 | 2018 Brazilian Conference on Intelligent Systems (BRACIS). 8 | Link soon. 9 | 10 | """ 11 | 12 | import numpy as np 13 | 14 | from caserec.recommenders.rating_prediction.matrixfactorization import MatrixFactorization 15 | from caserec.utils.extra_functions import timed 16 | 17 | __author__ = 'Eduardo Fressato ' 18 | 19 | 20 | class ItemMSMF(MatrixFactorization): 21 | def __init__(self, train_file=None, test_file=None, output_file=None, similarity_file=None, neighbors=20, 22 | factors=10, learn_rate=0.01, epochs=30, delta=0.015, init_mean=0.1, init_stdev=0.1, baseline=True, 23 | bias_learn_rate=0.005, delta_bias=0.002, stop_criteria=0.009, sep='\t', output_sep='\t', 24 | similarity_sep='\t', random_seed=None, verbose=True): 25 | 26 | super(ItemMSMF, self).__init__(train_file=train_file, test_file=test_file, output_file=output_file, sep=sep, 27 | learn_rate=learn_rate, factors=factors, epochs=epochs, delta=delta, 28 | init_mean=init_mean, init_stdev=init_stdev, baseline=baseline, 29 | bias_learn_rate=bias_learn_rate, delta_bias=delta_bias, 30 | stop_criteria=stop_criteria, output_sep=output_sep, random_seed=random_seed) 31 | 32 | """ 33 | Item-MSMF: Items Most Similar based on Matrix Factorization 34 | 35 | Item-MSMF is a recommender technique based on matrix factorization that incorporates item 36 | similarities calculated from metadata. It addresses the item 37 | cold-start problem through a shared latent factor vector representation of similar items, built from items 38 | that have enough interactions with users. In this way, the latent factor vectors of new items, which are not 39 | accurate in terms of rating prediction, are replaced with a weighted average of the latent factor 40 | vectors of the most similar items. 41 | 42 | Usage:: 43 | 44 | >> ItemMSMF(train, test, similarity_file, neighbors).compute() 45 | 46 | :param train_file: File which contains the train set. This file needs to have at least 3 columns 47 | (user item feedback_value).
48 | :type train_file: str 49 | 50 | :param test_file: File which contains the test set. This file needs to have at least 3 columns 51 | (user item feedback_value). 52 | :type test_file: str, default None 53 | 54 | :param output_file: File with dir to write the final predictions 55 | :type output_file: str, default None 56 | 57 | :param similarity_file: File which contains the similarity of items. This file needs to have at least 3 columns 58 | (item item similarity). 59 | :type similarity_file: str, default None 60 | 61 | :param neighbors: Number of items that replace the new item vector 62 | :type neighbors: int, default 20 63 | 64 | :param factors: Number of latent factors per user/item 65 | :type factors: int, default 10 66 | 67 | :param learn_rate: Learning rate (alpha) 68 | :type learn_rate: float, default 0.05 69 | 70 | :param epochs: Number of epochs over the training data 71 | :type epochs: int, default 30 72 | 73 | :param delta: Regularization value 74 | :type delta: float, default 0.015 75 | 76 | :param init_mean: Mean of the normal distribution used to initialize the latent factors 77 | :type init_mean: float, default 0 78 | 79 | :param init_stdev: Standard deviation of the normal distribution used to initialize the latent factors 80 | :type init_stdev: float, default 0.1 81 | 82 | :param bias_learn_rate: Learning rate for baselines 83 | :type bias_learn_rate: float, default 0.005 84 | 85 | :param delta_bias: Regularization value for baselines 86 | :type delta_bias: float, default 0.002 87 | 88 | :param stop_criteria: Difference between errors for stopping criteria 89 | :type stop_criteria: float, default 0.009 90 | 91 | :param random_seed: Number of seed. Lock random numbers for reproducibility of experiments. 92 | :type random_seed: int, default None 93 | 94 | :param verbose: Print information 95 | :type verbose: bool, default True 96 | 97 | """ 98 | 99 | self.recommender_name = 'Item-MSMF for Cold Start' 100 | self.verbose = verbose 101 | 102 | self.similarity_file = similarity_file 103 | self.similarity_sep = similarity_sep 104 | self.si_matrix = None 105 | self.new_items = set() 106 | self.k = neighbors 107 | 108 | def init_model(self): 109 | super(ItemMSMF, self).init_model() 110 | if self.verbose: 111 | print("\nread_similarity_matrix_time:: %4f sec" % timed(self.fill_similarity_matrix)) 112 | else: 113 | self.fill_similarity_matrix() 114 | 115 | def fill_similarity_matrix(self): 116 | self.si_matrix = np.zeros((len(self.items), len(self.items))) 117 | items_sim = set() 118 | 119 | with open(self.similarity_file, "r", encoding='utf-8') as infile: 120 | items = set(self.items) 121 | for line in infile: 122 | if line.strip(): 123 | inline = line.split(self.similarity_sep) 124 | item_a, item_b, sim = int(inline[0]), int(inline[1]), float(inline[2].rstrip()) 125 | 126 | if item_a in items and item_b in items: 127 | map_a = self.item_to_item_id[item_a] 128 | map_b = self.item_to_item_id[item_b] 129 | items_sim.add(item_a) 130 | items_sim.add(item_b) 131 | self.si_matrix[map_a][map_b] = sim 132 | self.si_matrix[map_b][map_a] = sim 133 | 134 | if self.verbose: 135 | print("Number of item in similarity file:", len(items_sim)) 136 | del items_sim 137 | 138 | def search_new_items(self): 139 | for i in self.test_set['items']: 140 | if i not in self.train_set['items']: 141 | self.new_items.add(i) 142 | 143 | def search_similar_items(self, item): 144 | item_index = self.item_to_item_id[item] 145 | count = 0 146 | list_items = [] 147 | list_similar = 
sorted(enumerate(self.si_matrix[item_index]), key=lambda x: -x[1]) 148 | 149 | for i, sim in list_similar: 150 | if i != item_index: 151 | if self.item_id_to_item[i] in self.train_set['items']: 152 | list_items.append((self.item_id_to_item[i], sim)) 153 | count += 1 154 | if count == self.k: 155 | return list_items 156 | return list_items  # Fallback: return whatever was found when fewer than self.k similar items exist in train 157 | def replace_vector_new_item(self): 158 | 159 | for item in self.new_items: 160 | list_items = self.search_similar_items(item) 161 | 162 | q_i = self.q[self.item_to_item_id[list_items[0][0]]].copy() * list_items[0][1] 163 | b_i = self.bi[self.item_to_item_id[list_items[0][0]]].copy() * list_items[0][1] 164 | sum_sim = list_items[0][1] 165 | 166 | for item_j, sim in list_items[1:]: 167 | q_i += self.q[self.item_to_item_id[item_j]].copy() * sim 168 | b_i += self.bi[self.item_to_item_id[item_j]].copy() * sim 169 | sum_sim += sim 170 | 171 | if sum_sim > 0: 172 | q_i = q_i / sum_sim 173 | b_i = b_i / sum_sim 174 | 175 | self.q[self.item_to_item_id[item]] = q_i.copy() 176 | if self.baseline: 177 | self.bi[self.item_to_item_id[item]] = b_i.copy() 178 | 179 | def compute(self, verbose=True, metrics=None, verbose_evaluation=True, as_table=False, table_sep='\t'): 180 | 181 | if verbose: 182 | super(MatrixFactorization, self).compute(verbose=verbose) 183 | self.init_model() 184 | 185 | print("training_time:: %4f sec" % timed(self.fit)) 186 | if self.extra_info_header is not None: 187 | print(self.extra_info_header) 188 | 189 | search_time = timed(self.search_new_items) 190 | replace_time = timed(self.replace_vector_new_item) 191 | prediction_time = timed(self.predict) 192 | 193 | print("search_new_items_time:: %4f sec" % search_time) 194 | print("vectors_replacement_time:: %4f sec" % replace_time) 195 | print("prediction_time:: %4f sec" % prediction_time) 196 | print("total_prediction_time:: %4f sec" % (search_time + replace_time + prediction_time)) 197 | print("\n") 198 | 199 | else: 200 | # Execute all in silence without prints 201 | super(MatrixFactorization, self).compute(verbose=verbose) 202 | self.init_model() 203 | self.fit() 204 | self.search_new_items() 205 | self.replace_vector_new_item() 206 | self.predict() 207 | 208 | self.write_predictions() 209 | 210 | if self.test_file is not None: 211 | return self.evaluate(metrics, verbose_evaluation, as_table=as_table, table_sep=table_sep) 212 | -------------------------------------------------------------------------------- /caserec/recommenders/rating_prediction/matrixfactorization.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """ 3 | Matrix Factorization Collaborative Filtering Recommender 4 | [Rating Prediction] 5 | 6 | Literature: 7 | Koren, Yehuda and Bell, Robert and Volinsky, Chris: 8 | Matrix Factorization Techniques for Recommender Systems 9 | IEEE Computer, 2009. 10 | http://dl.acm.org/citation.cfm?id=1608614 11 | 12 | """ 13 | 14 | # © 2019.
Case Recommender (MIT License) 15 | 16 | import numpy as np 17 | 18 | from caserec.recommenders.rating_prediction.base_rating_prediction import BaseRatingPrediction 19 | from caserec.utils.extra_functions import timed 20 | 21 | __author__ = 'Arthur Fortes ' 22 | 23 | 24 | class MatrixFactorization(BaseRatingPrediction): 25 | def __init__(self, train_file=None, test_file=None, output_file=None, factors=10, learn_rate=0.01, epochs=30, 26 | delta=0.015, init_mean=0.1, init_stdev=0.1, baseline=False, bias_learn_rate=0.005, delta_bias=0.002, 27 | stop_criteria=0.009, sep='\t', output_sep='\t', random_seed=None): 28 | """ 29 | Matrix Factorization for rating prediction 30 | 31 | Matrix factorization models map both users and items to a joint latent factor space of dimensionality f, 32 | such that user-item interactions are modeled as inner products in that space. 33 | 34 | Usage:: 35 | 36 | >> MatrixFactorization(train, test).compute() 37 | 38 | :param train_file: File which contains the train set. This file needs to have at least 3 columns 39 | (user item feedback_value). 40 | :type train_file: str 41 | 42 | :param test_file: File which contains the test set. This file needs to have at least 3 columns 43 | (user item feedback_value). 44 | :type test_file: str, default None 45 | 46 | :param output_file: File with dir to write the final predictions 47 | :type output_file: str, default None 48 | 49 | :param factors: Number of latent factors per user/item 50 | :type factors: int, default 10 51 | 52 | :param learn_rate: Learning rate (alpha) 53 | :type learn_rate: float, default 0.01 54 | 55 | :param epochs: Number of epochs over the training data 56 | :type epochs: int, default 30 57 | 58 | :param delta: Regularization value 59 | :type delta: float, default 0.015 60 | 61 | :param init_mean: Mean of the normal distribution used to initialize the latent factors 62 | :type init_mean: float, default 0.1 63 | 64 | :param init_stdev: Standard deviation of the normal distribution used to initialize the latent factors 65 | :type init_stdev: float, default 0.1 66 | 67 | :param baseline: Use the train data to build baselines (SVD Algorithm); else: Use only the mean 68 | :type baseline: bool, default False 69 | 70 | :param bias_learn_rate: Learning rate for baselines 71 | :type bias_learn_rate: float, default 0.005 72 | 73 | :param delta_bias: Regularization value for baselines 74 | :type delta_bias: float, default 0.002 75 | 76 | :param stop_criteria: Difference between errors for stopping criteria 77 | :type stop_criteria: float, default 0.009 78 | 79 | :param sep: Delimiter for input files 80 | :type sep: str, default '\t' 81 | 82 | :param output_sep: Delimiter for output file 83 | :type output_sep: str, default '\t' 84 | 85 | :param random_seed: Random seed. Locks random numbers for reproducibility of experiments.
86 | :type random_seed: int, default None 87 | 88 | """ 89 | super(MatrixFactorization, self).__init__(train_file=train_file, test_file=test_file, output_file=output_file, 90 | sep=sep, output_sep=output_sep) 91 | 92 | self.recommender_name = 'Matrix Factorization' 93 | 94 | self.epochs = epochs 95 | self.learn_rate = learn_rate 96 | self.delta = delta 97 | self.factors = factors 98 | self.init_mean = init_mean 99 | self.init_stdev = init_stdev 100 | self.baseline = baseline 101 | self.bias_learn_rate = bias_learn_rate 102 | self.delta_bias = delta_bias 103 | self.stop_criteria = stop_criteria 104 | 105 | if random_seed is not None: 106 | np.random.seed(random_seed) 107 | 108 | # internal vars 109 | self.feedback_triples = None 110 | self.p = None 111 | self.q = None 112 | self.bu = None 113 | self.bi = None 114 | 115 | def init_model(self): 116 | """ 117 | Method to treat and initialize the model 118 | 119 | """ 120 | self.feedback_triples = [] 121 | 122 | # Map interaction with ids 123 | for user in self.train_set['feedback']: 124 | for item in self.train_set['feedback'][user]: 125 | self.feedback_triples.append((self.user_to_user_id[user], self.item_to_item_id[item], 126 | self.train_set['feedback'][user][item])) 127 | 128 | # Initialize factors 129 | self.create_factors() 130 | 131 | def fit(self): 132 | """ 133 | This method performs iterations of stochastic gradient ascent over the training data. 134 | 135 | """ 136 | 137 | rmse_old = .0 138 | 139 | for epoch in range(self.epochs): 140 | 141 | error_final = .0 142 | 143 | for user, item, feedback in self.feedback_triples: 144 | 145 | eui = feedback - self._predict_score(user, item, False) 146 | error_final += (eui ** 2.0) 147 | 148 | # Adjust the factors 149 | u_f = self.p[user] 150 | i_f = self.q[item] 151 | 152 | # Compute factor updates 153 | delta_u = np.subtract(np.multiply(eui, i_f), np.multiply(self.delta, u_f)) 154 | delta_i = np.subtract(np.multiply(eui, u_f), np.multiply(self.delta, i_f)) 155 | 156 | # apply updates 157 | self.p[user] += np.multiply(self.learn_rate, delta_u) 158 | self.q[item] += np.multiply(self.learn_rate, delta_i) 159 | 160 | if self.baseline: 161 | self.bu[user] += self.bias_learn_rate * (eui - self.delta_bias * self.bu[user]) 162 | self.bi[item] += self.bias_learn_rate * (eui - self.delta_bias * self.bi[item]) 163 | 164 | rmse_new = np.sqrt(error_final / self.train_set["number_interactions"]) 165 | if np.fabs(rmse_new - rmse_old) <= self.stop_criteria: 166 | break 167 | else: 168 | rmse_old = rmse_new 169 | 170 | def create_factors(self): 171 | """ 172 | This method create factors for users, items and bias 173 | 174 | """ 175 | 176 | self.p = np.random.normal(self.init_mean, self.init_stdev, (len(self.users), self.factors)) 177 | self.q = np.random.normal(self.init_mean, self.init_stdev, (len(self.items), self.factors)) 178 | 179 | if self.baseline: 180 | self.bu = np.zeros(len(self.users), np.double) 181 | self.bi = np.zeros(len(self.items), np.double) 182 | 183 | def _predict_score(self, u, i, cond=True): 184 | """ 185 | Method to predict a single score for a pair (user, item) 186 | 187 | :param u: User ID 188 | :type u: int 189 | 190 | :param i: Item ID 191 | :type i: int 192 | 193 | :param cond: Use max and min values of train set to limit score 194 | :type cond: bool, default True 195 | 196 | :return: Score generate for pair (user, item) 197 | :rtype: float 198 | 199 | """ 200 | 201 | if self.baseline: 202 | rui = self.train_set["mean_value"] + self.bu[u] + self.bi[i] + np.dot(self.p[u], 
self.q[i]) 203 | else: 204 | rui = self.train_set['mean_value'] + np.dot(self.p[u], self.q[i]) 205 | 206 | if cond: 207 | if rui > self.train_set["max_value"]: 208 | rui = self.train_set["max_value"] 209 | elif rui < self.train_set["min_value"]: 210 | rui = self.train_set["min_value"] 211 | 212 | return rui 213 | 214 | def predict(self): 215 | """ 216 | This method computes a final rating for unknown pairs (user, item) 217 | 218 | """ 219 | 220 | if self.test_file is not None: 221 | for user in self.test_set['users']: 222 | for item in self.test_set['feedback'][user]: 223 | self.predictions.append((user, item, self._predict_score(self.user_to_user_id[user], 224 | self.item_to_item_id[item], True))) 225 | else: 226 | raise NotImplementedError 227 | 228 | def compute(self, verbose=True, metrics=None, verbose_evaluation=True, as_table=False, table_sep='\t'): 229 | """ 230 | Extends compute method from BaseRatingPrediction. Method to run recommender algorithm 231 | 232 | :param verbose: Print recommender and database information 233 | :type verbose: bool, default True 234 | 235 | :param metrics: List of evaluation measures 236 | :type metrics: list, default None 237 | 238 | :param verbose_evaluation: Print the evaluation results 239 | :type verbose_evaluation: bool, default True 240 | 241 | :param as_table: Print the evaluation results as table 242 | :type as_table: bool, default False 243 | 244 | :param table_sep: Delimiter for print results (only work with verbose=True and as_table=True) 245 | :type table_sep: str, default '\t' 246 | 247 | """ 248 | 249 | super(MatrixFactorization, self).compute(verbose=verbose) 250 | 251 | if verbose: 252 | self.init_model() 253 | print("training_time:: %4f sec" % timed(self.fit)) 254 | if self.extra_info_header is not None: 255 | print(self.extra_info_header) 256 | 257 | print("prediction_time:: %4f sec" % timed(self.predict)) 258 | 259 | print('\n') 260 | 261 | else: 262 | # Execute all in silence without prints 263 | self.init_model() 264 | self.fit() 265 | self.predict() 266 | 267 | self.write_predictions() 268 | 269 | if self.test_file is not None: 270 | self.evaluate(metrics, verbose_evaluation, as_table=as_table, table_sep=table_sep) 271 | -------------------------------------------------------------------------------- /caserec/recommenders/rating_prediction/most_popular.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """ 3 | Most Popular Collaborative Filtering Recommender 4 | [Rating Prediction] 5 | 6 | Most Popular predicts ratings for unobserved items for each user based on popularity of user and items. 7 | 8 | """ 9 | 10 | # © 2019. Case Recommender (MIT License) 11 | 12 | from caserec.recommenders.rating_prediction.base_rating_prediction import BaseRatingPrediction 13 | from caserec.utils.extra_functions import timed 14 | import numpy as np 15 | 16 | __author__ = 'Arthur Fortes ' 17 | 18 | 19 | class MostPopular(BaseRatingPrediction): 20 | def __init__(self, train_file=None, test_file=None, output_file=None, sep='\t', output_sep='\t'): 21 | """ 22 | Most Popular for Rating Prediction 23 | 24 | This algorithm predicts ratings for each user using the feedback counts of users and items 25 | 26 | Usage:: 27 | 28 | >> MostPopular(train, test).compute() 29 | 30 | :param train_file: File which contains the train set. This file needs to have at least 3 columns 31 | (user item feedback_value). 32 | :type train_file: str 33 | 34 | :param test_file: File which contains the test set.
This file needs to have at least 3 columns 35 | (user item feedback_value). 36 | :type test_file: str, default None 37 | 38 | :param output_file: File with dir to write the final predictions 39 | :type output_file: str, default None 40 | 41 | :param sep: Delimiter for input files 42 | :type sep: str, default '\t' 43 | 44 | :param output_sep: Delimiter for output file 45 | :type output_sep: str, default '\t' 46 | 47 | """ 48 | 49 | super(MostPopular, self).__init__(train_file=train_file, test_file=test_file, output_file=output_file, 50 | sep=sep, output_sep=output_sep) 51 | 52 | self.recommender_name = 'Most Popular' 53 | 54 | def predict(self): 55 | """ 56 | This method predicts the final results, building a rating prediction for each test pair (user, item). 57 | 58 | """ 59 | 60 | if self.test_file is not None: 61 | for user in self.test_set['users']: 62 | for item in self.test_set['feedback'][user]: 63 | 64 | count_value = 0 65 | feedback_value = 0 66 | 67 | for user_v in self.train_set['users_viewed_item'].get(item, []): 68 | feedback_value += self.train_set['feedback'][user_v][item] 69 | count_value += 1 70 | 71 | if feedback_value == 0: 72 | try: 73 | feedback_value = np.mean(list(self.train_set['feedback'][user].values())) 74 | except KeyError: 75 | feedback_value = self.train_set['mean_value'] 76 | else: 77 | feedback_value /= count_value 78 | 79 | self.predictions.append((user, item, feedback_value)) 80 | else: 81 | raise NotImplementedError 82 | 83 | def compute(self, verbose=True, metrics=None, verbose_evaluation=True, as_table=False, table_sep='\t'): 84 | """ 85 | Extends compute method from BaseRatingPrediction. Method to run recommender algorithm 86 | 87 | :param verbose: Print recommender and database information 88 | :type verbose: bool, default True 89 | 90 | :param metrics: List of evaluation measures 91 | :type metrics: list, default None 92 | 93 | :param verbose_evaluation: Print the evaluation results 94 | :type verbose_evaluation: bool, default True 95 | 96 | :param as_table: Print the evaluation results as table 97 | :type as_table: bool, default False 98 | 99 | :param table_sep: Delimiter for print results (only work with verbose=True and as_table=True) 100 | :type table_sep: str, default '\t' 101 | 102 | """ 103 | 104 | super(MostPopular, self).compute(verbose=verbose) 105 | 106 | if verbose: 107 | print("prediction_time:: %4f sec" % timed(self.predict)) 108 | print('\n') 109 | 110 | else: 111 | self.predict() 112 | 113 | self.write_predictions() 114 | 115 | if self.test_file is not None: 116 | self.evaluate(metrics, verbose_evaluation, as_table=as_table, table_sep=table_sep) 117 | -------------------------------------------------------------------------------- /caserec/recommenders/rating_prediction/nnmf.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """ 3 | Non-negative Matrix Factorization 4 | [Rating Prediction] 5 | 6 | Literature: 7 | Badrul Sarwar , George Karypis , Joseph Konstan , John Riedl: 8 | Incremental Singular Value Decomposition Algorithms for Highly Scalable Recommender Systems 9 | Fifth International Conference on Computer and Information Science 2002. 10 | http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.3.7894 11 | 12 | """ 13 | 14 | # © 2019.
Case Recommender (MIT License) 15 | 16 | import numpy as np 17 | from sklearn.decomposition import NMF 18 | 19 | from caserec.recommenders.rating_prediction.base_rating_prediction import BaseRatingPrediction 20 | from caserec.utils.extra_functions import timed 21 | 22 | __author__ = 'Joao Felipe Guedes ' 23 | 24 | 25 | class NNMF(BaseRatingPrediction): 26 | def __init__(self, train_file=None, test_file=None, output_file=None, factors=10, sep='\t', output_sep='\t', 27 | random_seed=None): 28 | """ 29 | Non-negative Matrix Factorization for rating prediction 30 | 31 | Matrix factorization models map both users and items to a joint latent factor space of dimensionality f, 32 | such that user-item interactions are modeled as inner products in that space. 33 | 34 | Usage:: 35 | 36 | >> NNMF(train, test).compute() 37 | 38 | :param train_file: File which contains the train set. This file needs to have at least 3 columns 39 | (user item feedback_value). 40 | :type train_file: str 41 | 42 | :param test_file: File which contains the test set. This file needs to have at least 3 columns 43 | (user item feedback_value). 44 | :type test_file: str, default None 45 | 46 | :param output_file: File with dir to write the final predictions 47 | :type output_file: str, default None 48 | 49 | :param factors: Number of latent factors per user/item 50 | :type factors: int, default 10 51 | 52 | :param sep: Delimiter for input files 53 | :type sep: str, default '\t' 54 | 55 | :param output_sep: Delimiter for output file 56 | :type output_sep: str, default '\t' 57 | 58 | :param random_seed: Random seed. Locks random numbers for reproducibility of experiments. 59 | :type random_seed: int, default None 60 | 61 | """ 62 | super(NNMF, self).__init__(train_file=train_file, test_file=test_file, output_file=output_file, sep=sep, 63 | output_sep=output_sep) 64 | 65 | self.recommender_name = 'NNMF' 66 | self.factors = factors 67 | 68 | if random_seed is not None: 69 | np.random.seed(random_seed) 70 | 71 | # internal vars 72 | self.feedback_triples = None 73 | self.prediction_matrix = None 74 | 75 | def init_model(self): 76 | """ 77 | Method to treat and initialize the model 78 | 79 | """ 80 | 81 | self.feedback_triples = [] 82 | 83 | # Map interaction with ids 84 | for user in self.train_set['feedback']: 85 | for item in self.train_set['feedback'][user]: 86 | self.feedback_triples.append((self.user_to_user_id[user], self.item_to_item_id[item], 87 | self.train_set['feedback'][user][item])) 88 | 89 | self.create_matrix() 90 | 91 | def fit(self): 92 | """ 93 | This method performs Non-negative matrix factorization over the training data.
94 | 95 | """ 96 | 97 | model = NMF(n_components=self.factors, init='random', random_state=0) 98 | 99 | P = model.fit_transform(self.matrix) 100 | 101 | Q = model.components_ 102 | 103 | self.prediction_matrix = np.dot(P, Q) 104 | 105 | def predict_score(self, u, i, cond=True): 106 | """ 107 | Method to predict a single score for a pair (user, item) 108 | 109 | :param u: User ID 110 | :type u: int 111 | 112 | :param i: Item ID 113 | :type i: int 114 | 115 | :param cond: Use max and min values of train set to limit score 116 | :type cond: bool, default True 117 | 118 | :return: Score generate for pair (user, item) 119 | :rtype: float 120 | 121 | """ 122 | 123 | rui = self.train_set["mean_value"] + self.prediction_matrix[u][i] 124 | 125 | if cond: 126 | if rui > self.train_set["max_value"]: 127 | rui = self.train_set["max_value"] 128 | elif rui < self.train_set["min_value"]: 129 | rui = self.train_set["min_value"] 130 | 131 | return rui 132 | 133 | def predict(self): 134 | """ 135 | This method computes a final rating for unknown pairs (user, item) 136 | 137 | """ 138 | 139 | if self.test_file is not None: 140 | for user in self.test_set['users']: 141 | for item in self.test_set['feedback'][user]: 142 | self.predictions.append((user, item, self.predict_score(self.user_to_user_id[user], 143 | self.item_to_item_id[item], True))) 144 | else: 145 | raise NotImplemented 146 | 147 | def compute(self, verbose=True, metrics=None, verbose_evaluation=True, as_table=False, table_sep='\t'): 148 | """ 149 | Extends compute method from BaseRatingPrediction. Method to run recommender algorithm 150 | 151 | :param verbose: Print recommender and database information 152 | :type verbose: bool, default True 153 | 154 | :param metrics: List of evaluation measures 155 | :type metrics: list, default None 156 | 157 | :param verbose_evaluation: Print the evaluation results 158 | :type verbose_evaluation: bool, default True 159 | 160 | :param as_table: Print the evaluation results as table 161 | :type as_table: bool, default False 162 | 163 | :param table_sep: Delimiter for print results (only work with verbose=True and as_table=True) 164 | :type table_sep: str, default '\t' 165 | 166 | """ 167 | 168 | super(NNMF, self).compute(verbose=verbose) 169 | 170 | if verbose: 171 | self.init_model() 172 | print("training_time:: %4f sec" % timed(self.fit)) 173 | if self.extra_info_header is not None: 174 | print(self.extra_info_header) 175 | 176 | print("prediction_time:: %4f sec" % timed(self.predict)) 177 | 178 | print('\n') 179 | 180 | else: 181 | # Execute all in silence without prints 182 | self.init_model() 183 | self.fit() 184 | self.predict() 185 | 186 | self.write_predictions() 187 | 188 | if self.test_file is not None: 189 | self.evaluate(metrics, verbose_evaluation, as_table=as_table, table_sep=table_sep) 190 | 191 | 192 | 193 | -------------------------------------------------------------------------------- /caserec/recommenders/rating_prediction/random_rec.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """" 3 | Random Collaborative Filtering Recommender 4 | [Rating Prediction (Rating)] 5 | 6 | Random predicts a user’s ratings based on random distributions of rates. 7 | 8 | """ 9 | 10 | # © 2019. Case Recommender (MIT License) 11 | 12 | import numpy as np 13 | 14 | from caserec.recommenders.rating_prediction.base_rating_prediction import BaseRatingPrediction 15 | from caserec.utils.extra_functions import timed 16 | 17 | __author__ = 'Fernando S. 
de Aguiar Neto ' 18 | 19 | 20 | class RandomRec(BaseRatingPrediction): 21 | def __init__(self, train_file, test_file, uniform=True, output_file=None, sep='\t', output_sep='\t', 22 | random_seed=None): 23 | """ 24 | Random recommendation for Rating Prediction 25 | 26 | This algorithm predicts ratings for each user-item 27 | 28 | Usage:: 29 | 30 | >> RandomRec(train, test).compute() 31 | 32 | :param train_file: File which contains the train set. This file needs to have at least 3 columns 33 | (user item feedback_value). 34 | :type train_file: str 35 | 36 | :param test_file: File which contains the test set. This file needs to have at least 3 columns 37 | (user item feedback_value). 38 | :type test_file: str, default None 39 | 40 | :param uniform: Indicates whether the ratings are drawn from a uniform sample or not 41 | if False, the ratings are drawn from a normal distribution with the same mean and standard deviation 42 | as the feedback provided in train 43 | :type uniform: bool, default True 44 | 45 | :param output_file: File with dir to write the final predictions 46 | :type output_file: str, default None 47 | 48 | :param sep: Delimiter for input files 49 | :type sep: str, default '\t' 50 | 51 | :param output_sep: Delimiter for output file 52 | :type output_sep: str, default '\t' 53 | 54 | :param random_seed: Number of seed. Lock random numbers for reproducibility of experiments. 55 | :type random_seed: int, default None 56 | 57 | """ 58 | 59 | super(RandomRec, self).__init__(train_file=train_file, test_file=test_file, output_file=output_file, 60 | sep=sep, output_sep=output_sep) 61 | 62 | if random_seed is not None: 63 | np.random.seed(random_seed) 64 | 65 | self.uniform = uniform 66 | 67 | self.recommender_name = 'Random Recommender' 68 | 69 | def predict(self): 70 | if not self.uniform: 71 | feedbacks = [] 72 | for user in self.train_set["users"]: 73 | for item in self.train_set['items_seen_by_user'][user]: 74 | feedbacks.append(self.train_set['feedback'][user][item]) 75 | 76 | std = np.std(feedbacks) 77 | 78 | if self.test_file is not None: 79 | for user in self.test_set['users']: 80 | for item in self.test_set['feedback'][user]: 81 | if self.uniform: 82 | feedback_value = np.random.uniform(self.train_set['min_value'], self.train_set['max_value']) 83 | else: 84 | feedback_value = np.random.normal(self.train_set['mean_value'], std) 85 | 86 | self.predictions.append((user, item, feedback_value)) 87 | else: 88 | raise NotImplemented 89 | 90 | def compute(self, verbose=True, metrics=None, verbose_evaluation=True, as_table=False, table_sep='\t'): 91 | """ 92 | Extends compute method from BaseRatingPrediction. 
Method to run recommender algorithm 93 | 94 | :param verbose: Print recommender and database information 95 | :type verbose: bool, default True 96 | 97 | :param metrics: List of evaluation measures 98 | :type metrics: list, default None 99 | 100 | :param verbose_evaluation: Print the evaluation results 101 | :type verbose_evaluation: bool, default True 102 | 103 | :param as_table: Print the evaluation results as table 104 | :type as_table: bool, default False 105 | 106 | :param table_sep: Delimiter for print results (only work with verbose=True and as_table=True) 107 | :type table_sep: str, default '\t' 108 | 109 | """ 110 | 111 | super(RandomRec, self).compute(verbose=verbose) 112 | 113 | if verbose: 114 | print("prediction_time:: %4f sec" % timed(self.predict)) 115 | print('\n') 116 | 117 | else: 118 | self.predict() 119 | 120 | self.write_predictions() 121 | 122 | if self.test_file is not None: 123 | self.evaluate(metrics, verbose_evaluation, as_table=as_table, table_sep=table_sep) 124 | -------------------------------------------------------------------------------- /caserec/recommenders/rating_prediction/svd.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """ 3 | Singular Value Decomposition Based Collaborative Filtering Recommender 4 | [Rating Prediction] 5 | 6 | Literature: 7 | Badrul Sarwar , George Karypis , Joseph Konstan , John Riedl: 8 | Incremental Singular Value Decomposition Algorithms for Highly Scalable Recommender Systems 9 | Fifth International Conference on Computer and Information Science 2002. 10 | http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.3.7894 11 | 12 | """ 13 | 14 | # © 2019. Case Recommender (MIT License) 15 | 16 | import numpy as np 17 | from scipy.sparse.linalg import svds 18 | 19 | from caserec.recommenders.rating_prediction.base_rating_prediction import BaseRatingPrediction 20 | from caserec.utils.extra_functions import timed 21 | 22 | __author__ = 'Arthur Fortes ' 23 | 24 | 25 | class SVD(BaseRatingPrediction): 26 | def __init__(self, train_file=None, test_file=None, output_file=None, factors=10, sep='\t', output_sep='\t', 27 | random_seed=None): 28 | """ 29 | Singular Value Decomposition (SVD) for rating prediction 30 | 31 | Matrix factorization models map both users and items to a joint latent factor space of dimensionality f, 32 | such that user-item interactions are modeled as inner products in that space. 33 | 34 | Usage:: 35 | 36 | >> SVD(train, test).compute() 37 | 38 | :param train_file: File which contains the train set. This file needs to have at least 3 columns 39 | (user item feedback_value). 40 | :type train_file: str 41 | 42 | :param test_file: File which contains the test set. This file needs to have at least 3 columns 43 | (user item feedback_value). 44 | :type test_file: str, default None 45 | 46 | :param output_file: File with dir to write the final predictions 47 | :type output_file: str, default None 48 | 49 | :param factors: Number of latent factors per user/item 50 | :type factors: int, default 10 51 | 52 | :param sep: Delimiter for input files 53 | :type sep: str, default '\t' 54 | 55 | :param output_sep: Delimiter for output file 56 | :type output_sep: str, default '\t' 57 | 58 | :param random_seed: Random seed. Locks random numbers for reproducibility of experiments.
59 | :type random_seed: int, default None 60 | 61 | """ 62 | super(SVD, self).__init__(train_file=train_file, test_file=test_file, output_file=output_file, sep=sep, 63 | output_sep=output_sep) 64 | 65 | self.recommender_name = 'SVD' 66 | self.factors = factors 67 | 68 | if random_seed is not None: 69 | np.random.seed(random_seed) 70 | 71 | # internal vars 72 | self.feedback_triples = None 73 | self.prediction_matrix = None 74 | 75 | def init_model(self): 76 | """ 77 | Method to treat and initialize the model 78 | 79 | """ 80 | 81 | self.feedback_triples = [] 82 | 83 | # Map interaction with ids 84 | for user in self.train_set['feedback']: 85 | for item in self.train_set['feedback'][user]: 86 | self.feedback_triples.append((self.user_to_user_id[user], self.item_to_item_id[item], 87 | self.train_set['feedback'][user][item])) 88 | 89 | self.create_matrix() 90 | 91 | def fit(self): 92 | """ 93 | This method performs Singular Value Decomposition over the training data. 94 | 95 | """ 96 | 97 | u, s, vt = svds(self.matrix, k=self.factors) 98 | s_diagonal_matrix = np.diag(s) 99 | self.prediction_matrix = np.dot(np.dot(u, s_diagonal_matrix), vt) 100 | 101 | def predict_score(self, u, i, cond=True): 102 | """ 103 | Method to predict a single score for a pair (user, item) 104 | 105 | :param u: User ID 106 | :type u: int 107 | 108 | :param i: Item ID 109 | :type i: int 110 | 111 | :param cond: Use max and min values of train set to limit score 112 | :type cond: bool, default True 113 | 114 | :return: Score generate for pair (user, item) 115 | :rtype: float 116 | 117 | """ 118 | 119 | rui = self.train_set["mean_value"] + self.prediction_matrix[u][i] 120 | 121 | if cond: 122 | if rui > self.train_set["max_value"]: 123 | rui = self.train_set["max_value"] 124 | elif rui < self.train_set["min_value"]: 125 | rui = self.train_set["min_value"] 126 | 127 | return rui 128 | 129 | def predict(self): 130 | """ 131 | This method computes a final rating for unknown pairs (user, item) 132 | 133 | """ 134 | 135 | if self.test_file is not None: 136 | for user in self.test_set['users']: 137 | for item in self.test_set['feedback'][user]: 138 | self.predictions.append((user, item, self.predict_score(self.user_to_user_id[user], 139 | self.item_to_item_id[item], True))) 140 | else: 141 | raise NotImplemented 142 | 143 | def compute(self, verbose=True, metrics=None, verbose_evaluation=True, as_table=False, table_sep='\t'): 144 | """ 145 | Extends compute method from BaseRatingPrediction. 
Method to run recommender algorithm 146 | 147 | :param verbose: Print recommender and database information 148 | :type verbose: bool, default True 149 | 150 | :param metrics: List of evaluation measures 151 | :type metrics: list, default None 152 | 153 | :param verbose_evaluation: Print the evaluation results 154 | :type verbose_evaluation: bool, default True 155 | 156 | :param as_table: Print the evaluation results as table 157 | :type as_table: bool, default False 158 | 159 | :param table_sep: Delimiter for print results (only work with verbose=True and as_table=True) 160 | :type table_sep: str, default '\t' 161 | 162 | """ 163 | 164 | super(SVD, self).compute(verbose=verbose) 165 | 166 | if verbose: 167 | self.init_model() 168 | print("training_time:: %4f sec" % timed(self.fit)) 169 | if self.extra_info_header is not None: 170 | print(self.extra_info_header) 171 | 172 | print("prediction_time:: %4f sec" % timed(self.predict)) 173 | 174 | print('\n') 175 | 176 | else: 177 | # Execute all in silence without prints 178 | self.init_model() 179 | self.fit() 180 | self.predict() 181 | 182 | self.write_predictions() 183 | 184 | if self.test_file is not None: 185 | self.evaluate(metrics, verbose_evaluation, as_table=as_table, table_sep=table_sep) 186 | -------------------------------------------------------------------------------- /caserec/recommenders/rating_prediction/svdplusplus.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """ 3 | SVD++ Based Collaborative Filtering Recommender 4 | [Rating Prediction] 5 | 6 | Literature: 7 | Yehuda Koren: 8 | Factorization meets the neighborhood: a multifaceted collaborative filtering model 9 | KDD 2008 10 | http://portal.acm.org/citation.cfm?id=1401890.1401944 11 | 12 | """ 13 | 14 | # © 2019. Case Recommender (MIT License) 15 | 16 | import numpy as np 17 | 18 | from caserec.recommenders.rating_prediction.matrixfactorization import MatrixFactorization 19 | 20 | __author__ = 'Arthur Fortes ' 21 | 22 | 23 | class SVDPlusPlus(MatrixFactorization): 24 | def __init__(self, train_file=None, test_file=None, output_file=None, factors=10, learn_rate=0.01, epochs=10, 25 | delta=0.015, init_mean=0.1, init_stdev=0.1, bias_learn_rate=0.005, delta_bias=0.002, 26 | stop_criteria=0.009, sep='\t', output_sep='\t', random_seed=None, update_delta=False): 27 | """ 28 | SVD++ for rating prediction 29 | 30 | The SVD++ algorithm, an extension of SVD taking into account implicit ratings. Just as for SVD, the parameters 31 | are learned using a SGD on the regularized squared error objective. 32 | 33 | Usage:: 34 | 35 | >> SVDPlusPlus(train, test).compute() 36 | 37 | :param train_file: File which contains the train set. This file needs to have at least 3 columns 38 | (user item feedback_value). 39 | :type train_file: str 40 | 41 | :param test_file: File which contains the test set. This file needs to have at least 3 columns 42 | (user item feedback_value). 
43 | :type test_file: str, default None 44 | 45 | :param output_file: File with dir to write the final predictions 46 | :type output_file: str, default None 47 | 48 | :param factors: Number of latent factors per user/item 49 | :type factors: int, default 10 50 | 51 | :param learn_rate: Learning rate (alpha) 52 | :type learn_rate: float, default 0.01 53 | 54 | :param epochs: Number of epochs over the training data 55 | :type epochs: int, default 10 56 | 57 | :param delta: Regularization value 58 | :type delta: float, default 0.015 59 | 60 | :param init_mean: Mean of the normal distribution used to initialize the latent factors 61 | :type init_mean: float, default 0.1 62 | 63 | :param init_stdev: Standard deviation of the normal distribution used to initialize the latent factors 64 | :type init_stdev: float, default 0.1 65 | 66 | :param bias_learn_rate: Learning rate for baselines 67 | :type bias_learn_rate: float, default 0.005 68 | 69 | :param delta_bias: Regularization value for baselines 70 | :type delta_bias: float, default 0.002 71 | 72 | :param stop_criteria: Difference between errors for stopping criteria 73 | :type stop_criteria: float, default 0.009 74 | 75 | :param sep: Delimiter for input files 76 | :type sep: str, default '\t' 77 | 78 | :param output_sep: Delimiter for output file 79 | :type output_sep: str, default '\t' 80 | 81 | :param random_seed: Random seed. Locks random numbers for reproducibility of experiments. 82 | :type random_seed: int, default None 83 | 84 | """ 85 | 86 | super(SVDPlusPlus, self).__init__(train_file=train_file, test_file=test_file, output_file=output_file, 87 | factors=factors, learn_rate=learn_rate, epochs=epochs, delta=delta, 88 | init_mean=init_mean, init_stdev=init_stdev, baseline=True, 89 | bias_learn_rate=bias_learn_rate, delta_bias=delta_bias, 90 | stop_criteria=stop_criteria, sep=sep, output_sep=output_sep, 91 | random_seed=random_seed) 92 | 93 | self.recommender_name = 'SVDPlusPlus' 94 | self.update_delta = update_delta 95 | 96 | self.y = None 97 | self.n_u = None 98 | self.items_id_seen_by_user = None 99 | 100 | def init_model(self): 101 | """ 102 | Method to treat and initialize the model. Extends init_model from MatrixFactorization 103 | 104 | """ 105 | 106 | super(SVDPlusPlus, self).init_model() 107 | 108 | self.n_u = {} 109 | self.items_id_seen_by_user = {} 110 | 111 | for user in self.train_set['users']: 112 | for item in self.train_set['items_seen_by_user'][user]: 113 | self.items_id_seen_by_user.setdefault(self.user_to_user_id[user], []).append(self.item_to_item_id[item]) 114 | # sqrt(|N(u)|), used later as the divisor that scales the implicit feedback 115 | self.n_u[self.user_to_user_id[user]] = np.sqrt(len(self.train_set['items_seen_by_user'][user])) 116 | 117 | def fit(self): 118 | """ 119 | This method performs iterations of stochastic gradient descent over the training data.
120 | 121 | """ 122 | 123 | rmse_old = .0 124 | for epoch in range(self.epochs): 125 | error_final = .0 126 | 127 | for user, item, feedback in self.feedback_triples: 128 | pu = self.p[user] + self.y_sum_rows(user) 129 | 130 | # Calculate error 131 | eui = feedback - self._predict_svd_plus_plus_score(user, item, pu, False) 132 | error_final += (eui ** 2.0) 133 | 134 | # update bu and bi 135 | self.bu[user] += self.bias_learn_rate * (eui - self.delta_bias * self.bu[user]) 136 | self.bi[item] += self.bias_learn_rate * (eui - self.delta_bias * self.bi[item]) 137 | 138 | # Adjust the factors 139 | norm_eui = eui / self.n_u[user] 140 | 141 | i_f = self.q[item] 142 | 143 | # Compute factor updates 144 | delta_u = np.subtract(np.multiply(eui, i_f), np.multiply(self.delta, self.p[user])) 145 | self.p[user] += np.multiply(self.learn_rate, delta_u) 146 | 147 | delta_i = np.subtract(np.multiply(eui, pu), np.multiply(self.delta, i_f)) 148 | self.q[item] += np.multiply(self.learn_rate, delta_i) 149 | 150 | # update y (implicit factor) 151 | common_update = norm_eui * i_f 152 | 153 | for j in self.items_id_seen_by_user[user]: 154 | delta_y = np.subtract(common_update, self.delta * self.y[j]) 155 | self.y[j] += self.learn_rate * delta_y 156 | 157 | rmse_new = np.sqrt(error_final / self.train_set["number_interactions"]) 158 | 159 | if np.fabs(rmse_new - rmse_old) <= self.stop_criteria: 160 | break 161 | else: 162 | rmse_old = rmse_new 163 | 164 | def create_factors(self): 165 | """ 166 | This method extends create_factors from Matrix Factorization, adding y factors 167 | 168 | """ 169 | 170 | super(SVDPlusPlus, self).create_factors() 171 | self.y = np.random.normal(self.init_mean, self.init_stdev, (len(self.items), self.factors)) 172 | 173 | def _predict_svd_plus_plus_score(self, u, i, pu, cond=True): 174 | """ 175 | 176 | :param u: User ID (from self.items) 177 | :type u: int 178 | 179 | :param i: Item ID (from self.items) 180 | :type i: int 181 | 182 | :param pu: User updated vector (pu * y) 183 | :type pu: list or np.array 184 | 185 | :param cond: Use max and min values of train set to limit score 186 | :type cond: bool, default True 187 | 188 | :return: prediction for user u and item i 189 | :rtype: float 190 | 191 | """ 192 | rui = self.train_set["mean_value"] + self.bu[u] + self.bi[i] + np.dot(pu, self.q[i]) 193 | 194 | if cond: 195 | if rui > self.train_set["max_value"]: 196 | rui = self.train_set["max_value"] 197 | elif rui < self.train_set["min_value"]: 198 | rui = self.train_set["min_value"] 199 | return rui 200 | 201 | def y_sum_rows(self, user): 202 | """ 203 | Incorporating implicit feedback in the SVD: Sum (j E N(u)) Yj 204 | 205 | :param user: User ID 206 | :type user: int 207 | 208 | :return: Sum of y vectors for seen items of user 209 | 210 | """ 211 | 212 | sum_imp = np.zeros(self.factors) 213 | for ui in self.items_id_seen_by_user[user]: 214 | sum_imp += self.y[ui] 215 | return sum_imp / self.n_u[user] 216 | 217 | def predict(self): 218 | """ 219 | This method computes a final rating for unknown pairs (user, item) 220 | 221 | """ 222 | 223 | if self.test_file is not None: 224 | for user in self.test_set['users']: 225 | pu = self.p[self.user_to_user_id[user]] + self.y_sum_rows(self.user_to_user_id[user]) 226 | 227 | for item in self.test_set['feedback'][user]: 228 | self.predictions.append( 229 | (user, item, self._predict_svd_plus_plus_score(self.user_to_user_id[user], 230 | self.item_to_item_id[item], pu, True))) 231 | else: 232 | raise NotImplemented 233 | 
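[Editor's aside] The SVD++ scoring rule implemented by _predict_svd_plus_plus_score and y_sum_rows above is r_ui = mu + b_u + b_i + q_i . (p_u + |N(u)|^(-1/2) * sum of y_j over j in N(u)). A minimal, standalone sketch with toy values (illustration only, not library code):

import numpy as np

mu = 3.5                                   # toy global mean rating
b_u, b_i = 0.10, -0.05                     # toy user and item biases
p_u = np.array([0.20, -0.10])              # explicit user factors
q_i = np.array([0.30, 0.40])               # item factors
y = np.array([[0.05, 0.00],                # implicit factors of the two items seen by the user
              [0.10, -0.20]])

n_u = np.sqrt(len(y))                      # sqrt(|N(u)|), the scaling term stored in self.n_u
pu = p_u + y.sum(axis=0) / n_u             # user vector enriched with implicit feedback (y_sum_rows)
rui = mu + b_u + b_i + np.dot(pu, q_i)
print(float(np.clip(rui, 1.0, 5.0)))       # clipped to the train-set rating range, as cond=True does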
-------------------------------------------------------------------------------- /caserec/recommenders/rating_prediction/user_attribute_knn.py: --------------------------------------------------------------------------------
1 | # coding=utf-8
2 | """
3 | User Based Collaborative Filtering Recommender with Attributes (User Attribute KNN)
4 | [Rating Prediction]
5 |
6 | User-Attribute-kNN predicts a user's rating according to how similar users rated the same item. The algorithm
7 | matches similar users based on the similarity of their attribute scores. Unlike the traditional UserKNN,
8 | this approach uses a similarity matrix pre-computed from metadata.
9 |
10 |
11 | """
12 |
13 | # © 2019. Case Recommender (MIT License)
14 |
15 | import numpy as np
16 |
17 | from caserec.recommenders.rating_prediction.userknn import UserKNN
18 | from caserec.utils.process_data import ReadFile
19 |
20 | __author__ = 'Arthur Fortes <fortes.arthur@gmail.com>'
21 |
22 |
23 | class UserAttributeKNN(UserKNN):
24 | def __init__(self, train_file=None, test_file=None, output_file=None, metadata_file=None, similarity_file=None,
25 | k_neighbors=30, as_similar_first=True, metadata_as_binary=False,
26 | metadata_similarity_sep='\t', similarity_metric="cosine", sep='\t', output_sep='\t'):
27 | """
28 | User Attribute KNN for Rating Prediction
29 |
30 | This algorithm predicts a rating for each pair (user, item) based on the items that the user's neighbors
31 | (similar users) consumed, using a metadata file or a pre-computed similarity file
32 |
33 | Usage::
34 |
35 | >> UserAttributeKNN(train, test, similarity_file=sim_matrix, as_similar_first=True).compute()
36 | >> UserAttributeKNN(train, test, metadata_file=metadata, as_similar_first=True).compute()
37 |
38 | :param train_file: File which contains the train set. This file needs to have at least 3 columns
39 | (user item feedback_value).
40 | :type train_file: str
41 |
42 | :param test_file: File which contains the test set. This file needs to have at least 3 columns
43 | (user item feedback_value).
44 | :type test_file: str, default None
45 |
46 | :param output_file: File (with its directory path) to write the final predictions
47 | :type output_file: str, default None
48 |
49 | :param metadata_file: File which contains the metadata set. This file needs to have at least 2 columns
50 | (user metadata).
51 | :type metadata_file: str, default None
52 |
53 | :param similarity_file: File which contains the similarity set. This file needs to have at least 3 columns
54 | (user user similarity).
55 | :type similarity_file: str, default None
56 |
57 | :param k_neighbors: Number of neighbors to use. If None, k_neighbors = int(sqrt(n_users))
58 | :type k_neighbors: int, default 30
59 |
60 | :param as_similar_first: If True, for each unknown item to be predicted, we first look for its k
61 | most similar users and then take the intersection with the users that
62 | have seen that item.
:type as_similar_first: bool, default True
64 |
65 | :param metadata_as_binary: If True, the explicit values will be transformed to binary
66 | :type metadata_as_binary: bool, default False
67 |
68 | :param metadata_similarity_sep: Delimiter for similarity or metadata file
69 | :type metadata_similarity_sep: str, default '\t'
70 |
71 | :param similarity_metric: Metric used to compute the user similarity matrix
72 | :type similarity_metric: str, default cosine
73 |
74 | :param sep: Delimiter for input files
75 | :type sep: str, default '\t'
76 |
77 | :param output_sep: Delimiter for output file
78 | :type output_sep: str, default '\t'
79 | """
80 | super(UserAttributeKNN, self).__init__(train_file=train_file, test_file=test_file, output_file=output_file,
81 | k_neighbors=k_neighbors, as_similar_first=as_similar_first,
82 | similarity_metric=similarity_metric, sep=sep, output_sep=output_sep)
83 |
84 | self.recommender_name = 'User Attribute KNN Algorithm'
85 |
86 | self.metadata_file = metadata_file
87 | self.similarity_file = similarity_file
88 | self.metadata_as_binary = metadata_as_binary
89 | self.metadata_similarity_sep = metadata_similarity_sep
90 |
91 | def init_model(self):
92 | """
93 | Method to initialize the model. Creates and calculates the similarity matrix from a metadata file, or loads
94 | a pre-computed similarity matrix
95 |
96 | """
97 | self.users_id_viewed_item = {}
98 |
99 | # Set the value for k
100 | if self.k_neighbors is None:
101 | self.k_neighbors = int(np.sqrt(len(self.users)))
102 |
103 | for item in self.items:
104 | for user in self.train_set['users_viewed_item'].get(item, []):
105 | self.users_id_viewed_item.setdefault(item, []).append(self.user_to_user_id[user])
106 |
107 | if self.metadata_file is not None:
108 | metadata = ReadFile(self.metadata_file, sep=self.metadata_similarity_sep, as_binary=self.metadata_as_binary
109 | ).read_metadata_or_similarity()
110 |
111 | self.matrix = np.zeros((len(self.users), len(metadata['col_2'])))
112 |
113 | meta_to_meta_id = {}
114 | for m, data in enumerate(metadata['col_2']):
115 | meta_to_meta_id[data] = m
116 |
117 | for user_m in metadata['col_1']:
118 | for m1 in metadata['dict'][user_m]:
119 | try:
120 | self.matrix[self.user_to_user_id[user_m], meta_to_meta_id[m1]] = metadata['dict'][user_m][m1]
121 | except KeyError:
122 | pass
123 |
124 | # create header info for metadata
125 | sparsity = (1 - (metadata['number_interactions'] / (len(metadata['col_1']) * len(metadata['col_2'])))) * 100
126 |
127 | self.extra_info_header = ">> metadata:: %d users and %d metadata (%d interactions) | sparsity:: %.2f%%" % \
128 | (len(metadata['col_1']), len(metadata['col_2']), metadata['number_interactions'],
129 | sparsity)
130 |
131 | # Create similarity matrix based on metadata or similarity file
132 | self.su_matrix = self.compute_similarity(transpose=False)
133 |
134 | elif self.similarity_file is not None:
135 | similarity = ReadFile(self.similarity_file, sep=self.metadata_similarity_sep, as_binary=False
136 | ).read_metadata_or_similarity()
137 |
138 | self.su_matrix = np.zeros((len(self.users), len(self.users)))
139 |
140 | # Fill similarity matrix
141 | for u in similarity['col_1']:
142 | for u_j in similarity['dict'][u]:
143 | self.su_matrix[self.user_to_user_id[u], self.user_to_user_id[int(u_j)]] = similarity['dict'][u][u_j]
144 |
145 | # Remove NaNs
146 | self.su_matrix[np.isnan(self.su_matrix)] = 0.0
147 |
148 | else:
149 | raise ValueError("This algorithm needs a similarity matrix or a metadata file!")
150 |
151 | # Create original matrix user x item for prediction process
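# (create_matrix itself is inherited from the KNN base implementation)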
152 | self.create_matrix()
153 |
-------------------------------------------------------------------------------- /caserec/utils/__init__.py: --------------------------------------------------------------------------------
1 | __author__ = 'Arthur'
2 |
-------------------------------------------------------------------------------- /caserec/utils/cross_validation.py: --------------------------------------------------------------------------------
1 | # coding=utf-8
2 | """
3 | Cross Validation for Recommender Algorithms
4 |
5 | """
6 |
7 | # © 2019. Case Recommender (MIT License)
8 |
9 | from collections import defaultdict
10 | import numpy as np
11 | import shutil
12 |
13 | from caserec.utils.split_database import SplitDatabase
14 |
15 | __author__ = 'Arthur Fortes <fortes.arthur@gmail.com>'
16 |
17 |
18 | class CrossValidation(object):
19 | def __init__(self, input_file, recommender, dir_folds, k_folds=10, header=None, sep='\t', write_predictions=False,
20 | write_sep='\t', recommender_verbose=False, evaluation_in_fold_verbose=True, metrics=None,
21 | as_table=False, table_sep='\t', del_folds=False, random_seed=None):
22 | """
23 | Cross Validation
24 |
25 | This strategy is responsible for dividing the database into k folds, where each fold contains a train and a
26 | test set. It is also responsible for running and evaluating the recommender in each fold and for calculating
27 | the mean and the standard deviation of the results.
28 |
29 | Usage:
30 | >> rec = MostPopular(as_binary=True)
31 | >> CrossValidation(db, rec, fold_d, evaluation_in_fold_verbose=False).compute()
32 |
33 | :param input_file: Database file
34 | :type input_file: str
35 |
36 | :param recommender: Initialized recommender algorithm, e.g.: MostPopular(as_binary=True)
37 | :type recommender: class
38 |
39 | :param dir_folds: Directory to write folds (train and test files)
40 | :type dir_folds: str
41 |
42 | :param k_folds: How many folds to divide the database into
43 | :type k_folds: int, default 10
44 |
45 | :param header: Skip header line
46 | :type header: int, default None
47 |
48 | :param sep: Delimiter for input files
49 | :type sep: str, default '\t'
50 |
51 | :param write_predictions: Write the recommender predictions in each fold
52 | :type write_predictions: bool, default False
53 |
54 | :param write_sep: Delimiter for output files
55 | :type write_sep: str, default '\t'
56 |
57 | :param recommender_verbose: Print header of recommender in each fold
58 | :type recommender_verbose: bool, default False
59 |
60 | :param evaluation_in_fold_verbose: Print evaluation of recommender in each fold
61 | :type evaluation_in_fold_verbose: bool, default True
62 |
63 | :param metrics: List of evaluation metrics
64 | :type metrics: list, default None
65 |
66 | :param as_table: Print the evaluation results as table
67 | :type as_table: bool, default False
68 |
69 | :param table_sep: Delimiter for printed results (only works with verbose=True and as_table=True)
70 | :type table_sep: str, default '\t'
71 |
72 | :param del_folds: Delete folds after evaluation
73 | :type del_folds: bool, default False
74 |
75 | :param random_seed: Random seed
76 | :type random_seed: int, default None
77 |
78 | """
79 |
80 | self.input_file = input_file
81 | self.recommender = recommender
82 | self.dir_folds = dir_folds
83 | self.k_folds = k_folds
84 | self.header = header
85 | self.sep = sep
86 | self.write_predictions = write_predictions
87 | self.write_sep = write_sep
88 | self.recommender_verbose = recommender_verbose
89 | self.evaluation_in_fold_verbose = evaluation_in_fold_verbose
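# Note: when metrics is None, execute_algorithm() fills it in with the metric
# names found in the recommender's evaluation results of the first fold.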
90 | self.metrics = metrics
91 | self.as_table = as_table
92 | self.table_sep = table_sep
93 | self.del_folds = del_folds
94 | self.random_seed = random_seed
95 |
96 | # internal vars
97 | self.folds_results = defaultdict(list)
98 |
99 | def generate_folds(self):
100 | """
101 | Method to generate folds with k-fold cross-validation
102 |
103 | """
104 |
105 | SplitDatabase(input_file=self.input_file, n_splits=self.k_folds, dir_folds=self.dir_folds,
106 | sep_read=self.sep, header=self.header).k_fold_cross_validation(random_state=self.random_seed)
107 |
108 | def execute_algorithm(self):
109 | """
110 | Method to run the recommender algorithm in the k folds
111 |
112 | """
113 |
114 | for k in range(self.k_folds):
115 | train_file = self.dir_folds + 'folds/%d/train.dat' % k
116 | test_file = self.dir_folds + 'folds/%d/test.dat' % k
117 |
118 | self.recommender.train_file = train_file
119 | self.recommender.test_file = test_file
120 |
121 | if self.write_predictions:
122 | output_file = self.dir_folds + 'folds/%d/output.dat' % k
123 | self.recommender.output_file = output_file
124 |
125 | self.recommender.compute(verbose=self.recommender_verbose,
126 | verbose_evaluation=self.evaluation_in_fold_verbose, metrics=self.metrics)
127 |
128 | if self.metrics is None:
129 | self.metrics = self.recommender.evaluation_results.keys()
130 |
131 | for metric in self.metrics:
132 | self.folds_results[metric.upper()].append(self.recommender.evaluation_results[metric.upper()])
133 |
134 | def evaluate(self, verbose=True):
135 | """
136 | Method to evaluate the fold results and generate the mean and the standard deviation
137 |
138 | :param verbose: If True, print evaluation results
139 | :type verbose: bool, default True
140 |
141 | """
142 |
143 | mean_dict = defaultdict(dict)
144 | std_dict = defaultdict(dict)
145 |
146 | for metric in self.metrics:
147 | mean_dict[metric.upper()] = np.mean(self.folds_results[metric.upper()])
148 | std_dict[metric.upper()] = np.std(self.folds_results[metric.upper()])
149 |
150 | if verbose:
151 | if self.as_table:
152 | header = ''
153 | values_mean = ''
154 | values_std = ''
155 | for metric in self.metrics:
156 | header += metric.upper() + self.table_sep
157 | values_mean += str(round(mean_dict[metric.upper()], 6)) + self.table_sep
158 | values_std += str(round(std_dict[metric.upper()], 6)) + self.table_sep
159 | print('Metric%s%s' % (self.table_sep, header))
160 | print('Mean%s%s' % (self.table_sep, values_mean))
161 | print('STD%s%s' % (self.table_sep, values_std))
162 | else:
163 | evaluation_mean = 'Mean:: '
164 | evaluation_std = 'STD:: '
165 | for metric in self.metrics:
166 | evaluation_mean += "%s: %.6f " % (metric.upper(), mean_dict[metric.upper()])
167 | evaluation_std += "%s: %.6f " % (metric.upper(), std_dict[metric.upper()])
168 | print(evaluation_mean)
169 | print(evaluation_std)
170 |
171 | def erase_folds(self):
172 | """
173 | Method to delete folds after evaluation
174 |
175 | """
176 |
177 | folds = self.dir_folds + 'folds/'
178 | shutil.rmtree(folds)
179 |
180 | def compute(self, verbose=True):
181 | """
182 | Method to run the cross validation
183 |
184 | :param verbose: If True, print header
185 | :type verbose: bool, default True
186 |
187 | """
188 |
189 | if verbose:
190 |
191 | print("[Case Recommender: Cross Validation]\n")
192 | print("Database:: %s \nRecommender Algorithm:: %s | K Folds: %d\n" % (self.input_file,
193 | self.recommender.recommender_name,
194 | self.k_folds))
195 |
196 | self.generate_folds()
197 | self.execute_algorithm()
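# Aggregate the per-fold results into their mean and standard deviation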
198 | self.evaluate(verbose)
199 |
200 | if self.del_folds:
201 | self.erase_folds()
202 |
-------------------------------------------------------------------------------- /caserec/utils/extra_functions.py: --------------------------------------------------------------------------------
1 | # coding=utf-8
2 | """
3 | This file has some auxiliary functions for Case Recommender. Methods:
4 | - check_error_file: check if a file exists
5 | - check_len_lists: check if two lists have the same length
6 | - timed: measure the execution time of a function
7 | - print_header: print header in the algorithms
8 |
9 | """
10 |
11 | # © 2019. Case Recommender (MIT License)
12 |
13 | import sys
14 | import time
15 |
16 | __author__ = 'Arthur Fortes <fortes.arthur@gmail.com>'
17 |
18 |
19 | def check_error_file(file_check):
20 | """
21 | Function to check if a file exists
22 |
23 | :param file_check: File to check
24 | :type file_check: str
25 |
26 | """
27 |
28 | try:
29 | open(file_check).close()  # open and immediately close; raises if the path is invalid
30 | except TypeError:
31 | raise TypeError("File cannot be empty or file is invalid: " + str(file_check))
32 |
33 |
34 | def check_len_lists(list1, list2):
35 | """
36 | Function to check if two lists have the same length
37 |
38 | :param list1: First list
39 | :type list1: list
40 |
41 | :param list2: Second list
42 | :type list2: list
43 |
44 | """
45 |
46 | if len(list1) != len(list2):
47 | print("Error: Number of files in train list and rank list must be equal!")
48 | sys.exit()
49 |
50 |
51 | def timed(f):
52 | """
53 | Function to calculate the execution time of a function
54 |
55 | :param f: Function name without ()
56 | :type f: callable
57 |
58 | :return: Time of execution
59 | :rtype: float
60 |
61 | """
62 | start = time.time()
63 | f()
64 | elapsed = time.time() - start
65 | return elapsed
66 |
67 |
68 | def print_header(header_info, test_info=None):
69 | """
70 | Function to print the header with information of the files
71 |
72 | :param header_info: Dictionary with information about dataset or train file
73 | :type header_info: dict
74 |
75 | :param test_info: Dictionary with information about test file
76 | :type test_info: dict
77 |
78 | """
79 |
80 | print("[Case Recommender: %s]\n" % header_info['title'])
81 | print("train data:: %d users and %d items (%d interactions) | sparsity:: %.2f%%" %
82 | (header_info['n_users'], header_info['n_items'], header_info['n_interactions'], header_info['sparsity']))
83 |
84 | if test_info is not None:
85 | print("test data:: %d users and %d items (%d interactions) | sparsity:: %.2f%%\n" %
86 | (test_info['n_users'], test_info['n_items'], test_info['n_interactions'], test_info['sparsity']))
87 |
88 |
89 | class ComputeBui(object):
90 | """
91 | Compute baselines based on training information considering information about users and items
92 |
93 | """
94 | def __init__(self, training_set):
95 | """
96 |
97 | :param training_set: Dictionary returned by ReadFile with method read()
98 | :type training_set: dict
99 | """
100 | self.training_set = training_set
101 | self.bu = dict()
102 | self.bi = dict()
103 | self.bui = dict()
104 |
105 | def train_baselines(self):
106 | for i in range(10):
107 | self.compute_bi()
108 | self.compute_bu()
109 | self.compute_bui()
110 |
111 | def compute_bi(self):
112 | # bi = (r_ui - mean - bu) / (reg_bi + number of interactions)
113 | self.bi = dict()
114 |
115 | for item in self.training_set['items']:
116 | cont = 0
117 | for user in self.training_set['users_viewed_item'][item]:
118 | self.bi[item] = self.bi.get(item, 0) + float(self.training_set['feedback'][user][item]) - \
119 | self.training_set['mean_value'] - self.bu.get(user, 0)
120 | cont += 1
121 | if cont > 1:
122 | self.bi[item] = float(self.bi[item]) / float(10 + cont)
123 |
124 | def compute_bu(self):
125 | # bu = (r_ui - mean - bi) / (reg_bu + number of interactions)
126 | self.bu = dict()
127 | for user in self.training_set['users']:
128 | cont = 0
129 | for item in self.training_set['items_seen_by_user'][user]:
130 | self.bu[user] = self.bu.get(user, 0) + float(self.training_set['feedback'][user][item]) - \
131 | self.training_set['mean_value'] - self.bi.get(item, 0)
132 | cont += 1
133 | if cont > 1:
134 | self.bu[user] = float(self.bu[user]) / float(15 + cont)
135 |
136 | def compute_bui(self):
137 | # bui = mean + bu + bi
138 | for user in self.training_set['users']:
139 | for item in self.training_set['items']:
140 | try:
141 | self.bui.setdefault(user, {}).update(
142 | {item: self.training_set['mean_value'] + self.bu[user] + self.bi[item]})
143 | except KeyError:
144 | self.bui.setdefault(user, {}).update({item: self.training_set['mean_value']})
145 |
146 | def execute(self):
147 | self.train_baselines()
148 | return self.bui
149 |
-------------------------------------------------------------------------------- /caserec/utils/split_database.py: --------------------------------------------------------------------------------
1 | # coding=utf-8
2 | """
3 | This class is responsible for dividing databases into k folds with two strategies:
4 | k-fold cross-validation or ShuffleSplit
5 |
6 | """
7 |
8 | # © 2019. Case Recommender (MIT License)
9 |
10 |
11 | from sklearn.model_selection import KFold, ShuffleSplit
12 | import os
13 |
14 | from caserec.utils.process_data import ReadFile, WriteFile
15 |
16 | __author__ = 'Arthur Fortes <fortes.arthur@gmail.com>'
17 |
18 |
19 | class SplitDatabase(ReadFile):
20 | def __init__(self, input_file, dir_folds=None, n_splits=10, sep_read='\t', sep_write='\t', header=None,
21 | names=None, as_binary=False, binary_col=None, write_mode='w'):
22 | """
23 | Given a database, this class is responsible for creating training and test sets
24 | for k folds with well-known strategies:
25 |
26 | - k-fold cross-validation
27 | - ShuffleSplit
28 |
29 | Usage:
30 |
31 | >> SplitDatabase(input_file=database, dir_folds=dir_path, n_splits=10).k_fold_cross_validation()
32 | >> SplitDatabase(input_file=database, dir_folds=dir_path, n_splits=10).shuffle_split(test_size=0.3)
33 | # To use only one fold, you should use only shuffle_split. k_fold_cross_validation works only with
34 | # n_splits >= 2:
35 | >> SplitDatabase(input_file=database, dir_folds=dir_path, n_splits=1).shuffle_split(test_size=0.1)
36 |
37 | :param input_file: Input file with at least 2 columns.
38 | :type input_file: str
39 |
40 | :param dir_folds: Directory to write folds (train and test files)
41 | :type dir_folds: str
42 |
43 | :param n_splits: How many folds to divide the database into
44 | :type n_splits: int, default 10
45 |
46 | :param sep_read: Delimiter for input files
47 | :type sep_read: str, default '\t'
48 |
49 | :param sep_write: Delimiter for output files
50 | :type sep_write: str, default '\t'
51 |
52 | :param header: Skip header line (only works with method: read_with_pandas)
53 | :type header: int, default None
54 |
55 | :param names: Name of columns (only works with method: read_with_pandas)
56 | :type names: str, default None
57 |
58 | :param as_binary: If True, the explicit feedback will be transformed to binary
59 | :type as_binary: bool, default False
60 |
61 | :param binary_col: Index of columns to read as binary (only works with method: read_with_pandas)
62 | :type binary_col: int, default None
63 |
64 | :param write_mode: Mode used to write the files ('w' overwrites, 'a' appends)
65 | :type write_mode: str, default 'w'
66 |
67 | """
68 |
69 | super(SplitDatabase, self).__init__(input_file, sep=sep_read, header=header, names=names, as_binary=as_binary,
70 | binary_col=binary_col)
71 |
72 | self.dir_folds = dir_folds
73 | self.n_splits = n_splits
74 | self.sep_write = sep_write
75 | self.write_mode = write_mode
76 | self.df = self.read_with_pandas()
77 |
78 | if self.dir_folds is not None:
79 | self.create_folds()
80 |
81 | def create_folds(self):
82 | self.dir_folds += "folds/"
83 | if not os.path.exists(self.dir_folds):
84 | os.mkdir(self.dir_folds)
85 |
86 | for n in range(self.n_splits):
87 | if not os.path.exists(self.dir_folds + str(n)):
88 | os.mkdir(self.dir_folds + str(n))
89 |
90 | def write_files(self, trained_model):
91 | fold = 0
92 | for train_index, test_index in trained_model:
93 | if self.dir_folds is not None:
94 | train_file = self.dir_folds + str(fold) + '/train.dat'
95 | test_file = self.dir_folds + str(fold) + '/test.dat'
96 |
97 | df_train = self.df.iloc[train_index]
98 | df_test = self.df.iloc[test_index]
99 |
100 | WriteFile(train_file, sep=self.sep_write, mode=self.write_mode
101 | ).write_with_pandas(df_train.sort_values(by=[0, 1]))
102 | WriteFile(test_file, sep=self.sep_write, mode=self.write_mode
103 | ).write_with_pandas(df_test.sort_values(by=[0, 1]))
104 |
105 | fold += 1
106 |
107 | def k_fold_cross_validation(self, shuffle=True, random_state=None):
108 | """
109 | k-fold cross-validation
110 |
111 | In k-fold cross-validation, the original sample is randomly partitioned into
112 | k equal-sized subsamples. Of the k subsamples, a single subsample is retained as
113 | the validation data for testing the model, and the remaining k − 1 subsamples are
114 | used as training data. The cross-validation process is then repeated k times (the folds),
115 | with each of the k subsamples used exactly once as the validation data.
116 |
117 | The k results from the folds can then be averaged (or otherwise combined) to produce a
118 | single estimation.
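For example, with n_splits=10, each fold keeps roughly 90% of the interactions for training and holds out the remaining 10% for testing.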
Reference: https://en.wikipedia.org/wiki/Cross-validation_(statistics)
119 |
120 | :param shuffle: Whether to shuffle the data before splitting it into folds
121 | :type shuffle: bool, default True
122 |
123 | :param random_state: Random seed for reproducibility
124 | :type random_state: int, default None
125 |
126 | :return: None. When dir_folds is set, train and test files are written for each fold.
127 | """
128 |
129 | kfold = KFold(n_splits=self.n_splits, shuffle=shuffle, random_state=random_state)
130 | trained_model = list(kfold.split(self.df))
131 |
132 | if self.dir_folds is not None:
133 | self.write_files(trained_model)
134 |
135 | def shuffle_split(self, test_size=0.1, random_state=None):
136 | """
137 | Shuffle Split
138 |
139 | Random permutation cross-validator
140 |
141 | Yields indices to split data into training and test sets.
142 |
143 | Note: contrary to other cross-validation strategies, random splits do not guarantee that
144 | all folds will be different, although this is still very likely for sizeable databases.
145 |
146 | :param test_size: Proportion of the dataset to include in the test split
147 | :type test_size: float, default 0.1
148 |
149 | :param random_state: Random seed for reproducibility
150 | :type random_state: int, default None
151 |
152 | :return: None. When dir_folds is set, train and test files are written for each split.
153 | """
154 | ss = ShuffleSplit(n_splits=self.n_splits, test_size=test_size, random_state=random_state)
155 | trained_model = list(ss.split(self.df))
156 |
157 | if self.dir_folds is not None:
158 | self.write_files(trained_model)
159 |
-------------------------------------------------------------------------------- /examples/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/caserec/CaseRecommender/779bc5dd91ff704ce60c0f3fafd07e2eba689dd7/examples/__init__.py
-------------------------------------------------------------------------------- /examples/ranking_content_based.py: --------------------------------------------------------------------------------
1 | from caserec.recommenders.item_recommendation.content_based import ContentBased
2 | from caserec.recommenders.item_recommendation.item_attribute_knn import ItemAttributeKNN
3 |
4 | train = '../../datasets/ml-100k/folds/0/train.dat'
5 | test = '../../datasets/ml-100k/folds/0/test.dat'
6 | rank_cb = '../../datasets/ml-100k/folds/0/rank_cb.dat'
7 | rank_attr = '../../datasets/ml-100k/folds/0/rank_attr.dat'
8 | similarity = '../../datasets/ml-100k/folds/0/vsm.dat'
9 | top_n = 10
10 | metrics = ('PREC', 'RECALL', 'NDCG', 'MAP')
11 |
12 | ItemAttributeKNN(train, test, similarity_file=similarity, output_file=rank_attr, rank_length=50).\
13 | compute(metrics=metrics, n_ranks=[10, 20, 50])
14 | ContentBased(train, test, similarity_file=similarity, output_file=rank_cb, rank_length=50).\
15 | compute(metrics=metrics, n_ranks=[10, 20, 50])
16 |
-------------------------------------------------------------------------------- /examples/ranking_knn.py: --------------------------------------------------------------------------------
1 | """
2 | Running KNN Recommenders [Item Recommendation]
3 |
4 | - Cross Validation
5 | - Simple
6 |
7 | """
8 |
9 | from caserec.recommenders.item_recommendation.user_attribute_knn import UserAttributeKNN
10 | from caserec.recommenders.item_recommendation.item_attribute_knn import ItemAttributeKNN
11 | from caserec.recommenders.item_recommendation.itemknn import ItemKNN
12 | from caserec.recommenders.item_recommendation.userknn import UserKNN
13 | from caserec.utils.cross_validation import CrossValidation
14 |
15 | db = '../../datasets/ml-100k/u.data'
16 | folds_path = '../../datasets/ml-100k/'
17 |
18 | metadata_item = '../../datasets/ml-100k/db_item_subject.dat'
19 | sm_item = '../../datasets/ml-100k/sim_item.dat'
20 | metadata_user = '../../datasets/ml-100k/metadata_user.dat'
21 | sm_user = '../../datasets/ml-100k/sim_user.dat'
22 |
23 | tr = '../../datasets/ml-100k/folds/0/train.dat'
24 | te = '../../datasets/ml-100k/folds/0/test.dat'
25 |
26 | """
27 |
28 | UserKNN
29 |
30 | """
31 |
32 | # # Cross Validation
33 | # recommender = UserKNN(as_binary=True)
34 | #
35 | # CrossValidation(input_file=db, recommender=recommender, dir_folds=folds_path, header=1, k_folds=5).compute()
36 | #
37 | # # Simple
38 | # UserKNN(tr, te, as_binary=True).compute()
39 | UserAttributeKNN(tr, te, metadata_file=metadata_user).compute()
40 | # UserAttributeKNN(tr, te, similarity_file=sm_user).compute()
41 |
42 | """
43 |
44 | ItemKNN
45 |
46 | """
47 |
48 | # # Cross Validation
49 | # recommender = ItemKNN(as_binary=True)
50 | #
51 | # CrossValidation(input_file=db, recommender=recommender, dir_folds=folds_path, header=1, k_folds=5).compute()
52 | #
53 | # # Simple
54 | # ItemKNN(tr, te, as_binary=True).compute()
55 | # ItemAttributeKNN(tr, te, metadata_file=metadata_item).compute()
56 | # ItemAttributeKNN(tr, te, similarity_file=sm_item).compute()
57 |
-------------------------------------------------------------------------------- /examples/ranking_mp.py: --------------------------------------------------------------------------------
1 | """
2 | Running Most Popular Recommender [Item Recommendation]
3 |
4 | - Cross Validation
5 | - Simple
6 |
7 | """
8 |
9 | from caserec.recommenders.item_recommendation.most_popular import MostPopular
10 | from caserec.utils.cross_validation import CrossValidation
11 |
12 | db = '../../datasets/ml-100k/u.data'
13 | folds_path = '../../datasets/ml-100k/'
14 |
15 | tr = '../../datasets/ml-100k/folds/0/train.dat'
16 | te = '../../datasets/ml-100k/folds/0/test.dat'
17 |
18 | # Cross Validation
19 | recommender = MostPopular(as_binary=True)
20 |
21 | CrossValidation(input_file=db, recommender=recommender, dir_folds=folds_path, header=1, k_folds=5).compute()
22 |
23 | # Simple
24 | MostPopular(tr, te, as_binary=True).compute()
25 |
-------------------------------------------------------------------------------- /examples/ranking_others.py: --------------------------------------------------------------------------------
1 | """
2 |
3 | Running item recommendation algorithms
4 |
5 | """
6 | from caserec.recommenders.item_recommendation.bprmf import BprMF
7 |
8 | tr = '../../datasets/ml-100k/folds/0/train.dat'
9 | te = '../../datasets/ml-100k/folds/0/test.dat'
10 |
11 |
12 | BprMF(tr, te, batch_size=30).compute()
13 |
-------------------------------------------------------------------------------- /examples/ranking_rating_based_algorithm.py: --------------------------------------------------------------------------------
1 | """
2 | Running Precision and Recall metrics on rating-based algorithms
3 |
4 | """
5 |
6 | from caserec.recommenders.rating_prediction.matrixfactorization import MatrixFactorization
7 | from caserec.recommenders.rating_prediction.nnmf import NNMF
8 | from caserec.utils.process_data import ReadFile
9 | from caserec.evaluation.rating_prediction import RatingPredictionEvaluation
10 |
11 | tr = '../../datasets/ml-100k/folds/0/train.dat'
12 | te = '../../datasets/ml-100k/folds/0/test.dat'
13 |
14 | # File where the model's predictions will be saved
15 | predictions_output_filepath = './predictions_output.dat'
16 |
17 | # Creating the model and computing the train / test sets
18 | # model = MatrixFactorization(tr, te, output_file = predictions_output_filepath)
19 | model = NNMF(tr, te, output_file = 
predictions_output_filepath) 20 | 21 | model.compute(verbose=False) 22 | 23 | # Using ReadFile class to read predictions from file 24 | reader = ReadFile(input_file=predictions_output_filepath) 25 | predictions = reader.read() 26 | 27 | # Creating evaluator with item-recommendation parameters 28 | evaluator = RatingPredictionEvaluation(sep = '\t', n_rank = [10], as_rank = True, metrics = ['PREC']) 29 | 30 | # Getting evaluation 31 | item_rec_metrics = evaluator.evaluate(predictions['feedback'], model.test_set) 32 | 33 | print ('\nItem Recommendation Metrics:\n', item_rec_metrics) 34 | 35 | model.predict() 36 | 37 | print ('\nOriginal Rating Prediction Metrics:\n', model.evaluation_results) -------------------------------------------------------------------------------- /examples/rating_prediction_knn.py: -------------------------------------------------------------------------------- 1 | """ 2 | Running KNN Recommenders [Rating Prediction] 3 | 4 | - Cross Validation 5 | - Simple 6 | 7 | """ 8 | 9 | from caserec.recommenders.rating_prediction.user_attribute_knn import UserAttributeKNN 10 | from caserec.recommenders.rating_prediction.item_attribute_knn import ItemAttributeKNN 11 | from caserec.recommenders.rating_prediction.itemknn import ItemKNN 12 | from caserec.recommenders.rating_prediction.userknn import UserKNN 13 | from caserec.utils.cross_validation import CrossValidation 14 | 15 | db = '../../datasets/ml-100k/u.data' 16 | folds_path = '../../datasets/ml-100k/' 17 | 18 | metadata_item = '../datasets/ml-100k/db_item_subject.dat' 19 | sm_item = '../datasets/ml-100k/sim_item.dat' 20 | metadata_user = '../datasets/ml-100k/metadata_user.dat' 21 | sm_user = '../datasets/ml-100k/sim_user.dat' 22 | 23 | tr = '../datasets/ml-100k/folds/0/train.dat' 24 | te = '../datasets/ml-100k/folds/0/test.dat' 25 | 26 | """ 27 | 28 | UserKNN 29 | 30 | """ 31 | 32 | # # Cross Validation 33 | # recommender = UserKNN() 34 | # 35 | # CrossValidation(input_file=db, recommender=recommender, dir_folds=folds_path, header=1, k_folds=5).compute() 36 | # 37 | # # # Simple 38 | # UserKNN(tr, te).compute() 39 | # # UserAttributeKNN(tr, te, metadata_file=metadata_user).compute() 40 | # # UserAttributeKNN(tr, te, similarity_file=sm_user).compute() 41 | 42 | """ 43 | 44 | ItemKNN 45 | 46 | """ 47 | 48 | # # Cross Validation 49 | recommender = ItemKNN() 50 | 51 | CrossValidation(input_file=db, recommender=recommender, dir_folds=folds_path, header=1, k_folds=5).compute() 52 | # 53 | # # Simple 54 | # ItemKNN(tr, te).compute() 55 | # ItemAttributeKNN(tr, te, metadata_file=metadata_item).compute() 56 | # ItemAttributeKNN(tr, te, similarity_file=sm_item).compute() 57 | -------------------------------------------------------------------------------- /examples/rating_prediction_mf.py: -------------------------------------------------------------------------------- 1 | """ 2 | Running MF / SVD Recommenders [Rating Prediction] 3 | 4 | - Cross Validation 5 | - Simple 6 | 7 | """ 8 | 9 | from caserec.recommenders.rating_prediction.svdplusplus import SVDPlusPlus 10 | from caserec.recommenders.rating_prediction.nnmf import NNMF 11 | from caserec.recommenders.rating_prediction.matrixfactorization import MatrixFactorization 12 | from caserec.utils.cross_validation import CrossValidation 13 | 14 | db = '../../datasets/ml-100k/u.data' 15 | folds_path = '../../datasets/ml-100k/' 16 | 17 | metadata_item = '../../datasets/ml-100k/db_item_subject.dat' 18 | sm_item = '../../datasets/ml-100k/sim_item.dat' 19 | metadata_user = 
'../../datasets/ml-100k/metadata_user.dat'
20 | sm_user = '../../datasets/ml-100k/sim_user.dat'
21 |
22 | tr = '../../datasets/ml-100k/folds/0/train.dat'
23 | te = '../../datasets/ml-100k/folds/0/test.dat'
24 |
25 | """
26 |
27 | MatrixFactorization / SVD++ / NNMF
28 |
29 | """
30 |
31 | # Cross Validation
32 | # recommender = MatrixFactorization()
33 |
34 | # CrossValidation(input_file=db, recommender=recommender, dir_folds=folds_path, header=1, k_folds=5).compute()
35 |
36 | # # Simple
37 | # MatrixFactorization(tr, te).compute()
38 | # SVDPlusPlus(tr, te).compute()
39 |
40 | NNMF(tr, te, factors = 20).compute()
-------------------------------------------------------------------------------- /requirements.txt: --------------------------------------------------------------------------------
1 | # requirements
2 | pandas
3 | scikit-learn
4 | scipy
5 | numpy
6 |
-------------------------------------------------------------------------------- /setup.cfg: --------------------------------------------------------------------------------
1 | [metadata]
2 | description-file = README.md
3 |
4 | [bdist_wheel]
5 | universal=1
6 |
-------------------------------------------------------------------------------- /setup.py: --------------------------------------------------------------------------------
1 | """
2 | Setup for Case Recommender
3 |
4 | """
5 |
6 | # © 2019. Case Recommender (MIT License)
7 |
8 | from distutils.core import setup
9 | from setuptools import find_packages
10 | from os import path
11 |
12 | here = path.abspath(path.dirname(__file__))
13 |
14 | __author__ = 'Arthur Fortes <fortes.arthur@gmail.com>'
15 |
16 | # Get the long description from the README file
17 | with open(path.join(here, 'README.rst'), encoding='utf-8') as f:
18 | long_description = f.read()
19 |
20 | # Get requirements
21 | REQUIRED_PACKAGES = [
22 | 'numpy',
23 | 'scipy',
24 | 'scikit-learn',
25 | 'pandas'
26 | ]
27 |
28 | setup(
29 | name='CaseRecommender',
30 | packages=find_packages(),
31 | version='1.1.1',
32 | license='MIT License',
33 | description='A recommender systems framework for Python',
34 | long_description=long_description,
35 | install_requires=REQUIRED_PACKAGES,
36 |
37 | author='Arthur Fortes',
38 | author_email='fortes.arthur@gmail.com',
39 |
40 | url='https://github.com/caserec/CaseRecommender',
41 | download_url='https://github.com/caserec/CaseRecommender/archive/master.zip',
42 |
43 | keywords=['recommender systems', 'framework', 'collaborative filtering', 'content-based filtering',
44 | 'recommendation'],
45 |
46 | classifiers=[
47 | # Indicate who your project is intended for
48 | 'Intended Audience :: Developers',
49 | 'Topic :: Software Development :: Build Tools',
50 |
51 | 'License :: OSI Approved :: MIT License',
52 |
53 | 'Programming Language :: Python :: 3',
54 | 'Programming Language :: Python :: 3.4',
55 | 'Programming Language :: Python :: 3.5',
56 | 'Programming Language :: Python :: 3.6',
57 | ],
58 | )
59 |
--------------------------------------------------------------------------------
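Editorial closing note: after installation (pip install CaseRecommender, matching the name declared in setup() above), a minimal smoke test is to import representative recommenders from both families; the import paths below are taken verbatim from the example scripts in this repository.

    # Editorial sketch, not part of the repository: check that representative
    # recommenders from both families import cleanly after installation.
    from caserec.recommenders.rating_prediction.svdplusplus import SVDPlusPlus
    from caserec.recommenders.item_recommendation.most_popular import MostPopular

    print(SVDPlusPlus.__name__, MostPopular.__name__)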