├── requirements.txt ├── .coveragerc ├── setup.cfg ├── doc ├── source │ ├── dump.rst │ ├── slope_one.rst │ ├── algobase.rst │ ├── accuracy.rst │ ├── co_clustering.rst │ ├── evaluate.rst │ ├── similarities.rst │ ├── dataset.rst │ ├── predictions_module.rst │ ├── basic_algorithms.rst │ ├── matrix_factorization.rst │ ├── prediction_algorithms_package.rst │ ├── spelling_wordlist.txt │ ├── knn_inspired.rst │ ├── index.rst │ ├── notation_standards.rst │ ├── refs.bib │ ├── building_custom_algo.rst │ ├── FAQ.rst │ ├── prediction_algorithms.rst │ ├── getting_started.rst │ └── conf.py ├── Makefile └── make.bat ├── tests ├── custom_test ├── custom_dataset ├── custom_train ├── test_pep8.py ├── test_evaluate.py ├── test_co_clustering.py ├── test_dump.py ├── test_grid_search.py ├── test_accuracy.py ├── test_reader.py ├── test_NMF.py ├── test_sim_options.py ├── test_algorithms.py ├── test_bsl_options.py ├── test_dataset.py ├── test_SVD.py └── test_similarities.py ├── MANIFEST.in ├── requirements_travis.txt ├── requirements_dev.txt ├── .travis.yml ├── .gitignore ├── examples ├── building_custom_algorithms │ ├── most_basic_algorithm.py │ ├── mean_rating_user_item.py │ ├── most_basic_algorithm2.py │ └── with_baselines_or_sim.py ├── basic_usage.py ├── iterate_over_folds.py ├── similarity_conf.py ├── serialize_algorithm.py ├── load_custom_dataset.py ├── evaluate_on_trainset.py ├── query_for_predictions.py ├── load_custom_dataset_predefined_folds.py ├── grid_search_usage.py ├── baselines_conf.py ├── split_data_for_unbiased_estimation.py ├── top_n_recommendations.py └── k_nearest_neighbors.py ├── surprise ├── prediction_algorithms │ ├── __init__.py │ ├── baseline_only.py │ ├── random_pred.py │ ├── predictions.py │ ├── optimize_baselines.pyx │ ├── slope_one.pyx │ ├── co_clustering.pyx │ └── algo_base.py ├── __init__.py ├── dump.py ├── accuracy.py └── __main__.py ├── LICENSE.md ├── CONTRIBUTING.md ├── CHANGELOG.md ├── setup.py ├── TODO.md └── README.md /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy>=1.11.2 2 | six>=1.10.0 3 | -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | [report] 2 | omit = surprise/__main__.py 3 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file=README.md 3 | -------------------------------------------------------------------------------- /doc/source/dump.rst: -------------------------------------------------------------------------------- 1 | .. _dump_module: 2 | 3 | dump module 4 | =============== 5 | 6 | .. 
automodule:: surprise.dump 7 | :members: 8 | -------------------------------------------------------------------------------- /tests/custom_test: -------------------------------------------------------------------------------- 1 | There are three lines to be ignored 2 | Line format is user - item - rating 3 | 4 | user3 item0 5 5 | user0 item1 1 6 | user_neverseen item_neverseen 5 7 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | include LICENSE.md 3 | include requirements.txt 4 | recursive-include doc * 5 | recursive-include examples * 6 | recursive-include surprise *.c *.pyx 7 | -------------------------------------------------------------------------------- /doc/source/slope_one.rst: -------------------------------------------------------------------------------- 1 | .. _pred_package_slope_one: 2 | 3 | Slope One 4 | --------- 5 | 6 | .. autoclass:: surprise.prediction_algorithms.slope_one.SlopeOne 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /tests/custom_dataset: -------------------------------------------------------------------------------- 1 | There are three lines to be ignored 2 | Line format is user - item - rating 3 | 4 | user0 item0 4 5 | user1 item0 4 6 | user2 item0 1 7 | user3 item1 5 8 | user4 item1 1 9 | -------------------------------------------------------------------------------- /doc/source/algobase.rst: -------------------------------------------------------------------------------- 1 | .. _pred_package_algo_base: 2 | 3 | The algorithm base class 4 | ------------------------ 5 | 6 | .. automodule:: surprise.prediction_algorithms.algo_base 7 | :members: 8 | -------------------------------------------------------------------------------- /doc/source/accuracy.rst: -------------------------------------------------------------------------------- 1 | .. _accuracy: 2 | 3 | accuracy module 4 | =================== 5 | 6 | 7 | .. automodule:: surprise.accuracy 8 | :members: 9 | :undoc-members: 10 | :show-inheritance: 11 | -------------------------------------------------------------------------------- /doc/source/co_clustering.rst: -------------------------------------------------------------------------------- 1 | .. _pred_package_co_clustering: 2 | 3 | Co-clustering 4 | ------------- 5 | 6 | .. autoclass:: surprise.prediction_algorithms.co_clustering.CoClustering 7 | :show-inheritance: 8 | 9 | -------------------------------------------------------------------------------- /tests/custom_train: -------------------------------------------------------------------------------- 1 | There are three lines to be ignored 2 | Line format is user - item - rating 3 | 4 | user0 item0 4 5 | user1 item0 4 6 | user1 item1 2 7 | user2 item0 1 8 | user2 item1 1 9 | user3 item1 5 10 | -------------------------------------------------------------------------------- /requirements_travis.txt: -------------------------------------------------------------------------------- 1 | # Requirements file for development 2 | numpy>=1.11.2 3 | Cython>=0.24.1 4 | six>=1.10.0 5 | pytest>=3.0.3 6 | sphinx>=1.4.9 7 | sphinx_rtd_theme 8 | sphinxcontrib-bibtex 9 | flake8>=3.2.1 10 | 11 | -------------------------------------------------------------------------------- /doc/source/evaluate.rst: -------------------------------------------------------------------------------- 1 | .. 
_evaluate: 2 | 3 | evaluate module 4 | =============== 5 | 6 | .. automodule:: surprise.evaluate 7 | :members: 8 | :exclude-members: CaseInsensitiveDefaultDict, CaseInsensitiveDefaultDictForBestResults 9 | -------------------------------------------------------------------------------- /doc/source/similarities.rst: -------------------------------------------------------------------------------- 1 | .. _similarities: 2 | 3 | similarities module 4 | =================== 5 | 6 | .. automodule:: surprise.similarities 7 | :members: 8 | :exclude-members: compute_mean_diff 9 | :show-inheritance: 10 | -------------------------------------------------------------------------------- /requirements_dev.txt: -------------------------------------------------------------------------------- 1 | # Requirements file for development 2 | numpy>=1.11.2 3 | Cython>=0.24.1 4 | six>=1.10.0 5 | pytest>=3.0.3 6 | sphinx>=1.4.9 7 | sphinx_rtd_theme 8 | sphinxcontrib-bibtex 9 | sphinxcontrib-spelling 10 | flake8>=3.2.1 11 | -------------------------------------------------------------------------------- /doc/source/dataset.rst: -------------------------------------------------------------------------------- 1 | .. _dataset: 2 | 3 | dataset module 4 | =================== 5 | 6 | .. automodule:: surprise.dataset 7 | :members: 8 | :exclude-members: BuiltinDataset, read_ratings, DatasetUserFolds, 9 | parse_line 10 | -------------------------------------------------------------------------------- /doc/source/predictions_module.rst: -------------------------------------------------------------------------------- 1 | .. _pred_package_predictions: 2 | 3 | The predictions module 4 | ------------------------ 5 | 6 | .. automodule:: surprise.prediction_algorithms.predictions 7 | :members: 8 | :exclude-members: all_ratings, all_xs, all_ys 9 | 10 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "2.7" 4 | - "3.5" 5 | 6 | # command to install dependencies 7 | install: 8 | - "pip install -r requirements_travis.txt" 9 | - "pip install ." 10 | 11 | # command to run tests 12 | script: 13 | - pytest 14 | 15 | # safelist 16 | branches: 17 | only: 18 | - master 19 | -------------------------------------------------------------------------------- /doc/source/basic_algorithms.rst: -------------------------------------------------------------------------------- 1 | .. _pred_package_basic_algorithms: 2 | 3 | Basic algorithms 4 | ---------------- 5 | 6 | These are basic algorithms that do not do much work but that are still useful 7 | for comparing accuracies. 8 | 9 | .. autoclass:: surprise.prediction_algorithms.random_pred.NormalPredictor 10 | :show-inheritance: 11 | 12 | .. 
autoclass:: surprise.prediction_algorithms.baseline_only.BaselineOnly
13 |    :show-inheritance:
14 | 
15 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | *~
3 | *.swp
4 | 
5 | doc/build
6 | .ipynb_checkpoints/
7 | .cache/
8 | 
9 | scikit_surprise.egg-info/
10 | 
11 | build
12 | dist/
13 | surprise/similarities.c
14 | surprise/prediction_algorithms/matrix_factorization.c
15 | surprise/prediction_algorithms/optimize_baselines.c
16 | surprise/prediction_algorithms/slope_one.c
17 | surprise/prediction_algorithms/co_clustering.c
18 | *.so
19 | .idea/*
20 | 
21 | Gemfile.lock
22 | _site
23 | 
24 | .coverage
25 | 
--------------------------------------------------------------------------------
/doc/source/matrix_factorization.rst:
--------------------------------------------------------------------------------
1 | .. _pred_package_matrix_factorization:
2 | 
3 | Matrix Factorization-based algorithms
4 | -------------------------------------
5 | 
6 | .. autoclass:: surprise.prediction_algorithms.matrix_factorization.SVD
7 |    :show-inheritance:
8 | 
9 | .. autoclass:: surprise.prediction_algorithms.matrix_factorization.SVDpp
10 |    :show-inheritance:
11 | 
12 | .. autoclass:: surprise.prediction_algorithms.matrix_factorization.NMF
13 |    :show-inheritance:
14 | 
--------------------------------------------------------------------------------
/doc/source/prediction_algorithms_package.rst:
--------------------------------------------------------------------------------
1 | .. _prediction_algorithms_package:
2 | 
3 | prediction_algorithms package
4 | =============================
5 | 
6 | .. automodule:: surprise.prediction_algorithms
7 | 
8 | You may want to check the :ref:`notation standards <notation_standards>`
9 | before diving into the formulas.
10 | 
11 | 
12 | .. toctree::
13 |    :includehidden:
14 | 
15 |    algobase
16 |    predictions_module
17 |    basic_algorithms
18 |    knn_inspired
19 |    matrix_factorization
20 |    slope_one
21 |    co_clustering
22 | 
--------------------------------------------------------------------------------
/doc/source/spelling_wordlist.txt:
--------------------------------------------------------------------------------
1 | MSD
2 | overfitting
3 | trainset
4 | trainsets
5 | testset
6 | namespace
7 | pdf
8 | dataset
9 | datasets
10 | scikit
11 | movielens
12 | timestamp
13 | algo
14 | knn_inspired
15 | slope_one
16 | accuracies
17 | NN
18 | deserialize
19 | 
20 | 
21 | 
22 | 
23 | 
24 | Srujana
25 | Merugu
26 | scalable
27 | Yehuda
28 | Koren
29 | Yehuda
30 | Koren
31 | scalable
32 | Yehuda
33 | Koren
34 | Volinsky
35 | recommender
36 | Seung
37 | Lemire
38 | Maclachlan
39 | Xin
40 | Luo
41 | Mengchu
42 | Zhou
43 | Yunni
44 | Xia
45 | Qinsheng
46 | Zhu
47 | recommender
48 | Francesco
49 | Ricci
50 | Lior
51 | Rokach
52 | Bracha
53 | Shapira
54 | Kantor
55 | Ruslan
56 | Salakhutdinov
57 | Andriy
58 | Mnih
59 | Sheng
60 | Zhang
61 | Weihong
62 | Fillia
63 | Makedon
64 | 
--------------------------------------------------------------------------------
/examples/building_custom_algorithms/most_basic_algorithm.py:
--------------------------------------------------------------------------------
1 | """
2 | This module describes how to build your own prediction algorithm. Please refer
3 | to the User Guide for more insight.
4 | """ 5 | 6 | from __future__ import (absolute_import, division, print_function, 7 | unicode_literals) 8 | 9 | from surprise import AlgoBase 10 | from surprise import Dataset 11 | from surprise import evaluate 12 | 13 | 14 | class MyOwnAlgorithm(AlgoBase): 15 | 16 | def __init__(self): 17 | 18 | # Always call base method before doing anything. 19 | AlgoBase.__init__(self) 20 | 21 | def estimate(self, u, i): 22 | 23 | return 3 24 | 25 | 26 | data = Dataset.load_builtin('ml-100k') 27 | algo = MyOwnAlgorithm() 28 | 29 | evaluate(algo, data) 30 | -------------------------------------------------------------------------------- /examples/basic_usage.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module descibes the most basic usage of surprise: you define a prediction 3 | algorithm, (down)load a dataset and evaluate the performances of the algorithm. 4 | """ 5 | 6 | from __future__ import (absolute_import, division, print_function, 7 | unicode_literals) 8 | 9 | from surprise import SVD 10 | from surprise import Dataset 11 | from surprise import evaluate, print_perf 12 | 13 | 14 | # Load the movielens-100k dataset (download it if needed), 15 | # and split it into 3 folds for cross-validation. 16 | data = Dataset.load_builtin('ml-100k') 17 | data.split(n_folds=3) 18 | 19 | # We'll use the famous SVD algorithm. 20 | algo = SVD() 21 | 22 | # Evaluate performances of our algorithm on the dataset. 23 | perf = evaluate(algo, data, measures=['RMSE', 'MAE']) 24 | 25 | print_perf(perf) 26 | -------------------------------------------------------------------------------- /examples/iterate_over_folds.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module descibes how to manually train and test an algorithm without using 3 | the evaluate() function. 4 | """ 5 | 6 | from __future__ import (absolute_import, division, print_function, 7 | unicode_literals) 8 | 9 | from surprise import BaselineOnly 10 | from surprise import Dataset 11 | from surprise import accuracy 12 | 13 | # Load the movielens-100k dataset and split it into 3 folds for 14 | # cross-validation. 15 | data = Dataset.load_builtin('ml-100k') 16 | data.split(n_folds=3) 17 | 18 | algo = BaselineOnly() 19 | 20 | for trainset, testset in data.folds(): 21 | 22 | # train and test algorithm. 23 | algo.train(trainset) 24 | predictions = algo.test(testset) 25 | 26 | # Compute and print Root Mean Squared Error 27 | rmse = accuracy.rmse(predictions, verbose=True) 28 | -------------------------------------------------------------------------------- /examples/similarity_conf.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module gives an example of how to configure similarity measures 3 | computation. 4 | """ 5 | 6 | from __future__ import (absolute_import, division, print_function, 7 | unicode_literals) 8 | 9 | from surprise import KNNBasic 10 | from surprise import Dataset 11 | from surprise import evaluate 12 | 13 | 14 | # Load the movielens-100k dataset. 
15 | data = Dataset.load_builtin('ml-100k')
16 | 
17 | # Example using cosine similarity
18 | sim_options = {'name': 'cosine',
19 |                'user_based': False  # compute similarities between items
20 |                }
21 | algo = KNNBasic(sim_options=sim_options)
22 | 
23 | evaluate(algo, data)
24 | 
25 | # Example using pearson_baseline similarity
26 | sim_options = {'name': 'pearson_baseline',
27 |                'shrinkage': 0  # no shrinkage
28 |                }
29 | algo = KNNBasic(sim_options=sim_options)
30 | 
31 | evaluate(algo, data)
32 | 
--------------------------------------------------------------------------------
/tests/test_pep8.py:
--------------------------------------------------------------------------------
1 | """
2 | Module for testing if the code is PEP8 compliant.
3 | """
4 | 
5 | from flake8.api import legacy as flake8
6 | 
7 | 
8 | def test_regular_files():
9 | 
10 |     style_guide = flake8.get_style_guide(
11 |         filename=['*.py'],
12 |         exclude=['doc', '.eggs', '*.egg', 'build', 'setup.py'],
13 |         select=['E', 'W', 'F'],
14 |     )
15 | 
16 |     report = style_guide.check_files()
17 | 
18 |     assert report.get_statistics('E') == []
19 |     assert report.get_statistics('W') == []
20 |     assert report.get_statistics('F') == []
21 | 
22 | 
23 | def test_cython_files():
24 | 
25 |     style_guide = flake8.get_style_guide(
26 |         filename=['*.pyx', '*.px'],
27 |         exclude=['doc', '.eggs', '*.egg', 'build', 'setup.py'],
28 |         select=['E', 'W', 'F'],
29 |         ignore=['E225']
30 |     )
31 | 
32 |     report = style_guide.check_files()
33 | 
34 |     assert report.get_statistics('E') == []
35 |     assert report.get_statistics('W') == []
36 |     assert report.get_statistics('F') == []
37 | 
--------------------------------------------------------------------------------
/examples/serialize_algorithm.py:
--------------------------------------------------------------------------------
1 | """
2 | This module illustrates the use of the dump and load methods for serializing an
3 | algorithm. The SVD algorithm is trained on a dataset and then serialized. It is
4 | then reloaded and can be used again for making predictions.
5 | """
6 | 
7 | from __future__ import (absolute_import, division, print_function,
8 |                         unicode_literals)
9 | import os
10 | 
11 | from surprise import SVD
12 | from surprise import Dataset
13 | from surprise import dump
14 | 
15 | 
16 | data = Dataset.load_builtin('ml-100k')
17 | trainset = data.build_full_trainset()
18 | 
19 | algo = SVD()
20 | algo.train(trainset)
21 | 
22 | # Compute predictions of the 'original' algorithm.
23 | predictions = algo.test(trainset.build_testset())
24 | 
25 | # Dump algorithm and reload it.
26 | file_name = os.path.expanduser('~/dump_file')
27 | dump.dump(file_name, algo=algo)
28 | _, loaded_algo = dump.load(file_name)
29 | 
30 | # We now ensure that the algo is still the same by checking the predictions.
31 | predictions_loaded_algo = loaded_algo.test(trainset.build_testset())
32 | assert predictions == predictions_loaded_algo
33 | print('Predictions are the same')
34 | 
--------------------------------------------------------------------------------
/examples/load_custom_dataset.py:
--------------------------------------------------------------------------------
1 | """
2 | This module describes how to load a custom dataset from a single file.
3 | 
4 | As a custom dataset we will actually use the movielens-100k dataset, but act as
5 | if it were not built-in.
6 | """ 7 | 8 | from __future__ import (absolute_import, division, print_function, 9 | unicode_literals) 10 | import os 11 | 12 | from surprise import BaselineOnly 13 | from surprise import Dataset 14 | from surprise import evaluate 15 | from surprise import Reader 16 | 17 | # path to dataset file 18 | file_path = os.path.expanduser('~/.surprise_data/ml-100k/ml-100k/u.data') 19 | 20 | # As we're loading a custom dataset, we need to define a reader. In the 21 | # movielens-100k dataset, each line has the following format: 22 | # 'user item rating timestamp', separated by '\t' characters. 23 | reader = Reader(line_format='user item rating timestamp', sep='\t') 24 | 25 | data = Dataset.load_from_file(file_path, reader=reader) 26 | data.split(n_folds=5) 27 | 28 | # We'll use an algorithm that predicts baseline estimates. 29 | algo = BaselineOnly() 30 | 31 | # Evaluate performances of our algorithm on the dataset. 32 | evaluate(algo, data) 33 | -------------------------------------------------------------------------------- /examples/building_custom_algorithms/mean_rating_user_item.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module descibes how to build your own prediction algorithm. Please refer 3 | to User Guide for more insight. 4 | """ 5 | 6 | from __future__ import (absolute_import, division, print_function, 7 | unicode_literals) 8 | 9 | import numpy as np 10 | 11 | from surprise import AlgoBase 12 | from surprise import Dataset 13 | from surprise import evaluate 14 | 15 | 16 | class MyOwnAlgorithm(AlgoBase): 17 | 18 | def __init__(self): 19 | 20 | # Always call base method before doing anything. 21 | AlgoBase.__init__(self) 22 | 23 | def estimate(self, u, i): 24 | 25 | sum_means = self.trainset.global_mean 26 | div = 1 27 | 28 | if self.trainset.knows_user(u): 29 | sum_means += np.mean([r for (_, r) in self.trainset.ur[u]]) 30 | div += 1 31 | if self.trainset.knows_item(i): 32 | sum_means += np.mean([r for (_, r) in self.trainset.ir[i]]) 33 | div += 1 34 | 35 | return sum_means / div 36 | 37 | 38 | data = Dataset.load_builtin('ml-100k') 39 | algo = MyOwnAlgorithm() 40 | 41 | evaluate(algo, data) 42 | -------------------------------------------------------------------------------- /examples/building_custom_algorithms/most_basic_algorithm2.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module descibes how to build your own prediction algorithm. Please refer 3 | to User Guide for more insight. 4 | """ 5 | 6 | from __future__ import (absolute_import, division, print_function, 7 | unicode_literals) 8 | 9 | import numpy as np 10 | 11 | from surprise import AlgoBase 12 | from surprise import Dataset 13 | from surprise import evaluate 14 | 15 | 16 | class MyOwnAlgorithm(AlgoBase): 17 | 18 | def __init__(self): 19 | 20 | # Always call base method before doing anything. 21 | AlgoBase.__init__(self) 22 | 23 | def train(self, trainset): 24 | 25 | # Here again: call base method before doing anything. 26 | AlgoBase.train(self, trainset) 27 | 28 | # Compute the average rating. 
30 |         self.the_mean = np.mean([r for (_, _, r) in
31 |                                  self.trainset.all_ratings()])
32 | 
33 |     def estimate(self, u, i):
34 | 
35 |         return self.the_mean
36 | 
37 | 
38 | data = Dataset.load_builtin('ml-100k')
39 | algo = MyOwnAlgorithm()
40 | 
41 | evaluate(algo, data)
42 | 
--------------------------------------------------------------------------------
/examples/evaluate_on_trainset.py:
--------------------------------------------------------------------------------
1 | """
2 | This module describes how to test the performances of an algorithm on the
3 | trainset.
4 | """
5 | 
6 | from __future__ import (absolute_import, division, print_function,
7 |                         unicode_literals)
8 | 
9 | from surprise import Dataset
10 | from surprise import SVD
11 | from surprise import accuracy
12 | 
13 | 
14 | data = Dataset.load_builtin('ml-100k')
15 | 
16 | algo = SVD()
17 | 
18 | trainset = data.build_full_trainset()
19 | algo.train(trainset)
20 | 
21 | testset = trainset.build_testset()
22 | predictions = algo.test(testset)
23 | # RMSE should be low as we are biased
24 | accuracy.rmse(predictions, verbose=True)  # ~ 0.68 (which is low)
25 | 
26 | # We can also do this during a cross-validation procedure!
27 | print('CV procedure:')
28 | 
29 | data.split(3)
30 | for i, (trainset_cv, testset_cv) in enumerate(data.folds()):
31 |     print('fold number', i + 1)
32 |     algo.train(trainset_cv)
33 | 
34 |     print('On testset,', end=' ')
35 |     predictions = algo.test(testset_cv)
36 |     accuracy.rmse(predictions, verbose=True)
37 | 
38 |     print('On trainset,', end=' ')
39 |     predictions = algo.test(trainset_cv.build_testset())
40 |     accuracy.rmse(predictions, verbose=True)
41 | 
--------------------------------------------------------------------------------
/examples/query_for_predictions.py:
--------------------------------------------------------------------------------
1 | """
2 | This module describes how to train on a full dataset (when no testset is
3 | built/specified) and how to query for specific predictions.
4 | """
5 | 
6 | from __future__ import (absolute_import, division, print_function,
7 |                         unicode_literals)
8 | 
9 | from surprise import KNNBasic
10 | from surprise import Dataset
11 | from surprise import evaluate
12 | 
13 | # Load the movielens-100k dataset and split it into 3 folds for
14 | # cross-validation.
15 | data = Dataset.load_builtin('ml-100k')
16 | 
17 | # Retrieve the trainset.
18 | trainset = data.build_full_trainset()
19 | 
20 | # Build an algorithm, and train it.
21 | algo = KNNBasic()
22 | algo.train(trainset)
23 | 
24 | 
25 | ##########################################
26 | # we can now query for specific predictions
27 | 
28 | uid = str(196)  # raw user id (as in the ratings file). They are **strings**!
29 | iid = str(302)  # raw item id (as in the ratings file). They are **strings**!
30 | 
31 | # get a prediction for specific users and items.
32 | pred = algo.predict(uid, iid, r_ui=4, verbose=True)
33 | 
34 | 
35 | ##########################################
36 | # Tired? You can still call the 'split' method!
37 | data.split(n_folds=3)
38 | evaluate(algo, data)
39 | 
--------------------------------------------------------------------------------
/tests/test_evaluate.py:
--------------------------------------------------------------------------------
1 | """
2 | Module for testing the evaluate function.
3 | """ 4 | 5 | from __future__ import (absolute_import, division, print_function, 6 | unicode_literals) 7 | import os 8 | import tempfile 9 | import shutil 10 | 11 | from surprise import NormalPredictor 12 | from surprise import Dataset 13 | from surprise import Reader 14 | from surprise import evaluate 15 | 16 | 17 | def test_performances(): 18 | """Test the returned dict. Also do dumping.""" 19 | 20 | current_dir = os.path.dirname(os.path.realpath(__file__)) 21 | folds_files = [(current_dir + '/custom_train', 22 | current_dir + '/custom_test')] 23 | 24 | reader = Reader(line_format='user item rating', sep=' ', skip_lines=3, 25 | rating_scale=(1, 5)) 26 | data = Dataset.load_from_folds(folds_files=folds_files, reader=reader) 27 | 28 | algo = NormalPredictor() 29 | tmp_dir = tempfile.mkdtemp() # create tmp dir 30 | performances = evaluate(algo, data, measures=['RmSe', 'Mae'], 31 | with_dump=True, dump_dir=tmp_dir, verbose=2) 32 | shutil.rmtree(tmp_dir) # remove tmp dir 33 | 34 | assert performances['RMSE'] is performances['rmse'] 35 | assert performances['MaE'] is performances['mae'] 36 | -------------------------------------------------------------------------------- /examples/load_custom_dataset_predefined_folds.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module descibes how to load a custom dataset when folds (for 3 | cross-validation) are predefined by train and test files. 4 | 5 | As a custom dataset we will actually use the movielens-100k dataset, but act as 6 | if it were not built-in. 7 | """ 8 | 9 | from __future__ import (absolute_import, division, print_function, 10 | unicode_literals) 11 | import os 12 | 13 | from surprise import BaselineOnly 14 | from surprise import Dataset 15 | from surprise import evaluate 16 | from surprise import Reader 17 | 18 | # path to dataset folder 19 | files_dir = os.path.expanduser('~/.surprise_data/ml-100k/ml-100k/') 20 | 21 | # This time, we'll use the built-in reader. 22 | reader = Reader('ml-100k') 23 | 24 | # folds_files is a list of tuples containing file paths: 25 | # [(u1.base, u1.test), (u2.base, u2.test), ... (u5.base, u5.test)] 26 | train_file = files_dir + 'u%d.base' 27 | test_file = files_dir + 'u%d.test' 28 | folds_files = [(train_file % i, test_file % i) for i in (1, 2, 3, 4, 5)] 29 | 30 | data = Dataset.load_from_folds(folds_files, reader=reader) 31 | 32 | # We'll use an algorithm that predicts baseline estimates. 33 | algo = BaselineOnly() 34 | 35 | # Evaluate performances of our algorithm on the dataset. 36 | evaluate(algo, data) 37 | -------------------------------------------------------------------------------- /surprise/prediction_algorithms/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | The :mod:`prediction_algorithms` package includes the prediction algorithms 3 | available for recommendation. 4 | 5 | The available prediction algorithms are: 6 | 7 | .. 
8 |     :nosignatures:
9 | 
10 |     random_pred.NormalPredictor
11 |     baseline_only.BaselineOnly
12 |     knns.KNNBasic
13 |     knns.KNNWithMeans
14 |     knns.KNNBaseline
15 |     matrix_factorization.SVD
16 |     matrix_factorization.SVDpp
17 |     matrix_factorization.NMF
18 |     slope_one.SlopeOne
19 |     co_clustering.CoClustering
20 | """
21 | 
22 | from .algo_base import AlgoBase
23 | from .random_pred import NormalPredictor
24 | from .baseline_only import BaselineOnly
25 | from .knns import KNNBasic
26 | from .knns import KNNBaseline
27 | from .knns import KNNWithMeans
28 | from .matrix_factorization import SVD
29 | from .matrix_factorization import SVDpp
30 | from .matrix_factorization import NMF
31 | from .slope_one import SlopeOne
32 | from .co_clustering import CoClustering
33 | 
34 | from .predictions import PredictionImpossible
35 | from .predictions import Prediction
36 | 
37 | __all__ = ['AlgoBase', 'NormalPredictor', 'BaselineOnly', 'KNNBasic',
38 |            'KNNBaseline', 'KNNWithMeans', 'SVD', 'SVDpp', 'NMF', 'SlopeOne',
39 |            'CoClustering', 'PredictionImpossible', 'Prediction']
40 | 
--------------------------------------------------------------------------------
/examples/grid_search_usage.py:
--------------------------------------------------------------------------------
1 | """
2 | This module describes how to tune an algorithm's parameters by evaluating a
3 | grid of parameter combinations with the GridSearch class.
4 | """
5 | 
6 | from __future__ import (absolute_import, division, print_function,
7 |                         unicode_literals)
8 | 
9 | from surprise import GridSearch
10 | from surprise import SVD
11 | from surprise import Dataset
12 | 
13 | param_grid = {'n_epochs': [5, 10], 'lr_all': [0.002, 0.005],
14 |               'reg_all': [0.4, 0.6]}
15 | 
16 | grid_search = GridSearch(SVD, param_grid, measures=['RMSE', 'FCP'])
17 | 
18 | # Prepare Data
19 | data = Dataset.load_builtin('ml-100k')
20 | data.split(n_folds=3)
21 | 
22 | grid_search.evaluate(data)
23 | 
24 | # best RMSE score
25 | print(grid_search.best_score['RMSE'])
26 | # >>> 0.96117566386
27 | 
28 | # combination of parameters that gave the best RMSE score
29 | print(grid_search.best_params['RMSE'])
30 | # >>> {'reg_all': 0.4, 'lr_all': 0.005, 'n_epochs': 10}
31 | 
32 | # best FCP score
33 | print(grid_search.best_score['FCP'])
34 | # >>> 0.702279736531
35 | 
36 | # combination of parameters that gave the best FCP score
37 | print(grid_search.best_params['FCP'])
38 | # >>> {'reg_all': 0.6, 'lr_all': 0.005, 'n_epochs': 10}
39 | 
40 | import pandas as pd  # noqa
41 | 
42 | results_df = pd.DataFrame.from_dict(grid_search.cv_results)
43 | print(results_df)
44 | 
--------------------------------------------------------------------------------
/surprise/__init__.py:
--------------------------------------------------------------------------------
1 | from pkg_resources import get_distribution
2 | 
3 | from .prediction_algorithms import AlgoBase
4 | from .prediction_algorithms import NormalPredictor
5 | from .prediction_algorithms import BaselineOnly
6 | from .prediction_algorithms import KNNBasic
7 | from .prediction_algorithms import KNNWithMeans
8 | from .prediction_algorithms import KNNBaseline
9 | from .prediction_algorithms import SVD
10 | from .prediction_algorithms import SVDpp
11 | from .prediction_algorithms import NMF
12 | from .prediction_algorithms import SlopeOne
13 | from .prediction_algorithms import CoClustering
14 | 
15 | from .prediction_algorithms import PredictionImpossible
16 | from .prediction_algorithms import Prediction
17 | 
18 | from .dataset import Dataset
19 | from .dataset import Reader
20 | from .dataset import Trainset 21 | from .evaluate import evaluate 22 | from .evaluate import print_perf 23 | from .evaluate import GridSearch 24 | from . import dump 25 | 26 | __all__ = ['AlgoBase', 'NormalPredictor', 'BaselineOnly', 'KNNBasic', 27 | 'KNNWithMeans', 'KNNBaseline', 'SVD', 'SVDpp', 'NMF', 'SlopeOne', 28 | 'CoClustering', 'PredictionImpossible', 'Prediction', 'Dataset', 29 | 'Reader', 'Trainset', 'evaluate', 'print_perf', 'GridSearch', 30 | 'dump'] 31 | 32 | __version__ = get_distribution('scikit-surprise').version 33 | -------------------------------------------------------------------------------- /examples/baselines_conf.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module gives an example of how to configure baseline estimates 3 | computation. 4 | """ 5 | 6 | from __future__ import (absolute_import, division, print_function, 7 | unicode_literals) 8 | 9 | from surprise import BaselineOnly 10 | from surprise import KNNBasic 11 | from surprise import Dataset 12 | from surprise import evaluate 13 | 14 | 15 | # Load the movielens-100k dataset. 16 | data = Dataset.load_builtin('ml-100k') 17 | 18 | # Example using ALS 19 | print('Using ALS') 20 | bsl_options = {'method': 'als', 21 | 'n_epochs': 5, 22 | 'reg_u': 12, 23 | 'reg_i': 5 24 | } 25 | algo = BaselineOnly(bsl_options=bsl_options) 26 | 27 | evaluate(algo, data) 28 | 29 | # Example using SGD 30 | print('Using SGD') 31 | bsl_options = {'method': 'sgd', 32 | 'learning_rate': .00005, 33 | } 34 | algo = BaselineOnly(bsl_options=bsl_options) 35 | 36 | evaluate(algo, data) 37 | 38 | # Some similarity measures may use baselines. It works just the same. 39 | print('Using ALS with pearson_baseline similarity') 40 | bsl_options = {'method': 'als', 41 | 'n_epochs': 20, 42 | } 43 | sim_options = {'name': 'pearson_baseline'} 44 | algo = KNNBasic(bsl_options=bsl_options, sim_options=sim_options) 45 | 46 | evaluate(algo, data) 47 | -------------------------------------------------------------------------------- /surprise/prediction_algorithms/baseline_only.py: -------------------------------------------------------------------------------- 1 | """ 2 | """ 3 | 4 | from __future__ import (absolute_import, division, print_function, 5 | unicode_literals) 6 | 7 | from .algo_base import AlgoBase 8 | 9 | 10 | class BaselineOnly(AlgoBase): 11 | """Algorithm predicting the baseline estimate for given user and item. 12 | 13 | :math:`\hat{r}_{ui} = b_{ui} = \mu + b_u + b_i` 14 | 15 | If user :math:`u` is unknown, then the bias :math:`b_u` is assumed to be 16 | zero. The same applies for item :math:`i` with :math:`b_i`. 17 | 18 | See section 2.1 of :cite:`Koren:2010` for details. 19 | 20 | Args: 21 | bsl_options(dict): A dictionary of options for the baseline estimates 22 | computation. See :ref:`baseline_estimates_configuration` for 23 | accepted options. 
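    For illustration only (an editor's sketch, not part of the original
    docstring; the values are borrowed from examples/baselines_conf.py), a
    typical configuration could look like::

        bsl_options = {'method': 'als',
                       'n_epochs': 5,
                       'reg_u': 12,
                       'reg_i': 5
                       }
        algo = BaselineOnly(bsl_options=bsl_options)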
24 | 25 | """ 26 | 27 | def __init__(self, bsl_options={}): 28 | 29 | AlgoBase.__init__(self, bsl_options=bsl_options) 30 | 31 | def train(self, trainset): 32 | 33 | AlgoBase.train(self, trainset) 34 | self.bu, self.bi = self.compute_baselines() 35 | 36 | def estimate(self, u, i): 37 | 38 | est = self.trainset.global_mean 39 | if self.trainset.knows_user(u): 40 | est += self.bu[u] 41 | if self.trainset.knows_item(i): 42 | est += self.bi[i] 43 | 44 | return est 45 | -------------------------------------------------------------------------------- /doc/source/knn_inspired.rst: -------------------------------------------------------------------------------- 1 | .. _pred_package_knn_inpired: 2 | 3 | k-NN inspired algorithms 4 | ------------------------ 5 | 6 | These are algorithms that are directly derived from a basic nearest neighbors 7 | approach. 8 | 9 | .. _actual_k_note: 10 | 11 | .. note:: 12 | 13 | For each of these algorithms, the actual number of neighbors that are 14 | aggregated to compute an estimation is necessarily less than or equal to 15 | :math:`k`. First, there might just not exist enough neighbors and second, the 16 | sets :math:`N_i^k(u)` and :math:`N_u^k(i)` only include neighbors for which 17 | the similarity measure is **positive**. It would make no sense to aggregate 18 | ratings from users (or items) that are negatively correlated. For a given 19 | prediction, the actual number of neighbors can be retrieved in the 20 | ``'actual_k'`` field of the ``details`` dictionary of the :class:`prediction 21 | `. 22 | 23 | You may want to read the :ref:`User Guide ` 24 | on how to configure the ``sim_options`` parameter. 25 | 26 | .. autoclass:: surprise.prediction_algorithms.knns.KNNBasic 27 | :show-inheritance: 28 | 29 | .. autoclass:: surprise.prediction_algorithms.knns.KNNWithMeans 30 | :show-inheritance: 31 | 32 | .. autoclass:: surprise.prediction_algorithms.knns.KNNBaseline 33 | :show-inheritance: 34 | -------------------------------------------------------------------------------- /surprise/prediction_algorithms/random_pred.py: -------------------------------------------------------------------------------- 1 | """ Algorithm predicting a random rating. 2 | """ 3 | 4 | from __future__ import (absolute_import, division, print_function, 5 | unicode_literals) 6 | 7 | import numpy as np 8 | 9 | from .algo_base import AlgoBase 10 | 11 | 12 | class NormalPredictor(AlgoBase): 13 | """Algorithm predicting a random rating based on the distribution of the 14 | training set, which is assumed to be normal. 15 | 16 | The prediction :math:`\hat{r}_{ui}` is generated from a normal distribution 17 | :math:`\mathcal{N}(\hat{\mu}, \hat{\sigma}^2)` where :math:`\hat{\mu}` and 18 | :math:`\hat{\sigma}` are estimated from the training data using Maximum 19 | Likelihood Estimation: 20 | 21 | .. 
math:: 22 | \\hat{\mu} &= \\frac{1}{|R_{train}|} \\sum_{r_{ui} \\in R_{train}} 23 | r_{ui}\\\\\\\\\ 24 | \\hat{\sigma} &= \\sqrt{\\sum_{r_{ui} \\in R_{train}} 25 | \\frac{(r_{ui} - \\hat{\mu})^2}{|R_{train}|}} 26 | """ 27 | 28 | def __init__(self): 29 | 30 | AlgoBase.__init__(self) 31 | 32 | def train(self, trainset): 33 | 34 | AlgoBase.train(self, trainset) 35 | 36 | num = sum((r - self.trainset.global_mean)**2 37 | for (_, _, r) in self.trainset.all_ratings()) 38 | denum = self.trainset.n_ratings 39 | self.sigma = np.sqrt(num / denum) 40 | 41 | def estimate(self, *_): 42 | 43 | return np.random.normal(self.trainset.global_mean, self.sigma) 44 | -------------------------------------------------------------------------------- /tests/test_co_clustering.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module for testing the CoClustering algorithm. 3 | """ 4 | 5 | from __future__ import (absolute_import, division, print_function, 6 | unicode_literals) 7 | import os 8 | 9 | from surprise import CoClustering 10 | from surprise import Dataset 11 | from surprise import Reader 12 | from surprise import evaluate 13 | 14 | # the test and train files are from the ml-100k dataset (10% of u1.base and 15 | # 10 % of u1.test) 16 | train_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_train') 17 | test_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_test') 18 | data = Dataset.load_from_folds([(train_file, test_file)], Reader('ml-100k')) 19 | 20 | 21 | def test_CoClustering_parameters(): 22 | """Ensure that all parameters are taken into account.""" 23 | 24 | # The baseline against which to compare. 25 | algo = CoClustering(n_epochs=1) 26 | rmse_default = evaluate(algo, data, measures=['rmse'])['rmse'] 27 | 28 | # n_cltr_u 29 | algo = CoClustering(n_cltr_u=1, n_epochs=1) 30 | rmse_n_cltr_u = evaluate(algo, data, measures=['rmse'])['rmse'] 31 | assert rmse_default != rmse_n_cltr_u 32 | 33 | # n_cltr_i 34 | algo = CoClustering(n_cltr_i=1, n_epochs=1) 35 | rmse_n_cltr_i = evaluate(algo, data, measures=['rmse'])['rmse'] 36 | assert rmse_default != rmse_n_cltr_i 37 | 38 | # n_epochs 39 | algo = CoClustering(n_epochs=2) 40 | rmse_n_epochs = evaluate(algo, data, measures=['rmse'])['rmse'] 41 | assert rmse_default != rmse_n_epochs 42 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | Copyright (c) 2016, Nicolas Hug 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | 1. Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | 2. Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | 3. Neither the name of the copyright holder nor the names of its contributors 15 | may be used to endorse or promote products derived from this software 16 | without specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 19 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 20 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
22 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
24 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 | 
--------------------------------------------------------------------------------
/doc/source/index.rst:
--------------------------------------------------------------------------------
1 | .. Surprise documentation master file, created by
2 |    sphinx-quickstart on Tue Dec 29 20:08:18 2015.
3 |    You can adapt this file completely to your liking, but it should at least
4 |    contain the root `toctree` directive.
5 | 
6 | .. _index:
7 | 
8 | Welcome to Surprise's documentation!
9 | ====================================
10 | 
11 | `Surprise <http://surpriselib.com>`_ is an easy-to-use Python scikit
12 | for recommender systems.
13 | 
14 | If you're new to `Surprise <http://surpriselib.com>`_, we invite you to take a
15 | look at the :ref:`getting_started` guide, where you'll find a series of
16 | tutorials illustrating all you can do with `Surprise
17 | <http://surpriselib.com>`_. You can also check out the :ref:`FAQ` for many
18 | use-case examples. For installation guidelines, please refer to the `project
19 | page <http://surpriselib.com>`_.
20 | 
21 | Any kind of feedback/criticism would be greatly appreciated (software design,
22 | documentation, improvement ideas, spelling mistakes, etc...). Please feel free
23 | to contribute and send pull requests (see `GitHub page
24 | <https://github.com/NicolasHug/Surprise>`_)!
25 | 
26 | 
27 | .. toctree::
28 |    :caption: User Guide
29 |    :hidden:
30 | 
31 |    getting_started
32 |    prediction_algorithms
33 |    building_custom_algo
34 |    notation_standards
35 |    FAQ
36 | 
37 | 
38 | .. toctree::
39 |    :maxdepth: 2
40 |    :caption: API Reference
41 |    :hidden:
42 | 
43 |    prediction_algorithms_package
44 |    similarities
45 |    accuracy
46 |    dataset
47 |    evaluate
48 |    dump
49 | 
--------------------------------------------------------------------------------
/surprise/prediction_algorithms/predictions.py:
--------------------------------------------------------------------------------
1 | """
2 | The :mod:`surprise.prediction_algorithms.predictions` module defines the
3 | :class:`Prediction` named tuple and the :class:`PredictionImpossible`
4 | exception.
5 | """
6 | 
7 | from __future__ import (absolute_import, division, print_function,
8 |                         unicode_literals)
9 | 
10 | from collections import namedtuple
11 | 
12 | 
13 | class PredictionImpossible(Exception):
14 |     """Exception raised when a prediction is impossible.
15 | 
16 |     When raised, the estimation :math:`\hat{r}_{ui}` is set to the global mean
17 |     of all ratings :math:`\mu`.
18 |     """
19 | 
20 |     pass
21 | 
22 | 
23 | class Prediction(namedtuple('Prediction',
24 |                             ['uid', 'iid', 'r_ui', 'est', 'details'])):
25 |     """A named tuple for storing the results of a prediction.
26 | 
27 |     It's wrapped in a class, but only for documentation and printing purposes.
28 | 
29 |     Args:
30 |         uid: The (raw) user id. See :ref:`this note`.
31 |         iid: The (raw) item id. See :ref:`this note`.
32 |         r_ui(float): The true rating :math:`r_{ui}`.
33 |         est(float): The estimated rating :math:`\\hat{r}_{ui}`.
34 |         details (dict): Stores additional details about the prediction that
35 |             might be useful for later analysis.
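    For instance (an editor's illustration, not part of the original
    docstring), the k-NN algorithms store the number of neighbors that were
    actually aggregated under the ``'actual_k'`` key::

        pred = algo.predict(uid, iid)
        print(pred.details.get('actual_k'))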
36 | """ 37 | 38 | __slots__ = () # for memory saving purpose. 39 | 40 | def __str__(self): 41 | s = 'user: {uid:<10} '.format(uid=self.uid) 42 | s += 'item: {iid:<10} '.format(iid=self.iid) 43 | s += 'r_ui = {r_ui:1.2f} '.format(r_ui=self.r_ui) 44 | s += 'est = {est:1.2f} '.format(est=self.est) 45 | s += str(self.details) 46 | 47 | return s 48 | -------------------------------------------------------------------------------- /examples/split_data_for_unbiased_estimation.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module descibes how to split a dataset into two parts A and B: A is for 3 | tuning the algorithm parameters, and B is for having an unbiased estimation of 4 | its performances. The tuning is done by Grid Search. 5 | """ 6 | 7 | from __future__ import (absolute_import, division, print_function, 8 | unicode_literals) 9 | 10 | import random 11 | 12 | from surprise import SVD 13 | from surprise import Dataset 14 | from surprise import accuracy 15 | from surprise import GridSearch 16 | 17 | 18 | # Load the full dataset. 19 | data = Dataset.load_builtin('ml-100k') 20 | raw_ratings = data.raw_ratings 21 | 22 | # shuffle ratings if you want 23 | random.shuffle(raw_ratings) 24 | 25 | # A = 90% of the data, B = 10% of the data 26 | threshold = int(.9 * len(raw_ratings)) 27 | A_raw_ratings = raw_ratings[:threshold] 28 | B_raw_ratings = raw_ratings[threshold:] 29 | 30 | data.raw_ratings = A_raw_ratings # data is now the set A 31 | data.split(n_folds=3) 32 | 33 | # Select your best algo with grid search. 34 | print('Grid Search...') 35 | param_grid = {'n_epochs': [5, 10], 'lr_all': [0.002, 0.005]} 36 | grid_search = GridSearch(SVD, param_grid, measures=['RMSE'], verbose=0) 37 | grid_search.evaluate(data) 38 | 39 | algo = grid_search.best_estimator['RMSE'] 40 | 41 | # retrain on the whole set A 42 | trainset = data.build_full_trainset() 43 | algo.train(trainset) 44 | 45 | # Compute biased accuracy on A 46 | predictions = algo.test(trainset.build_testset()) 47 | print('Biased accuracy on A,', end=' ') 48 | accuracy.rmse(predictions) 49 | 50 | # Compute unbiased accuracy on B 51 | testset = data.construct_testset(B_raw_ratings) # testset is now the set B 52 | predictions = algo.test(testset) 53 | print('Unbiased accuracy on B,', end=' ') 54 | accuracy.rmse(predictions) 55 | -------------------------------------------------------------------------------- /tests/test_dump.py: -------------------------------------------------------------------------------- 1 | """Module for testing the dump module.""" 2 | 3 | 4 | from __future__ import (absolute_import, division, print_function, 5 | unicode_literals) 6 | import tempfile 7 | import random 8 | import os 9 | 10 | from surprise import BaselineOnly 11 | from surprise import Dataset 12 | from surprise import Reader 13 | from surprise import dump 14 | 15 | 16 | def test_dump(): 17 | """Train an algorithm, compute its predictions then dump them. 
18 | Ensure that the predictions that are loaded back are the correct ones, and 19 | that the predictions of the dumped algorithm are also equal to the other 20 | ones.""" 21 | 22 | random.seed(0) 23 | 24 | train_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_train') 25 | test_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_test') 26 | data = Dataset.load_from_folds([(train_file, test_file)], 27 | Reader('ml-100k')) 28 | 29 | for trainset, testset in data.folds(): 30 | pass 31 | 32 | algo = BaselineOnly() 33 | algo.train(trainset) 34 | predictions = algo.test(testset) 35 | 36 | with tempfile.NamedTemporaryFile() as tmp_file: 37 | dump.dump(tmp_file.name, predictions, algo) 38 | predictions_dumped, algo_dumped = dump.load(tmp_file.name) 39 | 40 | predictions_algo_dumped = algo_dumped.test(testset) 41 | assert predictions == predictions_dumped 42 | assert predictions == predictions_algo_dumped 43 | 44 | 45 | def test_dump_nothing(): 46 | """Ensure that by default None objects are dumped.""" 47 | with tempfile.NamedTemporaryFile() as tmp_file: 48 | dump.dump(tmp_file.name) 49 | predictions, algo = dump.load(tmp_file.name) 50 | assert predictions is None 51 | assert algo is None 52 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | Contributing to Surprise 2 | ======================== 3 | 4 | Pull requests are always welcome! Before submitting a new pull request, please 5 | make sure that: 6 | 7 | * Your code is [clean](https://www.youtube.com/watch?v=wf-BqAjZb8M), 8 | [pythonic](https://www.youtube.com/watch?v=OSGv2VnC0go), well commented and 9 | if it's a new feature/algorithm, that it's also well documented. 10 | * Your code passes the tests (we use 11 | [pytest](http://doc.pytest.org/en/latest/)), plus the ones that you wrote ;) 12 | * Your code is [PEP 8](https://www.python.org/dev/peps/pep-0008/) compliant. 13 | The bare minimum is that 14 | [flake8](http://flake8.pycqa.org/en/latest/index.html) does not report any 15 | warning (see below). 16 | 17 | 18 | All the tools needed for the development of Surprise (sphinx, flake8, 19 | etc...) can be installed by running 20 | 21 | pip install -r requirements_dev.txt 22 | 23 | Then, you can install your local copy of the repo by running 24 | 25 | pip install -e . 26 | 27 | Running tests 28 | ------------- 29 | 30 | We use [pytest](http://doc.pytest.org/en/latest/) so simply running 31 | 32 | pytest 33 | 34 | in the root directory should do the job. 35 | 36 | Check coding style 37 | ------------------ 38 | 39 | You can check that your code is PEP8 compliant by running 40 | 41 | pytest tests/test_pep8.py 42 | 43 | Building the docs locally 44 | ------------------------- 45 | 46 | The docs can be compiled with 47 | 48 | cd doc 49 | make html 50 | 51 | You can check the results in `doc/build/html`. Please make sure that the docs 52 | compile without errors. Run `make clean` from time to time in order to avoid 53 | hidden warnings. You can check spelling mistakes by running 54 | 55 | make spelling 56 | 57 | Legit words that are not recognized can be added in the 58 | `source/spelling_wordlist.txt` file. 
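Putting it all together (an editor's suggestion that only combines the
commands documented above), a reasonable pre-submission check is:

    pytest
    pytest tests/test_pep8.py
    cd doc && make html && make spelling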
59 | 
--------------------------------------------------------------------------------
/examples/building_custom_algorithms/with_baselines_or_sim.py:
--------------------------------------------------------------------------------
1 | """
2 | This module describes how to build your own prediction algorithm. Please refer
3 | to the User Guide for more insight.
4 | """
5 | 
6 | from __future__ import (absolute_import, division, print_function,
7 |                         unicode_literals)
8 | 
9 | from surprise import AlgoBase
10 | from surprise import Dataset
11 | from surprise import evaluate
12 | from surprise import PredictionImpossible
13 | 
14 | 
15 | class MyOwnAlgorithm(AlgoBase):
16 | 
17 |     def __init__(self, sim_options={}, bsl_options={}):
18 | 
19 |         AlgoBase.__init__(self, sim_options=sim_options,
20 |                           bsl_options=bsl_options)
21 | 
22 |     def train(self, trainset):
23 | 
24 |         AlgoBase.train(self, trainset)
25 | 
26 |         # Compute baselines and similarities
27 |         self.bu, self.bi = self.compute_baselines()
28 |         self.sim = self.compute_similarities()
29 | 
30 |     def estimate(self, u, i):
31 | 
32 |         if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):
33 |             raise PredictionImpossible('User and/or item is unknown.')
34 | 
35 |         # Compute similarities between u and v, where v describes all other
36 |         # users that have also rated item i.
37 |         neighbors = [(v, self.sim[u, v]) for (v, r) in self.trainset.ir[i]]
38 |         # Sort these neighbors by similarity
39 |         neighbors = sorted(neighbors, key=lambda x: x[1], reverse=True)
40 | 
41 |         print('The 3 nearest neighbors of user', str(u), 'are:')
42 |         for v, sim_uv in neighbors[:3]:
43 |             print('user {0:} with sim {1:1.2f}'.format(v, sim_uv))
44 | 
45 |         # ... Aaaaand return the baseline estimate anyway ;)
46 |         bsl = self.trainset.global_mean + self.bu[u] + self.bi[i]
47 |         return bsl
48 | 
49 | 
50 | data = Dataset.load_builtin('ml-100k')
51 | algo = MyOwnAlgorithm()
52 | 
53 | evaluate(algo, data)
54 | 
--------------------------------------------------------------------------------
/doc/source/notation_standards.rst:
--------------------------------------------------------------------------------
1 | .. _notation_standards:
2 | 
3 | Notation standards, References
4 | ==============================
5 | 
6 | In the documentation, you will find the following notation:
7 | 
8 | * :math:`R` : the set of all ratings.
9 | * :math:`R_{train}`, :math:`R_{test}` and :math:`\hat{R}` denote the training
10 |   set, the test set, and the set of predicted ratings.
11 | * :math:`U` : the set of all users. :math:`u` and :math:`v` denote users.
12 | * :math:`I` : the set of all items. :math:`i` and :math:`j` denote items.
13 | * :math:`U_i` : the set of all users that have rated item :math:`i`.
14 | * :math:`U_{ij}` : the set of all users that have rated both items :math:`i`
15 |   and :math:`j`.
16 | * :math:`I_u` : the set of all items rated by user :math:`u`.
17 | * :math:`I_{uv}` : the set of all items rated by both users :math:`u`
18 |   and :math:`v`.
19 | * :math:`r_{ui}` : the *true* rating of user :math:`u` for item
20 |   :math:`i`.
21 | * :math:`\hat{r}_{ui}` : the *estimated* rating of user :math:`u` for item
22 |   :math:`i`.
23 | * :math:`b_{ui}` : the baseline rating of user :math:`u` for item :math:`i`.
24 | * :math:`\mu` : the mean of all ratings.
25 | * :math:`\mu_u` : the mean of all ratings given by user :math:`u`.
26 | * :math:`\mu_i` : the mean of all ratings given to item :math:`i`.
27 | * :math:`N_i^k(u)` : the :math:`k` nearest neighbors of user :math:`u` that
28 |   have rated item :math:`i`. This set is computed using a :mod:`similarity
29 |   metric <surprise.similarities>`.
30 | * :math:`N_u^k(i)` : the :math:`k` nearest neighbors of item :math:`i` that
31 |   are rated by user :math:`u`. This set is computed using a :py:mod:`similarity
32 |   metric <surprise.similarities>`.
33 | 
34 | .. rubric:: References
35 | 
36 | Here are the papers used as references in the documentation. Links to pdf files
37 | were added when possible. A simple Google search should lead you easily to the
38 | missing ones :)
39 | 
40 | .. bibliography:: refs.bib
41 |    :all:
42 | 
--------------------------------------------------------------------------------
/surprise/dump.py:
--------------------------------------------------------------------------------
1 | """
2 | The :mod:`dump` module defines the :func:`dump` function.
3 | """
4 | 
5 | import pickle
6 | 
7 | 
8 | def dump(file_name, predictions=None, algo=None, verbose=0):
9 |     """A basic wrapper around Pickle to serialize a list of prediction and/or
10 |     an algorithm on drive.
11 | 
12 |     What is dumped is a dictionary with keys ``'predictions'`` and ``'algo'``.
13 | 
14 |     Args:
15 |         file_name(str): The name (with full path) specifying where to dump the
16 |             predictions.
17 |         predictions(list of :obj:`Prediction\
18 |             <surprise.prediction_algorithms.predictions.Prediction>`): The
19 |             predictions to dump.
20 |         algo(:class:`Algorithm\
21 |             <surprise.prediction_algorithms.algo_base.AlgoBase>`, optional):
22 |             The algorithm to dump.
23 |         verbose(int): Level of verbosity. If ``1``, then a message indicates
24 |             that the dumping went successfully. Default is ``0``.
25 |     """
26 | 
27 |     dump_obj = {'predictions': predictions,
28 |                 'algo': algo
29 |                 }
30 |     pickle.dump(dump_obj, open(file_name, 'wb'))
31 | 
32 |     if verbose:
33 |         print('The dump has been saved as file', file_name)
34 | 
35 | 
36 | def load(file_name):
37 |     """A basic wrapper around Pickle to deserialize a list of prediction and/or
38 |     an algorithm that were dumped on drive using :func:`dump()
39 |     <surprise.dump.dump>`.
40 | 
41 |     Args:
42 |         file_name(str): The path of the file from which the algorithm is
43 |             to be loaded
44 | 
45 |     Returns:
46 |         A tuple ``(predictions, algo)`` where ``predictions`` is a list of
47 |         :class:`Prediction
48 |         <surprise.prediction_algorithms.predictions.Prediction>` objects and
49 |         ``algo`` is an :class:`Algorithm
50 |         <surprise.prediction_algorithms.algo_base.AlgoBase>` object. Depending
51 |         on what was dumped, some of these may be ``None``.
52 | 
53 |     """
54 | 
55 |     dump_obj = pickle.load(open(file_name, 'rb'))
56 | 
57 |     return dump_obj['predictions'], dump_obj['algo']
58 | 
--------------------------------------------------------------------------------
/tests/test_grid_search.py:
--------------------------------------------------------------------------------
1 | """
2 | Module for testing the GridSearch class.
3 | """ 4 | 5 | from __future__ import (absolute_import, division, print_function, 6 | unicode_literals) 7 | 8 | import os 9 | import random 10 | 11 | from surprise import Dataset 12 | from surprise import Reader 13 | from surprise import SVD 14 | from surprise import evaluate 15 | from surprise import GridSearch 16 | 17 | # the test and train files are from the ml-100k dataset (10% of u1.base and 18 | # 10 % of u1.test) 19 | train_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_train') 20 | test_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_test') 21 | data = Dataset.load_from_folds([(train_file, test_file)], Reader('ml-100k')) 22 | 23 | random.seed(0) 24 | 25 | 26 | def test_grid_search_cv_results(): 27 | param_grid = {'n_epochs': [1, 2], 'lr_all': [0.002, 0.005], 28 | 'reg_all': [0.4, 0.6], 'n_factors': [1], 'init_std_dev': [0]} 29 | grid_search = GridSearch(SVD, param_grid) 30 | grid_search.evaluate(data) 31 | assert len(grid_search.cv_results['params']) == 8 32 | 33 | 34 | def test_measure_is_not_case_sensitive(): 35 | param_grid = {'n_epochs': [1], 'lr_all': [0.002, 0.005], 36 | 'reg_all': [0.4, 0.6], 'n_factors': [1], 'init_std_dev': [0]} 37 | grid_search = GridSearch(SVD, param_grid, measures=['FCP', 'mae', 'rMSE']) 38 | grid_search.evaluate(data) 39 | assert grid_search.best_index['fcp'] == grid_search.best_index['FCP'] 40 | assert grid_search.best_params['mAe'] == grid_search.best_params['MaE'] 41 | assert grid_search.best_score['RmSE'] == grid_search.best_score['RMSE'] 42 | 43 | 44 | def test_best_estimator(): 45 | param_grid = {'n_epochs': [5], 'lr_all': [0.002, 0.005], 46 | 'reg_all': [0.4, 0.6], 'n_factors': [1], 'init_std_dev': [0]} 47 | grid_search = GridSearch(SVD, param_grid, measures=['FCP', 'mae', 'rMSE']) 48 | grid_search.evaluate(data) 49 | best_estimator = grid_search.best_estimator['MAE'] 50 | assert evaluate( 51 | best_estimator, data)['MAE'] == grid_search.best_score['MAE'] 52 | -------------------------------------------------------------------------------- /examples/top_n_recommendations.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module illustrates how to retrieve the top-10 items with highest rating 3 | prediction. We first train an SVD algorithm on the MovieLens dataset, and then 4 | predict all the ratings for the pairs (user, item) that are not in the training 5 | set. We then retrieve the top-10 prediction for each user. 6 | """ 7 | 8 | from __future__ import (absolute_import, division, print_function, 9 | unicode_literals) 10 | from collections import defaultdict 11 | 12 | from surprise import SVD 13 | from surprise import Dataset 14 | 15 | 16 | def get_top_n(predictions, n=10): 17 | '''Return the top-N recommendation for each user from a set of predictions. 18 | 19 | Args: 20 | predictions(list of Prediction objects): The list of predictions, as 21 | returned by the test method of an algorithm. 22 | n(int): The number of recommendation to output for each user. Default 23 | is 10. 24 | 25 | Returns: 26 | A dict where keys are user (raw) ids and values are lists of tuples: 27 | [(raw item id, rating estimation), ...] of size n. 28 | ''' 29 | 30 | # First map the predictions to each user. 31 | top_n = defaultdict(list) 32 | for uid, iid, true_r, est, _ in predictions: 33 | top_n[uid].append((iid, est)) 34 | 35 | # Then sort the predictions for each user and retrieve the k highest ones. 
36 |     for uid, user_ratings in top_n.items():
37 |         user_ratings.sort(key=lambda x: x[1], reverse=True)
38 |         top_n[uid] = user_ratings[:n]
39 | 
40 |     return top_n
41 | 
42 | 
43 | # First train an SVD algorithm on the movielens dataset.
44 | data = Dataset.load_builtin('ml-100k')
45 | trainset = data.build_full_trainset()
46 | algo = SVD()
47 | algo.train(trainset)
48 | 
49 | # Then predict ratings for all pairs (u, i) that are NOT in the training set.
50 | testset = trainset.build_anti_testset()
51 | predictions = algo.test(testset)
52 | 
53 | top_n = get_top_n(predictions, n=10)
54 | 
55 | # Print the recommended items for each user
56 | for uid, user_ratings in top_n.items():
57 |     print(uid, [iid for (iid, _) in user_ratings])
58 | 
-------------------------------------------------------------------------------- /tests/test_accuracy.py: --------------------------------------------------------------------------------
1 | """Module for testing accuracy evaluation measures (RMSE, MAE...)"""
2 | 
3 | from __future__ import (absolute_import, division, print_function,
4 |                         unicode_literals)
5 | from math import sqrt
6 | 
7 | import pytest
8 | 
9 | from surprise.accuracy import mae, rmse, fcp
10 | 
11 | 
12 | def pred(true_r, est, u0=None):
13 |     """Just a small tool to build a prediction with appropriate format."""
14 |     return (u0, None, true_r, est, None)
15 | 
16 | 
17 | def test_mae():
18 |     """Tests for the MAE function."""
19 | 
20 |     predictions = [pred(0, 0), pred(1, 1), pred(2, 2), pred(100, 100)]
21 |     assert mae(predictions) == 0
22 | 
23 |     predictions = [pred(0, 0), pred(0, 2)]
24 |     assert mae(predictions) == abs(0 - 2) / 2
25 | 
26 |     predictions = [pred(2, 0), pred(3, 4)]
27 |     assert mae(predictions) == (abs(2 - 0) + abs(3 - 4)) / 2
28 | 
29 |     with pytest.raises(ValueError):
30 |         mae([])
31 | 
32 | 
33 | def test_rmse():
34 |     """Tests for the RMSE function."""
35 | 
36 |     predictions = [pred(0, 0), pred(1, 1), pred(2, 2), pred(100, 100)]
37 |     assert rmse(predictions) == 0
38 | 
39 |     predictions = [pred(0, 0), pred(0, 2)]
40 |     assert rmse(predictions) == sqrt((0 - 2)**2 / 2)
41 | 
42 |     predictions = [pred(2, 0), pred(3, 4)]
43 |     assert rmse(predictions) == sqrt(((2 - 0)**2 + (3 - 4)**2) / 2)
44 | 
45 |     with pytest.raises(ValueError):
46 |         rmse([])
47 | 
48 | 
49 | def test_fcp():
50 |     """Tests for the FCP function."""
51 | 
52 |     predictions = [pred(0, 0, u0='u1'), pred(1, 1, u0='u1'), pred(2, 2,
53 |                    u0='u2'), pred(100, 100, u0='u2')]
54 |     assert fcp(predictions) == 1
55 | 
56 |     predictions = [pred(0, 0, u0='u1'), pred(0, 0, u0='u1')]
57 |     with pytest.raises(ValueError):
58 |         fcp(predictions)
59 | 
60 |     predictions = [pred(0, 0, u0='u1')]
61 |     with pytest.raises(ValueError):
62 |         fcp(predictions)
63 | 
64 |     predictions = [pred(0, 1, u0='u1'), pred(1, 0, u0='u1'), pred(2, 0.5,
65 |                    u0='u2'), pred(0, 0.6, u0='u2')]
66 |     assert fcp(predictions) == 0
67 | 
68 |     with pytest.raises(ValueError):
69 |         fcp([])
70 | 
-------------------------------------------------------------------------------- /examples/k_nearest_neighbors.py: --------------------------------------------------------------------------------
1 | """
2 | This module illustrates how to retrieve the k-nearest neighbors of an item. The
3 | same can be done for users with minor changes. There's a lot of boilerplate
4 | because of the id conversions, but it all boils down to the use of
5 | algo.get_neighbors().
6 | """ 7 | 8 | from __future__ import (absolute_import, division, print_function, 9 | unicode_literals) 10 | import os 11 | import io # needed because of weird encoding of u.item file 12 | 13 | from surprise import KNNBaseline 14 | from surprise import Dataset 15 | 16 | 17 | def read_item_names(): 18 | """Read the u.item file from MovieLens 100-k dataset and return two 19 | mappings to convert raw ids into movie names and movie names into raw ids. 20 | """ 21 | 22 | file_name = (os.path.expanduser('~') + 23 | '/.surprise_data/ml-100k/ml-100k/u.item') 24 | rid_to_name = {} 25 | name_to_rid = {} 26 | with io.open(file_name, 'r', encoding='ISO-8859-1') as f: 27 | for line in f: 28 | line = line.split('|') 29 | rid_to_name[line[0]] = line[1] 30 | name_to_rid[line[1]] = line[0] 31 | 32 | return rid_to_name, name_to_rid 33 | 34 | 35 | # First, train the algortihm to compute the similarities between items 36 | data = Dataset.load_builtin('ml-100k') 37 | trainset = data.build_full_trainset() 38 | sim_options = {'name': 'pearson_baseline', 'user_based': False} 39 | algo = KNNBaseline(sim_options=sim_options) 40 | algo.train(trainset) 41 | 42 | # Read the mappings raw id <-> movie name 43 | rid_to_name, name_to_rid = read_item_names() 44 | 45 | # Retieve inner id of the movie Toy Story 46 | toy_story_raw_id = name_to_rid['Toy Story (1995)'] 47 | toy_story_inner_id = algo.trainset.to_inner_iid(toy_story_raw_id) 48 | 49 | # Retrieve inner ids of the nearest neighbors of Toy Story. 50 | toy_story_neighbors = algo.get_neighbors(toy_story_inner_id, k=10) 51 | 52 | # Convert inner ids of the neighbors into names. 53 | toy_story_neighbors = (algo.trainset.to_raw_iid(inner_id) 54 | for inner_id in toy_story_neighbors) 55 | toy_story_neighbors = (rid_to_name[rid] 56 | for rid in toy_story_neighbors) 57 | 58 | print() 59 | print('The 10 nearest neighbors of Toy Story are:') 60 | for movie in toy_story_neighbors: 61 | print(movie) 62 | -------------------------------------------------------------------------------- /tests/test_reader.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module for testing the Reader class. 
3 | """ 4 | 5 | from __future__ import (absolute_import, division, print_function, 6 | unicode_literals) 7 | 8 | import pytest 9 | 10 | from surprise import Reader 11 | 12 | 13 | def test_params(): 14 | """Test Reader parameters""" 15 | 16 | with pytest.raises(ValueError): 17 | Reader(name='wrong_name') 18 | 19 | with pytest.raises(ValueError): 20 | Reader(line_format='users item rating') 21 | 22 | with pytest.raises(ValueError): 23 | Reader(line_format='user itemm rating') 24 | 25 | with pytest.raises(ValueError): 26 | Reader(line_format='item user rrating') 27 | 28 | with pytest.raises(ValueError): 29 | Reader(line_format='item BLABLA user rating') 30 | 31 | 32 | def test_parse_line(): 33 | """Test the parse_line method""" 34 | 35 | # Basic line parsing 36 | line_format = 'user item rating timestamp' 37 | sep = ',' 38 | reader = Reader(line_format=line_format, sep=sep) 39 | 40 | line = 'me,best_movie_ever, 5 ,25111990' 41 | uid, iid, rating, timestamp = reader.parse_line(line) 42 | 43 | assert uid == 'me' 44 | assert iid == 'best_movie_ever' 45 | assert rating == 5 46 | assert timestamp == '25111990' 47 | 48 | # Change order of fields (and sep) 49 | line_format = 'timestamp rating item user' 50 | sep = ' ' 51 | reader = Reader(line_format=line_format, sep=sep) 52 | 53 | line = '25111990 5 best_movie_ever me' 54 | uid, iid, rating, timestamp = reader.parse_line(line) 55 | 56 | assert uid == 'me' 57 | assert iid == 'best_movie_ever' 58 | assert rating == 5 59 | assert timestamp == '25111990' 60 | 61 | # Without timestamp (changed sep as well) 62 | line_format = 'rating item user' 63 | sep = '-' 64 | reader = Reader(line_format=line_format, sep=sep) 65 | 66 | line = '5 - best_movie_ever - me' 67 | uid, iid, rating, _ = reader.parse_line(line) 68 | 69 | assert uid == 'me' 70 | assert iid == 'best_movie_ever' 71 | assert rating == 5 72 | 73 | # Wrong sep 74 | line_format = 'rating item user' 75 | sep = ';' 76 | reader = Reader(line_format=line_format, sep=sep) 77 | 78 | line = '5 - best_movie_ever - me' 79 | with pytest.raises(ValueError): 80 | uid, iid, rating, _ = reader.parse_line(line) 81 | 82 | # Wrong number of fields 83 | line = '5 - best_movie_ever' 84 | with pytest.raises(ValueError): 85 | uid, iid, rating, _ = reader.parse_line(line) 86 | -------------------------------------------------------------------------------- /doc/source/refs.bib: -------------------------------------------------------------------------------- 1 | @article{Koren:2010, 2 | author = {Koren, Yehuda}, 3 | title = {Factor in the Neighbors: Scalable and Accurate Collaborative Filtering}, 4 | journal = {}, 5 | year = {2010}, 6 | url = {http://courses.ischool.berkeley.edu/i290-dm/s11/SECURE/a1-koren.pdf}, 7 | } 8 | 9 | @book{Ricci:2010, 10 | author = {Ricci, Francesco and Rokach, Lior and Shapira, Bracha and Kantor, Paul B.}, 11 | title = {Recommender Systems Handbook}, 12 | year = {2010}, 13 | edition = {1st}, 14 | publisher = {}, 15 | } 16 | 17 | @article{salakhutdinov2008a, 18 | author = {Salakhutdinov, Ruslan and Mnih, Andriy}, 19 | journal = {}, 20 | title = {Probabilistic Matrix Factorization}, 21 | year = 2008, 22 | url = {http://papers.nips.cc/paper/3208-probabilistic-matrix-factorization.pdf}, 23 | } 24 | 25 | @article{Koren:2009, 26 | author = {Koren, Yehuda and Bell, Robert and Volinsky, Chris}, 27 | title = {Matrix Factorization Techniques for Recommender Systems}, 28 | journal = {}, 29 | year = {2009}, 30 | } 31 | 32 | @article{lemire2007a, 33 | author = {Daniel Lemire and Anna Maclachlan}, 34 | title 
35 |   journal = {},
36 |   year = {2007},
37 |   url = {http://arxiv.org/abs/cs/0702144},
38 | }
39 | 
40 | @article{Koren:2008:FMN,
41 |   author = {Koren, Yehuda},
42 |   title = {Factorization Meets the Neighborhood: A Multifaceted Collaborative Filtering Model},
43 |   journal = {},
44 |   year = {2008},
45 |   url = {http://www.cs.rochester.edu/twiki/pub/Main/HarpSeminar/Factorization_Meets_the_Neighborhood-_a_Multifaceted_Collaborative_Filtering_Model.pdf},
46 | }
47 | 
48 | @article{George:2005,
49 |   author = {George, Thomas and Merugu, Srujana},
50 |   title = {A Scalable Collaborative Filtering Framework Based on
51 |            Co-Clustering},
52 |   journal = {},
53 |   year = {2005},
54 |   url = {http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.113.6458&rep=rep1&type=pdf},
55 | }
56 | 
57 | @article{NMF:2014,
58 |   author = {Luo, Xin and Zhou, Mengchu and Xia, Yunni and Zhu, Qinsheng},
59 |   title = {An Efficient Non-Negative Matrix Factorization-Based Approach to
60 |            Collaborative Filtering for Recommender Systems},
61 |   journal = {},
62 |   year = {2014},
63 | }
64 | 
65 | @article{Zhang96,
66 |   author = {Sheng Zhang and Weihong Wang and James Ford and Fillia Makedon},
67 |   title = {Learning from incomplete ratings using non-negative matrix factorization},
68 |   journal = {},
69 |   year = {2006},
70 |   url = {http://www.siam.org/meetings/sdm06/proceedings/059zhangs2.pdf}
71 | }
72 | 
73 | @article{NMF_algo,
74 |   title = {Algorithms for Non-negative Matrix Factorization},
75 |   author = {Daniel D. Lee and Seung, H. Sebastian},
76 |   journal = {},
77 |   year = {2001},
78 |   url = {http://papers.nips.cc/paper/1861-algorithms-for-non-negative-matrix-factorization.pdf}
79 | }
80 | 
-------------------------------------------------------------------------------- /surprise/prediction_algorithms/optimize_baselines.pyx: --------------------------------------------------------------------------------
1 | """
2 | This module includes the two methods for baseline computation: stochastic
3 | gradient descent and alternating least squares.
4 | """
5 | 
6 | from __future__ import (absolute_import, division, print_function,
7 |                         unicode_literals)
8 | 
9 | cimport numpy as np  # noqa
10 | import numpy as np
11 | from six.moves import range
12 | 
13 | 
14 | def baseline_als(self):
15 |     """Optimize biases using ALS.
16 | 
17 |     Args:
18 |         self: The algorithm that needs to compute baselines.
19 | 
20 |     Returns:
21 |         A tuple ``(bu, bi)``, which are the user and item baselines.
22 | """ 23 | 24 | # This piece of code is largely inspired by that of MyMediaLite: 25 | # https://github.com/zenogantner/MyMediaLite/blob/master/src/MyMediaLite/RatingPrediction/UserItemBaseline.cs 26 | # see also https://www.youtube.com/watch?v=gCaOa3W9kM0&t=32m55s 27 | # (Alex Smola on RS, ML Class 10-701) 28 | 29 | cdef np.ndarray[np.double_t] bu = np.zeros(self.trainset.n_users) 30 | cdef np.ndarray[np.double_t] bi = np.zeros(self.trainset.n_items) 31 | 32 | cdef int u, i 33 | cdef double r, err, dev_i, dev_u 34 | cdef double global_mean = self.trainset.global_mean 35 | 36 | cdef int n_epochs = self.bsl_options.get('n_epochs', 10) 37 | cdef double reg_u = self.bsl_options.get('reg_u', 15) 38 | cdef double reg_i = self.bsl_options.get('reg_i', 10) 39 | 40 | for dummy in range(n_epochs): 41 | for i in self.trainset.all_items(): 42 | dev_i = 0 43 | for (u, r) in self.trainset.ir[i]: 44 | dev_i += r - global_mean - bu[u] 45 | 46 | bi[i] = dev_i / (reg_i + len(self.trainset.ir[i])) 47 | 48 | for u in self.trainset.all_users(): 49 | dev_u = 0 50 | for (i, r) in self.trainset.ur[u]: 51 | dev_u += r - global_mean - bi[i] 52 | bu[u] = dev_u / (reg_u + len(self.trainset.ur[u])) 53 | 54 | return bu, bi 55 | 56 | 57 | def baseline_sgd(self): 58 | """Optimize biases using SGD. 59 | 60 | Args: 61 | self: The algorithm that needs to compute baselines. 62 | 63 | Returns: 64 | A tuple ``(bu, bi)``, which are users and items baselines. 65 | """ 66 | 67 | cdef np.ndarray[np.double_t] bu = np.zeros(self.trainset.n_users) 68 | cdef np.ndarray[np.double_t] bi = np.zeros(self.trainset.n_items) 69 | 70 | cdef int u, i 71 | cdef double r, err 72 | cdef double global_mean = self.trainset.global_mean 73 | 74 | cdef int n_epochs = self.bsl_options.get('n_epochs', 20) 75 | cdef double reg = self.bsl_options.get('reg', 0.02) 76 | cdef double lr = self.bsl_options.get('learning_rate', 0.005) 77 | 78 | for dummy in range(n_epochs): 79 | for u, i, r in self.trainset.all_ratings(): 80 | err = (r - (global_mean + bu[u] + bi[i])) 81 | bu[u] += lr * (err - reg * bu[u]) 82 | bi[i] += lr * (err - reg * bi[i]) 83 | 84 | return bu, bi 85 | -------------------------------------------------------------------------------- /tests/test_NMF.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module for testing the NMF algorithm. 3 | """ 4 | 5 | from __future__ import (absolute_import, division, print_function, 6 | unicode_literals) 7 | import os 8 | import pytest 9 | 10 | from surprise import NMF 11 | from surprise import Dataset 12 | from surprise import Reader 13 | from surprise import evaluate 14 | 15 | # the test and train files are from the ml-100k dataset (10% of u1.base and 16 | # 10 % of u1.test) 17 | train_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_train') 18 | test_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_test') 19 | data = Dataset.load_from_folds([(train_file, test_file)], Reader('ml-100k')) 20 | 21 | 22 | def test_NMF_parameters(): 23 | """Ensure that all parameters are taken into account.""" 24 | 25 | # The baseline against which to compare. 
-------------------------------------------------------------------------------- /tests/test_NMF.py: --------------------------------------------------------------------------------
1 | """
2 | Module for testing the NMF algorithm.
3 | """
4 | 
5 | from __future__ import (absolute_import, division, print_function,
6 |                         unicode_literals)
7 | import os
8 | import pytest
9 | 
10 | from surprise import NMF
11 | from surprise import Dataset
12 | from surprise import Reader
13 | from surprise import evaluate
14 | 
15 | # the test and train files are from the ml-100k dataset (10% of u1.base and
16 | # 10 % of u1.test)
17 | train_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_train')
18 | test_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_test')
19 | data = Dataset.load_from_folds([(train_file, test_file)], Reader('ml-100k'))
20 | 
21 | 
22 | def test_NMF_parameters():
23 |     """Ensure that all parameters are taken into account."""
24 | 
25 |     # The baseline against which to compare.
26 |     algo = NMF(n_factors=1, n_epochs=1)
27 |     rmse_default = evaluate(algo, data, measures=['rmse'])['rmse']
28 | 
29 |     # n_factors
30 |     algo = NMF(n_factors=2, n_epochs=1)
31 |     rmse_factors = evaluate(algo, data, measures=['rmse'])['rmse']
32 |     assert rmse_default != rmse_factors
33 | 
34 |     # n_epochs
35 |     algo = NMF(n_factors=1, n_epochs=2)
36 |     rmse_n_epochs = evaluate(algo, data, measures=['rmse'])['rmse']
37 |     assert rmse_default != rmse_n_epochs
38 | 
39 |     # biased
40 |     algo = NMF(n_factors=1, n_epochs=1, biased=True)
41 |     rmse_biased = evaluate(algo, data, measures=['rmse'])['rmse']
42 |     assert rmse_default != rmse_biased
43 | 
44 |     # reg_pu
45 |     algo = NMF(n_factors=1, n_epochs=1, reg_pu=1)
46 |     rmse_reg_pu = evaluate(algo, data, measures=['rmse'])['rmse']
47 |     assert rmse_default != rmse_reg_pu
48 | 
49 |     # reg_qi
50 |     algo = NMF(n_factors=1, n_epochs=1, reg_qi=1)
51 |     rmse_reg_qi = evaluate(algo, data, measures=['rmse'])['rmse']
52 |     assert rmse_default != rmse_reg_qi
53 | 
54 |     # reg_bu
55 |     algo = NMF(n_factors=1, n_epochs=1, reg_bu=1)
56 |     rmse_reg_bu = evaluate(algo, data, measures=['rmse'])['rmse']
57 |     assert rmse_default != rmse_reg_bu
58 | 
59 |     # reg_bi
60 |     algo = NMF(n_factors=1, n_epochs=1, reg_bi=1)
61 |     rmse_reg_bi = evaluate(algo, data, measures=['rmse'])['rmse']
62 |     assert rmse_default != rmse_reg_bi
63 | 
64 |     # lr_bu
65 |     algo = NMF(n_factors=1, n_epochs=1, lr_bu=1)
66 |     rmse_lr_bu = evaluate(algo, data, measures=['rmse'])['rmse']
67 |     assert rmse_default != rmse_lr_bu
68 | 
69 |     # lr_bi
70 |     algo = NMF(n_factors=1, n_epochs=1, lr_bi=1)
71 |     rmse_lr_bi = evaluate(algo, data, measures=['rmse'])['rmse']
72 |     assert rmse_default != rmse_lr_bi
73 | 
74 |     # init_low
75 |     algo = NMF(n_factors=1, n_epochs=1, init_low=.5)
76 |     rmse_init_low = evaluate(algo, data, measures=['rmse'])['rmse']
77 |     assert rmse_default != rmse_init_low
78 | 
79 |     # init_low
80 |     with pytest.raises(ValueError):
81 |         algo = NMF(n_factors=1, n_epochs=1, init_low=-1)
82 | 
83 |     # init_high
84 |     algo = NMF(n_factors=1, n_epochs=1, init_high=.5)
85 |     rmse_init_high = evaluate(algo, data, measures=['rmse'])['rmse']
86 |     assert rmse_default != rmse_init_high
87 | 
-------------------------------------------------------------------------------- /CHANGELOG.md: --------------------------------------------------------------------------------
1 | Current
2 | =======
3 | 
4 | VERSION 1.0.3
5 | =============
6 | 
7 | Date: 03/05/17
8 | 
9 | Enhancements
10 | ------------
11 | 
12 | * Added FAQ in the doc
13 | * Added the possibility to retrieve the k nearest neighbors of a user or an
14 |   item.
15 | * Changed the dumping process a bit (see API changes). Plus, dumps can now be
16 |   loaded.
17 | * Added possibility to build a testset from the ratings of a training set
18 | * Added inner-to-raw id conversion in the Trainset class
19 | * The r_ui parameter of the predict() method is now optional
20 | 
21 | Fixes
22 | -----
23 | * Fixed verbosity of the evaluate function
24 | * Corrected prediction when only user (or only item) is unknown in SVD and NMF
25 |   algorithms. Thanks to kenoung!
26 | * Corrected factor vectors initialization of SVD algorithms. Thanks to
27 |   adideshp!
28 | 
29 | API Changes
30 | -----------
31 | 
32 | * The dump() method now dumps a list of predictions (optional) and an algorithm
33 |   (optional as well). The algorithm is now a real algorithm object. The
34 |   trainset is not dumped anymore as it is already part of the algorithm anyway.
35 | * The dump() method is now part of the dump namespace, and not the global
36 |   namespace (so it is accessed by surprise.dump.dump)
37 | 
38 | VERSION 1.0.2
39 | =============
40 | 
41 | Date: 04/01/17
42 | 
43 | Just a minor change so that README.md is converted to rst for better rendering
44 | on PyPI.
45 | 
46 | VERSION 1.0.1
47 | =============
48 | 
49 | Date: 02/01/17
50 | 
51 | Enhancements
52 | ------------
53 | 
54 | * Added the GridSearch feature, by Maher
55 | * Added a 'clip' option to the predict() method
56 | * Added NMF algorithm
57 | * Added entry point for better command line usage.
58 | * Added CoClustering algorithm.
59 | * Added SlopeOne algorithm.
60 | * Added Probabilistic Matrix Factorization as an option of SVD
61 | * Cythonized Baseline Computation
62 | 
63 | Other
64 | -----
65 | 
66 | * Surprise is now a scikit!
67 | * Changed license to BSD
68 | * Six is now a dependency
69 | 
70 | VERSION 1.0.0
71 | =============
72 | 
73 | Date: 22/11/16
74 | 
75 | * Changed name from recsys to surprise
76 | * Improved printing of accuracy measures.
77 | * Added version number.
78 | * Rewrote the __main__.py
79 | 
80 | VERSION 0.0.4
81 | =============
82 | 
83 | Date: 15/11/16
84 | 
85 | Enhancements
86 | ------------
87 | 
88 | * Added notebooks for comparing and evaluating algorithm performances
89 | * Better use of setup.py
90 | * Added a min_support parameter to the similarity measures.
91 | * Added a min_k parameter to the KNN algorithms.
92 | * The similarity matrix and baselines are now returned.
93 | * You can now train on a whole training set without test set.
94 | * The estimate method can return a tuple with prediction details.
95 | * Added SVD and SVD++ algorithms.
96 | * Removed all the x/y vs user/item stuff. That was useless for most algorithms.
97 | 
98 | 
99 | API Changes
100 | -----------
101 | 
102 | * Removed the @property decorator for many iterators.
103 | * It's now up to the algorithms to decide if they can or cannot make a
104 |   prediction.
105 | 
106 | VERSION 0.0.3
107 | =============
108 | 
109 | Date: 25/10/16
110 | 
111 | * Added support for Python 2
112 | 
-------------------------------------------------------------------------------- /tests/test_sim_options.py: --------------------------------------------------------------------------------
1 | """
2 | Module for testing the sim_options parameter.
3 | """ 4 | 5 | from __future__ import (absolute_import, division, print_function, 6 | unicode_literals) 7 | import os 8 | from itertools import combinations 9 | 10 | import pytest 11 | 12 | from surprise import KNNBasic 13 | from surprise import KNNWithMeans 14 | from surprise import KNNBaseline 15 | from surprise import Dataset 16 | from surprise import Reader 17 | from surprise import evaluate 18 | 19 | 20 | # the test and train files are from the ml-100k dataset (10% of u1.base and 21 | # 10 % of u1.test) 22 | train_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_train') 23 | test_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_test') 24 | data = Dataset.load_from_folds([(train_file, test_file)], Reader('ml-100k')) 25 | 26 | 27 | def test_name_field(): 28 | """Ensure the name field is taken into account.""" 29 | 30 | sim_options = {'name': 'cosine'} 31 | algo = KNNBasic(sim_options=sim_options) 32 | rmse_cosine = evaluate(algo, data, measures=['rmse'])['rmse'] 33 | 34 | sim_options = {'name': 'msd'} 35 | algo = KNNBasic(sim_options=sim_options) 36 | rmse_msd = evaluate(algo, data, measures=['rmse'])['rmse'] 37 | 38 | sim_options = {'name': 'pearson'} 39 | algo = KNNBasic(sim_options=sim_options) 40 | rmse_pearson = evaluate(algo, data, measures=['rmse'])['rmse'] 41 | 42 | sim_options = {'name': 'pearson_baseline'} 43 | bsl_options = {'n_epochs': 1} 44 | algo = KNNBasic(sim_options=sim_options, bsl_options=bsl_options) 45 | rmse_pearson_bsl = evaluate(algo, data, measures=['rmse'])['rmse'] 46 | 47 | for rmse_a, rmse_b in combinations((rmse_cosine, rmse_msd, rmse_pearson, 48 | rmse_pearson_bsl), 2): 49 | assert (rmse_a != rmse_b) 50 | 51 | with pytest.raises(NameError): 52 | sim_options = {'name': 'wrong_name'} 53 | algo = KNNBasic(sim_options=sim_options) 54 | evaluate(algo, data) 55 | 56 | 57 | def test_user_based_field(): 58 | """Ensure that the user_based field is taken into account (only) when 59 | needed.""" 60 | 61 | algorithms = (KNNBasic, KNNWithMeans, KNNBaseline) 62 | for klass in algorithms: 63 | algo = klass(sim_options={'user_based': True}) 64 | rmses_user_based = evaluate(algo, data, measures=['rmse'])['rmse'] 65 | algo = klass(sim_options={'user_based': False}) 66 | rmses_item_based = evaluate(algo, data, measures=['rmse'])['rmse'] 67 | assert rmses_user_based != rmses_item_based 68 | 69 | 70 | def test_shrinkage_field(): 71 | """Ensure the shrinkage field is taken into account.""" 72 | 73 | sim_options = {'name': 'pearson_baseline', 74 | 'shrinkage': 0 75 | } 76 | bsl_options = {'n_epochs': 1} 77 | algo = KNNBasic(sim_options=sim_options) 78 | rmse_shrinkage_0 = evaluate(algo, data, measures=['rmse'])['rmse'] 79 | 80 | sim_options = {'name': 'pearson_baseline', 81 | 'shrinkage': 100 82 | } 83 | bsl_options = {'n_epochs': 1} 84 | algo = KNNBasic(sim_options=sim_options, bsl_options=bsl_options) 85 | rmse_shrinkage_100 = evaluate(algo, data, measures=['rmse'])['rmse'] 86 | 87 | assert rmse_shrinkage_0 != rmse_shrinkage_100 88 | -------------------------------------------------------------------------------- /surprise/prediction_algorithms/slope_one.pyx: -------------------------------------------------------------------------------- 1 | """ 2 | the :mod:`slope_one` module includes the :class:`SlopeOne` algorithm. 
3 | """ 4 | 5 | from __future__ import (absolute_import, division, print_function, 6 | unicode_literals) 7 | 8 | cimport numpy as np # noqa 9 | import numpy as np 10 | from six.moves import range 11 | from six import iteritems 12 | 13 | from .algo_base import AlgoBase 14 | from .predictions import PredictionImpossible 15 | 16 | 17 | class SlopeOne(AlgoBase): 18 | """A simple yet accurate collaborative filtering algorithm. 19 | 20 | This is a straightforward implementation of the SlopeOne algorithm 21 | :cite:`lemire2007a`. 22 | 23 | The prediction :math:`\\hat{r}_{ui}` is set as: 24 | 25 | .. math:: 26 | \hat{r}_{ui} = \\mu_u + \\frac{1}{ 27 | |R_i(u)|} 28 | \\sum\\limits_{j \in R_i(u)} \\text{dev}(i, j), 29 | 30 | where :math:`R_i(u)` is the set of relevant items, i.e. the set of items 31 | :math:`j` rated by :math:`u` that also have at least one common user with 32 | :math:`i`. :math:`\\text{dev}_(i, j)` is defined as the average difference 33 | between the ratings of :math:`i` and those of :math:`j`: 34 | 35 | .. math:: 36 | \\text{dev}(i, j) = \\frac{1}{ 37 | |U_{ij}|}\\sum\\limits_{u \in U_{ij}} r_{ui} - r_{uj} 38 | """ 39 | 40 | def __init__(self): 41 | 42 | AlgoBase.__init__(self) 43 | 44 | def train(self, trainset): 45 | 46 | n_items = trainset.n_items 47 | 48 | # Number of users having rated items i and j: |U_ij| 49 | cdef np.ndarray[np.int_t, ndim=2] freq 50 | # Deviation from item i to item j: mean(r_ui - r_uj for u in U_ij) 51 | cdef np.ndarray[np.double_t, ndim=2] dev 52 | 53 | cdef int u, i, j, r_ui, r_uj 54 | 55 | AlgoBase.train(self, trainset) 56 | 57 | freq = np.zeros((trainset.n_items, trainset.n_items), np.int) 58 | dev = np.zeros((trainset.n_items, trainset.n_items), np.double) 59 | 60 | # Computation of freq and dev arrays. 61 | for u, u_ratings in iteritems(trainset.ur): 62 | for i, r_ui in u_ratings: 63 | for j, r_uj in u_ratings: 64 | freq[i, j] += 1 65 | dev[i, j] += r_ui - r_uj 66 | 67 | for i in range(n_items): 68 | dev[i, i] = 0 69 | for j in range(i + 1, n_items): 70 | dev[i, j] /= freq[i, j] 71 | dev[j, i] = -dev[i, j] 72 | 73 | self.freq = freq 74 | self.dev = dev 75 | 76 | # mean ratings of all users: mu_u 77 | self.user_mean = [np.mean([r for (_, r) in trainset.ur[u]]) 78 | for u in trainset.all_users()] 79 | 80 | def estimate(self, u, i): 81 | 82 | if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)): 83 | raise PredictionImpossible('User and/or item is unkown.') 84 | 85 | # Ri: relevant items for i. This is the set of items j rated by u that 86 | # also have common users with i (i.e. at least one user has rated both 87 | # i and j). 88 | Ri = [j for (j, _) in self.trainset.ur[u] if self.freq[i, j] > 0] 89 | est = self.user_mean[u] 90 | if Ri: 91 | est += sum(self.dev[i, j] for j in Ri) / len(Ri) 92 | 93 | return est 94 | -------------------------------------------------------------------------------- /tests/test_algorithms.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module for testing prediction algorithms. 
3 | """ 4 | 5 | from __future__ import (absolute_import, division, print_function, 6 | unicode_literals) 7 | import os 8 | 9 | from surprise import NormalPredictor 10 | from surprise import BaselineOnly 11 | from surprise import KNNBasic 12 | from surprise import KNNWithMeans 13 | from surprise import KNNBaseline 14 | from surprise import SVD 15 | from surprise import SVDpp 16 | from surprise import NMF 17 | from surprise import SlopeOne 18 | from surprise import CoClustering 19 | from surprise import Dataset 20 | from surprise import Reader 21 | 22 | 23 | def test_unknown_user_or_item(): 24 | """Ensure that all algorithms act gracefully when asked to predict a rating 25 | of an unknown user, an unknown item, and when both are unknown. 26 | """ 27 | 28 | reader = Reader(line_format='user item rating', sep=' ', skip_lines=3, 29 | rating_scale=(1, 5)) 30 | 31 | file_path = os.path.dirname(os.path.realpath(__file__)) + '/custom_train' 32 | 33 | data = Dataset.load_from_file(file_path=file_path, reader=reader) 34 | 35 | for trainset, testset in data.folds(): 36 | pass # just need trainset and testset to be set 37 | 38 | klasses = (NormalPredictor, BaselineOnly, KNNBasic, KNNWithMeans, 39 | KNNBaseline, SVD, SVDpp, NMF, SlopeOne, CoClustering) 40 | for klass in klasses: 41 | algo = klass() 42 | algo.train(trainset) 43 | algo.predict('user0', 'unknown_item', None) 44 | algo.predict('unkown_user', 'item0', None) 45 | algo.predict('unkown_user', 'unknown_item', None) 46 | 47 | 48 | def test_knns(): 49 | """Ensure the k and min_k parameters are effective for knn algorithms.""" 50 | 51 | # the test and train files are from the ml-100k dataset (10% of u1.base and 52 | # 10 % of u1.test) 53 | train_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_train') 54 | test_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_test') 55 | data = Dataset.load_from_folds([(train_file, test_file)], 56 | Reader('ml-100k')) 57 | 58 | # Actually, as KNNWithMeans and KNNBaseline have back up solutions for when 59 | # there are not enough neighbors, we can't really test them... 
60 |     klasses = (KNNBasic, )  # KNNWithMeans, KNNBaseline)
61 | 
62 |     k, min_k = 20, 5
63 |     for klass in klasses:
64 |         algo = klass(k=k, min_k=min_k)
65 |         for trainset, testset in data.folds():
66 |             algo.train(trainset)
67 |             predictions = algo.test(testset)
68 |             for pred in predictions:
69 |                 if not pred.details['was_impossible']:
70 |                     assert min_k <= pred.details['actual_k'] <= k
71 | 
72 | 
73 | def test_nearest_neighbors():
74 |     """Ensure the nearest neighbors are different when using user-user
75 |     similarity vs item-item."""
76 | 
77 |     reader = Reader(line_format='user item rating', sep=' ', skip_lines=3,
78 |                     rating_scale=(1, 5))
79 | 
80 |     data_file = os.path.dirname(os.path.realpath(__file__)) + '/custom_train'
81 |     data = Dataset.load_from_file(data_file, reader)
82 |     trainset = data.build_full_trainset()
83 | 
84 |     algo_ub = KNNBasic(sim_options={'user_based': True})
85 |     algo_ub.train(trainset)
86 |     algo_ib = KNNBasic(sim_options={'user_based': False})
87 |     algo_ib.train(trainset)
88 |     assert algo_ub.get_neighbors(0, k=10) != algo_ib.get_neighbors(0, k=10)
89 | 
-------------------------------------------------------------------------------- /setup.py: --------------------------------------------------------------------------------
1 | from setuptools import setup, find_packages, Extension
2 | from codecs import open
3 | from os import path
4 | 
5 | # sum up:
6 | # mktmpenv (Python version should not matter)
7 | # pip install numpy cython pypandoc
8 | # python setup.py sdist
9 | # twine upload dist/blabla.tar.gz [-r testpypi]
10 | 
11 | try:
12 |     import numpy as np
13 | except ImportError:
14 |     exit('Please install numpy>=1.11.2 first.')
15 | 
16 | try:
17 |     from Cython.Build import cythonize
18 |     from Cython.Distutils import build_ext
19 | except ImportError:
20 |     USE_CYTHON = False
21 | else:
22 |     USE_CYTHON = True
23 | 
24 | __version__ = '1.0.3'
25 | 
26 | here = path.abspath(path.dirname(__file__))
27 | 
28 | # Get the long description from the README file and convert it to rst
29 | try:
30 |     import pypandoc
31 |     long_description = pypandoc.convert(path.join(here, 'README.md'), 'rst')
32 | except(IOError, ImportError):
33 |     with open(path.join(here, 'README.md'), encoding='utf-8') as f:
34 |         long_description = f.read()
35 | 
36 | # get the dependencies and installs
37 | with open(path.join(here, 'requirements.txt'), encoding='utf-8') as f:
38 |     all_reqs = f.read().split('\n')
39 | 
40 | install_requires = [x.strip() for x in all_reqs if 'git+' not in x]
41 | dependency_links = [x.strip().replace('git+', '') for x in all_reqs if x.startswith('git+')]
42 | 
43 | cmdclass = {}
44 | 
45 | ext = '.pyx' if USE_CYTHON else '.c'
46 | 
47 | extensions = [Extension('surprise.similarities',
48 |                         ['surprise/similarities' + ext],
49 |                         include_dirs=[np.get_include()]),
50 |               Extension('surprise.prediction_algorithms.matrix_factorization',
51 |                         ['surprise/prediction_algorithms/matrix_factorization' + ext],
52 |                         include_dirs=[np.get_include()]),
53 |               Extension('surprise.prediction_algorithms.optimize_baselines',
54 |                         ['surprise/prediction_algorithms/optimize_baselines' + ext],
55 |                         include_dirs=[np.get_include()]),
56 |               Extension('surprise.prediction_algorithms.slope_one',
57 |                         ['surprise/prediction_algorithms/slope_one' + ext],
58 |                         include_dirs=[np.get_include()]),
59 |               Extension('surprise.prediction_algorithms.co_clustering',
60 |                         ['surprise/prediction_algorithms/co_clustering' + ext],
61 |                         include_dirs=[np.get_include()]),
62 |               ]
63 | 
64 | if USE_CYTHON:
65 |     ext_modules = cythonize(extensions)
66 |     cmdclass.update({'build_ext': build_ext})
67 | else:
68 |     ext_modules = extensions
69 | 
70 | setup(
71 |     name='scikit-surprise',
72 |     author='Nicolas Hug',
73 |     author_email='contact@nicolas-hug.com',
74 | 
75 |     description=('An easy-to-use library for recommender systems.'),
76 |     long_description=long_description,
77 | 
78 |     version=__version__,
79 |     url='http://surpriselib.com',
80 | 
81 |     license='BSD',
82 |     classifiers=[
83 |         'Development Status :: 5 - Production/Stable',
84 |         'Intended Audience :: Developers',
85 |         'Intended Audience :: Education',
86 |         'Intended Audience :: Science/Research',
87 |         'Topic :: Scientific/Engineering',
88 |         'License :: OSI Approved :: BSD License',
89 |         'Programming Language :: Python :: 3',
90 |         'Programming Language :: Python :: 2.7',
91 |     ],
92 |     keywords='recommender recommendation system',
93 | 
94 |     packages=find_packages(exclude=['tests*']),
95 |     include_package_data=True,
96 |     ext_modules=ext_modules,
97 |     cmdclass=cmdclass,
98 |     install_requires=install_requires,
99 |     dependency_links=dependency_links,
100 | 
101 |     entry_points={'console_scripts':
102 |                   ['surprise = surprise.__main__:main']},
103 | )
104 | 
-------------------------------------------------------------------------------- /TODO.md: --------------------------------------------------------------------------------
1 | TODO
2 | ====
3 | 
4 | * Allow to discount similarities (see aggarwal)
5 | * Support conda?
6 | * Allow incremental updates for some algorithms
7 | * Profile code (mostly cython) to see what could be optimized
8 | 
9 | Maybe, Maybe not
10 | ----------------
11 | 
12 | * allow a back up algorithm when prediction is impossible. Right now it's just
13 |   the mean rating that is predicted. Maybe user would want to choose it.
14 | 
15 | Done:
16 | -----
17 | 
18 | * Complete FAQ
19 | * Change the dumping machinery to be more consistent
20 | * Allow to test on the trainset
21 | * make bibtex entry
22 | * Verbosity of gridsearch still prints stuff because of evaluate. Fix that.
23 | * Make the r_ui param of predict optional
24 | * Put some PredictionImpossible messages in every algo
25 | * allow a 'clip' option to the predict method? Also, describe r_min and r_max
26 | * configure entrypoints to use surprise directly from command line
27 | * Allow a 'biased' option in the SVD algo. If true, use baselines, if False,
28 |   don't. It should be pretty easy to do.
29 | * create option in __main__ to clean the .recsys directory. Actually, the
30 |   __main__ module should be entirely reviewed.
31 | * when dumping, we should dump all the algorithm parameter. Use __dict__ ?
32 | * do something about the generators Python 2 vs 3 (range, dict.items(), etc...)
33 | * should a Prediction output the raw id or the inner id? Right now it's the
34 |   inner id. Maybe sort this out when working on the comparison tools.
35 | * allow the perf dict returned by evaluate to accept keys with lower/upper
36 |   case for forgetful users such as me.
37 | * Add a 'min_support' parameter to sim_options? Add a min_k to knns?
38 | * Do something about the user_based stuff. It should be better. Check knns BTW.
39 | * Do something about unknown users and unknown items, i.e. users or items that
40 |   have no rating in the trainset. Right now, the predict method checks if the
41 |   name starts with 'unknown' but this is shiiite because it's dependent on the
42 |   construct_trainset method, which is sometimes never called (so the raw2inner
43 |   stuff will come into play somehow). Plus, it should be up to the algorithms to
44 |   choose whether it can (or can't) make a prediction even if user or item is
45 |   unknown.
46 | * remove kwargs : done where useless.
47 | * say something quick about baseline computation (when not matrix facto)
48 | * Matrix facto algo
49 | * allow the 'estimate' method to return some details about prediction (such as
50 |   the number of neighbors for a KNN)
51 | * allow to train on a SINGLE file without test set, and let user query for some
52 |   predictions
53 | * write tuto for using only predict() (and not test)
54 | * maybe clean a little all the dataset machinery? Plus, are the
55 |   raw2inner_id_users and raw2inner_id_items worth keeping? May be for analysing
56 |   tools, I don't know right now. EDIT: yes, we need to keep them, simply
57 |   because the similarity computation can only work with integers as indexes
58 |   (numpy arrays).
59 | * sort out this warning issue coming from cython
60 | * say something about the sim > 0 in knns algos
61 | * get less restrictive requirements.txt
62 | * write the custom algorithm tutorial
63 | * improve test coverage
64 | * add the cool stickers on the readme just like scikit learn
65 | * set up travis
66 | * keep on testing
67 | * keep on documenting and commenting code
68 | * extensively test the reader class, + check that the doc is OK for reader
69 | * set up a nice API (looks ok now)
70 | * handle algo-specific or similarity-specific parameters (such as 'k' for knn,
71 |   regularization parameters, shrinkage parameters, etc.) in an appropriate
72 |   manner, rather than pass them all to constructors... UPDATE: ok so using
73 |   kwargs like matplotlib.pyplot might be enough. should we create a
74 |   'Similarity' class?
75 | * clean the main and all the dataset handling stuff (still needs to be
76 |   polished)
77 | * rewrite this TODO in english
78 | * create a proper project structure
79 | * from camelCase to snake\_case
80 | 
-------------------------------------------------------------------------------- /surprise/accuracy.py: --------------------------------------------------------------------------------
1 | """
2 | The :mod:`surprise.accuracy` module provides tools for computing accuracy
3 | metrics on a set of predictions.
4 | 
5 | Available accuracy metrics:
6 | 
7 | .. autosummary::
8 |     :nosignatures:
9 | 
10 |     rmse
11 |     mae
12 |     fcp
13 | """
14 | 
15 | from __future__ import (absolute_import, division, print_function,
16 |                         unicode_literals)
17 | from collections import defaultdict
18 | import numpy as np
19 | from six import iteritems
20 | 
21 | 
22 | def rmse(predictions, verbose=True):
23 |     """Compute RMSE (Root Mean Squared Error).
24 | 
25 |     .. math::
26 |         \\text{RMSE} = \\sqrt{\\frac{1}{|\\hat{R}|} \\sum_{\\hat{r}_{ui} \\in
27 |         \\hat{R}}(r_{ui} - \\hat{r}_{ui})^2}.
28 | 
29 |     Args:
30 |         predictions (:obj:`list` of :obj:`Prediction\
31 |             `):
32 |             A list of predictions, as returned by the :meth:`test()
33 |             ` method.
34 |         verbose: If True, will print computed value. Default is ``True``.
35 | 
36 | 
37 |     Returns:
38 |         The Root Mean Squared Error of predictions.
39 | 
40 |     Raises:
41 |         ValueError: When ``predictions`` is empty.
42 | """ 43 | 44 | if not predictions: 45 | raise ValueError('Prediction list is empty.') 46 | 47 | mse = np.mean([float((true_r - est)**2) 48 | for (_, _, true_r, est, _) in predictions]) 49 | rmse_ = np.sqrt(mse) 50 | 51 | if verbose: 52 | print('RMSE: {0:1.4f}'.format(rmse_)) 53 | 54 | return rmse_ 55 | 56 | 57 | def mae(predictions, verbose=True): 58 | """Compute MAE (Mean Absolute Error). 59 | 60 | .. math:: 61 | \\text{MAE} = \\frac{1}{|\\hat{R}|} \\sum_{\\hat{r}_{ui} \in 62 | \\hat{R}}|r_{ui} - \\hat{r}_{ui}| 63 | 64 | Args: 65 | predictions (:obj:`list` of :obj:`Prediction\ 66 | `): 67 | A list of predictions, as returned by the :meth:`test() 68 | ` method. 69 | verbose: If True, will print computed value. Default is ``True``. 70 | 71 | 72 | Returns: 73 | The Mean Absolute Error of predictions. 74 | 75 | Raises: 76 | ValueError: When ``predictions`` is empty. 77 | """ 78 | 79 | if not predictions: 80 | raise ValueError('Prediction list is empty.') 81 | 82 | mae_ = np.mean([float(abs(true_r - est)) 83 | for (_, _, true_r, est, _) in predictions]) 84 | 85 | if verbose: 86 | print('MAE: {0:1.4f}'.format(mae_)) 87 | 88 | return mae_ 89 | 90 | 91 | def fcp(predictions, verbose=True): 92 | """Compute FCP (Fraction of Concordant Pairs). 93 | 94 | Computed as described in paper `Collaborative Filtering on Ordinal User 95 | Feedback `_ by Koren 96 | and Sill, section 5.2. 97 | 98 | Args: 99 | predictions (:obj:`list` of :obj:`Prediction\ 100 | `): 101 | A list of predictions, as returned by the :meth:`test() 102 | ` method. 103 | verbose: If True, will print computed value. Default is ``True``. 104 | 105 | 106 | Returns: 107 | The Fraction of Concordant Pairs. 108 | 109 | Raises: 110 | ValueError: When ``predictions`` is empty. 111 | """ 112 | 113 | if not predictions: 114 | raise ValueError('Prediction list is empty.') 115 | 116 | predictions_u = defaultdict(list) 117 | nc_u = defaultdict(int) 118 | nd_u = defaultdict(int) 119 | 120 | for u0, _, r0, est, _ in predictions: 121 | predictions_u[u0].append((r0, est)) 122 | 123 | for u0, preds in iteritems(predictions_u): 124 | for r0i, esti in preds: 125 | for r0j, estj in preds: 126 | if esti > estj and r0i > r0j: 127 | nc_u[u0] += 1 128 | if esti >= estj and r0i < r0j: 129 | nd_u[u0] += 1 130 | 131 | nc = np.mean(list(nc_u.values())) if nc_u else 0 132 | nd = np.mean(list(nd_u.values())) if nd_u else 0 133 | 134 | try: 135 | fcp = nc / (nc + nd) 136 | except ZeroDivisionError: 137 | raise ValueError('cannot compute fcp on this list of prediction. 
-------------------------------------------------------------------------------- /tests/test_bsl_options.py: --------------------------------------------------------------------------------
1 | """Ensure that options for baseline estimates are taken into account."""
2 | 
3 | from __future__ import (absolute_import, division, print_function,
4 |                         unicode_literals)
5 | import os
6 | 
7 | import pytest
8 | 
9 | from surprise import BaselineOnly
10 | from surprise import Dataset
11 | from surprise import Reader
12 | from surprise import evaluate
13 | 
14 | 
15 | # the test and train files are from the ml-100k dataset (10% of u1.base and
16 | # 10 % of u1.test)
17 | train_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_train')
18 | test_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_test')
19 | data = Dataset.load_from_folds([(train_file, test_file)], Reader('ml-100k'))
20 | 
21 | 
22 | def test_method_field():
23 |     """Ensure the method field is taken into account."""
24 | 
25 |     bsl_options = {'method': 'als'}
26 |     algo = BaselineOnly(bsl_options=bsl_options)
27 |     rmse_als = evaluate(algo, data, measures=['rmse'])['rmse']
28 | 
29 |     bsl_options = {'method': 'sgd'}
30 |     algo = BaselineOnly(bsl_options=bsl_options)
31 |     rmse_sgd = evaluate(algo, data, measures=['rmse'])['rmse']
32 | 
33 |     assert rmse_als != rmse_sgd
34 | 
35 |     with pytest.raises(ValueError):
36 |         bsl_options = {'method': 'wrong_name'}
37 |         algo = BaselineOnly(bsl_options=bsl_options)
38 |         evaluate(algo, data)
39 | 
40 | 
41 | def test_als_n_epochs_field():
42 |     """Ensure the n_epochs field is taken into account."""
43 | 
44 |     bsl_options = {'method': 'als',
45 |                    'n_epochs': 1,
46 |                    }
47 |     algo = BaselineOnly(bsl_options=bsl_options)
48 |     rmse_als_n_epochs_1 = evaluate(algo, data, measures=['rmse'])['rmse']
49 | 
50 |     bsl_options = {'method': 'als',
51 |                    'n_epochs': 5,
52 |                    }
53 |     algo = BaselineOnly(bsl_options=bsl_options)
54 |     rmse_als_n_epochs_5 = evaluate(algo, data, measures=['rmse'])['rmse']
55 | 
56 |     assert rmse_als_n_epochs_1 != rmse_als_n_epochs_5
57 | 
58 | 
59 | def test_als_reg_u_field():
60 |     """Ensure the reg_u field is taken into account."""
61 | 
62 |     bsl_options = {'method': 'als',
63 |                    'reg_u': 0,
64 |                    }
65 |     algo = BaselineOnly(bsl_options=bsl_options)
66 |     rmse_als_regu_0 = evaluate(algo, data, measures=['rmse'])['rmse']
67 | 
68 |     bsl_options = {'method': 'als',
69 |                    'reg_u': 10,
70 |                    }
71 |     algo = BaselineOnly(bsl_options=bsl_options)
72 |     rmse_als_regu_10 = evaluate(algo, data, measures=['rmse'])['rmse']
73 | 
74 |     assert rmse_als_regu_0 != rmse_als_regu_10
75 | 
76 | 
77 | def test_als_reg_i_field():
78 |     """Ensure the reg_i field is taken into account."""
79 | 
80 |     bsl_options = {'method': 'als',
81 |                    'reg_i': 0,
82 |                    }
83 |     algo = BaselineOnly(bsl_options=bsl_options)
84 |     rmse_als_regi_0 = evaluate(algo, data, measures=['rmse'])['rmse']
85 | 
86 |     bsl_options = {'method': 'als',
87 |                    'reg_i': 10,
88 |                    }
89 |     algo = BaselineOnly(bsl_options=bsl_options)
90 |     rmse_als_regi_10 = evaluate(algo, data, measures=['rmse'])['rmse']
91 | 
92 |     assert rmse_als_regi_0 != rmse_als_regi_10
93 | 
94 | 
95 | def test_sgd_n_epoch_field():
96 |     """Ensure the n_epochs field is taken into account."""
97 | 
98 |     bsl_options = {'method': 'sgd',
99 |                    'n_epochs': 1,
100 |                    }
101 |     algo = BaselineOnly(bsl_options=bsl_options)
102 |     rmse_sgd_n_epoch_1 = evaluate(algo, data, measures=['rmse'])['rmse']
103 | 
104 |     bsl_options = {'method': 'sgd',
105 |                    'n_epochs': 20,
106 |                    }
107 |     algo = BaselineOnly(bsl_options=bsl_options)
108 |     rmse_sgd_n_epoch_20 = evaluate(algo, data, measures=['rmse'])['rmse']
109 | 
110 |     assert rmse_sgd_n_epoch_1 != rmse_sgd_n_epoch_20
111 | 
112 | 
113 | def test_sgd_learning_rate_field():
114 |     """Ensure the learning_rate field is taken into account."""
115 | 
116 |     bsl_options = {'method': 'sgd',
117 |                    'n_epochs': 1,
118 |                    'learning_rate': .005,
119 |                    }
120 |     algo = BaselineOnly(bsl_options=bsl_options)
121 |     rmse_sgd_lr_005 = evaluate(algo, data, measures=['rmse'])['rmse']
122 | 
123 |     bsl_options = {'method': 'sgd',
124 |                    'n_epochs': 1,
125 |                    'learning_rate': .00005,
126 |                    }
127 |     algo = BaselineOnly(bsl_options=bsl_options)
128 |     rmse_sgd_lr_00005 = evaluate(algo, data, measures=['rmse'])['rmse']
129 | 
130 |     assert rmse_sgd_lr_005 != rmse_sgd_lr_00005
131 | 
132 | 
133 | def test_sgd_reg_field():
134 |     """Ensure the reg field is taken into account."""
135 | 
136 |     bsl_options = {'method': 'sgd',
137 |                    'n_epochs': 1,
138 |                    'reg': 0.02,
139 |                    }
140 |     algo = BaselineOnly(bsl_options=bsl_options)
141 |     rmse_sgd_reg_002 = evaluate(algo, data, measures=['rmse'])['rmse']
142 | 
143 |     bsl_options = {'method': 'sgd',
144 |                    'n_epochs': 1,
145 |                    'reg': 1,
146 |                    }
147 |     algo = BaselineOnly(bsl_options=bsl_options)
148 |     rmse_sgd_reg_1 = evaluate(algo, data, measures=['rmse'])['rmse']
149 | 
150 |     assert rmse_sgd_reg_002 != rmse_sgd_reg_1
151 | 
-------------------------------------------------------------------------------- /doc/source/building_custom_algo.rst: --------------------------------------------------------------------------------
1 | .. _building_custom_algo:
2 | 
3 | How to build your own prediction algorithm
4 | ==========================================
5 | 
6 | This page describes how to build a custom prediction algorithm using Surprise.
7 | 
8 | The basics
9 | ~~~~~~~~~~
10 | 
11 | Want to get your hands dirty? Cool.
12 | 
13 | Creating your own prediction algorithm is pretty simple: an algorithm is
14 | nothing but a class derived from :class:`AlgoBase
15 | ` that has an ``estimate``
16 | method. This is the method that is called by the :meth:`predict()
17 | ` method. It takes
18 | in an **inner** user id, an **inner** item id (see :ref:`this note
19 | `), and returns the estimated rating :math:`\hat{r}_{ui}`:
20 | 
21 | .. literalinclude:: ../../examples/building_custom_algorithms/most_basic_algorithm.py
22 |     :caption: From file ``examples/building_custom_algorithms/most_basic_algorithm.py``
23 |     :name: most_basic_algorithm.py
24 |     :lines: 9-
25 | 
26 | This algorithm is the dumbest we could have thought of: it just predicts a
27 | rating of 3, regardless of users and items.
28 | 
29 | If you want to store additional information about the prediction, you can also
30 | return a dictionary with given details: ::
31 | 
32 |     def estimate(self, u, i):
33 | 
34 |         details = {'info1' : 'That was',
35 |                    'info2' : 'easy stuff :)'}
36 |         return 3, details
37 | 
38 | This dictionary will be stored in the :class:`prediction
39 | ` as the ``details``
40 | field and can be used for :ref:`later analysis `.
41 | 
42 | 
43 | 
44 | The ``train`` method
45 | ~~~~~~~~~~~~~~~~~~~~
46 | 
47 | Now, let's make a slightly cleverer algorithm that predicts the average of all
48 | the ratings of the trainset. As this is a constant value that does not depend
49 | on current user or item, we would rather compute it once and for all. This can
50 | be done by defining the ``train`` method:
51 | 
52 | .. literalinclude:: ../../examples/building_custom_algorithms/most_basic_algorithm2.py
53 |     :caption: From file ``examples/building_custom_algorithms/most_basic_algorithm2.py``
54 |     :name: most_basic_algorithm2.py
55 |     :lines: 15-35
56 | 
57 | 
58 | The ``train`` method is called by the :func:`evaluate
59 | ` function at each fold of a cross-validation
60 | process (but you can also :ref:`call it yourself `).
61 | Before doing anything, you should call the base class :meth:`train()
62 | ` method.
63 | 
64 | The ``trainset`` attribute
65 | ~~~~~~~~~~~~~~~~~~~~~~~~~~
66 | 
67 | Once the base class :meth:`train()
68 | ` method has returned,
69 | all the info you need about the current training set (rating values, etc...) is
70 | stored in the ``self.trainset`` attribute. This is a :class:`Trainset
71 | ` object that has many attributes and methods of
72 | interest for prediction.
73 | 
74 | To illustrate its usage, let's make an algorithm that predicts an average
75 | between the mean of all ratings, the mean rating of the user and the mean
76 | rating for the item:
77 | 
78 | .. literalinclude:: ../../examples/building_custom_algorithms/mean_rating_user_item.py
79 |     :caption: From file ``examples/building_custom_algorithms/mean_rating_user_item.py``
80 |     :name: mean_rating_user_item.py
81 |     :lines: 22-35
82 | 
83 | Note that it would have been a better idea to compute all the user means in the
84 | ``train`` method, thus avoiding the same computations multiple times.
85 | 
86 | 
87 | When the prediction is impossible
88 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
89 | 
90 | It's up to your algorithm to decide if it can or cannot yield a prediction. If
91 | the prediction is impossible, then you can raise the
92 | :class:`PredictionImpossible
93 | ` exception
94 | (you'll need to import it first): ::
95 | 
96 |     from surprise import PredictionImpossible
97 | 
98 | 
99 | This exception will be caught by the :meth:`predict()
100 | ` method, and the
101 | estimation :math:`\hat{r}_{ui}` will be set to the global mean of all ratings
102 | :math:`\mu`.
103 | 
104 | Using similarities and baselines
105 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
106 | 
107 | Should your algorithm use a similarity measure or baseline estimates, you'll
108 | need to accept ``bsl_options`` and ``sim_options`` as parameters to the
109 | ``__init__`` method, and pass them along to the Base class. See how to use
110 | these parameters in the :ref:`prediction_algorithms` section.
111 | 
112 | Methods :meth:`compute_baselines()
113 | ` and
114 | :meth:`compute_similarities()
115 | ` can
116 | be called in the ``train`` method (or anywhere else).
117 | 
118 | .. literalinclude:: ../../examples/building_custom_algorithms/with_baselines_or_sim.py
119 |     :caption: From file ``examples/building_custom_algorithms/with_baselines_or_sim.py``
120 |     :name: with_baselines_or_sim.py
121 |     :lines: 15-47
122 | 
123 | 
124 | Feel free to explore the prediction_algorithms package `source
125 | `_
126 | to get an idea of what can be done.
127 | 
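Since not every literalincluded example file is reproduced in this dump, here is a hedged sketch of the kind of algorithm the page describes (a global-mean predictor following the AlgoBase contract shown above; it is not the exact content of the example files):

    import numpy as np

    from surprise import AlgoBase
    from surprise import Dataset
    from surprise import evaluate


    class GlobalMeanAlgo(AlgoBase):

        def __init__(self):
            AlgoBase.__init__(self)

        def train(self, trainset):
            # Always call the base class train() method first.
            AlgoBase.train(self, trainset)
            # Precompute the global mean once, as recommended above.
            self.the_mean = np.mean([r for (_, _, r)
                                     in self.trainset.all_ratings()])

        def estimate(self, u, i):
            return self.the_mean


    data = Dataset.load_builtin('ml-100k')
    evaluate(GlobalMeanAlgo(), data)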
3 | """ 4 | 5 | from __future__ import (absolute_import, division, print_function, 6 | unicode_literals) 7 | import os 8 | 9 | import pytest 10 | 11 | from surprise import BaselineOnly 12 | from surprise import Dataset 13 | from surprise import Reader 14 | 15 | 16 | reader = Reader(line_format='user item rating', sep=' ', skip_lines=3, 17 | rating_scale=(1, 5)) 18 | 19 | 20 | def test_wrong_file_name(): 21 | """Ensure file names are checked when creating a (custom) Dataset.""" 22 | wrong_files = [('does_not_exist', 'does_not_either')] 23 | 24 | with pytest.raises(ValueError): 25 | Dataset.load_from_folds(folds_files=wrong_files, reader=reader) 26 | 27 | 28 | def test_build_full_trainset(): 29 | """Test the build_full_trainset method.""" 30 | 31 | custom_dataset_path = (os.path.dirname(os.path.realpath(__file__)) + 32 | '/custom_dataset') 33 | data = Dataset.load_from_file(file_path=custom_dataset_path, reader=reader) 34 | 35 | trainset = data.build_full_trainset() 36 | 37 | assert len(trainset.ur) == 5 38 | assert len(trainset.ir) == 2 39 | assert trainset.n_users == 5 40 | assert trainset.n_items == 2 41 | 42 | 43 | def test_split(): 44 | """Test the split method.""" 45 | 46 | custom_dataset_path = (os.path.dirname(os.path.realpath(__file__)) + 47 | '/custom_dataset') 48 | data = Dataset.load_from_file(file_path=custom_dataset_path, reader=reader) 49 | 50 | # Test n_folds parameter 51 | data.split(5) 52 | assert len(list(data.folds())) == 5 53 | 54 | with pytest.raises(ValueError): 55 | data.split(10) 56 | for fold in data.folds(): 57 | pass 58 | 59 | with pytest.raises(ValueError): 60 | data.split(1) 61 | for fold in data.folds(): 62 | pass 63 | 64 | # Test the shuffle parameter 65 | data.split(n_folds=3, shuffle=False) 66 | testsets_a = [testset for (_, testset) in data.folds()] 67 | data.split(n_folds=3, shuffle=False) 68 | testsets_b = [testset for (_, testset) in data.folds()] 69 | assert testsets_a == testsets_b 70 | 71 | # We'll shuffle and check that folds are now different. There's a chance 72 | # that they're still the same, just by lack of luck. If after 10000 tries 73 | # the're still the same, there's a high probability that our code is 74 | # faulty. If we're very (very very very) unlucky, it may fail though (or 75 | # loop for eternity). 
76 | i = 0
77 | while testsets_a == testsets_b:
78 | data.split(n_folds=3, shuffle=True)
79 | testsets_b = [testset for (_, testset) in data.folds()]
80 | i += 1
81 | assert i < 10000
82 |
83 | # Ensure that folds are the same if split is not called again
84 | testsets_a = [testset for (_, testset) in data.folds()]
85 | testsets_b = [testset for (_, testset) in data.folds()]
86 | assert testsets_a == testsets_b
87 |
88 |
89 | def test_trainset_testset():
90 | """Test the construct_trainset and construct_testset methods."""
91 |
92 | current_dir = os.path.dirname(os.path.realpath(__file__))
93 | folds_files = [(current_dir + '/custom_train',
94 | current_dir + '/custom_test')]
95 |
96 | data = Dataset.load_from_folds(folds_files=folds_files, reader=reader)
97 |
98 | for trainset, testset in data.folds():
99 | pass # just need trainset and testset to be set
100 |
101 | # test ur
102 | ur = trainset.ur
103 | assert ur[0] == [(0, 4)]
104 | assert ur[1] == [(0, 4), (1, 2)]
105 | assert ur[40] == [] # not in the trainset
106 |
107 | # test ir
108 | ir = trainset.ir
109 | assert ir[0] == [(0, 4), (1, 4), (2, 1)]
110 | assert ir[1] == [(1, 2), (2, 1), (3, 5)]
111 | assert ir[20000] == [] # not in the trainset
112 |
113 | # test n_users, n_items, n_ratings, rating_scale
114 | assert trainset.n_users == 4
115 | assert trainset.n_items == 2
116 | assert trainset.n_ratings == 6
117 | assert trainset.rating_scale == (1, 5)
118 |
119 | # test raw2inner
120 | for i in range(4):
121 | assert trainset.to_inner_uid('user' + str(i)) == i
122 | with pytest.raises(ValueError):
123 | trainset.to_inner_uid('unknown_user')
124 |
125 | for i in range(2):
126 | assert trainset.to_inner_iid('item' + str(i)) == i
127 | with pytest.raises(ValueError):
128 | trainset.to_inner_iid('unknown_item')
129 |
130 | # test inner2raw
131 | assert trainset._inner2raw_id_users is None
132 | assert trainset._inner2raw_id_items is None
133 | for i in range(4):
134 | assert trainset.to_raw_uid(i) == 'user' + str(i)
135 | for i in range(2):
136 | assert trainset.to_raw_iid(i) == 'item' + str(i)
137 | assert trainset._inner2raw_id_users is not None
138 | assert trainset._inner2raw_id_items is not None
139 |
140 | # Test the build_testset() method
141 | algo = BaselineOnly()
142 | algo.train(trainset)
143 | testset = trainset.build_testset()
144 | algo.test(testset) # ensure an algorithm can manage the data
145 | assert ('user0', 'item0', 4) in testset
146 | assert ('user3', 'item1', 5) in testset
147 | assert ('user3', 'item1', 0) not in testset
148 |
149 | # Test the build_anti_testset() method
150 | algo = BaselineOnly()
151 | algo.train(trainset)
152 | testset = trainset.build_anti_testset()
153 | algo.test(testset) # ensure an algorithm can manage the data
154 | assert ('user0', 'item0', trainset.global_mean) not in testset
155 | assert ('user3', 'item1', trainset.global_mean) not in testset
156 | assert ('user0', 'item1', trainset.global_mean) in testset
157 | assert ('user3', 'item0', trainset.global_mean) in testset
-------------------------------------------------------------------------------- /doc/source/FAQ.rst: --------------------------------------------------------------------------------
1 | .. _FAQ:
2 |
3 | FAQ
4 | ===
5 |
6 | Here you will find the Frequently Asked Questions, as well as some other
7 | use-case examples that are not part of the User Guide.
8 |
9 | How to get the top-N recommendations for each user
10 | ----------------------------------------------------------
11 |
12 | Here is an example where we retrieve the top-10 items with highest
13 | rating prediction for each user in the MovieLens-100k dataset. We first train
14 | an SVD algorithm on the whole dataset, and then predict all the ratings for the
15 | pairs (user, item) that are not in the training set. We then retrieve the
16 | top-10 predictions for each user.
17 |
18 | .. literalinclude:: ../../examples/top_n_recommendations.py
19 | :caption: From file ``examples/top_n_recommendations.py``
20 | :name: top_n_recommendations.py
21 | :lines: 10-
22 |
23 | .. _get_k_nearest_neighbors:
24 |
25 | How to get the k nearest neighbors of a user (or item)
26 | --------------------------------------------------------------
27 |
28 | You can use the :meth:`get_neighbors()
29 | <surprise.prediction_algorithms.algo_base.AlgoBase.get_neighbors>` method of
30 | the algorithm object. This is only relevant for algorithms that use a
31 | similarity measure, such as the :ref:`k-NN algorithms
32 | <pred_package_knn_inspired>`.
33 |
34 | Here is an example where we retrieve the 10 nearest neighbors of the movie Toy
35 | Story from the MovieLens-100k dataset. The output is:
36 |
37 | .. parsed-literal::
38 |
39 | The 10 nearest neighbors of Toy Story are:
40 | Beauty and the Beast (1991)
41 | Raiders of the Lost Ark (1981)
42 | That Thing You Do! (1996)
43 | Lion King, The (1994)
44 | Craft, The (1996)
45 | Liar Liar (1997)
46 | Aladdin (1992)
47 | Cool Hand Luke (1967)
48 | Winnie the Pooh and the Blustery Day (1968)
49 | Indiana Jones and the Last Crusade (1989)
50 |
51 | There's a lot of boilerplate because of the conversions between movie names and
52 | their raw/inner ids (see :ref:`this note <raw_inner_note>`), but it all boils
53 | down to the use of :meth:`get_neighbors()
54 | <surprise.prediction_algorithms.algo_base.AlgoBase.get_neighbors>`:
55 |
56 | .. literalinclude:: ../../examples/k_nearest_neighbors.py
57 | :caption: From file ``examples/k_nearest_neighbors.py``
58 | :name: k_nearest_neighbors.py
59 | :lines: 10-
60 |
61 | Naturally, the same can be done for users with minor modifications.
62 |
63 | .. _serialize_an_algorithm:
64 |
65 | How to serialize an algorithm
66 | -----------------------------
67 |
68 | Prediction algorithms can be serialized and loaded back using the :func:`dump()
69 | <surprise.dump.dump>` and :func:`load() <surprise.dump.load>` functions. Here
70 | is a small example where the SVD algorithm is trained on a dataset and
71 | serialized. It is then reloaded and can be used again for making predictions:
72 |
73 | .. literalinclude:: ../../examples/serialize_algorithm.py
74 | :caption: From file ``examples/serialize_algorithm.py``
75 | :name: serialize_algorithm.py
76 | :lines: 9-
77 |
78 | .. _further_analysis:
79 |
80 | Algorithms can be serialized along with their predictions, so that they can be
81 | further analyzed or compared with other algorithms, using pandas dataframes.
82 | Some examples are given in the two following notebooks:
83 |
84 | * `Dumping and analysis of the KNNBasic algorithm
85 | `_.
86 | * `Comparison of two algorithms
87 | `_.
88 |
89 | How to build my own prediction algorithm
90 | ----------------------------------------
91 |
92 | There's a whole guide :ref:`here <building_custom_algo>`.
93 |
94 | What are raw and inner ids
95 | --------------------------
96 |
97 | See :ref:`this note <raw_inner_note>`.
98 |
99 | Can I use my own dataset with Surprise
100 | --------------------------------------
101 |
102 | Yes, you can. See the :ref:`user guide <load_custom>`.
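For quick reference, here is a minimal sketch. The file path and the semicolon-separated line format below are hypothetical placeholders; adapt them to your own file: ::

    from surprise import Dataset
    from surprise import Reader

    # Each line of the (hypothetical) ratings file looks like: user;item;rating
    reader = Reader(line_format='user item rating', sep=';')
    data = Dataset.load_from_file('./my_ratings.csv', reader=reader)
    data.split(n_folds=5)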
103 |
104 | How to tune an algorithm's parameters
105 | -------------------------------------
106 |
107 | You can tune the parameters of an algorithm with the :class:`GridSearch
108 | <surprise.evaluate.GridSearch>` class as described :ref:`here
109 | <tuning_algorithm_parameters>`. After the tuning, you may want to have an
110 | :ref:`unbiased estimate of your algorithm's performance
111 | <unbiased_estimate_after_tuning>`.
112 |
113 | How to get accuracy measures on the training set
114 | ------------------------------------------------
115 |
116 | You can use the :meth:`build_testset()
117 | <surprise.dataset.Trainset.build_testset>` method of the :class:`Trainset
118 | <surprise.dataset.Trainset>` object to build a testset that can then be used
119 | with the :meth:`test()
120 | <surprise.prediction_algorithms.algo_base.AlgoBase.test>` method:
121 |
122 | .. literalinclude:: ../../examples/evaluate_on_trainset.py
123 | :caption: From file ``examples/evaluate_on_trainset.py``
124 | :name: evaluate_on_trainset.py
125 | :lines: 9-24
126 |
127 | Check out the example file for more usage examples.
128 |
129 | .. _unbiased_estimate_after_tuning:
130 |
131 | How to save some data for unbiased accuracy estimation
132 | ------------------------------------------------------
133 |
134 | If your goal is to tune the parameters of an algorithm, you may want to set
135 | aside a bit of data to get an unbiased estimate of its performance. For
136 | instance, you may want to split your data into two sets A and B. A is used for
137 | parameter tuning using grid search, and B is used for unbiased estimation. This
138 | can be done as follows:
139 |
140 | .. literalinclude:: ../../examples/split_data_for_unbiased_estimation.py
141 | :caption: From file ``examples/split_data_for_unbiased_estimation.py``
142 | :name: split_data_for_unbiased_estimation.py
143 | :lines: 10-
144 |
-------------------------------------------------------------------------------- /tests/test_SVD.py: --------------------------------------------------------------------------------
1 | """
2 | Module for testing the SVD and SVD++ algorithms.
3 | """
4 |
5 | from __future__ import (absolute_import, division, print_function,
6 | unicode_literals)
7 | import os
8 |
9 | from surprise import SVD
10 | from surprise import SVDpp
11 | from surprise import Dataset
12 | from surprise import Reader
13 | from surprise import evaluate
14 |
15 |
16 | # the test and train files are from the ml-100k dataset (10% of u1.base and
17 | # 10% of u1.test)
18 | train_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_train')
19 | test_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_test')
20 | data = Dataset.load_from_folds([(train_file, test_file)], Reader('ml-100k'))
21 |
22 |
23 | def test_SVD_parameters():
24 | """Ensure that all parameters are taken into account."""
25 |
26 | # The baseline against which to compare.
27 | algo = SVD(n_factors=1, n_epochs=1) 28 | rmse_default = evaluate(algo, data, measures=['rmse'])['rmse'] 29 | 30 | # n_factors 31 | algo = SVD(n_factors=2, n_epochs=1) 32 | rmse_factors = evaluate(algo, data, measures=['rmse'])['rmse'] 33 | assert rmse_default != rmse_factors 34 | 35 | # n_epochs 36 | algo = SVD(n_factors=1, n_epochs=2) 37 | rmse_n_epochs = evaluate(algo, data, measures=['rmse'])['rmse'] 38 | assert rmse_default != rmse_n_epochs 39 | 40 | # biased 41 | algo = SVD(n_factors=1, n_epochs=1, biased=False) 42 | rmse_biased = evaluate(algo, data, measures=['rmse'])['rmse'] 43 | assert rmse_default != rmse_biased 44 | 45 | # lr_all 46 | algo = SVD(n_factors=1, n_epochs=1, lr_all=5) 47 | rmse_lr_all = evaluate(algo, data, measures=['rmse'])['rmse'] 48 | assert rmse_default != rmse_lr_all 49 | 50 | # reg_all 51 | algo = SVD(n_factors=1, n_epochs=1, reg_all=5) 52 | rmse_reg_all = evaluate(algo, data, measures=['rmse'])['rmse'] 53 | assert rmse_default != rmse_reg_all 54 | 55 | # lr_bu 56 | algo = SVD(n_factors=1, n_epochs=1, lr_bu=5) 57 | rmse_lr_bu = evaluate(algo, data, measures=['rmse'])['rmse'] 58 | assert rmse_default != rmse_lr_bu 59 | 60 | # lr_bi 61 | algo = SVD(n_factors=1, n_epochs=1, lr_bi=5) 62 | rmse_lr_bi = evaluate(algo, data, measures=['rmse'])['rmse'] 63 | assert rmse_default != rmse_lr_bi 64 | 65 | # lr_pu 66 | algo = SVD(n_factors=1, n_epochs=1, lr_pu=5) 67 | rmse_lr_pu = evaluate(algo, data, measures=['rmse'])['rmse'] 68 | assert rmse_default != rmse_lr_pu 69 | 70 | # lr_qi 71 | algo = SVD(n_factors=1, n_epochs=1, lr_qi=5) 72 | rmse_lr_qi = evaluate(algo, data, measures=['rmse'])['rmse'] 73 | assert rmse_default != rmse_lr_qi 74 | 75 | # reg_bu 76 | algo = SVD(n_factors=1, n_epochs=1, reg_bu=5) 77 | rmse_reg_bu = evaluate(algo, data, measures=['rmse'])['rmse'] 78 | assert rmse_default != rmse_reg_bu 79 | 80 | # reg_bi 81 | algo = SVD(n_factors=1, n_epochs=1, reg_bi=5) 82 | rmse_reg_bi = evaluate(algo, data, measures=['rmse'])['rmse'] 83 | assert rmse_default != rmse_reg_bi 84 | 85 | # reg_pu 86 | algo = SVD(n_factors=1, n_epochs=1, reg_pu=5) 87 | rmse_reg_pu = evaluate(algo, data, measures=['rmse'])['rmse'] 88 | assert rmse_default != rmse_reg_pu 89 | 90 | # reg_qi 91 | algo = SVD(n_factors=1, n_epochs=1, reg_qi=5) 92 | rmse_reg_qi = evaluate(algo, data, measures=['rmse'])['rmse'] 93 | assert rmse_default != rmse_reg_qi 94 | 95 | 96 | def test_SVDpp_parameters(): 97 | """Ensure that all parameters are taken into account.""" 98 | 99 | # The baseline against which to compare. 100 | algo = SVDpp(n_factors=1, n_epochs=1) 101 | rmse_default = evaluate(algo, data, measures=['rmse'])['rmse'] 102 | 103 | # n_factors 104 | algo = SVDpp(n_factors=2, n_epochs=1) 105 | rmse_factors = evaluate(algo, data, measures=['rmse'])['rmse'] 106 | assert rmse_default != rmse_factors 107 | 108 | # The rest is OK but just takes too long for now... 
109 | """ 110 | 111 | # n_epochs 112 | algo = SVDpp(n_factors=1, n_epochs=2) 113 | rmse_n_epochs = evaluate(algo, data, measures=['rmse'])['rmse'] 114 | assert rmse_default != rmse_n_epochs 115 | 116 | # lr_all 117 | algo = SVDpp(n_factors=1, n_epochs=1, lr_all=5) 118 | rmse_lr_all = evaluate(algo, data, measures=['rmse'])['rmse'] 119 | assert rmse_default != rmse_lr_all 120 | 121 | # reg_all 122 | algo = SVDpp(n_factors=1, n_epochs=1, reg_all=5) 123 | rmse_reg_all = evaluate(algo, data, measures=['rmse'])['rmse'] 124 | assert rmse_default != rmse_reg_all 125 | 126 | # lr_bu 127 | algo = SVDpp(n_factors=1, n_epochs=1, lr_bu=5) 128 | rmse_lr_bu = evaluate(algo, data, measures=['rmse'])['rmse'] 129 | assert rmse_default != rmse_lr_bu 130 | 131 | # lr_bi 132 | algo = SVDpp(n_factors=1, n_epochs=1, lr_bi=5) 133 | rmse_lr_bi = evaluate(algo, data, measures=['rmse'])['rmse'] 134 | assert rmse_default != rmse_lr_bi 135 | 136 | # lr_pu 137 | algo = SVDpp(n_factors=1, n_epochs=1, lr_pu=5) 138 | rmse_lr_pu = evaluate(algo, data, measures=['rmse'])['rmse'] 139 | assert rmse_default != rmse_lr_pu 140 | 141 | # lr_qi 142 | algo = SVDpp(n_factors=1, n_epochs=1, lr_qi=5) 143 | rmse_lr_qi = evaluate(algo, data, measures=['rmse'])['rmse'] 144 | assert rmse_default != rmse_lr_qi 145 | 146 | # lr_yj 147 | algo = SVDpp(n_factors=1, n_epochs=1, lr_yj=5) 148 | rmse_lr_yj = evaluate(algo, data, measures=['rmse'])['rmse'] 149 | assert rmse_default != rmse_lr_yj 150 | 151 | # reg_bu 152 | algo = SVDpp(n_factors=1, n_epochs=1, reg_bu=5) 153 | rmse_reg_bu = evaluate(algo, data, measures=['rmse'])['rmse'] 154 | assert rmse_default != rmse_reg_bu 155 | 156 | # reg_bi 157 | algo = SVDpp(n_factors=1, n_epochs=1, reg_bi=5) 158 | rmse_reg_bi = evaluate(algo, data, measures=['rmse'])['rmse'] 159 | assert rmse_default != rmse_reg_bi 160 | 161 | # reg_pu 162 | algo = SVDpp(n_factors=1, n_epochs=1, reg_pu=5) 163 | rmse_reg_pu = evaluate(algo, data, measures=['rmse'])['rmse'] 164 | assert rmse_default != rmse_reg_pu 165 | 166 | # reg_qi 167 | algo = SVDpp(n_factors=1, n_epochs=1, reg_qi=5) 168 | rmse_reg_qi = evaluate(algo, data, measures=['rmse'])['rmse'] 169 | assert rmse_default != rmse_reg_qi 170 | 171 | # reg_yj 172 | algo = SVDpp(n_factors=1, n_epochs=1, reg_yj=5) 173 | rmse_reg_yj = evaluate(algo, data, measures=['rmse'])['rmse'] 174 | assert rmse_default != rmse_reg_yj 175 | """ 176 | -------------------------------------------------------------------------------- /doc/source/prediction_algorithms.rst: -------------------------------------------------------------------------------- 1 | .. _prediction_algorithms: 2 | 3 | Using prediction algorithms 4 | =========================== 5 | 6 | Surprise provides with a bunch of built-in algorithms. All algorithms derive 7 | from the :class:`AlgoBase ` 8 | base class, where are implemented some key methods (e.g. :meth:`predict 9 | `, :meth:`train 10 | ` and :meth:`test 11 | `). You can find the 12 | details of each of these in the :mod:`prediction_algorithms 13 | ` package documentation. 14 | 15 | Every algorithm is part of the global Surprise namespace, so you only need to 16 | import their names from the Surprise package, for example: :: 17 | 18 | from surprise import KNNBasic 19 | algo = KNNBasic() 20 | 21 | 22 | Some of these algorithms may use :ref:`baseline estimates 23 | `, some may use a :ref:`similarity measure 24 | `. We will here review how to configure the 25 | way baselines and similarities are computed. 26 | 27 | 28 | .. 
_baseline_estimates_configuration:
29 |
30 | Baseline estimates configuration
31 | ---------------------------------
32 |
33 |
34 | .. note::
35 | This section only applies to algorithms (or similarity measures) that try to
36 | minimize the following regularized squared error (or equivalent):
37 |
38 | .. math::
39 | \sum_{r_{ui} \in R_{train}} \left(r_{ui} - (\mu + b_u + b_i)\right)^2 +
40 | \lambda \left(b_u^2 + b_i^2 \right).
41 |
42 | For algorithms using baselines in another objective function (e.g. the
43 | :class:`SVD <surprise.prediction_algorithms.matrix_factorization.SVD>`
44 | algorithm), the baseline configuration is done differently and is specific to
45 | each algorithm. Please refer to their own documentation.
46 |
47 | First of all, if you do not want to configure the way baselines are computed,
48 | you don't have to: the default parameters will do just fine. If you do want
49 | to, well... this section is for you.
50 |
51 | You may want to read section 2.1 of :cite:`Koren:2010` to get a good idea of
52 | what baseline estimates are.
53 |
54 | Baselines can be estimated in two different ways:
55 |
56 | * Using Stochastic Gradient Descent (SGD).
57 | * Using Alternating Least Squares (ALS).
58 |
59 | You can configure the way baselines are computed using the ``bsl_options``
60 | parameter passed at the creation of an algorithm. This parameter is a
61 | dictionary for which the key ``'method'`` indicates the method to use. Accepted
62 | values are ``'als'`` (default) and ``'sgd'``. Depending on its value, other
63 | options may be set. For ALS:
64 |
65 | - ``'reg_i'``: The regularization parameter for items, corresponding to
66 | :math:`\lambda_2` in :cite:`Koren:2010`. Default is ``10``.
67 | - ``'reg_u'``: The regularization parameter for users, corresponding to
68 | :math:`\lambda_3` in :cite:`Koren:2010`. Default is ``15``.
69 | - ``'n_epochs'``: The number of iterations of the ALS procedure. Default is
70 | ``10``. Note that in :cite:`Koren:2010`, what is described is a **single**
71 | iteration ALS process.
72 |
73 | And for SGD:
74 |
75 | - ``'reg'``: The regularization parameter of the cost function that is
76 | optimized, corresponding to :math:`\lambda_1` and then :math:`\lambda_5` in
77 | :cite:`Koren:2010`. Default is ``0.02``.
78 | - ``'learning_rate'``: The learning rate of SGD, corresponding to
79 | :math:`\gamma` in :cite:`Koren:2010`. Default is ``0.005``.
80 | - ``'n_epochs'``: The number of iterations of the SGD procedure. Default is ``20``.
81 |
82 | .. note::
83 | For both procedures (ALS and SGD), user and item biases (:math:`b_u` and
84 | :math:`b_i`) are initialized to zero.
85 |
86 | Usage examples:
87 |
88 | .. literalinclude:: ../../examples/baselines_conf.py
89 | :caption: From file ``examples/baselines_conf.py``
90 | :name: baselines_als
91 | :lines: 19-25
92 |
93 | .. literalinclude:: ../../examples/baselines_conf.py
94 | :caption: From file ``examples/baselines_conf.py``
95 | :name: baselines_sgd
96 | :lines: 30-34
97 |
98 | Note that some similarity measures may use baselines, such as the
99 | :func:`pearson_baseline <surprise.similarities.pearson_baseline>` similarity.
100 | Configuration works just the same, whether the baselines are used in the actual
101 | prediction :math:`\hat{r}_{ui}` or not:
102 |
103 | .. literalinclude:: ../../examples/baselines_conf.py
104 | :caption: From file ``examples/baselines_conf.py``
105 | :name: baselines_als_pearson_sim
106 | :lines: 40-44
107 |
108 |
109 | This leads us to similarity measure configuration, which we will review right
110 | now.
111 |
112 | .. 
_similarity_measures_configuration:
113 |
114 | Similarity measure configuration
115 | --------------------------------
116 |
117 | Many algorithms use a similarity measure to estimate a rating. They can be
118 | configured in much the same way as baselines: you just need to pass a
119 | ``sim_options`` argument at the creation of an algorithm. This
120 | argument is a dictionary with the following (all optional) keys:
121 |
122 | - ``'name'``: The name of the similarity to use, as defined in the
123 | :mod:`similarities <surprise.similarities>` module. Default is ``'MSD'``.
124 | - ``'user_based'``: Whether similarities will be computed between users or
125 | between items. This has a **huge** impact on the performance of a prediction
126 | algorithm. Default is ``True``.
127 | - ``'min_support'``: The minimum number of common items (when ``'user_based'``
128 | is ``True``) or minimum number of common users (when ``'user_based'`` is
129 | ``False``) for the similarity not to be zero. Simply put, if
130 | :math:`|I_{uv}| < \text{min_support}` then :math:`\text{sim}(u, v) = 0`. The
131 | same goes for items.
132 | - ``'shrinkage'``: Shrinkage parameter to apply (only relevant for the
133 | :func:`pearson_baseline <surprise.similarities.pearson_baseline>` similarity).
134 | Default is ``100``.
135 |
136 | Usage examples:
137 |
138 | .. literalinclude:: ../../examples/similarity_conf.py
139 | :caption: From file ``examples/similarity_conf.py``
140 | :name: sim_conf_cos
141 | :lines: 18-21
142 |
143 | .. literalinclude:: ../../examples/similarity_conf.py
144 | :caption: From file ``examples/similarity_conf.py``
145 | :name: sim_conf_pearson_baseline
146 | :lines: 26-29
147 |
148 | .. seealso::
149 | The :mod:`similarities <surprise.similarities>` module.
-------------------------------------------------------------------------------- /tests/test_similarities.py: --------------------------------------------------------------------------------
1 | """
2 | Module for testing the similarity measures
3 | """
4 |
5 | from __future__ import (absolute_import, division, print_function,
6 | unicode_literals)
7 | import random
8 |
9 | import numpy as np
10 |
11 | import surprise.similarities as sims
12 |
13 |
14 | n_x = 7
15 | yr_global = {
16 | 0: [(0, 3), (1, 3), (2, 3), (5, 1), (6, 2)], # noqa
17 | 1: [(0, 4), (1, 4), (2, 4), ], # noqa
18 | 2: [ (2, 5), (3, 2), (4, 3) ], # noqa
19 | 3: [ (1, 1), (2, 4), (3, 2), (4, 3), (5, 3), (6, 4)], # noqa
20 | 4: [ (1, 5), (2, 1), (5, 2), (6, 3)], # noqa
21 | }
22 |
23 |
24 | def test_cosine_sim():
25 | """Tests for the cosine similarity."""
26 |
27 | yr = yr_global.copy()
28 |
29 | # shuffle every rating list, to ensure the order in which ratings are
30 | # processed does not matter (it's important because it used to be error
31 | # prone when we were using itertools.combinations)
32 | for _, ratings in yr.items():
33 | random.shuffle(ratings)
34 |
35 | sim = sims.cosine(n_x, yr, min_support=1)
36 |
37 | # check symmetry and bounds (as ratings are > 0, cosine sim must be >= 0)
38 | for xi in range(n_x):
39 | assert sim[xi, xi] == 1
40 | for xj in range(n_x):
41 | assert sim[xi, xj] == sim[xj, xi]
42 | assert 0 <= sim[xi, xj] <= 1
43 |
44 | # on common items, users 0, 1 and 2 have the same ratings
45 | assert sim[0, 1] == 1
46 | assert sim[0, 2] == 1
47 |
48 | # for vectors with constant ratings (even if they're different constants),
49 | # cosine sim is necessarily 1
50 | assert sim[3, 4] == 1
51 |
52 | # pairs of users (0, 3) have no common items
53 | assert sim[0, 3] == 0
54 | assert sim[0, 4] == 0
55 |
56 | # non-constant and different ratings: cosine sim must be in ]0, 1[
57 | assert 0 < sim[5, 6] < 1
58 |
59 | # ensure min_support is taken into account. Only users 1 and 2 have more
60 | # than 4 common ratings.
61 | sim = sims.cosine(n_x, yr, min_support=4)
62 | for i in range(n_x):
63 | for j in range(i + 1, n_x):
64 | if i != 1 and j != 2:
65 | assert sim[i, j] == 0
66 |
67 |
68 | def test_msd_sim():
69 | """Tests for the MSD similarity."""
70 |
71 | yr = yr_global.copy()
72 |
73 | # shuffle every rating list, to ensure the order in which ratings are
74 | # processed does not matter (it's important because it used to be error
75 | # prone when we were using itertools.combinations)
76 | for _, ratings in yr.items():
77 | random.shuffle(ratings)
78 |
79 | sim = sims.msd(n_x, yr, min_support=1)
80 |
81 | # check symmetry and bounds. MSD sim must be in [0, 1]
82 | for xi in range(n_x):
83 | assert sim[xi, xi] == 1
84 | for xj in range(n_x):
85 | assert sim[xi, xj] == sim[xj, xi]
86 | assert 0 <= sim[xi, xj] <= 1
87 |
88 | # on common items, users 0, 1 and 2 have the same ratings
89 | assert sim[0, 1] == 1
90 | assert sim[0, 2] == 1
91 |
92 | # msd(3, 4) = mean(1^2, 1^2). sim = (1 / (1 + msd)) = 1 / 2
93 | assert sim[3, 4] == .5
94 |
95 | # pairs of users (0, 3) have no common items
96 | assert sim[0, 3] == 0
97 | assert sim[0, 4] == 0
98 |
99 | # ensure min_support is taken into account. Only users 1 and 2 have more
100 | # than 4 common ratings.
101 | sim = sims.msd(n_x, yr, min_support=4)
102 | for i in range(n_x):
103 | for j in range(i + 1, n_x):
104 | if i != 1 and j != 2:
105 | assert sim[i, j] == 0
106 |
107 |
108 | def test_pearson_sim():
109 | """Tests for the pearson similarity."""
110 |
111 | yr = yr_global.copy()
112 |
113 | # shuffle every rating list, to ensure the order in which ratings are
114 | # processed does not matter (it's important because it used to be error
115 | # prone when we were using itertools.combinations)
116 | for _, ratings in yr.items():
117 | random.shuffle(ratings)
118 |
119 | sim = sims.pearson(n_x, yr, min_support=1)
120 | # check symmetry and bounds. -1 <= pearson coeff <= 1
121 | for xi in range(n_x):
122 | assert sim[xi, xi] == 1
123 | for xj in range(n_x):
124 | assert sim[xi, xj] == sim[xj, xi]
125 | assert -1 <= sim[xi, xj] <= 1
126 |
127 | # on common items, users 0, 1 and 2 have the same ratings
128 | assert sim[0, 1] == 1
129 | assert sim[0, 2] == 1
130 |
131 | # for vectors with constant ratings, pearson sim is necessarily zero (as
132 | # ratings are centered)
133 | assert sim[3, 4] == 0
134 | assert sim[2, 3] == 0
135 | assert sim[2, 4] == 0
136 |
137 | # pairs of users (0, 3) have no common items
138 | assert sim[0, 3] == 0
139 | assert sim[0, 4] == 0
140 |
141 | # almost same ratings (just with an offset of 1)
142 | assert sim[5, 6] == 1
143 |
144 | # ratings vary in the same direction
145 | assert sim[2, 5] > 0
146 |
147 | # ensure min_support is taken into account. Only users 1 and 2 have more
148 | # than 4 common ratings.
149 | sim = sims.pearson(n_x, yr, min_support=4)
150 | for i in range(n_x):
151 | for j in range(i + 1, n_x):
152 | if i != 1 and j != 2:
153 | assert sim[i, j] == 0
154 |
155 |
156 | def test_pearson_baseline_sim():
157 | """Tests for the pearson_baseline similarity."""
158 |
159 | yr = yr_global.copy()
160 |
161 | # shuffle every rating list, to ensure the order in which ratings are
162 | # processed does not matter (it's important because it used to be error
163 | # prone when we were using itertools.combinations)
164 | for _, ratings in yr.items():
165 | random.shuffle(ratings)
166 |
167 | global_mean = 3 # fake
168 | x_biases = np.random.normal(0, 1, n_x) # fake
169 | y_biases = np.random.normal(0, 1, 5) # fake (there are 5 ys)
170 | sim = sims.pearson_baseline(n_x, yr, 1, global_mean, x_biases, y_biases)
171 | # check symmetry and bounds. -1 <= pearson coeff <= 1
172 | for xi in range(n_x):
173 | assert sim[xi, xi] == 1
174 | for xj in range(n_x):
175 | assert sim[xi, xj] == sim[xj, xi]
176 | assert -1 <= sim[xi, xj] <= 1
177 |
178 | # Note: as sim now depends on baselines, which depend on both user and
179 | # item ratings, it's now impossible to test assertions such as 'as users
180 | # have the same ratings, they should have a maximal similarity'. Both users
181 | # AND common items should have same ratings.
182 |
183 | # pairs of users (0, 3) have no common items
184 | assert sim[0, 3] == 0
185 | assert sim[0, 4] == 0
186 |
187 | # ensure min_support is taken into account. Only users 1 and 2 have more
188 | # than 4 common ratings.
189 | sim = sims.pearson_baseline(n_x, yr, 4, global_mean, x_biases, y_biases)
190 | for i in range(n_x):
191 | for j in range(i + 1, n_x):
192 | if i != 1 and j != 2:
193 | assert sim[i, j] == 0
-------------------------------------------------------------------------------- /doc/Makefile: --------------------------------------------------------------------------------
1 | # Makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line.
5 | SPHINXOPTS =
6 | SPHINXBUILD = sphinx-build
7 | PAPER =
8 | BUILDDIR = build
9 |
10 | # User-friendly check for sphinx-build
11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1)
12 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/)
13 | endif
14 |
15 | # Internal variables.
16 | PAPEROPT_a4 = -D latex_paper_size=a4 17 | PAPEROPT_letter = -D latex_paper_size=letter 18 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source 19 | # the i18n builder cannot share the environment and doctrees with the others 20 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source 21 | 22 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest coverage gettext 23 | 24 | help: 25 | @echo "Please use \`make ' where is one of" 26 | @echo " html to make standalone HTML files" 27 | @echo " dirhtml to make HTML files named index.html in directories" 28 | @echo " singlehtml to make a single large HTML file" 29 | @echo " pickle to make pickle files" 30 | @echo " json to make JSON files" 31 | @echo " htmlhelp to make HTML files and a HTML help project" 32 | @echo " qthelp to make HTML files and a qthelp project" 33 | @echo " applehelp to make an Apple Help Book" 34 | @echo " devhelp to make HTML files and a Devhelp project" 35 | @echo " epub to make an epub" 36 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 37 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 38 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 39 | @echo " text to make text files" 40 | @echo " man to make manual pages" 41 | @echo " texinfo to make Texinfo files" 42 | @echo " info to make Texinfo files and run them through makeinfo" 43 | @echo " gettext to make PO message catalogs" 44 | @echo " changes to make an overview of all changed/added/deprecated items" 45 | @echo " xml to make Docutils-native XML files" 46 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 47 | @echo " linkcheck to check all external links for integrity" 48 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 49 | @echo " coverage to run coverage check of the documentation (if enabled)" 50 | 51 | clean: 52 | rm -rf $(BUILDDIR)/* 53 | 54 | html: 55 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 56 | @echo 57 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 58 | 59 | dirhtml: 60 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 61 | @echo 62 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 63 | 64 | singlehtml: 65 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 66 | @echo 67 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 68 | 69 | pickle: 70 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 71 | @echo 72 | @echo "Build finished; now you can process the pickle files." 73 | 74 | json: 75 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 76 | @echo 77 | @echo "Build finished; now you can process the JSON files." 78 | 79 | htmlhelp: 80 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 81 | @echo 82 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 83 | ".hhp project file in $(BUILDDIR)/htmlhelp." 
84 | 85 | qthelp: 86 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 87 | @echo 88 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 89 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 90 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/RecSys.qhcp" 91 | @echo "To view the help file:" 92 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/RecSys.qhc" 93 | 94 | applehelp: 95 | $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp 96 | @echo 97 | @echo "Build finished. The help book is in $(BUILDDIR)/applehelp." 98 | @echo "N.B. You won't be able to view it unless you put it in" \ 99 | "~/Library/Documentation/Help or install it in your application" \ 100 | "bundle." 101 | 102 | devhelp: 103 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 104 | @echo 105 | @echo "Build finished." 106 | @echo "To view the help file:" 107 | @echo "# mkdir -p $$HOME/.local/share/devhelp/RecSys" 108 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/RecSys" 109 | @echo "# devhelp" 110 | 111 | epub: 112 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 113 | @echo 114 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 115 | 116 | latex: 117 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 118 | @echo 119 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 120 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 121 | "(use \`make latexpdf' here to do that automatically)." 122 | 123 | latexpdf: 124 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 125 | @echo "Running LaTeX files through pdflatex..." 126 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 127 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 128 | 129 | latexpdfja: 130 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 131 | @echo "Running LaTeX files through platex and dvipdfmx..." 132 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 133 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 134 | 135 | text: 136 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 137 | @echo 138 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 139 | 140 | man: 141 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 142 | @echo 143 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 144 | 145 | texinfo: 146 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 147 | @echo 148 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 149 | @echo "Run \`make' in that directory to run these through makeinfo" \ 150 | "(use \`make info' here to do that automatically)." 151 | 152 | info: 153 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 154 | @echo "Running Texinfo files through makeinfo..." 155 | make -C $(BUILDDIR)/texinfo info 156 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 157 | 158 | gettext: 159 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 160 | @echo 161 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 162 | 163 | changes: 164 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 165 | @echo 166 | @echo "The overview file is in $(BUILDDIR)/changes." 167 | 168 | linkcheck: 169 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 170 | @echo 171 | @echo "Link check complete; look for any errors in the above output " \ 172 | "or in $(BUILDDIR)/linkcheck/output.txt." 
173 | 174 | doctest: 175 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 176 | @echo "Testing of doctests in the sources finished, look at the " \ 177 | "results in $(BUILDDIR)/doctest/output.txt." 178 | 179 | coverage: 180 | $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage 181 | @echo "Testing of coverage in the sources finished, look at the " \ 182 | "results in $(BUILDDIR)/coverage/python.txt." 183 | 184 | xml: 185 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 186 | @echo 187 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 188 | 189 | pseudoxml: 190 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 191 | @echo 192 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 193 | 194 | spelling: 195 | $(SPHINXBUILD) -b spelling -d build/doctrees source build/spelling 196 | -------------------------------------------------------------------------------- /surprise/__main__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from __future__ import (absolute_import, division, print_function, 4 | unicode_literals) 5 | import random as rd 6 | import sys 7 | import shutil 8 | import argparse 9 | 10 | import numpy as np 11 | 12 | from surprise.prediction_algorithms import NormalPredictor 13 | from surprise.prediction_algorithms import BaselineOnly 14 | from surprise.prediction_algorithms import KNNBasic 15 | from surprise.prediction_algorithms import KNNBaseline 16 | from surprise.prediction_algorithms import KNNWithMeans 17 | from surprise.prediction_algorithms import SVD 18 | from surprise.prediction_algorithms import SVDpp 19 | from surprise.prediction_algorithms import NMF 20 | from surprise.prediction_algorithms import SlopeOne 21 | from surprise.prediction_algorithms import CoClustering 22 | import surprise.dataset as dataset 23 | from surprise.dataset import Dataset 24 | from surprise.dataset import Reader # noqa 25 | from surprise.evaluate import evaluate 26 | from surprise import __version__ 27 | 28 | 29 | def main(): 30 | 31 | class MyParser(argparse.ArgumentParser): 32 | '''A parser which prints the help message when an error occurs. Taken from 33 | http://stackoverflow.com/questions/4042452/display-help-message-with-python-argparse-when-script-is-called-without-any-argu.''' # noqa 34 | 35 | def error(self, message): 36 | sys.stderr.write('error: %s\n' % message) 37 | self.print_help() 38 | sys.exit(2) 39 | 40 | parser = MyParser( 41 | description='Evaluate the performance of a rating prediction ' + 42 | 'on a given dataset using cross validation. You can use a built-in ' + 43 | 'or a custom dataset, and you can choose to automatically split the ' + 44 | 'dataset into folds, or manually specify train and test files. ' + 45 | 'Please refer to the documentation page ' + 46 | '(http://surprise.readthedocs.io/) for more details.', 47 | epilog="""Example:\n 48 | surprise -algo SVD -params "{'n_epochs': 5, 'verbose': True}" 49 | -load-builtin ml-100k -n-folds 3""") 50 | 51 | algo_choices = { 52 | 'NormalPredictor': NormalPredictor, 53 | 'BaselineOnly': BaselineOnly, 54 | 'KNNBasic': KNNBasic, 55 | 'KNNBaseline': KNNBaseline, 56 | 'KNNWithMeans': KNNWithMeans, 57 | 'SVD': SVD, 58 | 'SVDpp': SVDpp, 59 | 'NMF': NMF, 60 | 'SlopeOne': SlopeOne, 61 | 'CoClustering': CoClustering, 62 | } 63 | 64 | parser.add_argument('-algo', type=str, 65 | choices=algo_choices, 66 | help='The prediction algorithm to use. 
' +
67 | 'Allowed values are ' +
68 | ', '.join(algo_choices.keys()) + '.',
69 | metavar='')
70 |
71 | parser.add_argument('-params', type=str,
72 | metavar='',
73 | default='{}',
74 | help='A kwargs dictionary that contains all the ' +
75 | 'algorithm parameters. ' +
76 | 'Example: "{\'n_epochs\': 10}".'
77 | )
78 |
79 | parser.add_argument('-load-builtin', type=str, dest='load_builtin',
80 | metavar='',
81 | default='ml-100k',
82 | help='The name of the built-in dataset to use. ' +
83 | 'Allowed values are ' +
84 | ', '.join(dataset.BUILTIN_DATASETS.keys()) +
85 | '. Default is ml-100k.'
86 | )
87 |
88 | parser.add_argument('-load-custom', type=str, dest='load_custom',
89 | metavar='',
90 | default=None,
91 | help='A file path to a custom dataset to use. ' +
92 | 'Ignored if ' +
93 | '-load-builtin is set. The -reader parameter needs ' +
94 | 'to be set.'
95 | )
96 |
97 | parser.add_argument('-folds-files', type=str, dest='folds_files',
98 | metavar='',
99 | default=None,
100 | help='A list of custom train and test files. ' +
101 | 'Ignored if -load-builtin or -load-custom is set. '
102 | 'The -reader parameter needs to be set.'
103 | )
104 |
105 | parser.add_argument('-reader', type=str,
106 | metavar='',
107 | default=None,
108 | help='A Reader to read the custom dataset. Example: ' +
109 | '"Reader(line_format=\'user item rating timestamp\',' +
110 | ' sep=\'\\t\')"'
111 | )
112 |
113 | parser.add_argument('-n-folds', type=int, dest='n_folds',
114 | metavar="",
115 | default=5,
116 | help='The number of folds for cross-validation. ' +
117 | 'Default is 5.'
118 | )
119 |
120 | parser.add_argument('-seed', type=int,
121 | metavar='',
122 | default=None,
123 | help='The seed to use for RNG. ' +
124 | 'Default is the current system time.'
125 | )
126 |
127 | parser.add_argument('--with-dump', dest='with_dump', action='store_true',
128 | help='Dump the algorithm ' +
129 | 'results in a file (one file per fold). ' +
130 | 'Default is False.'
131 | )
132 |
133 | parser.add_argument('-dump-dir', dest='dump_dir', type=str,
134 | metavar='',
135 | default=None,
136 | help='Where to dump the files. Ignored if ' +
137 | 'with-dump is not set. Default is ' +
138 | '~/.surprise_data/dumps.'
139 | )
140 |
141 | parser.add_argument('--clean', dest='clean', action='store_true',
142 | help='Remove the ' + dataset.DATASETS_DIR +
143 | ' directory and exit.'
144 | ) 145 | 146 | parser.add_argument('-v', '--version', action='version', 147 | version=__version__) 148 | 149 | args = parser.parse_args() 150 | 151 | if args.clean: 152 | shutil.rmtree(dataset.DATASETS_DIR) 153 | print('Removed', dataset.DATASETS_DIR) 154 | exit() 155 | 156 | # setup RNG 157 | rd.seed(args.seed) 158 | np.random.seed(args.seed) 159 | 160 | # setup algorithm 161 | params = eval(args.params) 162 | if args.algo is None: 163 | parser.error('No algorithm was specified.') 164 | algo = algo_choices[args.algo](**params) 165 | 166 | # setup dataset 167 | if args.load_custom is not None: # load custom and split 168 | if args.reader is None: 169 | parser.error('-reader parameter is needed.') 170 | reader = eval(args.reader) 171 | data = Dataset.load_from_file(args.load_custom, reader=reader) 172 | data.split(n_folds=args.n_folds) 173 | 174 | elif args.folds_files is not None: # load from files 175 | if args.reader is None: 176 | parser.error('-reader parameter is needed.') 177 | reader = eval(args.reader) 178 | folds_files = args.folds_files.split() 179 | folds_files = [(folds_files[i], folds_files[i + 1]) 180 | for i in range(0, len(folds_files) - 1, 2)] 181 | data = Dataset.load_from_folds(folds_files=folds_files, reader=reader) 182 | 183 | else: # load builtin dataset and split 184 | data = Dataset.load_builtin(args.load_builtin) 185 | data.split(n_folds=args.n_folds) 186 | 187 | evaluate(algo, data, with_dump=args.with_dump, dump_dir=args.dump_dir) 188 | 189 | 190 | if __name__ == "__main__": 191 | main() 192 | -------------------------------------------------------------------------------- /doc/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | REM Command file for Sphinx documentation 4 | 5 | if "%SPHINXBUILD%" == "" ( 6 | set SPHINXBUILD=sphinx-build 7 | ) 8 | set BUILDDIR=build 9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% source 10 | set I18NSPHINXOPTS=%SPHINXOPTS% source 11 | if NOT "%PAPER%" == "" ( 12 | set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% 13 | set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% 14 | ) 15 | 16 | if "%1" == "" goto help 17 | 18 | if "%1" == "help" ( 19 | :help 20 | echo.Please use `make ^` where ^ is one of 21 | echo. html to make standalone HTML files 22 | echo. dirhtml to make HTML files named index.html in directories 23 | echo. singlehtml to make a single large HTML file 24 | echo. pickle to make pickle files 25 | echo. json to make JSON files 26 | echo. htmlhelp to make HTML files and a HTML help project 27 | echo. qthelp to make HTML files and a qthelp project 28 | echo. devhelp to make HTML files and a Devhelp project 29 | echo. epub to make an epub 30 | echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter 31 | echo. text to make text files 32 | echo. man to make manual pages 33 | echo. texinfo to make Texinfo files 34 | echo. gettext to make PO message catalogs 35 | echo. changes to make an overview over all changed/added/deprecated items 36 | echo. xml to make Docutils-native XML files 37 | echo. pseudoxml to make pseudoxml-XML files for display purposes 38 | echo. linkcheck to check all external links for integrity 39 | echo. doctest to run all doctests embedded in the documentation if enabled 40 | echo. 
coverage to run coverage check of the documentation if enabled 41 | goto end 42 | ) 43 | 44 | if "%1" == "clean" ( 45 | for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i 46 | del /q /s %BUILDDIR%\* 47 | goto end 48 | ) 49 | 50 | 51 | REM Check if sphinx-build is available and fallback to Python version if any 52 | %SPHINXBUILD% 1>NUL 2>NUL 53 | if errorlevel 9009 goto sphinx_python 54 | goto sphinx_ok 55 | 56 | :sphinx_python 57 | 58 | set SPHINXBUILD=python -m sphinx.__init__ 59 | %SPHINXBUILD% 2> nul 60 | if errorlevel 9009 ( 61 | echo. 62 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 63 | echo.installed, then set the SPHINXBUILD environment variable to point 64 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 65 | echo.may add the Sphinx directory to PATH. 66 | echo. 67 | echo.If you don't have Sphinx installed, grab it from 68 | echo.http://sphinx-doc.org/ 69 | exit /b 1 70 | ) 71 | 72 | :sphinx_ok 73 | 74 | 75 | if "%1" == "html" ( 76 | %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html 77 | if errorlevel 1 exit /b 1 78 | echo. 79 | echo.Build finished. The HTML pages are in %BUILDDIR%/html. 80 | goto end 81 | ) 82 | 83 | if "%1" == "dirhtml" ( 84 | %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml 85 | if errorlevel 1 exit /b 1 86 | echo. 87 | echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. 88 | goto end 89 | ) 90 | 91 | if "%1" == "singlehtml" ( 92 | %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml 93 | if errorlevel 1 exit /b 1 94 | echo. 95 | echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. 96 | goto end 97 | ) 98 | 99 | if "%1" == "pickle" ( 100 | %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle 101 | if errorlevel 1 exit /b 1 102 | echo. 103 | echo.Build finished; now you can process the pickle files. 104 | goto end 105 | ) 106 | 107 | if "%1" == "json" ( 108 | %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json 109 | if errorlevel 1 exit /b 1 110 | echo. 111 | echo.Build finished; now you can process the JSON files. 112 | goto end 113 | ) 114 | 115 | if "%1" == "htmlhelp" ( 116 | %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp 117 | if errorlevel 1 exit /b 1 118 | echo. 119 | echo.Build finished; now you can run HTML Help Workshop with the ^ 120 | .hhp project file in %BUILDDIR%/htmlhelp. 121 | goto end 122 | ) 123 | 124 | if "%1" == "qthelp" ( 125 | %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp 126 | if errorlevel 1 exit /b 1 127 | echo. 128 | echo.Build finished; now you can run "qcollectiongenerator" with the ^ 129 | .qhcp project file in %BUILDDIR%/qthelp, like this: 130 | echo.^> qcollectiongenerator %BUILDDIR%\qthelp\RecSys.qhcp 131 | echo.To view the help file: 132 | echo.^> assistant -collectionFile %BUILDDIR%\qthelp\RecSys.ghc 133 | goto end 134 | ) 135 | 136 | if "%1" == "devhelp" ( 137 | %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp 138 | if errorlevel 1 exit /b 1 139 | echo. 140 | echo.Build finished. 141 | goto end 142 | ) 143 | 144 | if "%1" == "epub" ( 145 | %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub 146 | if errorlevel 1 exit /b 1 147 | echo. 148 | echo.Build finished. The epub file is in %BUILDDIR%/epub. 149 | goto end 150 | ) 151 | 152 | if "%1" == "latex" ( 153 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 154 | if errorlevel 1 exit /b 1 155 | echo. 156 | echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. 
157 | goto end 158 | ) 159 | 160 | if "%1" == "latexpdf" ( 161 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 162 | cd %BUILDDIR%/latex 163 | make all-pdf 164 | cd %~dp0 165 | echo. 166 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 167 | goto end 168 | ) 169 | 170 | if "%1" == "latexpdfja" ( 171 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 172 | cd %BUILDDIR%/latex 173 | make all-pdf-ja 174 | cd %~dp0 175 | echo. 176 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 177 | goto end 178 | ) 179 | 180 | if "%1" == "text" ( 181 | %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text 182 | if errorlevel 1 exit /b 1 183 | echo. 184 | echo.Build finished. The text files are in %BUILDDIR%/text. 185 | goto end 186 | ) 187 | 188 | if "%1" == "man" ( 189 | %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man 190 | if errorlevel 1 exit /b 1 191 | echo. 192 | echo.Build finished. The manual pages are in %BUILDDIR%/man. 193 | goto end 194 | ) 195 | 196 | if "%1" == "texinfo" ( 197 | %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo 198 | if errorlevel 1 exit /b 1 199 | echo. 200 | echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. 201 | goto end 202 | ) 203 | 204 | if "%1" == "gettext" ( 205 | %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale 206 | if errorlevel 1 exit /b 1 207 | echo. 208 | echo.Build finished. The message catalogs are in %BUILDDIR%/locale. 209 | goto end 210 | ) 211 | 212 | if "%1" == "changes" ( 213 | %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes 214 | if errorlevel 1 exit /b 1 215 | echo. 216 | echo.The overview file is in %BUILDDIR%/changes. 217 | goto end 218 | ) 219 | 220 | if "%1" == "linkcheck" ( 221 | %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck 222 | if errorlevel 1 exit /b 1 223 | echo. 224 | echo.Link check complete; look for any errors in the above output ^ 225 | or in %BUILDDIR%/linkcheck/output.txt. 226 | goto end 227 | ) 228 | 229 | if "%1" == "doctest" ( 230 | %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest 231 | if errorlevel 1 exit /b 1 232 | echo. 233 | echo.Testing of doctests in the sources finished, look at the ^ 234 | results in %BUILDDIR%/doctest/output.txt. 235 | goto end 236 | ) 237 | 238 | if "%1" == "coverage" ( 239 | %SPHINXBUILD% -b coverage %ALLSPHINXOPTS% %BUILDDIR%/coverage 240 | if errorlevel 1 exit /b 1 241 | echo. 242 | echo.Testing of coverage in the sources finished, look at the ^ 243 | results in %BUILDDIR%/coverage/python.txt. 244 | goto end 245 | ) 246 | 247 | if "%1" == "xml" ( 248 | %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml 249 | if errorlevel 1 exit /b 1 250 | echo. 251 | echo.Build finished. The XML files are in %BUILDDIR%/xml. 252 | goto end 253 | ) 254 | 255 | if "%1" == "pseudoxml" ( 256 | %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml 257 | if errorlevel 1 exit /b 1 258 | echo. 259 | echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml. 260 | goto end 261 | ) 262 | 263 | :end 264 | -------------------------------------------------------------------------------- /surprise/prediction_algorithms/co_clustering.pyx: -------------------------------------------------------------------------------- 1 | """ 2 | the :mod:`co_clustering` module includes the :class:`CoClustering` algorithm. 
3 | """ 4 | 5 | from __future__ import (absolute_import, division, print_function, 6 | unicode_literals) 7 | 8 | cimport numpy as np # noqa 9 | import numpy as np 10 | 11 | from .algo_base import AlgoBase 12 | 13 | 14 | class CoClustering(AlgoBase): 15 | """A collaborative filtering algorithm based on co-clustering. 16 | 17 | This is a straightforward implementation of :cite:`George:2005`. 18 | 19 | Basically, users and items are assigned some clusters :math:`C_u`, 20 | :math:`C_i`, and some co-clusters :math:`C_{ui}`. 21 | 22 | The prediction :math:`\\hat{r}_{ui}` is set as: 23 | 24 | .. math:: 25 | \hat{r}_{ui} = \\overline{C_{ui}} + (\\mu_u - \\overline{C_u}) + (\mu_i 26 | - \\overline{C_i}), 27 | 28 | where :math:`\\overline{C_{ui}}` is the average rating of co-cluster 29 | :math:`C_{ui}`, :math:`\\overline{C_u}` is the average rating of 30 | :math:`u`'s cluster, and :math:`\\overline{C_i}` is the average rating of 31 | :math:`i`'s cluster. If the user is unknown, the prediction is 32 | :math:`\hat{r}_{ui} = \\mu_i`. If the item is unknown, the prediction is 33 | :math:`\hat{r}_{ui} = \\mu_u`. If both the user and the item are unknown, 34 | the prediction is :math:`\hat{r}_{ui} = \\mu`. 35 | 36 | Clusters are assigned using a straightforward optimization method, much 37 | like k-means. 38 | 39 | Args: 40 | n_cltr_u(int): Number of user clusters. Default is ``3``. 41 | n_cltr_i(int): Number of item clusters. Default is ``3``. 42 | n_epochs(int): Number of iteration of the optimization loop. Default is 43 | ``20``. 44 | verbose(bool): If True, the current epoch will be printed. Default is 45 | ``False``. 46 | 47 | """ 48 | 49 | def __init__(self, n_cltr_u=3, n_cltr_i=3, n_epochs=20, verbose=False): 50 | 51 | AlgoBase.__init__(self) 52 | 53 | self.n_cltr_u = n_cltr_u 54 | self.n_cltr_i = n_cltr_i 55 | self.n_epochs = n_epochs 56 | self.verbose=verbose 57 | 58 | def train(self, trainset): 59 | 60 | # All this implementation was hugely inspired from MyMediaLite: 61 | # https://github.com/zenogantner/MyMediaLite/blob/master/src/MyMediaLite/RatingPrediction/CoClustering.cs 62 | 63 | AlgoBase.train(self, trainset) 64 | 65 | # User and item means 66 | cdef np.ndarray[np.double_t] user_mean 67 | cdef np.ndarray[np.double_t] item_mean 68 | 69 | # User and items clusters 70 | cdef np.ndarray[np.int_t] cltr_u 71 | cdef np.ndarray[np.int_t] cltr_i 72 | 73 | # Average rating of user clusters, item clusters and co-clusters 74 | cdef np.ndarray[np.double_t] avg_cltr_u 75 | cdef np.ndarray[np.double_t] avg_cltr_i 76 | cdef np.ndarray[np.double_t, ndim=2] avg_cocltr 77 | 78 | cdef np.ndarray[np.double_t] errors 79 | cdef int u, i, r, uc, ic 80 | cdef double est 81 | 82 | # Randomly assign users and items to intial clusters 83 | cltr_u = np.random.randint(self.n_cltr_u, size=trainset.n_users) 84 | cltr_i = np.random.randint(self.n_cltr_i, size=trainset.n_items) 85 | 86 | # Compute user and item means 87 | user_mean = np.zeros(self.trainset.n_users, np.double) 88 | item_mean = np.zeros(self.trainset.n_items, np.double) 89 | for u in trainset.all_users(): 90 | user_mean[u] = np.mean([r for (_, r) in trainset.ur[u]]) 91 | for i in trainset.all_items(): 92 | item_mean[i] = np.mean([r for (_, r) in trainset.ir[i]]) 93 | 94 | # Optimization loop. This could be optimized a bit by checking if 95 | # clusters where effectively updated and early stop if they did not. 
96 | for epoch in range(self.n_epochs):
97 |
98 | if self.verbose:
99 | print("Processing epoch {}".format(epoch))
100 |
101 | # Update averages of clusters
102 | avg_cltr_u, avg_cltr_i, avg_cocltr = self.compute_averages(cltr_u,
103 | cltr_i)
104 | # set user cluster to the one that minimizes the squared error over
105 | # all the user's ratings.
106 | for u in self.trainset.all_users():
107 | errors = np.zeros(self.n_cltr_u, np.double)
108 | for uc in range(self.n_cltr_u):
109 | for i, r in self.trainset.ur[u]:
110 | ic = cltr_i[i]
111 | est = (avg_cocltr[uc, ic] +
112 | user_mean[u] - avg_cltr_u[uc] +
113 | item_mean[i] - avg_cltr_i[ic])
114 | errors[uc] += (r - est)**2
115 | cltr_u[u] = np.argmin(errors)
116 |
117 | # set item cluster to the one that minimizes the squared error over
118 | # all the item's ratings.
119 | for i in self.trainset.all_items():
120 | errors = np.zeros(self.n_cltr_i, np.double)
121 | for ic in range(self.n_cltr_i):
122 | for u, r in self.trainset.ir[i]:
123 | uc = cltr_u[u]
124 | est = (avg_cocltr[uc, ic] +
125 | user_mean[u] - avg_cltr_u[uc] +
126 | item_mean[i] - avg_cltr_i[ic])
127 | errors[ic] += (r - est)**2
128 | cltr_i[i] = np.argmin(errors)
129 |
130 | # Compute averages one last time as clusters may have changed
131 | avg_cltr_u, avg_cltr_i, avg_cocltr = self.compute_averages(cltr_u,
132 | cltr_i)
133 | # Set cdefed arrays as attributes as they are needed for prediction
134 | self.cltr_u = cltr_u
135 | self.cltr_i = cltr_i
136 |
137 | self.user_mean = user_mean
138 | self.item_mean = item_mean
139 |
140 | self.avg_cltr_u = avg_cltr_u
141 | self.avg_cltr_i = avg_cltr_i
142 | self.avg_cocltr = avg_cocltr
143 |
144 | def compute_averages(self, np.ndarray[np.int_t] cltr_u,
145 | np.ndarray[np.int_t] cltr_i):
146 | """Compute cluster averages.
147 |
148 | Args:
149 | cltr_u: current user clusters
150 | cltr_i: current item clusters
151 |
152 | Returns:
153 | Three arrays: averages of user clusters, item clusters and
154 | co-clusters.
155 | """
156 |
157 | # Number of entities in user clusters, item clusters and co-clusters.
158 | cdef np.ndarray[np.int_t] count_cltr_u
159 | cdef np.ndarray[np.int_t] count_cltr_i
160 | cdef np.ndarray[np.int_t, ndim=2] count_cocltr
161 |
162 | # Sum of ratings for entities in each cluster
163 | cdef np.ndarray[np.int_t] sum_cltr_u
164 | cdef np.ndarray[np.int_t] sum_cltr_i
165 | cdef np.ndarray[np.int_t, ndim=2] sum_cocltr
166 |
167 | # The averages of each cluster (what will be returned)
168 | cdef np.ndarray[np.double_t] avg_cltr_u
169 | cdef np.ndarray[np.double_t] avg_cltr_i
170 | cdef np.ndarray[np.double_t, ndim=2] avg_cocltr
171 |
172 | cdef int u, i, r, uc, ic
173 | cdef double global_mean = self.trainset.global_mean
174 |
175 | # Initialize everything to zero
176 | count_cltr_u = np.zeros(self.n_cltr_u, np.int)
177 | count_cltr_i = np.zeros(self.n_cltr_i, np.int)
178 | count_cocltr = np.zeros((self.n_cltr_u, self.n_cltr_i), np.int)
179 |
180 | sum_cltr_u = np.zeros(self.n_cltr_u, np.int)
181 | sum_cltr_i = np.zeros(self.n_cltr_i, np.int)
182 | sum_cocltr = np.zeros((self.n_cltr_u, self.n_cltr_i), np.int)
183 |
184 | avg_cltr_u = np.zeros(self.n_cltr_u, np.double)
185 | avg_cltr_i = np.zeros(self.n_cltr_i, np.double)
186 | avg_cocltr = np.zeros((self.n_cltr_u, self.n_cltr_i), np.double)
187 |
188 | # Compute counts and sums for every cluster.
189 |         for u, i, r in self.trainset.all_ratings():
190 |             uc = cltr_u[u]
191 |             ic = cltr_i[i]
192 |
193 |             count_cltr_u[uc] += 1
194 |             count_cltr_i[ic] += 1
195 |             count_cocltr[uc, ic] += 1
196 |
197 |             sum_cltr_u[uc] += r
198 |             sum_cltr_i[ic] += r
199 |             sum_cocltr[uc, ic] += r
200 |
201 |         # Then set the averages for users...
202 |         for uc in range(self.n_cltr_u):
203 |             if count_cltr_u[uc]:
204 |                 avg_cltr_u[uc] = sum_cltr_u[uc] / count_cltr_u[uc]
205 |             else:
206 |                 avg_cltr_u[uc] = global_mean
207 |
208 |         # ... for items
209 |         for ic in range(self.n_cltr_i):
210 |             if count_cltr_i[ic]:
211 |                 avg_cltr_i[ic] = sum_cltr_i[ic] / count_cltr_i[ic]
212 |             else:
213 |                 avg_cltr_i[ic] = global_mean
214 |
215 |         # ... and for co-clusters
216 |         for uc in range(self.n_cltr_u):
217 |             for ic in range(self.n_cltr_i):
218 |                 if count_cocltr[uc, ic]:
219 |                     avg_cocltr[uc, ic] = (sum_cocltr[uc, ic] /
220 |                                           count_cocltr[uc, ic])
221 |                 else:
222 |                     avg_cocltr[uc, ic] = global_mean
223 |
224 |         return avg_cltr_u, avg_cltr_i, avg_cocltr
225 |
226 |     def estimate(self, u, i):
227 |
228 |         # Fallbacks, as described in the class docstring: both user and item
229 |         # unknown -> global mean; unknown user -> item mean; unknown item ->
230 |         # user mean.
231 |         if not (self.trainset.knows_user(u) or
232 |                 self.trainset.knows_item(i)):
233 |             return self.trainset.global_mean
234 |
235 |         if not self.trainset.knows_user(u):
236 |             return self.item_mean[i]
237 |
238 |         if not self.trainset.knows_item(i):
239 |             return self.user_mean[u]
240 |
241 |         # cdefing is probably useless here, as Cython has no type information
242 |         # about the self.* arrays. But it should not hurt.
243 |         cdef int _u = u
244 |         cdef int _i = i
245 |         cdef int uc = self.cltr_u[_u]
246 |         cdef int ic = self.cltr_i[_i]
247 |         cdef double est
248 |
249 |         est = (self.avg_cocltr[uc, ic] +
250 |                self.user_mean[_u] - self.avg_cltr_u[uc] +
251 |                self.item_mean[_i] - self.avg_cltr_i[ic])
252 |
253 |         return est
-------------------------------------------------------------------------------- /doc/source/getting_started.rst: --------------------------------------------------------------------------------
1 | .. _getting_started:
2 |
3 | Getting Started
4 | ===============
5 |
6 |
7 | .. _load_builtin_example:
8 |
9 | Basic usage
10 | -----------
11 |
12 | `Surprise `_ has a set of built-in
13 | :ref:`algorithms` and :ref:`datasets ` for you
14 | to play with. In its simplest form, it takes about four lines of code to
15 | evaluate the performance of an algorithm:
16 |
17 | .. literalinclude:: ../../examples/basic_usage.py
18 |     :caption: From file ``examples/basic_usage.py``
19 |     :name: basic_usage.py
20 |     :lines: 9-
21 |
22 |
23 | If `Surprise `_ cannot find the
24 | `movielens-100k dataset `_, it will
25 | offer to download it and will store it under the ``.surprise_data`` folder in
26 | your home directory. The :meth:`split()
27 | ` method automatically splits the
28 | dataset into 3 folds and the :func:`evaluate() `
29 | function runs the cross-validation procedure and computes some :mod:`accuracy
30 | ` measures.
31 |
32 |
33 | .. _load_custom:
34 |
35 | Load a custom dataset
36 | ---------------------
37 |
38 | You can of course use a custom dataset. `Surprise
39 | `_ offers two ways of loading a custom
40 | dataset:
41 |
42 | - you can either specify a single file with all the ratings and
43 |   use the :meth:`split() ` method to
44 |   perform cross-validation;
45 | - or if your dataset is already split into predefined folds, you can specify a
46 |   list of files for training and testing.
47 |
48 | Either way, you will need to define a :class:`Reader `
49 | object for `Surprise `_ to be able to
50 | parse the file(s).
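For instance, a reader for a file where each line is of the form ``user item
rating`` (separated by spaces) could be defined as follows. This is only a
sketch: the ``line_format`` and ``sep`` parameters have to match your own
file, of course::

    reader = Reader(line_format='user item rating', sep=' ')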
51 |
52 | We'll see how to handle both cases with the `movielens-100k dataset
53 | `_. Of course this is a built-in
54 | dataset, but we will act as if it were not.
55 |
56 | .. _load_from_file_example:
57 |
58 | Load an entire dataset
59 | ~~~~~~~~~~~~~~~~~~~~~~
60 |
61 | .. literalinclude:: ../../examples/load_custom_dataset.py
62 |     :caption: From file ``examples/load_custom_dataset.py``
63 |     :name: load_custom_dataset.py
64 |     :lines: 17-26
65 |
66 | .. note::
67 |     Actually, as the Movielens-100k dataset is built-in, `Surprise
68 |     `_ provides a proper reader, so in
69 |     this case we could have just created the reader like this: ::
70 |
71 |         reader = Reader('ml-100k')
72 |
73 |     For more details about readers and how to use them, see the :class:`Reader
74 |     class ` documentation.
75 |
76 | .. _load_from_folds_example:
77 |
78 | Load a dataset with predefined folds
79 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
80 |
81 | .. literalinclude:: ../../examples/load_custom_dataset_predefined_folds.py
82 |     :caption: From file ``examples/load_custom_dataset_predefined_folds.py``
83 |     :name: load_custom_dataset_predefined_folds.py
84 |     :lines: 18-30
85 |
86 | Of course, nothing prevents you from only loading a single file for training
87 | and a single file for testing. However, the ``folds_files`` parameter still
88 | needs to be a ``list``.
89 |
90 |
91 | Advanced usage
92 | --------------
93 |
94 | We will now dig a little deeper into what `Surprise
95 | `_ can do for you.
96 |
97 | .. _tuning_algorithm_parameters:
98 |
99 | Tune algorithm parameters with GridSearch
100 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
101 |
102 | The :func:`evaluate() ` function gives us the
103 | results for a single set of parameters given to the algorithm. If you want
104 | to try the algorithm with different sets of parameters, the
105 | :class:`GridSearch ` class comes to the rescue.
106 | Given a ``dict`` of parameters, this
107 | class exhaustively tries all the combinations of parameters and reports the
108 | best combination for each accuracy measure. It is analogous to
109 | `GridSearchCV `_ from scikit-learn.
111 |
112 | For instance, suppose that we want to tune the parameters of the
113 | :class:`SVD `. Some of
114 | the parameters of this algorithm are ``n_epochs``, ``lr_all`` and ``reg_all``.
115 | We thus define a parameter grid as follows:
116 |
117 | .. literalinclude:: ../../examples/grid_search_usage.py
118 |     :caption: From file ``examples/grid_search_usage.py``
119 |     :name: grid_search_usage.py
120 |     :lines: 13-14
121 |
122 | Next we define a :class:`GridSearch ` instance
123 | and give it the class
124 | :class:`SVD ` as an
125 | algorithm, and ``param_grid``. We will compute both the
126 | RMSE and FCP values for every combination. Hence the following definition:
127 |
128 | .. literalinclude:: ../../examples/grid_search_usage.py
129 |     :caption: From file ``examples/grid_search_usage.py``
130 |     :name: grid_search_usage2.py
131 |     :lines: 16
132 |
133 | Now that the :class:`GridSearch ` instance is
134 | ready, we can evaluate the algorithm on any data with the
135 | :meth:`GridSearch.evaluate()` method,
136 | exactly like with the regular
137 | :func:`evaluate() ` function:
138 |
139 | .. literalinclude:: ../../examples/grid_search_usage.py
140 |     :caption: From file ``examples/grid_search_usage.py``
141 |     :name: grid_search_usage3.py
142 |     :lines: 19-22
143 |
144 | Everything is now ready to read the results.
For example, we get the best RMSE
145 | and FCP scores and parameters as follows:
146 |
147 | .. literalinclude:: ../../examples/grid_search_usage.py
148 |     :caption: From file ``examples/grid_search_usage.py``
149 |     :name: grid_search_usage4.py
150 |     :lines: 24-38
151 |
152 | For further analysis, we can easily read all the results into a pandas
153 | ``DataFrame`` as follows:
154 |
155 | .. literalinclude:: ../../examples/grid_search_usage.py
156 |     :caption: From file ``examples/grid_search_usage.py``
157 |     :name: grid_search_usage5.py
158 |     :lines: 40-
159 |
160 | .. _iterate_over_folds:
161 |
162 | Manually iterate over folds
163 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~
164 |
165 | We have so far used the :func:`evaluate() `
166 | function that does all the hard work for us. If you want to have better
167 | control over your experiments, you can use the :meth:`folds()
168 | ` generator of your dataset, and then the
169 | :meth:`train() ` and
170 | :meth:`test() ` methods
171 | of your algorithm on each of the folds:
172 |
173 | .. literalinclude:: ../../examples/iterate_over_folds.py
174 |     :caption: From file ``examples/iterate_over_folds.py``
175 |     :name: iterate_over_folds.py
176 |     :lines: 15-
177 |
178 | .. _train_on_whole_trainset:
179 |
180 | Train on a whole trainset and specifically query for predictions
181 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
182 |
183 | Here we will review how to get predictions for specific users and items. At
184 | the same time, we will also review how to train on a whole dataset, without
185 | performing cross-validation (i.e. there is no test set).
186 |
187 | The latter is pretty straightforward: all you need to do is load a dataset,
188 | use the :meth:`build_full_trainset()
189 | ` method to build the
190 | :class:`trainset `, and train your algorithm:
191 |
192 | .. literalinclude:: ../../examples/query_for_predictions.py
193 |     :caption: From file ``examples/query_for_predictions.py``
194 |     :name: query_for_predictions.py
195 |     :lines: 15-22
196 |
197 | Now, there's no way we could call the :meth:`test()
198 | ` method, because we
199 | have no testset. But you can still get predictions for the users and items you
200 | want.
201 |
202 | Let's say you're interested in user 196 and item 302 (make sure they're in the
203 | trainset!), and you know that the true rating :math:`r_{ui} = 4`. All you need
204 | to do is call the :meth:`predict()
205 | ` method:
206 |
207 | .. literalinclude:: ../../examples/query_for_predictions.py
208 |     :caption: From file ``examples/query_for_predictions.py``
209 |     :name: query_for_predictions2.py
210 |     :lines: 28-32
211 |
212 | If the :meth:`predict()
213 | ` method is called
214 | with user or item ids that were not part of the trainset, it is up to the
215 | algorithm to decide whether it can still make a prediction. If it can't,
216 | :meth:`predict() `
217 | will default to the mean of all ratings :math:`\mu`.
218 |
219 | .. _raw_inner_note:
220 | .. note::
221 |     Raw ids are ids as defined in a rating file. They can be strings, numbers,
222 |     or whatever (but are still represented as strings). On trainset creation,
223 |     each raw id is mapped to a unique integer called inner id, which is a lot
224 |     more suitable for `Surprise `_ to
225 |     manipulate. Conversions between raw and inner ids can be done using the
226 |     :meth:`to_inner_uid() `,
227 |     :meth:`to_inner_iid() `,
228 |     :meth:`to_raw_uid() `, and
229 |     :meth:`to_raw_iid() ` methods of the
230 |     :class:`trainset `.
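For instance, with the trainset built above, the conversions for user 196 and
item 302 would look like this (just a small sketch)::

    inner_uid = trainset.to_inner_uid('196')  # raw id (a string) -> inner id (an int)
    inner_iid = trainset.to_inner_iid('302')

    raw_uid = trainset.to_raw_uid(inner_uid)  # and back again: raw_uid == '196'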
231 | 232 | Obviously, it is perfectly fine to use the :meth:`predict() 233 | ` method directly 234 | during a cross-validation process. It's then up to you to ensure that the user 235 | and item ids are present in the trainset though. 236 | 237 | Command line usage 238 | ~~~~~~~~~~~~~~~~~~ 239 | 240 | Surprise can also be used from the command line, for example: 241 | 242 | .. code:: 243 | 244 | surprise -algo SVD -params "{'n_epochs': 5, 'verbose': True}" -load-builtin ml-100k -n-folds 3 245 | 246 | See detailed usage by running: 247 | 248 | .. code:: 249 | 250 | surprise -h 251 | -------------------------------------------------------------------------------- /doc/source/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Surprise documentation build configuration file, created by 5 | # sphinx-quickstart on Tue Dec 29 20:08:18 2015. 6 | # 7 | # This file is execfile()d with the current directory set to its 8 | # containing dir. 9 | # 10 | # Note that not all possible configuration values are present in this 11 | # autogenerated file. 12 | # 13 | # All configuration values have a default; values that are commented out 14 | # serve to show the default. 15 | 16 | import sys 17 | import os 18 | import shlex 19 | 20 | # If extensions (or modules to document with autodoc) are in another directory, 21 | # add these directories to sys.path here. If the directory is relative to the 22 | # documentation root, use os.path.abspath to make it absolute, like shown here. 23 | #sys.path.insert(0, os.path.abspath('../../')) 24 | 25 | # -- General configuration ------------------------------------------------ 26 | 27 | # If your documentation needs a minimal Sphinx version, state it here. 28 | #needs_sphinx = '1.0' 29 | 30 | # Add any Sphinx extension module names here, as strings. They can be 31 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 32 | # ones. 33 | extensions = [ 34 | 'sphinx.ext.autodoc', 35 | 'sphinx.ext.napoleon', 36 | 'sphinx.ext.coverage', 37 | 'sphinx.ext.mathjax', 38 | 'sphinx.ext.viewcode', 39 | 'sphinx.ext.graphviz', 40 | 'sphinx.ext.inheritance_diagram', 41 | 'sphinx.ext.autosummary', 42 | 'sphinxcontrib.bibtex', 43 | 'sphinxcontrib.spelling', 44 | ] 45 | 46 | # Add any paths that contain templates here, relative to this directory. 47 | templates_path = ['.templates'] 48 | 49 | # The suffix(es) of source filenames. 50 | # You can specify multiple suffix as a list of string: 51 | # source_suffix = ['.rst', '.md'] 52 | source_suffix = '.rst' 53 | 54 | # The encoding of source files. 55 | #source_encoding = 'utf-8-sig' 56 | 57 | # The master toctree document. 58 | master_doc = 'index' 59 | 60 | # General information about the project. 61 | project = 'Surprise' 62 | copyright = '2015, Nicolas Hug' 63 | author = 'Nicolas Hug' 64 | 65 | # The version info for the project you're documenting, acts as replacement for 66 | # |version| and |release|, also used in various other places throughout the 67 | # built documents. 68 | # 69 | # The short X.Y version. 70 | version = '0' 71 | # The full version, including alpha/beta/rc tags. 72 | release = '1' 73 | 74 | # The language for content autogenerated by Sphinx. Refer to documentation 75 | # for a list of supported languages. 76 | # 77 | # This is also used if you do content translation via gettext catalogs. 78 | # Usually you set "language" from the command line for these cases. 
79 | language = None 80 | 81 | # There are two options for replacing |today|: either, you set today to some 82 | # non-false value, then it is used: 83 | #today = '' 84 | # Else, today_fmt is used as the format for a strftime call. 85 | #today_fmt = '%B %d, %Y' 86 | 87 | # List of patterns, relative to source directory, that match files and 88 | # directories to ignore when looking for source files. 89 | exclude_patterns = [] 90 | 91 | # The reST default role (used for this markup: `text`) to use for all 92 | # documents. 93 | #default_role = None 94 | 95 | # If true, '()' will be appended to :func: etc. cross-reference text. 96 | add_function_parentheses = True 97 | 98 | # If true, the current module name will be prepended to all description 99 | # unit titles (such as .. function::). 100 | #add_module_names = True 101 | 102 | # If true, sectionauthor and moduleauthor directives will be shown in the 103 | # output. They are ignored by default. 104 | #show_authors = False 105 | 106 | # The name of the Pygments (syntax highlighting) style to use. 107 | pygments_style = 'sphinx' 108 | 109 | # A list of ignored prefixes for module index sorting. 110 | #modindex_common_prefix = [] 111 | 112 | # If true, keep warnings as "system message" paragraphs in the built documents. 113 | #keep_warnings = False 114 | 115 | # If true, `todo` and `todoList` produce output, else they produce nothing. 116 | todo_include_todos = False 117 | 118 | 119 | # -- Options for HTML output ---------------------------------------------- 120 | 121 | # The theme to use for HTML and HTML Help pages. See the documentation for 122 | # a list of builtin themes. 123 | import sphinx_rtd_theme 124 | 125 | html_theme = "sphinx_rtd_theme" 126 | 127 | html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] 128 | #html_theme = 'haiku' 129 | 130 | # Theme options are theme-specific and customize the look and feel of a theme 131 | # further. For a list of options available for each theme, see the 132 | # documentation. 133 | #html_theme_options = {} 134 | 135 | # Add any paths that contain custom themes here, relative to this directory. 136 | #html_theme_path = [] 137 | 138 | # The name for this set of Sphinx documents. If None, it defaults to 139 | # " v documentation". 140 | #html_title = None 141 | 142 | # A shorter title for the navigation bar. Default is the same as html_title. 143 | #html_short_title = None 144 | 145 | # The name of an image file (relative to this directory) to place at the top 146 | # of the sidebar. 147 | #html_logo = None 148 | 149 | # The name of an image file (within the static path) to use as favicon of the 150 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 151 | # pixels large. 152 | #html_favicon = None 153 | 154 | # Add any paths that contain custom static files (such as style sheets) here, 155 | # relative to this directory. They are copied after the builtin static files, 156 | # so a file named "default.css" will overwrite the builtin "default.css". 157 | #html_static_path = ['.static'] 158 | 159 | # Add any extra paths that contain custom files (such as robots.txt or 160 | # .htaccess) here, relative to this directory. These files are copied 161 | # directly to the root of the documentation. 162 | #html_extra_path = [] 163 | 164 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 165 | # using the given strftime format. 
166 | #html_last_updated_fmt = '%b %d, %Y'
167 |
168 | # If true, SmartyPants will be used to convert quotes and dashes to
169 | # typographically correct entities.
170 | #html_use_smartypants = True
171 |
172 | # Custom sidebar templates, maps document names to template names.
173 | #html_sidebars = {}
174 |
175 | # Additional templates that should be rendered to pages, maps page names to
176 | # template names.
177 | #html_additional_pages = {}
178 |
179 | # If false, no module index is generated.
180 | #html_domain_indices = True
181 |
182 | # If false, no index is generated.
183 | #html_use_index = True
184 |
185 | # If true, the index is split into individual pages for each letter.
186 | #html_split_index = False
187 |
188 | # If true, links to the reST sources are added to the pages.
189 | #html_show_sourcelink = True
190 |
191 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
192 | #html_show_sphinx = True
193 |
194 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
195 | #html_show_copyright = True
196 |
197 | # If true, an OpenSearch description file will be output, and all pages will
198 | # contain a <link> tag referring to it. The value of this option must be the
199 | # base URL from which the finished HTML is served.
200 | #html_use_opensearch = ''
201 |
202 | # This is the file name suffix for HTML files (e.g. ".xhtml").
203 | #html_file_suffix = None
204 |
205 | # Language to be used for generating the HTML full-text search index.
206 | # Sphinx supports the following languages:
207 | #   'da', 'de', 'en', 'es', 'fi', 'fr', 'hu', 'it', 'ja'
208 | #   'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr'
209 | #html_search_language = 'en'
210 |
211 | # A dictionary with options for the search language support, empty by default.
212 | # Now only 'ja' uses this config value
213 | #html_search_options = {'type': 'default'}
214 |
215 | # The name of a javascript file (relative to the configuration directory) that
216 | # implements a search results scorer. If empty, the default will be used.
217 | #html_search_scorer = 'scorer.js'
218 |
219 | # Output file base name for HTML help builder.
220 | htmlhelp_basename = 'Surprisedoc'
221 |
222 | # -- Options for LaTeX output ---------------------------------------------
223 |
224 | latex_elements = {
225 | # The paper size ('letterpaper' or 'a4paper').
226 | #'papersize': 'letterpaper',
227 |
228 | # The font size ('10pt', '11pt' or '12pt').
229 | 'pointsize': '12pt',
230 |
231 | # Additional stuff for the LaTeX preamble.
232 | #'preamble': '',
233 |
234 | # Latex figure (float) alignment
235 | #'figure_align': 'htbp',
236 | }
237 |
238 | # Grouping the document tree into LaTeX files. List of tuples
239 | # (source start file, target name, title,
240 | #  author, documentclass [howto, manual, or own class]).
241 | latex_documents = [
242 |     (master_doc, 'Surprise.tex', 'Surprise Documentation',
243 |      'Nicolas Hug', 'manual'),
244 | ]
245 |
246 | # The name of an image file (relative to this directory) to place at the top of
247 | # the title page.
248 | #latex_logo = None
249 |
250 | # For "manual" documents, if this is true, then toplevel headings are parts,
251 | # not chapters.
252 | #latex_use_parts = False
253 |
254 | # If true, show page references after internal links.
255 | #latex_show_pagerefs = False
256 |
257 | # If true, show URL addresses after external links.
258 | #latex_show_urls = False
259 |
260 | # Documents to append as an appendix to all manuals.
261 | #latex_appendices = []
262 |
263 | # If false, no module index is generated.
264 | #latex_domain_indices = True
265 |
266 |
267 | # -- Options for manual page output ---------------------------------------
268 |
269 | # One entry per manual page. List of tuples
270 | # (source start file, name, description, authors, manual section).
271 | man_pages = [
272 |     (master_doc, 'surprise', 'Surprise Documentation',
273 |      [author], 1)
274 | ]
275 |
276 | # If true, show URL addresses after external links.
277 | #man_show_urls = False
278 |
279 |
280 | # -- Options for Texinfo output -------------------------------------------
281 |
282 | # Grouping the document tree into Texinfo files. List of tuples
283 | # (source start file, target name, title, author,
284 | #  dir menu entry, description, category)
285 | texinfo_documents = [
286 |     (master_doc, 'Surprise', 'Surprise Documentation',
287 |      author, 'Surprise', 'One line description of project.',
288 |      'Miscellaneous'),
289 | ]
290 |
291 | # Documents to append as an appendix to all manuals.
292 | #texinfo_appendices = []
293 |
294 | # If false, no module index is generated.
295 | #texinfo_domain_indices = True
296 |
297 | # How to display URL addresses: 'footnote', 'no', or 'inline'.
298 | #texinfo_show_urls = 'footnote'
299 |
300 | # If true, do not generate a @detailmenu in the "Top" node's menu.
301 | #texinfo_no_detailmenu = False
302 |
-------------------------------------------------------------------------------- /surprise/prediction_algorithms/algo_base.py: --------------------------------------------------------------------------------
1 | """
2 | The :mod:`surprise.prediction_algorithms.algo_base` module defines the base
3 | class :class:`AlgoBase` from which every single prediction algorithm has to
4 | inherit.
5 | """
6 |
7 | from __future__ import (absolute_import, division, print_function,
8 |                         unicode_literals)
9 |
10 | from .. import similarities as sims
11 | from .predictions import PredictionImpossible
12 | from .predictions import Prediction
13 | from .optimize_baselines import baseline_als
14 | from .optimize_baselines import baseline_sgd
15 |
16 |
17 | class AlgoBase:
18 |     """Abstract class that defines the basic behavior of a prediction
19 |     algorithm.
20 |
21 |     Keyword Args:
22 |         bsl_options(dict, optional): If the algorithm needs to compute
23 |             baseline estimates, the ``bsl_options`` parameter is used to
24 |             configure how they are computed. See
25 |             :ref:`baseline_estimates_configuration` for usage.
26 |     """
27 |
28 |     def __init__(self, **kwargs):
29 |
30 |         self.bsl_options = kwargs.get('bsl_options', {})
31 |         self.sim_options = kwargs.get('sim_options', {})
32 |         if 'user_based' not in self.sim_options:
33 |             self.sim_options['user_based'] = True
34 |
35 |     def train(self, trainset):
36 |         """Train an algorithm on a given training set.
37 |
38 |         This method is called by every derived class as the first basic step
39 |         for training an algorithm. It basically just initializes some internal
40 |         structures and sets the self.trainset attribute.
41 |
42 |         Args:
43 |             trainset(:obj:`Trainset `): A training
44 |                 set, as returned by the :meth:`folds
45 |                 ` method.
46 |         """
47 |
48 |         self.trainset = trainset
49 |
50 |         # (re) Initialise baselines
51 |         self.bu = self.bi = None
52 |
53 |     def predict(self, uid, iid, r_ui=None, clip=True, verbose=False):
54 |         """Compute the rating prediction for given user and item.
55 |
56 |         The ``predict`` method converts raw ids to inner ids and then calls
57 |         the ``estimate`` method, which is defined in every derived class. If
58 |         the prediction is impossible (for whatever reason), the prediction is
59 |         set to the global mean of all ratings.
60 |
61 |         Args:
62 |             uid: (Raw) id of the user. See :ref:`this note`.
63 |             iid: (Raw) id of the item. See :ref:`this note`.
64 |             r_ui(float): The true rating :math:`r_{ui}`. Optional, default is
65 |                 ``None``.
66 |             clip(bool): Whether to clip the estimation into the rating scale.
67 |                 For example, if :math:`\\hat{r}_{ui}` is :math:`5.5` while the
68 |                 rating scale is :math:`[1, 5]`, then :math:`\\hat{r}_{ui}` is
69 |                 set to :math:`5`. Same goes if :math:`\\hat{r}_{ui} < 1`.
70 |                 Default is ``True``.
71 |             verbose(bool): Whether to print details of the prediction. Default
72 |                 is False.
73 |
74 |         Returns:
75 |             A :obj:`Prediction\
76 |             ` object
77 |             containing:
78 |
79 |             - The (raw) user id ``uid``.
80 |             - The (raw) item id ``iid``.
81 |             - The true rating ``r_ui`` (:math:`r_{ui}`).
82 |             - The estimated rating (:math:`\\hat{r}_{ui}`).
83 |             - Some additional details about the prediction that might be
84 |               useful for later analysis.
85 |         """
86 |
87 |         # Convert raw ids to inner ids
88 |         try:
89 |             iuid = self.trainset.to_inner_uid(uid)
90 |         except ValueError:
91 |             iuid = 'UKN__' + str(uid)
92 |         try:
93 |             iiid = self.trainset.to_inner_iid(iid)
94 |         except ValueError:
95 |             iiid = 'UKN__' + str(iid)
96 |
97 |         details = {}
98 |         try:
99 |             est = self.estimate(iuid, iiid)
100 |
101 |             # If the details dict was also returned
102 |             if isinstance(est, tuple):
103 |                 est, details = est
104 |
105 |             details['was_impossible'] = False
106 |
107 |         except PredictionImpossible as e:
108 |             est = self.trainset.global_mean
109 |             details['was_impossible'] = True
110 |             details['reason'] = str(e)
111 |
112 |         # Remap the rating into its initial rating scale (because the rating
113 |         # scale was translated so that ratings are all >= 1)
114 |         est -= self.trainset.offset
115 |
116 |         # Clip the estimate into [lower_bound, higher_bound]
117 |         if clip:
118 |             lower_bound, higher_bound = self.trainset.rating_scale
119 |             est = min(higher_bound, est)
120 |             est = max(lower_bound, est)
121 |
122 |         pred = Prediction(uid, iid, r_ui, est, details)
123 |
124 |         if verbose:
125 |             print(pred)
126 |
127 |         return pred
128 |
129 |     def test(self, testset, verbose=False):
130 |         """Test the algorithm on the given testset, i.e. estimate all the
131 |         ratings in the given testset.
132 |
133 |         Args:
134 |             testset: A test set, as returned by the :meth:`folds()
135 |                 ` method or by the
136 |                 :meth:`build_testset()
137 |                 ` method.
138 |             verbose(bool): Whether to print details for each prediction.
139 |                 Default is False.
140 |
141 |         Returns:
142 |             A list of :class:`Prediction\
143 |             ` objects
144 |             that contains all the estimated ratings.
145 |         """
146 |
147 |         # The ratings are translated back to their original scale.
148 |         predictions = [self.predict(uid,
149 |                                     iid,
150 |                                     r_ui_trans - self.trainset.offset,
151 |                                     verbose=verbose)
152 |                        for (uid, iid, r_ui_trans) in testset]
153 |         return predictions
154 |
155 |     def compute_baselines(self):
156 |         """Compute users and items baselines.
157 |
158 |         The way baselines are computed depends on the ``bsl_options`` parameter
159 |         passed at the creation of the algorithm (see
160 |         :ref:`baseline_estimates_configuration`).
161 |
162 |         This method is only relevant for algorithms using :func:`Pearson
163 |         baseline similarity` or the
164 |         :class:`BaselineOnly
165 |         ` algorithm.
166 |
167 |         Returns:
168 |             A tuple ``(bu, bi)``, which are users and items baselines."""
169 |
170 |         # First of all, if this method has already been called on the same
171 |         # trainset, then just return. Indeed, compute_baselines may be called
172 |         # more than once, for example when a similarity metric (e.g.
173 |         # pearson_baseline) uses baseline estimates.
174 |         if self.bu is not None:
175 |             return self.bu, self.bi
176 |
177 |         method = dict(als=baseline_als,
178 |                       sgd=baseline_sgd)
179 |
180 |         method_name = self.bsl_options.get('method', 'als')
181 |
182 |         try:
183 |             print('Estimating biases using', method_name + '...')
184 |             self.bu, self.bi = method[method_name](self)
185 |             return self.bu, self.bi
186 |         except KeyError:
187 |             raise ValueError('Invalid method ' + method_name +
188 |                              ' for baseline computation.' +
189 |                              ' Available methods are als and sgd.')
190 |
191 |     def compute_similarities(self):
192 |         """Build the similarity matrix.
193 |
194 |         The way the similarity matrix is computed depends on the
195 |         ``sim_options`` parameter passed at the creation of the algorithm (see
196 |         :ref:`similarity_measures_configuration`).
197 |
198 |         This method is only relevant for algorithms using a similarity
199 |         measure, such as the :ref:`k-NN algorithms `.
200 |
201 |         Returns:
202 |             The similarity matrix."""
203 |
204 |         construction_func = {'cosine': sims.cosine,
205 |                              'msd': sims.msd,
206 |                              'pearson': sims.pearson,
207 |                              'pearson_baseline': sims.pearson_baseline}
208 |
209 |         if self.sim_options['user_based']:
210 |             n_x, yr = self.trainset.n_users, self.trainset.ir
211 |         else:
212 |             n_x, yr = self.trainset.n_items, self.trainset.ur
213 |
214 |         min_support = self.sim_options.get('min_support', 1)
215 |
216 |         args = [n_x, yr, min_support]
217 |
218 |         name = self.sim_options.get('name', 'msd').lower()
219 |         if name == 'pearson_baseline':
220 |             shrinkage = self.sim_options.get('shrinkage', 100)
221 |             bu, bi = self.compute_baselines()
222 |             if self.sim_options['user_based']:
223 |                 bx, by = bu, bi
224 |             else:
225 |                 bx, by = bi, bu
226 |
227 |             args += [self.trainset.global_mean, bx, by, shrinkage]
228 |
229 |         try:
230 |             print('Computing the {0} similarity matrix...'.format(name))
231 |             sim = construction_func[name](*args)
232 |             print('Done computing similarity matrix.')
233 |             return sim
234 |         except KeyError:
235 |             raise NameError('Wrong sim name ' + name + '. Allowed values ' +
236 |                             'are ' + ', '.join(construction_func.keys()) + '.')
237 |
238 |     def get_neighbors(self, iid, k):
239 |         """Return the ``k`` nearest neighbors of ``iid``, which is the inner id
240 |         of a user or an item, depending on the ``user_based`` field of
241 |         ``sim_options`` (see :ref:`similarity_measures_configuration`).
242 |
243 |         As the similarities are computed on the basis of a similarity measure,
244 |         this method is only relevant for algorithms using a similarity
245 |         measure, such as the :ref:`k-NN algorithms `.
246 |
247 |         For a usage example, see the :ref:`FAQ `.
248 |
249 |         Args:
250 |             iid(int): The (inner) id of the user (or item) for which we want
251 |                 the nearest neighbors. See :ref:`this note`.
252 |
253 |             k(int): The number of neighbors to retrieve.
254 |
255 |         Returns:
256 |             The list of the ``k`` (inner) ids of the closest users (or items)
257 |             to ``iid``.
258 | """ 259 | 260 | if self.sim_options['user_based']: 261 | all_instances = self.trainset.all_users 262 | else: 263 | all_instances = self.trainset.all_items 264 | 265 | others = [(x, self.sim[iid, x]) for x in all_instances() if x != iid] 266 | others.sort(key=lambda tple: tple[1], reverse=True) 267 | k_nearest_neighbors = [j for (j, _) in others[:k]] 268 | 269 | return k_nearest_neighbors 270 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![GitHub version](https://badge.fury.io/gh/nicolashug%2FSurprise.svg)](https://badge.fury.io/gh/nicolashug%2FSurprise) 2 | [![Documentation Status](https://readthedocs.org/projects/surprise/badge/?version=stable)](http://surprise.readthedocs.io/en/stable/?badge=stable) 3 | [![Build Status](https://travis-ci.org/NicolasHug/Surprise.svg?branch=master)](https://travis-ci.org/NicolasHug/Surprise) 4 | [![python versions](https://img.shields.io/badge/python-2.7%2C%203.5-blue.svg)](http://surpriselib.com) 5 | [![License](https://img.shields.io/badge/License-BSD%203--Clause-blue.svg)](https://opensource.org/licenses/BSD-3-Clause) 6 | 7 | 8 | 9 | 10 | Surprise 11 | ======== 12 | 13 | Overview 14 | -------- 15 | 16 | [Surprise](http://surpriselib.com) is a Python 17 | [scikit](https://www.scipy.org/scikits.html) building and analyzing recommender 18 | systems. 19 | 20 | [Surprise](http://surpriselib.com) **was designed with the 21 | following purposes in mind**: 22 | 23 | - Give users perfect control over their experiments. To this end, a strong 24 | emphasis is laid on 25 | [documentation](http://surprise.readthedocs.io/en/stable/index.html), which we 26 | have tried to make as clear and precise as possible by pointing out every 27 | detail of the algorithms. 28 | - Alleviate the pain of [Dataset 29 | handling](http://surprise.readthedocs.io/en/stable/getting_started.html#load-a-custom-dataset). 30 | Users can use both *built-in* datasets 31 | ([Movielens](http://grouplens.org/datasets/movielens/), 32 | [Jester](http://eigentaste.berkeley.edu/dataset/)), and their own *custom* 33 | datasets. 34 | - Provide various ready-to-use [prediction 35 | algorithms](http://surprise.readthedocs.io/en/stable/prediction_algorithms_package.html) 36 | such as [baseline 37 | algorithms](http://surprise.readthedocs.io/en/stable/basic_algorithms.html), 38 | [neighborhood 39 | methods](http://surprise.readthedocs.io/en/stable/knn_inspired.html), matrix 40 | factorization-based ( 41 | [SVD](http://surprise.readthedocs.io/en/stable/matrix_factorization.html#surprise.prediction_algorithms.matrix_factorization.SVD), 42 | [PMF](http://surprise.readthedocs.io/en/stable/matrix_factorization.html#unbiased-note), 43 | [SVD++](http://surprise.readthedocs.io/en/stable/matrix_factorization.html#surprise.prediction_algorithms.matrix_factorization.SVDpp), 44 | [NMF](http://surprise.readthedocs.io/en/stable/matrix_factorization.html#surprise.prediction_algorithms.matrix_factorization.NMF)), 45 | and [many 46 | others](http://surprise.readthedocs.io/en/stable/prediction_algorithms_package.html). 47 | Also, various [similarity 48 | measures](http://surprise.readthedocs.io/en/stable/similarities.html) 49 | (cosine, MSD, pearson...) are built-in. 50 | - Make it easy to implement [new algorithm 51 | ideas](http://surprise.readthedocs.io/en/stable/building_custom_algo.html). 
52 | - Provide tools to [evaluate](http://surprise.readthedocs.io/en/stable/evaluate.html),
53 |   [analyse](http://nbviewer.jupyter.org/github/NicolasHug/Surprise/tree/master/examples/notebooks/KNNBasic_analysis.ipynb/)
54 |   and
55 |   [compare](http://nbviewer.jupyter.org/github/NicolasHug/Surprise/blob/master/examples/notebooks/Compare.ipynb)
56 |   the algorithms' performance. Cross-validation procedures can be run very
57 |   easily, as well as [exhaustive search over a set of
58 |   parameters](http://surprise.readthedocs.io/en/stable/getting_started.html#tune-algorithm-parameters-with-gridsearch).
59 |
60 |
61 | The name *SurPRISE* (roughly :) ) stands for Simple Python RecommendatIon
62 | System Engine.
63 |
64 |
65 | Getting started, example
66 | ------------------------
67 |
68 | Here is a simple example showing how you can (down)load a dataset, split it
69 | for 3-fold cross-validation, and compute the MAE and RMSE of the
70 | [SVD](http://surprise.readthedocs.io/en/stable/matrix_factorization.html#surprise.prediction_algorithms.matrix_factorization.SVD)
71 | algorithm.
72 |
73 | ```python
74 | from surprise import SVD
75 | from surprise import Dataset
76 | from surprise import evaluate, print_perf
77 |
78 |
79 | # Load the movielens-100k dataset (download it if needed),
80 | # and split it into 3 folds for cross-validation.
81 | data = Dataset.load_builtin('ml-100k')
82 | data.split(n_folds=3)
83 |
84 | # We'll use the famous SVD algorithm.
85 | algo = SVD()
86 |
87 | # Evaluate the performance of our algorithm on the dataset.
88 | perf = evaluate(algo, data, measures=['RMSE', 'MAE'])
89 |
90 | print_perf(perf)
91 | ```
92 |
93 | **Output**:
94 |
95 | ```
96 | Evaluating RMSE, MAE of algorithm SVD.
97 |
98 |         Fold 1  Fold 2  Fold 3  Mean
99 | MAE     0.7475  0.7447  0.7425  0.7449
100 | RMSE    0.9461  0.9436  0.9425  0.9441
101 | ```
102 |
103 | [Surprise](http://surpriselib.com) can do **much** more (e.g.,
104 | [GridSearch](http://surprise.readthedocs.io/en/stable/getting_started.html#tune-algorithm-parameters-with-gridsearch))!
105 | You'll find [more usage
106 | examples](http://surprise.readthedocs.io/en/stable/getting_started.html) in the
107 | [documentation](http://surprise.readthedocs.io/en/stable/index.html).
108 |
109 |
110 | Benchmarks
111 | ----------
112 |
113 | Here are the average RMSE, MAE and total execution time of various algorithms
114 | (with their default parameters) on a 5-fold cross-validation procedure. The
115 | datasets are the [Movielens](http://grouplens.org/datasets/movielens/) 100k and
116 | 1M datasets. The folds are the same for all the algorithms (the random seed is
117 | set to 0). All experiments are run on a small laptop with an Intel Core i3 1.7
118 | GHz and 4GB of RAM. The execution time is the *real* execution time, as
119 | returned by the GNU [time](http://man7.org/linux/man-pages/man1/time.1.html) command.
120 | 121 | | [Movielens 100k](http://grouplens.org/datasets/movielens/100k) | RMSE | MAE | Time (s) | 122 | |-----------------|:------:|:------:|:--------:| 123 | | [NormalPredictor](http://surprise.readthedocs.io/en/stable/basic_algorithms.html#surprise.prediction_algorithms.random_pred.NormalPredictor) | 1.5228 | 1.2242 | 4 | 124 | | [BaselineOnly](http://surprise.readthedocs.io/en/stable/basic_algorithms.html#surprise.prediction_algorithms.baseline_only.BaselineOnly) | .9445 | .7488 | 5 | 125 | | [KNNBasic](http://surprise.readthedocs.io/en/stable/knn_inspired.html#surprise.prediction_algorithms.knns.KNNBasic) | .9789 | .7732 | 27 | 126 | | [KNNWithMeans](http://surprise.readthedocs.io/en/stable/knn_inspired.html#surprise.prediction_algorithms.knns.KNNWithMeans) | .9514 | .7500 | 30 | 127 | | [KNNBaseline](http://surprise.readthedocs.io/en/stable/knn_inspired.html#surprise.prediction_algorithms.knns.KNNBaseline) | .9306 | .7334 | 44 | 128 | | [SVD](http://surprise.readthedocs.io/en/stable/matrix_factorization.html#surprise.prediction_algorithms.matrix_factorization.SVD) | .9364 | .7381 | 46 | 129 | | [SVD++](http://surprise.readthedocs.io/en/stable/matrix_factorization.html#surprise.prediction_algorithms.matrix_factorization.SVDpp) | .9200 | .7253 | 31min | 130 | | [NMF](http://surprise.readthedocs.io/en/stable/matrix_factorization.html#surprise.prediction_algorithms.matrix_factorization.NMF) | .9634 | .7572 | 55 | 131 | | [Slope One](http://surprise.readthedocs.io/en/stable/slope_one.html#surprise.prediction_algorithms.slope_one.SlopeOne) | .9454 | .7430 | 25 | 132 | | [Co clustering](http://surprise.readthedocs.io/en/stable/co_clustering.html#surprise.prediction_algorithms.co_clustering.CoClustering) | .9678 | .7579 | 15 | 133 | 134 | 135 | | [Movielens 1M](http://grouplens.org/datasets/movielens/1m) | RMSE | MAE | Time (min) | 136 | |-----------------|:------:|:------:|:--------:| 137 | | [NormalPredictor](http://surprise.readthedocs.io/en/stable/basic_algorithms.html#surprise.prediction_algorithms.random_pred.NormalPredictor) | 1.5037 | 1.2051 | < 1 | 138 | | [BaselineOnly](http://surprise.readthedocs.io/en/stable/basic_algorithms.html#surprise.prediction_algorithms.baseline_only.BaselineOnly) | .9086 | .7194 | < 1 | 139 | | [KNNBasic](http://surprise.readthedocs.io/en/stable/knn_inspired.html#surprise.prediction_algorithms.knns.KNNBasic) | .9207 | .7250 | 22 | 140 | | [KNNWithMeans](http://surprise.readthedocs.io/en/stable/knn_inspired.html#surprise.prediction_algorithms.knns.KNNWithMeans) | .9292 | .7386 | 22 | 141 | | [KNNBaseline](http://surprise.readthedocs.io/en/stable/knn_inspired.html#surprise.prediction_algorithms.knns.KNNBaseline) | .8949 | .7063 | 44 | 142 | | [SVD](http://surprise.readthedocs.io/en/stable/matrix_factorization.html#surprise.prediction_algorithms.matrix_factorization.SVD) | .8738 | .6858 | 7 | 143 | | [NMF](http://surprise.readthedocs.io/en/stable/matrix_factorization.html#surprise.prediction_algorithms.matrix_factorization.NMF) | .9155 | .7232 | 9 | 144 | | [Slope One](http://surprise.readthedocs.io/en/stable/slope_one.html#surprise.prediction_algorithms.slope_one.SlopeOne) | .9065 | .7144 | 8 | 145 | | [Co clustering](http://surprise.readthedocs.io/en/stable/co_clustering.html#surprise.prediction_algorithms.co_clustering.CoClustering) | .9155 | .7174 | 2 | 146 | 147 | Installation / Usage 148 | -------------------- 149 | 150 | The easiest way is to use pip (you'll need [numpy](http://www.numpy.org/)): 151 | 152 | $ pip install numpy 153 | $ pip install 
scikit-surprise
154 |
155 | Or you can clone the repo and build the source (you'll need
156 | [Cython](http://cython.org/) and [numpy](http://www.numpy.org/)):
157 |
158 |     $ git clone https://github.com/NicolasHug/surprise.git
159 |     $ python setup.py install
160 |
161 |
162 | License
163 | -------
164 |
165 | This project is licensed under the [BSD
166 | 3-Clause](https://opensource.org/licenses/BSD-3-Clause) license, so it can be
167 | used for pretty much everything, including commercial applications. Please let
168 | us know how [Surprise](http://surpriselib.com) is useful to you!
169 |
170 | Here is a BibTeX entry if you ever need to cite Surprise in a research paper
171 | (please keep us posted, we would love to know if Surprise was helpful to you):
172 |
173 |     @Misc{Surprise,
174 |       author =       {Hug, Nicolas},
175 |       title =        { {S}urprise, a {P}ython library for recommender systems},
176 |       howpublished = {\url{http://surpriselib.com}},
177 |       year =         {2017}
178 |     }
179 |
180 | Acknowledgements
181 | ----------------
182 |
183 | - [Pierre-François Gimenez](https://github.com/PFgimenez), for his valuable
184 |   insights on software design.
185 | - [Maher Malaeb](https://github.com/mahermalaeb), for the
186 |   [GridSearch](http://surprise.readthedocs.io/en/stable/evaluate.html#surprise.evaluate.GridSearch)
187 |   implementation.
188 |
189 | Contributing, feedback, contact
190 | -------------------------------
191 |
192 | Any kind of feedback/criticism would be greatly appreciated (software design,
193 | documentation, improvement ideas, spelling mistakes, etc.).
194 |
195 | If you'd like to see some features or algorithms implemented in
196 | [Surprise](http://surpriselib.com), please let us know!
197 |
198 | Please feel free to contribute (see
199 | [guidelines](https://github.com/NicolasHug/Surprise/blob/master/CONTRIBUTING.md))
200 | and send pull requests!
201 |
202 | To contact us, send us a [tweet](https://twitter.com/Surpriselib) or mail us at
203 | contact at nicolas-hug dot com. For any bug or issue, please use
204 | the GitHub [project page](https://github.com/NicolasHug/Surprise).
--------------------------------------------------------------------------------