├── .gitignore ├── .travis.yml ├── COPYING ├── Makefile ├── README.rst ├── bin ├── extract_movie_features ├── run_graphchi.sh └── run_mymedialite.sh ├── doc ├── Makefile ├── _templates │ └── layout.html ├── api.rst ├── aws.rst ├── conf.py ├── evaluation.rst ├── hybrid.rst ├── index.rst ├── mrec.evaluation.rst ├── mrec.examples.rst ├── mrec.item_similarity.rst ├── mrec.mf.model.rst ├── mrec.mf.rst ├── mrec.parallel.rst ├── mrec.rst ├── preparation.rst ├── quickstart.rst └── training.rst ├── mrec ├── __init__.py ├── base_recommender.py ├── evaluation │ ├── __init__.py │ ├── metrics.py │ ├── preprocessing.py │ └── tests │ │ └── test_metrics.py ├── examples │ ├── __init__.py │ ├── convert.py │ ├── evaluate.py │ ├── factors.py │ ├── filename_conventions.py │ ├── predict.py │ ├── prepare.py │ ├── train.py │ └── tune_slim.py ├── item_similarity │ ├── __init__.py │ ├── knn.py │ ├── precomputed.py │ ├── recommender.py │ └── slim.py ├── mf │ ├── __init__.py │ ├── climf.py │ ├── evaluate.py │ ├── model │ │ ├── __init__.py │ │ ├── warp.py │ │ ├── warp2.py │ │ └── warp_fast.pyx │ ├── recommender.py │ ├── warp.py │ ├── warp2.py │ └── wrmf.py ├── parallel │ ├── __init__.py │ ├── evaluate.py │ ├── item_similarity.py │ ├── predict.py │ ├── warp.py │ └── wrmf.py ├── popularity.py ├── reranking_recommender.py ├── sparse.py ├── testing.py └── tests │ ├── test_base_recommender.py │ ├── test_mrec.py │ └── test_sparse.py └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.so 3 | *.c 4 | doc/_build 5 | build 6 | dist 7 | *.egg-info 8 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | 3 | python: 4 | - "2.7" 5 | 6 | sudo: false 7 | 8 | addons: 9 | apt: 10 | packages: 11 | - libblas-dev 12 | - liblapack-dev 13 | - gfortran 14 | 15 | cache: 16 | - pip 17 | - apt 18 | 19 | before_install: 20 | # Setup.py references some of its dependencies 21 | # before we get to the install_requires line 22 | # so install them first. 23 | # Note: scipy takes a terribly long time to install 24 | - pip install cython 25 | - pip install numpy 26 | - travis_wait pip install scipy 27 | 28 | # This is a library, not an application. 29 | # So we do not have a requirements.txt 30 | install: python setup.py install 31 | 32 | script: py.test 33 | -------------------------------------------------------------------------------- /COPYING: -------------------------------------------------------------------------------- 1 | New BSD License 2 | 3 | Copyright (c) 2013, Mendeley Ltd. 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | * Redistributions of source code must retain the above copyright 9 | notice, this list of conditions and the following disclaimer. 10 | * Redistributions in binary form must reproduce the above copyright 11 | notice, this list of conditions and the following disclaimer in the 12 | documentation and/or other materials provided with the distribution. 13 | * Neither the name of Mendeley Ltd. nor the 14 | names of its contributors may be used to endorse or promote products 15 | derived from this software without specific prior written permission. 
16 | 17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 18 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 19 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 20 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY 21 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 22 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 23 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 24 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 26 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | PYTHON ?= python 2 | CYTHON ?= cython 3 | NOSETESTS ?= nosetests 4 | 5 | # Compilation... 6 | 7 | CYTHONSRC= $(wildcard mrec/*/*.pyx) 8 | CSRC= $(CYTHONSRC:.pyx=.cpp) 9 | 10 | inplace: cython 11 | $(PYTHON) setup.py build_ext -i 12 | 13 | cython: $(CSRC) 14 | 15 | clean: 16 | rm -f mrec/*/*.c mrec/*/*.so mrec/*/*.html mrec/*/*.pyc 17 | 18 | %.cpp: %.pyx 19 | $(CYTHON) $< 20 | 21 | # Tests... 22 | # 23 | test-code: 24 | $(NOSETESTS) -s mrec 25 | 26 | test-coverage: 27 | $(NOSETESTS) -s --with-coverage --cover-html --cover-html-dir=coverage \ 28 | --cover-package=mrec mrec 29 | 30 | test: test-code 31 | 32 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | ================================ 2 | mrec recommender systems library 3 | ================================ 4 | 5 | .. image:: https://img.shields.io/pypi/v/mrec.svg 6 | :target: https://pypi.python.org/pypi/mrec/ 7 | .. image:: https://travis-ci.org/Mendeley/mrec.svg?branch=master 8 | :target: https://travis-ci.org/Mendeley/mrec 9 | 10 | Introduction 11 | ------------ 12 | `mrec` is a Python package developed at `Mendeley `_ to support recommender systems development and evaluation. The package currently focuses on item similarity and other methods that work well on implicit feedback, and on experimental evaluation. 13 | 14 | Why another package when there are already some really good software projects implementing recommender systems? 15 | 16 | `mrec` tries to fill two small gaps in the current landscape, firstly by supplying 17 | simple tools for consistent and reproducible evaluation, and secondly by offering examples 18 | of how to use IPython.parallel to run the same code either on the cores of a single machine 19 | or on a cluster. The combination of IPython and scientific Python libraries is very powerful, 20 | but there are still rather few examples around that show how to get it to work in practice. 21 | 22 | Highlights: 23 | 24 | - a (relatively) efficient implementation of the SLIM item similarity method [1]_. 25 | - an implementation of Hu, Koren & Volinsky's WRMF weighted matrix factorization for implicit feedback [2]_. 26 | - a matrix factorization model that optimizes the Weighted Approximately Ranked Pairwise (WARP) ranking loss [3]_. 27 | - a hybrid model optimizing the WARP loss for a ranking based jointly on a user-item matrix and on content features for each item. 
28 | - utilities to train models and make recommendations in parallel using IPython. 29 | - utilities to prepare datasets and compute quality metrics. 30 | 31 | Documentation for mrec can be found at http://mendeley.github.io/mrec. 32 | 33 | The source code is available at https://github.com/mendeley/mrec. 34 | 35 | `mrec` implements the SLIM recommender described in [1]_. Please cite this paper if you 36 | use `mrec` in your research. 37 | 38 | Usage 39 | ----- 40 | 41 | To use mrec in your Python project: 42 | 43 | 1. Set up a virtualenv for your project... or don't. 44 | 2. Run ``pip install mrec`` 45 | 46 | Contributing 47 | ------------ 48 | 49 | To set up the project on your own development machine, follow these steps. 50 | 51 | To install the dependencies: 52 | 53 | 1. Install Cython, Numpy and Scipy. This is the difficult step. On Windows or OS X you could install one of the Scipy distributions. On Linuxes you could install libblas, liblapack, gfortran from your OS package manager, then run ``pip install cython numpy scipy``. 54 | 2. Run ``python setup.py install`` to obtain the other Python dependencies. 55 | 56 | To run the tests: 57 | 58 | - Run ``py.test`` 59 | 60 | For more specific project build instructions, please see the .travis.yml config file at the top of this Git repo, which specifies how Travis CI auto-builds and tests our project. 61 | 62 | If you have fixed a bug or added a neat new feature, feel free to submit a pull request to us on GitHub. 63 | 64 | References 65 | ---------- 66 | .. [1] Mark Levy, Kris Jack (2013). Efficient Top-N Recommendation by Linear Regression. In Large Scale Recommender Systems Workshop in RecSys'13. 67 | .. [2] Hu, Y., Koren, Y., & Volinsky, C. (2008). Collaborative filtering for implicit feedback datasets. In IEEE ICDM'08. 68 | .. [3] Weston, J., Bengio, S., & Usunier, N. (2010). Large scale image annotation: learning to rank with joint word-image embeddings. Machine learning, 81(1), 21-35. 69 | -------------------------------------------------------------------------------- /bin/extract_movie_features: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | This script generates text features for movies in the movielens datasets from 5 | the IMDb plot.list file, which you can download from one of the official mirrors 6 | listed on http://www.imdb.com/interfaces#plain. 
7 | 8 | Usage: 9 | ./extract_movie_features 10 | 11 | Examples: 12 | ./extract_movie_features plot.list ml-100k/u.item 100k.item.features.npz 13 | ./extract_movie_features plot.list ml-10m/movies.dat 10m.item.features.npz 14 | """ 15 | 16 | import re 17 | from urllib import unquote_plus 18 | import numpy as np 19 | from sklearn.feature_extraction.text import TfidfVectorizer 20 | 21 | from mrec import save_sparse_matrix 22 | 23 | def extract_plots(plot_list_file): 24 | 25 | MV = re.compile('^MV: (.+)$') 26 | PL = re.compile('^PL: (.+)$') 27 | 28 | title2plot = dict() 29 | title = None 30 | plot = [] 31 | 32 | for line in open(plot_list_file): 33 | line = line.strip() 34 | mv = MV.match(line) 35 | if mv: 36 | if title is not None: 37 | title2plot[title] = ' '.join(plot) 38 | plot = [] 39 | title = mv.group(1) 40 | else: 41 | pl = PL.match(line) 42 | if pl: 43 | plot_line = pl.group(1) 44 | try: 45 | # test that vectorizer will be able to handle line 46 | plot_line.decode('utf-8') 47 | plot.append(pl.group(1)) 48 | except: 49 | pass # skip lines that will make the vectorizer barf 50 | title2plot[title] = ' '.join(plot) 51 | return title2plot 52 | 53 | def create_features(item_file,title_in_url,title2plot,outfile): 54 | 55 | ITEM = re.compile('^([0-9]+)') 56 | 57 | if title_in_url: 58 | # ml-100k format 59 | TITLE = re.compile(r'^.*\|.*\|.*\|.*\|http://us.imdb.com/.+?\?(.+?)\|') 60 | else: 61 | # ml-1m / ml-10m format 62 | TITLE = re.compile(r'^.+::(.+?)::') 63 | 64 | plots = [] 65 | 66 | for line in open(item_file): 67 | item = int(ITEM.match(line).group(1)) 68 | # add empty plots for any missing movie ids 69 | while len(plots) < item - 1: 70 | plots.append('') 71 | try: 72 | title = TITLE.match(line).group(1) 73 | if title_in_url: 74 | title = unquote_plus(title) 75 | plot = title2plot.get(title,None) 76 | if plot is None: 77 | plot = '' 78 | except: 79 | plot = '' 80 | plots.append(plot) 81 | 82 | v = TfidfVectorizer(min_df=20,max_df=0.05) 83 | item_features = v.fit_transform(plots) 84 | 85 | # save features as numpy arrays 86 | save_sparse_matrix(item_features,'npz',outfile) 87 | 88 | if __name__ == '__main__': 89 | import sys 90 | 91 | if len(sys.argv) != 4: 92 | raise SystemExit(__doc__.strip()) 93 | 94 | plot_list_file = sys.argv[1] 95 | item_file = sys.argv[2] 96 | outfile = sys.argv[3] 97 | 98 | if item_file.endswith('movies.dat'): 99 | title_in_url = False 100 | elif item_file.endswith('u.item'): 101 | title_in_url = True 102 | else: 103 | raise SystemExit('unexpected input, should be a movielens movies.dat or u.item file') 104 | 105 | title2plot = extract_plots(plot_list_file) 106 | create_features(item_file,title_in_url,title2plot,outfile) 107 | -------------------------------------------------------------------------------- /bin/run_graphchi.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | GRAPHCHI_HOME=$1 4 | TRAIN_GLOB=$2 5 | HERE=`pwd` 6 | 7 | mkdir -p mm_factors 8 | 9 | for TRAIN in $TRAIN_GLOB 10 | do 11 | cd $HERE 12 | mrec_convert --input_format tsv \ 13 | --input $TRAIN \ 14 | --output_format mm \ 15 | --output $TRAIN.mm 16 | 17 | cd $GRAPHCHI_HOME 18 | ./toolkits/collaborative_filtering/climf \ 19 | --training=$HERE/$TRAIN.mm \ 20 | --binary_relevance_thresh=1 --sgd_lambda=0.1 --sgd_gamma=0.0001 --max_iter=10 --quiet=1 21 | rm -r $HERE/$TRAIN.mm.* $HERE/$TRAIN.mm_degs.bin 22 | 23 | cd $HERE 24 | mrec_factors --factor_format mm \ 25 | --user_factors $TRAIN.mm_U.mm \ 26 | --item_factors $TRAIN.mm_V.mm \ 27 | 
--train $TRAIN \ 28 | --outdir climf_models \ 29 | --description climf 30 | mv $HERE/$TRAIN.mm* mm_factors/ 31 | done 32 | 33 | mrec_predict -n4 --input_format tsv --test_input_format tsv \ 34 | --train "$TRAIN_GLOB" \ 35 | --modeldir climf_models \ 36 | --outdir climf_recs 37 | -------------------------------------------------------------------------------- /bin/run_mymedialite.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MYMEDIALITE_HOME=$1 4 | TRAIN_GLOB=$2 5 | 6 | mkdir -p wrmf_recs 7 | 8 | for TRAIN in $TRAIN_GLOB 9 | do 10 | recspath=wrmf_recs/`basename $TRAIN`.recs 11 | 12 | $MYMEDIALITE_HOME/bin/item_recommendation --training-file $TRAIN \ 13 | --recommender WRMF \ 14 | --predict-items-number 20 \ 15 | --prediction-file $recspath.tmp 16 | 17 | python -c \ 18 | "import sys 19 | for line in sys.stdin: 20 | u,z = line.strip().split() 21 | recs = eval(z.replace(',','),(').replace(':',',').replace('[','[(').replace(']',')]')) 22 | for i,v in recs: 23 | print '{0}\t{1}\t{2}'.format(u,i,v)" \ 24 | < $recspath.tmp \ 25 | > $recspath.tsv 26 | 27 | rm $recspath.tmp 28 | done 29 | 30 | mrec_evaluate --input_format tsv --test_input_format tsv \ 31 | --train "$TRAIN_GLOB" \ 32 | --recsdir wrmf_recs \ 33 | --description "wrmf" 34 | -------------------------------------------------------------------------------- /doc/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = ../gh-pages 9 | 10 | 11 | # User-friendly check for sphinx-build 12 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) 13 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) 14 | endif 15 | 16 | # Internal variables. 17 | PAPEROPT_a4 = -D latex_paper_size=a4 18 | PAPEROPT_letter = -D latex_paper_size=letter 19 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 20 | # the i18n builder cannot share the environment and doctrees with the others 21 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 
22 | 23 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext 24 | 25 | help: 26 | @echo "Please use \`make ' where is one of" 27 | @echo " html to make standalone HTML files" 28 | @echo " dirhtml to make HTML files named index.html in directories" 29 | @echo " singlehtml to make a single large HTML file" 30 | @echo " pickle to make pickle files" 31 | @echo " json to make JSON files" 32 | @echo " htmlhelp to make HTML files and a HTML help project" 33 | @echo " qthelp to make HTML files and a qthelp project" 34 | @echo " devhelp to make HTML files and a Devhelp project" 35 | @echo " epub to make an epub" 36 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 37 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 38 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 39 | @echo " text to make text files" 40 | @echo " man to make manual pages" 41 | @echo " texinfo to make Texinfo files" 42 | @echo " info to make Texinfo files and run them through makeinfo" 43 | @echo " gettext to make PO message catalogs" 44 | @echo " changes to make an overview of all changed/added/deprecated items" 45 | @echo " xml to make Docutils-native XML files" 46 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 47 | @echo " linkcheck to check all external links for integrity" 48 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 49 | 50 | clean: 51 | rm -rf $(BUILDDIR)/* 52 | 53 | html: 54 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 55 | @echo 56 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 57 | 58 | dirhtml: 59 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 60 | @echo 61 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 62 | 63 | singlehtml: 64 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 65 | @echo 66 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 67 | 68 | pickle: 69 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 70 | @echo 71 | @echo "Build finished; now you can process the pickle files." 72 | 73 | json: 74 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 75 | @echo 76 | @echo "Build finished; now you can process the JSON files." 77 | 78 | htmlhelp: 79 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 80 | @echo 81 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 82 | ".hhp project file in $(BUILDDIR)/htmlhelp." 83 | 84 | qthelp: 85 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 86 | @echo 87 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 88 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 89 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/mrec.qhcp" 90 | @echo "To view the help file:" 91 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/mrec.qhc" 92 | 93 | devhelp: 94 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 95 | @echo 96 | @echo "Build finished." 97 | @echo "To view the help file:" 98 | @echo "# mkdir -p $$HOME/.local/share/devhelp/mrec" 99 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/mrec" 100 | @echo "# devhelp" 101 | 102 | epub: 103 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 104 | @echo 105 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 
106 | 107 | latex: 108 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 109 | @echo 110 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 111 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 112 | "(use \`make latexpdf' here to do that automatically)." 113 | 114 | latexpdf: 115 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 116 | @echo "Running LaTeX files through pdflatex..." 117 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 118 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 119 | 120 | latexpdfja: 121 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 122 | @echo "Running LaTeX files through platex and dvipdfmx..." 123 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 124 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 125 | 126 | text: 127 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 128 | @echo 129 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 130 | 131 | man: 132 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 133 | @echo 134 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 135 | 136 | texinfo: 137 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 138 | @echo 139 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 140 | @echo "Run \`make' in that directory to run these through makeinfo" \ 141 | "(use \`make info' here to do that automatically)." 142 | 143 | info: 144 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 145 | @echo "Running Texinfo files through makeinfo..." 146 | make -C $(BUILDDIR)/texinfo info 147 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 148 | 149 | gettext: 150 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 151 | @echo 152 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 153 | 154 | changes: 155 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 156 | @echo 157 | @echo "The overview file is in $(BUILDDIR)/changes." 158 | 159 | linkcheck: 160 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 161 | @echo 162 | @echo "Link check complete; look for any errors in the above output " \ 163 | "or in $(BUILDDIR)/linkcheck/output.txt." 164 | 165 | doctest: 166 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 167 | @echo "Testing of doctests in the sources finished, look at the " \ 168 | "results in $(BUILDDIR)/doctest/output.txt." 169 | 170 | xml: 171 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 172 | @echo 173 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 174 | 175 | pseudoxml: 176 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 177 | @echo 178 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 179 | -------------------------------------------------------------------------------- /doc/_templates/layout.html: -------------------------------------------------------------------------------- 1 | {% extends "!layout.html" %} 2 | {%- set rellinks = rellinks[:2]+rellinks[3:] %} 3 | -------------------------------------------------------------------------------- /doc/api.rst: -------------------------------------------------------------------------------- 1 | ================= 2 | API documentation 3 | ================= 4 | 5 | .. 
toctree::
6 | :maxdepth: 5
7 |
8 | mrec
9 |
10 |
11 |
12 | Indices
13 | -------
14 | - :ref:`genindex`
15 | - :ref:`modindex`
16 | -------------------------------------------------------------------------------- /doc/aws.rst: --------------------------------------------------------------------------------
1 | ===================================
2 | Running mrec on Amazon Web Services
3 | ===================================
4 |
5 | If you have a large dataset of ratings, the SLIM recommender implemented here can take a fair number of CPU cycles to train because it has to solve a separate regression problem for each item.
6 | Fortunately it's easy to reduce your waiting time by running in parallel on a cluster of computers
7 | using the IPython.parallel framework.
8 |
9 | The `StarCluster `_ project makes it extremely simple to
10 | provision an IPython cluster, by following the StarCluster `Quick-Start `_ and then
11 | the instructions given `here `_. To run `mrec` jobs on your cluster you'll need to edit the `.starcluster/config` file to install the `mrec` package. Your cluster configuration should look
12 | something like this:
13 |
14 | .. code-block:: ini
15 |
16 | [cluster ip]
17 | KEYNAME = your-keypair
18 | CLUSTER_USER = ipuser
19 | NODE_IMAGE_ID = ami-6c3a2f18
20 | NODE_INSTANCE_TYPE = m1.xlarge
21 | CLUSTER_SIZE = 40
22 | CLUSTER_SHELL = bash
23 | DISABLE_QUEUE = True
24 | SPOT_BID = 0.15
25 | PLUGINS = python-packages, ipcluster
26 | VOLUMES = your-s3-volume
27 |
28 | [plugin python-packages]
29 | setup_class = starcluster.plugins.pypkginstaller.PyPkgInstaller
30 | install_command = pip install -U %s
31 | packages = pyzmq,
32 | git+http://github.com/ipython/ipython.git,
33 | mrec
34 |
35 | [plugin ipcluster]
36 | SETUP_CLASS = starcluster.plugins.ipcluster.IPCluster
37 | PACKER = pickle
38 | ENABLE_NOTEBOOK = True
39 |
40 | This specifies an ``ip`` cluster template based on a StarCluster Ubuntu image which already has
41 | a number of scientific Python libraries installed. The template also specifies two plugins
42 | to run after the machines are booted. The first of these installs the remaining required Python
43 | packages: `pyzmq`, the latest version of IPython from github (this can be a good idea but your mileage may vary), and `mrec` itself. Finally the second plugin launches the IPython controller and worker processes themselves, and specifies ``pickle`` as the packer used to serialize objects
44 | passed between them.
45 |
46 | You can then fire up a cluster ready to run `mrec` jobs::
47 |
48 | $ starcluster start -c ip mrec_cluster
49 |
50 | This launches a cluster called "mrec_cluster" made up of the number of nodes specified in the ``ip`` template, starts a controller on the
51 | master node and a worker engine on each remaining core and on all the cores of the other nodes. It also sets up a shared NFS file system
52 | visible to all of the nodes.
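If you want to check that all of the engines came up and registered with the controller before submitting any work, one quick way (assuming the ipcluster plugin has set up the default IPython profile for the ``ipuser`` account as in the config above) is to run a Python one-liner on the master node::

    $ starcluster sshmaster -u ipuser mrec_cluster \
        "python -c 'from IPython.parallel import Client; print len(Client().ids)'"

The number printed should match the total number of engine cores in your cluster.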
53 | 54 | You can make your training data available either on an EBS volume, by following the instructions 55 | in the StarCluster documentation (usually just by configuring it in the StarCluster config file), 56 | or by putting it to the NFS by hand like this:: 57 | 58 | $ starcluster sshmaster -u ipuser mrec_cluster 'mkdir data' 59 | $ starcluster put -u ipuser /path/to/datasets data/ 60 | 61 | Now you can simply log in to the master node:: 62 | 63 | $ starcluster sshmaster -u ipuser mrec_cluster 64 | 65 | and start training as usual, just remembering that you probably have more engines available than on your local machine:: 66 | 67 | $ mrec_train -n160 --input_format tsv --train "data/datasets/train.*" --outdir models 68 | 69 | You can also use the cluster from the IPython command line or via a web notebook: see the 70 | StarCluster documentation for more details. 71 | 72 | -------------------------------------------------------------------------------- /doc/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # mrec documentation build configuration file, created by 4 | # sphinx-quickstart on Fri Aug 30 16:35:35 2013. 5 | # 6 | # This file is execfile()d with the current directory set to its containing dir. 7 | # 8 | # Note that not all possible configuration values are present in this 9 | # autogenerated file. 10 | # 11 | # All configuration values have a default; values that are commented out 12 | # serve to show the default. 13 | 14 | import sys, os 15 | 16 | # If extensions (or modules to document with autodoc) are in another directory, 17 | # add these directories to sys.path here. If the directory is relative to the 18 | # documentation root, use os.path.abspath to make it absolute, like shown here. 19 | sys.path.insert(0, os.path.abspath('.')) 20 | sys.path.insert(0, os.path.abspath('..')) 21 | 22 | # -- General configuration ----------------------------------------------------- 23 | 24 | # If your documentation needs a minimal Sphinx version, state it here. 25 | needs_sphinx = '1.0' 26 | 27 | # Add any Sphinx extension module names here, as strings. They can be extensions 28 | # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. 29 | extensions = ['sphinx.ext.autodoc', 'sphinx.ext.pngmath', 'sphinx.ext.autosummary', 'numpydoc'] 30 | 31 | # Add any paths that contain templates here, relative to this directory. 32 | templates_path = ['_templates'] 33 | 34 | # The suffix of source filenames. 35 | source_suffix = '.rst' 36 | 37 | # The encoding of source files. 38 | #source_encoding = 'utf-8-sig' 39 | 40 | # The master toctree document. 41 | master_doc = 'index' 42 | 43 | # General information about the project. 44 | project = u'mrec' 45 | copyright = u'2013, Mendeley Ltd.' 46 | 47 | # The version info for the project you're documenting, acts as replacement for 48 | # |version| and |release|, also used in various other places throughout the 49 | # built documents. 50 | import pkg_resources 51 | try: 52 | release = pkg_resources.get_distribution('mrec').version 53 | except pkg_resources.DistributionNotFound: 54 | print 'To build the documentation, The distribution information of mrec' 55 | print 'has to be available. Either install the package into your' 56 | print 'development environment or run "python setup.py develop" to setup' 57 | print 'the metadata.' 
58 | sys.exit(1) 59 | del pkg_resources 60 | version = '.'.join(release.split('.')[:2]) 61 | 62 | # The language for content autogenerated by Sphinx. Refer to documentation 63 | # for a list of supported languages. 64 | #language = None 65 | 66 | # There are two options for replacing |today|: either, you set today to some 67 | # non-false value, then it is used: 68 | #today = '' 69 | # Else, today_fmt is used as the format for a strftime call. 70 | #today_fmt = '%B %d, %Y' 71 | 72 | # List of patterns, relative to source directory, that match files and 73 | # directories to ignore when looking for source files. 74 | exclude_patterns = ['_build'] 75 | 76 | # The reST default role (used for this markup: `text`) to use for all documents. 77 | #default_role = None 78 | 79 | # If true, '()' will be appended to :func: etc. cross-reference text. 80 | #add_function_parentheses = True 81 | 82 | # If true, the current module name will be prepended to all description 83 | # unit titles (such as .. function::). 84 | #add_module_names = True 85 | 86 | # If true, sectionauthor and moduleauthor directives will be shown in the 87 | # output. They are ignored by default. 88 | #show_authors = False 89 | 90 | # The name of the Pygments (syntax highlighting) style to use. 91 | pygments_style = 'sphinx' 92 | 93 | # A list of ignored prefixes for module index sorting. 94 | #modindex_common_prefix = [] 95 | 96 | 97 | # -- Options for HTML output --------------------------------------------------- 98 | 99 | # The theme to use for HTML and HTML Help pages. See the documentation for 100 | # a list of builtin themes. 101 | html_theme = 'sphinxdoc' 102 | 103 | # Theme options are theme-specific and customize the look and feel of a theme 104 | # further. For a list of options available for each theme, see the 105 | # documentation. 106 | #html_theme_options = {} 107 | 108 | # Add any paths that contain custom themes here, relative to this directory. 109 | #html_theme_path = [] 110 | 111 | # The name for this set of Sphinx documents. If None, it defaults to 112 | # " v documentation". 113 | #html_title = None 114 | 115 | # A shorter title for the navigation bar. Default is the same as html_title. 116 | #html_short_title = None 117 | 118 | # The name of an image file (relative to this directory) to place at the top 119 | # of the sidebar. 120 | #html_logo = None 121 | 122 | # The name of an image file (within the static path) to use as favicon of the 123 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 124 | # pixels large. 125 | #html_favicon = None 126 | 127 | # Add any paths that contain custom static files (such as style sheets) here, 128 | # relative to this directory. They are copied after the builtin static files, 129 | # so a file named "default.css" will overwrite the builtin "default.css". 130 | html_static_path = ['_static'] 131 | 132 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 133 | # using the given strftime format. 134 | #html_last_updated_fmt = '%b %d, %Y' 135 | 136 | # If true, SmartyPants will be used to convert quotes and dashes to 137 | # typographically correct entities. 138 | html_use_smartypants = True 139 | 140 | # Custom sidebar templates, maps document names to template names. 141 | #html_sidebars = {} 142 | 143 | # Additional templates that should be rendered to pages, maps page names to 144 | # template names. 145 | #html_additional_pages = {} 146 | 147 | # If false, no module index is generated. 
148 | #html_domain_indices = True 149 | 150 | # If false, no index is generated. 151 | #html_use_index = True 152 | 153 | # If true, the index is split into individual pages for each letter. 154 | #html_split_index = False 155 | 156 | # If true, links to the reST sources are added to the pages. 157 | #html_show_sourcelink = True 158 | 159 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 160 | #html_show_sphinx = True 161 | 162 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 163 | #html_show_copyright = True 164 | 165 | # If true, an OpenSearch description file will be output, and all pages will 166 | # contain a tag referring to it. The value of this option must be the 167 | # base URL from which the finished HTML is served. 168 | #html_use_opensearch = '' 169 | 170 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 171 | #html_file_suffix = None 172 | 173 | # Output file base name for HTML help builder. 174 | htmlhelp_basename = 'mrecdoc' 175 | 176 | 177 | # -- Options for LaTeX output -------------------------------------------------- 178 | 179 | latex_elements = { 180 | # The paper size ('letterpaper' or 'a4paper'). 181 | #'papersize': 'letterpaper', 182 | 183 | # The font size ('10pt', '11pt' or '12pt'). 184 | #'pointsize': '10pt', 185 | 186 | # Additional stuff for the LaTeX preamble. 187 | #'preamble': '', 188 | } 189 | 190 | # Grouping the document tree into LaTeX files. List of tuples 191 | # (source start file, target name, title, author, documentclass [howto/manual]). 192 | latex_documents = [ 193 | ('index', 'mrec.tex', u'mrec Documentation', 194 | u'Mark Levy, Mendeley Ltd.', 'manual'), 195 | ] 196 | 197 | # The name of an image file (relative to this directory) to place at the top of 198 | # the title page. 199 | #latex_logo = None 200 | 201 | # For "manual" documents, if this is true, then toplevel headings are parts, 202 | # not chapters. 203 | #latex_use_parts = False 204 | 205 | # If true, show page references after internal links. 206 | #latex_show_pagerefs = False 207 | 208 | # If true, show URL addresses after external links. 209 | #latex_show_urls = False 210 | 211 | # Documents to append as an appendix to all manuals. 212 | #latex_appendices = [] 213 | 214 | # If false, no module index is generated. 215 | #latex_domain_indices = True 216 | 217 | 218 | # -- Options for manual page output -------------------------------------------- 219 | 220 | # One entry per manual page. List of tuples 221 | # (source start file, name, description, authors, manual section). 222 | man_pages = [ 223 | ('index', 'mrec', u'mrec Documentation', 224 | [u'Mark Levy, Mendeley Ltd.'], 1) 225 | ] 226 | 227 | # If true, show URL addresses after external links. 228 | #man_show_urls = False 229 | 230 | 231 | # -- Options for Texinfo output ------------------------------------------------ 232 | 233 | # Grouping the document tree into Texinfo files. List of tuples 234 | # (source start file, target name, title, author, 235 | # dir menu entry, description, category) 236 | texinfo_documents = [ 237 | ('index', 'mrec', u'mrec Documentation', 238 | u'Mark Levy, Mendeley Ltd.', 'mrec', 'One line description of project.', 239 | 'Miscellaneous'), 240 | ] 241 | 242 | # Documents to append as an appendix to all manuals. 243 | #texinfo_appendices = [] 244 | 245 | # If false, no module index is generated. 246 | #texinfo_domain_indices = True 247 | 248 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 
249 | #texinfo_show_urls = 'footnote' 250 | -------------------------------------------------------------------------------- /doc/evaluation.rst: -------------------------------------------------------------------------------- 1 | .. _evaluation: 2 | 3 | ===================================== 4 | Making and evaluating recommendations 5 | ===================================== 6 | 7 | Once you have a trained model, you can use the ``mrec_predict`` script to generate recommendations 8 | and to evaluate them:: 9 | 10 | $ mrec_predict 11 | Usage: mrec_predict [options] 12 | 13 | Options: 14 | -h, --help show this help message and exit 15 | --mb_per_task=MB_PER_TASK 16 | approximate memory limit per task in MB, so total 17 | memory usage is num_engines * mb_per_task (default: 18 | share all available RAM across engines) 19 | --input_format=INPUT_FORMAT 20 | format of training dataset(s) tsv | csv | mm 21 | (matrixmarket) | fsm (fast_sparse_matrix) 22 | --test_input_format=TEST_INPUT_FORMAT 23 | format of test dataset(s) tsv | csv | mm 24 | (matrixmarket) | npz (numpy binary) (default: npz) 25 | --train=TRAIN glob specifying path(s) to training dataset(s) 26 | IMPORTANT: must be in quotes if it includes the * 27 | wildcard 28 | --item_features=ITEM_FEATURES 29 | path to sparse item features in tsv format 30 | (item_id,feature_id,val) 31 | --modeldir=MODELDIR directory containing trained models 32 | --outdir=OUTDIR directory for output files 33 | --metrics=METRICS which set of metrics to compute, main|hitrate 34 | (default: main) 35 | --overwrite overwrite existing files in outdir (default: False) 36 | --packer=PACKER packer for IPython.parallel (default: json) 37 | --add_module_paths=ADD_MODULE_PATHS 38 | optional comma-separated list of paths to append to 39 | pythonpath (useful if you need to import uninstalled 40 | modules to IPython engines on a cluster) 41 | 42 | Even though you're making predictions with a recommender that has already been trained, 43 | you need to specify the training file with the ``--train`` option so that the recommender 44 | is able to exclude items that each user has already seen from their recommendations. 45 | The corresponding test file used for evaluation is assumed to be in the same directory 46 | as the training file, and with a related filepath following the convention described 47 | in :ref:`filename_conventions-link`. 48 | 49 | You only need to supply a filepath with the ``--item_features`` option if you used the 50 | features during training. 51 | 52 | You can choose one of two sets of metrics, the `main` metrics which include Precision@k 53 | for various small values of `k` and Mean Reciprocal Rank, or `hitrate` which simply computes 54 | the HitRate@10. `hitrate` is only appropriate if your test set contains a single item for 55 | each user; it measures how often the single test item appears in the top 10 recommendations, 56 | and is equivalent to Recall@10. 57 | 58 | The recommendations themselves will be written to file in the ``--outdir``, in tsv format 59 | `user`, `item`, `score`. The `score` is not directly meaningful but higher is better for 60 | when comparing two recommended items for the same user. 61 | 62 | If your dataset is of any significant size, and particularly if your trained model is a 63 | matrix factorization recommender, you may want to limit the amount of memory allocated by 64 | each task to avoid OOM errors if you plan to do other work while ``mrec_predict`` is running. 
65 | You can do this with the ``--mb_per_task`` option: bear in 66 | mind that the amount of memory specified with this option will be used concurrently on each 67 | IPython engine. 68 | 69 | Evaluating existing recommendations 70 | ----------------------------------- 71 | For convenience the ``mrec_evaluate`` script lets you compute the same evaluation metrics for recommendations that have already been saved to disk, whether 72 | with ``mrec_predict`` or some other external program:: 73 | 74 | $ mrec_evaluate 75 | Usage: mrec_evaluate [options] 76 | 77 | Options: 78 | -h, --help show this help message and exit 79 | --input_format=INPUT_FORMAT 80 | format of training dataset(s) tsv | csv | mm 81 | (matrixmarket) | fsm (fast_sparse_matrix) 82 | --test_input_format=TEST_INPUT_FORMAT 83 | format of test dataset(s) tsv | csv | mm 84 | (matrixmarket) | npz (numpy binary) (default: npz) 85 | --train=TRAIN glob specifying path(s) to training dataset(s) 86 | IMPORTANT: must be in quotes if it includes the * 87 | wildcard 88 | --recsdir=RECSDIR directory containing tsv files of precomputed 89 | recommendations 90 | --metrics=METRICS which set of metrics to compute, main|hitrate 91 | (default: main) 92 | --description=DESCRIPTION 93 | description of model which generated the 94 | recommendation 95 | 96 | -------------------------------------------------------------------------------- /doc/hybrid.rst: -------------------------------------------------------------------------------- 1 | .. _hybrid: 2 | 3 | =================================== 4 | Learning jointly from item features 5 | =================================== 6 | 7 | In real world settings it's common to have features describing each item as well as ratings or 8 | other counts expressing users historical interactions with items. As we expect that users might 9 | like items with similar features to those that they have liked in the past, it should be useful 10 | for a recommender to take item features into account. One way of doing this is to extend the 11 | matrix factorization approach, which represents each user and item with a low-dimensional vector, 12 | by learning to represent each feature in the same low-dimensional space. ``mrec`` includes 13 | an implementation of a :class:`joint model ` of this kind which optimizes 14 | the WARP ranking loss [1]_. This model learns an embedding matrix which maps the item features into 15 | the low-dimensional space. To predict the rating or preference score for an unseen item, it 16 | computes the dot product of the user factor and item factor in the usual way for a matrix 17 | factorization recommender, but then adds the dot product of the user factor and the 18 | low-dimensional mapping of its feature vector. 19 | 20 | As an example we'll look again at the small movie ratings dataset that we previously worked with 21 | in :ref:`quickstart`, but this time we'll add some features based on movie plot descriptions 22 | from IMDb. To create the features, first download the plot.list.gz file from one of the `official IMDb ftp sites `_. This contains plot summaries for most of the movies 23 | in the MovieLens datasets. Once you've unzipped this file you can use the ``extract_movie_features`` 24 | script in the bin directory of the ``mrec`` source tree to create features and save them to file:: 25 | 26 | $ cd mrec 27 | $ ./bin/extract_movie_features plot.list ml-100k/u.item 100k.features.npz 28 | 29 | .. 
note:: 30 | 31 | The ``extract_movie_features`` script isn't installed automatically with ``mrec`` so 32 | you'll need to `grab the source code `_ if you don't 33 | already have it. 34 | 35 | The resulting features are simply `tf-idf counts `_ of the words found in the plot summaries for each movie. You can load them like this:: 36 | 37 | >>> from mrec import load_sparse_matrix 38 | >>> features = load_sparse_matrix('npz','100k.features.npz') 39 | 40 | and inspect the top few word counts for the first few items:: 41 | 42 | >>> for i in xrange(3): 43 | ... for tfidf,word in sorted(zip(features[i].data,features[i].indices),reverse=True)[:3]: 44 | ... print '{0}\t{1}\t{2:.3f}'.format(i,word,tfidf) 45 | ... 46 | 0 500 0.440 47 | 0 549 0.340 48 | 0 4 0.242 49 | 1 311 0.412 50 | 1 564 0.335 51 | 1 549 0.243 52 | 2 117 0.430 53 | 2 286 0.427 54 | 2 670 0.220 55 | 56 | Now we can train a recommender in the usual way, specifying the features with the ``item_features`` 57 | and ``item_feature_format`` options:: 58 | 59 | $ mrec_train -n4 --input_format tsv --train u.data.train.0 --outdir models --model warp --item_features 100k.features.npz --item_feature_format npz 60 | 61 | Once this has finished (it will take a few minutes even on a single split of this small dataset) 62 | you can use the recommender to make and evaluate predictions:: 63 | 64 | $ mrec_predict --input_format tsv --test_input_format tsv --train u.data.train.0 --modeldir models --outdir recs --item_features 100k.features.npz --item_feature_format npz 65 | 66 | After a few seconds you'll get the results as usual:: 67 | 68 | WARP2MF(d=80,gamma=0.01,C=100.0) 69 | mrr 0.6008 +/- 0.0000 70 | prec@5 0.3650 +/- 0.0000 71 | prec@10 0.3221 +/- 0.0000 72 | prec@15 0.2915 +/- 0.0000 73 | prec@20 0.2699 +/- 0.0000 74 | 75 | .. [1] Weston, J., Bengio, S., & Usunier, N. (2010). Large scale image annotation: learning to rank with joint word-image embeddings. Machine learning, 81(1), 21-35. 76 | -------------------------------------------------------------------------------- /doc/index.rst: -------------------------------------------------------------------------------- 1 | .. include:: ../README.rst 2 | 3 | Contents 4 | -------- 5 | 6 | .. toctree:: 7 | :maxdepth: 1 8 | 9 | quickstart 10 | preparation 11 | training 12 | hybrid 13 | evaluation 14 | aws 15 | api 16 | 17 | 18 | -------------------------------------------------------------------------------- /doc/mrec.evaluation.rst: -------------------------------------------------------------------------------- 1 | mrec.evaluation Package 2 | ======================= 3 | 4 | :mod:`preprocessing` Module 5 | --------------------------- 6 | 7 | .. automodule:: mrec.evaluation.preprocessing 8 | :members: 9 | :undoc-members: 10 | :show-inheritance: 11 | 12 | :mod:`metrics` Module 13 | --------------------- 14 | 15 | .. automodule:: mrec.evaluation.metrics 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | 20 | -------------------------------------------------------------------------------- /doc/mrec.examples.rst: -------------------------------------------------------------------------------- 1 | mrec.examples Package 2 | ===================== 3 | 4 | :mod:`train` Module 5 | ------------------- 6 | 7 | .. automodule:: mrec.examples.train 8 | :members: 9 | :undoc-members: 10 | :show-inheritance: 11 | 12 | :mod:`predict` Module 13 | --------------------- 14 | 15 | .. 
automodule:: mrec.examples.predict 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | 20 | :mod:`evaluate` Module 21 | ---------------------- 22 | 23 | .. automodule:: mrec.examples.evaluate 24 | :members: 25 | :undoc-members: 26 | :show-inheritance: 27 | 28 | 29 | :mod:`filename_conventions` Module 30 | ---------------------------------- 31 | 32 | .. automodule:: mrec.examples.filename_conventions 33 | :members: 34 | :undoc-members: 35 | :show-inheritance: 36 | 37 | -------------------------------------------------------------------------------- /doc/mrec.item_similarity.rst: -------------------------------------------------------------------------------- 1 | mrec.item_similarity Package 2 | ============================ 3 | 4 | :mod:`slim` Module 5 | ------------------ 6 | 7 | .. automodule:: mrec.item_similarity.slim 8 | :members: 9 | :undoc-members: 10 | :show-inheritance: 11 | 12 | :mod:`knn` Module 13 | ----------------- 14 | 15 | .. automodule:: mrec.item_similarity.knn 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | 20 | :mod:`precomputed` Module 21 | ------------------------- 22 | 23 | .. automodule:: mrec.item_similarity.precomputed 24 | :members: 25 | :undoc-members: 26 | :show-inheritance: 27 | 28 | :mod:`recommender` Module 29 | ------------------------- 30 | 31 | .. automodule:: mrec.item_similarity.recommender 32 | :members: 33 | :undoc-members: 34 | :show-inheritance: 35 | 36 | -------------------------------------------------------------------------------- /doc/mrec.mf.model.rst: -------------------------------------------------------------------------------- 1 | mrec.mf.model Package 2 | ======================= 3 | 4 | :mod:`warp` Module 5 | ------------------ 6 | 7 | .. automodule:: mrec.mf.model.warp 8 | :members: 9 | :undoc-members: 10 | :show-inheritance: 11 | 12 | :mod:`warp2` Module 13 | ------------------- 14 | 15 | .. automodule:: mrec.mf.model.warp2 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | -------------------------------------------------------------------------------- /doc/mrec.mf.rst: -------------------------------------------------------------------------------- 1 | mrec.mf Package 2 | =============== 3 | 4 | :mod:`wrmf` Module 5 | ------------------ 6 | 7 | .. automodule:: mrec.mf.wrmf 8 | :members: 9 | :undoc-members: 10 | :show-inheritance: 11 | 12 | :mod:`warp` Module 13 | ------------------ 14 | 15 | .. automodule:: mrec.mf.warp 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | 20 | :mod:`warp2` Module 21 | ------------------- 22 | 23 | .. automodule:: mrec.mf.warp2 24 | :members: 25 | :undoc-members: 26 | :show-inheritance: 27 | 28 | :mod:`recommender` Module 29 | ------------------------- 30 | 31 | .. automodule:: mrec.mf.recommender 32 | :members: 33 | :undoc-members: 34 | :show-inheritance: 35 | 36 | Subpackages 37 | ------------ 38 | 39 | .. toctree:: 40 | 41 | mrec.mf.model 42 | 43 | -------------------------------------------------------------------------------- /doc/mrec.parallel.rst: -------------------------------------------------------------------------------- 1 | mrec.parallel Package 2 | ===================== 3 | 4 | :mod:`item_similarity` Module 5 | ----------------------------- 6 | 7 | .. automodule:: mrec.parallel.item_similarity 8 | :members: 9 | :undoc-members: 10 | :show-inheritance: 11 | 12 | :mod:`wrmf` Module 13 | ------------------ 14 | 15 | .. 
automodule:: mrec.parallel.wrmf 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | 20 | :mod:`warp` Module 21 | ------------------ 22 | 23 | .. automodule:: mrec.parallel.warp 24 | :members: 25 | :undoc-members: 26 | :show-inheritance: 27 | 28 | :mod:`predict` Module 29 | --------------------- 30 | 31 | .. automodule:: mrec.parallel.predict 32 | :members: 33 | :undoc-members: 34 | :show-inheritance: 35 | -------------------------------------------------------------------------------- /doc/mrec.rst: -------------------------------------------------------------------------------- 1 | mrec Package 2 | =================== 3 | 4 | :mod:`sparse` Module 5 | -------------------- 6 | 7 | .. automodule:: mrec.sparse 8 | :members: 9 | :undoc-members: 10 | :show-inheritance: 11 | 12 | :mod:`popularity` Module 13 | ------------------------ 14 | 15 | .. automodule:: mrec.popularity 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | 20 | :mod:`reranking_recommender` Module 21 | ----------------------------------- 22 | 23 | .. automodule:: mrec.reranking_recommender 24 | :members: 25 | :undoc-members: 26 | :show-inheritance: 27 | 28 | :mod:`base_recommender` Module 29 | ------------------------------ 30 | 31 | .. automodule:: mrec.base_recommender 32 | :members: 33 | :undoc-members: 34 | :show-inheritance: 35 | 36 | Subpackages 37 | ----------- 38 | 39 | .. toctree:: 40 | 41 | mrec.evaluation 42 | mrec.examples 43 | mrec.item_similarity 44 | mrec.mf 45 | mrec.parallel 46 | 47 | -------------------------------------------------------------------------------- /doc/preparation.rst: -------------------------------------------------------------------------------- 1 | .. _preparation: 2 | 3 | ======================= 4 | Preparing training data 5 | ======================= 6 | 7 | Run the ``mrec_prepare`` script to create train/test splits from a ratings dataset in TSV format. 8 | Each line should contain: `user`, `item`, `score`. `user` and `item` should be integer IDs starting from 1, and `score` is a rating or some other value describing how much the user likes or has interacted with the item. Any further fields in each line will be ignored:: 9 | 10 | $ mrec_prepare 11 | Usage: mrec_prepare [options] 12 | 13 | Options: 14 | -h, --help show this help message and exit 15 | --dataset=DATASET path to input dataset in tsv format 16 | --outdir=OUTDIR directory for output files 17 | --num_splits=NUM_SPLITS 18 | number of train/test splits to create (default: 5) 19 | --min_items_per_user=MIN_ITEMS_PER_USER 20 | skip users with less than this number of ratings 21 | (default: 10) 22 | --binarize binarize ratings 23 | --normalize scale training ratings to unit norm 24 | --rating_thresh=RATING_THRESH 25 | treat ratings below this as zero (default: 0) 26 | --test_size=TEST_SIZE 27 | target number of test items for each user, if 28 | test_size >= 1 treat as an absolute number, otherwise 29 | treat as a fraction of the total items (default: 0.5) 30 | --discard_zeros discard zero training ratings after thresholding (not 31 | recommended, incompatible with using training items to 32 | guarantee that recommendations are novel) 33 | --sample_before_thresholding 34 | choose test items before thresholding ratings (not 35 | recommended, test items below threshold will then be 36 | discarded) 37 | 38 | 39 | The options are designed to support various common training and evaluation scenarios. 
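For example, a typical invocation that binarizes ratings, treats ratings below 4 as zero, skips users with fewer than 10 ratings and holds out half of each user's liked items might look like this (the dataset path here is just illustrative)::

    $ mrec_prepare --dataset ratings.tsv --outdir splits \
          --binarize --rating_thresh 4 --min_items_per_user 10 --test_size 0.5

The individual options are discussed in more detail below.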
40 |
41 | Rating preprocessing options
42 | ----------------------------
43 | If you plan to train a SLIM recommender then you most likely need to ``--binarize`` or
44 | ``--normalize`` ratings to get good results. You may also want to set
45 | a global ``--rating_thresh`` so that an item to which a user has given a low rating is
46 | not considered as 'liked' by that user; ratings below the specified threshold are set
47 | to zero.
48 |
49 | Split options
50 | -------------
51 | To evaluate a recommender you need to hide some of the items that were liked by each user
52 | by moving them into a test set. Then you can generate recommendations based on the remaining
53 | training ratings, and see how many of the test items were successfully recommended.
54 | It will be hard to get meaningful results for users with very few rated items, so these
55 | can simply be skipped by setting ``--min_items_per_user``. Usually you'll want to create
56 | several train/test splits at random using the ``--num_splits`` option and then average evaluation
57 | results across them, so that the results aren't biased by the particular way in which any
58 | one split happens to be chosen.
59 |
60 | You can choose how many items to move into the test set for
61 | each user with the ``--test_size`` option. A typical choice for this is 0.5, which puts
62 | half of each user's liked items into the test set, but you can vary this if you need to compare
63 | with previous results that used a different split. You can also specify an absolute number
64 | of ratings by setting ``--test_size`` to an integer of 1 or more. This is also useful if you
65 | plan to measure :ref:`Hit Rate `, in which case you should specify ``--test_size 1``.
66 |
67 | .. _filename_conventions-link:
68 |
69 | Filename conventions
70 | --------------------
71 | ``mrec_prepare`` and the other `mrec` scripts use a set of filename conventions defined
72 | in the :mod:`mrec.examples.filename_conventions` module:
73 |
74 | - input dataset: `ratings.tsv`
75 | - training files created by ``mrec_prepare``: `ratings.tsv.train.0`, `ratings.tsv.train.1`, ...
76 | - test files created by ``mrec_prepare``: `ratings.tsv.test.0`, `ratings.tsv.test.1`, ...
77 | - models created by ``mrec_train``: `ratings.tsv.train.0.model.npz`, `ratings.tsv.train.1.model.npz`, ...
78 | - recommendations created by ``mrec_predict``: `ratings.tsv.train.0.recs.tsv`, `ratings.tsv.train.1.recs.tsv`, ...
79 | -------------------------------------------------------------------------------- /doc/quickstart.rst: --------------------------------------------------------------------------------
1 | .. _quickstart:
2 |
3 | =========================
4 | Getting started with mrec
5 | =========================
6 |
7 | Install mrec
8 | ------------
9 | You can most easily install `mrec` with pip::
10 |
11 | $ sudo pip install mrec
12 |
13 | Installing from source
14 | ~~~~~~~~~~~~~~~~~~~~~~
15 | Alternatively you can install `mrec` from source. Installing `mrec` requires `numpy`, `scipy`, `scikit-learn`, `ipython`,
16 | `cython` and `psutil`, and you'll also need `pyzmq` to run the utilities.
17 | You can most easily install these using pip::
18 |
19 | $ sudo pip install numpy scipy scikit-learn cython ipython pyzmq psutil
20 |
21 | You can then install `mrec` from source in the standard way::
22 |
23 | $ git clone https://github.com/Mendeley/mrec.git
24 | $ cd mrec
25 | $ sudo python setup.py install
26 |
27 | This installs both the `mrec` library and the scripts described in the following sections.
28 |
29 | .. note::
30 |
31 | You may want to specify where the scripts are installed::
32 |
33 | $ sudo python setup.py install --install-scripts /path/to/script/dir
34 |
35 | Get some data
36 | -------------
37 | Let's start by grabbing a small dataset of movie ratings from the MovieLens project::
38 |
39 | $ wget http://www.grouplens.org/system/files/ml-100k.zip
40 | $ unzip ml-100k.zip
41 |
42 | We'll work with the `u.data` file: this contains the ratings themselves in TSV format: user, item, rating, timestamp
43 | (we'll be ignoring the timestamps)::
44 |
45 | $ head ml-100k/u.data
46 | 196 242 3 881250949
47 | 186 302 3 891717742
48 | 22 377 1 878887116
49 | 244 51 2 880606923
50 | 166 346 1 886397596
51 | 298 474 4 884182806
52 | 115 265 2 881171488
53 | 253 465 5 891628467
54 | 305 451 3 886324817
55 | 6 86 3 883603013
56 |
57 | Get the data ready to use
58 | -------------------------
59 | To do useful work we need to split this dataset into `train` and `test` movies for each user. The idea is that
60 | we choose some items which the user rated and liked, and move them into the test set. We then train our
61 | recommender using only the remaining items for each user. Once we've generated recommendations
62 | we can evaluate them by seeing how many of the test items we've actually recommended.
63 |
64 | Deciding which items a user liked involves taking some decisions about how to interpret rating scores (or
65 | whatever other values you have in your input data - click counts, page views, and so on). The MovieLens
66 | ratings run from 1 to 5 stars, so let's only put items in our test set if they have a score of 4 or 5.
67 | We also have to decide how many of the items rated by each user we should put in the test set. Selecting
68 | too few test items means that we leave plenty of ratings for our recommender to learn from, but our evaluation
69 | scores are likely to be low (as there are few "correct" test items that can be predicted) so they may not give
70 | us a very clear picture of whether one recommender is better than another. Selecting too many test items means
71 | that we don't leave enough training data for our recommender to learn anything. For now let's put roughly
72 | half of the movies that each user liked into the test set.
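To make the idea concrete, here is a minimal sketch of how one such split could be built by hand; it's only an illustration, since the ``mrec_prepare`` command shown next does all of this for you (and also handles binarization, multiple splits and the other options)::

    import random

    train, test = [], []
    with open('ml-100k/u.data') as ratings:
        for line in ratings:
            user, item, rating, timestamp = line.split('\t')
            if int(rating) >= 4 and random.random() < 0.5:
                test.append((user, item))    # hold out roughly half of the liked items
            else:
                train.append((user, item))   # everything else stays in the training data

Note that items the user didn't like are never held out: they stay in the training set, for the reason explained in the note further down.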
73 | 74 | Run the ``mrec_prepare`` script to split the movies that users rated 4 or higher into roughly equal sized training and test 75 | sets like this:: 76 | 77 | $ mrec_prepare --dataset ml-100k/u.data --outdir splits --rating_thresh 4 --test_size 0.5 --binarize 78 | 79 | This creates five different randomly chosen train/test splits:: 80 | 81 | $ ls -lh splits/ 82 | total 3.7M 83 | -rw-rw-r-- 1 mark mark 266K Sep 21 19:17 u.data.test.0 84 | -rw-rw-r-- 1 mark mark 266K Sep 21 19:17 u.data.test.1 85 | -rw-rw-r-- 1 mark mark 266K Sep 21 19:17 u.data.test.2 86 | -rw-rw-r-- 1 mark mark 266K Sep 21 19:17 u.data.test.3 87 | -rw-rw-r-- 1 mark mark 266K Sep 21 19:17 u.data.test.4 88 | -rw-rw-r-- 1 mark mark 474K Sep 21 19:17 u.data.train.0 89 | -rw-rw-r-- 1 mark mark 474K Sep 21 19:17 u.data.train.1 90 | -rw-rw-r-- 1 mark mark 474K Sep 21 19:17 u.data.train.2 91 | -rw-rw-r-- 1 mark mark 474K Sep 21 19:17 u.data.train.3 92 | -rw-rw-r-- 1 mark mark 474K Sep 21 19:17 u.data.train.4 93 | 94 | If you look into any of these files you'll see that the ``--binarize`` option we gave to ``mrec_prepare`` 95 | has replaced the ratings with 0 or 1, depending whether or not the original rating met our chosen 96 | threshold of 4. 97 | 98 | Averaging evaluation results from each of these train/test splits should give us some reasonably trustworthy numbers. 99 | 100 | .. note:: 101 | 102 | You'll see that each test file is only half as big as the corresponding training file. 103 | That's because we only pick movies that the user liked to put into the test set. The 104 | training files contain the other half of the movies that users liked, and *all* of 105 | the movies they didn't like. Even though our recommender won't try to learn a user's 106 | tastes from their low-rated 107 | movies, we need to leave them in the training data so that we don't end up 108 | recommending a movie that they've already seen. 109 | 110 | For full details about using the ``mrec_prepare`` script see :ref:`Preparing training data `. 111 | 112 | Learn from the data 113 | ------------------- 114 | Now you've prepared some data you can start training recommenders with the ``mrec_train`` script, but first 115 | you'll need to start up some IPython engines to do the work:: 116 | 117 | $ ipcluster start -n4 --daemonize 118 | 119 | The ``-n4`` argument says that you want to start four engines. In practice you'll want one engine for each core 120 | you plan to use for processing. 121 | If you don't specify ``-n``, ``ipcluster`` will start one engine for each core on your machine. That's fine, but 122 | it's useful to know exactly how many engines are running. 
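If you want to check that the engines really have come up before you start training, you
can ask the IPython client directly (this assumes the default cluster profile)::

    >>> from IPython.parallel import Client
    >>> c = Client()
    >>> len(c.ids)   # should match the number of engines you started
    4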
123 | 124 | Once the IPython engines are running you can kick off training a separate recommender for each train/test split 125 | like this:: 126 | 127 | $ mrec_train -n4 --input_format tsv --train "splits/u.data.train.*" --outdir models 128 | 129 | This will run for a few seconds and you'll then find the trained models in the ``models`` directory:: 130 | 131 | $ ls -lh models/ 132 | total 17M 133 | -rw-rw-r-- 1 mark mark 1.4M Sep 21 19:48 u.data.train.0.model.npz 134 | -rw-rw-r-- 1 mark mark 2.1M Sep 21 19:48 u.data.train.0.sims.tsv 135 | -rw-rw-r-- 1 mark mark 1.4M Sep 21 19:48 u.data.train.1.model.npz 136 | -rw-rw-r-- 1 mark mark 2.1M Sep 21 19:48 u.data.train.1.sims.tsv 137 | -rw-rw-r-- 1 mark mark 1.4M Sep 21 19:48 u.data.train.2.model.npz 138 | -rw-rw-r-- 1 mark mark 2.1M Sep 21 19:48 u.data.train.2.sims.tsv 139 | -rw-rw-r-- 1 mark mark 1.4M Sep 21 19:48 u.data.train.3.model.npz 140 | -rw-rw-r-- 1 mark mark 2.1M Sep 21 19:48 u.data.train.3.sims.tsv 141 | -rw-rw-r-- 1 mark mark 1.4M Sep 21 19:48 u.data.train.4.model.npz 142 | -rw-rw-r-- 1 mark mark 2.1M Sep 21 19:48 u.data.train.4.sims.tsv 143 | 144 | .. note:: 145 | 146 | Alongside each model you'll see a file containing the item similarity matrix in TSV format. 147 | These can be useful if you want to inspect the similarity scores or use them outside of `mrec`, 148 | but they aren't essential and you can delete them if you want. 149 | 150 | For more information about training recommenders with ``mrec_train`` see :ref:`Training a recommender `. 151 | 152 | Make some recommendations and evaluate them 153 | ------------------------------------------- 154 | Now we have some trained models you can run the ``mrec_predict`` script to generate recommendations 155 | and more importantly to evaluate them:: 156 | 157 | $ mrec_predict --input_format tsv --test_input_format tsv --train "splits/u.data.train.*" --modeldir models --outdir recs 158 | 159 | This will run for a few seconds printing out some progress information before showing the evaluation results:: 160 | 161 | SLIM(SGDRegressor(alpha=0.101, epsilon=0.1, eta0=0.01, fit_intercept=False, 162 | l1_ratio=0.990099009901, learning_rate=invscaling, 163 | loss=squared_loss, n_iter=5, p=None, penalty=elasticnet, 164 | power_t=0.25, random_state=None, rho=None, shuffle=False, verbose=0, 165 | warm_start=False)) 166 | mrr 0.6541 +/- 0.0023 167 | prec@5 0.4082 +/- 0.0016 168 | prec@10 0.3529 +/- 0.0010 169 | prec@15 0.3180 +/- 0.0009 170 | prec@20 0.2933 +/- 0.0008 171 | 172 | This tells us that the recommender we trained was a SLIM model, based on scikit-learn's SGDRegressor. 173 | The metrics shown are Mean Reciprocal Rank and Precision@k for a few values of k. The precision values 174 | are the easiest to understand: prec@5 of 0.4 means that on average two of the first five items recommended 175 | to each user were found in the test set, i.e. they were movies that the user did really like. 176 | 177 | You'll find the recommendations themselves in the `recs` directory:: 178 | 179 | $ head recs/u.data.train.0.recs.tsv 180 | 237 100 0.22976178339 181 | 237 194 0.215614718584 182 | 237 174 0.205740941451 183 | 237 318 0.199876443948 184 | 237 357 0.190513438762 185 | 237 195 0.188450807147 186 | 237 480 0.16834165636 187 | 237 197 0.167543389552 188 | 237 181 0.166211624407 189 | 237 134 0.164500008501 190 | 191 | As you can see the first few recommendations from this run were for user 237, and our top recommendations 192 | for him are movies 100, 194, 174, 318, 357. 
If you're interested you can look these up in the u.item file 193 | provided by MovieLens: they are `Fargo`, `The Sting`, `Raiders of the Lost Ark`, `Schindler's 194 | List` and `One Flew Over the Cuckoo's Nest`. The third column in the recommendations file is a predicted preference score. 195 | It doesn't have a direct meaning, but higher is better. 196 | 197 | For more details about making and evaluating recommendations with `mrec` see :ref:`Making and evaluating recommendations `. 198 | -------------------------------------------------------------------------------- /doc/training.rst: -------------------------------------------------------------------------------- 1 | .. _training: 2 | 3 | ====================== 4 | Training a recommender 5 | ====================== 6 | 7 | Here are the basic options for ``mrec_train``:: 8 | 9 | $ mrec_train 10 | Usage: mrec_train [options] 11 | 12 | Options: 13 | -h, --help show this help message and exit 14 | -n NUM_ENGINES, --num_engines=NUM_ENGINES 15 | number of IPython engines to use 16 | --input_format=INPUT_FORMAT 17 | format of training dataset(s) tsv | csv | mm 18 | (matrixmarket) | fsm (fast_sparse_matrix) 19 | --train=TRAIN glob specifying path(s) to training dataset(s) 20 | IMPORTANT: must be in quotes if it includes the * 21 | wildcard 22 | --outdir=OUTDIR directory for output files 23 | --overwrite overwrite existing files in outdir 24 | --model=MODEL type of model to train: slim | knn | wrmf | warp | popularity 25 | (default: slim) 26 | --max_sims=MAX_SIMS max similar items to output for each training item 27 | (default: 100) 28 | 29 | ``mrec_train`` currently supports five types of recommender: 30 | 31 | - `knn` learns a traditional k-nearest neighbours item similarity model 32 | - `slim` specifies a SLIM model which learns item similarities by solving a regression problem 33 | - `wrmf` fits a confidence-weighted matrix factorization model 34 | - `warp` trains a model that optimizes a ranking loss, and can also learn from item features 35 | - `popularity` is a trivial baseline that will make the same recommendations for all users, but which can be useful for evaluation. 36 | 37 | The ``--train`` input file for training can hold the user-item matrix in a variety of formats. 38 | You can specify more than one input file by passing a standard unix file glob 39 | containing the * wildcard, for example specifying `--train ml100-k/u.train.*` will 40 | train separate models for `ml-100k/u.train.0`, `ml-100k/u.train.1` and so on. 41 | This can be useful if you're doing cross-validation. 42 | 43 | .. note:: 44 | 45 | All input training files must have the same data format. 46 | 47 | A separate recommender will be trained for each input file, and saved to disk in the 48 | specified output directory: if the input file is called `u.train.0` then the 49 | recommender will be saved in the file `u.train.0.model.npz`, and so on. See :ref:`filename_conventions-link` for more information. 50 | 51 | The saved model 52 | can be passed to the ``mrec_predict`` script as described in :ref:`evaluation`, or used programmatically like 53 | this:: 54 | 55 | >>> from mrec import load_sparse_matrix, load_recommender 56 | >>> train = load_sparse_matrix('tsv','u.train.0') 57 | >>> model = load_recommender('u.train.0.model.npz') 58 | >>> sims = model.get_similar_items(231) # get items similar to 231 59 | >>> recs = model.recommend_items(train,101,max_items=30) # recommend top 30 items for user 101 60 | 61 | See :mod:`mrec.item_similarity.recommender` for more details. 
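If you want recommendations for many users at once, the same model object also provides
batch methods. Continuing with the ``train`` matrix and ``model`` loaded above, something
like this should work for any of the recommender types::

    >>> all_recs = model.batch_recommend_items(train, max_items=20)           # every user
    >>> some_recs = model.range_recommend_items(train, 0, 100, max_items=20)  # users 0-99

Both return a list with one entry per user, and each entry is a list of (item,score) pairs.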
62 | 63 | You can supply additional options to ``mrec_train`` specifying parameter settings for the particular type of recommender you are training. 64 | For a SLIM recommender you probably want to specify:: 65 | 66 | --learner=LEARNER underlying learner for SLIM learner: sgd | elasticnet 67 | | fs_sgd (default: sgd) 68 | --l1_reg=L1_REG l1 regularization constant (default: 0.1) 69 | --l2_reg=L2_REG l2 regularization constant (default: 0.001) 70 | 71 | For a k-nearest neighbour recommender you just need to supply:: 72 | 73 | --metric=METRIC metric for knn recommender: cosine | dot (default: 74 | cosine) 75 | 76 | In this case ``max_sims`` is simply passed to the constructor 77 | of the ``KNNRecommender`` as the value of ``k``. 78 | 79 | For the Weighted Regularized Matrix Factoriation (WRMF) recommender you can specify:: 80 | 81 | --num_factors=NUM_FACTORS 82 | number of latent factors (default: 80) 83 | --alpha=ALPHA wrmf confidence constant (default: 1.0) 84 | --lbda=LBDA wrmf regularization constant (default: 0.015) 85 | --als_iters=ALS_ITERS number of als iterations (default: 15) 86 | 87 | The confidence constant determines the model's confidence in the rating/count associated 88 | with an item using a simple linear formula:: 89 | 90 | confidence = 1 + alpha * count 91 | 92 | The regularization constant and number of learning iterations control over-fitting. 93 | 94 | For the Weighted Approximately Ranked Pairwise (WARP) loss recommender the options are:: 95 | 96 | --num_factors=NUM_FACTORS 97 | number of latent factors (default: 80) 98 | --gamma=GAMMA warp learning rate (default: 0.01) 99 | --C=C warp regularization constant (default: 100.0) 100 | --item_features=ITEM_FEATURES 101 | path to sparse item features in tsv format 102 | (item_id,feature_id,val) 103 | 104 | The ``item_features`` option here is particularly interesting: if you supply a filepath 105 | here then a hybrid recommender will be created, based on a model that learns 106 | jointly from the item features in the file and from the ratings or preference scores in 107 | the training user-item matrix. See :ref:`hybrid` for more details. 108 | 109 | You can also train a baseline non-personalized recommender that just finds the most popular 110 | items and recommends them to everybody. The options for this are:: 111 | 112 | --popularity_method=POPULARITY_METHOD 113 | how to compute popularity for baseline recommender: 114 | count | sum | avg | thresh (default: count) 115 | --popularity_thresh=POPULARITY_THRESH 116 | ignore scores below this when computing popularity for 117 | baseline recommender (default: 0) 118 | 119 | The different measures mean let you base the popularity of an item on its total number of 120 | ratings of any value, or its total above some threshold; or on the sum or mean of its ratings. 121 | 122 | There are also a couple of options relating to the IPython.parallel framework:: 123 | 124 | --packer=PACKER packer for IPython.parallel (default: pickle) 125 | --add_module_paths=ADD_MODULE_PATHS 126 | optional comma-separated list of paths to append to 127 | pythonpath (useful if you need to import uninstalled 128 | modules to IPython engines on a cluster) 129 | 130 | The ``--add_module_paths`` option can be useful to specify the path to `mrec` itself 131 | if you didn't install it at start up time on all the machines in your cluster. 132 | 133 | Parameter tuning for SLIM 134 | ------------------------- 135 | Before training a SLIM recommender, you'll need to choose the regularization constants. 
136 | You can do this easily using the ``mrec_tune`` script, which computes similarity weights for some 137 | sample items over a range of values for each constant, and picks the best combination based on some 138 | simple parameters. The 'best' regularization constants are those that give similarity weights 139 | that are as sparse as possible, but not too sparse. You run ``mrec_tune`` like this:: 140 | 141 | $ mrec_tune -d u.data.train.0 --input_format tsv \ 142 | --l1_min 0.001 --l1_max 1.0 \ 143 | --l2_min 0.0001 --l2_max 1 \ 144 | --max_sims 200 --min_sims 1 --max_sparse 0.3 145 | 146 | This says that we want to find the best constants that result in no more than 200 similar items for each item, 147 | provided no more than 30% of items have no similar items at all. We'd like to explore combinations of regularization 148 | constants where the l1 constant ranges from 0.001 to 1.0 and the l2 constant from 0.0001 to 1.0. 149 | The script will run for a few seconds and then report the best settings:: 150 | 151 | best parameter setting: {'l1_reg': 0.1, 'l2_reg': 0.001} 152 | mean # positive similarity weights per item = 96.0 153 | proportion of items with fewer than 1 positive similarity weights = 0.25 154 | mean # negative similarity weights per item = 43.4 155 | 156 | .. note:: 157 | 158 | For this little dataset even the best constant values shown will mean that we won't learn 159 | any similar items for quite a large proportion of the training items. This isn't 160 | usually a problem with production size datasets. 161 | -------------------------------------------------------------------------------- /mrec/__init__.py: -------------------------------------------------------------------------------- 1 | from itertools import izip 2 | import numpy as np 3 | from scipy.sparse import coo_matrix, csr_matrix 4 | from scipy.io import mmread, mmwrite 5 | try: 6 | import cPickle as pickle 7 | except ImportError: 8 | import pickle 9 | 10 | from sparse import fast_sparse_matrix, loadtxt, loadz, savez 11 | from base_recommender import BaseRecommender 12 | 13 | __version__ = '0.3.1' 14 | 15 | def load_fast_sparse_matrix(input_format,filepath): 16 | """ 17 | Load a fast_sparse_matrix from an input file of the specified format, 18 | by delegating to the appropriate static method. 19 | 20 | Parameters 21 | ---------- 22 | input_format : str 23 | Specifies the file format: 24 | - tsv 25 | - csv 26 | - mm (MatrixMarket) 27 | - fsm (mrec.sparse.fast_sparse_matrix) 28 | filepath : str 29 | The file to load. 30 | """ 31 | if input_format == 'tsv': 32 | return fast_sparse_matrix.loadtxt(filepath) 33 | elif input_format == 'csv': 34 | return fast_sparse_matrix.loadtxt(filepath,delimiter=',') 35 | elif input_format == 'mm': 36 | return fast_sparse_matrix.loadmm(filepath) 37 | elif input_format == 'fsm': 38 | return fast_sparse_matrix.load(filepath) 39 | raise ValueError('unknown input format: {0}'.format(input_format)) 40 | 41 | def load_sparse_matrix(input_format,filepath): 42 | """ 43 | Load a scipy.sparse.csr_matrix from an input file of the specified format. 44 | 45 | Parameters 46 | ---------- 47 | input_format : str 48 | Specifies the file format: 49 | - tsv 50 | - csv 51 | - mm (MatrixMarket) 52 | - npz (scipy.sparse.csr_matrix serialized with mrec.sparse.savez()) 53 | - fsm (mrec.sparse.fast_sparse_matrix) 54 | filepath : str 55 | The file to load. 
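    Examples
    --------
    Loading one of the training splits written by ``mrec_prepare`` (the path here
    is just illustrative):

    >>> train = load_sparse_matrix('tsv', 'splits/u.data.train.0')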
56 | """ 57 | if input_format == 'tsv': 58 | return loadtxt(filepath) 59 | elif input_format == 'csv': 60 | return loadtxt(filepath,delimiter=',') 61 | elif input_format == 'mm': 62 | return mmread(filepath).tocsr() 63 | elif input_format == 'npz': 64 | return loadz(filepath).tocsr() 65 | elif input_format == 'fsm': 66 | return fast_sparse_matrix.load(filepath).X 67 | raise ValueError('unknown input format: {0}'.format(input_format)) 68 | 69 | def save_sparse_matrix(data,fmt,filepath): 70 | """ 71 | Save a scipy sparse matrix in the specified format. Row and column 72 | indices will be converted to 1-indexed if you specify a plain text 73 | format (tsv, csv, mm). Note that zero entries are guaranteed to be 74 | saved in tsv or csv format. 75 | 76 | Parameters 77 | ---------- 78 | data : scipy sparse matrix to save 79 | fmt : str 80 | Specifies the file format to write: 81 | - tsv 82 | - csv 83 | - mm (MatrixMarket) 84 | - npz (save as npz archive of numpy arrays) 85 | - fsm (mrec.sparse.fast_sparse_matrix) 86 | filepath : str 87 | The file to load. 88 | """ 89 | if fmt == 'tsv': 90 | m = data.tocoo() 91 | with open(filepath,'w') as out: 92 | for u,i,v in izip(m.row,m.col,m.data): 93 | print >>out,'{0}\t{1}\t{2}'.format(u+1,i+1,v) 94 | elif fmt == 'csv': 95 | m = data.tocoo() 96 | with open(filepath,'w') as out: 97 | for u,i,v in izip(m.row,m.col,m.data): 98 | print >>out,'{0},{1},{2}'.format(u+1,i+1,v) 99 | elif fmt == 'mm': 100 | mmwrite(filepath,data) 101 | elif fmt == 'npz': 102 | savez(data.tocoo(),filepath) 103 | elif fmt == 'fsm': 104 | fast_sparse_matrix(data).save(filepath) 105 | else: 106 | raise ValueError('unknown output format: {0}'.format(fmt)) 107 | 108 | def save_recommender(model,filepath): 109 | """ 110 | Save a recommender model to file. 111 | 112 | Parameters 113 | ---------- 114 | model : mrec.base_recommender.BaseRecommender 115 | The recommender to save. 116 | filepath : str 117 | The filepath to write to. 118 | """ 119 | model.save(filepath) 120 | 121 | def load_recommender(filepath): 122 | """ 123 | Load a recommender model from file after it has been saved by 124 | save_recommender(). 125 | 126 | Parameters 127 | ---------- 128 | filepath : str 129 | The filepath to read from. 130 | """ 131 | return BaseRecommender.load(filepath) 132 | 133 | def read_recommender_description(filepath): 134 | """ 135 | Read a recommender model description from file after it has 136 | been saved by save_recommender(), without loading all the 137 | associated data into memory. 138 | 139 | Parameters 140 | ---------- 141 | filepath : str 142 | The filepath to read from. 143 | """ 144 | return BaseRecommender.read_recommender_description(filepath) 145 | -------------------------------------------------------------------------------- /mrec/base_recommender.py: -------------------------------------------------------------------------------- 1 | try: 2 | import cPickle as pickle 3 | except ImportError: 4 | import pickle 5 | import numpy as np 6 | from scipy.sparse import csr_matrix 7 | 8 | class BaseRecommender(object): 9 | """ 10 | Minimal interface to be implemented by recommenders, along with 11 | some helper methods. A concrete recommender must implement the 12 | recommend_items() method and should provide its own implementation 13 | of __str__() so that it can be identified when printing results. 
14 | 15 | Notes 16 | ===== 17 | In most cases you should inherit from either 18 | `mrec.mf.recommender.MatrixFactorizationRecommender` or 19 | `mrec.item_similarity.recommender.ItemSimilarityRecommender` 20 | and *not* directly from this class. 21 | 22 | These provide more efficient implementations of save(), load() 23 | and the batch methods to recommend items. 24 | """ 25 | 26 | def recommend_items(self,dataset,u,max_items=10,return_scores=True,item_features=None): 27 | """ 28 | Recommend new items for a user. 29 | 30 | Parameters 31 | ========== 32 | dataset : scipy.sparse.csr_matrix 33 | User-item matrix containing known items. 34 | u : int 35 | Index of user for which to make recommendations. 36 | max_items : int 37 | Maximum number of recommended items to return. 38 | return_scores : bool 39 | If true return a score along with each recommended item. 40 | item_features : array_like, shape = [num_items, num_features] 41 | Optionally supply features for each item in the dataset. 42 | 43 | Returns 44 | ======= 45 | recs : list 46 | List of (idx,score) pairs if return_scores is True, else 47 | just a list of idxs. 48 | """ 49 | raise NotImplementedError('you must implement recommend_items()') 50 | 51 | def fit(self,train,item_features=None): 52 | """ 53 | Train on supplied data. In general you will want to 54 | implement this rather than computing recommendations on 55 | the fly. 56 | 57 | Parameters 58 | ========== 59 | train : scipy.sparse.csr_matrix or mrec.sparse.fast_sparse_matrix, shape = [num_users, num_items] 60 | User-item matrix. 61 | item_features : array_like, shape = [num_items, num_features] 62 | Features for items in training set, required by some recommenders. 63 | """ 64 | raise NotImplementedError('you should implement fit()') 65 | 66 | def save(self,filepath): 67 | """ 68 | Serialize model to file. 69 | 70 | Parameters 71 | ========== 72 | filepath : str 73 | Filepath to write to, which must have the '.npz' suffix. 74 | 75 | Notes 76 | ===== 77 | Internally numpy.savez may be used to serialize the model and 78 | this would add the '.npz' suffix to the supplied filepath if 79 | it were not already present, which would most likely cause errors 80 | in client code. 81 | """ 82 | if not filepath.endswith('.npz'): 83 | raise ValueError('invalid filepath {0}, must have ".npz" suffix'.format(filepath)) 84 | 85 | archive = self._create_archive() 86 | if archive: 87 | np.savez(filepath,**archive) 88 | else: 89 | pickle.dump(self,open(filepath,'w')) 90 | 91 | def _create_archive(self): 92 | """ 93 | Optionally return a dict of fields to be serialized 94 | in a numpy archive: this lets you store arrays efficiently 95 | by separating them from the model itself. 96 | 97 | Returns 98 | ======= 99 | archive : dict 100 | Fields to serialize, must include the model itself 101 | under the key 'model'. 102 | """ 103 | pass 104 | 105 | @staticmethod 106 | def load(filepath): 107 | """ 108 | Load a recommender model from file after it has been serialized with 109 | save(). 110 | 111 | Parameters 112 | ========== 113 | filepath : str 114 | The filepath to read from. 115 | """ 116 | r = np.load(filepath) 117 | if isinstance(r,BaseRecommender): 118 | model = r 119 | else: 120 | model = np.loads(str(r['model'])) 121 | model._load_archive(r) # restore any fields serialized separately 122 | return model 123 | 124 | def _load_archive(archive): 125 | """ 126 | Load fields from a numpy archive. 
127 | 128 | Notes 129 | ===== 130 | This is called by the static load() method and should be used 131 | to restore the fields returned by _create_archive(). 132 | """ 133 | pass 134 | 135 | @staticmethod 136 | def read_recommender_description(filepath): 137 | """ 138 | Read a recommender model description from file after it has 139 | been saved by save(), without loading any additional 140 | associated data into memory. 141 | 142 | Parameters 143 | ========== 144 | filepath : str 145 | The filepath to read from. 146 | """ 147 | r = np.load(filepath,mmap_mode='r') 148 | if isinstance(r,BaseRecommender): 149 | model = r 150 | else: 151 | model = np.loads(str(r['model'])) 152 | return str(model) 153 | 154 | def __str__(self): 155 | if hasattr(self,'description'): 156 | return self.description 157 | return 'unspecified recommender: you should set self.description or implement __str__()' 158 | 159 | def batch_recommend_items(self, 160 | dataset, 161 | max_items=10, 162 | return_scores=True, 163 | show_progress=False, 164 | item_features=None): 165 | """ 166 | Recommend new items for all users in the training dataset. 167 | 168 | Parameters 169 | ========== 170 | dataset : scipy.sparse.csr_matrix 171 | User-item matrix containing known items. 172 | max_items : int 173 | Maximum number of recommended items to return. 174 | return_scores : bool 175 | If true return a score along with each recommended item. 176 | show_progress: bool 177 | If true print something to stdout to show progress. 178 | item_features : array_like, shape = [num_items, num_features] 179 | Optionally supply features for each item in the dataset. 180 | 181 | Returns 182 | ======= 183 | recs : list of lists 184 | Each entry is a list of (idx,score) pairs if return_scores is True, 185 | else just a list of idxs. 186 | 187 | Notes 188 | ===== 189 | This provides a default implementation, you will be able to optimize 190 | this for most recommenders. 191 | """ 192 | recs = [] 193 | for u in xrange(self.num_users): 194 | if show_progress and u%1000 == 0: 195 | print u,'..', 196 | recs.append(self.recommend_items(dataset,u,max_items,return_scores)) 197 | if show_progress: 198 | print 199 | return recs 200 | 201 | def range_recommend_items(self, 202 | dataset, 203 | user_start, 204 | user_end, 205 | max_items=10, 206 | return_scores=True, 207 | item_features=None): 208 | """ 209 | Recommend new items for a range of users in the training dataset. 210 | 211 | Parameters 212 | ========== 213 | dataset : scipy.sparse.csr_matrix 214 | User-item matrix containing known items. 215 | user_start : int 216 | Index of first user in the range to recommend. 217 | user_end : int 218 | Index one beyond last user in the range to recommend. 219 | max_items : int 220 | Maximum number of recommended items to return. 221 | return_scores : bool 222 | If true return a score along with each recommended item. 223 | item_features : array_like, shape = [num_items, num_features] 224 | Optionally supply features for each item in the dataset. 225 | 226 | Returns 227 | ======= 228 | recs : list of lists 229 | Each entry is a list of (idx,score) pairs if return_scores is True, 230 | else just a list of idxs. 231 | 232 | Notes 233 | ===== 234 | This provides a default implementation, you will be able to optimize 235 | this for most recommenders. 
236 | """ 237 | return [self.recommend_items(dataset,u,max_items,return_scores) for u in xrange(user_start,user_end)] 238 | 239 | def _zero_known_item_scores(self,r,train): 240 | """ 241 | Helper function to set predicted scores/ratings for training items 242 | to zero or less, to avoid recommending already known items. 243 | 244 | Parameters 245 | ========== 246 | r : numpy.ndarray or scipy.sparse.csr_matrix 247 | Predicted scores/ratings. 248 | train : scipy.sparse.csr_matrix 249 | The training user-item matrix, which can include zero-valued entries. 250 | 251 | Returns 252 | ======= 253 | r_safe : scipy.sparse.csr_matrix 254 | r_safe is equal to r except that r[u,i] <= 0 for all u,i with entries 255 | in train. 256 | """ 257 | col = train.indices 258 | if isinstance(r,csr_matrix): 259 | max_score = r.data.max() 260 | else: 261 | max_score = r.max() 262 | data = max_score * np.ones(col.shape) 263 | # build up the row (user) indices 264 | # - we can't just use row,col = train.nonzero() as this eliminates 265 | # u,i for which train[u,i] has been explicitly set to zero 266 | row = np.zeros(col.shape) 267 | for u in xrange(train.shape[0]): 268 | start,end = train.indptr[u],train.indptr[u+1] 269 | if end > start: 270 | row[start:end] = u 271 | return r - csr_matrix((data,(row,col)),shape=r.shape) 272 | 273 | -------------------------------------------------------------------------------- /mrec/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | class Evaluator(object): 2 | """ 3 | Compute metrics for recommendations that have been written to file. 4 | 5 | Parameters 6 | ---------- 7 | compute_metrics : function(list,list) 8 | The evaluation function which should accept two lists of predicted 9 | and actual item indices. 10 | max_items : int 11 | The number of recommendations needed to compute the evaluation function. 12 | """ 13 | 14 | def __init__(self,compute_metrics,max_items): 15 | self.compute_metrics = compute_metrics 16 | self.max_items = max_items 17 | 18 | def _add_metrics(self,predicted,actual): 19 | metrics = self.compute_metrics(predicted,actual) 20 | if metrics: 21 | for m,val in metrics.iteritems(): 22 | self.cum_metrics[m] += val 23 | self.count += 1 24 | 25 | def process(self,testdata,recsfile,start,end,offset=1): 26 | """ 27 | Parameters 28 | ---------- 29 | testdata : scipy sparse matrix 30 | The test items for each user. 31 | recsfile : str 32 | Filepath to the recommendations. The file should contain TSV 33 | of the form: user, item, score. IMPORTANT: the recommendations must 34 | be sorted by user and score. 35 | start : int 36 | First user to evaluate. 37 | end: int 38 | One after the last user to evaluate. 39 | offset : int 40 | Index offset for users and items in recommendations file. 41 | 42 | Returns 43 | ------- 44 | cum_metrics : dict 45 | Aggregated metrics i.e. total values for all users. 46 | count : int 47 | The number of users for whom metrics were computed. 
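        Examples
        --------
        Typical usage, mirroring ``mrec.examples.evaluate`` (``testdata`` is assumed
        to be a csr_matrix of test items and ``recs.tsv`` a recommendations file
        sorted by user and score):

        >>> from mrec.evaluation.metrics import compute_main_metrics
        >>> evaluator = Evaluator(compute_main_metrics, max_items=20)
        >>> cum_metrics, count = evaluator.process(testdata, 'recs.tsv', 0, testdata.shape[0])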
48 | """ 49 | from collections import defaultdict 50 | 51 | self.cum_metrics = defaultdict(float) 52 | self.count = 0 53 | 54 | last_user = start 55 | recs = [] 56 | for line in open(recsfile): 57 | user,item,score = line.strip().split('\t') 58 | user = int(user)-1 # convert to 0-indxed 59 | item = int(item)-1 60 | if user >= end: 61 | break 62 | if user < start: 63 | continue 64 | if user != last_user: 65 | self._add_metrics(recs,testdata[last_user,:].indices.tolist()) 66 | last_user = user 67 | recs = [] 68 | if len(recs) < self.max_items: 69 | recs.append(item) 70 | self._add_metrics(recs,testdata[last_user,:].indices.tolist()) 71 | 72 | return self.cum_metrics,self.count 73 | -------------------------------------------------------------------------------- /mrec/evaluation/metrics.py: -------------------------------------------------------------------------------- 1 | """ 2 | Metrics to evaluate recommendations: 3 | * with hit rate, following e.g. Karypis lab SLIM and FISM papers 4 | * with prec@k and MRR 5 | """ 6 | 7 | import numpy as np 8 | from scipy import stats 9 | from collections import defaultdict 10 | 11 | # classes to access known items for each test user 12 | 13 | class get_known_items_from_dict(object): 14 | 15 | def __init__(self,data): 16 | self.data = data 17 | 18 | def __call__(self,u): 19 | return self.data[u] 20 | 21 | class get_known_items_from_csr_matrix(object): 22 | 23 | def __init__(self,data): 24 | self.data = data 25 | 26 | def __call__(self,u): 27 | return self.data[u].indices 28 | 29 | class get_known_items_from_thresholded_csr_matrix(object): 30 | 31 | def __init__(self,data,min_value): 32 | self.data = data 33 | self.min_value = min_value 34 | 35 | def __call__(self,u): 36 | items = self.data[u].toarray().flatten() 37 | items[items 1: 82 | val = int(parts[1]) 83 | prefix2val[name].append(val) 84 | else: 85 | prefix2val[name] = [] 86 | for name,vals in prefix2val.iteritems(): 87 | prefix2val[name] = sorted(vals) 88 | ret = [] 89 | for name,vals in sorted(prefix2val.iteritems()): 90 | if vals: 91 | for val in vals: 92 | ret.append('{0}@{1}'.format(name,val)) 93 | else: 94 | ret.append(name) 95 | return ret 96 | 97 | def print_report(models,metrics): 98 | """ 99 | Call this to print out the metrics returned by run_evaluation(). 
    """
100 | """ 101 | for model,results in zip(models,metrics): 102 | print model 103 | if hasattr(model,'similarity_matrix'): 104 | nnz = model.similarity_matrix.nnz 105 | num_items = model.similarity_matrix.shape[0] 106 | density = float(model.similarity_matrix.nnz)/num_items**2 107 | print 'similarity matrix nnz = {0} (density {1:.3f})'.format(nnz,density) 108 | for m in sort_metrics_by_name(results.keys()): 109 | vals = results[m] 110 | print '{0}{1:.4f} +/- {2:.4f}'.format(m.ljust(15),np.mean(vals),stats.sem(vals,ddof=0)) 111 | 112 | def evaluate(model,train,users,get_known_items,compute_metrics): 113 | avg_metrics = defaultdict(float) 114 | count = 0 115 | for u in users: 116 | recommended = [r for r,_ in model.recommend_items(train,u,max_items=20)] 117 | metrics = compute_metrics(recommended,get_known_items(u)) 118 | if metrics: 119 | for m,val in metrics.iteritems(): 120 | avg_metrics[m] += val 121 | count += 1 122 | for m in avg_metrics: 123 | avg_metrics[m] /= float(count) 124 | return avg_metrics 125 | 126 | # collections of metrics 127 | 128 | def compute_main_metrics(recommended,known): 129 | if not known: 130 | return None 131 | return {'prec@5':prec(recommended,known,5), 132 | 'prec@10':prec(recommended,known,10), 133 | 'prec@15':prec(recommended,known,15), 134 | 'prec@20':prec(recommended,known,20), 135 | 'mrr':rr(recommended,known)} 136 | 137 | def compute_hit_rate(recommended,known): 138 | if not known: 139 | return None 140 | return {'hit rate@10':hit_rate(recommended,known,10)} 141 | 142 | # individual metrics 143 | 144 | def prec(predicted,true,k,ignore_missing=False): 145 | """ 146 | Compute precision@k. 147 | 148 | Parameters 149 | ========== 150 | predicted : array like 151 | Predicted items. 152 | true : array like 153 | True items. 154 | k : int 155 | Measure precision@k. 156 | ignore_missing : boolean (default: False) 157 | If True then measure precision only up to rank len(predicted) 158 | even if this is less than k, otherwise assume that the missing 159 | predictions were all incorrect 160 | 161 | Returns 162 | ======= 163 | prec@k : float 164 | Precision at k. 165 | """ 166 | if len(predicted) == 0: 167 | return 0 168 | correct = len(set(predicted[:k]).intersection(set(true))) 169 | num_predicted = k 170 | if len(predicted) < k and ignore_missing: 171 | num_predicted = len(predicted) 172 | return float(correct)/num_predicted 173 | 174 | def hit_rate(predicted,true,k): 175 | """ 176 | Compute hit rate i.e. recall@k assume a single test item. 177 | 178 | Parameters 179 | ========== 180 | predicted : array like 181 | Predicted items. 182 | true : array like 183 | Containing the single true test item. 184 | k : int 185 | Measure hit rate@k. 186 | 187 | Returns 188 | ======= 189 | hitrate : int 190 | 1 if true is amongst predicted, 0 if not. 191 | """ 192 | if len(true) != 1: 193 | raise ValueError('can only evaluate hit rate for exactly 1 true item') 194 | return int(true[0] in predicted[:k]) 195 | 196 | def rr(predicted,true): 197 | """ 198 | Compute Reciprocal Rank. 199 | 200 | Parameters 201 | ========== 202 | predicted : array like 203 | Predicted items. 204 | true : array like 205 | True items. 206 | 207 | Returns 208 | ======= 209 | rr : float 210 | Reciprocal of rank at which first true item is found in predicted. 211 | 212 | Notes 213 | ===== 214 | We'll under report this as our predictions are truncated. 
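    Examples
    ========
    A couple of quick illustrations (ranks are 1-based):

    >>> rr([5, 8, 6, 7], [2, 8, 6, 4])   # first true item appears at rank 2
    0.5
    >>> rr([5, 7], [2, 8, 6, 4])         # no true item found at all
    0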
215 | """ 216 | for i,x in enumerate(predicted): 217 | if x in true: 218 | return 1.0/(i+1) 219 | return 0 220 | -------------------------------------------------------------------------------- /mrec/evaluation/preprocessing.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | class TSVParser(object): 4 | """ 5 | Parses tsv input: user, item, score. 6 | 7 | Parameters 8 | ---------- 9 | thresh : float (default: 0) 10 | Set scores below this to zero. 11 | binarize : bool (default: False) 12 | If True, set all non-zero scores to 1. 13 | """ 14 | 15 | def __init__(self,thresh=0,binarize=False,delimiter='\t'): 16 | self.thresh = thresh 17 | self.binarize = binarize 18 | self.delimiter = delimiter 19 | 20 | def parse(self,line): 21 | parts = line.strip().split(self.delimiter) 22 | user,item,count = parts[:3] 23 | val = float(count) 24 | if val >= self.thresh: 25 | if self.binarize: 26 | val = 1 27 | else: 28 | val = 0 29 | return int(user),(int(item),val) 30 | 31 | class SplitCreator(object): 32 | """ 33 | Split ratings for a user randomly into train 34 | and test groups. Only items with positive scores 35 | will be included in the test group. 36 | 37 | Parameters 38 | ---------- 39 | test_size : float 40 | If test_size >= 1 this specifies the absolute number 41 | of items to put in the test group; if test_size < 1 42 | then this specifies the test proportion. 43 | normalize : bool (default: False) 44 | If True, scale training scores for each user to have unit norm. 45 | discard_zeros : bool (default: False) 46 | If True then discard items with zero scores, if 47 | False then retain them in the training group.This 48 | should normally be False as such items have been seen 49 | (if not liked) and so the training set should include 50 | them so that it can be used to determine which items 51 | are actually novel at recommendation time. 52 | sample_before_thresholding : bool (default: False) 53 | If True then consider any item seen by the user for 54 | inclusion in the test group, even though only items 55 | with positive scrore will be selected. If the input 56 | includes items with zero scores this means that the 57 | test set may be smaller than the requested size for 58 | some users, even though they have apparently seen 59 | enough items. 
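    Examples
    --------
    A small illustration with four positively-scored items and ``test_size=0.5``;
    which items end up where is random, but the group sizes are deterministic:

    >>> sc = SplitCreator(test_size=0.5)
    >>> train, test = sc.handle([(1, 5.0), (2, 4.0), (3, 3.0), (4, 2.0)])
    >>> len(train), len(test)
    (2, 2)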
60 | """ 61 | 62 | def __init__(self,test_size,normalize=False,discard_zeros=False,sample_before_thresholding=False): 63 | self.test_size = test_size 64 | self.normalize = normalize 65 | self.discard_zeros = discard_zeros 66 | self.sample_before_thresholding = sample_before_thresholding 67 | 68 | def handle(self,vals): 69 | if self.sample_before_thresholding: 70 | train,test = self.split(vals) 71 | else: 72 | train,test = self.stratified_split(vals) 73 | train = [(v,c) for v,c in train if not self.discard_zeros or c > 0] 74 | test = [(v,c) for v,c in test if c > 0] 75 | if self.normalize: 76 | norm = sum(c*c for v,c in train)**0.5 77 | if norm > 0: 78 | train = [(v,c/norm) for v,c in train] 79 | return train,test 80 | 81 | def pos_neg_vals(self,vals): 82 | vals = list(vals) 83 | pos = [(v,c) for v,c in vals if c > 0] 84 | neg = [(v,0) for v,c in vals if c == 0] 85 | return pos,neg 86 | 87 | def split(self,vals): 88 | random.shuffle(vals) 89 | num_train = self.num_train(vals) 90 | return vals[:num_train],vals[num_train:] 91 | 92 | def stratified_split(self,vals): 93 | pos,neg = self.pos_neg_vals(vals) 94 | random.shuffle(pos) 95 | train = pos[:self.num_train(pos)] 96 | if not self.discard_zeros: 97 | random.shuffle(neg) 98 | train.extend(neg[:self.num_train(neg)]) 99 | random.shuffle(train) 100 | test = pos[self.num_train(pos):] 101 | return train,test 102 | 103 | def num_train(self,vals): 104 | if self.test_size >= 1: 105 | return len(vals)-self.test_size 106 | return int(len(vals)*(1.0-self.test_size)) 107 | -------------------------------------------------------------------------------- /mrec/evaluation/tests/test_metrics.py: -------------------------------------------------------------------------------- 1 | from sklearn.utils.testing import assert_equal 2 | from sklearn.utils.testing import assert_raises 3 | 4 | from mrec.evaluation import metrics 5 | 6 | def test_sort_metrics_by_name(): 7 | names = ['recall@10','z-score','auc','recall@5'] 8 | expected = ['auc','recall@5','recall@10','z-score'] 9 | assert_equal(expected,metrics.sort_metrics_by_name(names)) 10 | 11 | def test_prec(): 12 | true = [2,8,6,4] 13 | predicted = [6,5,8,7] 14 | expected = [1,0.5,2./3.,0.5] 15 | for k in xrange(1,5): 16 | assert_equal(metrics.prec([],true,k),0) 17 | assert_equal(metrics.prec(true,true,k),1) 18 | assert_equal(metrics.prec(predicted,true,k),expected[k-1]) 19 | assert_equal(metrics.prec(true,true,5),0.8) 20 | assert_equal(metrics.prec(true,true,5,ignore_missing=True),1) 21 | assert_equal(metrics.prec(predicted,true,5),0.4) 22 | assert_equal(metrics.prec(predicted,true,5,ignore_missing=True),expected[3]) 23 | 24 | def test_hit_rate(): 25 | predicted = [6,5,8,7] 26 | for true in [[],[2,8]]: 27 | for k in xrange(1,5): 28 | with assert_raises(ValueError): 29 | metrics.hit_rate(predicted,true,k) 30 | true = [5] 31 | expected = [0,1,1,1] 32 | for k in xrange(1,5): 33 | assert_equal(metrics.hit_rate(predicted,true,k),expected[k-1]) 34 | 35 | def test_rr(): 36 | true = [2,8,6,4] 37 | predicted = [5,7,6,8] 38 | expected = [0,0,1./3.,1./3.] 
39 | for k in xrange(1,5): 40 | assert_equal(metrics.rr(predicted[:k],true),expected[k-1]) 41 | -------------------------------------------------------------------------------- /mrec/examples/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Mendeley/mrec/d299e3b9490703843b041e6585643b7e42e229f0/mrec/examples/__init__.py -------------------------------------------------------------------------------- /mrec/examples/convert.py: -------------------------------------------------------------------------------- 1 | """ 2 | Convert sparse matrix from one file format to another. 3 | """ 4 | 5 | import os 6 | import subprocess 7 | 8 | def tsv2mtx(infile,outfile): 9 | num_users,num_items,nnz = 0,0,0 10 | for line in open(infile): 11 | u,i,v = line.strip().split() 12 | u = int(u) 13 | i = int(i) 14 | if u > num_users: 15 | num_users = u 16 | if i > num_items: 17 | num_items = i 18 | nnz += 1 19 | headerfile = outfile+'.header' 20 | with open(headerfile,'w') as header: 21 | print >>header,'%%MatrixMarket matrix coordinate real general' 22 | print >>header,'{0} {1} {2}'.format(num_users,num_items,nnz) 23 | subprocess.check_call(['cat',headerfile,infile],stdout=open(outfile,'w')) 24 | subprocess.check_call(['rm',headerfile]) 25 | 26 | def main(): 27 | from optparse import OptionParser 28 | 29 | from mrec import load_sparse_matrix, save_sparse_matrix 30 | 31 | parser = OptionParser() 32 | parser.add_option('--input_format',dest='input_format',help='format of input dataset tsv | csv | mm (matrixmarket) | csr (scipy.sparse.csr_matrix) | fsm (mrec.sparse.fast_sparse_matrix)') 33 | parser.add_option('--input',dest='input',help='filepath to input') 34 | parser.add_option('--output_format',dest='output_format',help='format of output dataset(s) tsv | csv | mm (matrixmarket) | csr (scipy.sparse.csr_matrix) | fsm (mrec.sparse.fast_sparse_matrix)') 35 | parser.add_option('--output',dest='output',help='filepath for output') 36 | 37 | (opts,args) = parser.parse_args() 38 | if not opts.input or not opts.output or not opts.input_format or not opts.output_format: 39 | parser.print_help() 40 | raise SystemExit 41 | 42 | if opts.output_format == opts.input_format: 43 | raise SystemExit('input and output format are the same, not doing anything') 44 | 45 | if opts.input_format == 'tsv' and opts.output_format == 'mm': 46 | # we can do this without loading the data 47 | tsv2mtx(opts.input,opts.output) 48 | else: 49 | data = load_sparse_matrix(opts.input_format,opts.input) 50 | save_sparse_matrix(data,opts.output_format,opts.output) 51 | 52 | if __name__ == '__main__': 53 | main() 54 | 55 | -------------------------------------------------------------------------------- /mrec/examples/evaluate.py: -------------------------------------------------------------------------------- 1 | """ 2 | Evaluate precomputed recommendations for one or more training/test sets. 3 | Test and recommendation files must following naming conventions relative 4 | to the training filepaths. 
5 | """ 6 | 7 | def main(): 8 | 9 | import os 10 | import logging 11 | import glob 12 | from optparse import OptionParser 13 | from collections import defaultdict 14 | 15 | from mrec import load_sparse_matrix 16 | from mrec.evaluation.metrics import compute_main_metrics, compute_hit_rate 17 | from mrec.evaluation import Evaluator 18 | from mrec.evaluation.metrics import print_report 19 | from filename_conventions import get_testfile, get_recsfile 20 | 21 | logging.basicConfig(level=logging.INFO,format='[%(asctime)s] %(levelname)s: %(message)s') 22 | 23 | parser = OptionParser() 24 | parser.add_option('--input_format',dest='input_format',help='format of training dataset(s) tsv | csv | mm (matrixmarket) | fsm (fast_sparse_matrix)') 25 | parser.add_option('--test_input_format',dest='test_input_format',default='npz',help='format of test dataset(s) tsv | csv | mm (matrixmarket) | npz (numpy binary) (default: %default)') 26 | parser.add_option('--train',dest='train',help='glob specifying path(s) to training dataset(s) IMPORTANT: must be in quotes if it includes the * wildcard') 27 | parser.add_option('--recsdir',dest='recsdir',help='directory containing tsv files of precomputed recommendations') 28 | parser.add_option('--metrics',dest='metrics',default='main',help='which set of metrics to compute, main|hitrate (default: %default)') 29 | parser.add_option('--description',dest='description',help='description of model which generated the recommendations') 30 | metrics_funcs = {'main':compute_main_metrics, 31 | 'hitrate':compute_hit_rate} 32 | 33 | (opts,args) = parser.parse_args() 34 | if not opts.input_format or not opts.train or not opts.recsdir \ 35 | or opts.metrics not in metrics_funcs: 36 | parser.print_help() 37 | raise SystemExit 38 | 39 | opts.train = os.path.abspath(os.path.expanduser(opts.train)) 40 | opts.recsdir = os.path.abspath(os.path.expanduser(opts.recsdir)) 41 | 42 | evaluator = Evaluator(metrics_funcs[opts.metrics],max_items=20) 43 | 44 | trainfiles = glob.glob(opts.train) 45 | 46 | all_metrics = defaultdict(list) 47 | for trainfile in trainfiles: 48 | logging.info('processing {0}...'.format(trainfile)) 49 | testfile = get_testfile(trainfile) 50 | recsfile = get_recsfile(trainfile,opts.recsdir) 51 | testdata = load_sparse_matrix(opts.test_input_format,testfile).tocsr() 52 | cum_metrics,count = evaluator.process(testdata,recsfile,0,testdata.shape[0]) 53 | if cum_metrics is not None: 54 | for m in cum_metrics: 55 | all_metrics[m].append(float(cum_metrics[m])/count) 56 | 57 | print_report([opts.description],[all_metrics]) 58 | 59 | if __name__ == '__main__': 60 | main() 61 | -------------------------------------------------------------------------------- /mrec/examples/factors.py: -------------------------------------------------------------------------------- 1 | """ 2 | Postprocess externally computed user/item factors so we can make 3 | and evaluation recommendations with mrec scripts. 
4 | """ 5 | 6 | def main(): 7 | 8 | import os 9 | import logging 10 | import subprocess 11 | from optparse import OptionParser 12 | import numpy as np 13 | from scipy.io import mmread 14 | 15 | from mrec import save_recommender 16 | from mrec.mf.recommender import MatrixFactorizationRecommender 17 | from filename_conventions import get_modelfile 18 | 19 | logging.basicConfig(level=logging.INFO,format='[%(asctime)s] %(levelname)s: %(message)s') 20 | 21 | parser = OptionParser() 22 | parser.add_option('--factor_format',dest='factor_format',help='format of factor files tsv | mm (matrixmarket) | npy (numpy array)') 23 | parser.add_option('--user_factors',dest='user_factors',help='user factors filepath') 24 | parser.add_option('--item_factors',dest='item_factors',help='item factors filepath') 25 | parser.add_option('--train',dest='train',help='filepath to training data, just used to apply naming convention to output model saved here') 26 | parser.add_option('--outdir',dest='outdir',help='directory for output') 27 | parser.add_option('--description',dest='description',help='optional description of how factors were computed, will be saved with model so it can be output with evaluation results') 28 | 29 | (opts,args) = parser.parse_args() 30 | if not opts.factor_format or not opts.user_factors or not opts.item_factors \ 31 | or not opts.outdir: 32 | parser.print_help() 33 | raise SystemExit 34 | 35 | model = MatrixFactorizationRecommender() 36 | 37 | logging.info('loading factors...') 38 | 39 | if opts.factor_format == 'npy': 40 | model.U = np.load(opts.user_factors) 41 | model.V = np.load(opts.item_factors) 42 | elif opts.factor_format == 'mm': 43 | model.U = mmread(opts.user_factors) 44 | model.V = mmread(opts.item_factors) 45 | elif opts.factor_format == 'tsv': 46 | model.U = np.loadtxt(opts.user_factors) 47 | model.V = np.loadtxt(opts.item_factors) 48 | else: 49 | raise ValueError('unknown factor format: {0}'.format(factor_format)) 50 | 51 | if opts.description: 52 | model.description = opts.description 53 | 54 | logging.info('saving model...') 55 | 56 | logging.info('creating output directory {0}...'.format(opts.outdir)) 57 | subprocess.check_call(['mkdir','-p',opts.outdir]) 58 | 59 | modelfile = get_modelfile(opts.train,opts.outdir) 60 | save_recommender(model,modelfile) 61 | 62 | logging.info('done') 63 | 64 | if __name__ == '__main__': 65 | main() 66 | -------------------------------------------------------------------------------- /mrec/examples/filename_conventions.py: -------------------------------------------------------------------------------- 1 | """ 2 | File naming conventions: 3 | 4 | * training files must contain 'train' in their filename. 5 | * the corresponding test files must have the same filepaths, 6 | but with 'test' in place of 'train' in their filenames. 7 | * models, similarity matrices and recommendations will be 8 | written to filenames based on the training file. 
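For example (paths here assume a POSIX filesystem):

>>> get_testfile('splits/u.data.train.0')
'splits/u.data.test.0'
>>> get_modelfile('splits/u.data.train.0', 'models')
'models/u.data.train.0.model.npz'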
9 | """ 10 | 11 | import os 12 | 13 | def get_testfile(trainfile): 14 | filename = os.path.basename(trainfile) 15 | return os.path.join(os.path.dirname(trainfile),filename.replace('train','test')) 16 | 17 | def get_simsdir(trainfile,outdir): 18 | filename = os.path.basename(trainfile) 19 | return os.path.join(outdir,'{0}-sims'.format(filename)) 20 | 21 | def get_recsdir(trainfile,outdir): 22 | filename = os.path.basename(trainfile) 23 | return os.path.join(outdir,'{0}-recs'.format(filename)) 24 | 25 | def get_modelsdir(trainfile,outdir): 26 | filename = os.path.basename(trainfile) 27 | return os.path.join(outdir,'{0}-models'.format(filename)) 28 | 29 | def get_factorsdir(trainfile,outdir): 30 | filename = os.path.basename(trainfile) 31 | return os.path.join(outdir,'{0}-factors'.format(filename)) 32 | 33 | def get_simsfile(trainfile,outdir): 34 | filename = os.path.basename(trainfile) 35 | return os.path.join(outdir,'{0}.sims.tsv'.format(filename)) 36 | 37 | def get_recsfile(trainfile,outdir): 38 | filename = os.path.basename(trainfile) 39 | return os.path.join(outdir,'{0}.recs.tsv'.format(filename)) 40 | 41 | def get_modelfile(trainfile,outdir): 42 | filename = os.path.basename(trainfile) 43 | return os.path.join(outdir,'{0}.model.npz'.format(filename)) 44 | 45 | def get_sortedfile(infile,outdir): 46 | filename = os.path.basename(infile) 47 | return os.path.join(outdir,'{0}.sorted'.format(filename)) 48 | 49 | def get_splitfile(infile,outdir,split_type,i): 50 | filename = os.path.basename(infile) 51 | return os.path.join(outdir,'{0}.{1}.{2}'.format(filename,split_type,i)) 52 | -------------------------------------------------------------------------------- /mrec/examples/prepare.py: -------------------------------------------------------------------------------- 1 | class Processor(object): 2 | 3 | def __init__(self,splitter,parser,min_items_per_user,preprocess=None): 4 | self.splitter = splitter 5 | self.parser = parser 6 | self.min_items_per_user = min_items_per_user 7 | self.preprocess = preprocess 8 | 9 | def output(self,user,vals,outfile): 10 | for v,c in vals: 11 | print >>outfile,'{0}\t{1}\t{2}'.format(user,v,c) 12 | 13 | def handle(self,user,vals): 14 | if len(vals) >= self.min_items_per_user: 15 | if self.preprocess is not None: 16 | vals = self.preprocess(vals) 17 | train,test = self.splitter.handle(vals) 18 | self.output(user,train,self.train_out) 19 | self.output(user,test,self.test_out) 20 | else: 21 | self.too_few_items += 1 22 | 23 | def create_split(self,infile,train_out,test_out): 24 | self.train_out = train_out 25 | self.test_out = test_out 26 | self.too_few_items = 0 27 | last_user = None 28 | vals = [] 29 | for line in infile: 30 | user,val = self.parser.parse(line) 31 | if user != last_user: 32 | if last_user is not None: 33 | self.handle(last_user,vals) 34 | last_user = user 35 | vals = [] 36 | vals.append(val) 37 | self.handle(last_user,vals) 38 | 39 | def get_too_few_items(self): 40 | return self.too_few_items 41 | 42 | def main(): 43 | import os 44 | import logging 45 | import subprocess 46 | from optparse import OptionParser 47 | 48 | from mrec.evaluation.preprocessing import TSVParser, SplitCreator 49 | from filename_conventions import get_sortedfile, get_splitfile 50 | 51 | logging.basicConfig(level=logging.INFO,format='[%(asctime)s] %(levelname)s: %(message)s') 52 | 53 | parser = OptionParser() 54 | parser.add_option('--dataset',dest='dataset',help='path to input dataset in tsv format') 55 | 
parser.add_option('--delimiter',dest='delimiter',default='\t',help='input delimiter (default: tab)') 56 | parser.add_option('--outdir',dest='outdir',help='directory for output files') 57 | parser.add_option('--num_splits',dest='num_splits',type='int',default=5,help='number of train/test splits to create (default: %default)') 58 | parser.add_option('--min_items_per_user',dest='min_items_per_user',type='int',default=10,help='skip users with less than this number of ratings (default: %default)') 59 | parser.add_option('--binarize',dest='binarize',action='store_true',default=False,help='binarize ratings') 60 | parser.add_option('--normalize',dest='normalize',action='store_true',help='scale training ratings to unit norm') 61 | parser.add_option('--rating_thresh',dest='rating_thresh',type='float',default=0,help='treat ratings below this as zero (default: %default)') 62 | parser.add_option('--test_size',dest='test_size',type='float',default=0.5,help='target number of test items for each user, if test_size >= 1 treat as an absolute number, otherwise treat as a fraction of the total items (default: %default)') 63 | parser.add_option('--discard_zeros',dest='discard_zeros',action='store_true',help='discard zero training ratings after thresholding (not recommended, incompatible with using training items to guarantee that recommendations are novel)') 64 | parser.add_option('--sample_before_thresholding',dest='sample_before_thresholding',action='store_true',help='choose test items before thresholding ratings (not recommended, test items below threshold will then be discarded)') 65 | 66 | (opts,args) = parser.parse_args() 67 | if not opts.dataset or not opts.outdir: 68 | parser.print_help() 69 | raise SystemExit 70 | 71 | opts.dataset = os.path.abspath(opts.dataset) 72 | opts.outdir = os.path.abspath(opts.outdir) 73 | 74 | logging.info('sorting input data...') 75 | infile = get_sortedfile(opts.dataset,opts.outdir) 76 | subprocess.check_call(['mkdir','-p',opts.outdir]) 77 | subprocess.check_call(['sort','-k1','-n',opts.dataset],stdout=open(infile,'w')) 78 | 79 | parser = TSVParser(thresh=opts.rating_thresh,binarize=opts.binarize,delimiter=opts.delimiter) 80 | splitter = SplitCreator(test_size=opts.test_size,normalize=opts.normalize,discard_zeros=opts.discard_zeros, 81 | sample_before_thresholding=opts.sample_before_thresholding) 82 | processor = Processor(splitter,parser,opts.min_items_per_user) 83 | 84 | for i in xrange(opts.num_splits): 85 | trainfile = get_splitfile(opts.dataset,opts.outdir,'train',i) 86 | testfile = get_splitfile(opts.dataset,opts.outdir,'test',i) 87 | 88 | logging.info('creating split {0}: {1} {2}'.format(i,trainfile,testfile)) 89 | processor.create_split(open(infile),open(trainfile,'w'),open(testfile,'w')) 90 | 91 | too_few_items = processor.get_too_few_items() 92 | if (too_few_items): 93 | logging.info('skipped {0} users with less than {1} ratings'.format(too_few_items,opts.min_items_per_user)) 94 | 95 | logging.info('cleaning up...') 96 | subprocess.check_call(['rm',infile]) 97 | logging.info('done') 98 | 99 | if __name__ == '__main__': 100 | main() 101 | 102 | -------------------------------------------------------------------------------- /mrec/examples/train.py: -------------------------------------------------------------------------------- 1 | """ 2 | Train an item similarity model in parallel on an ipython cluster. 
3 | We assume a shared filesystem (as you'll have when running locally 4 | or on an AWS cluster fired up with StarCluster) to avoid passing 5 | data between the controller and the worker engines, as this can 6 | cause OOM issues for the controller. 7 | 8 | You can specify multiple training sets and the model will learn a 9 | separate similarity matrix for each input dataset: this makes it 10 | easy to generate data for cross-validated evaluation. 11 | """ 12 | 13 | from filename_conventions import * 14 | 15 | def main(): 16 | 17 | import os 18 | import logging 19 | import glob 20 | import subprocess 21 | from optparse import OptionParser 22 | from IPython.parallel import Client 23 | 24 | from mrec import load_fast_sparse_matrix, save_recommender 25 | from mrec.item_similarity.slim import SLIM 26 | from mrec.item_similarity.knn import CosineKNNRecommender, DotProductKNNRecommender 27 | from mrec.mf.wrmf import WRMFRecommender 28 | from mrec.mf.warp import WARPMFRecommender 29 | from mrec.mf.warp2 import WARP2MFRecommender 30 | from mrec.popularity import ItemPopularityRecommender 31 | from mrec.parallel.item_similarity import ItemSimilarityRunner 32 | from mrec.parallel.wrmf import WRMFRunner 33 | from mrec.parallel.warp import WARPMFRunner 34 | 35 | logging.basicConfig(level=logging.INFO,format='[%(asctime)s] %(levelname)s: %(message)s') 36 | 37 | parser = OptionParser() 38 | parser.add_option('-n','--num_engines',dest='num_engines',type='int',default=0,help='number of IPython engines to use') 39 | parser.add_option('--input_format',dest='input_format',help='format of training dataset(s) tsv | csv | mm (matrixmarket) | fsm (fast_sparse_matrix)') 40 | parser.add_option('--train',dest='train',help='glob specifying path(s) to training dataset(s) IMPORTANT: must be in quotes if it includes the * wildcard') 41 | parser.add_option('--outdir',dest='outdir',help='directory for output files') 42 | parser.add_option('--overwrite',dest='overwrite',action='store_true',help='overwrite existing files in outdir') 43 | parser.add_option('--model',dest='model',default='slim',help='type of model to train: slim | knn | wrmf | warp | popularity (default: %default)') 44 | parser.add_option('--max_sims',dest='max_sims',type='int',default=100,help='max similar items to output for each training item (default: %default)') 45 | parser.add_option('--learner',dest='learner',default='sgd',help='underlying learner for SLIM learner: sgd | elasticnet | fs_sgd (default: %default)') 46 | parser.add_option('--l1_reg',dest='l1_reg',type='float',default=0.001,help='l1 regularization constant (default: %default)') 47 | parser.add_option('--l2_reg',dest='l2_reg',type='float',default=0.0001,help='l2 regularization constant (default: %default)') 48 | parser.add_option('--metric',dest='metric',default='cosine',help='metric for knn recommender: cosine | dot (default: %default)') 49 | parser.add_option('--num_factors',dest='num_factors',type='int',default=80,help='number of latent factors (default: %default)') 50 | parser.add_option('--alpha',dest='alpha',type='float',default=1.0,help='wrmf confidence constant (default: %default)') 51 | parser.add_option('--lbda',dest='lbda',type='float',default=0.015,help='wrmf regularization constant (default: %default)') 52 | parser.add_option('--als_iters',dest='als_iters',type='int',default=15,help='number of als iterations (default: %default)') 53 | parser.add_option('--gamma',dest='gamma',type='float',default=0.01,help='warp learning rate (default: %default)') 54 | 
parser.add_option('--C',dest='C',type='float',default=100.0,help='warp regularization constant (default: %default)') 55 | parser.add_option('--item_feature_format',dest='item_feature_format',help='format of item features tsv | csv | mm (matrixmarket) | npz (numpy arrays)') 56 | parser.add_option('--item_features',dest='item_features',help='path to sparse item features in tsv format (item_id,feature_id,val)') 57 | parser.add_option('--popularity_method',dest='popularity_method',default='count',help='how to compute popularity for baseline recommender: count | sum | avg | thresh (default: %default)') 58 | parser.add_option('--popularity_thresh',dest='popularity_thresh',type='float',default=0,help='ignore scores below this when computing popularity for baseline recommender (default: %default)') 59 | parser.add_option('--packer',dest='packer',default='json',help='packer for IPython.parallel (default: %default)') 60 | parser.add_option('--add_module_paths',dest='add_module_paths',help='optional comma-separated list of paths to append to pythonpath (useful if you need to import uninstalled modules to IPython engines on a cluster)') 61 | 62 | (opts,args) = parser.parse_args() 63 | if not opts.input_format or not opts.train or not opts.outdir or not opts.num_engines: 64 | parser.print_help() 65 | raise SystemExit 66 | 67 | opts.train = os.path.abspath(os.path.expanduser(opts.train)) 68 | opts.outdir = os.path.abspath(os.path.expanduser(opts.outdir)) 69 | 70 | trainfiles = glob.glob(opts.train) 71 | 72 | if opts.model == 'popularity': 73 | # special case, don't need to run in parallel 74 | subprocess.check_call(['mkdir','-p',opts.outdir]) 75 | for trainfile in trainfiles: 76 | logging.info('processing {0}...'.format(trainfile)) 77 | model = ItemPopularityRecommender(method=opts.popularity_method,thresh=opts.popularity_thresh) 78 | dataset = load_fast_sparse_matrix(opts.input_format,trainfile) 79 | model.fit(dataset) 80 | modelfile = get_modelfile(trainfile,opts.outdir) 81 | save_recommender(model,modelfile) 82 | logging.info('done') 83 | return 84 | 85 | # create an ipython client 86 | c = Client(packer=opts.packer) 87 | view = c.load_balanced_view() 88 | 89 | if opts.add_module_paths: 90 | c[:].execute('import sys') 91 | for path in opts.add_module_paths.split(','): 92 | logging.info('adding {0} to pythonpath on all engines'.format(path)) 93 | c[:].execute("sys.path.append('{0}')".format(path)) 94 | 95 | if opts.model == 'slim': 96 | if opts.learner == 'fs_sgd': 97 | num_selected_features = 2 * opts.max_sims # preselect this many candidate similar items 98 | model = SLIM(l1_reg=opts.l1_reg,l2_reg=opts.l2_reg,model=opts.learner,num_selected_features=num_selected_features) 99 | else: 100 | model = SLIM(l1_reg=opts.l1_reg,l2_reg=opts.l2_reg,model=opts.learner) 101 | elif opts.model == 'knn': 102 | if opts.metric == 'cosine': 103 | model = CosineKNNRecommender(k=opts.max_sims) 104 | elif opts.metric == 'dot': 105 | model = DotProductKNNRecommender(k=opts.max_sims) 106 | else: 107 | parser.print_help() 108 | raise SystemExit('unknown metric: {0}'.format(opts.metric)) 109 | elif opts.model == 'wrmf': 110 | model = WRMFRecommender(d=opts.num_factors,alpha=opts.alpha,lbda=opts.lbda,num_iters=opts.als_iters) 111 | elif opts.model == 'warp': 112 | num_factors_per_engine = max(opts.num_factors/opts.num_engines,1) 113 | if opts.item_features: 114 | model = WARP2MFRecommender(d=num_factors_per_engine,gamma=opts.gamma,C=opts.C) 115 | else: 116 | model = 
WARPMFRecommender(d=num_factors_per_engine,gamma=opts.gamma,C=opts.C) 117 | else: 118 | parser.print_help() 119 | raise SystemExit('unknown model type: {0}'.format(opts.model)) 120 | 121 | for trainfile in trainfiles: 122 | logging.info('processing {0}...'.format(trainfile)) 123 | modelfile = get_modelfile(trainfile,opts.outdir) 124 | if opts.model == 'wrmf': 125 | runner = WRMFRunner() 126 | factorsdir = get_factorsdir(trainfile,opts.outdir) 127 | runner.run(view,model,opts.input_format,trainfile,opts.num_engines,factorsdir,modelfile) 128 | elif opts.model == 'warp': 129 | runner = WARPMFRunner() 130 | modelsdir = get_modelsdir(trainfile,opts.outdir) 131 | runner.run(view,model,opts.input_format,trainfile,opts.item_feature_format,opts.item_features,opts.num_engines,modelsdir,opts.overwrite,modelfile) 132 | else: 133 | runner = ItemSimilarityRunner() 134 | simsdir = get_simsdir(trainfile,opts.outdir) 135 | simsfile = get_simsfile(trainfile,opts.outdir) 136 | runner.run(view,model,opts.input_format,trainfile,opts.num_engines,simsdir,opts.overwrite,opts.max_sims,simsfile,modelfile) 137 | 138 | if __name__ == '__main__': 139 | main() 140 | -------------------------------------------------------------------------------- /mrec/examples/tune_slim.py: -------------------------------------------------------------------------------- 1 | """ 2 | Try to find a sensible range for regularization 3 | constants for SLIM by looking at model sparsity. 4 | """ 5 | 6 | import random 7 | from math import log10 8 | import logging 9 | from operator import itemgetter 10 | from optparse import OptionParser 11 | try: 12 | from sklearn.grid_search import ParameterGrid 13 | except ImportError: 14 | from sklearn.grid_search import IterGrid as ParameterGrid 15 | from IPython.parallel import Client 16 | 17 | from mrec import load_fast_sparse_matrix 18 | 19 | def estimate_sparsity(task): 20 | from mrec.item_similarity.slim import SLIM 21 | args,dataset,min_nnz,sample_items = task 22 | model = SLIM(**args) 23 | tot_nnz = 0 24 | tot_neg = 0 25 | below_min_nnz = 0 26 | 27 | for i in sample_items: 28 | w = model.compute_similarities(dataset,i) 29 | nnz = sum(w>0) 30 | tot_nnz += nnz 31 | if nnz < min_nnz: 32 | below_min_nnz += 1 33 | tot_neg += sum(w<0) 34 | 35 | num_samples = len(sample_items) 36 | avg_nnz = float(tot_nnz)/num_samples 37 | too_few_sims = float(below_min_nnz)/num_samples 38 | avg_neg = float(tot_neg)/num_samples 39 | return args,avg_nnz,too_few_sims,avg_neg 40 | 41 | def pow_range(small,big): 42 | return [10**v for v in xrange(int(log10(small)),int(log10(big))+1)] 43 | 44 | def main(): 45 | parser = OptionParser() 46 | parser.add_option('-d','--dataset',dest='dataset',help='path to dataset') 47 | parser.add_option('--input_format',dest='input_format',help='format of training dataset(s) tsv | csv | mm (matrixmarket) | fsm (fast_sparse_matrix)') 48 | parser.add_option('--l1_min',dest='l1_min',type='float',help='min l1 constant to try (expected to be a power of 10)') 49 | parser.add_option('--l1_max',dest='l1_max',type='float',help='max l1 constant to try (expected to be a power of 10)') 50 | parser.add_option('--l2_min',dest='l2_min',type='float',help='min l2 constant to try (expected to be a power of 10)') 51 | parser.add_option('--l2_max',dest='l2_max',type='float',help='max l2 constant to try (expected to be a power of 10)') 52 | parser.add_option('--max_sims',dest='max_sims',type='int',default=2000,help='max desired number of positive item similarity weights (default: %default)') 53 | 
parser.add_option('--min_sims',dest='min_sims',type='int',default=15,help='min desired number of positive item similarity weights (default: %default)') 54 | parser.add_option('--max_sparse',dest='max_sparse',type='float',default=0.01,help='max allowable proportion of items with less than min_sims positive similarity weights (default: %default)') 55 | parser.add_option('--num_samples',dest='num_samples',type='int',default=100,help='number of sample items to evaluate for each regularization setting') 56 | parser.add_option('--packer',dest='packer',default='json',help='packer for IPython.parallel (default: %default)') 57 | parser.add_option('--add_module_paths',dest='add_module_paths',help='comma-separated list of paths to append to pythonpath to enable import of uninstalled modules') 58 | 59 | (opts,args) = parser.parse_args() 60 | if not opts.dataset or not opts.input_format or not opts.l1_min or not opts.l1_max or not opts.l2_min or not opts.l2_max: 61 | parser.print_help() 62 | raise SystemExit 63 | 64 | logging.basicConfig(level=logging.INFO,format='[%(asctime)s] %(levelname)s: %(message)s') 65 | 66 | dataset = load_fast_sparse_matrix(opts.input_format,opts.dataset) 67 | 68 | params = {'l1_reg':pow_range(opts.l1_min,opts.l1_max), 69 | 'l2_reg':pow_range(opts.l2_min,opts.l2_max)} 70 | num_items = dataset.shape[1] 71 | sample_items = random.sample(xrange(num_items),opts.num_samples) 72 | 73 | logging.info('preparing tasks for a grid search of these values:') 74 | logging.info(params) 75 | tasks = [(args,dataset,opts.min_sims,sample_items) for args in ParameterGrid(params)] 76 | 77 | c = Client(packer=opts.packer) 78 | view = c.load_balanced_view() 79 | 80 | if opts.add_module_paths: 81 | c[:].execute('import sys') 82 | for path in opts.add_module_paths.split(','): 83 | logging.info('adding {0} to pythonpath on all engines'.format(path)) 84 | c[:].execute("sys.path.append('{0}')".format(path)) 85 | 86 | logging.info('running {0} tasks in parallel...'.format(len(tasks))) 87 | results = view.map(estimate_sparsity,tasks,ordered=False) 88 | 89 | candidates = [(args,nsims,nsparse,nneg) for args,nsims,nsparse,nneg in results if nsims <= opts.max_sims and nsparse <= opts.max_sparse] 90 | 91 | if candidates: 92 | best = min(candidates,key=itemgetter(1)) 93 | 94 | print 'best parameter setting: {0}'.format(best[0]) 95 | print 'mean # positive similarity weights per item = {0:.3}'.format(best[1]) 96 | print 'proportion of items with fewer than {0} positive similarity weights = {1:.3}'.format(opts.min_sims,best[2]) 97 | print 'mean # negative similarity weights per item = {0:.3}'.format(best[3]) 98 | else: 99 | print 'no parameter settings satisfied the conditions, try increasing --min_sims, --max_sims or --max_sparse' 100 | 101 | if __name__ == '__main__': 102 | main() 103 | -------------------------------------------------------------------------------- /mrec/item_similarity/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Mendeley/mrec/d299e3b9490703843b041e6585643b7e42e229f0/mrec/item_similarity/__init__.py -------------------------------------------------------------------------------- /mrec/item_similarity/knn.py: -------------------------------------------------------------------------------- 1 | """ 2 | Brute-force k-nearest neighbour recommenders 3 | intended to provide evaluation baselines. 
4 | """ 5 | 6 | import numpy as np 7 | from sklearn.metrics.pairwise import cosine_similarity 8 | from recommender import ItemSimilarityRecommender 9 | 10 | class KNNRecommender(ItemSimilarityRecommender): 11 | """ 12 | Abstract base class for k-nn recommenders. You must supply an 13 | implementation of the compute_all_similarities() method. 14 | 15 | Parameters 16 | ========== 17 | k : int 18 | The number of nearest neighbouring items to retain 19 | """ 20 | 21 | def __init__(self,k): 22 | self.k = k 23 | 24 | def compute_similarities(self,dataset,j): 25 | A = dataset.X 26 | a = dataset.fast_get_col(j) 27 | d = self.compute_all_similarities(A,a) 28 | d[j] = 0 # zero out self-similarity 29 | # now zero out similarities for all but top-k items 30 | nn = d.argsort()[-1:-1-self.k:-1] 31 | w = np.zeros(A.shape[1]) 32 | w[nn] = d[nn] 33 | return w 34 | 35 | def compute_all_similarities(self,A,a): 36 | """ 37 | Compute similarity scores between item vector a 38 | and all the rows of A. 39 | 40 | Parameters 41 | ========== 42 | A : scipy.sparse.csr_matrix 43 | Matrix of item vectors. 44 | a : array_like 45 | The item vector to be compared to each row of A. 46 | 47 | Returns 48 | ======= 49 | similarities : numpy.ndarray 50 | Vector of similarity scores. 51 | """ 52 | pass 53 | 54 | class DotProductKNNRecommender(KNNRecommender): 55 | """ 56 | Similarity between two items is their dot product 57 | (i.e. cooccurrence count if input data is binary). 58 | """ 59 | 60 | def compute_all_similarities(self,A,a): 61 | return A.T.dot(a).toarray().flatten() 62 | 63 | def __str__(self): 64 | return 'DotProductKNNRecommender(k={0})'.format(self.k) 65 | 66 | class CosineKNNRecommender(KNNRecommender): 67 | """ 68 | Similarity between two items is their cosine distance. 69 | """ 70 | 71 | def compute_all_similarities(self,A,a): 72 | return cosine_similarity(A.T,a.T).flatten() 73 | 74 | def __str__(self): 75 | return 'CosineKNNRecommender(k={0})'.format(self.k) 76 | 77 | if __name__ == '__main__': 78 | 79 | # use knn models like this: 80 | 81 | import random 82 | import StringIO 83 | from mrec import load_fast_sparse_matrix 84 | 85 | random.seed(0) 86 | 87 | print 'loading test data...' 88 | data = """\ 89 | %%MatrixMarket matrix coordinate real general 90 | 3 5 9 91 | 1 1 1 92 | 1 2 1 93 | 1 3 1 94 | 1 4 1 95 | 2 2 1 96 | 2 3 1 97 | 2 5 1 98 | 3 3 1 99 | 3 4 1 100 | """ 101 | print data 102 | dataset = load_fast_sparse_matrix('mm',StringIO.StringIO(data)) 103 | num_users,num_items = dataset.shape 104 | 105 | model = CosineKNNRecommender(k=2) 106 | 107 | num_samples = 2 108 | 109 | def output(i,j,val): 110 | # convert back to 1-indexed 111 | print '{0}\t{1}\t{2:.3f}'.format(i+1,j+1,val) 112 | 113 | print 'computing some item similarities...' 114 | print 'item\tsim\tweight' 115 | # if we want we can compute these individually without calling fit() 116 | for i in random.sample(xrange(num_items),num_samples): 117 | for j,weight in model.get_similar_items(i,max_similar_items=2,dataset=dataset): 118 | output(i,j,weight) 119 | 120 | print 'learning entire similarity matrix...' 121 | # more usually we just call train() on the entire dataset 122 | model = CosineKNNRecommender(k=2) 123 | model.fit(dataset) 124 | print 'making some recommendations...' 125 | print 'user\trec\tscore' 126 | for u in random.sample(xrange(num_users),num_samples): 127 | for i,score in model.recommend_items(dataset.X,u,max_items=10): 128 | output(u,i,score) 129 | 130 | print 'making batch recommendations...' 
131 | recs = model.batch_recommend_items(dataset.X) 132 | for u in xrange(num_users): 133 | for i,score in recs[u]: 134 | output(u,i,score) 135 | 136 | print 'making range recommendations...' 137 | for start,end in [(0,2),(2,3)]: 138 | recs = model.range_recommend_items(dataset.X,start,end) 139 | for u in xrange(start,end): 140 | for i,score in recs[u-start]: 141 | output(u,i,score) 142 | -------------------------------------------------------------------------------- /mrec/item_similarity/precomputed.py: -------------------------------------------------------------------------------- 1 | """ 2 | Make recommendations from a precomputed item similarity matrix. 3 | """ 4 | 5 | from recommender import ItemSimilarityRecommender 6 | 7 | class PrecomputedItemSimilarityRecommender(ItemSimilarityRecommender): 8 | """ 9 | Wrapper class to make recommendations using a precomputed item similarity matrix. 10 | 11 | Parameters 12 | ========== 13 | description : str 14 | Printable name for this recommender. 15 | similarity_matrix : array_like(num_items,num_items) 16 | The precomputed item similarity matrix. 17 | """ 18 | 19 | 20 | def __init__(self,description,similarity_matrix): 21 | self.description = description 22 | self.set_similarity_matrix(similarity_matrix) 23 | 24 | def set_similarity_matrix(self,similarity_matrix): 25 | self.similarity_matrix = similarity_matrix 26 | 27 | def compute_similarities(self,j): 28 | return self.similarity_matrix[j,:] 29 | 30 | def fit(self,dataset,item_features=None): 31 | pass 32 | 33 | def __str__(self): 34 | return self.description 35 | -------------------------------------------------------------------------------- /mrec/item_similarity/slim.py: -------------------------------------------------------------------------------- 1 | """ 2 | Train a Sparse Linear Methods (SLIM) item similarity model using various 3 | methods for sparse regression. 4 | 5 | See: 6 | Efficient Top-N Recommendation by Linear Regression, 7 | M. Levy and K. Jack, LSRS workshop at RecSys 2013. 8 | 9 | SLIM: Sparse linear methods for top-n recommender systems, 10 | X. Ning and G. Karypis, ICDM 2011. 11 | http://glaros.dtc.umn.edu/gkhome/fetch/papers/SLIM2011icdm.pdf 12 | """ 13 | 14 | from sklearn.linear_model import SGDRegressor, ElasticNet 15 | from sklearn.preprocessing import binarize 16 | import sklearn 17 | import numpy as np 18 | 19 | from recommender import ItemSimilarityRecommender 20 | 21 | 22 | def parse_version(version_string): 23 | if '-' in version_string: 24 | version_string = version_string.split('-', 1)[0] 25 | return tuple(map(int, version_string.split('.'))) 26 | 27 | 28 | class NNFeatureSelectingSGDRegressor(object): 29 | """ 30 | Wraps nearest-neighbour feature selection and regression in a single model. 31 | """ 32 | 33 | def __init__(self,model,k): 34 | self.model = model 35 | self.k = k 36 | 37 | def fit(self,A,a): 38 | # find k-NN by brute force 39 | d = A.T.dot(a).flatten() # distance = dot product 40 | nn = d.argsort()[-1:-1-self.k:-1] 41 | # fit the model to selected features only 42 | self.model.fit(A[:,nn],a) 43 | # set our weights for the selected "features" i.e. items 44 | self.coef_ = np.zeros(A.shape[1]) 45 | self.coef_[nn] = self.model.coef_ 46 | 47 | def __str__(self): 48 | return 'NN-feature selecting {0}'.format(self.model) 49 | 50 | class SLIM(ItemSimilarityRecommender): 51 | """ 52 | Parameters 53 | ========== 54 | 55 | l1_reg : float 56 | L1 regularisation constant. 57 | l2_reg : float 58 | L2 regularisation constant. 
59 | fit_intercept : bool 60 | Whether to fit a constant term. 61 | ignore_negative_weights : bool 62 | If true discard any computed negative similarity weights. 63 | num_selected_features : int 64 | The number of "features" (i.e. most similar items) to retain when using feature selection. 65 | model : string 66 | The underlying model to use: sgd, elasticnet, fs_sgd. 67 | :sgd: SGDRegressor with elasticnet penalty 68 | :elasticnet: ElasticNet 69 | :fs_sgd: NNFeatureSelectingSGDRegressor 70 | """ 71 | def __init__(self, 72 | l1_reg=0.001, 73 | l2_reg=0.0001, 74 | fit_intercept=False, 75 | ignore_negative_weights=False, 76 | num_selected_features=200, 77 | model='sgd'): 78 | alpha = l1_reg+l2_reg 79 | l1_ratio = l1_reg/alpha 80 | if parse_version(sklearn.__version__) <= (0, 14, 1): 81 | # Backward compat: in old versions of scikit-learn l1_ratio had 82 | # the opposite sign... 83 | l1_ratio = (1 - l1_ratio) 84 | if model == 'sgd': 85 | self.model = SGDRegressor(penalty='elasticnet',fit_intercept=fit_intercept,alpha=alpha,l1_ratio=l1_ratio) 86 | elif model == 'elasticnet': 87 | self.model = ElasticNet(alpha=alpha,l1_ratio=l1_ratio,positive=True,fit_intercept=fit_intercept,copy_X=False) 88 | elif model == 'fs_sgd': 89 | m = SGDRegressor(penalty='elasticnet',fit_intercept=fit_intercept,alpha=alpha,l1_ratio=l1_ratio) 90 | self.model = NNFeatureSelectingSGDRegressor(m,num_selected_features) 91 | else: 92 | raise SystemExit('unknown model type: {0}'.format(model)) 93 | self.ignore_negative_weights = ignore_negative_weights 94 | 95 | def compute_similarities(self,dataset,j): 96 | """Compute item similarity weights for item j.""" 97 | # zero out the j-th column of the input so we get w[j] = 0 98 | a = dataset.fast_get_col(j) 99 | dataset.fast_update_col(j,np.zeros(a.nnz)) 100 | self.model.fit(dataset.X,a.toarray().ravel()) 101 | # reinstate the j-th column 102 | dataset.fast_update_col(j,a.data) 103 | w = self.model.coef_ 104 | if self.ignore_negative_weights: 105 | w[w<0] = 0 106 | return w 107 | 108 | def compute_similarities_from_vec(self,dataset,a): 109 | """Compute item similarity weights for out-of-dataset item vector.""" 110 | self.model.fit(dataset.X,a) 111 | return self.model.coef_ 112 | 113 | def __str__(self): 114 | if self.ignore_negative_weights: 115 | return 'SLIM({0} ignoring negative weights)'.format(self.model) 116 | else: 117 | return 'SLIM({0})'.format(self.model) 118 | 119 | if __name__ == '__main__': 120 | 121 | # use SLIM like this: 122 | 123 | import random 124 | import StringIO 125 | from mrec import load_fast_sparse_matrix 126 | 127 | random.seed(0) 128 | 129 | print 'loading test data...' 130 | data = """\ 131 | %%MatrixMarket matrix coordinate real general 132 | 3 5 9 133 | 1 1 1 134 | 1 2 1 135 | 1 3 1 136 | 1 4 1 137 | 2 2 1 138 | 2 3 1 139 | 2 5 1 140 | 3 3 1 141 | 3 4 1 142 | """ 143 | print data 144 | dataset = load_fast_sparse_matrix('mm',StringIO.StringIO(data)) 145 | num_users,num_items = dataset.shape 146 | 147 | model = SLIM() 148 | 149 | num_samples = 2 150 | 151 | def output(i,j,val): 152 | # convert back to 1-indexed 153 | print '{0}\t{1}\t{2:.3f}'.format(i+1,j+1,val) 154 | 155 | print 'computing some item similarities...' 156 | print 'item\tsim\tweight' 157 | # if we want we can compute these individually without calling fit() 158 | for i in random.sample(xrange(num_items),num_samples): 159 | for j,weight in model.get_similar_items(i,max_similar_items=10,dataset=dataset): 160 | output(i,j,weight) 161 | 162 | print 'learning entire similarity matrix...' 
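    # ------------------------------------------------------------------------
    # A quick worked example (for illustration, not part of the demo) of the
    # mapping in SLIM.__init__ above from mrec's (l1_reg, l2_reg) to the
    # (alpha, l1_ratio) parameterisation used by scikit-learn:
    #
    #     l1_reg, l2_reg = 0.001, 0.0001   # the constructor defaults
    #     alpha = l1_reg + l2_reg          # 0.0011
    #     l1_ratio = l1_reg / alpha        # ~0.909, i.e. mostly L1 penalty
    #
    # On scikit-learn <= 0.14.1 the ratio had the opposite meaning, which is
    # why the constructor flips it to (1 - l1_ratio) after parse_version().
    # ------------------------------------------------------------------------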
163 | # usually we'll call train() on the entire dataset 164 | model = SLIM() 165 | model.fit(dataset) 166 | print 'making some recommendations...' 167 | print 'user\trec\tscore' 168 | for u in random.sample(xrange(num_users),num_samples): 169 | for i,score in model.recommend_items(dataset.X,u,max_items=10): 170 | output(u,i,score) 171 | 172 | print 'making batch recommendations...' 173 | recs = model.batch_recommend_items(dataset.X) 174 | for u in xrange(num_users): 175 | for i,score in recs[u]: 176 | output(u,i,score) 177 | 178 | print 'making range recommendations...' 179 | for start,end in [(0,2),(2,3)]: 180 | recs = model.range_recommend_items(dataset.X,start,end) 181 | for u in xrange(start,end): 182 | for i,score in recs[u-start]: 183 | output(u,i,score) 184 | -------------------------------------------------------------------------------- /mrec/mf/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Mendeley/mrec/d299e3b9490703843b041e6585643b7e42e229f0/mrec/mf/__init__.py -------------------------------------------------------------------------------- /mrec/mf/climf.py: -------------------------------------------------------------------------------- 1 | """ 2 | CLiMF Collaborative Less-is-More Filtering, a variant of latent factor CF 3 | which optimises a lower bound of the smoothed reciprocal rank of "relevant" 4 | items in ranked recommendation lists. The intention is to promote diversity 5 | as well as accuracy in the recommendations. The method assumes binary 6 | relevance data, as for example in friendship or follow relationships. 7 | 8 | CLiMF: Learning to Maximize Reciprocal Rank with Collaborative Less-is-More Filtering 9 | Yue Shi, Martha Larson, Alexandros Karatzoglou, Nuria Oliver, Linas Baltrunas, Alan Hanjalic 10 | ACM RecSys 2012 11 | """ 12 | 13 | from math import exp, log 14 | import random 15 | import numpy as np 16 | 17 | from mrec.mf.recommender import MatrixFactorizationRecommender 18 | 19 | 20 | # TODO: cythonize most of this... 
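# A small self-contained check (illustrative only, assuming nothing beyond the
# standard library) of the sigmoid helpers g() and dg() defined just below:
# the derivative exp(x)/(1+exp(x))**2 used in dg() is algebraically the same
# as g(x)*(1-g(x)), the usual form of the sigmoid gradient.
#
#     from math import exp
#     g = lambda x: 1.0 / (1.0 + exp(-x))
#     dg = lambda x: exp(x) / (1.0 + exp(x)) ** 2
#     assert abs(dg(0.5) - g(0.5) * (1.0 - g(0.5))) < 1e-12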
21 | 22 | 23 | def g(x): 24 | """sigmoid function""" 25 | return 1/(1+exp(-x)) 26 | 27 | def dg(x): 28 | """derivative of sigmoid function""" 29 | return exp(x)/(1+exp(x))**2 30 | 31 | class CLiMFRecommender(MatrixFactorizationRecommender): 32 | 33 | def __init__(self,d,lbda=0.01,gamma=0.01,max_iters=25): 34 | self.d = d 35 | self.lbda = lbda 36 | self.gamma = gamma 37 | self.max_iters = max_iters 38 | 39 | def fit(self,data): 40 | self.U = 0.01*np.random.random_sample((data.shape[0],self.d)) 41 | self.V = 0.01*np.random.random_sample((data.shape[1],self.d)) 42 | # TODO: create a validation set 43 | 44 | for iter in xrange(self.max_iters): 45 | print 'iteration {0}:'.format(iter+1) 46 | print 'objective = {0:.4f}'.format(self.objective(data)) 47 | self.update(data) 48 | # TODO: compute MRR on validation set, terminate if appropriate 49 | 50 | def precompute_f(self,data,i): 51 | """ 52 | precompute f[j] = 53 | 54 | params: 55 | data: scipy csr sparse matrix containing user->(item,count) 56 | U : user factors 57 | V : item factors 58 | i : item of interest 59 | 60 | returns: 61 | dot products for all j in data[i] 62 | """ 63 | items = data[i].indices 64 | f = dict((j,np.dot(self.U[i],self.V[j])) for j in items) 65 | return f 66 | 67 | def objective(self,data): 68 | """ 69 | compute objective function F(U,V) 70 | 71 | params: 72 | data: scipy csr sparse matrix containing user->(item,count) 73 | U : user factors 74 | V : item factors 75 | lbda: regularization constant lambda 76 | returns: 77 | current value of F(U,V) 78 | """ 79 | F = -0.5*self.lbda*(np.sum(self.U*self.U)+np.sum(self.V*self.V)) 80 | for i in xrange(len(self.U)): 81 | f = self.precompute_f(data,i) 82 | for j in f: 83 | F += log(g(f[j])) 84 | for k in f: 85 | F += log(1-g(f[k]-f[j])) 86 | return F 87 | 88 | def update(self,data): 89 | """ 90 | update user/item factors using stochastic gradient ascent 91 | 92 | params: 93 | data : scipy csr sparse matrix containing user->(item,count) 94 | U : user factors 95 | V : item factors 96 | lbda : regularization constant lambda 97 | gamma: learning rate 98 | """ 99 | for i in xrange(len(self.U)): 100 | dU = -self.lbda*self.U[i] 101 | f = self.precompute_f(data,i) 102 | for j in f: 103 | dV = g(-f[j])-self.lbda*self.V[j] 104 | for k in f: 105 | dV += dg(f[j]-f[k])*(1/(1-g(f[k]-f[j]))-1/(1-g(f[j]-f[k])))*self.U[i] 106 | self.V[j] += self.gamma*dV 107 | dU += g(-f[j])*self.V[j] 108 | for k in f: 109 | dU += (self.V[j]-self.V[k])*dg(f[k]-f[j])/(1-g(f[k]-f[j])) 110 | self.U[i] += self.gamma*dU 111 | 112 | def compute_mrr(self,data,test_users=None): 113 | """ 114 | compute average Mean Reciprocal Rank of data according to factors 115 | 116 | params: 117 | data : scipy csr sparse matrix containing user->(item,count) 118 | U : user factors 119 | V : item factors 120 | test_users: optional subset of users over which to compute MRR 121 | 122 | returns: 123 | the mean MRR over all users in data 124 | """ 125 | mrr = [] 126 | if test_users is None: 127 | test_users = range(len(self.U)) 128 | for ix,i in enumerate(test_users): 129 | items = set(data[i].indices) 130 | if not items: 131 | continue 132 | predictions = np.sum(np.tile(self.U[i],(len(self.V),1))*self.V,axis=1) 133 | found = False 134 | for rank,item in enumerate(np.argsort(predictions)[::-1]): 135 | if item in items: 136 | mrr.append(1.0/(rank+1)) 137 | found = True 138 | break 139 | if not found: 140 | print 'fail, no relevant items predicted for test user {0}'.format(i+1) 141 | print 'known items: {0}'.format(items) 142 | assert(len(mrr) 
== len(test_users)) 143 | return np.mean(mrr) 144 | 145 | def main(): 146 | import sys 147 | from mrec import load_sparse_matrix, save_recommender 148 | from mrec.mf.climf import CLiMFRecommender 149 | 150 | file_format = sys.argv[1] 151 | filepath = sys.argv[2] 152 | outfile = sys.argv[3] 153 | 154 | # load training set as scipy sparse matrix 155 | train = load_sparse_matrix(file_format,filepath) 156 | 157 | model = CLiMFRecommender(d=5) 158 | model.fit(train) 159 | 160 | save_recommender(model,outfile) 161 | 162 | if __name__ == '__main__': 163 | import cProfile 164 | cProfile.run('main()') 165 | -------------------------------------------------------------------------------- /mrec/mf/evaluate.py: -------------------------------------------------------------------------------- 1 | def retrain_recommender(model,dataset): 2 | model.fit(dataset.X) 3 | 4 | if __name__ == '__main__': 5 | 6 | try: 7 | from sklearn.grid_search import ParameterGrid 8 | except ImportError: 9 | from sklearn.grid_search import IterGrid as ParameterGrid 10 | from optparse import OptionParser 11 | from warp import WARPMFRecommender 12 | 13 | from mrec.evaluation.metrics import * 14 | 15 | parser = OptionParser() 16 | parser.add_option('-m','--main_split_dir',dest='main_split_dir',help='directory containing 50/50 splits for main evaluation') 17 | parser.add_option('-l','--loo_split_dir',dest='loo_split_dir',help='directory containing LOO splits for hit rate evaluation') 18 | parser.add_option('-n','--num_splits',dest='num_splits',type='int',default=5,help='number of splits in each directory (default: %default)') 19 | 20 | (opts,args) = parser.parse_args() 21 | if not (opts.main_split_dir or opts.loo_split_dir) or not opts.num_splits: 22 | parser.print_help() 23 | raise SystemExit 24 | 25 | print 'doing a grid search for regularization parameters...' 26 | params = {'d':[100],'gamma':[0.01],'C':[100],'max_iter':[100000],'validation_iters':[500]} 27 | models = [WARPMFRecommender(**a) for a in ParameterGrid(params)] 28 | 29 | for train in glob: 30 | # get test 31 | # load em both up 32 | # put them into something that returns train,test.keys(),test in a generator() 33 | # test is a dict id->[id,id,...] 
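    # ------------------------------------------------------------------------
    # The loop above is unfinished scaffolding (its body is only comments); the
    # evaluation below uses load_splits() instead. A minimal sketch of the
    # generator those comments describe could look like this helper; the split
    # filenames, the 'tsv' format and the 1-indexed ids are illustrative
    # assumptions here, not mrec's actual conventions.
    def iter_splits(split_dir, num_splits):
        from collections import defaultdict
        from mrec import load_sparse_matrix
        for i in xrange(num_splits):
            train = load_sparse_matrix('tsv', '{0}/train.{1}'.format(split_dir, i))
            test = defaultdict(list)
            for line in open('{0}/test.{1}'.format(split_dir, i)):
                user, item, _ = line.strip().split('\t')
                test[int(user) - 1].append(int(item) - 1)  # back to 0-indexed
            yield train, test.keys(), test
    # ------------------------------------------------------------------------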
34 | 35 | if opts.main_split_dir: 36 | generate_main_metrics = generate_metrics(get_known_items_from_dict,compute_main_metrics) 37 | main_metrics = run_evaluation(models, 38 | retrain_recommender, 39 | load_splits(opts.main_split_dir,opts.num_splits), 40 | opts.num_splits, 41 | generate_main_metrics) 42 | print_report(models,main_metrics) 43 | 44 | if opts.loo_split_dir: 45 | generate_hit_rate = generate_metrics(get_known_items_from_dict,compute_hit_rate) 46 | hit_rate_metrics = run_evaluation(models, 47 | retrain_recommender, 48 | load_splits(opts.loo_split_dir,opts.num_splits), 49 | opts.num_splits, 50 | generate_hit_rate) 51 | print_report(models,hit_rate_metrics) 52 | -------------------------------------------------------------------------------- /mrec/mf/model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Mendeley/mrec/d299e3b9490703843b041e6585643b7e42e229f0/mrec/mf/model/__init__.py -------------------------------------------------------------------------------- /mrec/mf/model/warp2.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy 3 | import random 4 | 5 | from warp import WARPBatchUpdate, WARPDecomposition, WARP 6 | from warp_fast import warp2_sample 7 | 8 | class WARP2BatchUpdate(WARPBatchUpdate): 9 | """Collection of arrays to hold a batch of sgd updates.""" 10 | 11 | def __init__(self,batch_size,num_features,d): 12 | WARPBatchUpdate.__init__(self,batch_size,d) 13 | self.dW = np.zeros((num_features,d)) 14 | 15 | def clear(self): 16 | self.dW[:] = 0 17 | 18 | def set_update(self,ix,update): 19 | u,v_pos,v_neg,dU,dV_pos,dV_neg,dW = update 20 | WARPBatchUpdate.set_update(self,ix,(u,v_pos,v_neg,dU,dV_pos,dV_neg)) 21 | self.dW += dW 22 | 23 | class WARP2Decomposition(WARPDecomposition): 24 | """ 25 | Joint matrix and feature embedding optimizing the WARP loss. 26 | 27 | Parameters 28 | ========== 29 | num_rows : int 30 | Number of rows in the full matrix. 31 | num_cols : int 32 | Number of columns in the full matrix. 33 | X : array_like, shape = [num_cols, num_features] 34 | Features describing each column in the matrix. 35 | d : int 36 | The embedding dimension. 37 | """ 38 | 39 | def __init__(self,num_rows,num_cols,X,d): 40 | WARPDecomposition.__init__(self,num_rows,num_cols,d) 41 | # W holds latent factors for each item feature 42 | self.W = d**-0.5*np.random.random_sample((X.shape[1],d)) 43 | self.X = X 44 | self.is_sparse = isinstance(X,scipy.sparse.csr_matrix) 45 | 46 | def compute_gradient_step(self,u,i,j,L): 47 | """ 48 | Compute a gradient step from results of sampling. 49 | 50 | Parameters 51 | ========== 52 | u : int 53 | The sampled row. 54 | i : int 55 | The sampled positive column. 56 | j : int 57 | The sampled violating negative column i.e. U[u].V[j] is currently 58 | too large compared to U[u].V[i] 59 | L : int 60 | The number of trials required to find a violating negative column. 61 | 62 | Returns 63 | ======= 64 | u : int 65 | As input. 66 | i : int 67 | As input. 68 | j : int 69 | As input. 70 | dU : numpy.ndarray 71 | Gradient step for U[u]. 72 | dV_pos : numpy.ndarray 73 | Gradient step for V[i]. 74 | dV_neg : numpy.ndarray 75 | Gradient step for V[j]. 76 | dW : numpy.ndarray 77 | Gradient step for W. 
78 | """ 79 | dU = L*(self.V[i]-self.V[j]) 80 | dV_pos = L*self.U[u] 81 | dV_neg = -L*self.U[u] 82 | dx = self.X[i]-self.X[j] 83 | if not self.is_sparse: 84 | dx = np.atleast_2d(dx) 85 | dW = L*dx.T.dot(np.atleast_2d(self.U[u])) 86 | return u,i,j,dU,dV_pos,dV_neg,dW 87 | 88 | def apply_updates(self,updates,gamma,C): 89 | WARPDecomposition.apply_updates(self,updates,gamma,C) 90 | self.apply_matrix_update(self.W,updates.dW,gamma,C) 91 | 92 | def apply_matrix_update(self,W,dW,gamma,C): 93 | W += gamma*dW 94 | # ensure that ||W_k|| < C for all k 95 | p = np.sum(np.abs(W)**2,axis=-1)**0.5/C 96 | p[p<1] = 1 97 | W /= p[:,np.newaxis] 98 | 99 | def reconstruct(self,rows): 100 | if rows is None: 101 | U = self.U 102 | else: 103 | U = np.asfortranarray(self.U[rows,:]) 104 | return U.dot(self.V.T + self.X.dot(self.W).T) 105 | 106 | class WARP2(WARP): 107 | """ 108 | Learn low-dimensional embedding optimizing the WARP loss. 109 | 110 | Parameters 111 | ========== 112 | d : int 113 | Embedding dimension. 114 | gamma : float 115 | Learning rate. 116 | C : float 117 | Regularization constant. 118 | max_iters : int 119 | Maximum number of SGD updates. 120 | validation_iters : int 121 | Number of SGD updates between checks for stopping condition. 122 | batch_size : int 123 | Mini batch size for SGD updates. 124 | positive_thresh: float 125 | Training entries below this are treated as zero. 126 | max_trials : int 127 | Number of attempts allowed to find a violating negative example during 128 | training updates. This means that in practice we optimize for ranks 1 129 | to max_trials-1. 130 | 131 | Attributes 132 | ========== 133 | U_ : numpy.ndarray 134 | Row factors. 135 | V_ : numpy.ndarray 136 | Column factors. 137 | W_ : numpy.ndarray 138 | Item feature factors. 139 | """ 140 | 141 | def fit(self,train,X,validation=None): 142 | """ 143 | Learn embedding from training set. A suitable dot product of the 144 | factors reconstructs the training matrix approximately, minimizing 145 | the WARP ranking loss relative to the original data. 146 | 147 | Parameters 148 | ========== 149 | train : scipy.sparse.csr_matrix 150 | Training matrix to be factorized. 151 | X : array_like, shape = [num_cols, num_features] 152 | Item features. 153 | validation : dict or int 154 | Validation set to control early stopping, based on precision@30. 155 | The dict should have the form row->[cols] where the values in cols 156 | are those we expected to be highly ranked in the reconstruction of 157 | row. If an int is supplied then instead we evaluate precision 158 | against the training data for the first validation rows. 159 | 160 | Returns 161 | ======= 162 | self : object 163 | This model itself. 
164 | """ 165 | num_rows,num_cols = train.shape 166 | decomposition = WARP2Decomposition(num_rows,num_cols,X,self.d) 167 | updates = WARP2BatchUpdate(self.batch_size,X.shape[1],self.d) 168 | self.precompute_warp_loss(num_cols) 169 | 170 | self._fit(decomposition,updates,train,validation) 171 | 172 | self.U_ = decomposition.U 173 | self.V_ = decomposition.V 174 | self.W_ = decomposition.W 175 | 176 | return self 177 | 178 | def sample(self,train,decomposition): 179 | # delegate to cython implementation 180 | return warp2_sample(decomposition.U, 181 | decomposition.V, 182 | decomposition.W, 183 | decomposition.X, 184 | train.data, 185 | train.indices, 186 | train.indptr, 187 | self.positive_thresh, 188 | self.max_trials) 189 | 190 | -------------------------------------------------------------------------------- /mrec/mf/recommender.py: -------------------------------------------------------------------------------- 1 | """ 2 | Base class for recommenders that work 3 | by matrix factorization. 4 | """ 5 | 6 | try: 7 | import cPickle as pickle 8 | except ImportError: 9 | import pickle 10 | import numpy as np 11 | from itertools import izip 12 | from scipy.sparse import csr_matrix 13 | 14 | from mrec.base_recommender import BaseRecommender 15 | 16 | class MatrixFactorizationRecommender(BaseRecommender): 17 | """ 18 | Base class for matrix factorization recommenders. 19 | """ 20 | 21 | def _create_archive(self): 22 | """ 23 | Return fields to be serialized in a numpy archive. 24 | 25 | Returns 26 | ======= 27 | archive : dict 28 | Fields to serialize, includes the model itself 29 | under the key 'model'. 30 | """ 31 | # pickle the model without its factors 32 | # then use numpy to save the factors efficiently 33 | tmp = (self.U,self.V) 34 | self.U = self.V = None 35 | m = pickle.dumps(self) 36 | self.U,self.V = tmp 37 | return {'model':m,'U':self.U,'V':self.V} 38 | 39 | def _load_archive(self,archive): 40 | """ 41 | Load fields from a numpy archive. 42 | """ 43 | self.U = archive['U'] 44 | self.V = archive['V'] 45 | 46 | def __str__(self): 47 | if hasattr(self,'description'): 48 | return self.description 49 | return 'MatrixFactorizationRecommender' 50 | 51 | def fit(self,train): 52 | """ 53 | Learn user and item factors from training dataset. 54 | 55 | Parameters 56 | ========== 57 | train : scipy sparse matrix 58 | The user-item matrix. 59 | """ 60 | pass 61 | 62 | def load_factors(self,user_factor_filepath,item_factor_filepath,fmt): 63 | """ 64 | Load precomputed user and item factors from file. 65 | 66 | Parameters 67 | ========== 68 | user_factor_filepath : str 69 | Filepath to tsv file holding externally computed user factors. Can be 70 | TSV, Matrix Market or numpy array serialized with numpy.save(). 71 | item_factor_filepath : str 72 | Filepath to TSV file holding externally computed item factors. Can be 73 | TSV, Matrix Market or numpy array serialized with numpy.save(). 74 | fmt : str: npy, mm or tsv 75 | File format: numpy array, Matrix Market or TSV. Each line of TSV input 76 | should contain all of the factors for a single user or item. 
77 | """ 78 | if fmt == 'npy': 79 | self.U = np.load(user_factor_filepath) 80 | self.V = np.load(item_factor_filepath) 81 | elif fmt == 'mm': 82 | self.U = mmread(user_factor_filepath) 83 | self.V = mmread(item_factor_filepath) 84 | elif fmt == 'tsv': 85 | self.U = np.loadtxt(user_factor_filepath) 86 | self.V = np.loadtxt(item_factor_filepath) 87 | else: 88 | raise ValueError('unknown input format {0}'.format(fmt)) 89 | # ensure that memory layout avoids extra allocation in dot product 90 | self.U = np.asfortranarray(self.U) 91 | 92 | def recommend_items(self,dataset,u,max_items=10,return_scores=True,item_features=None): 93 | """ 94 | Recommend up to max_items most highly recommended items for user u. 95 | Assumes you've already called fit() to learn the factors. 96 | 97 | Parameters 98 | ========== 99 | dataset : scipy.sparse.csr_matrix 100 | User-item matrix containing known items. 101 | u : int 102 | Index of user for which to make recommendations. 103 | max_items : int 104 | Maximum number of recommended items to return. 105 | return_scores : bool 106 | If true return a score along with each recommended item. 107 | item_features : array_like, shape = [num_items, num_features] 108 | Features for each item in the dataset. 109 | 110 | Returns 111 | ======= 112 | recs : list 113 | List of (idx,score) pairs if return_scores is True, else 114 | just a list of idxs. 115 | """ 116 | r = self.predict_ratings(u,item_features=item_features) 117 | return self._get_recommendations_from_predictions(r,dataset,u,u+1,max_items,return_scores)[0] 118 | 119 | def predict_ratings(self,users=None,item_features=None): 120 | """ 121 | Predict ratings/scores for all items for supplied users. 122 | Assumes you've already called fit() to learn the factors. 123 | 124 | Only call this if you really want predictions for all items. 125 | To get the top-k recommended items for each user you should 126 | call one of the recommend_items() instead. 127 | 128 | Parameters 129 | ========== 130 | users : int or array-like 131 | Index or indices of users for which to make predictions. 132 | item_features : array_like, shape = [num_items, num_features] 133 | Features for each item in the dataset, ignored here. 134 | 135 | Returns 136 | ======= 137 | predictions : numpy.ndarray, shape = [len(users), num_items] 138 | Predicted ratings for all items for each supplied user. 139 | """ 140 | if isinstance(users,int): 141 | users = [users] 142 | 143 | if users is None: 144 | U = self.U 145 | else: 146 | U = np.asfortranarray(self.U[users,:]) 147 | return U.dot(self.V.T) 148 | 149 | def batch_recommend_items(self, 150 | dataset, 151 | max_items=10, 152 | return_scores=True, 153 | show_progress=False, 154 | item_features=None): 155 | """ 156 | Recommend new items for all users in the training dataset. Assumes 157 | you've already called fit() to learn the similarity matrix. 158 | 159 | Parameters 160 | ========== 161 | dataset : scipy.sparse.csr_matrix 162 | User-item matrix containing known items. 163 | max_items : int 164 | Maximum number of recommended items to return. 165 | return_scores : bool 166 | If true return a score along with each recommended item. 167 | show_progress: bool 168 | If true print something to stdout to show progress. 169 | item_features : array_like, shape = [num_items, num_features] 170 | Features for each item in the dataset. 171 | 172 | Returns 173 | ======= 174 | recs : list of lists 175 | Each entry is a list of (idx,score) pairs if return_scores is True, 176 | else just a list of idxs. 
177 | """ 178 | r = self.predict_ratings(item_features=item_features) 179 | return self._get_recommendations_from_predictions(r,dataset,0,r.shape[0],max_items,return_scores,show_progress) 180 | 181 | def range_recommend_items(self, 182 | dataset, 183 | user_start, 184 | user_end, 185 | max_items=10, 186 | return_scores=True, 187 | item_features=None): 188 | """ 189 | Recommend new items for a range of users in the training dataset. 190 | Assumes you've already called fit() to learn the similarity matrix. 191 | 192 | Parameters 193 | ========== 194 | dataset : scipy.sparse.csr_matrix 195 | User-item matrix containing known items. 196 | user_start : int 197 | Index of first user in the range to recommend. 198 | user_end : int 199 | Index one beyond last user in the range to recommend. 200 | max_items : int 201 | Maximum number of recommended items to return. 202 | return_scores : bool 203 | If true return a score along with each recommended item. 204 | item_features : array_like, shape = [num_items, num_features] 205 | Features for each item in the dataset. 206 | 207 | Returns 208 | ======= 209 | recs : list of lists 210 | Each entry is a list of (idx,score) pairs if return_scores is True, 211 | else just a list of idxs. 212 | """ 213 | r = self.predict_ratings(xrange(user_start,user_end),item_features=item_features) 214 | return self._get_recommendations_from_predictions(r,dataset,user_start,user_end,max_items,return_scores) 215 | 216 | def _get_recommendations_from_predictions(self, 217 | r, 218 | dataset, 219 | user_start, 220 | user_end, 221 | max_items, 222 | return_scores=True, 223 | show_progress=False): 224 | """ 225 | Select recommendations given predicted scores/ratings. 226 | 227 | Parameters 228 | ========== 229 | r : numpy.ndarray 230 | Predicted scores/ratings for all items for users in supplied range. 231 | dataset : scipy.sparse.csr_matrix 232 | User-item matrix containing known items. 233 | user_start : int 234 | Index of first user in the range to recommend. 235 | user_end : int 236 | Index one beyond last user in the range to recommend. 237 | max_items : int 238 | Maximum number of recommended items to return. 239 | return_scores : bool 240 | If true return a score along with each recommended item. 241 | show_progress: bool 242 | If true print something to stdout to show progress. 243 | 244 | Returns 245 | ======= 246 | recs : list of lists 247 | Each entry is a list of (idx,score) pairs if return_scores is True, 248 | else just a list of idxs. 249 | """ 250 | r = np.array(self._zero_known_item_scores(r,dataset[user_start:user_end,:])) 251 | recs = [[] for u in xrange(user_start,user_end)] 252 | for u in xrange(user_start,user_end): 253 | ux = u - user_start 254 | if show_progress and ux%1000 == 0: 255 | print ux,'..', 256 | ru = r[ux] 257 | if return_scores: 258 | recs[ux] = [(i,ru[i]) for i in ru.argsort()[::-1] if ru[i] > 0][:max_items] 259 | else: 260 | recs[ux] = [i for i in ru.argsort()[::-1] if ru[i] > 0][:max_items] 261 | if show_progress: 262 | print 263 | return recs 264 | -------------------------------------------------------------------------------- /mrec/mf/warp.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | 4 | from mrec.evaluation import metrics 5 | 6 | from recommender import MatrixFactorizationRecommender 7 | from model.warp import WARP 8 | 9 | class WARPMFRecommender(MatrixFactorizationRecommender): 10 | """ 11 | Learn matrix factorization optimizing the WARP loss. 
12 | 13 | Parameters 14 | ========== 15 | d : int 16 | Dimensionality of factors. 17 | gamma : float 18 | Learning rate. 19 | C : float 20 | Regularization constant. 21 | batch_size : int 22 | Mini batch size for SGD updates. 23 | positive_thresh: float 24 | Consider an item to be "positive" i.e. liked if its rating is at least this. 25 | max_trials : int 26 | Number of attempts allowed to find a violating negative example during updates. 27 | In practice it means that we optimize for ranks 1 to max_trials-1. 28 | """ 29 | 30 | def __init__(self,d,gamma,C,batch_size=10,positive_thresh=0.00001,max_trials=50): 31 | self.d = d 32 | self.gamma = gamma 33 | self.C = C 34 | self.batch_size = batch_size 35 | self.positive_thresh = positive_thresh 36 | self.max_trials = max_trials 37 | 38 | def fit(self,train,item_features=None): 39 | """ 40 | Learn factors from training set. 41 | 42 | Parameters 43 | ========== 44 | train : scipy.sparse.csr_matrix 45 | User-item matrix. 46 | item_features : array_like, shape = [num_items, num_features] 47 | Features for each item in the dataset, ignored here. 48 | """ 49 | max_iters,validation_iters,validation = self.create_validation_set(train) 50 | model = WARP(self.d,self.gamma,self.C,max_iters,validation_iters,self.batch_size,self.positive_thresh,self.max_trials) 51 | self.description = 'WARPMF({0})'.format(model) 52 | model.fit(train,validation) 53 | 54 | self.U = model.U_ 55 | self.V = model.V_ 56 | 57 | def create_validation_set(self,train): 58 | """ 59 | Hide and return half of the known items for a sample of users, 60 | and estimate the number of sgd iterations to run. 61 | 62 | Parameters 63 | ========== 64 | train : scipy.sparse.csr_matrix 65 | User-item matrix. 66 | 67 | Returns 68 | ======= 69 | max_iters : int 70 | Total number of sgd iterations to run. 71 | validation_iters : int 72 | Check progress after this many iterations. 73 | validation : dict 74 | Validation set. 
75 | """ 76 | # use 1% of users for validation, with a floor 77 | num_users = train.shape[0] 78 | num_validation_users = max(num_users/100,100) 79 | # ensure reasonable expected number of updates per validation user 80 | validation_iters = 100*num_users/num_validation_users 81 | # and reasonable number of validation cycles 82 | max_iters = 30*validation_iters 83 | 84 | print num_validation_users,'validation users' 85 | print validation_iters,'validation iters' 86 | print max_iters,'max_iters' 87 | 88 | validation = dict() 89 | for u in xrange(num_validation_users): 90 | positive = np.where(train[u].data > 0)[0] 91 | hidden = random.sample(positive,positive.shape[0]/2) 92 | if hidden: 93 | train[u].data[hidden] = 0 94 | validation[u] = train[u].indices[hidden] 95 | 96 | return max_iters,validation_iters,validation 97 | 98 | def main(): 99 | import sys 100 | from mrec import load_sparse_matrix, save_recommender 101 | from mrec.sparse import fast_sparse_matrix 102 | 103 | file_format = sys.argv[1] 104 | filepath = sys.argv[2] 105 | outfile = sys.argv[3] 106 | 107 | # load training set as scipy sparse matrix 108 | train = load_sparse_matrix(file_format,filepath) 109 | 110 | model = WARPMFRecommender(d=100,gamma=0.01,C=100.0,batch_size=10) 111 | model.fit(train) 112 | 113 | save_recommender(model,outfile) 114 | 115 | if __name__ == '__main__': 116 | main() 117 | -------------------------------------------------------------------------------- /mrec/mf/warp2.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from warp import WARPMFRecommender 4 | from model.warp2 import WARP2 5 | 6 | class WARP2MFRecommender(WARPMFRecommender): 7 | """ 8 | Learn matrix factorization optimizing the WARP loss 9 | with item features as well as user-item training data. 10 | 11 | Parameters 12 | ========== 13 | d : int 14 | Dimensionality of factors. 15 | gamma : float 16 | Learning rate. 17 | C : float 18 | Regularization constant. 19 | batch_size : int 20 | Mini batch size for SGD updates. 21 | positive_thresh: float 22 | Consider an item to be "positive" i.e. liked if its rating is at least this. 23 | max_trials : int 24 | Number of attempts allowed to find a violating negative example during updates. 25 | In practice it means that we optimize for ranks 1 to max_trials-1. 26 | """ 27 | 28 | def __str__(self): 29 | return 'WARP2MF(d={0},gamma={1},C={2})'.format(self.d,self.gamma,self.C) 30 | 31 | def fit(self,train,item_features=None): 32 | """ 33 | Learn factors from training set and item features. 34 | 35 | Parameters 36 | ========== 37 | train : scipy.sparse.csr_matrix 38 | User-item matrix. 39 | item_features : array_like, shape = [num_items, num_features] 40 | Features for each item in the dataset. 41 | """ 42 | max_iters,validation_iters,validation = self.create_validation_set(train) 43 | model = WARP2(self.d,self.gamma,self.C,max_iters,validation_iters,self.batch_size,self.positive_thresh,self.max_trials) 44 | self.description = 'WARP2MF({0})'.format(model) 45 | model.fit(train,item_features,validation) 46 | 47 | self.U = model.U_ 48 | self.V = model.V_ 49 | self.W = model.W_ 50 | 51 | def predict_ratings(self,users=None,item_features=None): 52 | """ 53 | Predict ratings/scores for all items for supplied users. 54 | Assumes you've already called fit() to learn the factors. 55 | 56 | Only call this if you really want predictions for all items. 
57 | To get the top-k recommended items for each user you should 58 | call one of the recommend_items() instead. 59 | 60 | Parameters 61 | ========== 62 | users : int or array-like 63 | Index or indices of users for which to make predictions. 64 | item_features : array_like, shape = [num_items, num_features] 65 | Features for each item in the dataset. 66 | 67 | Returns 68 | ======= 69 | predictions : numpy.ndarray, shape = [len(users), num_items] 70 | Predicted ratings for all items for each supplied user. 71 | """ 72 | if isinstance(users,int): 73 | users = [users] 74 | 75 | if users is None: 76 | U = self.U 77 | else: 78 | U = np.asfortranarray(self.U[users,:]) 79 | return U.dot(self.V.T + item_features.dot(self.W).T) 80 | 81 | def main(file_format,filepath,feature_format,feature_file,outfile): 82 | from mrec import load_sparse_matrix, save_recommender 83 | from mrec.sparse import fast_sparse_matrix 84 | 85 | # load training set 86 | train = load_sparse_matrix(file_format,filepath) 87 | # load item features, assume they are tsv: item_id,feature_id,val 88 | X = load_sparse_matrix(feature_format,feature_file).toarray() 89 | # strip features for any trailing items that don't appear in training set 90 | num_items = train.shape[1] 91 | X = X[:num_items,:] 92 | 93 | model = WARP2MFRecommender(d=100,gamma=0.01,C=100.0,batch_size=10) 94 | model.fit(train,X) 95 | 96 | save_recommender(model,outfile) 97 | 98 | if __name__ == '__main__': 99 | import sys 100 | file_format = sys.argv[1] 101 | filepath = sys.argv[2] 102 | feature_format = sys.argv[3] 103 | feature_file = sys.argv[4] 104 | outfile = sys.argv[5] 105 | 106 | main(file_format,filepath,feature_format,feature_file,outfile) 107 | -------------------------------------------------------------------------------- /mrec/mf/wrmf.py: -------------------------------------------------------------------------------- 1 | """ 2 | Weighted Regularize Matrix Factorization by alternating least squares. 3 | 4 | See: 5 | Y. Hu, Y. Koren and C. Volinsky, Collaborative filtering for implicit feedback datasets, ICDM 2008. 6 | http://research.yahoo.net/files/HuKorenVolinsky-ICDM08.pdf 7 | R. Pan et al., One-class collaborative filtering, ICDM 2008. 8 | http://www.hpl.hp.com/techreports/2008/HPL-2008-48R1.pdf 9 | """ 10 | 11 | import numpy as np 12 | from scipy.sparse import csr_matrix 13 | 14 | from mrec.sparse import fast_sparse_matrix 15 | from mrec.mf.recommender import MatrixFactorizationRecommender 16 | 17 | class WRMFRecommender(MatrixFactorizationRecommender): 18 | """ 19 | Parameters 20 | ========== 21 | d : int 22 | Number of latent factors. 23 | alpha : float 24 | Confidence weight, confidence c = 1 + alpha*r where r is the observed "rating". 25 | lbda : float 26 | Regularization constant. 27 | num_iters : int 28 | Number of iterations of alternating least squares. 29 | """ 30 | 31 | def __init__(self,d,alpha=1,lbda=0.015,num_iters=15): 32 | self.d = d 33 | self.alpha = alpha 34 | self.lbda = lbda 35 | self.num_iters = num_iters 36 | 37 | def __str__(self): 38 | return 'WRMFRecommender (d={0},alpha={1},lambda={2},num_iters={3})'.format(self.d,self.alpha,self.lbda,self.num_iters) 39 | 40 | def init_factors(self,num_factors,assign_values=True): 41 | if assign_values: 42 | return self.d**-0.5*np.random.random_sample((num_factors,self.d)) 43 | return np.empty((num_factors,self.d)) 44 | 45 | def fit(self,train,item_features=None): 46 | """ 47 | Learn factors from training set. User and item factors are 48 | fitted alternately. 
49 | 50 | Parameters 51 | ========== 52 | train : scipy.sparse.csr_matrix or mrec.sparse.fast_sparse_matrix 53 | User-item matrix. 54 | item_features : array_like, shape = [num_items, num_features] 55 | Features for each item in the dataset, ignored here. 56 | """ 57 | if type(train) == csr_matrix: 58 | train = fast_sparse_matrix(train) 59 | 60 | num_users,num_items = train.shape 61 | 62 | self.U = self.init_factors(num_users,False) # don't need values, will compute them 63 | self.V = self.init_factors(num_items) 64 | for it in xrange(self.num_iters): 65 | print 'iteration',it 66 | # fit user factors 67 | VV = self.V.T.dot(self.V) 68 | for u in xrange(num_users): 69 | # get (positive i.e. non-zero scored) items for user 70 | indices = train.X[u].nonzero()[1] 71 | if indices.size: 72 | self.U[u,:] = self.update(indices,self.V,VV) 73 | else: 74 | self.U[u,:] = np.zeros(self.d) 75 | # fit item factors 76 | UU = self.U.T.dot(self.U) 77 | for i in xrange(num_items): 78 | indices = train.fast_get_col(i).nonzero()[0] 79 | if indices.size: 80 | self.V[i,:] = self.update(indices,self.U,UU) 81 | else: 82 | self.V[i,:] = np.zeros(self.d) 83 | 84 | def update(self,indices,H,HH): 85 | """ 86 | Update latent factors for a single user or item. 87 | """ 88 | Hix = H[indices,:] 89 | M = HH + self.alpha*Hix.T.dot(Hix) + np.diag(self.lbda*np.ones(self.d)) 90 | return np.dot(np.linalg.inv(M),(1+self.alpha)*Hix.sum(axis=0)) 91 | 92 | def main(): 93 | import sys 94 | from mrec import load_sparse_matrix, save_recommender 95 | from mrec.sparse import fast_sparse_matrix 96 | from mrec.mf.wrmf import WRMFRecommender 97 | 98 | file_format = sys.argv[1] 99 | filepath = sys.argv[2] 100 | outfile = sys.argv[3] 101 | 102 | # load training set as scipy sparse matrix 103 | train = load_sparse_matrix(file_format,filepath) 104 | 105 | model = WRMFRecommender(d=5) 106 | model.fit(train) 107 | 108 | save_recommender(model,outfile) 109 | 110 | if __name__ == '__main__': 111 | main() 112 | -------------------------------------------------------------------------------- /mrec/parallel/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Mendeley/mrec/d299e3b9490703843b041e6585643b7e42e229f0/mrec/parallel/__init__.py -------------------------------------------------------------------------------- /mrec/parallel/evaluate.py: -------------------------------------------------------------------------------- 1 | """ 2 | Evaluation task to run on an ipython engine. 
3 | """ 4 | 5 | def run(task): 6 | 7 | # import modules required by engine 8 | import numpy as np 9 | from scipy.sparse import coo_matrix 10 | from collections import defaultdict 11 | 12 | from mrec import load_sparse_matrix 13 | 14 | input_format,testfile,recsfile,start,end,evaluator = task 15 | 16 | # load the test data 17 | testdata = load_sparse_matrix(input_format,testfile) 18 | 19 | return evaluator.process(testdata,recsfile,start,end) 20 | -------------------------------------------------------------------------------- /mrec/parallel/item_similarity.py: -------------------------------------------------------------------------------- 1 | import math 2 | import glob 3 | import re 4 | import os 5 | import subprocess 6 | from shutil import rmtree 7 | import logging 8 | 9 | from mrec import load_sparse_matrix, save_recommender 10 | 11 | class ItemSimilarityRunner(object): 12 | 13 | def run(self,view,model,input_format,trainfile,num_engines,simsdir,overwrite,max_sims,simsfile,modelfile): 14 | 15 | logging.info('finding number of items...') 16 | dataset = load_sparse_matrix(input_format,trainfile) 17 | num_users,num_items = dataset.shape 18 | del dataset 19 | logging.info('%d users and %d items', num_users, num_items) 20 | 21 | logging.info('creating sims directory {0}...'.format(simsdir)) 22 | subprocess.check_call(['mkdir','-p',simsdir]) 23 | 24 | done = [] 25 | if not overwrite: 26 | logging.info('checking for existing output sims...') 27 | done.extend(self.find_done(simsdir)) 28 | if done: 29 | logging.info('found {0} output files'.format(len(done))) 30 | 31 | logging.info('creating tasks...') 32 | tasks = self.create_tasks(model,input_format,trainfile,simsdir,num_items,num_engines,max_sims,done) 33 | 34 | if num_engines > 0: 35 | logging.info('running %d tasks in parallel across ipython' 36 | ' engines...', len(tasks)) 37 | async_job = view.map_async(process,tasks,retries=2) 38 | # wait for tasks to complete 39 | results = async_job.get() 40 | else: 41 | # Sequential run to make it easier for debugging 42 | logging.info('training similarity model sequentially') 43 | results = [process(task) for task in tasks] 44 | 45 | logging.info('checking output files...') 46 | done = self.find_done(simsdir) 47 | remaining = len(tasks) - len(done) 48 | if remaining == 0: 49 | logging.info('SUCCESS: all tasks completed') 50 | logging.info('concatenating {0} partial output files...'.format(len(done))) 51 | paths = [os.path.join(simsdir,'sims.{0}-{1}.tsv'.format(start,end)) for start,end in done] 52 | cmd = ['cat']+paths 53 | subprocess.check_call(cmd,stdout=open(simsfile,'w')) 54 | logging.info('removing partial output files...') 55 | rmtree(simsdir) 56 | logging.info('loading %d items in %s model from %s', 57 | num_items, type(model).__name__, simsfile) 58 | model.load_similarity_matrix(simsfile,num_items) 59 | save_recommender(model,modelfile) 60 | logging.info('done') 61 | else: 62 | logging.error('FAILED: {0}/{1} tasks did not complete successfully'.format(remaining,len(tasks))) 63 | logging.error('try rerunning the command to retry the remaining tasks') 64 | 65 | def find_done(self,outdir): 66 | success_files = glob.glob(os.path.join(outdir,'*.SUCCESS')) 67 | r = re.compile('.*?([0-9]+)-([0-9]+)\.SUCCESS$') 68 | done = [] 69 | for path in success_files: 70 | m = r.match(path) 71 | start = int(m.group(1)) 72 | end = int(m.group(2)) 73 | done.append((start,end)) 74 | return done 75 | 76 | def create_tasks(self,model,input_format,trainfile,outdir,num_items,num_engines,max_similar_items,done): 77 | 
if num_engines == 0: 78 | # special marker for sequential run 79 | num_engines = 1 80 | items_per_engine = int(math.ceil(float(num_items)/num_engines)) 81 | tasks = [] 82 | for start in xrange(0,num_items,items_per_engine): 83 | end = min(num_items,start+items_per_engine) 84 | if (start,end) not in done: 85 | tasks.append((model,input_format,trainfile,outdir,start,end,max_similar_items)) 86 | return tasks 87 | 88 | def process(task): 89 | """ 90 | Training task to run on an ipython engine. 91 | """ 92 | 93 | # import modules required by engine 94 | import os 95 | import subprocess 96 | from mrec import load_fast_sparse_matrix 97 | 98 | model,input_format,trainfile,outdir,start,end,max_similar_items = task 99 | 100 | # initialise the model 101 | dataset = load_fast_sparse_matrix(input_format,trainfile) 102 | if hasattr(model,'similarity_matrix'): 103 | # clear out any existing similarity matrix to trigger recomputation of 104 | # the item-item similarities from the users' ratings. 105 | model.similarity_matrix = None 106 | 107 | # write sims directly to file as we compute them 108 | outfile = os.path.join(outdir,'sims.{0}-{1}.tsv'.format(start,end)) 109 | out = open(outfile,'w') 110 | for j in xrange(start,end): 111 | w = model.get_similar_items(j,max_similar_items=max_similar_items,dataset=dataset) 112 | for k,v in w: 113 | print >>out,'{0}\t{1}\t{2}'.format(j+1,k+1,v) # write as 1-indexed 114 | out.close() 115 | 116 | # record success 117 | cmd = ['touch',os.path.join(outdir,'{0}-{1}.SUCCESS'.format(start,end))] 118 | subprocess.check_call(cmd) 119 | 120 | # return the range that we've processed 121 | return start,end 122 | -------------------------------------------------------------------------------- /mrec/parallel/predict.py: -------------------------------------------------------------------------------- 1 | """ 2 | Prediction task to run on an ipython engine. 
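Each engine handles a contiguous range of users [start, end): it loads the
saved recommender and the training matrix, optionally generates the top
recommendations for those users, writing them 1-indexed to a partial
recs.<start>-<end>.tsv file and touching a <start>-<end>.SUCCESS marker,
and finally returns the evaluation metrics for that file against the test set.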
3 | """ 4 | 5 | def run(task): 6 | 7 | # import modules required by engine 8 | import os 9 | import subprocess 10 | import numpy as np 11 | from scipy.sparse import coo_matrix 12 | 13 | from mrec import load_sparse_matrix, load_recommender 14 | from mrec.evaluation import Evaluator 15 | 16 | modelfile,input_format,trainfile,test_input_format,testfile,feature_format,featurefile,outdir,start,end,evaluator,generate = task 17 | 18 | # initialise the model 19 | model = load_recommender(modelfile) 20 | 21 | outfile = os.path.join(outdir,'recs.{0}-{1}.tsv'.format(start,end)) 22 | 23 | if generate: 24 | # generate recommendations for our batch of users 25 | dataset = load_sparse_matrix(input_format,trainfile) 26 | out = open(outfile,'w') 27 | if featurefile is not None: 28 | # currently runs much faster if features are loaded as a dense matrix 29 | item_features = load_sparse_matrix(feature_format,featurefile).toarray() 30 | # strip features for any trailing items that don't appear in training set 31 | num_items = dataset.shape[1] 32 | item_features = item_features[:num_items,:] 33 | recs = model.range_recommend_items(dataset,start,end,max_items=20,return_scores=True,item_features=item_features) 34 | else: 35 | recs = model.range_recommend_items(dataset,start,end,max_items=20,return_scores=True) 36 | for u,items in zip(xrange(start,end),recs): 37 | for i,w in items: 38 | print >>out,'{0}\t{1}\t{2}'.format(u+1,i+1,w) # write as 1-indexed 39 | out.close() 40 | 41 | # record success 42 | cmd = ['touch',os.path.join(outdir,'{0}-{1}.SUCCESS'.format(start,end))] 43 | subprocess.check_call(cmd) 44 | 45 | # load the test data 46 | testdata = load_sparse_matrix(test_input_format,testfile).tocsr() 47 | 48 | # return evaluation metrics 49 | return evaluator.process(testdata,outfile,start,end) 50 | -------------------------------------------------------------------------------- /mrec/parallel/warp.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import re 3 | import os 4 | import subprocess 5 | from shutil import rmtree 6 | import logging 7 | import numpy as np 8 | 9 | from mrec import save_recommender, load_recommender 10 | 11 | class WARPMFRunner(object): 12 | 13 | def run(self, 14 | view, 15 | model, 16 | input_format, 17 | trainfile, 18 | feature_format, 19 | featurefile, 20 | num_engines, 21 | workdir, 22 | overwrite, 23 | modelfile): 24 | 25 | logging.info('creating models directory {0}...'.format(workdir)) 26 | subprocess.check_call(['mkdir','-p',workdir]) 27 | 28 | done = [] 29 | if not overwrite: 30 | logging.info('checking for existing output models...') 31 | done.extend(self.find_done(workdir)) 32 | if done: 33 | logging.info('found {0} output files'.format(len(done))) 34 | 35 | logging.info('creating tasks...') 36 | tasks = self.create_tasks(model, 37 | input_format, 38 | trainfile, 39 | feature_format, 40 | featurefile, 41 | workdir, 42 | num_engines, 43 | done) 44 | 45 | if tasks: 46 | logging.info('running in parallel across ipython engines...') 47 | async_job = view.map_async(process,tasks,retries=2) 48 | 49 | # wait for tasks to complete 50 | results = async_job.get() 51 | 52 | logging.info('checking output files...') 53 | done = self.find_done(workdir) 54 | remaining = len(tasks) - len(done) 55 | else: 56 | remaining = 0 57 | 58 | if remaining == 0: 59 | logging.info('SUCCESS: all tasks completed') 60 | logging.info('concatenating {0} models...'.format(len(done))) 61 | for ix in sorted(done): 62 | partial_model = 
load_recommender(self.get_modelfile(ix,workdir)) 63 | if ix == 0: 64 | model = partial_model 65 | else: 66 | # concatenate factors 67 | model.d += partial_model.d 68 | model.U = np.hstack((model.U,partial_model.U)) 69 | model.V = np.hstack((model.V,partial_model.V)) 70 | if hasattr(model,'W'): 71 | model.W = np.hstack((model.W,partial_model.W)) 72 | save_recommender(model,modelfile) 73 | logging.info('removing partial output files...') 74 | rmtree(workdir) 75 | logging.info('done') 76 | else: 77 | logging.error('FAILED: {0}/{1} tasks did not complete successfully'.format(remaining,len(tasks))) 78 | logging.error('try rerunning the command to retry the remaining tasks') 79 | 80 | def create_tasks(self, 81 | model, 82 | input_format, 83 | trainfile, 84 | feature_format, 85 | featurefile, 86 | outdir, 87 | num_engines, 88 | done): 89 | tasks = [] 90 | for ix in xrange(num_engines): 91 | if ix not in done: 92 | outfile = self.get_modelfile(ix,outdir) 93 | tasks.append((model,input_format,trainfile,feature_format,featurefile,outfile,ix,num_engines)) 94 | return tasks 95 | 96 | def find_done(self,outdir): 97 | success_files = glob.glob(os.path.join(outdir,'*.SUCCESS')) 98 | r = re.compile('.*?([0-9]+)\.model\.npz\.SUCCESS$') 99 | done = [] 100 | for path in success_files: 101 | m = r.match(path) 102 | ix = int(m.group(1)) 103 | done.append(ix) 104 | return done 105 | 106 | def get_modelfile(self,ix,workdir): 107 | return os.path.join(workdir,'{0}.model.npz'.format(ix)) 108 | 109 | def process(task): 110 | """ 111 | Training task to run on an ipython engine. 112 | """ 113 | 114 | # import modules required by engine 115 | import os 116 | import subprocess 117 | from mrec import load_sparse_matrix, save_recommender 118 | 119 | model,input_format,trainfile,feature_format,featurefile,outfile,offset,step = task 120 | 121 | dataset = load_sparse_matrix(input_format,trainfile) 122 | if featurefile is not None: 123 | # currently runs much faster if features are loaded as a dense matrix 124 | item_features = load_sparse_matrix(feature_format,featurefile).toarray() 125 | # strip features for any trailing items that don't appear in training set 126 | num_items = dataset.shape[1] 127 | item_features = item_features[:num_items,:] 128 | model.fit(dataset,item_features=item_features) 129 | else: 130 | model.fit(dataset) 131 | save_recommender(model,outfile) 132 | 133 | # record success 134 | cmd = ['touch','{0}.SUCCESS'.format(outfile)] 135 | subprocess.check_call(cmd) 136 | 137 | # return the offset for the samples that we've learned from 138 | return offset 139 | -------------------------------------------------------------------------------- /mrec/parallel/wrmf.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import logging 3 | import os 4 | import subprocess 5 | from shutil import rmtree 6 | import math 7 | import numpy as np 8 | 9 | from mrec import load_sparse_matrix, save_recommender 10 | 11 | def get_user_indices(data,u): 12 | # get (positive i.e. 
non-zero scored) items for user 13 | return data.X[u].nonzero()[1] 14 | 15 | def get_item_indices(data,i): 16 | # get users for item 17 | return data.fast_get_col(i).nonzero()[0] 18 | 19 | def get_factor_files(workdir,factor_type): 20 | # return partial factor files in sorted order so they can simply be stacked 21 | factor_files = glob.glob(os.path.join(workdir,'{0}.*.npy'.format(factor_type))) 22 | return sorted(factor_files,key=lambda x: int(x[:-4][x[:-4].rfind('.')+1:])) 23 | 24 | def get_user_factor_files(workdir): 25 | return get_factor_files(workdir,'U') 26 | 27 | def get_item_factor_files(workdir): 28 | return get_factor_files(workdir,'V') 29 | 30 | def init_item_factors(model,data): 31 | num_users,num_items = data.shape 32 | return model.init_factors(num_items) 33 | 34 | class WRMFRunner(object): 35 | 36 | def run(self,view,model,input_format,trainfile,num_engines,workdir,modelfile): 37 | logging.info('creating factors directory {0}'.format(workdir)) 38 | subprocess.check_call(['mkdir','-p',workdir]) 39 | 40 | logging.info('getting data size') 41 | data = load_sparse_matrix(input_format,trainfile) 42 | num_users,num_items = data.shape 43 | del data 44 | 45 | for it in xrange(model.num_iters): 46 | logging.info('iteration {0}'.format(it)) 47 | tasks = self.create_tasks(num_users,num_engines,model,input_format,trainfile,workdir,'U',get_user_indices,get_item_factor_files,init_item_factors) 48 | self.run_tasks(view,tasks) 49 | tasks = self.create_tasks(num_items,num_engines,model,input_format,trainfile,workdir,'V',get_item_indices,get_user_factor_files,None) # won't need to initialize user factors 50 | self.run_tasks(view,tasks) 51 | 52 | model.U = np.vstack([np.load(f) for f in get_user_factor_files(workdir)]) 53 | model.V = np.vstack([np.load(f) for f in get_item_factor_files(workdir)]) 54 | 55 | save_recommender(model,modelfile) 56 | 57 | logging.info('removing partial output files') 58 | rmtree(workdir) 59 | logging.info('done') 60 | 61 | def run_tasks(self,view,tasks): 62 | async_job = view.map_async(compute_factors,tasks,retries=2) 63 | # wait for tasks to complete 64 | result = async_job.get() 65 | 66 | def create_tasks(self,num_factors,num_engines,model,input_format,trainfile,workdir,factor_type,get_indices,get_fixed_factor_files,init_fixed_factors): 67 | factors_per_engine = int(math.ceil(float(num_factors)/num_engines)) 68 | tasks = [] 69 | for start in xrange(0,num_factors,factors_per_engine): 70 | end = min(num_factors,start+factors_per_engine) 71 | fixed_factor_files = get_fixed_factor_files(workdir) 72 | tasks.append((model,input_format,trainfile,factor_type,get_indices,init_fixed_factors,fixed_factor_files,start,end,workdir)) 73 | return tasks 74 | 75 | def compute_factors(task): 76 | """ 77 | WRMF update method to run on an IPython engine. 78 | This reads from file and writes back to file, 79 | only filepaths and an empty model need to be passed. 
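The factors held fixed in this half-iteration (H) are stacked from the partial
.npy files written in the previous half-iteration, or initialised from the
model if none exist yet; H^T H is computed once, each factor row in
[start, end) is then updated independently from it, and the resulting block is
saved to the work directory as <factor_type>.<start>.npy.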
80 | """ 81 | 82 | # import modules needed on engine 83 | import os 84 | import numpy as np 85 | from mrec import load_fast_sparse_matrix 86 | 87 | model,input_format,trainfile,factor_type,get_indices,init_fixed_factors,fixed_factor_files,start,end,workdir = task 88 | 89 | data = load_fast_sparse_matrix(input_format,trainfile) 90 | 91 | if fixed_factor_files: 92 | H = np.vstack([np.load(f) for f in fixed_factor_files]) 93 | else: 94 | H = init_fixed_factors(model,data) 95 | 96 | HH = H.T.dot(H) 97 | W = np.zeros(((end-start),model.d)) 98 | for j in xrange(start,end): 99 | indices = get_indices(data,j) 100 | if indices.size: 101 | W[j-start,:] = model.update(indices,H,HH) 102 | 103 | np.save(os.path.join(workdir,'{0}.{1}.npy'.format(factor_type,start)),W) 104 | 105 | return start,end 106 | -------------------------------------------------------------------------------- /mrec/popularity.py: -------------------------------------------------------------------------------- 1 | """ 2 | Trivial unpersonalized item popularity recommender 3 | intended to provide a baseline for evaluations. 4 | """ 5 | 6 | import numpy as np 7 | 8 | from base_recommender import BaseRecommender 9 | from sparse import fast_sparse_matrix 10 | 11 | class ItemPopularityRecommender(BaseRecommender): 12 | """ 13 | Create an unpersonalized item popularity recommender, useful 14 | to provide a baseline for comparison with a "real" one. 15 | 16 | Parameters 17 | ---------- 18 | 19 | method : 'count', 'sum', 'avg' or 'thresh' (default: 'count') 20 | How to calculate the popularity of an item based on its ratings 21 | from all users: 22 | count - popularity is its total number of ratings of any value 23 | sum - popularity is the sum of its ratings 24 | avg - popularity is its mean rating 25 | thresh - popularity is its number of ratings higher than thresh 26 | thresh : float, optional 27 | The threshold used by the 'thresh' method of calculating item 28 | popularity. 29 | """ 30 | 31 | def __init__(self,method='count',thresh=0): 32 | self.description = 'ItemPop' 33 | if method not in ['count','sum','avg','thresh']: 34 | raise ValueError('invalid value for method parameter') 35 | self.method = method 36 | self.thresh = thresh 37 | 38 | def fit(self,dataset,item_features=None): 39 | """ 40 | Compute the most popular items using the method specified 41 | in the constructor. 42 | 43 | Parameters 44 | ---------- 45 | dataset : scipy sparse matrix or mrec.sparse.fast_sparse_matrix 46 | The user-item matrix. 47 | item_features : array_like, shape = [num_items, num_features] 48 | Features for items in training set, ignored here. 
49 | """ 50 | if isinstance(dataset,fast_sparse_matrix): 51 | d = dataset.X.tocsc() 52 | else: 53 | d = dataset.tocsc() 54 | if self.method == 'count': 55 | # count the total number of ratings for each item 56 | popularity = [(d[:,i].nnz,i) for i in xrange(d.shape[1])] 57 | elif self.method == 'sum': 58 | # find the sum of the ratings for each item 59 | popularity = [(d[:,i].sum(),i) for i in xrange(d.shape[1])] 60 | elif self.method == 'avg': 61 | # find the mean rating for each item 62 | popularity = [(d[:,i].mean(),i) for i in xrange(d.shape[1])] 63 | elif self.method == 'thresh': 64 | # count the number of ratings above thresh for each item 65 | popularity = [(sum(d[:,i].data>self.thresh),i) for i in xrange(d.shape[1])] 66 | popularity.sort(reverse=True) 67 | self.pop_items = [(i,c) for (c,i) in popularity] 68 | 69 | def recommend_items(self,dataset,u,max_items=10,return_scores=True,item_features=None): 70 | """ 71 | Recommend new items for a user. Assumes you've already called 72 | fit(). 73 | 74 | Parameters 75 | ---------- 76 | dataset : scipy.sparse.csr_matrix 77 | User-item matrix containing known items. 78 | u : int 79 | Index of user for which to make recommendations (for 80 | compatibility with other recommenders). 81 | max_items : int 82 | Maximum number of recommended items to return. 83 | return_scores : bool 84 | If true return a score along with each recommended item. 85 | item_features : array_like, shape = [num_items, num_features] 86 | Features for items in training set, ignored here. 87 | 88 | Returns 89 | ------- 90 | recs : list 91 | List of (idx,score) pairs if return_scores is True, else 92 | just a list of idxs. 93 | """ 94 | known_items = set(dataset[u].indices) 95 | recs = [] 96 | for i,c in self.pop_items: 97 | if i not in known_items: 98 | if return_scores: 99 | recs.append((i,c)) 100 | else: 101 | recs.append(i) 102 | if len(recs) >= max_items: 103 | break 104 | return recs 105 | 106 | def __str__(self): 107 | return self.description 108 | -------------------------------------------------------------------------------- /mrec/reranking_recommender.py: -------------------------------------------------------------------------------- 1 | """ 2 | Recommender that gets candidates using an item similarity model 3 | and then reranks them using a matrix factorization model. 4 | """ 5 | 6 | try: 7 | import cPickle as pickle 8 | except ImportError: 9 | import pickle 10 | import numpy as np 11 | 12 | from base_recommender import BaseRecommender 13 | 14 | class RerankingRecommender(BaseRecommender): 15 | """ 16 | A secondary recommender that combines an item similarity 17 | model and a matrix factorization one. The item similarity 18 | model is used to select candidate items for each user which 19 | are then reranked based on their latent factors. 20 | 21 | Parameters 22 | ========== 23 | item_similarity_recommender : mrec.item_similarity.recommender.ItemSimilarityRecommender 24 | The model used to select candidates. 25 | mf_recommender : mrec.mf.recommender.MatrixFactorizationRecommender 26 | The model used to rerank them. 27 | num_candidates : int (default: 100) 28 | The number of candidate items drawn from the first model for each user. 
29 | """ 30 | 31 | def __init__(self,item_similarity_recommender,mf_recommender,num_candidates=100): 32 | self.item_similarity_recommender = item_similarity_recommender 33 | self.mf_recommender = mf_recommender 34 | self.num_candidates = num_candidates 35 | self.description = 'RerankingRecommender({0},{1})'.format(self.item_similarity_recommender,self.mf_recommender) 36 | 37 | def _create_archive(self): 38 | archive = self.item_similarity_recommender._create_archive() 39 | archive['item_similarity_model'] = archive['model'] 40 | archive.update(self.mf_recommender._create_archive()) 41 | archive['mf_model'] = archive['model'] 42 | tmp = self.item_similarity_recommender,self.mf_recommender 43 | self.item_similarity_model = self.mf_recommender = None 44 | m = pickle.dumps(self) 45 | self.item_similarity_model,self.mf_recommender = tmp 46 | archive['model'] = m 47 | return archive 48 | 49 | def _load_archive(self,archive): 50 | self.item_similarity_recommender = np.loads(str(archive['item_similarity_model'])) 51 | self.item_similarity_recommender._load_archive(archive) 52 | self.mf_recommender = np.loads(str(archive['mf_model'])) 53 | self.mf_recommender._load_archive(archive) 54 | 55 | def fit(self,train,item_features=None): 56 | """ 57 | Fit both models to the training data. 58 | 59 | Parameters 60 | ========== 61 | train : scipy.sparse.csr_matrix, shape = [num_users, num_items] 62 | The training user-item matrix. 63 | item_features : array_like, shape = [num_items, num_features] 64 | Features for items in training set, required by some recommenders. 65 | 66 | Notes 67 | ===== 68 | You are not obliged to call this, alternatively you can pass 69 | ready trained models to the RerankingRecommender constructor. 70 | """ 71 | self.item_similarity_recommender.fit(train,item_features) 72 | self.mf_recommender.fit(train,item_features) 73 | 74 | def rerank(self,u,candidates,max_items,return_scores): 75 | """ 76 | Use latent factors to rerank candidate recommended items for a user 77 | and return the highest scoring. 78 | 79 | Parameters 80 | ========== 81 | u : int 82 | Index of user for which to make recommendations. 83 | candidates : array like 84 | List of candidate item indices. 85 | max_items : int 86 | Maximum number of recommended items to return. 87 | return_scores : bool 88 | If true return a score along with each recommended item. 89 | 90 | Returns 91 | ======= 92 | recs : list 93 | List of (idx,score) pairs if return_scores is True, else 94 | just a list of idxs. 95 | """ 96 | r = self.mf_recommender.U[u].dot(self.mf_recommender.V[candidates].T) 97 | reranked = r.argsort()[:-1-max_items:-1] 98 | if return_scores: 99 | recs = [(candidates[i],r[i]) for i in reranked] 100 | else: 101 | recs = [candidates[i] for i in reranked] 102 | return recs 103 | 104 | def recommend_items(self,dataset,u,max_items=10,return_scores=True,item_features=None): 105 | """ 106 | Recommend new items for a user. 107 | 108 | Parameters 109 | ========== 110 | dataset : scipy.sparse.csr_matrix 111 | User-item matrix containing known items. 112 | u : int 113 | Index of user for which to make recommendations. 114 | max_items : int 115 | Maximum number of recommended items to return. 116 | return_scores : bool 117 | If true return a score along with each recommended item. 118 | item_features : array_like, shape = [num_items, num_features] 119 | Features for items in training set, required by some recommenders. 
120 | 121 | Returns 122 | ======= 123 | recs : list 124 | List of (idx,score) pairs if return_scores is True, else 125 | just a list of idxs. 126 | """ 127 | candidates = self.item_similarity_recommender.recommend_items(dataset,u,self.num_candidates,return_scores=False) 128 | return self.rerank(u,candidates,max_items,return_scores=return_scores) 129 | 130 | def batch_recommend_items(self, 131 | dataset, 132 | max_items=10, 133 | return_scores=True, 134 | item_features=None): 135 | """ 136 | Recommend new items for all users in the training dataset. Assumes 137 | you've already called fit() to learn the similarity matrix. 138 | 139 | Parameters 140 | ========== 141 | dataset : scipy.sparse.csr_matrix 142 | User-item matrix containing known items. 143 | max_items : int 144 | Maximum number of recommended items to return. 145 | return_scores : bool 146 | If true return a score along with each recommended item. 147 | show_progress: bool 148 | If true print something to stdout to show progress. 149 | item_features : array_like, shape = [num_items, num_features] 150 | Features for items in training set, required by some recommenders. 151 | 152 | Returns 153 | ======= 154 | recs : list of lists 155 | Each entry is a list of (idx,score) pairs if return_scores is True, 156 | else just a list of idxs. 157 | """ 158 | recs = self.item_similarity_recommender.batch_recommend_items(dataset,self.num_candidates,return_scores=False,item_features=item_features) 159 | for u,candidates in enumerate(recs): 160 | recs[u] = self.rerank(u,candidates,max_items,return_scores=return_scores) 161 | return recs 162 | 163 | def range_recommend_items(self, 164 | dataset, 165 | user_start, 166 | user_end, 167 | max_items=10, 168 | return_scores=True, 169 | item_features=None): 170 | """ 171 | Recommend new items for a range of users in the training dataset. 172 | Assumes you've already called fit() to learn the similarity matrix. 173 | 174 | Parameters 175 | ========== 176 | dataset : scipy.sparse.csr_matrix 177 | User-item matrix containing known items. 178 | user_start : int 179 | Index of first user in the range to recommend. 180 | user_end : int 181 | Index one beyond last user in the range to recommend. 182 | max_items : int 183 | Maximum number of recommended items to return. 184 | return_scores : bool 185 | If true return a score along with each recommended item. 186 | item_features : array_like, shape = [num_items, num_features] 187 | Features for items in training set, required by some recommenders. 188 | 189 | Returns 190 | ======= 191 | recs : list of lists 192 | Each entry is a list of (idx,score) pairs if return_scores is True, 193 | else just a list of idxs. 
194 | """ 195 | recs = self.item_similarity_recommender.range_recommend_items(dataset,user_start,user_end,self.num_candidates,return_scores=False,item_features=item_features) 196 | for u,candidates in enumerate(recs): 197 | recs[u] = self.rerank(user_start+u,candidates,max_items,return_scores=return_scores) 198 | return recs 199 | 200 | def main(): 201 | import sys 202 | from mrec import load_sparse_matrix, save_recommender 203 | from mrec.sparse import fast_sparse_matrix 204 | from mrec.item_similarity.knn import CosineKNNRecommender 205 | from mrec.mf.warp import WARPMFRecommender 206 | from mrec.reranking_recommender import RerankingRecommender 207 | 208 | file_format = sys.argv[1] 209 | filepath = sys.argv[2] 210 | outfile = sys.argv[3] 211 | 212 | # load training set as scipy sparse matrix 213 | train = load_sparse_matrix(file_format,filepath) 214 | 215 | item_sim_model = CosineKNNRecommender(k=100) 216 | mf_model = WARPMFRecommender(d=80,gamma=0.01,C=100.0,max_iters=25000,validation_iters=1000,batch_size=10) 217 | recommender = RerankingRecommender(item_sim_model,mf_model,num_candidates=100) 218 | 219 | recommender.fit(train) 220 | 221 | save_recommender(recommender,outfile) 222 | 223 | if __name__ == '__main__': 224 | main() 225 | 226 | -------------------------------------------------------------------------------- /mrec/testing.py: -------------------------------------------------------------------------------- 1 | import random 2 | import numpy as np 3 | from scipy.sparse import coo_matrix 4 | from sklearn.utils.testing import assert_array_equal 5 | 6 | def get_random_coo_matrix(rows=3,cols=10,nnz=20): 7 | row_col = random.sample(xrange(rows*cols),nnz) # ensure are unique 8 | row = [i // cols for i in row_col] 9 | col = [i % cols for i in row_col] 10 | data = np.random.randint(0,nnz*5,nnz) 11 | return coo_matrix((data,(row,col)),shape=(rows,cols)) 12 | 13 | def assert_sparse_matrix_equal(X,Y): 14 | expected = X.toarray() 15 | actual = Y.toarray() 16 | # it's possible that we had trailing empty columns in X 17 | # - there's no way we can know about these sometimes e.g. 
18 | # when reading back from file 19 | expected = expected[:actual.shape[0],:actual.shape[1]] 20 | assert_array_equal(expected,actual) 21 | 22 | -------------------------------------------------------------------------------- /mrec/tests/test_base_recommender.py: -------------------------------------------------------------------------------- 1 | try: 2 | import cPickle as pickle 3 | except ImportError: 4 | import pickle 5 | import tempfile 6 | import os 7 | import numpy as np 8 | from nose.tools import assert_less_equal 9 | from sklearn.utils.testing import assert_raises 10 | from sklearn.utils.testing import assert_equal 11 | from sklearn.utils.testing import assert_array_equal 12 | 13 | from mrec.testing import get_random_coo_matrix 14 | 15 | from mrec.base_recommender import BaseRecommender 16 | 17 | class MyRecommender(BaseRecommender): 18 | def __init__(self): 19 | self.foo = np.ndarray(range(10)) 20 | self.description = 'my recommender' 21 | def _create_archive(self): 22 | tmp = self.foo 23 | self.foo = None 24 | m = pickle.dumps(self) 25 | self.foo = tmp 26 | return {'model':m,'foo':self.foo} 27 | def _load_archive(self,archive): 28 | self.foo = archive['foo'] 29 | 30 | def save_load(r): 31 | f,path = tempfile.mkstemp(suffix='.npz') 32 | r.save(path) 33 | return BaseRecommender.load(path) 34 | 35 | def check_read_description(r): 36 | f,path = tempfile.mkstemp(suffix='.npz') 37 | r.save(path) 38 | d = BaseRecommender.read_recommender_description(path) 39 | assert_equal(str(r),d) 40 | 41 | def test_save_filepath_condition(): 42 | r = BaseRecommender() 43 | invalid_filepath = 'no suffix' 44 | assert_raises(ValueError,r.save,invalid_filepath) 45 | 46 | def test_save_load(): 47 | r = save_load(BaseRecommender()) 48 | assert_equal(type(r),BaseRecommender) 49 | r = MyRecommender() 50 | r2 = save_load(r) 51 | assert_equal(type(r2),type(r)) 52 | assert_array_equal(r2.foo,r.foo) 53 | assert_equal(r2.description,r.description) 54 | 55 | def test_read_recommender_description(): 56 | check_read_description(BaseRecommender()) 57 | check_read_description(MyRecommender()) 58 | 59 | def test_zero_known_item_scores(): 60 | train = get_random_coo_matrix().tocsr() 61 | predictions = np.random.random_sample(train.shape) 62 | r = BaseRecommender() 63 | safe = r._zero_known_item_scores(predictions,train) 64 | num_users,num_items = predictions.shape 65 | for u in xrange(num_users): 66 | for i in xrange(num_items): 67 | if i in train[u].indices: 68 | assert_less_equal(safe[u,i],0) 69 | else: 70 | assert_equal(safe[u,i],predictions[u,i]) 71 | -------------------------------------------------------------------------------- /mrec/tests/test_mrec.py: -------------------------------------------------------------------------------- 1 | import tempfile 2 | import os 3 | 4 | from mrec.testing import get_random_coo_matrix 5 | from mrec.testing import assert_sparse_matrix_equal 6 | 7 | from mrec import load_sparse_matrix 8 | from mrec import save_sparse_matrix 9 | 10 | def test_save_load_sparse_matrix(): 11 | X = get_random_coo_matrix() 12 | for fmt in ['tsv','csv','npz','mm','fsm']: 13 | if fmt == 'mm': 14 | suffix = '.mtx' 15 | elif fmt == 'npz' or fmt == 'fsm': 16 | suffix = '.npz' 17 | else: 18 | suffix = '' 19 | f,path = tempfile.mkstemp(suffix=suffix) 20 | save_sparse_matrix(X,fmt,path) 21 | Y = load_sparse_matrix(fmt,path) 22 | assert_sparse_matrix_equal(X,Y) 23 | os.remove(path) 24 | -------------------------------------------------------------------------------- /mrec/tests/test_sparse.py: 
-------------------------------------------------------------------------------- 1 | import tempfile 2 | import os 3 | from sklearn.utils.testing import assert_equal 4 | from sklearn.utils.testing import assert_array_equal 5 | 6 | from mrec.testing import get_random_coo_matrix 7 | from mrec.testing import assert_sparse_matrix_equal 8 | 9 | from mrec.sparse import loadtxt 10 | from mrec.sparse import savez 11 | from mrec.sparse import loadz 12 | from mrec.sparse import fast_sparse_matrix 13 | 14 | def test_loadtxt(): 15 | X = get_random_coo_matrix() 16 | f,path = tempfile.mkstemp(suffix='.npz') 17 | with open(path,'w') as f: 18 | for i,j,v in zip(X.row,X.col,X.data): 19 | print >>f,'{0}\t{1}\t{2}'.format(i+1,j+1,v) 20 | Y = loadtxt(path) 21 | os.remove(path) 22 | assert_sparse_matrix_equal(X,Y) 23 | 24 | def test_savez_loadz(): 25 | m = get_random_coo_matrix() 26 | f,path = tempfile.mkstemp(suffix='.npz') 27 | savez(m,path) 28 | n = loadz(path) 29 | os.remove(path) 30 | assert_array_equal(n.toarray(),m.toarray()) 31 | 32 | def test_init_fast_sparse_matrix(): 33 | X = get_random_coo_matrix() 34 | Y = X.tocsr() 35 | Z = X.tocsc() 36 | for M in [X,Y,Z]: 37 | m = fast_sparse_matrix(M) 38 | assert_array_equal(m.X.toarray(),M.toarray()) 39 | assert_equal(m.shape,M.shape) 40 | 41 | def test_fast_get_col(): 42 | X = get_random_coo_matrix().tocsc() 43 | m = fast_sparse_matrix(X) 44 | rows,cols = X.shape 45 | for j in xrange(cols): 46 | assert_array_equal(m.fast_get_col(j).toarray(),X[:,j].toarray()) 47 | 48 | def test_fast_update_col(): 49 | X = get_random_coo_matrix().tocsc() 50 | m = fast_sparse_matrix(X) 51 | cols = X.shape[1] 52 | for j in xrange(cols): 53 | vals = m.fast_get_col(j).data 54 | if (vals==0).all(): 55 | continue 56 | vals[vals!=0] += 1 57 | m.fast_update_col(j,vals) 58 | expected = X[:,j].toarray() 59 | for i in xrange(expected.shape[0]): 60 | if expected[i] != 0: 61 | expected[i] += 1 62 | assert_array_equal(m.fast_get_col(j).toarray(),expected) 63 | 64 | def test_save_load(): 65 | """Save to file as arrays in numpy binary format.""" 66 | X = get_random_coo_matrix() 67 | m = fast_sparse_matrix(X) 68 | f,path = tempfile.mkstemp(suffix='.npz') 69 | m.save(path) 70 | n = fast_sparse_matrix.load(path) 71 | os.remove(path) 72 | assert_equal(m.shape,n.shape) 73 | assert_array_equal(m.X.toarray(),n.X.toarray()) 74 | assert_array_equal(m.col_view.toarray(),n.col_view.toarray()) 75 | 76 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | from setuptools import setup, find_packages, Extension 4 | 5 | from Cython.Distutils import build_ext 6 | import numpy 7 | 8 | import mrec 9 | 10 | with open('README.rst') as f: 11 | long_description = f.read() 12 | 13 | setup(packages=find_packages(), 14 | version=mrec.__version__, 15 | maintainer='Mark Levy', 16 | name='mrec', 17 | package_dir={'':'.'}, 18 | maintainer_email='mark.levy@mendeley.com', 19 | description='mrec recommender systems library', 20 | long_description=long_description, 21 | url='https://github.com/mendeley/mrec', 22 | download_url='https://github.com/mendeley/mrec/tarball/master#egg=mrec-'+mrec.__version__, 23 | classifiers=['Development Status :: 4 - Beta', 24 | 'Environment :: Console', 25 | 'License :: OSI Approved :: BSD License', 26 | 'Operating System :: Unix', 27 | 'Programming Language :: Python', 28 | 'Topic :: Scientific/Engineering',], 29 | install_requires=['numpy', 30 
| 'scipy', 31 | 'scikit-learn', 32 | 'ipython <= 4.0.0', 33 | 'cython', 34 | 'psutil'], 35 | entry_points={ 36 | 'console_scripts':[ 37 | 'mrec_prepare = mrec.examples.prepare:main', 38 | 'mrec_train = mrec.examples.train:main', 39 | 'mrec_predict = mrec.examples.predict:main', 40 | 'mrec_evaluate = mrec.examples.evaluate:main', 41 | 'mrec_tune = mrec.examples.tune_slim:main', 42 | 'mrec_convert = mrec.examples.convert:main', 43 | 'mrec_factors = mrec.examples.factors:main', 44 | ]}, 45 | cmdclass={'build_ext':build_ext}, 46 | ext_modules=[Extension('warp_fast', 47 | sources=['mrec/mf/model/warp_fast.pyx'], 48 | include_dirs=[numpy.get_include()]), 49 | ] 50 | ) 51 | --------------------------------------------------------------------------------
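For reference, a minimal sketch of driving the library directly from Python,
mirroring the main() helper in mrec/reranking_recommender.py (the filenames
below are placeholders):

    from mrec import load_sparse_matrix, save_recommender
    from mrec.item_similarity.knn import CosineKNNRecommender

    # load a training matrix in one of the supported formats
    # ('tsv', 'csv', 'mm', 'npz' or 'fsm')
    train = load_sparse_matrix('tsv', 'train.tsv')  # placeholder path

    model = CosineKNNRecommender(k=100)
    model.fit(train)

    # trained recommenders are saved as numpy .npz archives
    save_recommender(model, 'knn.model.npz')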