├── MANIFEST.in
├── setup.cfg
├── .gitignore
├── setup.py
├── LICENSE.txt
├── examples
    ├── simple.py
    └── Dockerfile
├── Dockerfile
├── README.md
└── pywFM
    └── __init__.py


/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include README.md
2 | include LICENSE.txt
3 | 


--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | description-file = README.md
3 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # libFM folder
 2 | libfm
 3 | 
 4 | # Compiled python modules.
 5 | *.pyc
 6 | 
 7 | # Setuptools distribution folder.
 8 | /dist/
 9 | /build/
10 | 
11 | # Python egg metadata, regenerated from source files by setuptools.
12 | /*.egg-info
13 | .idea/
14 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | # http://python-packaging-user-guide.readthedocs.org/en/latest/distributing/#uploading-your-project-to-pypi
 2 | # to publish package:
 3 | # 1) python setup.py register
 4 | # 2) python setup.py sdist bdist_wheel upload
 5 | # 3) Convert pypi documentation (http://devotter.com/converter)
 6 | 
 7 | from setuptools import setup
 8 | 
 9 | setup(name='pywFM',
10 |       version='0.12.3',
11 |       description='Python wrapper for libFM',
12 |       classifiers=[
13 |         'Development Status :: 3 - Alpha',
14 |         'License :: OSI Approved :: MIT License',
15 |         'Programming Language :: Python :: 2.7',
16 |         'Topic :: Scientific/Engineering :: Information Analysis',
17 |       ],
18 |       keywords='python wrapper libfm factorization machines',
19 |       url='http://github.com/jfloff/pywFM',
20 |       author='Joao Loff',
21 |       author_email='jfloff@gmail.com',
22 |       license='MIT',
23 |       packages=['pywFM'],
24 |       install_requires=[
25 |         'numpy',
26 |         'scipy',
27 |         'scikit-learn',
28 |         'pandas'
29 |       ],
30 |       zip_safe=False)
31 | 


--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
 1 | MIT COPYRIGHT (c) 2015 João Ferreira Loff
 2 | 
 3 | MIT License
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining
 6 | a copy of this software and associated documentation files (the
 7 | "Software"), to deal in the Software without restriction, including
 8 | without limitation the rights to use, copy, modify, merge, publish,
 9 | distribute, sublicense, and/or sell copies of the Software, and to
10 | permit persons to whom the Software is furnished to do so, subject to
11 | the following conditions:
12 | 
13 | The above copyright notice and this permission notice shall be
14 | included in all copies or substantial portions of the Software.
15 | 
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23 | 


--------------------------------------------------------------------------------
/examples/simple.py:
--------------------------------------------------------------------------------
 1 | import pywFM
 2 | import numpy as np
 3 | import pandas as pd
 4 | 
 5 | features = np.matrix([
 6 | #     Users  |     Movies     |    Movie Ratings   | Time | Last Movies Rated
 7 | #    A  B  C | TI  NH  SW  ST | TI   NH   SW   ST  |      | TI  NH  SW  ST
 8 |     [1, 0, 0,  1,  0,  0,  0,   0.3, 0.3, 0.3, 0,     13,   0,  0,  0,  0 ],
 9 |     [1, 0, 0,  0,  1,  0,  0,   0.3, 0.3, 0.3, 0,     14,   1,  0,  0,  0 ],
10 |     [1, 0, 0,  0,  0,  1,  0,   0.3, 0.3, 0.3, 0,     16,   0,  1,  0,  0 ],
11 |     [0, 1, 0,  0,  0,  1,  0,   0,   0,   0.5, 0.5,   5,    0,  0,  0,  0 ],
12 |     [0, 1, 0,  0,  0,  0,  1,   0,   0,   0.5, 0.5,   8,    0,  0,  1,  0 ],
13 |     [0, 0, 1,  1,  0,  0,  0,   0.5, 0,   0.5, 0,     9,    0,  0,  0,  0 ],
14 |     [0, 0, 1,  0,  0,  1,  0,   0.5, 0,   0.5, 0,     12,   1,  0,  0,  0 ]
15 | ])
16 | target = [5, 3, 1, 4, 5, 1, 5]
17 | 
18 | fm = pywFM.FM(task='regression', num_iter=5)
19 | 
20 | # split features and target for train/test
21 | # first 5 are train, last 2 are test
22 | model = fm.run(features[:5], target[:5], features[5:], target[5:])
23 | print(model.predictions)
24 | # you can also get the model weights
25 | print(model.weights)
26 | 


--------------------------------------------------------------------------------
/examples/Dockerfile:
--------------------------------------------------------------------------------
 1 | # Docker image to run pywFM examples
 2 | FROM jfloff/alpine-python:2.7
 3 | 
 4 | # install needed packages
 5 | RUN set -ex ;\
 6 |     echo "@community http://dl-cdn.alpinelinux.org/alpine/v$ALPINE_VERSION/community" >> /etc/apk/repositories ;\
 7 |     apk add --no-cache --update libgfortran \
 8 |                                 openblas-dev@community
 9 | 
10 | # Due to dependencies scipy and scikit-learn have to be in different commands
11 | # Also due to that, we can't have a requirements.txt file
12 | RUN set -ex ;\
13 |     pip install --upgrade --no-cache-dir numpy ;\
14 |     pip install --upgrade --no-cache-dir scipy \
15 |                                          scikit-learn \
16 |                                          pandas \
17 |                                          ;\
18 |     pip install --upgrade --no-cache-dir pywFM ;\
19 |     rm -rf ~/.cache/pip/
20 | 
21 | # clone repo and set environment variable to libfm PATH
22 | RUN set -ex ;\
23 |     git clone https://github.com/srendle/libfm /home/libfm ;\
24 |     cd /home/libfm/ ;\
25 |     # taking advantage of a bug to allow us to save model #ShameShame
26 |     git reset --hard 91f8504a15120ef6815d6e10cc7dee42eebaab0f ;\
27 |     make all
28 | ENV LIBFM_PATH /home/libfm/bin/
29 | 
30 | # since we will be "always" mounting the volume, we can set this up
31 | WORKDIR /home/pywfm
32 | 
33 | # start init script and bash right after
34 | CMD /bin/bash
35 | 


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | # Docker image to pywFM development
 2 | FROM jfloff/alpine-python:2.7
 3 | 
 4 | # install needed packages
 5 | RUN set -ex ;\
 6 |     echo "@community http://dl-cdn.alpinelinux.org/alpine/v$ALPINE_VERSION/community" >> /etc/apk/repositories ;\
 7 |     apk add --no-cache --update libgfortran \
 8 |                                 openblas-dev@community
 9 | 
10 | # Due to dependencies scipy and scikit-learn have to be in different commands
11 | # Also due to that, we can't have a requirements.txt file
12 | RUN set -ex ;\
13 |     pip install --upgrade --no-cache-dir numpy ;\
14 |     pip install --upgrade --no-cache-dir scikit-learn \
15 |                                          pandas \
16 |                                          scipy \
17 |                                          # for PyPi package management
18 |                                          # How to publish to PyPi:
19 |                                          # 1) bump setup.py for the new version <VERSION>
20 |                                          # 2) `python setup.py bdist_wheel`
21 |                                          # 3) `twine upload dist/pywFM-<VERSION>-py2-none-any.whl`
22 |                                          wheel \
23 |                                          twine \
24 |                                          ;\
25 |     # make sure nothing is on pip cache folder
26 |     rm -rf ~/.cache/pip/
27 | 
28 | # clone repo and set environment variable to libfm PATH
29 | RUN set -ex ;\
30 |     git clone https://github.com/srendle/libfm /home/libfm ;\
31 |     cd /home/libfm/ ;\
32 |     # taking advantage of a bug to allow us to save model #ShameShame
33 |     git reset --hard 91f8504a15120ef6815d6e10cc7dee42eebaab0f ;\
34 |     make all
35 | ENV LIBFM_PATH /home/libfm/bin/
36 | 
37 | # since we will be "always" mounting the volume, we can set this up
38 | WORKDIR /home/pywFM
39 | 
40 | # install package in development mode at the beginning
41 | CMD pip install -e . && /bin/bash
42 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | pywFM
  2 | ======
  3 | 
  4 | pywFM is a Python wrapper for Steffen Rendle's [libFM](http://libfm.org/). libFM is a **Factorization Machine** library:
  5 | 
  6 | > Factorization machines (FM) are a generic approach that allows to mimic most factorization models by feature engineering. This way, factorization machines combine the generality of feature engineering with the superiority of factorization models in estimating interactions between categorical variables of large domain. libFM is a software implementation for factorization machines that features stochastic gradient descent (SGD) and alternating least squares (ALS) optimization as well as Bayesian inference using Markov Chain Monte Carlo (MCMC).
  7 | 
  8 | For more information regarding Factorization machines and libFM, read Steffen Rendle's paper: [Factorization Machines with libFM, in ACM Trans. Intell. Syst. Technol., 3(3), May. 2012](http://www.csie.ntu.edu.tw/~b97053/paper/Factorization%20Machines%20with%20libFM.pdf)
  9 | 
 10 | **Don't forget to acknowledge `libFM` (i.e. cite the paper [Factorization Machines with libFM](http://libfm.org/#publications)) if you publish results produced with this software.**
 11 | 
 12 | 
 13 | ### Motivation
 14 | While using Python implementations of Factorization Machines, I felt that the current implementations ([pyFM](https://github.com/coreylynch/pyFM) and [fastFM](https://github.com/ibayer/fastFM/)) had many *[f](https://github.com/coreylynch/pyFM/issues/3)[l](https://github.com/ibayer/fastFM/issues/28)[a](https://github.com/ibayer/fastFM/blob/master/examples/warm_start_als.py#L45)w[s](https://github.com/ibayer/fastFM/issues/13)*. Then I thought, why re-invent the wheel? Why not use the original libFM?
 15 | 
 16 | Sure, it's not Python native yada yada ... But at least we have a bulletproof, battle-tested implementation that we can guide ourselves with.
 17 | 
 18 | ### Installing
 19 | First you have to clone and compile `libFM` repository and set an environment variable to the `libFM` bin folder:
 20 | ```shell
 21 | git clone https://github.com/srendle/libfm /home/libfm
 22 | cd /home/libfm/
 23 | # taking advantage of a bug to allow us to save model #ShameShame
 24 | git reset --hard 91f8504a15120ef6815d6e10cc7dee42eebaab0f
 25 | make all
 26 | export LIBFM_PATH=/home/libfm/bin/
 27 | ```
 28 | 
 29 | Make sure you are compiling the source from the `libfm` repository and at [this specific commit](https://github.com/srendle/libfm/commit/91f8504a15120ef6815d6e10cc7dee42eebaab0f), since `pywFM` needs the `save_model` option. ***Beware that the installers and source code in [libfm.org](http://libfm.org/) are both dated before this commit.*** I know this is extremely hacky, but since a fix was deployed libFM only allows the `save_model` option for SGD or ALS. I don't know why exactly, because it was working well before.
 30 | 
 31 | If you use *Jupyter* take a look at the following [issue](https://github.com/jfloff/pywFM/issues/18) for some extra notes on getting `libfm` to work.
 32 | 
 33 | Then, install `pywFM` using `pip`:
 34 | ```shell
 35 | pip install pywFM
 36 | ```
 37 | 
 38 | Binary installers for the latest released version are available at the [Python package index](http://pypi.python.org/pypi/pywFM/).
 39 | 
 40 | ### Dependencies
 41 | * numpy
 42 | * scipy
 43 | * sklearn
 44 | * pandas
 45 | 
 46 | ### Example
 47 | 
 48 | Very simple example taken from Steffen Rendle's paper: Factorization Machines with libFM.
 49 | 
 50 | ```py
 51 | import pywFM
 52 | import numpy as np
 53 | import pandas as pd
 54 | 
 55 | features = np.matrix([
 56 | #     Users  |     Movies     |    Movie Ratings   | Time | Last Movies Rated
 57 | #    A  B  C | TI  NH  SW  ST | TI   NH   SW   ST  |      | TI  NH  SW  ST
 58 |     [1, 0, 0,  1,  0,  0,  0,   0.3, 0.3, 0.3, 0,     13,   0,  0,  0,  0 ],
 59 |     [1, 0, 0,  0,  1,  0,  0,   0.3, 0.3, 0.3, 0,     14,   1,  0,  0,  0 ],
 60 |     [1, 0, 0,  0,  0,  1,  0,   0.3, 0.3, 0.3, 0,     16,   0,  1,  0,  0 ],
 61 |     [0, 1, 0,  0,  0,  1,  0,   0,   0,   0.5, 0.5,   5,    0,  0,  0,  0 ],
 62 |     [0, 1, 0,  0,  0,  0,  1,   0,   0,   0.5, 0.5,   8,    0,  0,  1,  0 ],
 63 |     [0, 0, 1,  1,  0,  0,  0,   0.5, 0,   0.5, 0,     9,    0,  0,  0,  0 ],
 64 |     [0, 0, 1,  0,  0,  1,  0,   0.5, 0,   0.5, 0,     12,   1,  0,  0,  0 ]
 65 | ])
 66 | target = [5, 3, 1, 4, 5, 1, 5]
 67 | 
 68 | fm = pywFM.FM(task='regression', num_iter=5)
 69 | 
 70 | # split features and target for train/test
 71 | # first 5 are train, last 2 are test
 72 | model = fm.run(features[:5], target[:5], features[5:], target[5:])
 73 | print(model.predictions)
 74 | # you can also get the model weights
 75 | print(model.weights)
 76 | ```
 77 | 
 78 | You can also use numpy's `array`, scipy's sparse matrices, and even pandas' `DataFrame` as features input.
 79 | 
 80 | ### Prediction on new data
 81 | 
 82 | Current approach is to send test data as `x_test`, `y_test` in `run` method call. libfm uses the test values to output some results regarding its predictions. They are not used when training the model. `y_test` can be set as dummy value and just collect the predictions with `model.predictions` (also disregard the prediction statistics since those will be wrong). For more info check libfm manual.
 83 | 
 84 | Running against a new dataset using something like a `predict` method is not supported yet. Pending feature request: https://github.com/jfloff/pywFM/issues/7
 85 | 
 86 | Feel free to PR that change ;)
 87 | 
 88 | ### Usage
 89 | 
 90 | *Don't forget to acknowledge `libFM` (i.e. cite the paper [Factorization Machines with libFM](http://libfm.org/#publications)) if you publish results produced with this software.*
 91 | 
 92 | ##### **`FM`**: Class that wraps `libFM` parameters. For more information read [libFM manual](http://www.libfm.org/libfm-1.42.manual.pdf)
 93 | 
 94 | ```
 95 | Parameters
 96 | ----------
 97 | task : string, MANDATORY
 98 |         regression: for regression
 99 |         classification: for binary classification
100 | num_iter: int, optional
101 |     Number of iterations
102 |     Defaults to 100
103 | init_stdev : double, optional
104 |     Standard deviation for initialization of 2-way factors
105 |     Defaults to 0.1
106 | k0 : bool, optional
107 |     Use bias.
108 |     Defaults to True
109 | k1 : bool, optional
110 |     Use 1-way interactions.
111 |     Defaults to True
112 | k2 : int, optional
113 |     Dimensionality of 2-way interactions.
114 |     Defaults to 8
115 | learning_method: string, optional
116 |     sgd: parameter learning with SGD
117 |     sgda: parameter learning with adaptive SGD
118 |     als: parameter learning with ALS
119 |     mcmc: parameter learning with MCMC
120 |     Defaults to 'mcmc'
121 | learn_rate: double, optional
122 |     Learning rate for SGD
123 |     Defaults to 0.1
124 | r0_regularization: int, optional
125 |     bias regularization for SGD and ALS
126 |     Defaults to 0
127 | r1_regularization: int, optional
128 |     1-way regularization for SGD and ALS
129 |     Defaults to 0
130 | r2_regularization: int, optional
131 |     2-way regularization for SGD and ALS
132 |     Defaults to 0
133 | rlog: bool, optional
134 |     Enable/disable rlog output
135 |     Defaults to True.
136 | verbose: bool, optional
137 |     How much infos to print
138 |     Defaults to False.
139 | seed: int, optional
140 |     seed used to reproduce the results
141 |     Defaults to None.
142 | silent: bool, optional
143 |     Completely silences all libFM output
144 |     Defaults to False.
145 | temp_path: string, optional
146 |     Sets path for libFM temporary files. Useful when dealing with large data.
147 |     Defaults to None (default NamedTemporaryFile behaviour)
148 | ```
149 | 
150 | ##### **`FM.run`**: run factorization machine model against train and test data
151 | ```
152 | 
153 | Parameters
154 | ----------
155 | x_train : {array-like, matrix}, shape = [n_train, n_features]
156 |     Training data
157 | y_train : numpy array of shape [n_train]
158 |     Target values
159 | x_test: {array-like, matrix}, shape = [n_test, n_features]
160 |     Testing data
161 | y_test : numpy array of shape [n_test]
162 |     Testing target values
163 | x_validation_set: optional, {array-like, matrix}, shape = [n_train, n_features]
164 |     Validation data (only for SGDA)
165 | y_validation_set: optional, numpy array of shape [n_train]
166 |     Validation target data (only for SGDA)
167 | 
168 | Return
169 | -------
170 | Returns `namedtuple` with the following properties:
171 | 
172 | predictions: array [n_samples of x_test]
173 |    Predicted target values per element in x_test.
174 | global_bias: float
175 |     If k0 is True, returns the model's global bias w0
176 | weights: array [n_features]
177 |     If k1 is True, returns the model's weights for each features Wj
178 | pairwise_interactions: numpy matrix [n_features x k2]
179 |     Matrix with pairwise interactions Vj,f
180 | rlog: pandas dataframe [nrow = num_iter]
181 |     `pandas` DataFrame with measurements about each iteration
182 | ```
183 | 
184 | ### Docker
185 | This repository includes `Dockerfile` for development and for running `pywFM`.
186 | 
187 | * Run `pywFM` examples ([Dockerfile](examples/Dockerfile)): if you are only interested in running the examples, you can use the pre-built image available in [Docker Hub](https://hub.docker.com/r/jfloff/pywfm):
188 | ```shell
189 | # to run examples/simple.py (the one in this README).
190 | docker run --rm -v "$(pwd)":/home/pywfm -w /home/pywfm -ti jfloff/pywfm python examples/simple.py
191 | ```
192 | 
193 | * Development of `pywFM` ([Dockerfile](Dockerfile)): useful if you want to make changes to the repo. `Dockerfile` defaults to bash.
194 | ```shell
195 | # to build image
196 | docker build --rm=true -t jfloff/pywfm-dev .
197 | # to run image
198 | docker run --rm -v "$(pwd)":/home/pywfm-dev -w /home/pywfm-dev -ti jfloff/pywfm-dev
199 | ```
200 | 
201 | ### Future work
202 | * Improve the `save_model` / `load_model` so we can have a more defined init-fit-predict cycle (perhaps we could inherit from [sklearn.BaseEstimator](http://scikit-learn.org/stable/modules/generated/sklearn.base.BaseEstimator.html))
203 | * Can we contribute to libFM repo so save_model is enabled for all learning methods (namely MCMC)?
204 | * Look up into shared library solution to improve I/O overhead
205 | 
206 | *I'm no factorization machine expert, so this library was just an effort to have `libFM` as fast as possible in Python. Feel free to suggest features, enhancements; to point out issues; and of course, to post PRs.*
207 | 
208 | 
209 | ### License
210 | 
211 | MIT (see LICENSE.txt file)
212 | 


--------------------------------------------------------------------------------
/pywFM/__init__.py:
--------------------------------------------------------------------------------
  1 | import subprocess
  2 | import os
  3 | import tempfile
  4 | 
  5 | 
  6 | class FM:
  7 |     """ Class that wraps `libFM` parameters. For more information read
  8 |     [libFM manual](http://www.libfm.org/libfm-1.42.manual.pdf)
  9 | 
 10 |     Parameters
 11 |     ----------
 12 |     task : string, MANDATORY
 13 |         regression: for regression
 14 |         classification: for binary classification
 15 | 
 16 |     num_iter: int, optional
 17 |         Number of iterations
 18 |         Defaults to 100
 19 |     init_stdev : double, optional
 20 |         Standard deviation for initialization of 2-way factors
 21 |         Defaults to 0.1
 22 |     k0 : bool, optional
 23 |         Use bias.
 24 |         Defaults to True
 25 |     k1 : bool, optional
 26 |         Use 1-way interactions.
 27 |         Defaults to True
 28 |     k2 : int, optional
 29 |         Dimensionality of 2-way interactions.
 30 |         Defaults to 8
 31 |     learning_method: string, optional
 32 |         sgd: parameter learning with SGD
 33 |         sgda: parameter learning with adpative SGD
 34 |         als: parameter learning with ALS
 35 |         mcmc: parameter learning with MCMC
 36 |         Defaults to 'mcmc'
 37 |     learn_rate: double, optional
 38 |         Learning rate for SGD
 39 |         Defaults to 0.1
 40 |     r0_regularization: int, optional
 41 |         bias regularization for SGD and ALS
 42 |         Defaults to 0
 43 |     r1_regularization: int, optional
 44 |         1-way regularization for SGD and ALS
 45 |         Defaults to 0
 46 |     r2_regularization: int, optional
 47 |         2-way regularization for SGD and ALS
 48 |         Defaults to 0
 49 |     rlog: bool, optional
 50 |         Enable/disable rlog output
 51 |         Defaults to True.
 52 |     verbose: bool, optional
 53 |         How much infos to print
 54 |         Defaults to False.
 55 |     seed: int, optional
 56 |         seed used to reproduce the results
 57 |         Defaults to None.
 58 |     silent: bool, optional
 59 |         Completly silences all libFM output
 60 |         Defaults to False.
 61 |     temp_path: string, optional
 62 |         Sets path for libFM temporary files. Usefull when dealing with large data.
 63 |         Defaults to None (default NamedTemporaryFile behaviour)
 64 |     """
 65 | 
 66 |     """
 67 |     ### unsused libFM flags
 68 |     cache_size: cache size for data storage (only applicable if data is in binary format), default=infty
 69 |         datafile is text so we don't need this parameter
 70 |     relation: BS - filenames for the relations, default=''
 71 |         not dealing with BS extensions since they are only used for binary files
 72 |     """
 73 | 
 74 |     def __init__(self,
 75 |                  task,
 76 |                  num_iter=100,
 77 |                  init_stdev=0.1,
 78 |                  k0=True,
 79 |                  k1=True,
 80 |                  k2=8,
 81 |                  learning_method='mcmc',
 82 |                  learn_rate=0.1,
 83 |                  r0_regularization=0,
 84 |                  r1_regularization=0,
 85 |                  r2_regularization=0,
 86 |                  rlog=True,
 87 |                  verbose=False,
 88 |                  seed=None,
 89 |                  silent=False,
 90 |                  temp_path=None):
 91 | 
 92 |         # gets first letter of either regression or classification
 93 |         self.__task = task[0]
 94 |         self.__num_iter = num_iter
 95 |         self.__init_stdev = init_stdev
 96 |         self.__dim = "%d,%d,%d" % (int(k0), int(k1), k2)
 97 |         self.__learning_method = learning_method
 98 |         self.__learn_rate = learn_rate
 99 |         self.__regularization = "%.5f,%.5f,%.5f" % (r0_regularization, r1_regularization, r2_regularization)
100 |         self.__rlog = rlog
101 |         self.__verbose = int(verbose)
102 |         self.__seed = int(seed) if seed else None
103 |         self.__silent = silent
104 |         self.__temp_path = temp_path
105 | 
106 |         # gets libfm path
107 |         self.__libfm_path = os.path.join(os.environ.get('LIBFM_PATH'), "")
108 |         if self.__libfm_path is None:
109 |             raise OSError("`LIBFM_PATH` is not set. Please install libFM and set the path variable "
110 |                           "(https://github.com/jfloff/pywFM#installing).")
111 | 
112 |         # #ShameShame
113 |         # Once upon a time, there was a bug in libFM that allowed any type of
114 |         # learning_method to save the model. I @jfloff built this package at
115 |         # that time, and did not find anything that showed me that MCMC couldn't
116 |         # use save_model flag. Nowadays only SGD and ALS can use this parameter.
117 |         # Hence, we need to reset the repo to this specific commit pre-fix, so
118 |         # we can use MCMC with save_model flag.
119 |         # Can we contribute to main libFM repo so this is possible again??
120 |         GITHASH = '91f8504a15120ef6815d6e10cc7dee42eebaab0f'
121 |         c_githash = subprocess.check_output(['git', '--git-dir', os.path.join(self.__libfm_path, "..", ".git"), 'rev-parse', 'HEAD']).strip()
122 |         if c_githash.decode("utf-8") != GITHASH:
123 |             raise OSError("libFM is not checked out to the correct commit."
124 |                           "(https://github.com/jfloff/pywFM#installing).")
125 | 
126 |     def run(self, x_train, y_train, x_test, y_test, x_validation_set=None, y_validation_set=None, meta=None):
127 |         """Run factorization machine model against train and test data
128 | 
129 |         Parameters
130 |         ----------
131 |         x_train : {array-like, matrix}, shape = [n_train, n_features]
132 |             Training data
133 |         y_train : numpy array of shape [n_train]
134 |             Target values
135 |         x_test: {array-like, matrix}, shape = [n_test, n_features]
136 |             Testing data
137 |         y_test : numpy array of shape [n_test]
138 |             Testing target values
139 |         x_validation_set: optional, {array-like, matrix}, shape = [n_train, n_features]
140 |             Validation data (only for SGDA)
141 |         y_validation_set: optional, numpy array of shape [n_train]
142 |             Validation target data (only for SGDA)
143 |         meta: optional, numpy array of shape [n_features]
144 |             Grouping input variables
145 | 
146 |         Return
147 |         -------
148 |         Returns `namedtuple` with the following properties:
149 | 
150 |         predictions: array [n_samples of x_test]
151 |            Predicted target values per element in x_test.
152 |         global_bias: float
153 |             If k0 is True, returns the model's global bias w0
154 |         weights: array [n_features]
155 |             If k1 is True, returns the model's weights for each features Wj
156 |         pairwise_interactions: numpy matrix [n_features x k2]
157 |             Matrix with pairwise interactions Vj,f
158 |         rlog: pandas dataframe [nrow = num_iter]
159 |             `pandas` DataFrame with measurements about each iteration
160 |         """
161 | 
162 |         from sklearn.datasets import dump_svmlight_file
163 | 
164 |         TMP_SUFFIX = '.pywfm'
165 |         train_fd = tempfile.NamedTemporaryFile(suffix=TMP_SUFFIX, dir=self.__temp_path)
166 |         test_fd = tempfile.NamedTemporaryFile(suffix=TMP_SUFFIX, dir=self.__temp_path)
167 |         out_fd = tempfile.NamedTemporaryFile(suffix=TMP_SUFFIX, dir=self.__temp_path)
168 |         model_fd = tempfile.NamedTemporaryFile(suffix=TMP_SUFFIX, dir=self.__temp_path)
169 | 
170 |         # converts train and test data to libSVM format
171 |         dump_svmlight_file(x_train, y_train, train_fd)
172 |         train_fd.seek(0)
173 |         dump_svmlight_file(x_test, y_test, test_fd)
174 |         test_fd.seek(0)
175 | 
176 |         # builds arguments array
177 |         args = [os.path.join(self.__libfm_path, "libFM"),
178 |                 '-task', "%s" % self.__task,
179 |                 '-train', "%s" % train_fd.name,
180 |                 '-test', "%s" % test_fd.name,
181 |                 '-dim', "'%s'" % self.__dim,
182 |                 '-init_stdev', "%g" % self.__init_stdev,
183 |                 '-iter', "%d" % self.__num_iter,
184 |                 '-method', "%s" % self.__learning_method,
185 |                 '-out', "%s" % out_fd.name,
186 |                 '-verbosity', "%d" % self.__verbose,
187 |                 '-save_model', "%s" % model_fd.name]
188 | 
189 |         # appends rlog if true
190 |         rlog_fd = None
191 |         if self.__rlog:
192 |             rlog_fd = tempfile.NamedTemporaryFile(suffix=TMP_SUFFIX, dir=self.__temp_path)
193 |             args.extend(['-rlog', "%s" % rlog_fd.name])
194 | 
195 |         # appends seed if given
196 |         if self.__seed:
197 |             args.extend(['-seed', "%d" % self.__seed])
198 | 
199 |         # appends arguments that only work for certain learning methods
200 |         if self.__learning_method in ['sgd', 'sgda']:
201 |             args.extend(['-learn_rate', "%.5f" % self.__learn_rate])
202 | 
203 |         if self.__learning_method in ['sgd', 'sgda', 'als']:
204 |             args.extend(['-regular', "'%s'" % self.__regularization])
205 | 
206 |         # adds validation if sgda
207 |         # if validation_set is none, libFM will throw error hence, I'm not doing any validation
208 |         validation_fd = None
209 |         if self.__learning_method == 'sgda' and (x_validation_set is not None and y_validation_set is not None):
210 |             validation_fd = tempfile.NamedTemporaryFile(suffix=TMP_SUFFIX, dir=self.__temp_path)
211 |             dump_svmlight_file(x_validation_set, y_validation_set, validation_fd.name)
212 |             args.extend(['-validation', "%s" % validation_fd.name])
213 | 
214 |         # if meta data is given
215 |         meta_fd = None
216 |         if meta is not None:
217 |             meta_fd = tempfile.NamedTemporaryFile(suffix=TMP_SUFFIX, dir=self.__temp_path, text=True)
218 |             # write group ids
219 |             for group_id in meta:
220 |                 meta_fd.write("%s\n" % group_id)
221 |             args.extend(['-meta', "%s" % meta_fd.name])
222 |             meta_fd.seek(0)
223 | 
224 |         # if silent redirects all output
225 |         stdout = None
226 |         if self.__silent:
227 |             stdout = open(os.devnull, 'wb')
228 | 
229 |         # call libfm with parsed arguments
230 |         # had unkown bug with "-dim" option on array. At the time was forced to
231 |         # concatenate string `args = ' '.join(args)` but looks like its working
232 |         # needs further tests
233 |         subprocess.call(args, shell=False, stdout=stdout)
234 | 
235 |         # reads output file
236 |         preds = [float(p) for p in out_fd.read().decode("utf-8").split('\n') if p]
237 | 
238 |         # "hidden" feature that allows users to save the model
239 |         # We use this to get the feature weights
240 |         # https://github.com/srendle/libfm/commit/19db0d1e36490290dadb530a56a5ae314b68da5d
241 |         import numpy as np
242 |         global_bias = None
243 |         weights = []
244 |         pairwise_interactions = []
245 |         # if 0 its global bias; if 1, weights; if 2, pairwise interactions
246 |         out_iter = 0
247 |         for line in model_fd.read().decode("utf-8").splitlines():
248 |             # checks which line is starting with #
249 |             if line.startswith('#'):
250 |                 if "#global bias W0" in line:
251 |                     out_iter = 0
252 |                 elif "#unary interactions Wj" in line:
253 |                     out_iter = 1
254 |                 elif "#pairwise interactions Vj,f" in line:
255 |                     out_iter = 2
256 |             else:
257 |                 # check context get in previous step and adds accordingly
258 |                 if out_iter == 0:
259 |                     global_bias = float(line)
260 |                 elif out_iter == 1:
261 |                     weights.append(float(line))
262 |                 elif out_iter == 2:
263 |                     try:
264 |                         pairwise_interactions.append([float(x) for x in line.split(' ')])
265 |                     except ValueError as e:
266 |                         pairwise_interactions.append(0.0) #Case: no pairwise interactions used
267 | 
268 |         pairwise_interactions = np.matrix(pairwise_interactions)
269 | 
270 |         # parses rlog into dataframe
271 |         if self.__rlog:
272 |             # parses rlog into
273 |             import pandas as pd
274 |             rlog_fd.seek(0)
275 |             print(os.stat(rlog_fd.name).st_size)
276 |             rlog = pd.read_csv(rlog_fd.name, sep='\t')
277 |             rlog_fd.close()
278 |         else:
279 |             rlog = None
280 | 
281 |         if self.__learning_method == 'sgda' and (x_validation_set is not None and y_validation_set is not None):
282 |             validation_fd.close()
283 |         if meta is not None:
284 |             meta_fd.close()
285 | 
286 |         # removes temporary output file after using
287 |         train_fd.close()
288 |         test_fd.close()
289 |         model_fd.close()
290 |         out_fd.close()
291 | 
292 |         # return as named collection for multiple output
293 |         import collections
294 |         fm = collections.namedtuple('model', ['predictions',
295 |                                               'global_bias',
296 |                                               'weights',
297 |                                               'pairwise_interactions',
298 |                                               'rlog'])
299 |         return fm(preds, global_bias, weights, pairwise_interactions, rlog)
300 | 


--------------------------------------------------------------------------------