├── pyfm
│   ├── __init__.py
│   └── pylibfm.py
├── setup.py
├── README.md
└── pyfm_fast.pyx

/pyfm/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup, find_packages, Extension
2 | from Cython.Distutils import build_ext
3 | import numpy
4 | 
5 | setup(
6 |     maintainer='Corey Lynch',
7 |     name='pyfm',
8 |     packages=find_packages(),
9 |     url='https://github.com/coreylynch/pyFM',
10 |     cmdclass={'build_ext': build_ext},
11 |     ext_modules=[Extension("pyfm_fast", ["pyfm_fast.pyx"],
12 |                            libraries=["m"],
13 |                            include_dirs=[numpy.get_include()])]
14 | )
15 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Factorization Machines in Python
2 | 
3 | This is a Python implementation of Factorization Machines [1]. It uses stochastic gradient descent with adaptive regularization as its learning method, tuning the regularization parameters automatically while the model parameters are trained. See [2] for details. From libfm.org: "Factorization machines (FM) are a generic approach that allows to mimic most factorization models by feature engineering. This way, factorization machines combine the generality of feature engineering with the superiority of factorization models in estimating interactions between categorical variables of large domain."
4 | 
5 | [1] Steffen Rendle (2012): Factorization Machines with libFM, in ACM Trans. Intell. Syst. Technol., 3(3), May.
6 | [2] Steffen Rendle: Learning recommender systems with adaptive regularization. WSDM 2012: 133-142
7 | 
8 | ## Installation
9 | ```
10 | pip install git+https://github.com/coreylynch/pyFM
11 | ```
12 | 
13 | ## Dependencies
14 | * numpy
15 | * sklearn
16 | 
17 | ## Training Representation
18 | The easiest way to use this class is to represent your training data as a list of standard Python dict objects, where each dict maps an instance's categorical and real-valued variable names to their values. Then use a [sklearn DictVectorizer](http://scikit-learn.org/dev/modules/generated/sklearn.feature_extraction.DictVectorizer.html#sklearn.feature_extraction.DictVectorizer) to convert the list to a design matrix with a one-of-K or "one-hot" encoding.
19 | 
20 | Here's a toy example:
21 | ```python
22 | from pyfm import pylibfm
23 | from sklearn.feature_extraction import DictVectorizer
24 | import numpy as np
25 | train = [
26 |     {"user": "1", "item": "5", "age": 19},
27 |     {"user": "2", "item": "43", "age": 33},
28 |     {"user": "3", "item": "20", "age": 55},
29 |     {"user": "4", "item": "10", "age": 20},
30 | ]
31 | v = DictVectorizer()
32 | X = v.fit_transform(train)
33 | print(X.toarray())
34 | [[ 19.   0.   0.   0.   1.   1.   0.   0.   0.]
35 |  [ 33.   0.   0.   1.   0.   0.   1.   0.   0.]
36 |  [ 55.   0.   1.   0.   0.   0.   0.   1.   0.]
37 |  [ 20.   1.   0.   0.   0.   0.   0.   0.   1.]]
38 | y = np.repeat(1.0, X.shape[0])
39 | fm = pylibfm.FM()
40 | fm.fit(X, y)
41 | fm.predict(v.transform({"user": "1", "item": "10", "age": 24}))
42 | ```
43 | 
44 | ## Getting Started
45 | Here's an example on some real movie ratings data.
46 | 
47 | First get the smallest MovieLens ratings dataset from http://www.grouplens.org/system/files/ml-100k.zip.
48 | ml-100k contains the files u.item (movie ids and titles) and u.data (user_id, movie_id, rating, timestamp), as well as the ua.base/ua.test train/test split used below.
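If you would rather fetch and unpack the archive from Python, here is a minimal sketch using only the standard library (it assumes the zip URL above is still reachable and that the archive unpacks into an `ml-100k/` directory in the current working directory):

```python
import urllib.request
import zipfile

# Download the MovieLens 100k archive and unpack it next to this script,
# producing the ml-100k/ directory used in the example below.
url = "http://www.grouplens.org/system/files/ml-100k.zip"
urllib.request.urlretrieve(url, "ml-100k.zip")
with zipfile.ZipFile("ml-100k.zip") as archive:
    archive.extractall(".")
```

With the data in place, the example below loads the ratings, trains an FM regressor, and evaluates it on the held-out split.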
49 | ```python
50 | import numpy as np
51 | from sklearn.feature_extraction import DictVectorizer
52 | from pyfm import pylibfm
53 | 
54 | # Read in data
55 | def loadData(filename, path="ml-100k/"):
56 |     data = []
57 |     y = []
58 |     users = set()
59 |     items = set()
60 |     with open(path + filename) as f:
61 |         for line in f:
62 |             (user, movieid, rating, ts) = line.split('\t')
63 |             data.append({"user_id": str(user), "movie_id": str(movieid)})
64 |             y.append(float(rating))
65 |             users.add(user)
66 |             items.add(movieid)
67 | 
68 |     return (data, np.array(y), users, items)
69 | 
70 | (train_data, y_train, train_users, train_items) = loadData("ua.base")
71 | (test_data, y_test, test_users, test_items) = loadData("ua.test")
72 | v = DictVectorizer()
73 | X_train = v.fit_transform(train_data)
74 | X_test = v.transform(test_data)
75 | 
76 | # Build and train a Factorization Machine
77 | fm = pylibfm.FM(num_factors=10, num_iter=100, verbose=True, task="regression", initial_learning_rate=0.001, learning_rate_schedule="optimal")
78 | 
79 | fm.fit(X_train, y_train)
80 | Creating validation dataset of 0.01 of training for adaptive regularization
81 | -- Epoch 1
82 | Training MSE: 0.59477
83 | -- Epoch 2
84 | Training MSE: 0.51841
85 | -- Epoch 3
86 | Training MSE: 0.49125
87 | -- Epoch 4
88 | Training MSE: 0.47589
89 | -- Epoch 5
90 | Training MSE: 0.46571
91 | -- Epoch 6
92 | Training MSE: 0.45852
93 | -- Epoch 7
94 | Training MSE: 0.45322
95 | -- Epoch 8
96 | Training MSE: 0.44908
97 | -- Epoch 9
98 | Training MSE: 0.44557
99 | -- Epoch 10
100 | Training MSE: 0.44278
101 | ...
102 | -- Epoch 98
103 | Training MSE: 0.41863
104 | -- Epoch 99
105 | Training MSE: 0.41865
106 | -- Epoch 100
107 | Training MSE: 0.41874
108 | 
109 | # Evaluate
110 | preds = fm.predict(X_test)
111 | from sklearn.metrics import mean_squared_error
112 | print("FM MSE: %.4f" % mean_squared_error(y_test, preds))
113 | FM MSE: 0.9227
114 | 
115 | ```
116 | ## Classification example
117 | ```python
118 | import numpy as np
119 | from sklearn.feature_extraction import DictVectorizer
120 | from sklearn.model_selection import train_test_split
121 | from pyfm import pylibfm
122 | 
123 | from sklearn.datasets import make_classification
124 | 
125 | X, y = make_classification(n_samples=1000, n_features=100, n_clusters_per_class=1)
126 | data = [{v: k for k, v in dict(zip(i, range(len(i)))).items()} for i in X]  # one dict per row, mapping feature index -> value
127 | 
128 | X_train, X_test, y_train, y_test = train_test_split(data, y, test_size=0.1, random_state=42)
129 | 
130 | v = DictVectorizer()
131 | X_train = v.fit_transform(X_train)
132 | X_test = v.transform(X_test)
133 | 
134 | fm = pylibfm.FM(num_factors=50, num_iter=10, verbose=True, task="classification", initial_learning_rate=0.0001, learning_rate_schedule="optimal")
135 | 
136 | fm.fit(X_train, y_train)
137 | 
138 | Creating validation dataset of 0.01 of training for adaptive regularization
139 | -- Epoch 1
140 | Training log loss: 1.91885
141 | -- Epoch 2
142 | Training log loss: 1.62022
143 | -- Epoch 3
144 | Training log loss: 1.36736
145 | -- Epoch 4
146 | Training log loss: 1.15562
147 | -- Epoch 5
148 | Training log loss: 0.97961
149 | -- Epoch 6
150 | Training log loss: 0.83356
151 | -- Epoch 7
152 | Training log loss: 0.71208
153 | -- Epoch 8
154 | Training log loss: 0.61108
155 | -- Epoch 9
156 | Training log loss: 0.52705
157 | -- Epoch 10
158 | Training log loss: 0.45685
159 | 
160 | # Evaluate
161 | from sklearn.metrics import log_loss
162 | print("Validation log loss: %.4f" % log_loss(y_test, fm.predict(X_test)))
163 | Validation log loss: 1.5025
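
# Optional: for task="classification", fm.predict() returns sigmoid probabilities,
# so thresholding them at 0.5 gives hard 0/1 labels that can be scored with
# ordinary accuracy.
from sklearn.metrics import accuracy_score
print("Validation accuracy: %.4f" % accuracy_score(y_test, (fm.predict(X_test) > 0.5).astype(int)))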
164 | 165 | ``` 166 | -------------------------------------------------------------------------------- /pyfm/pylibfm.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.model_selection import train_test_split 3 | import random 4 | from pyfm_fast import FM_fast, CSRDataset 5 | 6 | LEARNING_RATE_TYPES = {"optimal": 0, "invscaling": 1, "constant": 2} 7 | TASKS = {"regression": 0, "classification" : 1} 8 | 9 | class FM: 10 | """Factorization machine fitted by minimizing a regularized empirical loss with adaptive SGD. 11 | 12 | Parameters 13 | ---------- 14 | 15 | num_factors : int 16 | The dimensionality of the factorized 2-way interactions 17 | num_iter : int 18 | Number of iterations 19 | k0 : bool 20 | Use bias. Defaults to true. 21 | k1 : bool 22 | Use 1-way interactions (learn feature weights). 23 | Defaults to true. 24 | init_stdev : double, optional 25 | Standard deviation for initialization of 2-way factors. 26 | Defaults to 0.01. 27 | validation_size : double, optional 28 | Proportion of the training set to use for validation. 29 | Defaults to 0.01. 30 | learning_rate_schedule : string, optional 31 | The learning rate: 32 | constant: eta = eta0 33 | optimal: eta = 1.0/(t+t0) [default] 34 | invscaling: eta = eta0 / pow(t, power_t) 35 | initial_learning_rate : double 36 | Defaults to 0.01 37 | power_t : double 38 | The exponent for inverse scaling learning rate [default 0.5]. 39 | t0 : double 40 | Constant in the denominator for optimal learning rate schedule. 41 | Defaults to 0.001. 42 | task : string 43 | regression: Labels are real values. 44 | classification: Labels are either positive or negative. 45 | verbose : bool 46 | Whether or not to print current iteration, training error 47 | shuffle_training: bool 48 | Whether or not to shuffle training dataset before learning 49 | seed : int 50 | The seed of the pseudo random number generator 51 | """ 52 | def __init__(self, 53 | num_factors=10, 54 | num_iter=1, 55 | k0=True, 56 | k1=True, 57 | init_stdev=0.1, 58 | validation_size=0.01, 59 | learning_rate_schedule="optimal", 60 | initial_learning_rate=0.01, 61 | power_t=0.5, 62 | t0=0.001, 63 | task='classification', 64 | verbose=True, 65 | shuffle_training=True, 66 | seed = 28): 67 | 68 | self.num_factors = num_factors 69 | self.num_iter = num_iter 70 | self.sum = np.zeros(self.num_factors) 71 | self.sum_sqr = np.zeros(self.num_factors) 72 | self.k0 = k0 73 | self.k1 = k1 74 | self.init_stdev = init_stdev 75 | self.validation_size = validation_size 76 | self.task = task 77 | self.shuffle_training = shuffle_training 78 | self.seed = seed 79 | 80 | # Learning rate Parameters 81 | self.learning_rate_schedule = learning_rate_schedule 82 | self.eta0 = initial_learning_rate 83 | self.power_t = power_t 84 | self.t = 1.0 85 | self.learning_rate = initial_learning_rate 86 | self.t0 = t0 87 | 88 | # Regularization Parameters (start with no regularization) 89 | self.reg_0 = 0.0 90 | self.reg_w = 0.0 91 | self.reg_v = np.repeat(0.0, num_factors) 92 | 93 | # local parameters in the lambda_update step 94 | self.lambda_w_grad = 0.0 95 | self.lambda_v_grad = 0.0 96 | self.sum_f = 0.0 97 | self.sum_f_dash_f = 0.0 98 | self.verbose = verbose 99 | 100 | def _validate_params(self): 101 | """Validate input params. 
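
        A ValueError is raised when ``shuffle_training`` is not a bool, when
        ``num_iter`` is not positive, or when ``eta0`` is not positive under
        the "constant" or "invscaling" learning rate schedules.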
""" 102 | if not isinstance(self.shuffle_training, bool): 103 | raise ValueError("shuffle must be either True or False") 104 | if self.num_iter <= 0: 105 | raise ValueError("n_iter must be > zero") 106 | if self.learning_rate_schedule in ("constant", "invscaling"): 107 | if self.eta0 <= 0.0: 108 | raise ValueError("eta0 must be > 0") 109 | 110 | def _get_learning_rate_type(self, learning_rate): 111 | """Map learning rate string to int for cython""" 112 | try: 113 | return LEARNING_RATE_TYPES[learning_rate] 114 | except KeyError: 115 | raise ValueError("learning rate %s " 116 | "is not supported. " % learning_rate) 117 | 118 | def _get_task(self, task): 119 | """Map task string to int for cython""" 120 | try: 121 | return TASKS[task] 122 | except KeyError: 123 | raise ValueError("task %s " 124 | "is not supported. " % task) 125 | 126 | def _bool_to_int(self, bool_arg): 127 | """Map bool to int for cython""" 128 | if bool_arg == True: 129 | return 1 130 | else: 131 | return 0 132 | 133 | def _prepare_y(self,y): 134 | """Maps labels to [-1, 1] space""" 135 | y_i = np.ones(y.shape, dtype=np.float64, order="C") 136 | y_i[y != 1] = -1.0 137 | return y_i 138 | 139 | def fit(self, X, y): 140 | """Fit factorization machine using Stochastic Gradient Descent with Adaptive Regularization. 141 | 142 | Parameters 143 | ---------- 144 | X : {array-like, sparse matrix}, shape = [n_samples, n_features] 145 | Training data 146 | 147 | y : numpy array of shape [n_samples] 148 | Target values 149 | 150 | Returns 151 | ------- 152 | self : returns an instance of self. 153 | """ 154 | if type(y) != np.ndarray: 155 | y = np.array(y) 156 | 157 | self._validate_params() 158 | 159 | if self.task == "classification": 160 | y = self._prepare_y(y) 161 | 162 | self.max_target = max(y) 163 | self.min_target = min(y) 164 | 165 | # convert member variables to ints for use in cython 166 | k0 = self._bool_to_int(self.k0) 167 | k1 = self._bool_to_int(self.k1) 168 | shuffle_training = self._bool_to_int(self.shuffle_training) 169 | verbose = self._bool_to_int(self.verbose) 170 | learning_rate_schedule = self._get_learning_rate_type(self.learning_rate_schedule) 171 | task = self._get_task(self.task) 172 | 173 | # use sklearn to create a validation dataset for lambda updates 174 | if self.verbose == True: 175 | print("Creating validation dataset of %.2f of training for adaptive regularization" % self.validation_size) 176 | X_train, validation, train_labels, validation_labels = train_test_split( 177 | X, y, test_size=self.validation_size) 178 | self.num_attribute = X_train.shape[1] 179 | 180 | # Convert datasets to sklearn sequential datasets for fast traversal 181 | X_train_dataset = _make_dataset(X_train, train_labels) 182 | validation_dataset = _make_dataset(validation, validation_labels) 183 | 184 | # Set up params 185 | self.w0 = 0.0 186 | self.w = np.zeros(self.num_attribute) 187 | np.random.seed(seed=self.seed) 188 | self.v = np.random.normal(scale=self.init_stdev,size=(self.num_factors, self.num_attribute)) 189 | 190 | self.fm_fast = FM_fast(self.w, 191 | self.v, 192 | self.num_factors, 193 | self.num_attribute, 194 | self.num_iter, 195 | k0, 196 | k1, 197 | self.w0, 198 | self.t, 199 | self.t0, 200 | self.power_t, 201 | self.min_target, 202 | self.max_target, 203 | self.eta0, 204 | learning_rate_schedule, 205 | shuffle_training, 206 | task, 207 | self.seed, 208 | verbose) 209 | 210 | return self.fm_fast.fit(X_train_dataset, validation_dataset) 211 | 212 | # report epoch information 213 | if self.verbose == True: 214 | 
print("-- Epoch %d" % (epoch + 1)) 215 | print("Train MSE: %.5f" % (self.sumloss / self.count)) 216 | 217 | def predict(self, X): 218 | """Predict using the factorization machine 219 | 220 | Parameters 221 | ---------- 222 | X : sparse matrix, shape = [n_samples, n_features] 223 | or 224 | X : single instance [1, n_features] 225 | 226 | Returns 227 | ------- 228 | float if X is one instance 229 | array, shape = [n_samples] if X is sparse matrix 230 | Predicted target values per element in X. 231 | """ 232 | sparse_X = _make_dataset(X, np.ones(X.shape[0])) 233 | 234 | return self.fm_fast._predict(sparse_X) 235 | 236 | def _make_dataset(X, y_i): 237 | """Create ``Dataset`` abstraction for sparse and dense inputs.""" 238 | sample_weight = np.ones(X.shape[0], dtype=np.float64, order='C') # ignore sample weight for the moment 239 | dataset = CSRDataset(X.data, X.indptr, X.indices, y_i, sample_weight) 240 | return dataset 241 | 242 | -------------------------------------------------------------------------------- /pyfm_fast.pyx: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | # cython: cdivision=True 3 | # cython: boundscheck=False 4 | # cython: wraparound=False 5 | # 6 | # Author: Corey Lynch 7 | # 8 | # License: BSD Style. 9 | 10 | import numpy as np 11 | import sys 12 | from time import time 13 | 14 | from libc.math cimport exp, log, pow 15 | cimport numpy as np 16 | cimport cython 17 | 18 | np.import_array() 19 | 20 | ctypedef np.float64_t DOUBLE 21 | ctypedef np.int32_t INTEGER 22 | 23 | # MODEL CONSTANTS 24 | DEF REGRESSION = 0 25 | DEF CLASSIFICATION = 1 26 | DEF OPTIMAL = 0 27 | DEF INVERSE_SCALING = 1 28 | 29 | cdef class FM_fast(object): 30 | """Factorization Machine fitted by minimizing a regularized empirical loss with adaptive SGD. 
31 | 32 | Parameters 33 | ---------- 34 | w : np.ndarray[DOUBLE, ndim=1, mode='c'] 35 | v : ndarray[DOUBLE, ndim=2, mode='c'] 36 | num_factors : int 37 | num_attributes : int 38 | n_iter : int 39 | k0 : int 40 | k1 : int 41 | w0 : double 42 | t : double 43 | t0 : double 44 | l : double 45 | power_t : double 46 | min_target : double 47 | max_target : double 48 | eta0 : double 49 | learning_rate_schedule : int 50 | shuffle_training : int 51 | task : int 52 | seed : int 53 | verbose : int 54 | """ 55 | 56 | cdef public double w0 57 | cdef public np.ndarray w 58 | cdef public np.ndarray v 59 | cdef public int num_factors 60 | cdef public int num_attributes 61 | cdef public int n_iter 62 | cdef public int k0 63 | cdef public int k1 64 | 65 | cdef public DOUBLE t 66 | cdef public DOUBLE t0 67 | cdef public DOUBLE l 68 | cdef public DOUBLE power_t 69 | cdef public DOUBLE min_target 70 | cdef public DOUBLE max_target 71 | cdef public np.ndarray sum 72 | cdef public np.ndarray sum_sqr 73 | cdef public int task 74 | cdef public int learning_rate_schedule 75 | cdef public double learning_rate 76 | cdef public int shuffle_training 77 | cdef public int seed 78 | cdef public int verbose 79 | 80 | cdef public DOUBLE reg_0 81 | cdef public DOUBLE reg_w 82 | cdef public np.ndarray reg_v 83 | 84 | cdef public np.ndarray grad_w 85 | cdef public np.ndarray grad_v 86 | 87 | cdef public DOUBLE sumloss 88 | cdef public int count 89 | 90 | def __init__(self, 91 | np.ndarray[DOUBLE, ndim=1, mode='c'] w, 92 | np.ndarray[DOUBLE, ndim=2, mode='c'] v, 93 | int num_factors, 94 | int num_attributes, 95 | int n_iter, 96 | int k0, 97 | int k1, 98 | double w0, 99 | double t, 100 | double t0, 101 | double power_t, 102 | double min_target, 103 | double max_target, 104 | double eta0, 105 | int learning_rate_schedule, 106 | int shuffle_training, 107 | int task, 108 | int seed, 109 | int verbose): 110 | 111 | self.w0 = w0 112 | self.w = w 113 | self.v = v 114 | self.num_factors = num_factors 115 | self.num_attributes = num_attributes 116 | self.n_iter = n_iter 117 | self.k0 = k0 118 | self.k1 = k1 119 | self.t = 1 120 | self.t0 = t0 121 | self.learning_rate = eta0 122 | self.power_t = power_t 123 | self.min_target = min_target 124 | self.max_target = max_target 125 | self.sum = np.zeros(self.num_factors) 126 | self.sum_sqr = np.zeros(self.num_factors) 127 | self.task = task 128 | self.learning_rate_schedule = learning_rate_schedule 129 | self.shuffle_training = shuffle_training 130 | self.seed = seed 131 | self.verbose = verbose 132 | 133 | self.reg_0 = 0.0 134 | self.reg_w = 0.0 135 | self.reg_v = np.zeros(self.num_factors) 136 | 137 | self.sumloss = 0.0 138 | self.count = 0 139 | 140 | self.grad_w = np.zeros(self.num_attributes) 141 | self.grad_v = np.zeros((self.num_factors, self.num_attributes)) 142 | 143 | cdef _predict_instance(self, DOUBLE * x_data_ptr, 144 | INTEGER * x_ind_ptr, 145 | int xnnz): 146 | 147 | # Helper variables 148 | cdef DOUBLE result = 0.0 149 | cdef int feature 150 | cdef unsigned int i = 0 151 | cdef unsigned int f = 0 152 | cdef DOUBLE d 153 | 154 | # map instance variables to local variables 155 | cdef DOUBLE w0 = self.w0 156 | cdef np.ndarray[DOUBLE, ndim=1, mode='c'] w = self.w 157 | cdef np.ndarray[DOUBLE, ndim=2, mode='c'] v = self.v 158 | cdef np.ndarray[DOUBLE, ndim=1, mode='c'] sum_ = np.zeros(self.num_factors) 159 | cdef np.ndarray[DOUBLE, ndim=1, mode='c'] sum_sqr_ = np.zeros(self.num_factors) 160 | 161 | if self.k0 > 0: 162 | result += w0 163 | if self.k1 > 0: 164 | for i in range(xnnz): 
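                # first-order (linear) term: accumulate w[feature] * x[feature]
                # over the nonzero entries of this instance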
165 | feature = x_ind_ptr[i] 166 | result += w[feature] * x_data_ptr[i] 167 | for f in range(self.num_factors): 168 | sum_[f] = 0.0 169 | sum_sqr_[f] = 0.0 170 | for i in range(xnnz): 171 | feature = x_ind_ptr[i] 172 | d = v[f, feature] * x_data_ptr[i] 173 | sum_[f] += d 174 | sum_sqr_[f] += d*d 175 | result += 0.5 * (sum_[f] * sum_[f] - sum_sqr_[f]) 176 | 177 | # pass sum to sgd_theta 178 | self.sum = sum_ 179 | return result 180 | 181 | cdef _predict_scaled(self, DOUBLE * x_data_ptr, 182 | INTEGER * x_ind_ptr, 183 | int xnnz): 184 | cdef DOUBLE result = 0.0 185 | cdef unsigned int i = 0 186 | cdef unsigned int f = 0 187 | cdef DOUBLE d 188 | cdef DOUBLE w_dash = 0.0 189 | cdef DOUBLE v_dash = 0.0 190 | 191 | # map instance variables to local variables 192 | cdef DOUBLE w0 = self.w0 193 | cdef np.ndarray[DOUBLE, ndim=1, mode='c'] w = self.w 194 | cdef np.ndarray[DOUBLE, ndim=2, mode='c'] v = self.v 195 | cdef np.ndarray[DOUBLE, ndim=1, mode='c'] grad_w = self.grad_w 196 | cdef np.ndarray[DOUBLE, ndim=2, mode='c'] grad_v = self.grad_v 197 | cdef np.ndarray[DOUBLE, ndim=1, mode='c'] sum_ = np.zeros(self.num_factors) 198 | cdef np.ndarray[DOUBLE, ndim=1, mode='c'] sum_sqr_ = np.zeros(self.num_factors) 199 | cdef DOUBLE learning_rate = self.learning_rate 200 | cdef DOUBLE reg_w = self.reg_w 201 | cdef np.ndarray[DOUBLE, ndim=1, mode='c'] reg_v = self.reg_v 202 | 203 | if self.k0 > 0: 204 | result += w0 205 | if self.k1 > 0: 206 | for i in xrange(xnnz): 207 | feature = x_ind_ptr[i] 208 | assert(feature < self.num_attributes) 209 | w_dash = w[feature] - learning_rate * (grad_w[feature] + 2 * reg_w * w[feature]) 210 | result += w_dash * x_data_ptr[i] 211 | for f in xrange(self.num_factors): 212 | sum_[f] = 0.0 213 | sum_sqr_[f] = 0.0 214 | for i in xrange(xnnz): 215 | feature = x_ind_ptr[i] 216 | v_dash = v[f,feature] - learning_rate * (grad_v[f,feature] + 2 * reg_v[f] * v[f,feature]) 217 | d = v_dash * x_data_ptr[i] 218 | sum_[f] += d 219 | sum_sqr_[f] += d*d 220 | result += 0.5 * (sum_[f]*sum_[f] - sum_sqr_[f]) 221 | return result 222 | 223 | def _predict(self, CSRDataset dataset): 224 | 225 | # Helper access variables 226 | cdef unsigned int i = 0 227 | cdef Py_ssize_t n_samples = dataset.n_samples 228 | cdef DOUBLE * x_data_ptr = NULL 229 | cdef INTEGER * x_ind_ptr = NULL 230 | cdef int xnnz 231 | cdef DOUBLE sample_weight = 1.0 232 | cdef DOUBLE y_placeholder 233 | cdef DOUBLE p = 0.0 234 | 235 | cdef np.ndarray[DOUBLE, ndim=1, mode='c'] return_preds = np.zeros(n_samples) 236 | 237 | for i in range(n_samples): 238 | dataset.next(& x_data_ptr, & x_ind_ptr, & xnnz, & y_placeholder, 239 | & sample_weight) 240 | p = self._predict_instance(x_data_ptr, x_ind_ptr, xnnz) 241 | if self.task == REGRESSION: 242 | p = min(self.max_target, p) 243 | p = max(self.min_target, p) 244 | else: 245 | p = (1.0 / (1.0 + exp(-p))) 246 | return_preds[i] = p 247 | return return_preds 248 | 249 | cdef _sgd_theta_step(self, DOUBLE * x_data_ptr, 250 | INTEGER * x_ind_ptr, 251 | int xnnz, 252 | DOUBLE y): 253 | 254 | cdef DOUBLE mult = 0.0 255 | cdef DOUBLE p 256 | cdef int feature 257 | cdef unsigned int i = 0 258 | cdef unsigned int f = 0 259 | cdef DOUBLE d 260 | cdef DOUBLE grad_0 261 | 262 | cdef DOUBLE w0 = self.w0 263 | cdef np.ndarray[DOUBLE, ndim=1, mode='c'] w = self.w 264 | cdef np.ndarray[DOUBLE, ndim=2, mode='c'] v = self.v 265 | cdef np.ndarray[DOUBLE, ndim=1, mode='c'] grad_w = self.grad_w 266 | cdef np.ndarray[DOUBLE, ndim=2, mode='c'] grad_v = self.grad_v 267 | cdef DOUBLE learning_rate = 
self.learning_rate 268 | cdef DOUBLE reg_0 = self.reg_0 269 | cdef DOUBLE reg_w = self.reg_w 270 | cdef np.ndarray[DOUBLE, ndim=1, mode='c'] reg_v = self.reg_v 271 | 272 | p = self._predict_instance(x_data_ptr, x_ind_ptr, xnnz) 273 | 274 | if self.task == REGRESSION: 275 | p = min(self.max_target, p) 276 | p = max(self.min_target, p) 277 | mult = 2 * (p - y); 278 | else: 279 | mult = y * ( (1.0 / (1.0+exp(-y*p))) - 1.0) 280 | 281 | # Set learning schedule 282 | if self.learning_rate_schedule == OPTIMAL: 283 | self.learning_rate = 1.0 / (self.t + self.t0) 284 | 285 | elif self.learning_rate_schedule == INVERSE_SCALING: 286 | self.learning_rate = self.learning_rate / pow(self.t, self.power_t) 287 | 288 | if self.verbose > 0: 289 | self.sumloss += _squared_loss(p,y) if self.task == REGRESSION else _log_loss(p,y) 290 | 291 | # Update global bias 292 | if self.k0 > 0: 293 | grad_0 = mult 294 | w0 -= learning_rate * (grad_0 + 2 * reg_0 * w0) 295 | 296 | # Update feature biases 297 | if self.k1 > 0: 298 | for i in range(xnnz): 299 | feature = x_ind_ptr[i] 300 | grad_w[feature] = mult * x_data_ptr[i] 301 | w[feature] -= learning_rate * (grad_w[feature] 302 | + 2 * reg_w * w[feature]) 303 | 304 | # Update feature factor vectors 305 | for f in range(self.num_factors): 306 | for i in range(xnnz): 307 | feature = x_ind_ptr[i] 308 | grad_v[f,feature] = mult * (x_data_ptr[i] * (self.sum[f] - v[f,feature] * x_data_ptr[i])) 309 | v[f,feature] -= learning_rate * (grad_v[f,feature] + 2 * reg_v[f] * v[f,feature]) 310 | 311 | # Pass updated vars to other functions 312 | self.learning_rate = learning_rate 313 | self.w0 = w0 314 | self.w = w 315 | self.v = v 316 | self.grad_w = grad_w 317 | self.grad_v = grad_v 318 | 319 | self.t += 1 320 | self.count += 1 321 | 322 | cdef _sgd_lambda_step(self, DOUBLE * validation_x_data_ptr, 323 | INTEGER * validation_x_ind_ptr, 324 | int validation_xnnz, 325 | DOUBLE validation_y): 326 | 327 | cdef DOUBLE sum_f 328 | cdef DOUBLE sum_f_dash 329 | cdef DOUBLE sum_f_dash_f 330 | cdef DOUBLE p 331 | cdef DOUBLE grad_loss 332 | cdef int feature 333 | cdef unsigned int i 334 | cdef unsigned int f 335 | cdef DOUBLE lambda_w_grad = 0.0 336 | cdef DOUBLE lambda_v_grad = 0.0 337 | cdef DOUBLE v_dash = 0.0 338 | 339 | cdef np.ndarray[DOUBLE, ndim=1, mode='c'] w = self.w 340 | cdef np.ndarray[DOUBLE, ndim=2, mode='c'] v = self.v 341 | cdef np.ndarray[DOUBLE, ndim=1, mode='c'] grad_w = self.grad_w 342 | cdef np.ndarray[DOUBLE, ndim=2, mode='c'] grad_v = self.grad_v 343 | cdef DOUBLE learning_rate = self.learning_rate 344 | cdef DOUBLE reg_0 = self.reg_0 345 | cdef DOUBLE reg_w = self.reg_w 346 | cdef np.ndarray[DOUBLE, ndim=1, mode='c'] reg_v = self.reg_v 347 | 348 | p = self._predict_scaled(validation_x_data_ptr, validation_x_ind_ptr, validation_xnnz) 349 | if self.task == REGRESSION: 350 | p = min(self.max_target, p) 351 | p = max(self.min_target, p) 352 | grad_loss = 2 * (p - validation_y) 353 | else: 354 | grad_loss = validation_y * ( (1.0 / (1.0 + exp(-validation_y*p))) - 1.0) 355 | 356 | if self.k1 > 0: 357 | lambda_w_grad = 0.0 358 | for i in xrange(validation_xnnz): 359 | feature = validation_x_ind_ptr[i] 360 | lambda_w_grad += validation_x_data_ptr[i] * w[feature] 361 | lambda_w_grad = -2 * learning_rate * lambda_w_grad 362 | reg_w -= learning_rate * grad_loss * lambda_w_grad 363 | reg_w = max(0.0, reg_w) 364 | 365 | for f in xrange(self.num_factors): 366 | sum_f = 0.0 367 | sum_f_dash = 0.0 368 | sum_f_dash_f = 0.0 369 | 370 | for i in xrange(validation_xnnz): 371 | feature = 
validation_x_ind_ptr[i] 372 | v_dash = v[f,feature] - learning_rate * (grad_v[f,feature] + 2 * reg_v[f] * v[f,feature]) 373 | sum_f_dash += v_dash * validation_x_data_ptr[i] 374 | sum_f += v[f,feature] * validation_x_data_ptr[i] 375 | sum_f_dash_f += v_dash * validation_x_data_ptr[i] * v[f,feature] * validation_x_data_ptr[i] 376 | lambda_v_grad = -2 * learning_rate * (sum_f_dash * sum_f - sum_f_dash_f) 377 | reg_v[f] -= learning_rate * grad_loss * lambda_v_grad 378 | reg_v[f] = max(0.0, reg_v[f]) 379 | 380 | # Pass updated vars to other functions 381 | self.reg_w = reg_w 382 | self.reg_v = reg_v 383 | 384 | def fit(self, CSRDataset dataset, CSRDataset validation_dataset): 385 | 386 | # get the data information into easy vars 387 | cdef Py_ssize_t n_samples = dataset.n_samples 388 | cdef Py_ssize_t n_validation_samples = validation_dataset.n_samples 389 | 390 | cdef DOUBLE * x_data_ptr = NULL 391 | cdef INTEGER * x_ind_ptr = NULL 392 | 393 | cdef DOUBLE * validation_x_data_ptr = NULL 394 | cdef INTEGER * validation_x_ind_ptr = NULL 395 | 396 | # helper variables 397 | cdef int xnnz 398 | cdef DOUBLE y = 0.0 399 | cdef DOUBLE validation_y = 0.0 400 | cdef int validation_xnnz 401 | cdef unsigned int count = 0 402 | cdef unsigned int epoch = 0 403 | cdef unsigned int i = 0 404 | 405 | cdef DOUBLE sample_weight = 1.0 406 | cdef DOUBLE validation_sample_weight = 1.0 407 | 408 | for epoch in range(self.n_iter): 409 | 410 | if self.verbose > 0: 411 | print("-- Epoch %d" % (epoch + 1)) 412 | self.count = 0 413 | self.sumloss = 0 414 | if self.shuffle_training: 415 | dataset.shuffle(self.seed) 416 | 417 | for i in range(n_samples): 418 | dataset.next( & x_data_ptr, & x_ind_ptr, & xnnz, & y, 419 | & sample_weight) 420 | 421 | self._sgd_theta_step(x_data_ptr, x_ind_ptr, xnnz, y) 422 | 423 | if epoch > 0: 424 | validation_dataset.next( & validation_x_data_ptr, & validation_x_ind_ptr, 425 | & validation_xnnz, & validation_y, 426 | & validation_sample_weight) 427 | self._sgd_lambda_step(validation_x_data_ptr, validation_x_ind_ptr, 428 | validation_xnnz, validation_y) 429 | if self.verbose > 0: 430 | error_type = "MSE" if self.task == REGRESSION else "log loss" 431 | print "Training %s: %.5f" % (error_type, (self.sumloss / self.count)) 432 | 433 | def __getstate__(self): 434 | # Implements Pickle interface. 435 | field_names = ["w0", "w", "v", "num_factors", "num_attributes", 436 | "n_iter", "k0", "k1", "t", "t0", "l", "power_t", 437 | "min_target", "max_target", "sum", "sum_sqr", "task", 438 | "learning_rate_schedule", "learning_rate", 439 | "shuffle_training", "seed", "verbose", "reg_0", 440 | "reg_w", "reg_v", "grad_w", "grad_v", "sumloss", 441 | "count"] 442 | 443 | state = [field_names] 444 | for field in field_names: 445 | val = getattr(self ,field) 446 | state.append(val) 447 | return tuple(state) 448 | 449 | def __setstate__(self, state): 450 | # Implements Pickle interface. 
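        # state[0] holds the list of field names written by __getstate__;
        # the remaining entries are the corresponding values in the same order.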
451 | for n, field in enumerate(state[0]): 452 | setattr(self, field, state[n + 1]) 453 | 454 | cdef inline double max(double a, double b): 455 | return a if a >= b else b 456 | 457 | cdef inline double min(double a, double b): 458 | return a if a <= b else b 459 | 460 | cdef _log_loss(DOUBLE p, DOUBLE y): 461 | cdef DOUBLE z 462 | 463 | z = p * y 464 | # approximately equal and saves the computation of the log 465 | if z > 18: 466 | return exp(-z) 467 | if z < -18: 468 | return -z 469 | return log(1.0 + exp(-z)) 470 | 471 | cdef _squared_loss(DOUBLE p, DOUBLE y): 472 | return 0.5 * (p - y) * (p - y) 473 | 474 | cdef class CSRDataset: 475 | """An sklearn ``SequentialDataset`` backed by a scipy sparse CSR matrix. This is an ugly hack for the moment until I find the best way to link to sklearn. """ 476 | 477 | cdef Py_ssize_t n_samples 478 | cdef int current_index 479 | cdef int stride 480 | cdef DOUBLE *X_data_ptr 481 | cdef INTEGER *X_indptr_ptr 482 | cdef INTEGER *X_indices_ptr 483 | cdef DOUBLE *Y_data_ptr 484 | cdef np.ndarray feature_indices 485 | cdef INTEGER *feature_indices_ptr 486 | cdef np.ndarray index 487 | cdef INTEGER *index_data_ptr 488 | cdef DOUBLE *sample_weight_data 489 | 490 | def __cinit__(self, np.ndarray[DOUBLE, ndim=1, mode='c'] X_data, 491 | np.ndarray[INTEGER, ndim=1, mode='c'] X_indptr, 492 | np.ndarray[INTEGER, ndim=1, mode='c'] X_indices, 493 | np.ndarray[DOUBLE, ndim=1, mode='c'] Y, 494 | np.ndarray[DOUBLE, ndim=1, mode='c'] sample_weight): 495 | """Dataset backed by a scipy sparse CSR matrix. 496 | 497 | The feature indices of ``x`` are given by x_ind_ptr[0:nnz]. 498 | The corresponding feature values are given by 499 | x_data_ptr[0:nnz]. 500 | 501 | Parameters 502 | ---------- 503 | X_data : ndarray, dtype=np.float64, ndim=1, mode='c' 504 | The data array of the CSR matrix; a one-dimensional c-continuous 505 | numpy array of dtype np.float64. 506 | X_indptr : ndarray, dtype=np.int32, ndim=1, mode='c' 507 | The index pointer array of the CSR matrix; a one-dimensional 508 | c-continuous numpy array of dtype np.int32. 509 | X_indices : ndarray, dtype=np.int32, ndim=1, mode='c' 510 | The column indices array of the CSR matrix; a one-dimensional 511 | c-continuous numpy array of dtype np.int32. 512 | Y : ndarray, dtype=np.float64, ndim=1, mode='c' 513 | The target values; a one-dimensional c-continuous numpy array of 514 | dtype np.float64. 515 | sample_weights : ndarray, dtype=np.float64, ndim=1, mode='c' 516 | The weight of each sample; a one-dimensional c-continuous numpy 517 | array of dtype np.float64. 
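
        For example, the 2x3 matrix [[1, 0, 2], [0, 3, 0]] is stored in CSR
        form as X_data = [1., 2., 3.], X_indices = [0, 2, 1] and
        X_indptr = [0, 2, 3], so row ``i`` occupies
        X_data[X_indptr[i]:X_indptr[i + 1]].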
518 | """ 519 | self.n_samples = Y.shape[0] 520 | self.current_index = -1 521 | self.X_data_ptr = X_data.data 522 | self.X_indptr_ptr = X_indptr.data 523 | self.X_indices_ptr = X_indices.data 524 | self.Y_data_ptr = Y.data 525 | self.sample_weight_data = sample_weight.data 526 | # Use index array for fast shuffling 527 | cdef np.ndarray[INTEGER, ndim=1, 528 | mode='c'] index = np.arange(0, self.n_samples, 529 | dtype=np.int32) 530 | self.index = index 531 | self.index_data_ptr = index.data 532 | 533 | cdef void next(self, DOUBLE **x_data_ptr, INTEGER **x_ind_ptr, 534 | int *nnz, DOUBLE *y, DOUBLE *sample_weight): 535 | cdef int current_index = self.current_index 536 | if current_index >= (self.n_samples - 1): 537 | current_index = -1 538 | 539 | current_index += 1 540 | cdef int sample_idx = self.index_data_ptr[current_index] 541 | cdef int offset = self.X_indptr_ptr[sample_idx] 542 | y[0] = self.Y_data_ptr[sample_idx] 543 | x_data_ptr[0] = self.X_data_ptr + offset 544 | x_ind_ptr[0] = self.X_indices_ptr + offset 545 | nnz[0] = self.X_indptr_ptr[sample_idx + 1] - offset 546 | sample_weight[0] = self.sample_weight_data[sample_idx] 547 | 548 | self.current_index = current_index 549 | 550 | cdef void shuffle(self, seed): 551 | np.random.RandomState(seed).shuffle(self.index) 552 | --------------------------------------------------------------------------------