├── pyfm
│   ├── __init__.py
│   └── pylibfm.py
├── setup.py
├── README.md
└── pyfm_fast.pyx

/pyfm/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup, find_packages, Extension
2 | from Cython.Distutils import build_ext
3 | import numpy
4 | 
5 | setup(
6 |     maintainer='Corey Lynch',
7 |     name='pyfm',
8 |     packages=find_packages(),
9 |     url='https://github.com/coreylynch/pyFM',
10 |     cmdclass={'build_ext': build_ext},
11 |     ext_modules=[Extension("pyfm_fast", ["pyfm_fast.pyx"],
12 |                            libraries=["m"],
13 |                            include_dirs=[numpy.get_include()])]
14 | )
15 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Factorization Machines in Python
2 | 
3 | This is a Python implementation of Factorization Machines [1]. It uses stochastic gradient descent with adaptive regularization as its learning method, tuning the regularization parameters automatically while the model parameters are trained. See [2] for details. From libfm.org: "Factorization machines (FM) are a generic approach that allows to mimic most factorization models by feature engineering. This way, factorization machines combine the generality of feature engineering with the superiority of factorization models in estimating interactions between categorical variables of large domain."
4 | 
5 | [1] Steffen Rendle (2012): Factorization Machines with libFM, in ACM Trans. Intell. Syst. Technol., 3(3), May.
6 | [2] Steffen Rendle: Learning recommender systems with adaptive regularization. WSDM 2012: 133-142
7 | 
8 | ## Installation
9 | ```
10 | pip install git+https://github.com/coreylynch/pyFM
11 | ```
12 | 
13 | ## Dependencies
14 | * numpy
15 | * sklearn
16 | 
17 | ## Training Representation
18 | The easiest way to use this class is to represent your training data as a list of standard Python dict objects, where each dict maps an instance's categorical and real-valued variable names to their values. Then use a [sklearn DictVectorizer](http://scikit-learn.org/dev/modules/generated/sklearn.feature_extraction.DictVectorizer.html#sklearn.feature_extraction.DictVectorizer) to convert the list to a design matrix with a one-of-K or "one-hot" encoding.
19 | 
20 | Here's a toy example:
21 | ```python
22 | from pyfm import pylibfm
23 | from sklearn.feature_extraction import DictVectorizer
24 | import numpy as np
25 | train = [
26 |     {"user": "1", "item": "5", "age": 19},
27 |     {"user": "2", "item": "43", "age": 33},
28 |     {"user": "3", "item": "20", "age": 55},
29 |     {"user": "4", "item": "10", "age": 20},
30 | ]
31 | v = DictVectorizer()
32 | X = v.fit_transform(train)
33 | print(X.toarray())
34 | [[ 19.   0.   0.   0.   1.   1.   0.   0.   0.]
35 |  [ 33.   0.   0.   1.   0.   0.   1.   0.   0.]
36 |  [ 55.   0.   1.   0.   0.   0.   0.   1.   0.]
37 |  [ 20.   1.   0.   0.   0.   0.   0.   0.   1.]]
38 | y = np.repeat(1.0, X.shape[0])
39 | fm = pylibfm.FM()
40 | fm.fit(X, y)
41 | fm.predict(v.transform({"user": "1", "item": "10", "age": 24}))
42 | ```
43 | 
44 | ## Getting Started
45 | Here's an example on some real movie ratings data.
46 | 
47 | First get the smallest MovieLens ratings dataset from http://www.grouplens.org/system/files/ml-100k.zip.
48 | ml-100k contains the files u.item (movie ids and titles) and u.data (user_id, movie_id, rating, timestamp), as well as the ua.base/ua.test train/test split used below.
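If you would rather fetch and unpack the archive from Python, here is a minimal sketch using only the standard library (it assumes the zip URL above is still reachable and that the archive unpacks into an `ml-100k/` directory in the current working directory):

```python
import urllib.request
import zipfile

# Download the MovieLens 100k archive and unpack it next to this script,
# producing the ml-100k/ directory used in the example below.
url = "http://www.grouplens.org/system/files/ml-100k.zip"
urllib.request.urlretrieve(url, "ml-100k.zip")
with zipfile.ZipFile("ml-100k.zip") as archive:
    archive.extractall(".")
```

With the data in place, the example below loads the ratings, trains an FM regressor, and evaluates it on the held-out split.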
49 | ```python
50 | import numpy as np
51 | from sklearn.feature_extraction import DictVectorizer
52 | from pyfm import pylibfm
53 | 
54 | # Read in data
55 | def loadData(filename, path="ml-100k/"):
56 |     data = []
57 |     y = []
58 |     users = set()
59 |     items = set()
60 |     with open(path + filename) as f:
61 |         for line in f:
62 |             (user, movieid, rating, ts) = line.split('\t')
63 |             data.append({"user_id": str(user), "movie_id": str(movieid)})
64 |             y.append(float(rating))
65 |             users.add(user)
66 |             items.add(movieid)
67 | 
68 |     return (data, np.array(y), users, items)
69 | 
70 | (train_data, y_train, train_users, train_items) = loadData("ua.base")
71 | (test_data, y_test, test_users, test_items) = loadData("ua.test")
72 | v = DictVectorizer()
73 | X_train = v.fit_transform(train_data)
74 | X_test = v.transform(test_data)
75 | 
76 | # Build and train a Factorization Machine
77 | fm = pylibfm.FM(num_factors=10, num_iter=100, verbose=True, task="regression", initial_learning_rate=0.001, learning_rate_schedule="optimal")
78 | 
79 | fm.fit(X_train, y_train)
80 | Creating validation dataset of 0.01 of training for adaptive regularization
81 | -- Epoch 1
82 | Training MSE: 0.59477
83 | -- Epoch 2
84 | Training MSE: 0.51841
85 | -- Epoch 3
86 | Training MSE: 0.49125
87 | -- Epoch 4
88 | Training MSE: 0.47589
89 | -- Epoch 5
90 | Training MSE: 0.46571
91 | -- Epoch 6
92 | Training MSE: 0.45852
93 | -- Epoch 7
94 | Training MSE: 0.45322
95 | -- Epoch 8
96 | Training MSE: 0.44908
97 | -- Epoch 9
98 | Training MSE: 0.44557
99 | -- Epoch 10
100 | Training MSE: 0.44278
101 | ...
102 | -- Epoch 98
103 | Training MSE: 0.41863
104 | -- Epoch 99
105 | Training MSE: 0.41865
106 | -- Epoch 100
107 | Training MSE: 0.41874
108 | 
109 | # Evaluate
110 | preds = fm.predict(X_test)
111 | from sklearn.metrics import mean_squared_error
112 | print("FM MSE: %.4f" % mean_squared_error(y_test, preds))
113 | FM MSE: 0.9227
114 | 
115 | ```
116 | ## Classification example
117 | ```python
118 | import numpy as np
119 | from sklearn.feature_extraction import DictVectorizer
120 | from sklearn.model_selection import train_test_split
121 | from pyfm import pylibfm
122 | 
123 | from sklearn.datasets import make_classification
124 | 
125 | X, y = make_classification(n_samples=1000, n_features=100, n_clusters_per_class=1)
126 | data = [{v: k for k, v in dict(zip(i, range(len(i)))).items()} for i in X]  # one dict per row, mapping feature index -> value
127 | 
128 | X_train, X_test, y_train, y_test = train_test_split(data, y, test_size=0.1, random_state=42)
129 | 
130 | v = DictVectorizer()
131 | X_train = v.fit_transform(X_train)
132 | X_test = v.transform(X_test)
133 | 
134 | fm = pylibfm.FM(num_factors=50, num_iter=10, verbose=True, task="classification", initial_learning_rate=0.0001, learning_rate_schedule="optimal")
135 | 
136 | fm.fit(X_train, y_train)
137 | 
138 | Creating validation dataset of 0.01 of training for adaptive regularization
139 | -- Epoch 1
140 | Training log loss: 1.91885
141 | -- Epoch 2
142 | Training log loss: 1.62022
143 | -- Epoch 3
144 | Training log loss: 1.36736
145 | -- Epoch 4
146 | Training log loss: 1.15562
147 | -- Epoch 5
148 | Training log loss: 0.97961
149 | -- Epoch 6
150 | Training log loss: 0.83356
151 | -- Epoch 7
152 | Training log loss: 0.71208
153 | -- Epoch 8
154 | Training log loss: 0.61108
155 | -- Epoch 9
156 | Training log loss: 0.52705
157 | -- Epoch 10
158 | Training log loss: 0.45685
159 | 
160 | # Evaluate
161 | from sklearn.metrics import log_loss
162 | print("Validation log loss: %.4f" % log_loss(y_test, fm.predict(X_test)))
163 | Validation log loss: 1.5025
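
# Optional: for task="classification", fm.predict() returns sigmoid probabilities,
# so thresholding them at 0.5 gives hard 0/1 labels that can be scored with
# ordinary accuracy.
from sklearn.metrics import accuracy_score
print("Validation accuracy: %.4f" % accuracy_score(y_test, (fm.predict(X_test) > 0.5).astype(int)))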
164 | 165 | ``` 166 | -------------------------------------------------------------------------------- /pyfm/pylibfm.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.model_selection import train_test_split 3 | import random 4 | from pyfm_fast import FM_fast, CSRDataset 5 | 6 | LEARNING_RATE_TYPES = {"optimal": 0, "invscaling": 1, "constant": 2} 7 | TASKS = {"regression": 0, "classification" : 1} 8 | 9 | class FM: 10 | """Factorization machine fitted by minimizing a regularized empirical loss with adaptive SGD. 11 | 12 | Parameters 13 | ---------- 14 | 15 | num_factors : int 16 | The dimensionality of the factorized 2-way interactions 17 | num_iter : int 18 | Number of iterations 19 | k0 : bool 20 | Use bias. Defaults to true. 21 | k1 : bool 22 | Use 1-way interactions (learn feature weights). 23 | Defaults to true. 24 | init_stdev : double, optional 25 | Standard deviation for initialization of 2-way factors. 26 | Defaults to 0.01. 27 | validation_size : double, optional 28 | Proportion of the training set to use for validation. 29 | Defaults to 0.01. 30 | learning_rate_schedule : string, optional 31 | The learning rate: 32 | constant: eta = eta0 33 | optimal: eta = 1.0/(t+t0) [default] 34 | invscaling: eta = eta0 / pow(t, power_t) 35 | initial_learning_rate : double 36 | Defaults to 0.01 37 | power_t : double 38 | The exponent for inverse scaling learning rate [default 0.5]. 39 | t0 : double 40 | Constant in the denominator for optimal learning rate schedule. 41 | Defaults to 0.001. 42 | task : string 43 | regression: Labels are real values. 44 | classification: Labels are either positive or negative. 45 | verbose : bool 46 | Whether or not to print current iteration, training error 47 | shuffle_training: bool 48 | Whether or not to shuffle training dataset before learning 49 | seed : int 50 | The seed of the pseudo random number generator 51 | """ 52 | def __init__(self, 53 | num_factors=10, 54 | num_iter=1, 55 | k0=True, 56 | k1=True, 57 | init_stdev=0.1, 58 | validation_size=0.01, 59 | learning_rate_schedule="optimal", 60 | initial_learning_rate=0.01, 61 | power_t=0.5, 62 | t0=0.001, 63 | task='classification', 64 | verbose=True, 65 | shuffle_training=True, 66 | seed = 28): 67 | 68 | self.num_factors = num_factors 69 | self.num_iter = num_iter 70 | self.sum = np.zeros(self.num_factors) 71 | self.sum_sqr = np.zeros(self.num_factors) 72 | self.k0 = k0 73 | self.k1 = k1 74 | self.init_stdev = init_stdev 75 | self.validation_size = validation_size 76 | self.task = task 77 | self.shuffle_training = shuffle_training 78 | self.seed = seed 79 | 80 | # Learning rate Parameters 81 | self.learning_rate_schedule = learning_rate_schedule 82 | self.eta0 = initial_learning_rate 83 | self.power_t = power_t 84 | self.t = 1.0 85 | self.learning_rate = initial_learning_rate 86 | self.t0 = t0 87 | 88 | # Regularization Parameters (start with no regularization) 89 | self.reg_0 = 0.0 90 | self.reg_w = 0.0 91 | self.reg_v = np.repeat(0.0, num_factors) 92 | 93 | # local parameters in the lambda_update step 94 | self.lambda_w_grad = 0.0 95 | self.lambda_v_grad = 0.0 96 | self.sum_f = 0.0 97 | self.sum_f_dash_f = 0.0 98 | self.verbose = verbose 99 | 100 | def _validate_params(self): 101 | """Validate input params. 
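
        A ValueError is raised when ``shuffle_training`` is not a bool, when
        ``num_iter`` is not positive, or when ``eta0`` is not positive under
        the "constant" or "invscaling" learning rate schedules.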
""" 102 | if not isinstance(self.shuffle_training, bool): 103 | raise ValueError("shuffle must be either True or False") 104 | if self.num_iter <= 0: 105 | raise ValueError("n_iter must be > zero") 106 | if self.learning_rate_schedule in ("constant", "invscaling"): 107 | if self.eta0 <= 0.0: 108 | raise ValueError("eta0 must be > 0") 109 | 110 | def _get_learning_rate_type(self, learning_rate): 111 | """Map learning rate string to int for cython""" 112 | try: 113 | return LEARNING_RATE_TYPES[learning_rate] 114 | except KeyError: 115 | raise ValueError("learning rate %s " 116 | "is not supported. " % learning_rate) 117 | 118 | def _get_task(self, task): 119 | """Map task string to int for cython""" 120 | try: 121 | return TASKS[task] 122 | except KeyError: 123 | raise ValueError("task %s " 124 | "is not supported. " % task) 125 | 126 | def _bool_to_int(self, bool_arg): 127 | """Map bool to int for cython""" 128 | if bool_arg == True: 129 | return 1 130 | else: 131 | return 0 132 | 133 | def _prepare_y(self,y): 134 | """Maps labels to [-1, 1] space""" 135 | y_i = np.ones(y.shape, dtype=np.float64, order="C") 136 | y_i[y != 1] = -1.0 137 | return y_i 138 | 139 | def fit(self, X, y): 140 | """Fit factorization machine using Stochastic Gradient Descent with Adaptive Regularization. 141 | 142 | Parameters 143 | ---------- 144 | X : {array-like, sparse matrix}, shape = [n_samples, n_features] 145 | Training data 146 | 147 | y : numpy array of shape [n_samples] 148 | Target values 149 | 150 | Returns 151 | ------- 152 | self : returns an instance of self. 153 | """ 154 | if type(y) != np.ndarray: 155 | y = np.array(y) 156 | 157 | self._validate_params() 158 | 159 | if self.task == "classification": 160 | y = self._prepare_y(y) 161 | 162 | self.max_target = max(y) 163 | self.min_target = min(y) 164 | 165 | # convert member variables to ints for use in cython 166 | k0 = self._bool_to_int(self.k0) 167 | k1 = self._bool_to_int(self.k1) 168 | shuffle_training = self._bool_to_int(self.shuffle_training) 169 | verbose = self._bool_to_int(self.verbose) 170 | learning_rate_schedule = self._get_learning_rate_type(self.learning_rate_schedule) 171 | task = self._get_task(self.task) 172 | 173 | # use sklearn to create a validation dataset for lambda updates 174 | if self.verbose == True: 175 | print("Creating validation dataset of %.2f of training for adaptive regularization" % self.validation_size) 176 | X_train, validation, train_labels, validation_labels = train_test_split( 177 | X, y, test_size=self.validation_size) 178 | self.num_attribute = X_train.shape[1] 179 | 180 | # Convert datasets to sklearn sequential datasets for fast traversal 181 | X_train_dataset = _make_dataset(X_train, train_labels) 182 | validation_dataset = _make_dataset(validation, validation_labels) 183 | 184 | # Set up params 185 | self.w0 = 0.0 186 | self.w = np.zeros(self.num_attribute) 187 | np.random.seed(seed=self.seed) 188 | self.v = np.random.normal(scale=self.init_stdev,size=(self.num_factors, self.num_attribute)) 189 | 190 | self.fm_fast = FM_fast(self.w, 191 | self.v, 192 | self.num_factors, 193 | self.num_attribute, 194 | self.num_iter, 195 | k0, 196 | k1, 197 | self.w0, 198 | self.t, 199 | self.t0, 200 | self.power_t, 201 | self.min_target, 202 | self.max_target, 203 | self.eta0, 204 | learning_rate_schedule, 205 | shuffle_training, 206 | task, 207 | self.seed, 208 | verbose) 209 | 210 | return self.fm_fast.fit(X_train_dataset, validation_dataset) 211 | 212 | # report epoch information 213 | if self.verbose == True: 214 | 
print("-- Epoch %d" % (epoch + 1)) 215 | print("Train MSE: %.5f" % (self.sumloss / self.count)) 216 | 217 | def predict(self, X): 218 | """Predict using the factorization machine 219 | 220 | Parameters 221 | ---------- 222 | X : sparse matrix, shape = [n_samples, n_features] 223 | or 224 | X : single instance [1, n_features] 225 | 226 | Returns 227 | ------- 228 | float if X is one instance 229 | array, shape = [n_samples] if X is sparse matrix 230 | Predicted target values per element in X. 231 | """ 232 | sparse_X = _make_dataset(X, np.ones(X.shape[0])) 233 | 234 | return self.fm_fast._predict(sparse_X) 235 | 236 | def _make_dataset(X, y_i): 237 | """Create ``Dataset`` abstraction for sparse and dense inputs.""" 238 | sample_weight = np.ones(X.shape[0], dtype=np.float64, order='C') # ignore sample weight for the moment 239 | dataset = CSRDataset(X.data, X.indptr, X.indices, y_i, sample_weight) 240 | return dataset 241 | 242 | -------------------------------------------------------------------------------- /pyfm_fast.pyx: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | # cython: cdivision=True 3 | # cython: boundscheck=False 4 | # cython: wraparound=False 5 | # 6 | # Author: Corey Lynch 7 | # 8 | # License: BSD Style. 9 | 10 | import numpy as np 11 | import sys 12 | from time import time 13 | 14 | from libc.math cimport exp, log, pow 15 | cimport numpy as np 16 | cimport cython 17 | 18 | np.import_array() 19 | 20 | ctypedef np.float64_t DOUBLE 21 | ctypedef np.int32_t INTEGER 22 | 23 | # MODEL CONSTANTS 24 | DEF REGRESSION = 0 25 | DEF CLASSIFICATION = 1 26 | DEF OPTIMAL = 0 27 | DEF INVERSE_SCALING = 1 28 | 29 | cdef class FM_fast(object): 30 | """Factorization Machine fitted by minimizing a regularized empirical loss with adaptive SGD. 
31 | 32 | Parameters 33 | ---------- 34 | w : np.ndarray[DOUBLE, ndim=1, mode='c'] 35 | v : ndarray[DOUBLE, ndim=2, mode='c'] 36 | num_factors : int 37 | num_attributes : int 38 | n_iter : int 39 | k0 : int 40 | k1 : int 41 | w0 : double 42 | t : double 43 | t0 : double 44 | l : double 45 | power_t : double 46 | min_target : double 47 | max_target : double 48 | eta0 : double 49 | learning_rate_schedule : int 50 | shuffle_training : int 51 | task : int 52 | seed : int 53 | verbose : int 54 | """ 55 | 56 | cdef public double w0 57 | cdef public np.ndarray w 58 | cdef public np.ndarray v 59 | cdef public int num_factors 60 | cdef public int num_attributes 61 | cdef public int n_iter 62 | cdef public int k0 63 | cdef public int k1 64 | 65 | cdef public DOUBLE t 66 | cdef public DOUBLE t0 67 | cdef public DOUBLE l 68 | cdef public DOUBLE power_t 69 | cdef public DOUBLE min_target 70 | cdef public DOUBLE max_target 71 | cdef public np.ndarray sum 72 | cdef public np.ndarray sum_sqr 73 | cdef public int task 74 | cdef public int learning_rate_schedule 75 | cdef public double learning_rate 76 | cdef public int shuffle_training 77 | cdef public int seed 78 | cdef public int verbose 79 | 80 | cdef public DOUBLE reg_0 81 | cdef public DOUBLE reg_w 82 | cdef public np.ndarray reg_v 83 | 84 | cdef public np.ndarray grad_w 85 | cdef public np.ndarray grad_v 86 | 87 | cdef public DOUBLE sumloss 88 | cdef public int count 89 | 90 | def __init__(self, 91 | np.ndarray[DOUBLE, ndim=1, mode='c'] w, 92 | np.ndarray[DOUBLE, ndim=2, mode='c'] v, 93 | int num_factors, 94 | int num_attributes, 95 | int n_iter, 96 | int k0, 97 | int k1, 98 | double w0, 99 | double t, 100 | double t0, 101 | double power_t, 102 | double min_target, 103 | double max_target, 104 | double eta0, 105 | int learning_rate_schedule, 106 | int shuffle_training, 107 | int task, 108 | int seed, 109 | int verbose): 110 | 111 | self.w0 = w0 112 | self.w = w 113 | self.v = v 114 | self.num_factors = num_factors 115 | self.num_attributes = num_attributes 116 | self.n_iter = n_iter 117 | self.k0 = k0 118 | self.k1 = k1 119 | self.t = 1 120 | self.t0 = t0 121 | self.learning_rate = eta0 122 | self.power_t = power_t 123 | self.min_target = min_target 124 | self.max_target = max_target 125 | self.sum = np.zeros(self.num_factors) 126 | self.sum_sqr = np.zeros(self.num_factors) 127 | self.task = task 128 | self.learning_rate_schedule = learning_rate_schedule 129 | self.shuffle_training = shuffle_training 130 | self.seed = seed 131 | self.verbose = verbose 132 | 133 | self.reg_0 = 0.0 134 | self.reg_w = 0.0 135 | self.reg_v = np.zeros(self.num_factors) 136 | 137 | self.sumloss = 0.0 138 | self.count = 0 139 | 140 | self.grad_w = np.zeros(self.num_attributes) 141 | self.grad_v = np.zeros((self.num_factors, self.num_attributes)) 142 | 143 | cdef _predict_instance(self, DOUBLE * x_data_ptr, 144 | INTEGER * x_ind_ptr, 145 | int xnnz): 146 | 147 | # Helper variables 148 | cdef DOUBLE result = 0.0 149 | cdef int feature 150 | cdef unsigned int i = 0 151 | cdef unsigned int f = 0 152 | cdef DOUBLE d 153 | 154 | # map instance variables to local variables 155 | cdef DOUBLE w0 = self.w0 156 | cdef np.ndarray[DOUBLE, ndim=1, mode='c'] w = self.w 157 | cdef np.ndarray[DOUBLE, ndim=2, mode='c'] v = self.v 158 | cdef np.ndarray[DOUBLE, ndim=1, mode='c'] sum_ = np.zeros(self.num_factors) 159 | cdef np.ndarray[DOUBLE, ndim=1, mode='c'] sum_sqr_ = np.zeros(self.num_factors) 160 | 161 | if self.k0 > 0: 162 | result += w0 163 | if self.k1 > 0: 164 | for i in range(xnnz): 
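                # first-order (linear) term: accumulate w[feature] * x[feature]
                # over the nonzero entries of this instance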
165 | feature = x_ind_ptr[i] 166 | result += w[feature] * x_data_ptr[i] 167 | for f in range(self.num_factors): 168 | sum_[f] = 0.0 169 | sum_sqr_[f] = 0.0 170 | for i in range(xnnz): 171 | feature = x_ind_ptr[i] 172 | d = v[f, feature] * x_data_ptr[i] 173 | sum_[f] += d 174 | sum_sqr_[f] += d*d 175 | result += 0.5 * (sum_[f] * sum_[f] - sum_sqr_[f]) 176 | 177 | # pass sum to sgd_theta 178 | self.sum = sum_ 179 | return result 180 | 181 | cdef _predict_scaled(self, DOUBLE * x_data_ptr, 182 | INTEGER * x_ind_ptr, 183 | int xnnz): 184 | cdef DOUBLE result = 0.0 185 | cdef unsigned int i = 0 186 | cdef unsigned int f = 0 187 | cdef DOUBLE d 188 | cdef DOUBLE w_dash = 0.0 189 | cdef DOUBLE v_dash = 0.0 190 | 191 | # map instance variables to local variables 192 | cdef DOUBLE w0 = self.w0 193 | cdef np.ndarray[DOUBLE, ndim=1, mode='c'] w = self.w 194 | cdef np.ndarray[DOUBLE, ndim=2, mode='c'] v = self.v 195 | cdef np.ndarray[DOUBLE, ndim=1, mode='c'] grad_w = self.grad_w 196 | cdef np.ndarray[DOUBLE, ndim=2, mode='c'] grad_v = self.grad_v 197 | cdef np.ndarray[DOUBLE, ndim=1, mode='c'] sum_ = np.zeros(self.num_factors) 198 | cdef np.ndarray[DOUBLE, ndim=1, mode='c'] sum_sqr_ = np.zeros(self.num_factors) 199 | cdef DOUBLE learning_rate = self.learning_rate 200 | cdef DOUBLE reg_w = self.reg_w 201 | cdef np.ndarray[DOUBLE, ndim=1, mode='c'] reg_v = self.reg_v 202 | 203 | if self.k0 > 0: 204 | result += w0 205 | if self.k1 > 0: 206 | for i in xrange(xnnz): 207 | feature = x_ind_ptr[i] 208 | assert(feature < self.num_attributes) 209 | w_dash = w[feature] - learning_rate * (grad_w[feature] + 2 * reg_w * w[feature]) 210 | result += w_dash * x_data_ptr[i] 211 | for f in xrange(self.num_factors): 212 | sum_[f] = 0.0 213 | sum_sqr_[f] = 0.0 214 | for i in xrange(xnnz): 215 | feature = x_ind_ptr[i] 216 | v_dash = v[f,feature] - learning_rate * (grad_v[f,feature] + 2 * reg_v[f] * v[f,feature]) 217 | d = v_dash * x_data_ptr[i] 218 | sum_[f] += d 219 | sum_sqr_[f] += d*d 220 | result += 0.5 * (sum_[f]*sum_[f] - sum_sqr_[f]) 221 | return result 222 | 223 | def _predict(self, CSRDataset dataset): 224 | 225 | # Helper access variables 226 | cdef unsigned int i = 0 227 | cdef Py_ssize_t n_samples = dataset.n_samples 228 | cdef DOUBLE * x_data_ptr = NULL 229 | cdef INTEGER * x_ind_ptr = NULL 230 | cdef int xnnz 231 | cdef DOUBLE sample_weight = 1.0 232 | cdef DOUBLE y_placeholder 233 | cdef DOUBLE p = 0.0 234 | 235 | cdef np.ndarray[DOUBLE, ndim=1, mode='c'] return_preds = np.zeros(n_samples) 236 | 237 | for i in range(n_samples): 238 | dataset.next(& x_data_ptr, & x_ind_ptr, & xnnz, & y_placeholder, 239 | & sample_weight) 240 | p = self._predict_instance(x_data_ptr, x_ind_ptr, xnnz) 241 | if self.task == REGRESSION: 242 | p = min(self.max_target, p) 243 | p = max(self.min_target, p) 244 | else: 245 | p = (1.0 / (1.0 + exp(-p))) 246 | return_preds[i] = p 247 | return return_preds 248 | 249 | cdef _sgd_theta_step(self, DOUBLE * x_data_ptr, 250 | INTEGER * x_ind_ptr, 251 | int xnnz, 252 | DOUBLE y): 253 | 254 | cdef DOUBLE mult = 0.0 255 | cdef DOUBLE p 256 | cdef int feature 257 | cdef unsigned int i = 0 258 | cdef unsigned int f = 0 259 | cdef DOUBLE d 260 | cdef DOUBLE grad_0 261 | 262 | cdef DOUBLE w0 = self.w0 263 | cdef np.ndarray[DOUBLE, ndim=1, mode='c'] w = self.w 264 | cdef np.ndarray[DOUBLE, ndim=2, mode='c'] v = self.v 265 | cdef np.ndarray[DOUBLE, ndim=1, mode='c'] grad_w = self.grad_w 266 | cdef np.ndarray[DOUBLE, ndim=2, mode='c'] grad_v = self.grad_v 267 | cdef DOUBLE learning_rate = 
self.learning_rate 268 | cdef DOUBLE reg_0 = self.reg_0 269 | cdef DOUBLE reg_w = self.reg_w 270 | cdef np.ndarray[DOUBLE, ndim=1, mode='c'] reg_v = self.reg_v 271 | 272 | p = self._predict_instance(x_data_ptr, x_ind_ptr, xnnz) 273 | 274 | if self.task == REGRESSION: 275 | p = min(self.max_target, p) 276 | p = max(self.min_target, p) 277 | mult = 2 * (p - y); 278 | else: 279 | mult = y * ( (1.0 / (1.0+exp(-y*p))) - 1.0) 280 | 281 | # Set learning schedule 282 | if self.learning_rate_schedule == OPTIMAL: 283 | self.learning_rate = 1.0 / (self.t + self.t0) 284 | 285 | elif self.learning_rate_schedule == INVERSE_SCALING: 286 | self.learning_rate = self.learning_rate / pow(self.t, self.power_t) 287 | 288 | if self.verbose > 0: 289 | self.sumloss += _squared_loss(p,y) if self.task == REGRESSION else _log_loss(p,y) 290 | 291 | # Update global bias 292 | if self.k0 > 0: 293 | grad_0 = mult 294 | w0 -= learning_rate * (grad_0 + 2 * reg_0 * w0) 295 | 296 | # Update feature biases 297 | if self.k1 > 0: 298 | for i in range(xnnz): 299 | feature = x_ind_ptr[i] 300 | grad_w[feature] = mult * x_data_ptr[i] 301 | w[feature] -= learning_rate * (grad_w[feature] 302 | + 2 * reg_w * w[feature]) 303 | 304 | # Update feature factor vectors 305 | for f in range(self.num_factors): 306 | for i in range(xnnz): 307 | feature = x_ind_ptr[i] 308 | grad_v[f,feature] = mult * (x_data_ptr[i] * (self.sum[f] - v[f,feature] * x_data_ptr[i])) 309 | v[f,feature] -= learning_rate * (grad_v[f,feature] + 2 * reg_v[f] * v[f,feature]) 310 | 311 | # Pass updated vars to other functions 312 | self.learning_rate = learning_rate 313 | self.w0 = w0 314 | self.w = w 315 | self.v = v 316 | self.grad_w = grad_w 317 | self.grad_v = grad_v 318 | 319 | self.t += 1 320 | self.count += 1 321 | 322 | cdef _sgd_lambda_step(self, DOUBLE * validation_x_data_ptr, 323 | INTEGER * validation_x_ind_ptr, 324 | int validation_xnnz, 325 | DOUBLE validation_y): 326 | 327 | cdef DOUBLE sum_f 328 | cdef DOUBLE sum_f_dash 329 | cdef DOUBLE sum_f_dash_f 330 | cdef DOUBLE p 331 | cdef DOUBLE grad_loss 332 | cdef int feature 333 | cdef unsigned int i 334 | cdef unsigned int f 335 | cdef DOUBLE lambda_w_grad = 0.0 336 | cdef DOUBLE lambda_v_grad = 0.0 337 | cdef DOUBLE v_dash = 0.0 338 | 339 | cdef np.ndarray[DOUBLE, ndim=1, mode='c'] w = self.w 340 | cdef np.ndarray[DOUBLE, ndim=2, mode='c'] v = self.v 341 | cdef np.ndarray[DOUBLE, ndim=1, mode='c'] grad_w = self.grad_w 342 | cdef np.ndarray[DOUBLE, ndim=2, mode='c'] grad_v = self.grad_v 343 | cdef DOUBLE learning_rate = self.learning_rate 344 | cdef DOUBLE reg_0 = self.reg_0 345 | cdef DOUBLE reg_w = self.reg_w 346 | cdef np.ndarray[DOUBLE, ndim=1, mode='c'] reg_v = self.reg_v 347 | 348 | p = self._predict_scaled(validation_x_data_ptr, validation_x_ind_ptr, validation_xnnz) 349 | if self.task == REGRESSION: 350 | p = min(self.max_target, p) 351 | p = max(self.min_target, p) 352 | grad_loss = 2 * (p - validation_y) 353 | else: 354 | grad_loss = validation_y * ( (1.0 / (1.0 + exp(-validation_y*p))) - 1.0) 355 | 356 | if self.k1 > 0: 357 | lambda_w_grad = 0.0 358 | for i in xrange(validation_xnnz): 359 | feature = validation_x_ind_ptr[i] 360 | lambda_w_grad += validation_x_data_ptr[i] * w[feature] 361 | lambda_w_grad = -2 * learning_rate * lambda_w_grad 362 | reg_w -= learning_rate * grad_loss * lambda_w_grad 363 | reg_w = max(0.0, reg_w) 364 | 365 | for f in xrange(self.num_factors): 366 | sum_f = 0.0 367 | sum_f_dash = 0.0 368 | sum_f_dash_f = 0.0 369 | 370 | for i in xrange(validation_xnnz): 371 | feature = 
validation_x_ind_ptr[i] 372 | v_dash = v[f,feature] - learning_rate * (grad_v[f,feature] + 2 * reg_v[f] * v[f,feature]) 373 | sum_f_dash += v_dash * validation_x_data_ptr[i] 374 | sum_f += v[f,feature] * validation_x_data_ptr[i] 375 | sum_f_dash_f += v_dash * validation_x_data_ptr[i] * v[f,feature] * validation_x_data_ptr[i] 376 | lambda_v_grad = -2 * learning_rate * (sum_f_dash * sum_f - sum_f_dash_f) 377 | reg_v[f] -= learning_rate * grad_loss * lambda_v_grad 378 | reg_v[f] = max(0.0, reg_v[f]) 379 | 380 | # Pass updated vars to other functions 381 | self.reg_w = reg_w 382 | self.reg_v = reg_v 383 | 384 | def fit(self, CSRDataset dataset, CSRDataset validation_dataset): 385 | 386 | # get the data information into easy vars 387 | cdef Py_ssize_t n_samples = dataset.n_samples 388 | cdef Py_ssize_t n_validation_samples = validation_dataset.n_samples 389 | 390 | cdef DOUBLE * x_data_ptr = NULL 391 | cdef INTEGER * x_ind_ptr = NULL 392 | 393 | cdef DOUBLE * validation_x_data_ptr = NULL 394 | cdef INTEGER * validation_x_ind_ptr = NULL 395 | 396 | # helper variables 397 | cdef int xnnz 398 | cdef DOUBLE y = 0.0 399 | cdef DOUBLE validation_y = 0.0 400 | cdef int validation_xnnz 401 | cdef unsigned int count = 0 402 | cdef unsigned int epoch = 0 403 | cdef unsigned int i = 0 404 | 405 | cdef DOUBLE sample_weight = 1.0 406 | cdef DOUBLE validation_sample_weight = 1.0 407 | 408 | for epoch in range(self.n_iter): 409 | 410 | if self.verbose > 0: 411 | print("-- Epoch %d" % (epoch + 1)) 412 | self.count = 0 413 | self.sumloss = 0 414 | if self.shuffle_training: 415 | dataset.shuffle(self.seed) 416 | 417 | for i in range(n_samples): 418 | dataset.next( & x_data_ptr, & x_ind_ptr, & xnnz, & y, 419 | & sample_weight) 420 | 421 | self._sgd_theta_step(x_data_ptr, x_ind_ptr, xnnz, y) 422 | 423 | if epoch > 0: 424 | validation_dataset.next( & validation_x_data_ptr, & validation_x_ind_ptr, 425 | & validation_xnnz, & validation_y, 426 | & validation_sample_weight) 427 | self._sgd_lambda_step(validation_x_data_ptr, validation_x_ind_ptr, 428 | validation_xnnz, validation_y) 429 | if self.verbose > 0: 430 | error_type = "MSE" if self.task == REGRESSION else "log loss" 431 | print "Training %s: %.5f" % (error_type, (self.sumloss / self.count)) 432 | 433 | def __getstate__(self): 434 | # Implements Pickle interface. 435 | field_names = ["w0", "w", "v", "num_factors", "num_attributes", 436 | "n_iter", "k0", "k1", "t", "t0", "l", "power_t", 437 | "min_target", "max_target", "sum", "sum_sqr", "task", 438 | "learning_rate_schedule", "learning_rate", 439 | "shuffle_training", "seed", "verbose", "reg_0", 440 | "reg_w", "reg_v", "grad_w", "grad_v", "sumloss", 441 | "count"] 442 | 443 | state = [field_names] 444 | for field in field_names: 445 | val = getattr(self ,field) 446 | state.append(val) 447 | return tuple(state) 448 | 449 | def __setstate__(self, state): 450 | # Implements Pickle interface. 
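        # state[0] holds the list of field names written by __getstate__;
        # the remaining entries are the corresponding values in the same order.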
451 | for n, field in enumerate(state[0]): 452 | setattr(self, field, state[n + 1]) 453 | 454 | cdef inline double max(double a, double b): 455 | return a if a >= b else b 456 | 457 | cdef inline double min(double a, double b): 458 | return a if a <= b else b 459 | 460 | cdef _log_loss(DOUBLE p, DOUBLE y): 461 | cdef DOUBLE z 462 | 463 | z = p * y 464 | # approximately equal and saves the computation of the log 465 | if z > 18: 466 | return exp(-z) 467 | if z < -18: 468 | return -z 469 | return log(1.0 + exp(-z)) 470 | 471 | cdef _squared_loss(DOUBLE p, DOUBLE y): 472 | return 0.5 * (p - y) * (p - y) 473 | 474 | cdef class CSRDataset: 475 | """An sklearn ``SequentialDataset`` backed by a scipy sparse CSR matrix. This is an ugly hack for the moment until I find the best way to link to sklearn. """ 476 | 477 | cdef Py_ssize_t n_samples 478 | cdef int current_index 479 | cdef int stride 480 | cdef DOUBLE *X_data_ptr 481 | cdef INTEGER *X_indptr_ptr 482 | cdef INTEGER *X_indices_ptr 483 | cdef DOUBLE *Y_data_ptr 484 | cdef np.ndarray feature_indices 485 | cdef INTEGER *feature_indices_ptr 486 | cdef np.ndarray index 487 | cdef INTEGER *index_data_ptr 488 | cdef DOUBLE *sample_weight_data 489 | 490 | def __cinit__(self, np.ndarray[DOUBLE, ndim=1, mode='c'] X_data, 491 | np.ndarray[INTEGER, ndim=1, mode='c'] X_indptr, 492 | np.ndarray[INTEGER, ndim=1, mode='c'] X_indices, 493 | np.ndarray[DOUBLE, ndim=1, mode='c'] Y, 494 | np.ndarray[DOUBLE, ndim=1, mode='c'] sample_weight): 495 | """Dataset backed by a scipy sparse CSR matrix. 496 | 497 | The feature indices of ``x`` are given by x_ind_ptr[0:nnz]. 498 | The corresponding feature values are given by 499 | x_data_ptr[0:nnz]. 500 | 501 | Parameters 502 | ---------- 503 | X_data : ndarray, dtype=np.float64, ndim=1, mode='c' 504 | The data array of the CSR matrix; a one-dimensional c-continuous 505 | numpy array of dtype np.float64. 506 | X_indptr : ndarray, dtype=np.int32, ndim=1, mode='c' 507 | The index pointer array of the CSR matrix; a one-dimensional 508 | c-continuous numpy array of dtype np.int32. 509 | X_indices : ndarray, dtype=np.int32, ndim=1, mode='c' 510 | The column indices array of the CSR matrix; a one-dimensional 511 | c-continuous numpy array of dtype np.int32. 512 | Y : ndarray, dtype=np.float64, ndim=1, mode='c' 513 | The target values; a one-dimensional c-continuous numpy array of 514 | dtype np.float64. 515 | sample_weights : ndarray, dtype=np.float64, ndim=1, mode='c' 516 | The weight of each sample; a one-dimensional c-continuous numpy 517 | array of dtype np.float64. 
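
        For example, the 2x3 matrix [[1, 0, 2], [0, 3, 0]] is stored in CSR
        form as X_data = [1., 2., 3.], X_indices = [0, 2, 1] and
        X_indptr = [0, 2, 3], so row ``i`` occupies
        X_data[X_indptr[i]:X_indptr[i + 1]].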
518 | """ 519 | self.n_samples = Y.shape[0] 520 | self.current_index = -1 521 | self.X_data_ptr = X_data.data 522 | self.X_indptr_ptr = X_indptr.data 523 | self.X_indices_ptr = X_indices.data 524 | self.Y_data_ptr = Y.data 525 | self.sample_weight_data = sample_weight.data 526 | # Use index array for fast shuffling 527 | cdef np.ndarray[INTEGER, ndim=1, 528 | mode='c'] index = np.arange(0, self.n_samples, 529 | dtype=np.int32) 530 | self.index = index 531 | self.index_data_ptr = index.data 532 | 533 | cdef void next(self, DOUBLE **x_data_ptr, INTEGER **x_ind_ptr, 534 | int *nnz, DOUBLE *y, DOUBLE *sample_weight): 535 | cdef int current_index = self.current_index 536 | if current_index >= (self.n_samples - 1): 537 | current_index = -1 538 | 539 | current_index += 1 540 | cdef int sample_idx = self.index_data_ptr[current_index] 541 | cdef int offset = self.X_indptr_ptr[sample_idx] 542 | y[0] = self.Y_data_ptr[sample_idx] 543 | x_data_ptr[0] = self.X_data_ptr + offset 544 | x_ind_ptr[0] = self.X_indices_ptr + offset 545 | nnz[0] = self.X_indptr_ptr[sample_idx + 1] - offset 546 | sample_weight[0] = self.sample_weight_data[sample_idx] 547 | 548 | self.current_index = current_index 549 | 550 | cdef void shuffle(self, seed): 551 | np.random.RandomState(seed).shuffle(self.index) 552 | --------------------------------------------------------------------------------