├── .gitignore ├── 1.py ├── FM.py ├── LoadData.py ├── NeuralFM.py ├── README.md ├── __init__.py └── data └── frappe ├── README.txt ├── frappe.test.libfm ├── frappe.train.libfm └── frappe.validation.libfm /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ -------------------------------------------------------------------------------- /1.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | # tf.set_random_seed(1234) 5 | # a = tf.random_uniform([1]) 6 | # b = tf.random_uniform([1]) 7 | # with tf.Session() as sess1: 8 | # print(sess1.run(a)) # generates 'A1' 9 | # print(sess1.run(a)) # generates 'A2' 10 | # print(sess1.run(b)) # generates 'B1' 11 | # print(sess1.run(b)) # generates 'B2' 12 | # with tf.Session() as sess2: 13 | # print(sess2.run(a)) # generates 'A1' 14 | # print(sess2.run(a)) # generates 'A2' 15 | # print(sess2.run(b)) # generates 'B1' 16 | # print(sess2.run(b)) # generates 'B2' 17 | 18 | # all = dict() 19 | # all[1] =2 20 | # print(all) 21 | 22 | saver = tf.train.Saver 23 | 24 | a = tf.Variable([1,2]) 25 | b = tf.Variable([2,4]) 26 | c = tf.matmul(a,b,transpose_b=True) 27 | tf.add_to_collection('c',c) 28 | sess = tf.Session() 29 | sess.run(c) 30 | saver.save(sess,'my_model') -------------------------------------------------------------------------------- /FM.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Tensorflow implementation of Factorization Machines (FM) as described in: 3 | Xiangnan He, Tat-Seng Chua. Neural Factorization Machines for Sparse Predictive Analytics. In Proc. of SIGIR 2017. 4 | 5 | Note that the original paper of FM is: Steffen Rendle. Factorization Machines. In Proc. of ICDM 2010. 6 | 7 | @author: 8 | Xiangnan He (xiangnanhe@gmail.com) 9 | Lizi Liao (liaolizi.llz@gmail.com) 10 | 11 | @references: 12 | ''' 13 | import math 14 | import os 15 | import numpy as np 16 | import tensorflow as tf 17 | from sklearn.base import BaseEstimator, TransformerMixin 18 | from sklearn.metrics import mean_squared_error 19 | from sklearn.metrics import accuracy_score 20 | from sklearn.metrics import log_loss 21 | from time import time 22 | import argparse 23 | import LoadData as DATA 24 | from tensorflow.contrib.layers.python.layers import batch_norm as batch_norm 25 | 26 | #################### Arguments #################### 27 | def parse_args(): 28 | parser = argparse.ArgumentParser(description="Run FM.") 29 | parser.add_argument('--path', nargs='?', default='./data/', 30 | help='Input data path.') 31 | parser.add_argument('--dataset', nargs='?', default='frappe', 32 | help='Choose a dataset.') 33 | parser.add_argument('--epoch', type=int, default=100, 34 | help='Number of epochs.') 35 | parser.add_argument('--pretrain', type=int, default=-1, 36 | help='flag for pretrain. 1: initialize from pretrain; 0: randomly initialize; -1: save the model to pretrain file') 37 | parser.add_argument('--batch_size', type=int, default=128, 38 | help='Batch size.') 39 | parser.add_argument('--hidden_factor', type=int, default=64, 40 | help='Number of hidden factors.') 41 | parser.add_argument('--lamda', type=float, default=0, 42 | help='Regularizer for bilinear part.') 43 | parser.add_argument('--keep_prob', type=float, default=0.5, 44 | help='Keep probility (1-dropout_ratio) for the Bi-Interaction layer. 
1: no dropout') 45 | parser.add_argument('--lr', type=float, default=0.05, 46 | help='Learning rate.') 47 | parser.add_argument('--loss_type', nargs='?', default='square_loss', 48 | help='Specify a loss type (square_loss or log_loss).') 49 | parser.add_argument('--optimizer', nargs='?', default='AdagradOptimizer', 50 | help='Specify an optimizer type (AdamOptimizer, AdagradOptimizer, GradientDescentOptimizer, MomentumOptimizer).') 51 | parser.add_argument('--verbose', type=int, default=1, 52 | help='Show the results per X epochs (0, 1 ... any positive integer)') 53 | parser.add_argument('--batch_norm', type=int, default=0, 54 | help='Whether to perform batch normaization (0 or 1)') 55 | 56 | return parser.parse_args() 57 | 58 | class FM(BaseEstimator, TransformerMixin): 59 | def __init__(self, features_M, pretrain_flag, save_file, hidden_factor, loss_type, epoch, batch_size, learning_rate, lamda_bilinear, keep, 60 | optimizer_type, batch_norm, verbose, random_seed=2016): 61 | # bind params to class 62 | self.batch_size = batch_size 63 | self.learning_rate = learning_rate 64 | self.hidden_factor = hidden_factor # ? 65 | self.save_file = save_file # ? 66 | self.pretrain_flag = pretrain_flag 67 | self.loss_type = loss_type 68 | self.features_M = features_M # ? 69 | self.lamda_bilinear = lamda_bilinear # ? 70 | self.keep = keep # ? 71 | self.epoch = epoch 72 | self.random_seed = random_seed 73 | self.optimizer_type = optimizer_type 74 | self.batch_norm = batch_norm # ? 75 | self.verbose = verbose # ? 76 | # performance of each epoch 77 | self.train_rmse, self.valid_rmse, self.test_rmse = [], [], [] 78 | 79 | # init all variables in a tensorflow graph 80 | self._init_graph() 81 | 82 | def _init_graph(self): 83 | ''' 84 | Init a tensorflow Graph containing: input data, variables, model, loss, optimizer 85 | ''' 86 | self.graph = tf.Graph() 87 | with self.graph.as_default(): # , tf.device('/cpu:0'): 88 | # Set graph level random seed 89 | tf.set_random_seed(self.random_seed) 90 | # Input data. 91 | self.train_features = tf.placeholder(tf.int32, shape=[None, None]) # None * features_M 92 | self.train_labels = tf.placeholder(tf.float32, shape=[None, 1]) # None * 1 93 | self.dropout_keep = tf.placeholder(tf.float32) 94 | self.train_phase = tf.placeholder(tf.bool) # ? 95 | 96 | # Variables. 97 | self.weights = self._initialize_weights() 98 | 99 | # Model. 100 | # _________ sum_square part _____________ 101 | # get the summed up embeddings of features. 
102 | nonzero_embeddings = tf.nn.embedding_lookup(self.weights['feature_embeddings'], self.train_features) 103 | self.summed_features_emb = tf.reduce_sum(nonzero_embeddings, 1) # None * K 104 | # get the element-multiplication 105 | self.summed_features_emb_square = tf.square(self.summed_features_emb) # None * K 106 | 107 | # _________ square_sum part _____________ 108 | self.squared_features_emb = tf.square(nonzero_embeddings) 109 | self.squared_sum_features_emb = tf.reduce_sum(self.squared_features_emb, 1) # None * K 110 | 111 | # ________ FM __________ 112 | self.FM = 0.5 * tf.subtract(self.summed_features_emb_square, self.squared_sum_features_emb) # None * K 113 | if self.batch_norm: 114 | self.FM = self.batch_norm_layer(self.FM, train_phase=self.train_phase, scope_bn='bn_fm') 115 | self.FM = tf.nn.dropout(self.FM, self.dropout_keep) # dropout at the FM layer 116 | 117 | # _________out _________ 118 | Bilinear = tf.reduce_sum(self.FM, 1, keep_dims=True) # None * 1 119 | self.Feature_bias = tf.reduce_sum(tf.nn.embedding_lookup(self.weights['feature_bias'], self.train_features) , 1) # None * 1 120 | Bias = self.weights['bias'] * tf.ones_like(self.train_labels) # None * 1 121 | self.out = tf.add_n([Bilinear, self.Feature_bias, Bias]) # None * 1 122 | 123 | # Compute the loss. 124 | if self.loss_type == 'square_loss': 125 | if self.lamda_bilinear > 0: 126 | self.loss = tf.nn.l2_loss(tf.subtract(self.train_labels, self.out)) + tf.contrib.layers.l2_regularizer(self.lamda_bilinear)(self.weights['feature_embeddings']) # regulizer 127 | else: 128 | self.loss = tf.nn.l2_loss(tf.subtract(self.train_labels, self.out)) 129 | elif self.loss_type == 'log_loss': 130 | self.out = tf.sigmoid(self.out) 131 | if self.lambda_bilinear > 0: 132 | self.loss = tf.contrib.losses.log_loss(self.out, self.train_labels, weight=1.0, epsilon=1e-07, scope=None) + tf.contrib.layers.l2_regularizer(self.lamda_bilinear)(self.weights['feature_embeddings']) # regulizer 133 | else: 134 | self.loss = tf.contrib.losses.log_loss(self.out, self.train_labels, weight=1.0, epsilon=1e-07, scope=None) 135 | 136 | # Optimizer. 
137 | if self.optimizer_type == 'AdamOptimizer': 138 | self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate, beta1=0.9, beta2=0.999, epsilon=1e-8).minimize(self.loss) 139 | elif self.optimizer_type == 'AdagradOptimizer': 140 | self.optimizer = tf.train.AdagradOptimizer(learning_rate=self.learning_rate, initial_accumulator_value=1e-8).minimize(self.loss) 141 | elif self.optimizer_type == 'GradientDescentOptimizer': 142 | self.optimizer = tf.train.GradientDescentOptimizer(learning_rate=self.learning_rate).minimize(self.loss) 143 | elif self.optimizer_type == 'MomentumOptimizer': 144 | self.optimizer = tf.train.MomentumOptimizer(learning_rate=self.learning_rate, momentum=0.95).minimize(self.loss) 145 | 146 | # init 147 | self.saver = tf.train.Saver() 148 | init = tf.global_variables_initializer() 149 | self.sess = tf.Session() 150 | self.sess.run(init) 151 | 152 | # number of params 153 | total_parameters = 0 154 | for variable in self.weights.values(): 155 | shape = variable.get_shape() # shape is an array of tf.Dimension 156 | variable_parameters = 1 157 | for dim in shape: 158 | variable_parameters *= dim.value 159 | total_parameters += variable_parameters 160 | if self.verbose > 0: 161 | print "#params: %d" %total_parameters 162 | 163 | def _initialize_weights(self): 164 | all_weights = dict() 165 | if self.pretrain_flag > 0: 166 | weight_saver = tf.train.import_meta_graph(self.save_file + '.meta') 167 | pretrain_graph = tf.get_default_graph() 168 | feature_embeddings = pretrain_graph.get_tensor_by_name('feature_embeddings:0') 169 | feature_bias = pretrain_graph.get_tensor_by_name('feature_bias:0') 170 | bias = pretrain_graph.get_tensor_by_name('bias:0') 171 | with tf.Session() as sess: 172 | weight_saver.restore(sess, self.save_file) 173 | fe, fb, b = sess.run([feature_embeddings, feature_bias, bias]) 174 | all_weights['feature_embeddings'] = tf.Variable(fe, dtype=tf.float32) 175 | all_weights['feature_bias'] = tf.Variable(fb, dtype=tf.float32) 176 | all_weights['bias'] = tf.Variable(b, dtype=tf.float32) 177 | else: 178 | all_weights['feature_embeddings'] = tf.Variable( 179 | tf.random_normal([self.features_M, self.hidden_factor], 0.0, 0.01), 180 | name='feature_embeddings') # features_M * K 181 | all_weights['feature_bias'] = tf.Variable( 182 | tf.random_uniform([self.features_M, 1], 0.0, 0.0), name='feature_bias') # features_M * 1 183 | all_weights['bias'] = tf.Variable(tf.constant(0.0), name='bias') # 1 * 1 184 | return all_weights 185 | 186 | def batch_norm_layer(self, x, train_phase, scope_bn): 187 | # Note: the decay parameter is tunable 188 | bn_train = batch_norm(x, decay=0.9, center=True, scale=True, updates_collections=None, 189 | is_training=True, reuse=None, trainable=True, scope=scope_bn) 190 | bn_inference = batch_norm(x, decay=0.9, center=True, scale=True, updates_collections=None, 191 | is_training=False, reuse=True, trainable=True, scope=scope_bn) 192 | z = tf.cond(train_phase, lambda: bn_train, lambda: bn_inference) 193 | return z 194 | 195 | def partial_fit(self, data): # fit a batch 196 | feed_dict = {self.train_features: data['X'], self.train_labels: data['Y'], self.dropout_keep: self.keep, self.train_phase: True} 197 | loss, opt = self.sess.run((self.loss, self.optimizer), feed_dict=feed_dict) 198 | return loss 199 | 200 | def get_random_block_from_data(self, data, batch_size): # generate a random block of training data 201 | start_index = np.random.randint(0, len(data['Y']) - batch_size) 202 | X , Y = [], [] 203 | # forward get sample 204 | i = 
start_index 205 | while len(X) < batch_size and i < len(data['X']): 206 | if len(data['X'][i]) == len(data['X'][start_index]): 207 | Y.append([data['Y'][i]]) 208 | X.append(data['X'][i]) 209 | i = i + 1 210 | else: 211 | break 212 | # backward get sample 213 | i = start_index 214 | while len(X) < batch_size and i >= 0: 215 | if len(data['X'][i]) == len(data['X'][start_index]): 216 | Y.append([data['Y'][i]]) 217 | X.append(data['X'][i]) 218 | i = i - 1 219 | else: 220 | break 221 | return {'X': X, 'Y': Y} 222 | 223 | def shuffle_in_unison_scary(self, a, b): # shuffle two lists simutaneously 224 | rng_state = np.random.get_state() 225 | np.random.shuffle(a) 226 | np.random.set_state(rng_state) 227 | np.random.shuffle(b) 228 | 229 | def train(self, Train_data, Validation_data, Test_data): # fit a dataset 230 | # Check Init performance 231 | if self.verbose > 0: 232 | t2 = time() 233 | init_train = self.evaluate(Train_data) 234 | init_valid = self.evaluate(Validation_data) 235 | init_test = self.evaluate(Test_data) 236 | print("Init: \t train=%.4f, validation=%.4f, test=%.4f [%.1f s]" %(init_train, init_valid, init_test, time()-t2)) 237 | 238 | for epoch in xrange(self.epoch): 239 | t1 = time() 240 | self.shuffle_in_unison_scary(Train_data['X'], Train_data['Y']) 241 | total_batch = int(len(Train_data['Y']) / self.batch_size) 242 | for i in xrange(total_batch): 243 | # generate a batch 244 | batch_xs = self.get_random_block_from_data(Train_data, self.batch_size) 245 | # Fit training 246 | self.partial_fit(batch_xs) 247 | t2 = time() 248 | 249 | # output validation 250 | train_result = self.evaluate(Train_data) 251 | valid_result = self.evaluate(Validation_data) 252 | test_result = self.evaluate(Test_data) 253 | 254 | self.train_rmse.append(train_result) 255 | self.valid_rmse.append(valid_result) 256 | self.test_rmse.append(test_result) 257 | if self.verbose > 0 and epoch%self.verbose == 0: 258 | print("Epoch %d [%.1f s]\ttrain=%.4f, validation=%.4f, test=%.4f [%.1f s]" 259 | %(epoch+1, t2-t1, train_result, valid_result, test_result, time()-t2)) 260 | if self.eva_termination(self.valid_rmse): 261 | break 262 | 263 | if self.pretrain_flag < 0: 264 | print "Save model to file as pretrain." 
265 | self.saver.save(self.sess, self.save_file) 266 | 267 | def eva_termination(self, valid): 268 | if self.loss_type == 'square_loss': 269 | if len(valid) > 5: 270 | if valid[-1] > valid[-2] and valid[-2] > valid[-3] and valid[-3] > valid[-4] and valid[-4] > valid[-5]: 271 | return True 272 | else: 273 | if len(valid) > 5: 274 | if valid[-1] < valid[-2] and valid[-2] < valid[-3] and valid[-3] < valid[-4] and valid[-4] < valid[-5]: 275 | return True 276 | return False 277 | 278 | def evaluate(self, data): # evaluate the results for an input set 279 | num_example = len(data['Y']) 280 | feed_dict = {self.train_features: data['X'], self.train_labels: [[y] for y in data['Y']], self.dropout_keep: 1.0, self.train_phase: False} 281 | predictions = self.sess.run((self.out), feed_dict=feed_dict) 282 | y_pred = np.reshape(predictions, (num_example,)) 283 | y_true = np.reshape(data['Y'], (num_example,)) 284 | if self.loss_type == 'square_loss': 285 | predictions_bounded = np.maximum(y_pred, np.ones(num_example) * min(y_true)) # bound the lower values 286 | predictions_bounded = np.minimum(predictions_bounded, np.ones(num_example) * max(y_true)) # bound the higher values 287 | RMSE = math.sqrt(mean_squared_error(y_true, predictions_bounded)) 288 | return RMSE 289 | elif self.loss_type == 'log_loss': 290 | logloss = log_loss(y_true, y_pred) # I haven't checked the log_loss 291 | return logloss 292 | ''' # for testing the classification accuracy 293 | predictions_binary = [] 294 | for item in y_pred: 295 | if item > 0.5: 296 | predictions_binary.append(1.0) 297 | else: 298 | predictions_binary.append(0.0) 299 | Accuracy = accuracy_score(y_true, predictions_binary) 300 | return Accuracy ''' 301 | 302 | if __name__ == '__main__': 303 | # Data loading 304 | args = parse_args() 305 | data = DATA.LoadData(args.path, args.dataset, args.loss_type) 306 | if args.verbose > 0: 307 | print("FM: dataset=%s, factors=%d, loss_type=%s, #epoch=%d, batch=%d, lr=%.4f, lambda=%.1e, keep=%.2f, optimizer=%s, batch_norm=%d" 308 | %(args.dataset, args.hidden_factor, args.loss_type, args.epoch, args.batch_size, args.lr, args.lamda, args.keep_prob, args.optimizer, args.batch_norm)) 309 | 310 | save_file = './pretrain/%s_%d/%s_%d' %(args.dataset, args.hidden_factor, args.dataset, args.hidden_factor) 311 | # Training 312 | t1 = time() 313 | model = FM(data.features_M, args.pretrain, save_file, args.hidden_factor, args.loss_type, args.epoch, args.batch_size, args.lr, args.lamda, args.keep_prob, args.optimizer, args.batch_norm, args.verbose) 314 | model.train(data.Train_data, data.Validation_data, data.Test_data) 315 | 316 | # Find the best validation result across iterations 317 | best_valid_score = 0 318 | if args.loss_type == 'square_loss': 319 | best_valid_score = min(model.valid_rmse) 320 | elif args.loss_type == 'log_loss': 321 | best_valid_score = max(model.valid_rmse) 322 | best_epoch = model.valid_rmse.index(best_valid_score) 323 | print ("Best Iter(validation)= %d\t train = %.4f, valid = %.4f, test = %.4f [%.1f s]" 324 | %(best_epoch+1, model.train_rmse[best_epoch], model.valid_rmse[best_epoch], model.test_rmse[best_epoch], time()-t1)) -------------------------------------------------------------------------------- /LoadData.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Utilities for Loading data. 
3 | The input data file follows the same input for LibFM: http://www.libfm.org/libfm-1.42.manual.pdf 4 | 5 | @author: 6 | Xiangnan He (xiangnanhe@gmail.com) 7 | Lizi Liao (liaolizi.llz@gmail.com) 8 | 9 | @references: 10 | ''' 11 | import numpy as np 12 | import os 13 | 14 | class LoadData(object): 15 | '''given the path of data, return the data format for DeepFM 16 | :param path 17 | return: 18 | Train_data: a dictionary, 'Y' refers to a list of y values; 'X' refers to a list of features_M dimension vectors with 0 or 1 entries 19 | Test_data: same as Train_data 20 | Validation_data: same as Train_data 21 | ''' 22 | 23 | # Three files are needed in the path 24 | def __init__(self, path, dataset, loss_type): 25 | self.path = path + dataset + "/" 26 | self.trainfile = self.path + dataset +".train.libfm" 27 | self.testfile = self.path + dataset + ".test.libfm" 28 | self.validationfile = self.path + dataset + ".validation.libfm" 29 | self.features_M = self.map_features( ) 30 | self.Train_data, self.Validation_data, self.Test_data = self.construct_data( loss_type ) 31 | 32 | def map_features(self): # map the feature entries in all files, kept in self.features dictionary 33 | self.features = {} 34 | self.read_features(self.trainfile) 35 | self.read_features(self.testfile) 36 | self.read_features(self.validationfile) 37 | #print("features_M:", len(self.features)) 38 | return len(self.features) 39 | 40 | def read_features(self, file): # read a feature file 41 | f = open( file ) 42 | line = f.readline() 43 | i = len(self.features) 44 | while line: 45 | items = line.strip().split(' ') 46 | for item in items[1:]: 47 | if item not in self.features: 48 | self.features[ item ] = i 49 | i = i + 1 50 | line = f.readline() 51 | f.close() 52 | 53 | def construct_data(self, loss_type): 54 | X_, Y_ , Y_for_logloss= self.read_data(self.trainfile) 55 | if loss_type == 'log_loss': 56 | Train_data = self.construct_dataset(X_, Y_for_logloss) 57 | else: 58 | Train_data = self.construct_dataset(X_, Y_) 59 | print("# of training:" , len(Y_)) 60 | 61 | X_, Y_ , Y_for_logloss= self.read_data(self.validationfile) 62 | if loss_type == 'log_loss': 63 | Validation_data = self.construct_dataset(X_, Y_for_logloss) 64 | else: 65 | Validation_data = self.construct_dataset(X_, Y_) 66 | print("# of validation:", len(Y_)) 67 | 68 | X_, Y_ , Y_for_logloss = self.read_data(self.testfile) 69 | if loss_type == 'log_loss': 70 | Test_data = self.construct_dataset(X_, Y_for_logloss) 71 | else: 72 | Test_data = self.construct_dataset(X_, Y_) 73 | print("# of test:", len(Y_)) 74 | 75 | return Train_data, Validation_data, Test_data 76 | 77 | def read_data(self, file): 78 | # read a data file. 
For a row, the first column goes into Y_; 79 | # the other columns become a row in X_ and entries are maped to indexs in self.features 80 | f = open( file ) 81 | X_ = [] 82 | Y_ = [] 83 | Y_for_logloss = [] 84 | line = f.readline() 85 | while line: 86 | items = line.strip().split(' ') 87 | Y_.append( 1.0*float(items[0]) ) 88 | 89 | if float(items[0]) > 0:# > 0 as 1; others as 0 90 | v = 1.0 91 | else: 92 | v = 0.0 93 | Y_for_logloss.append( v ) 94 | 95 | X_.append( [ self.features[item] for item in items[1:]] ) 96 | line = f.readline() 97 | f.close() 98 | return X_, Y_, Y_for_logloss 99 | 100 | def construct_dataset(self, X_, Y_): 101 | Data_Dic = {} 102 | X_lens = [ len(line) for line in X_] 103 | indexs = np.argsort(X_lens) 104 | Data_Dic['Y'] = [ Y_[i] for i in indexs] 105 | Data_Dic['X'] = [ X_[i] for i in indexs] 106 | return Data_Dic 107 | 108 | def truncate_features(self): 109 | """ 110 | Make sure each feature vector is of the same length 111 | """ 112 | num_variable = len(self.Train_data['X'][0]) 113 | for i in xrange(len(self.Train_data['X'])): 114 | num_variable = min([num_variable, len(self.Train_data['X'][i])]) 115 | # truncate train, validation and test 116 | for i in xrange(len(self.Train_data['X'])): 117 | self.Train_data['X'][i] = self.Train_data['X'][i][0:num_variable] 118 | for i in xrange(len(self.Validation_data['X'])): 119 | self.Validation_data['X'][i] = self.Validation_data['X'][i][0:num_variable] 120 | for i in xrange(len(self.Test_data['X'])): 121 | self.Test_data['X'][i] = self.Test_data['X'][i][0:num_variable] 122 | return num_variable 123 | -------------------------------------------------------------------------------- /NeuralFM.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Tensorflow implementation of Neural Factorization Machines as described in: 3 | Xiangnan He, Tat-Seng Chua. Neural Factorization Machines for Sparse Predictive Analytics. In Proc. of SIGIR 2017. 4 | 5 | This is a deep version of factorization machine and is more expressive than FM. 6 | 7 | @author: 8 | Xiangnan He (xiangnanhe@gmail.com) 9 | Lizi Liao (liaolizi.llz@gmail.com) 10 | 11 | @references: 12 | ''' 13 | import os 14 | import sys 15 | import math 16 | import numpy as np 17 | import tensorflow as tf 18 | from sklearn.base import BaseEstimator, TransformerMixin 19 | from sklearn.metrics import mean_squared_error 20 | from sklearn.metrics import accuracy_score 21 | from sklearn.metrics import log_loss 22 | from time import time 23 | import argparse 24 | import LoadData as DATA 25 | from tensorflow.contrib.layers.python.layers import batch_norm as batch_norm 26 | 27 | #################### Arguments #################### 28 | def parse_args(): 29 | parser = argparse.ArgumentParser(description="Run Neural FM.") 30 | parser.add_argument('--path', nargs='?', default='../data/', 31 | help='Input data path.') 32 | parser.add_argument('--dataset', nargs='?', default='frappe', 33 | help='Choose a dataset.') 34 | parser.add_argument('--epoch', type=int, default=200, 35 | help='Number of epochs.') 36 | parser.add_argument('--pretrain', type=int, default=0, 37 | help='Pre-train flag. 
0: train from scratch; 1: load from pretrain file') 38 | parser.add_argument('--batch_size', type=int, default=128, 39 | help='Batch size.') 40 | parser.add_argument('--hidden_factor', type=int, default=64, 41 | help='Number of hidden factors.') 42 | parser.add_argument('--layers', nargs='?', default='[64]', 43 | help="Size of each layer.") 44 | parser.add_argument('--keep_prob', nargs='?', default='[0.8,0.5]', 45 | help='Keep probability (i.e., 1-dropout_ratio) for each deep layer and the Bi-Interaction layer. 1: no dropout. Note that the last index is for the Bi-Interaction layer.') 46 | parser.add_argument('--lamda', type=float, default=0, 47 | help='Regularizer for bilinear part.') 48 | parser.add_argument('--lr', type=float, default=0.05, 49 | help='Learning rate.') 50 | parser.add_argument('--loss_type', nargs='?', default='square_loss', 51 | help='Specify a loss type (square_loss or log_loss).') 52 | parser.add_argument('--optimizer', nargs='?', default='AdagradOptimizer', 53 | help='Specify an optimizer type (AdamOptimizer, AdagradOptimizer, GradientDescentOptimizer, MomentumOptimizer).') 54 | parser.add_argument('--verbose', type=int, default=1, 55 | help='Show the results per X epochs (0, 1 ... any positive integer)') 56 | parser.add_argument('--batch_norm', type=int, default=1, 57 | help='Whether to perform batch normaization (0 or 1)') 58 | parser.add_argument('--activation', nargs='?', default='relu', 59 | help='Which activation function to use for deep layers: relu, sigmoid, tanh, identity') 60 | parser.add_argument('--early_stop', type=int, default=1, 61 | help='Whether to perform early stop (0 or 1)') 62 | return parser.parse_args() 63 | 64 | class NeuralFM(BaseEstimator, TransformerMixin): 65 | def __init__(self, features_M, hidden_factor, layers, loss_type, pretrain_flag, epoch, batch_size, learning_rate, lamda_bilinear, 66 | keep_prob, optimizer_type, batch_norm, activation_function, verbose, early_stop, random_seed=2016): 67 | # bind params to class 68 | self.batch_size = batch_size 69 | self.hidden_factor = hidden_factor 70 | self.layers = layers 71 | self.loss_type = loss_type 72 | self.pretrain_flag = pretrain_flag 73 | self.features_M = features_M 74 | self.lamda_bilinear = lamda_bilinear 75 | self.epoch = epoch 76 | self.random_seed = random_seed 77 | self.keep_prob = np.array(keep_prob) 78 | self.no_dropout = np.array([1 for i in xrange(len(keep_prob))]) 79 | self.optimizer_type = optimizer_type 80 | self.learning_rate = learning_rate 81 | self.batch_norm = batch_norm 82 | self.verbose = verbose 83 | self.activation_function = activation_function 84 | self.early_stop = early_stop 85 | # performance of each epoch 86 | self.train_rmse, self.valid_rmse, self.test_rmse = [], [], [] 87 | 88 | # init all variables in a tensorflow graph 89 | self._init_graph() 90 | 91 | def _init_graph(self): 92 | ''' 93 | Init a tensorflow Graph containing: input data, variables, model, loss, optimizer 94 | ''' 95 | self.graph = tf.Graph() 96 | with self.graph.as_default(): # , tf.device('/cpu:0'): 97 | # Set graph level random seed 98 | tf.set_random_seed(self.random_seed) 99 | # Input data. 100 | self.train_features = tf.placeholder(tf.int32, shape=[None, None]) # None * features_M 101 | self.train_labels = tf.placeholder(tf.float32, shape=[None, 1]) # None * 1 102 | self.dropout_keep = tf.placeholder(tf.float32, shape=[None]) 103 | self.train_phase = tf.placeholder(tf.bool) 104 | 105 | # Variables. 106 | self.weights = self._initialize_weights() 107 | 108 | # Model. 
109 | # _________ sum_square part _____________ 110 | # get the summed up embeddings of features. 111 | nonzero_embeddings = tf.nn.embedding_lookup(self.weights['feature_embeddings'], self.train_features) 112 | self.summed_features_emb = tf.reduce_sum(nonzero_embeddings, 1) # None * K 113 | # get the element-multiplication 114 | self.summed_features_emb_square = tf.square(self.summed_features_emb) # None * K 115 | 116 | # _________ square_sum part _____________ 117 | self.squared_features_emb = tf.square(nonzero_embeddings) 118 | self.squared_sum_features_emb = tf.reduce_sum(self.squared_features_emb, 1) # None * K 119 | 120 | # ________ FM __________ 121 | self.FM = 0.5 * tf.sub(self.summed_features_emb_square, self.squared_sum_features_emb) # None * K 122 | if self.batch_norm: 123 | self.FM = self.batch_norm_layer(self.FM, train_phase=self.train_phase, scope_bn='bn_fm') 124 | self.FM = tf.nn.dropout(self.FM, self.dropout_keep[-1]) # dropout at the bilinear interactin layer 125 | 126 | # ________ Deep Layers __________ 127 | for i in range(0, len(self.layers)): 128 | self.FM = tf.add(tf.matmul(self.FM, self.weights['layer_%d' %i]), self.weights['bias_%d'%i]) # None * layer[i] * 1 129 | if self.batch_norm: 130 | self.FM = self.batch_norm_layer(self.FM, train_phase=self.train_phase, scope_bn='bn_%d' %i) # None * layer[i] * 1 131 | self.FM = self.activation_function(self.FM) 132 | self.FM = tf.nn.dropout(self.FM, self.dropout_keep[i]) # dropout at each Deep layer 133 | self.FM = tf.matmul(self.FM, self.weights['prediction']) # None * 1 134 | 135 | # _________out _________ 136 | Bilinear = tf.reduce_sum(self.FM, 1, keep_dims=True) # None * 1 137 | self.Feature_bias = tf.reduce_sum(tf.nn.embedding_lookup(self.weights['feature_bias'], self.train_features) , 1) # None * 1 138 | Bias = self.weights['bias'] * tf.ones_like(self.train_labels) # None * 1 139 | self.out = tf.add_n([Bilinear, self.Feature_bias, Bias]) # None * 1 140 | 141 | # Compute the loss. 142 | if self.loss_type == 'square_loss': 143 | if self.lamda_bilinear > 0: 144 | self.loss = tf.nn.l2_loss(tf.sub(self.train_labels, self.out)) + tf.contrib.layers.l2_regularizer(self.lamda_bilinear)(self.weights['feature_embeddings']) # regulizer 145 | else: 146 | self.loss = tf.nn.l2_loss(tf.sub(self.train_labels, self.out)) 147 | elif self.loss_type == 'log_loss': 148 | self.out = tf.sigmoid(self.out) 149 | if self.lambda_bilinear > 0: 150 | self.loss = tf.contrib.losses.log_loss(self.out, self.train_labels, weight=1.0, epsilon=1e-07, scope=None) + tf.contrib.layers.l2_regularizer(self.lamda_bilinear)(self.weights['feature_embeddings']) # regulizer 151 | else: 152 | self.loss = tf.contrib.losses.log_loss(self.out, self.train_labels, weight=1.0, epsilon=1e-07, scope=None) 153 | 154 | # Optimizer. 
155 | if self.optimizer_type == 'AdamOptimizer': 156 | self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate, beta1=0.9, beta2=0.999, epsilon=1e-8).minimize(self.loss) 157 | elif self.optimizer_type == 'AdagradOptimizer': 158 | self.optimizer = tf.train.AdagradOptimizer(learning_rate=self.learning_rate, initial_accumulator_value=1e-8).minimize(self.loss) 159 | elif self.optimizer_type == 'GradientDescentOptimizer': 160 | self.optimizer = tf.train.GradientDescentOptimizer(learning_rate=self.learning_rate).minimize(self.loss) 161 | elif self.optimizer_type == 'MomentumOptimizer': 162 | self.optimizer = tf.train.MomentumOptimizer(learning_rate=self.learning_rate, momentum=0.95).minimize(self.loss) 163 | 164 | # init 165 | self.saver = tf.train.Saver() 166 | init = tf.global_variables_initializer() 167 | self.sess = tf.Session() 168 | self.sess.run(init) 169 | 170 | # number of params 171 | total_parameters = 0 172 | for variable in self.weights.values(): 173 | shape = variable.get_shape() # shape is an array of tf.Dimension 174 | variable_parameters = 1 175 | for dim in shape: 176 | variable_parameters *= dim.value 177 | total_parameters += variable_parameters 178 | if self.verbose > 0: 179 | print "#params: %d" %total_parameters 180 | 181 | def _initialize_weights(self): 182 | all_weights = dict() 183 | if self.pretrain_flag > 0: # with pretrain 184 | pretrain_file = '../pretrain/%s_%d/%s_%d' %(args.dataset, args.hidden_factor, args.dataset, args.hidden_factor) 185 | weight_saver = tf.train.import_meta_graph(pretrain_file + '.meta') 186 | pretrain_graph = tf.get_default_graph() 187 | feature_embeddings = pretrain_graph.get_tensor_by_name('feature_embeddings:0') 188 | feature_bias = pretrain_graph.get_tensor_by_name('feature_bias:0') 189 | bias = pretrain_graph.get_tensor_by_name('bias:0') 190 | with tf.Session() as sess: 191 | weight_saver.restore(sess, pretrain_file) 192 | fe, fb, b = sess.run([feature_embeddings, feature_bias, bias]) 193 | all_weights['feature_embeddings'] = tf.Variable(fe, dtype=tf.float32) 194 | all_weights['feature_bias'] = tf.Variable(fb, dtype=tf.float32) 195 | all_weights['bias'] = tf.Variable(b, dtype=tf.float32) 196 | else: # without pretrain 197 | all_weights['feature_embeddings'] = tf.Variable( 198 | tf.random_normal([self.features_M, self.hidden_factor], 0.0, 0.01), name='feature_embeddings') # features_M * K 199 | all_weights['feature_bias'] = tf.Variable(tf.random_uniform([self.features_M, 1], 0.0, 0.0), name='feature_bias') # features_M * 1 200 | all_weights['bias'] = tf.Variable(tf.constant(0.0), name='bias') # 1 * 1 201 | # deep layers 202 | num_layer = len(self.layers) 203 | if num_layer > 0: 204 | glorot = np.sqrt(2.0 / (self.hidden_factor + self.layers[0])) 205 | all_weights['layer_0'] = tf.Variable(np.random.normal(loc=0, scale=glorot, size=(self.hidden_factor, self.layers[0])), dtype=np.float32) 206 | all_weights['bias_0'] = tf.Variable(np.random.normal(loc=0, scale=glorot, size=(1, self.layers[0])), dtype=np.float32) # 1 * layers[0] 207 | for i in range(1, num_layer): 208 | glorot = np.sqrt(2.0 / (self.layers[i-1] + self.layers[i])) 209 | all_weights['layer_%d' %i] = tf.Variable( 210 | np.random.normal(loc=0, scale=glorot, size=(self.layers[i-1], self.layers[i])), dtype=np.float32) # layers[i-1]*layers[i] 211 | all_weights['bias_%d' %i] = tf.Variable( 212 | np.random.normal(loc=0, scale=glorot, size=(1, self.layers[i])), dtype=np.float32) # 1 * layer[i] 213 | # prediction layer 214 | glorot = np.sqrt(2.0 / (self.layers[-1] + 1)) 215 
| all_weights['prediction'] = tf.Variable(np.random.normal(loc=0, scale=glorot, size=(self.layers[-1], 1)), dtype=np.float32) # layers[-1] * 1 216 | else: 217 | all_weights['prediction'] = tf.Variable(np.ones((self.hidden_factor, 1), dtype=np.float32)) # hidden_factor * 1 218 | return all_weights 219 | 220 | def batch_norm_layer(self, x, train_phase, scope_bn): 221 | bn_train = batch_norm(x, decay=0.9, center=True, scale=True, updates_collections=None, 222 | is_training=True, reuse=None, trainable=True, scope=scope_bn) 223 | bn_inference = batch_norm(x, decay=0.9, center=True, scale=True, updates_collections=None, 224 | is_training=False, reuse=True, trainable=True, scope=scope_bn) 225 | z = tf.cond(train_phase, lambda: bn_train, lambda: bn_inference) 226 | return z 227 | 228 | def partial_fit(self, data): # fit a batch 229 | feed_dict = {self.train_features: data['X'], self.train_labels: data['Y'], self.dropout_keep: self.keep_prob, self.train_phase: True} 230 | loss, opt = self.sess.run((self.loss, self.optimizer), feed_dict=feed_dict) 231 | return loss 232 | 233 | def get_random_block_from_data(self, data, batch_size): # generate a random block of training data 234 | start_index = np.random.randint(0, len(data['Y']) - batch_size) 235 | X , Y = [], [] 236 | # forward get sample 237 | i = start_index 238 | while len(X) < batch_size and i < len(data['X']): 239 | if len(data['X'][i]) == len(data['X'][start_index]): 240 | Y.append([data['Y'][i]]) 241 | X.append(data['X'][i]) 242 | i = i + 1 243 | else: 244 | break 245 | # backward get sample 246 | i = start_index 247 | while len(X) < batch_size and i >= 0: 248 | if len(data['X'][i]) == len(data['X'][start_index]): 249 | Y.append([data['Y'][i]]) 250 | X.append(data['X'][i]) 251 | i = i - 1 252 | else: 253 | break 254 | return {'X': X, 'Y': Y} 255 | 256 | def shuffle_in_unison_scary(self, a, b): 257 | rng_state = np.random.get_state() 258 | np.random.shuffle(a) 259 | np.random.set_state(rng_state) 260 | np.random.shuffle(b) 261 | 262 | def train(self, Train_data, Validation_data, Test_data): # fit a dataset 263 | # Check Init performance 264 | if self.verbose > 0: 265 | t2 = time() 266 | init_train = self.evaluate(Train_data) 267 | init_valid = self.evaluate(Validation_data) 268 | init_test = self.evaluate(Test_data) 269 | print("Init: \t train=%.4f, validation=%.4f, test=%.4f [%.1f s]" %(init_train, init_valid, init_test, time()-t2)) 270 | 271 | for epoch in xrange(self.epoch): 272 | t1 = time() 273 | self.shuffle_in_unison_scary(Train_data['X'], Train_data['Y']) 274 | total_batch = int(len(Train_data['Y']) / self.batch_size) 275 | for i in xrange(total_batch): 276 | # generate a batch 277 | batch_xs = self.get_random_block_from_data(Train_data, self.batch_size) 278 | # Fit training 279 | self.partial_fit(batch_xs) 280 | t2 = time() 281 | 282 | # output validation 283 | train_result = self.evaluate(Train_data) 284 | valid_result = self.evaluate(Validation_data) 285 | test_result = self.evaluate(Test_data) 286 | 287 | self.train_rmse.append(train_result) 288 | self.valid_rmse.append(valid_result) 289 | self.test_rmse.append(test_result) 290 | if self.verbose > 0 and epoch%self.verbose == 0: 291 | print("Epoch %d [%.1f s]\ttrain=%.4f, validation=%.4f, test=%.4f [%.1f s]" 292 | %(epoch+1, t2-t1, train_result, valid_result, test_result, time()-t2)) 293 | if self.early_stop > 0 and self.eva_termination(self.valid_rmse): 294 | #print "Early stop at %d based on validation result." 
%(epoch+1) 295 | break 296 | 297 | def eva_termination(self, valid): 298 | if self.loss_type == 'square_loss': 299 | if len(valid) > 5: 300 | if valid[-1] > valid[-2] and valid[-2] > valid[-3] and valid[-3] > valid[-4] and valid[-4] > valid[-5]: 301 | return True 302 | else: 303 | if len(valid) > 5: 304 | if valid[-1] < valid[-2] and valid[-2] < valid[-3] and valid[-3] < valid[-4] and valid[-4] < valid[-5]: 305 | return True 306 | return False 307 | 308 | def evaluate(self, data): # evaluate the results for an input set 309 | num_example = len(data['Y']) 310 | feed_dict = {self.train_features: data['X'], self.train_labels: [[y] for y in data['Y']], self.dropout_keep: self.no_dropout, self.train_phase: False} 311 | predictions = self.sess.run((self.out), feed_dict=feed_dict) 312 | y_pred = np.reshape(predictions, (num_example,)) 313 | y_true = np.reshape(data['Y'], (num_example,)) 314 | if self.loss_type == 'square_loss': 315 | predictions_bounded = np.maximum(y_pred, np.ones(num_example) * min(y_true)) # bound the lower values 316 | predictions_bounded = np.minimum(predictions_bounded, np.ones(num_example) * max(y_true)) # bound the higher values 317 | RMSE = math.sqrt(mean_squared_error(y_true, predictions_bounded)) 318 | return RMSE 319 | elif self.loss_type == 'log_loss': 320 | logloss = log_loss(y_true, y_pred) # I haven't checked the log_loss 321 | return logloss 322 | ''' # for testing the classification accuracy 323 | predictions_binary = [] 324 | for item in y_pred: 325 | if item > 0.5: 326 | predictions_binary.append(1.0) 327 | else: 328 | predictions_binary.append(0.0) 329 | Accuracy = accuracy_score(y_true, predictions_binary) 330 | return Accuracy ''' 331 | 332 | if __name__ == '__main__': 333 | # Data loading 334 | args = parse_args() 335 | data = DATA.LoadData(args.path, args.dataset, args.loss_type) 336 | if args.verbose > 0: 337 | print("Neural FM: dataset=%s, hidden_factor=%d, dropout_keep=%s, layers=%s, loss_type=%s, pretrain=%d, #epoch=%d, batch=%d, lr=%.4f, lambda=%.4f, optimizer=%s, batch_norm=%d, activation=%s, early_stop=%d" 338 | %(args.dataset, args.hidden_factor, args.keep_prob, args.layers, args.loss_type, args.pretrain, args.epoch, args.batch_size, args.lr, args.lamda, args.optimizer, args.batch_norm, args.activation, args.early_stop)) 339 | activation_function = tf.nn.relu 340 | if args.activation == 'sigmoid': 341 | activation_function = tf.sigmoid 342 | elif args.activation == 'tanh': 343 | activation_function == tf.tanh 344 | elif args.activation == 'identity': 345 | activation_function = tf.identity 346 | 347 | # Training 348 | t1 = time() 349 | model = NeuralFM(data.features_M, args.hidden_factor, eval(args.layers), args.loss_type, args.pretrain, args.epoch, args.batch_size, args.lr, args.lamda, eval(args.keep_prob), args.optimizer, args.batch_norm, activation_function, args.verbose, args.early_stop) 350 | model.train(data.Train_data, data.Validation_data, data.Test_data) 351 | 352 | # Find the best validation result across iterations 353 | best_valid_score = 0 354 | if args.loss_type == 'square_loss': 355 | best_valid_score = min(model.valid_rmse) 356 | elif args.loss_type == 'log_loss': 357 | best_valid_score = max(model.valid_rmse) 358 | best_epoch = model.valid_rmse.index(best_valid_score) 359 | print ("Best Iter(validation)= %d\t train = %.4f, valid = %.4f, test = %.4f [%.1f s]" 360 | %(best_epoch+1, model.train_rmse[best_epoch], model.valid_rmse[best_epoch], model.test_rmse[best_epoch], time()-t1)) 361 | 
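Note on the Bi-Interaction computation shared by FM.py and NeuralFM.py: both scripts build the pairwise-interaction vector as 0.5 * ((sum of embeddings)^2 - sum of squared embeddings) instead of looping over all feature pairs, which reduces the cost from O(m^2 * K) to O(m * K) for m active features with K-dimensional embeddings. The snippet below is a small NumPy sanity check of that identity; it is illustrative only and not part of the repository (the array V and its sizes are made up).

```
import numpy as np

# One instance with m active features, each mapped to a K-dimensional embedding.
m, K = 5, 8
V = np.random.randn(m, K)                       # rows = embeddings of the active features

# Trick used in _init_graph(): 0.5 * (square-of-sum minus sum-of-squares), per dimension.
square_of_sum = np.square(V.sum(axis=0))        # shape (K,)
sum_of_squares = np.square(V).sum(axis=0)       # shape (K,)
bi_interaction = 0.5 * (square_of_sum - sum_of_squares)

# Naive reference: element-wise products of all distinct feature pairs.
reference = np.zeros(K)
for i in range(m):
    for j in range(i + 1, m):
        reference += V[i] * V[j]

assert np.allclose(bi_interaction, reference)   # identical up to floating-point error
```

FM.py collapses this K-dimensional vector to a scalar with reduce_sum, while NeuralFM.py first passes it through the configured deep layers and the prediction weights.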
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Neural Factorization Machines 2 | 3 | This is our implementation for the paper: 4 | 5 | Xiangnan He and Tat-Seng Chua (2017). [Neural Factorization Machines for Sparse Predictive Analytics](http://www.comp.nus.edu.sg/~xiangnan/papers/sigir17-nfm.pdf). In Proceedings of SIGIR '17, Shinjuku, Tokyo, 6 | Japan, August 07-11, 2017. 7 | 8 | We have additionally released our TensorFlow implementation of Factorization Machines under our proposed neural network framework. 9 | 10 | **Please cite our SIGIR'17 paper if you use our codes. Thanks!** 11 | 12 | Author: Dr. Xiangnan He (http://www.comp.nus.edu.sg/~xiangnan/) 13 | 14 | ## Example to run the codes 15 | 16 | ``` 17 | python NeuralFM.py --dataset frappe --hidden_factor 64 --layers [64] --keep_prob [0.8,0.5] --loss_type square_loss --activation relu --pretrain 0 --optimizer AdagradOptimizer --lr 0.05 --batch_norm 1 --verbose 1 --early_stop 1 --epoch 200 18 | ``` 19 | All command-line arguments are documented in the code (see the parse_args function). 20 | 21 | The current implementation supports two tasks: regression and binary classification. The regression task optimizes RMSE, and the binary classification task optimizes Log Loss. 22 | 23 | ### Dataset 24 | We use the same input format as the LibFM toolkit (http://www.libfm.org/). 25 | 26 | Split the data into train/test/validation files to run the codes directly (see data/frappe/ for examples). 27 | 28 | 29 | 30 | Last Update Date: May 11, 2017 -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/faychu/nfm/69fc7af5ef5396896d0eedd9114ae049a5a91c2a/__init__.py -------------------------------------------------------------------------------- /data/frappe/README.txt: -------------------------------------------------------------------------------- 1 | Frappe dataset v1.0 http://baltrunas.info/research-menu/frappe 2 | 3 | The frappe dataset contains a context-aware app usage log. 4 | It consists of 96203 entries by 957 users for 4082 apps used in various contexts. 5 | (sampling 2 negatives per positive => # of total instances: 288609) 6 | 7 | Nonzero u-i pairs: 18842 8 | Context fields: 9 | #user: 957 10 | #item: 4082 11 | #cnt: 1981 (how many times the app has been used by the user; converted to 0/1) 12 | #daytime: 7 13 | #weekday: 7 14 | #isweekend: 2 15 | #homework: 3 16 | #cost: 2 17 | #weather: 9 18 | #country: 80 19 | #city: 233 20 | Total features: 7363 - 1981 = 5382 21 | 22 | 23 | Any scientific publications that use this data set should cite the following paper as the reference: 24 | @Article{frappe15, 25 | title={Frappe: Understanding the Usage and Perception of Mobile App Recommendations In-The-Wild}, 26 | author = {Linas Baltrunas, Karen Church, Alexandros Karatzoglou, Nuria Oliver}, 27 | date={2015}, 28 | urldate={2015-05-12}, 29 | eprinttype={arxiv}, 30 | eprint={arXiv:1505.03014} 31 | } 32 | 33 | Nobody guarantees the correctness of the data, its suitability for any particular purpose, 34 | or the validity of results based on the use of the data set.
The data set may be used for any 35 | research purposes under the following conditions: 36 | * The user must acknowledge the use of the data set in publications resulting from the use of the data set. 37 | * The user may not redistribute the data without separate permission. 38 | * The user may not try to deanonymise the data. 39 | * The user may not use this information for any commercial or revenue-bearing purposes without first obtaining permission from us. 40 | 41 | In no event shall anyone involved in the frappe project be liable to you for any damages arising out of the use or inability to use the 42 | associated scripts (including but not limited to loss of data or data being rendered inaccurate). 43 | 44 | The three data files are encoded as UTF-8. You can use the following pandas script in Python to load the data set: 45 | 46 | import pandas 47 | df = pandas.read_csv('frappe.csv', sep="\t") 48 | meta_app = pandas.read_csv('apps.csv', sep="\t") 49 | df = df.merge(meta_app, on='item') 50 | 51 | Note that we don't provide city names for privacy reasons. Also, apps that were downloaded only a few times are missing meta information. 52 | However, the item id is a valid identifier. 53 | 54 | If you have any further questions or comments, please email linas.baltrunas@gmail.com --------------------------------------------------------------------------------
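For completeness, here is a minimal, self-contained sketch of how the *.libfm files under data/frappe/ are consumed by LoadData.py: the first token of each line is the target, and every remaining "index:value" token is treated as one categorical feature and re-mapped to a dense id. This is an illustrative re-implementation rather than code from the repository, and the sample line passed in at the bottom is made up.

```
# Illustrative parser mirroring LoadData.map_features / read_data (not repository code).
feature_to_id = {}

def parse_libfm_line(line):
    items = line.strip().split(' ')
    y = float(items[0])                    # first column: target value
    y_binary = 1.0 if y > 0 else 0.0       # log_loss task treats positives as 1, the rest as 0
    x = []
    for token in items[1:]:                # remaining "index:value" tokens
        if token not in feature_to_id:     # each distinct token gets a dense id,
            feature_to_id[token] = len(feature_to_id)  # as in map_features()
        x.append(feature_to_id[token])
    return x, y, y_binary

print(parse_libfm_line("1 12:1 107:1 2048:1"))  # e.g. ([0, 1, 2], 1.0, 1.0) on the first call
```

In the actual loader the feature dictionary is built over the train, validation, and test files before any data are read, so feature ids stay consistent across the three splits.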