├── .idea
│   └── vcs.xml
├── FFM.py
├── README.md
└── read_data.py

/FFM.py:
--------------------------------------------------------------------------------
import tensorflow as tf
import random
import numpy as np

'''
configuration
'''
batch_size = 128
learning_rate = 0.001
data_path = './dep_norm_test_data.txt'

# no need to set these by hand; they are assigned by prepare_data()
field_num = 0
feature_num = 0


def prepare_data(file_path=data_path):
    """
    :param file_path: path to the normalized data file
    :return: a tuple (data_set, feature2field)
        data_set is a list; each element is a list of 'feature:value' strings,
        with the label as the last element
    """
    feature2field = {}
    data_set = []
    global field_num
    global feature_num
    for sample in open(file_path, 'r'):
        sample_data = []
        field_features = sample.split()[1:]
        for field_feature_pair in field_features:
            field = int(field_feature_pair.split(':')[0])
            feature = int(field_feature_pair.split(':')[1])
            value = float(field_feature_pair.split(':')[2])
            if field + 1 > field_num:
                field_num = field + 1
            if feature + 1 > feature_num:
                feature_num = feature + 1
            feature2field[feature] = field
            sample_data.append('{}:{}'.format(feature, value))
        sample_data.append(int(sample.split()[0]))
        data_set.append(sample_data)
    return data_set, feature2field


class FFM:
    def __init__(self, batch_size, learning_rate,
                 data_path, field_num,
                 feature_num, feature2field, data_set):
        self.batch_size = batch_size
        self.lr = learning_rate
        self.data_path = data_path
        self.field_num = field_num
        self.feature_num = feature_num
        self.feature2field = feature2field
        self.data_set = data_set

        with tf.name_scope('embedding_matrix'):
            # a tensor of shape [feature_num] holding the linear weight w_i of each feature
            self.liner_weight = tf.get_variable(name='line_weight',
                                                shape=[feature_num],
                                                dtype=tf.float32,
                                                initializer=tf.truncated_normal_initializer(stddev=0.01))
            tf.summary.histogram('liner_weight', self.liner_weight)
            self.field_embedding = []
            for idx in range(0, self.feature_num):
                # one vector per feature with one entry per field,
                # i.e. feature_num tensors of shape [field_num]
                self.field_embedding.append(tf.get_variable(name='field_embedding{}'.format(idx),
                                                            shape=[field_num],
                                                            dtype=tf.float32,
                                                            initializer=tf.truncated_normal_initializer(stddev=0.01)))
                tf.summary.histogram('field_vector{}'.format(idx), self.field_embedding[idx])
        with tf.name_scope('input'):
            self.label = tf.placeholder(tf.float32, shape=(self.batch_size,))
            self.feature_value = []
            for idx in range(0, feature_num):
                self.feature_value.append(
                    tf.placeholder(tf.float32,
                                   shape=(self.batch_size,),
                                   name='feature_{}'.format(idx)))
        with tf.name_scope('network'):
            # b0: constant bias
            # predict = b0 + sum(w_i * feature_i) + sum(w_{i,field(j)} * w_{j,field(i)} * feature_i * feature_j)
            self.b0 = tf.get_variable(name='bias_0', shape=[1], dtype=tf.float32)
            tf.summary.histogram('b0', self.b0)
            # calculate the linear term per sample (sum over the feature axis)
            self.liner_term = tf.reduce_sum(tf.multiply(tf.transpose(
                tf.convert_to_tensor(self.feature_value), perm=[1, 0]),
                self.liner_weight), axis=1)
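            # field-aware pairwise interactions: for each feature pair (i, j) the
            # model uses w_{i, field(j)} * w_{j, field(i)} * x_i * x_j, where
            # w_{i, f} is entry f of field_embedding[i]; the loop below accumulates
            # this product over all pairs with i < j into the quadratic term.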
            # calculate the quadratic term
            self.qua_term = tf.get_variable(name='quad_term', shape=[1], dtype=tf.float32)
            for f1 in range(0, feature_num - 1):
                for f2 in range(f1 + 1, feature_num):
                    W1 = tf.nn.embedding_lookup(self.field_embedding[f1], self.feature2field[f2])
                    W2 = tf.nn.embedding_lookup(self.field_embedding[f2], self.feature2field[f1])
                    self.qua_term += W1 * W2 * self.feature_value[f1] * self.feature_value[f2]
            self.predict = self.b0 + self.liner_term + self.qua_term
            self.losses = tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(labels=self.label, logits=self.predict))
            tf.summary.scalar('losses', self.losses)
        self.optimizer = tf.train.AdamOptimizer(learning_rate=self.lr, name='Adam')
        self.grad = self.optimizer.compute_gradients(self.losses)
        self.opt = self.optimizer.apply_gradients(self.grad)

        self.sess = tf.InteractiveSession()

        with tf.name_scope('plot'):
            self.merged = tf.summary.merge_all()
            self.writer = tf.summary.FileWriter('./train_plot', self.sess.graph)

        self.sess.run(tf.global_variables_initializer())
        self.loop_step = 0

    def step(self):
        """
        run one training step on a randomly sampled mini-batch
        :return: log_loss of the batch
        """
        self.loop_step += 1
        feature, label = self.get_data()
        # feed values to the placeholders
        feed_dict = {}
        feed_dict[self.label] = label
        arr_feature = np.transpose(np.array(feature))
        for idx in range(0, self.feature_num):
            feed_dict[self.feature_value[idx]] = arr_feature[idx]
        _, summary, loss_value = self.sess.run([self.opt, self.merged, self.losses], feed_dict=feed_dict)
        self.writer.add_summary(summary, self.loop_step)
        return loss_value

    def get_data(self):
        """
        :return: a tuple (feature, label)
            feature: shape [batch_size, feature_num], each element is a scalar value
            label: shape [batch_size], each element is 0 or 1
        """
        feature = []
        label = []
        for _ in range(0, self.batch_size):
            t_feature = [0.0] * feature_num
            sample = self.data_set[random.randint(0, len(self.data_set) - 1)]
            label.append(sample[-1])
            sample = sample[:-1]
            for f in sample:
                t_feature[int(f.split(':')[0])] = float(f.split(':')[1])
            feature.append(t_feature)
        return feature, label


if __name__ == "__main__":
    data_set, feature_map = prepare_data(file_path=data_path)
    print("feature num {} field num {}".format(feature_num, field_num))
    ffm = FFM(batch_size, learning_rate, data_path, field_num, feature_num, feature_map, data_set)
    for loop in range(0, 1000):
        losses = ffm.step()
        if loop % 50 == 0:
            print("loop:{} losses:{}".format(loop, losses))
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# FFM-tensorflow
***
## What does this project do?
This project is an FFM (Field-aware Factorization Machine) implementation in Python with TensorFlow,
created by Robin Han. To learn more about FFM, please see
[this FFM introduction](https://tech.meituan.com/deep-understanding-of-ffm-principles-and-practices.html).

***
## How to use
First, clone the project from GitHub:
```
git clone https://github.com/drcut/FFM-tensorflow
```
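The code uses the TensorFlow 1.x graph API (`tf.placeholder`, `tf.get_variable`, `tf.InteractiveSession`),
so a TensorFlow 1.x environment with NumPy is assumed; the version bound below is only a suggestion,
not something pinned by the project:
```
pip install "tensorflow>=1.4,<2.0" numpy
```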
Then you can just run the program:
```
python FFM.py
```
Before you run the program, make sure you have replaced test_data.txt with your own data
in the same format (read_data.py shows how that format is parsed).

***
## Note:
In the dataset every feature must have a unique name; that is, any two features must have
different names, regardless of whether they belong to the same field.

## Author
* [Han Ruobing](https://github.com/drcut)
--------------------------------------------------------------------------------
/read_data.py:
--------------------------------------------------------------------------------
import os

'''
notice: feature ids must be unique across all fields,
i.e. features in different fields must not share
the same feature id
'''
def get_field_feature(file_name='./test_data.txt'):
    file = open(file_name)
    meta_data = file.readlines()
    file.close()
    '''
    collect the sets of field and feature names
    '''
    field_set = set()
    feature_set = set()
    for sample in meta_data:
        attribute = sample.split()
        for value in attribute[1:]:
            print(value)
            attrs = value.split(':')
            field_set.add(attrs[0])
            feature_set.add(attrs[1])
    return field_set, feature_set

'''
map field and feature ids to the range [0, N)
'''
def normalize_data_file(field_dic, feature_dic, meta_file_name='./test_data.txt'):
    if os.path.exists('./norm_test_data.txt'):
        print("norm dataset already exists")
        return
    file = open('./norm_test_data.txt', 'w')
    for sample in open(meta_file_name):
        attribute = sample.split()
        line = attribute[0]
        for value in attribute[1:]:
            attrs = value.split(':')
            if float(attrs[2]) == 0:
                # if a feature's value is 0, ignore that feature
                # (value normalization may be needed in a future version)
                continue
            norm_field = field_dic[attrs[0]]
            norm_feature = feature_dic[attrs[1]]
            line += ' {}:{}:{}'.format(norm_field, norm_feature, attrs[2])
        file.write(line + '\n')
    file.close()

if __name__ == "__main__":
    print("get field && feature ids")
    field_set, feature_set = get_field_feature()
    field_dic = {}
    for idx, item in enumerate(list(field_set)):
        field_dic[item] = idx
    feature_dic = {}
    for idx, item in enumerate(list(feature_set)):
        feature_dic[item] = idx
    print("get norm dataset")
    normalize_data_file(field_dic, feature_dic)
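
# Illustrative example (the names and values below are made up, not taken from any
# dataset shipped with this repository): a raw line in test_data.txt has the form
# "label field:feature:value ...", e.g.
#   1 user:user_42:1.0 ad:ad_7:1.0 price:price:0.35
# After get_field_feature() and normalize_data_file() map the field and feature
# names to integer ids starting at 0, the corresponding line written to
# norm_test_data.txt could look like
#   1 0:0:1.0 1:1:1.0 2:2:0.35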