├── __init__.py ├── common ├── __init__.py └── model_args.py ├── models ├── __init__.py └── model.py ├── utils ├── __init__.py └── utils.py ├── data_io ├── __init__.py └── data_parser.py ├── model_zoo ├── __init__.py └── fm.py ├── images └── fm.jpg ├── train.sh ├── train.py └── README.md /__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /common/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data_io/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /model_zoo/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /images/fm.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/challenge-ICME2019-Bytedance/Bytedance_ICME_challenge/HEAD/images/fm.jpg -------------------------------------------------------------------------------- /utils/utils.py: -------------------------------------------------------------------------------- 1 | def get_dataset_path_list(dataset_path, sub_str=None): 2 | dataset_path_list = [] 3 | for root_path, dir_names, file_names in os.walk(dataset_path): 4 | for file_name in file_names: 5 | file_path = os.path.join(root_path, file_name) 6 | if sub_str: 7 | if file_path.find(sub_str) == -1: 8 | continue 9 | dataset_path_list.append(file_path) 10 | 11 | return dataset_path_list 12 | -------------------------------------------------------------------------------- /train.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | cd `dirname $0` 3 | echo `pwd` 4 | training_path=$1 5 | validation_path=$2 6 | echo "training path: " $training_path 7 | echo "validation path: " $validation_path 8 | 9 | save_model_dir=$3 10 | echo "save model on: " $save_model_dir 11 | 12 | batch_size=$4 13 | embedding_size=$5 14 | echo "batch size: " $batch_size 15 | echo "embedding size: " $embedding_size 16 | 17 | optimizer=$6 18 | lr=$7 19 | 20 | task=$8 21 | track=$9 22 | echo "task: " $task 23 | echo "track: " $track 24 | 25 | mkdir ${save_model_dir}; 26 | 27 | python train.py \ 28 | --training_path $training_path \ 29 | --validation_path $validation_path \ 30 | --save_model_dir $save_model_dir \ 31 | --batch_size $batch_size \ 32 | --embedding_size $embedding_size \ 33 | --lr $lr \ 34 | --task $task \ 35 | --track $track \ 36 | --optimizer $optimizer 37 | -------------------------------------------------------------------------------- /common/model_args.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | def init_model_args(): 4 | """ 5 | Basic but important params for traning 6 | """ 7 | parser = argparse.ArgumentParser() 8 | # training size 9 | parser.add_argument('--batch_size', type=int, default=40) 10 | parser.add_argument('--embedding_size', type=int, default=40) 11 | parser.add_argument('--num_epochs', type=int, default=200) 12 | 13 | # optimizer 14 | parser.add_argument('--optimizer', default='adam', choices=['adam', 'adagrad']) 15 | parser.add_argument('--lr', type=float, default=0.001) 16 | 17 | #necessary dir 18 | parser.add_argument('--save_model_dir', default='save_model') 19 | parser.add_argument('--training_path', required=True) 20 | parser.add_argument('--validation_path', required=True) 21 | 22 | #task 23 | parser.add_argument('--task', default="finish") 24 | 25 | #track 26 | parser.add_argument('--track', type=int, default=2) 27 | 28 | args = parser.parse_args() 29 | return args 30 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tensorflow as tf 3 | from utils.utils import get_dataset_path_list 4 | from common.model_args import init_model_args 5 | from models.model import RecommendModelHandler 6 | 7 | 8 | def main(): 9 | 10 | # basic logging setup for tf 11 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 12 | tf.logging.set_verbosity(tf.logging.INFO) 13 | 14 | # init necessary args 15 | args = init_model_args() 16 | 17 | #train_dataset_path_list = get_dataset_path_list(train_dataset_path, sub_str="track2_train_time.txt") 18 | train_dataset_path_list = [args.training_path] 19 | 20 | val_dataset_path_list = [args.validation_path] 21 | print "training path list: {}".format(train_dataset_path_list) 22 | print "training path list: {}".format(val_dataset_path_list) 23 | 24 | save_model_dir = args.save_model_dir 25 | print "saving model in ... {}".format(save_model_dir) 26 | 27 | optimizer = args.optimizer 28 | learning_rate = args.lr 29 | print "we use {} as optimizer".format(optimizer) 30 | print "learning rate is set as {}".format(learning_rate) 31 | 32 | batch_size = args.batch_size 33 | embedding_size = args.embedding_size 34 | num_epochs = args.num_epochs 35 | print "batch size: {}".format(batch_size) 36 | print "embedding size: {}".format(embedding_size) 37 | 38 | task = args.task 39 | track = args.track 40 | print "track: {}, task: {}".format(track, task) 41 | 42 | 43 | model = RecommendModelHandler( 44 | train_dataset_path=train_dataset_path_list, 45 | val_dataset_path=val_dataset_path_list, 46 | save_model_dir=save_model_dir, 47 | num_epochs=num_epochs, 48 | optimizer=optimizer, 49 | batch_size= batch_size, 50 | embedding_size=embedding_size, 51 | task=task, 52 | track=track, 53 | learning_rate=args.lr) 54 | 55 | model.train() 56 | 57 | if __name__ == '__main__': 58 | main() 59 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | REPO DESCRIPTION 2 | ------- 3 | Our FM implementation is based on tensorflow 1.12.0. 4 | You can run our reference training code on-the-fly using the following command: 5 | 6 | ``` 7 | #--------------------------how to train----------------------------# 8 | ./train.sh \ 9 | 10 | ``` 11 | 12 | CODE STRUCTURE 13 | -------------- 14 | 15 |
 
16 | #--------------------------run script------------------------------#
17 | train.sh  
18 | 
19 | #----------------------------train---------------------------------#
20 | train.py  
21 | 
22 | #------------------------common operation--------------------------#
23 | common/  
24 |         model_args.py  
25 |   
26 | #--------convert input text data into tensorflow batch need--------#
27 | data_io/  
28 |        data_parser.py  
29 | 
30 | #-------------prepare model and build up main framework------------#
31 | models/  
32 |        model.py  
33 | 
34 | #---------------common algorithm and models for recom--------------#
35 | model_zoo/  
36 |        fm.py  
37 | 
38 | #-----------------utils for str or data processing-----------------#
39 | utils/  
40 |        utils.py
41 |  
42 | 43 | 44 | ALGORITHM: FACTORIZATION MACHINE 45 | -------------------------------- 46 | 47 | ![image](https://github.com/challenge-ICME2019-Bytedance/Bytedance_ICME_challenge/raw/master/images/fm.jpg) 48 | 49 | BASELINE 50 | ------- 51 | Our baseline results with 5 features (user_id, user_city, item_id,author_id,item_city): 52 | 53 | * TRACK2 LIKE TASK: 54 | ``` 55 | auc: 86.5% 56 | #------------------------params-------------------------# 57 | embedding_size = 40 58 | optimizer = adam 59 | lr = 0.0005 60 | ``` 61 | * TRACK FINISH TASK: 62 | ``` 63 | auc: 69.8% 64 | #------------------------params-------------------------# 65 | embedding_size = 40 66 | optimizer = adam 67 | lr = 0.0001 68 | ``` 69 | 70 | 71 | 72 | -------------------------------------------------------------------------------- /data_io/data_parser.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | class PosShifts(object): 4 | """ 5 | Position shifts will be caused by different fields, 6 | thus, we need to take it into our consideration. 7 | This class is used for removing position shifts 8 | """ 9 | 10 | _shifts = [] 11 | def __init__(self, track): 12 | PosShifts._track = track 13 | if track == 1: 14 | PosShifts._shifts = [663011, 0, 31180492, 15595718, 410, 6] 15 | elif track == 2: 16 | PosShifts._shifts = [73974, 396, 4122689, 850308, 461, 5] 17 | else: 18 | raise Exception("unknown track", track) 19 | 20 | @staticmethod 21 | def get_features_num(): 22 | index_shift = PosShifts._shifts 23 | all_shift = reduce(lambda x, y: x+y, index_shift) 24 | return all_shift 25 | 26 | 27 | @staticmethod 28 | def shift(): 29 | """ position shifts for different field features """ 30 | shifts = PosShifts._shifts 31 | shifts = [0] + shifts 32 | 33 | sum = 0 34 | for index, shift in enumerate(shifts): 35 | sum += shift 36 | shifts[index] = sum 37 | return shifts 38 | 39 | class LineParser(object): 40 | """ 41 | class for parsing tf input line 42 | """ 43 | @staticmethod 44 | def parse_finish_line(line): 45 | """ 46 | tf operator not good for parse text info 47 | thus py_func is applied 48 | finish line parse 49 | """ 50 | return tf.py_func(DataParser.data_parser, [line, 6], [tf.int32, tf.float32, tf.float32]) 51 | 52 | @staticmethod 53 | def parse_like_line(line): 54 | """ 55 | tf operator not good for parse text info 56 | thus py_func is applied 57 | like line parser 58 | """ 59 | return tf.py_func(DataParser.data_parser, [line, 7], [tf.int32, tf.float32, tf.float32]) 60 | 61 | class DataParser(object): 62 | """ 63 | Detailed operator foe line input 64 | """ 65 | @staticmethod 66 | def data_parser(line, label_index): 67 | """ parser line content and generate idx, features, and gts """ 68 | content = line.split('\t') 69 | label = np.float32(content[label_index].strip()) 70 | feature_num = 5 71 | features = content[:feature_num+1] 72 | features = map(lambda feature: np.float32(feature), features) 73 | idx = [0 if feature < 0 else feature for feature in features] 74 | features = [np.float32(0) if feature < 0 else np.float32(1) for feature in features] 75 | features = features[:feature_num] 76 | 77 | idx = idx[:feature_num] 78 | 79 | shifts = PosShifts.shift() 80 | idx = [idx[i] + shifts[i] for i in xrange(len(idx))] 81 | 82 | idx= map(lambda one_id: np.int32(one_id), idx) 83 | return idx, features, label 84 | -------------------------------------------------------------------------------- /models/model.py: -------------------------------------------------------------------------------- 1 | """ import necessary packages""" 2 | import tensorflow as tf 3 | from data_io.data_parser import PosShifts, LineParser 4 | from model_zoo.fm import FMModel 5 | 6 | 7 | class RecommendModelHandler(object): 8 | """ class for setup recommend model """ 9 | def __init__(self, train_dataset_path, val_dataset_path, save_model_dir, \ 10 | learning_rate=0.1, num_threads=1, num_epochs=100, batch_size=40, \ 11 | embedding_size=100, optimizer='adam', task="finish", track=2): 12 | """ init basic params""" 13 | self._learning_rate = learning_rate 14 | self._num_threads = num_threads 15 | self._num_epochs = num_epochs 16 | self._batch_size = batch_size 17 | self._train_dataset_path = train_dataset_path 18 | self._val_dataset_path = val_dataset_path 19 | self._save_model_dir = save_model_dir 20 | self._embedding_size = embedding_size 21 | self._optimizer = optimizer 22 | self._task = task 23 | self._track = track 24 | 25 | 26 | def build_model(self): 27 | """ build recommend model framework""" 28 | config = tf.estimator.RunConfig().replace( 29 | session_config=tf.ConfigProto(device_count={'CPU':self._num_threads}), 30 | log_step_count_steps=20) 31 | PosShifts(self._track) 32 | feature_size = PosShifts.get_features_num() 33 | params={ 34 | 'feature_size': feature_size, 35 | 'embedding_size': self._embedding_size, 36 | 'learning_rate': self._learning_rate, 37 | 'field_size': 5, 38 | 'batch_size': self._batch_size, 39 | 'optimizer': self._optimizer} 40 | 41 | model = tf.estimator.Estimator( 42 | model_fn=FMModel.fm_model_fn, 43 | model_dir=self._save_model_dir, 44 | params=params, 45 | config=config) 46 | return model 47 | 48 | def prepare_data_fn(self, data_mode='train'): 49 | """ prepare train, val fn""" 50 | if data_mode == 'train': 51 | dataset = tf.data.TextLineDataset(self._train_dataset_path) 52 | elif data_mode == 'val': 53 | dataset = tf.data.TextLineDataset(self._val_dataset_path) 54 | else: 55 | raise Exception("unknown data_mode", data_mode) 56 | 57 | if self._task == "finish": 58 | dataset = dataset.map(LineParser.parse_finish_line) 59 | elif self._task == "like": 60 | dataset = dataset.map(LineParser.parse_like_line) 61 | else: 62 | raise Exception("unknown task", task) 63 | 64 | dataset = dataset.shuffle(buffer_size=300) 65 | dataset = dataset.repeat(self._num_epochs) 66 | dataset = dataset.batch(self._batch_size) 67 | data_iterator = dataset.make_one_shot_iterator() 68 | idx, features, labels = data_iterator.get_next() 69 | feature_infos = {} 70 | feature_infos['feature_idx'] = idx 71 | feature_infos['feature_values'] = features 72 | tf.logging.info(labels) 73 | return feature_infos, labels 74 | 75 | 76 | def train(self): 77 | """ 78 | Train model 79 | """ 80 | model = self.build_model() 81 | train_spec = tf.estimator.TrainSpec(input_fn=lambda: self.prepare_data_fn(data_mode='train')) 82 | val_spec = tf.estimator.EvalSpec(input_fn=lambda: self.prepare_data_fn(data_mode='val')) 83 | tf.estimator.train_and_evaluate(model, train_spec, val_spec) 84 | -------------------------------------------------------------------------------- /model_zoo/fm.py: -------------------------------------------------------------------------------- 1 | 2 | import tensorflow as tf 3 | 4 | class FMModelParams(object): 5 | """ class for initializing weights """ 6 | def __init__(self, feature_size, embedding_size): 7 | self._feature_size = feature_size 8 | self._embedding_size = embedding_size 9 | 10 | def initialize_weights(self): 11 | """ init fm weights 12 | Returns 13 | weights: 14 | feature_embeddings: vi, vj second order params 15 | weights_first_order: wi first order params 16 | bias: b bias 17 | """ 18 | 19 | weights = dict() 20 | weights_initializer=tf.glorot_normal_initializer() 21 | bias_initializer=tf.constant_initializer(0.0) 22 | weights["feature_embeddings"] = tf.get_variable( 23 | name='weights', 24 | dtype=tf.float32, 25 | initializer=weights_initializer, 26 | shape=[self._feature_size, self._embedding_size]) 27 | weights["weights_first_order"] = tf.get_variable( 28 | name='vectors', 29 | dtype=tf.float32, 30 | initializer=weights_initializer, 31 | shape=[self._feature_size, 1]) 32 | weights["fm_bias"] = tf.get_variable( 33 | name='bias', 34 | dtype=tf.float32, 35 | initializer=bias_initializer, 36 | shape=[1]) 37 | return weights 38 | 39 | class FMModel(object): 40 | """ FM implementation """ 41 | 42 | @staticmethod 43 | def fm_model_fn(features, labels, mode, params): 44 | """ build tf model""" 45 | 46 | #parse params 47 | embedding_size = params['embedding_size'] 48 | feature_size = params['feature_size'] 49 | batch_size = params['batch_size'] 50 | learning_rate = params['learning_rate'] 51 | field_size = params['field_size'] 52 | optimizer_used = params['optimizer'] 53 | 54 | #parse features 55 | feature_idx = features["feature_idx"] 56 | feature_idx = tf.reshape(feature_idx, shape=[batch_size, field_size]) 57 | labels = tf.reshape(labels, shape=[batch_size, 1]) 58 | feature_values = features["feature_values"] 59 | feature_values = tf.reshape(feature_values, shape=[batch_size, field_size, 1]) 60 | 61 | # tf fm weights 62 | tf_model_params = FMModelParams(feature_size, embedding_size) 63 | weights = tf_model_params.initialize_weights() 64 | embeddings = tf.nn.embedding_lookup( 65 | weights["feature_embeddings"], 66 | feature_idx 67 | ) 68 | weights_first_order = tf.nn.embedding_lookup( 69 | weights["weights_first_order"], 70 | feature_idx 71 | ) 72 | bias = weights['fm_bias'] 73 | 74 | #build function 75 | ##first order 76 | first_order = tf.multiply(feature_values, weights_first_order) 77 | first_order = tf.reduce_sum(first_order, 2) 78 | first_order = tf.reduce_sum(first_order, 1, keepdims=True) 79 | 80 | ##second order 81 | ### feature * embeddings 82 | f_e_m = tf.multiply(feature_values, embeddings) 83 | ### square(sum(feature * embedding)) 84 | f_e_m_sum = tf.reduce_sum(f_e_m, 1) 85 | f_e_m_sum_square = tf.square(f_e_m_sum) 86 | ### sum(square(feature * embedding)) 87 | f_e_m_square = tf.square(f_e_m) 88 | f_e_m_square_sum = tf.reduce_sum(f_e_m_square, 1) 89 | 90 | second_order = f_e_m_sum_square - f_e_m_square_sum 91 | 92 | second_order = tf.reduce_sum(second_order, 1, keepdims=True) 93 | 94 | ##final objective function 95 | logits = second_order + first_order + bias 96 | predicts = tf.sigmoid(logits) 97 | 98 | ##loss function 99 | sigmoid_loss = tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=labels) 100 | sigmoid_loss = tf.reduce_mean(sigmoid_loss) 101 | loss = sigmoid_loss 102 | 103 | #train op 104 | if optimizer_used == 'adagrad': 105 | optimizer = tf.train.AdagradOptimizer( 106 | learning_rate=learning_rate, 107 | initial_accumulator_value=1e-8) 108 | elif optimizer_used == 'adam': 109 | optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate) 110 | else: 111 | raise Exception("unknown optimizer", optimizer_used) 112 | 113 | train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step()) 114 | 115 | 116 | #metric 117 | eval_metric_ops = { 118 | "auc": tf.metrics.auc(labels, predicts) 119 | } 120 | predictions = {"prob": predicts} 121 | 122 | return tf.estimator.EstimatorSpec( 123 | mode=tf.estimator.ModeKeys.TRAIN, 124 | predictions=predicts, 125 | loss=loss, 126 | eval_metric_ops=eval_metric_ops, 127 | train_op=train_op) 128 | --------------------------------------------------------------------------------