# ├── README.md
# └── tencent_FFM_baseline
#     ├── FFM_baseline.py
#     └── convert_to_ffm.py

# /README.md:
# --------------------------------------------------------------------------------
# See https://zhuanlan.zhihu.com/p/36302396 for details.
# --------------------------------------------------------------------------------
# /tencent_FFM_baseline/FFM_baseline.py:
# --------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Created by Bo Song on 2018/4/26
#
# Trains an xLearn FFM model on the libffm-formatted train file, scores the
# test file, and packs the scores into a zipped submission CSV.
import os

import numpy as np
import pandas as pd
import xlearn as xl

path = 'data/'

# NOTE(review): convert_to_ffm.py opens 'train_ffm.csv' / 'test_ffm.csv' in the
# working directory, while this script reads them from `path` -- confirm the
# files are moved (or the paths aligned) before running.
ffm_model = xl.create_ffm()
ffm_model.setTrain(path + 'train_ffm.csv')
ffm_model.setTest(path + 'test_ffm.csv')
ffm_model.setSigmoid()

param = {
    'task': 'binary',
    'lr': 0.01,
    'lambda': 0.001,
    'metric': 'auc',
    'opt': 'ftrl',
    'epoch': 5,
    'k': 4,
    'alpha': 1.5,
    'beta': 0.01,
    'lambda_1': 0.0,
    'lambda_2': 0.0,
}
ffm_model.fit(param, "./model.out")
ffm_model.predict("./model.out", "./output.txt")

# The original script used `test_df` without ever defining it (NameError).
# The ids come from the same test file convert_to_ffm.py reads, so row order
# is assumed to match the rows of test_ffm.csv -- TODO confirm.
test_df = pd.read_csv(path + 'test1.csv')

sub = pd.DataFrame()
sub['aid'] = test_df['aid']
sub['uid'] = test_df['uid']
sub['score'] = np.loadtxt("./output.txt")
sub.to_csv('submission.csv', index=False)

# `os` was not imported in the original, so this call also raised NameError.
os.system('zip baseline_ffm.zip submission.csv')

# --------------------------------------------------------------------------------
# /tencent_FFM_baseline/convert_to_ffm.py:
# --------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Created by Bo Song on 2018/4/26

import pandas as pd
from pandas import get_dummies
import lightgbm as lgb
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder,LabelEncoder
from scipy import sparse
import numpy as np
import os
import gc


path='data/'
one_hot_feature=['LBS','age','carrier','consumptionAbility','education','gender','advertiserId','campaignId', 'creativeId', 21 | 'adCategoryId', 'productId', 'productType'] 22 | 23 | vector_feature=['interest1','interest2','interest5','kw1','kw2','topic1','topic2','os','ct','marriageStatus'] 24 | continus_feature=['creativeSize'] 25 | 26 | ad_feature=pd.read_csv(path+'adFeature.csv') 27 | user_feature=pd.read_csv(path+'userFeature.csv') 28 | 29 | train=pd.read_csv(path+'train.csv') 30 | test=pd.read_csv(path+'test1.csv') 31 | 32 | data=pd.concat([train,test]) 33 | data=pd.merge(data,ad_feature,on='aid',how='left') 34 | data=pd.merge(data,user_feature,on='uid',how='left') 35 | 36 | 37 | data=data.fillna(-1) 38 | data=data[one_hot_feature+vector_feature+continus_feature] 39 | 40 | class FFMFormat: 41 | def __init__(self,vector_feat,one_hot_feat,continus_feat): 42 | self.field_index_ = None 43 | self.feature_index_ = None 44 | self.vector_feat=vector_feat 45 | self.one_hot_feat=one_hot_feat 46 | self.continus_feat=continus_feat 47 | 48 | 49 | def get_params(self): 50 | pass 51 | 52 | def set_params(self, **parameters): 53 | pass 54 | 55 | def fit(self, df, y=None): 56 | self.field_index_ = {col: i for i, col in enumerate(df.columns)} 57 | self.feature_index_ = dict() 58 | last_idx = 0 59 | for col in df.columns: 60 | if col in self.one_hot_feat: 61 | print(col) 62 | df[col]=df[col].astype('int') 63 | vals = np.unique(df[col]) 64 | for val in vals: 65 | if val==-1: continue 66 | name = '{}_{}'.format(col, val) 67 | if name not in self.feature_index_: 68 | self.feature_index_[name] = last_idx 69 | last_idx += 1 70 | elif col in self.vector_feat: 71 | print(col) 72 | vals=[] 73 | for data in df[col].apply(str): 74 | if data!="-1": 75 | for word in data.strip().split(' '): 76 | vals.append(word) 77 | vals = np.unique(vals) 78 | for val in vals: 79 | if val=="-1": continue 80 | name = '{}_{}'.format(col, val) 81 | if name not in self.feature_index_: 82 | 
self.feature_index_[name] = last_idx 83 | last_idx += 1 84 | self.feature_index_[col] = last_idx 85 | last_idx += 1 86 | return self 87 | 88 | def fit_transform(self, df, y=None): 89 | self.fit(df, y) 90 | return self.transform(df) 91 | 92 | def transform_row_(self, row): 93 | ffm = [] 94 | 95 | for col, val in row.loc[row != 0].to_dict().items(): 96 | if col in self.one_hot_feat: 97 | name = '{}_{}'.format(col, val) 98 | if name in self.feature_index_: 99 | ffm.append('{}:{}:1'.format(self.field_index_[col], self.feature_index_[name])) 100 | # ffm.append('{}:{}:{}'.format(self.field_index_[col], self.feature_index_[col], 1)) 101 | elif col in self.vector_feat: 102 | for word in str(val).split(' '): 103 | name = '{}_{}'.format(col, word) 104 | if name in self.feature_index_: 105 | ffm.append('{}:{}:1'.format(self.field_index_[col], self.feature_index_[name])) 106 | elif col in self.continus_feat: 107 | if val!=-1: 108 | ffm.append('{}:{}:{}'.format(self.field_index_[col], self.feature_index_[col], val)) 109 | return ' '.join(ffm) 110 | 111 | def transform(self, df): 112 | # val=[] 113 | # for k,v in self.feature_index_.items(): 114 | # val.append(v) 115 | # val.sort() 116 | # print(val) 117 | # print(self.field_index_) 118 | # print(self.feature_index_) 119 | return pd.Series({idx: self.transform_row_(row) for idx, row in df.iterrows()}) 120 | 121 | tr = FFMFormat(vector_feature,one_hot_feature,continus_feature) 122 | user_ffm=tr.fit_transform(data) 123 | user_ffm.to_csv('ffm.csv',index=False) 124 | 125 | train = pd.read_csv(path + 'train.csv') 126 | test = pd.read_csv(path+'test1.csv') 127 | 128 | Y = np.array(train.pop('label')) 129 | len_train=len(train) 130 | 131 | with open('ffm.csv') as fin: 132 | f_train_out=open('train_ffm.csv','w') 133 | f_test_out = open('test_ffm.csv', 'w') 134 | for (i,line) in enumerate(fin): 135 | if i