├── README.md
└── tencent_FFM_baseline
    ├── FFM_baseline.py
    └── convert_to_ffm.py


/README.md:
--------------------------------------------------------------------------------
1 | 具体可见 https://zhuanlan.zhihu.com/p/36302396
2 | 


--------------------------------------------------------------------------------
/tencent_FFM_baseline/FFM_baseline.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | # Created by Bo Song on 2018/4/26
 4 | import xlearn as xl
 5 | import pandas as pd
 6 | import numpy as np
 7 | path='data/'
 8 | ffm_model = xl.create_ffm()
 9 | ffm_model.setTrain(path+'train_ffm.csv')
10 | ffm_model.setTest(path+'test_ffm.csv')
11 | ffm_model.setSigmoid()
12 | param = {'task':'binary', 'lr':0.01, 'lambda':0.001,'metric': 'auc','opt':'ftrl','epoch':5,'k':4,
13 |          'alpha': 1.5, 'beta': 0.01, 'lambda_1': 0.0, 'lambda_2': 0.0}
14 | ffm_model.fit(param,"./model.out")
15 | ffm_model.predict("./model.out","./output.txt")
16 | sub = pd.DataFrame()
17 | sub['aid']=test_df['aid']
18 | sub['uid']=test_df['uid']
19 | sub['score'] = np.loadtxt("./output.txt")
20 | sub.to_csv('submission.csv',index=False)
21 | os.system('zip baseline_ffm.zip submission.csv')
22 | 
23 | 


--------------------------------------------------------------------------------
/tencent_FFM_baseline/convert_to_ffm.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | # Created by Bo Song on 2018/4/26
  4 | 
  5 | import pandas as pd
  6 | from pandas import get_dummies
  7 | import lightgbm as lgb
  8 | from sklearn.feature_extraction.text import CountVectorizer
  9 | from sklearn.preprocessing import OneHotEncoder,LabelEncoder
 10 | from scipy import sparse
 11 | import numpy as np
 12 | import os
 13 | import gc
 14 | 
 15 | 
 16 | path='data/'
 17 | 
 18 | 
 19 | 
 20 | one_hot_feature=['LBS','age','carrier','consumptionAbility','education','gender','advertiserId','campaignId', 'creativeId',
 21 |        'adCategoryId', 'productId', 'productType']
 22 | 
 23 | vector_feature=['interest1','interest2','interest5','kw1','kw2','topic1','topic2','os','ct','marriageStatus']
 24 | continus_feature=['creativeSize']
 25 | 
 26 | ad_feature=pd.read_csv(path+'adFeature.csv')
 27 | user_feature=pd.read_csv(path+'userFeature.csv')
 28 | 
 29 | train=pd.read_csv(path+'train.csv')
 30 | test=pd.read_csv(path+'test1.csv')
 31 | 
 32 | data=pd.concat([train,test])
 33 | data=pd.merge(data,ad_feature,on='aid',how='left')
 34 | data=pd.merge(data,user_feature,on='uid',how='left')
 35 | 
 36 | 
 37 | data=data.fillna(-1)
 38 | data=data[one_hot_feature+vector_feature+continus_feature]
 39 | 
 40 | class FFMFormat:
 41 |     def __init__(self,vector_feat,one_hot_feat,continus_feat):
 42 |         self.field_index_ = None
 43 |         self.feature_index_ = None
 44 |         self.vector_feat=vector_feat
 45 |         self.one_hot_feat=one_hot_feat
 46 |         self.continus_feat=continus_feat
 47 | 
 48 | 
 49 |     def get_params(self):
 50 |         pass
 51 | 
 52 |     def set_params(self, **parameters):
 53 |         pass
 54 | 
 55 |     def fit(self, df, y=None):
 56 |         self.field_index_ = {col: i for i, col in enumerate(df.columns)}
 57 |         self.feature_index_ = dict()
 58 |         last_idx = 0
 59 |         for col in df.columns:
 60 |             if col in self.one_hot_feat:
 61 |                 print(col)
 62 |                 df[col]=df[col].astype('int')
 63 |                 vals = np.unique(df[col])
 64 |                 for val in vals:
 65 |                     if val==-1: continue
 66 |                     name = '{}_{}'.format(col, val)
 67 |                     if name not in self.feature_index_:
 68 |                         self.feature_index_[name] = last_idx
 69 |                         last_idx += 1
 70 |             elif col in self.vector_feat:
 71 |                 print(col)
 72 |                 vals=[]
 73 |                 for data in df[col].apply(str):
 74 |                     if data!="-1":
 75 |                         for word in data.strip().split(' '):
 76 |                             vals.append(word)
 77 |                 vals = np.unique(vals)
 78 |                 for val in vals:
 79 |                     if val=="-1": continue
 80 |                     name = '{}_{}'.format(col, val)
 81 |                     if name not in self.feature_index_:
 82 |                         self.feature_index_[name] = last_idx
 83 |                         last_idx += 1
 84 |             self.feature_index_[col] = last_idx
 85 |             last_idx += 1
 86 |         return self
 87 | 
 88 |     def fit_transform(self, df, y=None):
 89 |         self.fit(df, y)
 90 |         return self.transform(df)
 91 | 
 92 |     def transform_row_(self, row):
 93 |         ffm = []
 94 | 
 95 |         for col, val in row.loc[row != 0].to_dict().items():
 96 |             if col in self.one_hot_feat:
 97 |                 name = '{}_{}'.format(col, val)
 98 |                 if name in self.feature_index_:
 99 |                     ffm.append('{}:{}:1'.format(self.field_index_[col], self.feature_index_[name]))
100 |                 # ffm.append('{}:{}:{}'.format(self.field_index_[col], self.feature_index_[col], 1))
101 |             elif col in self.vector_feat:
102 |                 for word in str(val).split(' '):
103 |                     name = '{}_{}'.format(col, word)
104 |                     if name in self.feature_index_:
105 |                         ffm.append('{}:{}:1'.format(self.field_index_[col], self.feature_index_[name]))
106 |             elif col in self.continus_feat:
107 |                 if val!=-1:
108 |                     ffm.append('{}:{}:{}'.format(self.field_index_[col], self.feature_index_[col], val))
109 |         return ' '.join(ffm)
110 | 
111 |     def transform(self, df):
112 |         # val=[]
113 |         # for k,v in self.feature_index_.items():
114 |         #     val.append(v)
115 |         # val.sort()
116 |         # print(val)
117 |         # print(self.field_index_)
118 |         # print(self.feature_index_)
119 |         return pd.Series({idx: self.transform_row_(row) for idx, row in df.iterrows()})
120 | 
tr = FFMFormat(vector_feature,one_hot_feature,continus_feature)
user_ffm=tr.fit_transform(data)
# header=False keeps ffm.csv at exactly one line per data row. Recent pandas
# versions write a header line for a Series by default, which would shift
# every row by one against the labels below.
user_ffm.to_csv('ffm.csv',index=False,header=False)

# Labels exist only for the training rows, which come first in ffm.csv
# because `data` was built as train followed by test.
train = pd.read_csv(path + 'train.csv')
Y = np.array(train.pop('label'))
len_train=len(train)

# Split ffm.csv: the first len_train lines get their label prepended and go
# to train_ffm.csv; the remaining lines are copied unchanged to test_ffm.csv.
with open('ffm.csv') as fin, \
        open('train_ffm.csv','w') as f_train_out, \
        open('test_ffm.csv', 'w') as f_test_out:
    for (i,line) in enumerate(fin):
        if i<len_train:
            f_train_out.write(str(Y[i])+' '+line)
        else:
            f_test_out.write(line)
141 | 


--------------------------------------------------------------------------------