├── README.md
├── run_nn.sh
└── src
    ├── __pycache__
    │   └── nffm.cpython-35.pyc
    ├── input
    │   └── README.md
    ├── load_vowpal.py
    ├── make_dataset.py
    ├── make_feature.py
    ├── nffm.py
    ├── train.py
    └── utils.py

/README.md:
--------------------------------------------------------------------------------
1 | ## Code
2 | Rank 10 code for the 2018 Tencent Advertising Algorithm Competition: the deep-learning part.
3 | 
4 | ## Environment
5 | 
6 | - System environment:
7 |     - OS: Ubuntu 16.04 LTS
8 |     - Hardware: 40-core CPU, 128 GB RAM
9 |     - GPU: TITAN Xp, 12 GB memory
10 | 
11 | - Software environment:
12 |     - Python: 3.6.4 (conda 4.5.4)
13 |     - Tensorflow: GPU version 1.7 (built from source), NVIDIA-SMI 390.48, cuda_9.1.85_387.26
14 |     - Other packages: pandas (0.22.0), numpy (1.14.0), scipy (1.0.0), scikit-learn (0.19.1), tqdm (4.23.3)
15 | 
16 | ## Run Steps
17 | - Preprocessing
18 |     - Load the userFeature data: `python3 load_vowpal.py`
19 | - Feature generation
20 |     - Generate the global uid/aid statistical features: `python3 make_feature.py`
21 | - Run the model
22 |     - Train the NFFM model: `python3 train.py`
23 | 
24 | ## Feature Engineering
25 | 
26 | ### Features Used
27 | - Basic features: all raw user and ad features
28 | - Statistical features: uid_aid_nunique (number of distinct aids per uid, equal-frequency binned), aid_uid_nunique (number of distinct uids per aid, equal-frequency binned), campaignId_aid_nunique (number of distinct aids per campaignId, equal-frequency binned), pos_aid (each user's positive aids in the training set), neg_aid (each user's negative aids in the training set), user_convert (user conversion rate)
29 | 
30 | ### Feature Generation
31 | - Global statistical features:
32 |     - Extracted with groupby, e.g. `ad_Feature.groupby(['campaignId']).aid.nunique()`
33 |     - Discretization: binned at the percentiles [0, 20, 35, 50, 65, 85, 100]
34 | - Conversion-rate feature: the user's conversion rate on the training set; to prevent overfitting, it is computed with the label of the current row removed
35 | - Positive/negative aid features:
36 |     - Build a uid: aid-label dictionary on the training set
37 |     - Reorganize it into uid: [aid-pos, aid-neg], where aid-pos is the list of aids with label 1 and aid-neg the list of aids with label -1
38 |     - Test-set feature: join the lists onto the test set by uid directly
39 |     - Training-set feature: join the lists onto the training set by uid, then remove the current row's aid from aid-pos / aid-neg
40 |     - This yields the multi-valued positive/negative aid features
41 | 
42 | ## Model Structure
43 | The model uses the `nffm-v3` code that 郭达雅 (Guo Daya) open-sourced in the competition group, i.e. a deep FFM model.
44 | ### Architecture
45 | - FFM part: linear terms + second-order field-aware interactions
46 | - Deep part: two hidden layers
47 | 
48 | ### Parameters
49 | - Batch size: 4096
50 | - Epochs: 1
51 | - Hidden layers: 256, 128
52 | - Optimizer: adam
53 | - Learning rate: 0.0002
54 | - L2 regularization: 0.000002
55 | - Embedding size: 8
56 | - Random seed: 2018
57 | 
58 | ## Acknowledgements
59 | Thanks to 郭达雅 (Guo Daya) for the [open-source model](https://github.com/guoday/Tencent2018_Lookalike_Rank7th)
60 | 
--------------------------------------------------------------------------------
/run_nn.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | printf '========== Generate NN Result ==========\n'
3 | CURDIR="`pwd`"/"`dirname $0`"
4 | printf "\nCurrent Path:$CURDIR \n"
5 | 
6 | train_file=${CURDIR}"/src/input/train.csv"
7 | test1_file=${CURDIR}"/src/input/test1.csv"
8 | test2_file=${CURDIR}"/src/input/test2.csv"
9 | ad_feature=${CURDIR}"/src/input/adFeature.csv"
10 | user_file=${CURDIR}"/src/input/userFeature.data"
11 | user_feature=${CURDIR}"/src/input/userFeature.csv"
12 | dataset_dir=${CURDIR}"/src/dataset/"
13 | train_dir=${CURDIR}"/src/dataset/train/"
14 | valid_dir=${CURDIR}"/src/dataset/dev/"
15 | test2_dir=${CURDIR}"/src/dataset/test/"
16 | train_aid_fea=${dataset_dir}"/train_uid_aid_bin.csv"
17 | fea_dict=${dataset_dir}"/dic.pkl"
18 | sub_file=${CURDIR}"/submission_nffm.csv"
19 | 
20 | printf '\nStep1: PreProcess UserFeature...\n'
21 | if [ ! -f $user_feature ]; then
22 |     cd src
23 |     python3 load_vowpal.py
24 |     printf 'Save to input/userFeature.csv\n'
25 |     cd ..
26 | else
27 |     printf 'UserFeature exists, Skip this step!\n'
28 | fi
29 | 
30 | printf '\nStep2: Make Features...\n'
31 | if [ ! -f $train_aid_fea ]; then
32 |     cd src
33 |     python3 make_feature.py
34 |     cd ..
35 | else
36 |     printf 'Feature exists, Skip this step!\n'
37 | fi
38 | 
39 | printf '\nStep3: Make Dataset...\n'
40 | if [ ! -f $fea_dict ]; then
41 |     cd src
42 |     python3 make_dataset.py
43 |     cd ..
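    # Editor's note (descriptive comment only): make_dataset.py merges train/test2 with
    # adFeature, userFeature and the features built in Step 2, splits off a 10% dev set,
    # and writes one text file per feature under src/dataset/{train,dev,test}/ plus the
    # dic.pkl vocabulary that this guard checks.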
44 | else
45 |     printf 'Dataset exists, Skip this step!\n'
46 | fi
47 | 
48 | printf '\nStep4: Train Model and Predict Result...\n'
49 | if [ ! -f $sub_file ]; then
50 |     cd src
51 |     python3 train.py
52 |     cd ..
53 | else
54 |     printf 'Train Done, Skip this step!\n'
55 | fi
--------------------------------------------------------------------------------
/src/__pycache__/nffm.cpython-35.pyc:
--------------------------------------------------------------------------------
 https://raw.githubusercontent.com/keyunluo/Tencent2018_Lookalike_Rank10th/ae9227541c472cd0dc91a494369e7c795a31dcae/src/__pycache__/nffm.cpython-35.pyc
--------------------------------------------------------------------------------
/src/input/README.md:
--------------------------------------------------------------------------------
1 | ## Place the raw data here
2 | 
3 | - adFeature.csv
4 | - test1.csv
5 | - test2.csv
6 | - train.csv
7 | - userFeature.data (userFeature.csv is generated automatically by the preprocessing step)
--------------------------------------------------------------------------------
/src/load_vowpal.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | from csv import DictWriter
4 | from tqdm import tqdm
5 | import mmap
6 | 
7 | 
8 | def get_file_lines(file_path):
9 |     '''
10 |     Count the number of lines in the file
11 |     '''
12 |     fp = open(file_path, "r+")
13 |     buf = mmap.mmap(fp.fileno(), 0)
14 |     lines = 0
15 |     while buf.readline():
16 |         lines += 1
17 |     return lines
18 | 
19 | 
20 | def process_vowpal(file_path, out_path):
21 |     '''
22 |     Process the user feature file
23 |     '''
24 |     headers = ['uid', 'age', 'gender', 'marriageStatus', 'education', 'consumptionAbility', 'LBS', 'interest1', 'interest2', 'interest3',
25 |                'interest4', 'interest5', 'kw1', 'kw2', 'kw3', 'topic1', 'topic2', 'topic3', 'appIdInstall', 'appIdAction', 'ct', 'os', 'carrier', 'house']
26 |     fo = open(out_path, 'wt')
27 |     writer = DictWriter(fo, fieldnames=headers, lineterminator='\n')
28 |     writer.writeheader()
29 | 
30 |     with open(file_path, 'rt') as f:
31 |         for line in tqdm(f, total=get_file_lines(file_path)):
32 |             feature_groups = line.strip().split('|')
33 |             fea_dict = {}
34 |             for feas in feature_groups:
35 |                 feas_split = feas.split(' ')
36 |                 fea_dict[feas_split[0]] = ' '.join(feas_split[1:])
37 |             writer.writerow(fea_dict)
38 |     fo.close()
39 | 
40 | 
41 | if __name__ == '__main__':
42 |     file_path = 'input/userFeature.data'
43 |     out_path = 'input/userFeature.csv'
44 |     process_vowpal(file_path, out_path)
45 | 
--------------------------------------------------------------------------------
/src/make_dataset.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | import numpy as np
4 | import pandas as pd
5 | import random
6 | import pickle
7 | import gc
8 | import os
9 | from collections import Counter, OrderedDict
10 | from sklearn.model_selection import train_test_split
11 | 
12 | 
13 | threshold = 1000
14 | random.seed(2018)
15 | 
16 | 
17 | def pre_data():
18 |     user_feature = pd.read_csv('input/userFeature.csv')
19 |     ad_Feature = pd.read_csv('input/adFeature.csv')
20 | 
21 |     train_df = pd.read_csv('input/train.csv')
22 |     test_df = pd.read_csv('input/test2.csv')
23 | 
24 |     train_df = pd.merge(train_df, ad_Feature, on='aid', how='left')
25 |     test_df = pd.merge(test_df, ad_Feature, on='aid', how='left')
26 |     train_df = pd.merge(train_df, user_feature, on='uid', how='left')
27 |     test_df = pd.merge(test_df, user_feature, on='uid', how='left')
28 | 
29 |     del user_feature
30 |     gc.collect()
31 | 
32 |     # ['pos_aid', 'neg_aid']
33 | 
train_aid_fea = pd.read_csv( 34 | 'dataset/train_neg_pos_aid.csv', usecols=['pos_aid', 'neg_aid']) 35 | test2_aid_fea = pd.read_csv( 36 | 'dataset/test2_neg_pos_aid.csv', usecols=['pos_aid', 'neg_aid']) 37 | 38 | # ['campaignId_aid_nunique', 'uid_aid_nunique', 'aid_uid_nunique', 'uid_count', 'uid_convert'] 39 | train_statistic_fea = pd.read_csv('dataset/train_uid_aid_bin.csv') 40 | test2_statistic_fea = pd.read_csv('dataset/test2_uid_aid_bin.csv') 41 | 42 | train_df = pd.concat( 43 | [train_df, train_aid_fea, train_statistic_fea], axis=1) 44 | test_df = pd.concat([test_df, test2_aid_fea, test2_statistic_fea], axis=1) 45 | 46 | train_df = train_df.fillna('0') 47 | test_df = test_df.fillna('0') 48 | 49 | gc.collect() 50 | 51 | train_df.loc[train_df['label'] == -1, 'label'] = 0 52 | test_df['label'] = -1 53 | 54 | train_df, dev_df = train_test_split( 55 | train_df, test_size=0.1, random_state=2018) 56 | 57 | np.save('dataset/train_df.index', np.array(train_df.index)) 58 | np.save('dataset/dev_df.index', np.array(dev_df.index)) 59 | 60 | return train_df, dev_df, test_df 61 | 62 | 63 | def output_label(train_df, dev_df, test_df): 64 | with open('dataset/dev/label', 'w') as f: 65 | for i in list(dev_df['label']): 66 | f.write(str(i)+'\n') 67 | with open('dataset/test/label', 'w') as f: 68 | for i in list(test_df['label']): 69 | f.write(str(i)+'\n') 70 | with open('dataset/train/label', 'w') as f: 71 | for i in list(train_df['label']): 72 | f.write(str(i)+'\n') 73 | 74 | 75 | def single_features(train_df, dev_df, test_df, word2index): 76 | single_ids_features = ['aid', 'advertiserId', 'campaignId', 'creativeId', 'creativeSize', 'adCategoryId', 'productId', 'productType', 'age', 77 | 'gender', 'education', 'consumptionAbility', 'LBS', 'carrier', 'house', 78 | 'campaignId_aid_nunique', 'uid_aid_nunique', 'aid_uid_nunique', 'uid_count', 'uid_convert'] 79 | 80 | for s in single_ids_features: 81 | print(s) 82 | cont = {} 83 | 84 | with open('dataset/train/'+str(s), 'w') as f: 85 | for line in list(train_df[s].values): 86 | f.write(str(line)+'\n') 87 | if str(line) not in cont: 88 | cont[str(line)] = 0 89 | cont[str(line)] += 1 90 | 91 | with open('dataset/dev/'+str(s), 'w') as f: 92 | for line in list(dev_df[s].values): 93 | f.write(str(line)+'\n') 94 | 95 | with open('dataset/test/'+str(s), 'w') as f: 96 | for line in list(test_df[s].values): 97 | f.write(str(line)+'\n') 98 | index = [] 99 | for k in cont: 100 | if s not in ['campaignId_aid_nunique', 'uid_aid_nunique', 'aid_uid_nunique', 'uid_count', 'uid_convert']: 101 | if cont[k] >= threshold: 102 | index.append(k) 103 | else: 104 | index.append(k) 105 | word2index[s] = {} 106 | for idx, val in enumerate(index): 107 | word2index[s][val] = idx+2 108 | print(s+' done!') 109 | 110 | 111 | def mutil_ids(train_df, dev_df, test_df, word2index): 112 | features_mutil = ['interest1', 'interest2', 'interest3', 'interest4', 'interest5', 'kw1', 'kw2', 'kw3', 'topic1', 113 | 'topic2', 'topic3', 'appIdAction', 'appIdInstall', 'marriageStatus', 'ct', 'os', 'pos_aid', 'neg_aid'] 114 | for s in features_mutil: 115 | print(s) 116 | cont = {} 117 | with open('dataset/train/'+str(s), 'w') as f: 118 | for lines in list(train_df[s].values): 119 | lines = str(lines) 120 | f.write(lines+'\n') 121 | for line in lines.split(): 122 | if line not in cont: 123 | cont[line] = 0 124 | cont[line] += 1 125 | 126 | with open('dataset/dev/'+str(s), 'w') as f: 127 | for line in list(dev_df[s].values): 128 | f.write(str(line)+'\n') 129 | 130 | with open('dataset/test/'+str(s), 'w') 
as f: 131 | for line in list(test_df[s].values): 132 | f.write(str(line)+'\n') 133 | index = [] 134 | for k in cont: 135 | if s not in ['pos_aid', 'neg_aid']: 136 | if cont[k] >= threshold: 137 | index.append(k) 138 | else: 139 | index.append(k) 140 | word2index[s] = {} 141 | for idx, val in enumerate(index): 142 | word2index[s][val] = idx+2 143 | print(s+' done!') 144 | 145 | 146 | if __name__ == '__main__': 147 | if os.path.exists('dataset/dic.pkl'): 148 | word2index = pickle.load(open('dataset/dic.pkl', 'rb')) 149 | else: 150 | word2index = {} 151 | print("Loading Data...") 152 | train_df, dev_df, test_df = pre_data() 153 | 154 | print("Output Label...") 155 | output_label(train_df, dev_df, test_df) 156 | 157 | print("Processing Single Feature...") 158 | single_features(train_df, dev_df, test_df, word2index) 159 | pickle.dump(word2index, open('dataset/dic.pkl', 'wb')) 160 | 161 | print("Processing Multiple Feature...") 162 | mutil_ids(train_df, dev_df, test_df, word2index) 163 | pickle.dump(word2index, open('dataset/dic.pkl', 'wb')) 164 | -------------------------------------------------------------------------------- /src/make_feature.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import numpy as np 4 | import pandas as pd 5 | from tqdm import tqdm 6 | from collections import defaultdict 7 | from sklearn.preprocessing import MinMaxScaler 8 | 9 | def gen_pos_neg_aid_fea(): 10 | train_data = pd.read_csv('input/train.csv') 11 | test2_data = pd.read_csv('input/test2.csv') 12 | 13 | train_user = train_data.uid.unique() 14 | 15 | # user-aid dict 16 | uid_dict = defaultdict(list) 17 | for row in tqdm(train_data.itertuples(), total=len(train_data)): 18 | uid_dict[row[2]].append([row[1], row[3]]) 19 | 20 | # user convert 21 | uid_convert = {} 22 | for uid in tqdm(train_user): 23 | pos_aid, neg_aid = [], [] 24 | for data in uid_dict[uid]: 25 | if data[1] > 0: 26 | pos_aid.append(data[0]) 27 | else: 28 | neg_aid.append(data[0]) 29 | uid_convert[uid] = [pos_aid, neg_aid] 30 | 31 | test2_neg_pos_aid = {} 32 | for row in tqdm(test2_data.itertuples(), total=len(test2_data)): 33 | aid = row[1] 34 | uid = row[2] 35 | if uid_convert.get(uid, []) == []: 36 | test2_neg_pos_aid[row[0]] = ['', '', -1] 37 | else: 38 | pos_aid, neg_aid = uid_convert[uid][0].copy(), uid_convert[uid][1].copy() 39 | convert = len(pos_aid) / (len(pos_aid) + len(neg_aid)) if (len(pos_aid) + len(neg_aid)) > 0 else -1 40 | test2_neg_pos_aid[row[0]] = [' '.join(map(str, pos_aid)), ' '.join(map(str, neg_aid)), convert] 41 | df_test2 = pd.DataFrame.from_dict(data=test2_neg_pos_aid, orient='index') 42 | df_test2.columns = ['pos_aid', 'neg_aid', 'uid_convert'] 43 | 44 | train_neg_pos_aid = {} 45 | for row in tqdm(train_data.itertuples(), total=len(train_data)): 46 | aid = row[1] 47 | uid = row[2] 48 | pos_aid, neg_aid = uid_convert[uid][0].copy(), uid_convert[uid][1].copy() 49 | if aid in pos_aid: 50 | pos_aid.remove(aid) 51 | if aid in neg_aid: 52 | neg_aid.remove(aid) 53 | convert = len(pos_aid) / (len(pos_aid) + len(neg_aid)) if (len(pos_aid) + len(neg_aid)) > 0 else -1 54 | train_neg_pos_aid[row[0]] = [' '.join(map(str, pos_aid)), ' '.join(map(str, neg_aid)), convert] 55 | 56 | df_train = pd.DataFrame.from_dict(data=train_neg_pos_aid, orient='index') 57 | df_train.columns = ['pos_aid', 'neg_aid', 'uid_convert'] 58 | 59 | df_train.to_csv("dataset/train_neg_pos_aid.csv", index=False) 60 | df_test2.to_csv("dataset/test2_neg_pos_aid.csv", index=False) 61 | 62 | def 
gen_uid_aid_fea():
63 |     '''
64 |     Load the data and extract global statistical features for aid and uid
65 |     '''
66 |     train_data = pd.read_csv('input/train.csv')
67 |     test1_data = pd.read_csv('input/test1.csv')
68 |     test2_data = pd.read_csv('input/test2.csv')
69 | 
70 |     ad_Feature = pd.read_csv('input/adFeature.csv')
71 | 
72 |     train_len = len(train_data)  # 45539700
73 |     test1_len = len(test1_data)
74 |     test2_len = len(test2_data)  # 11727304
75 | 
76 |     ad_Feature = pd.merge(ad_Feature, ad_Feature.groupby(['campaignId']).aid.nunique().reset_index(
77 |     ).rename(columns={'aid': 'campaignId_aid_nunique'}), how='left', on='campaignId')
78 | 
79 |     df = pd.concat([train_data, test1_data, test2_data], axis=0)
80 |     df = pd.merge(df, df.groupby(['uid'])['aid'].nunique().reset_index().rename(
81 |         columns={'aid': 'uid_aid_nunique'}), how='left', on='uid')
82 | 
83 |     df = pd.merge(df, df.groupby(['aid'])['uid'].nunique().reset_index().rename(
84 |         columns={'uid': 'aid_uid_nunique'}), how='left', on='aid')
85 | 
86 |     df['uid_count'] = df.groupby('uid')['aid'].transform('count')
87 |     df = pd.merge(df, ad_Feature[['aid', 'campaignId_aid_nunique']], how='left', on='aid')
88 | 
89 |     fea_columns = ['campaignId_aid_nunique', 'uid_aid_nunique', 'aid_uid_nunique', 'uid_count', ]
90 | 
91 |     df[fea_columns].iloc[:train_len].to_csv('dataset/train_uid_aid.csv', index=False)
92 |     df[fea_columns].iloc[train_len: train_len+test1_len].to_csv('dataset/test1_uid_aid.csv', index=False)
93 |     df[fea_columns].iloc[-test2_len:].to_csv('dataset/test2_uid_aid.csv', index=False)
94 | 
95 | def digitize():
96 |     uid_aid_train = pd.read_csv('dataset/train_uid_aid.csv')
97 |     uid_aid_test1 = pd.read_csv('dataset/test1_uid_aid.csv')
98 |     uid_aid_test2 = pd.read_csv('dataset/test2_uid_aid.csv')
99 |     uid_aid_df = pd.concat([uid_aid_train, uid_aid_test1, uid_aid_test2], axis=0)
100 |     for col in range(3):
101 |         bins = []
102 |         for percent in [0, 20, 35, 50, 65, 85, 100]:
103 |             bins.append(np.percentile(uid_aid_df.iloc[:, col], percent))
104 |         uid_aid_train.iloc[:, col] = np.digitize(uid_aid_train.iloc[:, col], bins, right=True)
105 |         uid_aid_test1.iloc[:, col] = np.digitize(uid_aid_test1.iloc[:, col], bins, right=True)
106 |         uid_aid_test2.iloc[:, col] = np.digitize(uid_aid_test2.iloc[:, col], bins, right=True)
107 | 
108 |     count_bins = [1, 2, 4, 6, 8, 10, 16, 27, 50]
109 |     uid_aid_train.iloc[:, 3] = np.digitize(uid_aid_train.iloc[:, 3], count_bins, right=True)
110 |     uid_aid_test1.iloc[:, 3] = np.digitize(uid_aid_test1.iloc[:, 3], count_bins, right=True)
111 |     uid_aid_test2.iloc[:, 3] = np.digitize(uid_aid_test2.iloc[:, 3], count_bins, right=True)
112 | 
113 |     uid_convert_train = pd.read_csv("dataset/train_neg_pos_aid.csv", usecols=['uid_convert'])
114 |     uid_convert_test2 = pd.read_csv("dataset/test2_neg_pos_aid.csv", usecols=['uid_convert'])
115 | 
116 |     convert_bins = [-1, 0, 0.1, 0.3, 0.5, 0.7, 1]
117 |     uid_convert_train.iloc[:, 0] = np.digitize(uid_convert_train.iloc[:, 0], convert_bins, right=True)
118 |     uid_convert_test2.iloc[:, 0] = np.digitize(uid_convert_test2.iloc[:, 0], convert_bins, right=True)
119 | 
120 | 
121 |     uid_aid_train = pd.concat([uid_aid_train, uid_convert_train], axis=1)
122 |     uid_aid_test2 = pd.concat([uid_aid_test2, uid_convert_test2], axis=1)
123 | 
124 |     uid_aid_train.to_csv('dataset/train_uid_aid_bin.csv', index=False)
125 |     uid_aid_test2.to_csv('dataset/test2_uid_aid_bin.csv', index=False)
126 | 
127 | if __name__ == '__main__':
128 |     print("Make Feature...")
129 |     print("1. Generate pos_neg_aid_fea")
130 |     gen_pos_neg_aid_fea()
131 |     print("2. 
Generate uid_aid_fea") 132 | gen_uid_aid_fea() 133 | print("3. Digitize numerical feature") 134 | digitize() 135 | print("Make Feature Done") -------------------------------------------------------------------------------- /src/nffm.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import tensorflow as tf 4 | import utils 5 | from tensorflow.python.ops import lookup_ops 6 | from tensorflow.python.layers import core as layers_core 7 | from sklearn.metrics import log_loss 8 | import time 9 | import random 10 | import os 11 | from tensorflow.contrib.layers.python.layers import batch_norm as batch_norm 12 | import numpy as np 13 | from sklearn import metrics 14 | import pickle 15 | gloab_auc = {} 16 | import pandas as pd 17 | 18 | 19 | def print_step_info(prefix, global_step, info): 20 | utils.print_out( 21 | "%sstep %d lr %g logloss %.6f gN %.2f, %s" % 22 | (prefix, global_step, info["learning_rate"], 23 | info["train_ppl"], info["avg_grad_norm"], time.ctime())) 24 | 25 | 26 | class TextIterator: 27 | def __init__(self, hparams, mode, batch_size=None): 28 | self.single_file = {} 29 | self.mulit_file = {} 30 | self.num_file = {} 31 | 32 | for s in hparams.single_features: 33 | self.single_file[s] = open(hparams.data_path+'/'+mode+'/'+s, 'r') 34 | for s in hparams.mutil_features: 35 | self.mulit_file[s] = open(hparams.data_path+'/'+mode+'/'+s, 'r') 36 | for s in hparams.num_features: 37 | self.num_file[s] = open(hparams.data_path+'/'+mode+'/'+s, 'r') 38 | 39 | self.label_file = open(hparams.data_path+'/'+mode+'/label', 'r') 40 | self.word2index = pickle.load(open(hparams.data_path+'/dic.pkl', 'rb')) 41 | hparams.dict = self.word2index 42 | if batch_size: 43 | self.batch_size = batch_size 44 | else: 45 | self.batch_size = hparams.batch_size 46 | 47 | if 'train' in mode: 48 | self.idx = hparams.idx 49 | self.all_process = int(hparams.all_process) 50 | else: 51 | self.idx = 0 52 | self.all_process = 1 53 | self.hparams = hparams 54 | random.seed(2018) 55 | 56 | def reset(self): 57 | self.idx = (self.idx+1) % self.all_process 58 | for s in self.single_file: 59 | self.single_file[s].seek(0) 60 | for s in self.mulit_file: 61 | self.mulit_file[s].seek(0) 62 | for s in self.num_file: 63 | self.num_file[s].seek(0) 64 | self.label_file.seek(0) 65 | 66 | def next(self): 67 | single_ids = {} 68 | mulit_ids = {} 69 | labels = [] 70 | mulit_length = {} 71 | num_ids = {} 72 | for s in self.hparams.single_features: 73 | temp = [] 74 | for i in range(self.batch_size*self.all_process): 75 | ss = self.single_file[s].readline().strip() 76 | if ss == "": 77 | break 78 | if i % self.all_process != self.idx: 79 | continue 80 | try: 81 | if int(ss) in self.hparams.dict[s]: 82 | ss = int(ss) 83 | except: 84 | pass 85 | 86 | try: 87 | temp.append(self.hparams.dict[s][ss]) 88 | except: 89 | temp.append(0) 90 | single_ids[s] = temp 91 | 92 | for s in self.hparams.num_features: 93 | temp = [] 94 | for i in range(self.batch_size*self.all_process): 95 | ss = self.num_file[s].readline().strip() 96 | if ss == "": 97 | break 98 | if i % self.all_process != self.idx: 99 | continue 100 | temp.append(float(ss)) 101 | num_ids[s] = temp 102 | 103 | max_len = 0 104 | for s in self.hparams.mutil_features: 105 | temp = [] 106 | temp_len = [] 107 | for i in range(self.batch_size*self.all_process): 108 | mm = self.mulit_file[s].readline().strip() 109 | if mm == "": 110 | break 111 | if i % self.all_process != self.idx: 112 | continue 113 | t = [] 114 | t_len = [] 115 
| for m in mm.split(): 116 | try: 117 | t.append(self.hparams.dict[s][m]) 118 | except: 119 | t.append(0) 120 | temp.append(t) 121 | temp_len.append(len(t)) 122 | max_len = max(len(t), max_len) 123 | mulit_length[s] = temp_len 124 | mulit_ids[s] = temp 125 | max_len = 100 126 | for t in mulit_ids: 127 | for i in range(len(mulit_ids[t])): 128 | if len(mulit_ids[t][i]) <= max_len: 129 | mulit_ids[t][i] += (max_len-len(mulit_ids[t][i]))*[1] 130 | else: 131 | mulit_ids[t][i] = random.sample(mulit_ids[t][i], max_len) 132 | 133 | for i in range(self.batch_size*self.all_process): 134 | ss = self.label_file.readline().strip() 135 | if ss == "": 136 | break 137 | if i % self.all_process != self.idx: 138 | continue 139 | labels.append(int(ss)) 140 | 141 | if len(single_ids['aid']) == 0: 142 | self.reset() 143 | raise StopIteration 144 | return (labels, single_ids, mulit_ids, num_ids, mulit_length) 145 | 146 | 147 | class Model(object): 148 | def __init__(self, hparams): 149 | self.f1 = hparams.aid.copy() 150 | self.f2 = hparams.user.copy() 151 | self.batch_norm_decay = 0.9 152 | self.single_ids = {} 153 | self.num_ids = {} 154 | self.mulit_ids = {} 155 | self.mulit_mask = {} 156 | self.emb_v1 = {} 157 | self.emb_v2 = {} 158 | self.emb_combine_aid_v2 = {} 159 | self.norm_num = {} 160 | self.cross_params = [] 161 | self.layer_params = [] 162 | self.length = {} 163 | self.bias = tf.Variable(tf.truncated_normal( 164 | shape=[1], mean=0.0, stddev=0.0001), name='bias') 165 | self.use_dropout = tf.placeholder(tf.bool) 166 | initializer = tf.random_uniform_initializer(-0.1, 0.1) 167 | self.feature_all_length = len( 168 | hparams.single_features)+len(hparams.mutil_features) 169 | feature_all_length = self.feature_all_length 170 | self.label = tf.placeholder(shape=(None), dtype=tf.float32) 171 | norm = ['uid_count'] 172 | for s in hparams.num_features: 173 | self.num_ids[s] = tf.placeholder(shape=(None,), dtype=tf.float32) 174 | if s in norm: 175 | self.norm_num[s] = self.batch_norm_layer( 176 | tf.reshape(self.num_ids[s], [-1, 1]), self.use_dropout, s) 177 | else: 178 | self.norm_num[s] = self.num_ids[s][:, None] 179 | 180 | for s in hparams.single_features: 181 | self.single_ids[s] = tf.placeholder(shape=(None,), dtype=tf.int32) 182 | self.emb_v1[s] = tf.Variable(tf.truncated_normal( 183 | shape=[len(hparams.dict[s])+2, 1], mean=0.0, stddev=0.0001), name='emb_v1_'+s) 184 | if s in self.f1: 185 | self.emb_v2[s] = tf.Variable(tf.truncated_normal(shape=[len( 186 | hparams.dict[s])+2, len(self.f2), hparams.k], mean=0.0, stddev=0.0001), name='emb_v2_'+s) 187 | elif s in self.f2: 188 | self.emb_v2[s] = tf.Variable(tf.truncated_normal(shape=[len( 189 | hparams.dict[s])+2, len(self.f1), hparams.k], mean=0.0, stddev=0.0001), name='emb_v2_'+s) 190 | 191 | for s in hparams.mutil_features: 192 | self.mulit_ids[s] = tf.placeholder( 193 | shape=(None, None), dtype=tf.int32) 194 | self.length[s] = tf.placeholder(shape=(None,), dtype=tf.int32) 195 | self.mulit_mask[s] = tf.sequence_mask( 196 | self.length[s], 100, dtype=tf.float32) 197 | self.emb_v1[s] = tf.get_variable( 198 | shape=[len(hparams.dict[s])+2, 1], initializer=initializer, name='emb_v1_'+s) 199 | if s in self.f1: 200 | self.emb_v2[s] = tf.Variable(tf.truncated_normal(shape=[len( 201 | hparams.dict[s])+2, len(self.f2), hparams.k], mean=0.0, stddev=0.0001), name='emb_v2_'+s) 202 | elif s in self.f2: 203 | self.emb_v2[s] = tf.Variable(tf.truncated_normal(shape=[len( 204 | hparams.dict[s])+2, len(self.f1), hparams.k], mean=0.0, stddev=0.0001), name='emb_v2_'+s) 
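        # Editor's note (descriptive comment only): these are the field-aware FFM embeddings.
        # Every ad-side feature in f1 stores one k-dim vector per user-side field in f2
        # (shape [vocab+2, len(f2), k]) and vice versa, so build_graph can form all
        # f1 x f2 second-order interactions by element-wise multiplying the two stacked tensors.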
205 | 206 | self.build_graph(hparams) 207 | self.optimizer(hparams) 208 | params = tf.trainable_variables() 209 | 210 | def build_graph(self, hparams): 211 | initializer = tf.random_uniform_initializer(-0.1, 0.1) 212 | # lr 213 | emb_inp_v1 = {} 214 | for s in hparams.single_features: 215 | emb_inp_v1[s] = tf.gather(self.emb_v1[s], self.single_ids[s]) 216 | for s in hparams.mutil_features: 217 | emb_inp_v1[s] = tf.reduce_sum(tf.gather(self.emb_v1[s], self.mulit_ids[s]) * 218 | self.mulit_mask[s][:, :, None], axis=1)/tf.cast(self.length[s], tf.float32)[:, None] 219 | 220 | emb_inp_v1 = tf.concat([emb_inp_v1[s] for s in emb_inp_v1], -1) 221 | w1 = tf.reduce_sum(emb_inp_v1, [-1]) 222 | 223 | # poly 224 | emb_inp_v2 = {} 225 | for s in hparams.single_features: 226 | emb_inp_v2[s] = tf.gather(self.emb_v2[s], self.single_ids[s]) 227 | 228 | for s in hparams.mutil_features: 229 | emb_inp_v2[s] = tf.reduce_sum(tf.gather(self.emb_v2[s], self.mulit_ids[s])*self.mulit_mask[s] 230 | [:, :, None, None], axis=1) / tf.cast(self.length[s], tf.float32)[:, None, None] 231 | 232 | x = [[], []] 233 | 234 | for s in self.f1: 235 | x[0].append(emb_inp_v2[s][:, None, :, :]) 236 | for s in self.f2: 237 | x[1].append(emb_inp_v2[s][:, :, None, :]) 238 | x[0] = tf.concat(x[0], 1) 239 | x[1] = tf.concat(x[1], 2) 240 | 241 | emb_rep_v = x[0]*x[1] 242 | emb_rep_v = tf.reshape( 243 | emb_rep_v, [-1, len(self.f1)*len(self.f2)*hparams.k]) 244 | emb_rep = emb_rep_v 245 | temp = [] 246 | temp.append(emb_rep) 247 | for s in hparams.num_features: 248 | temp.append(self.norm_num[s]) 249 | emb_rep = tf.concat(temp, -1) 250 | 251 | input_size = len(self.f1)*len(self.f2)*hparams.k + \ 252 | len(hparams.num_features) 253 | glorot = np.sqrt(2.0 / (input_size + hparams.hidden_size[0])) 254 | 255 | W_1 = tf.get_variable("W_1", [emb_rep.get_shape( 256 | )[-1], hparams.hidden_size[0]], initializer=initializer) 257 | b_1 = tf.get_variable( 258 | "b_1", [1, hparams.hidden_size[0]], initializer=initializer) 259 | 260 | hidden_outputs_1 = tf.tensordot(emb_rep, W_1, [[-1], [0]]) 261 | hidden_outputs_1 = self.batch_norm_layer( 262 | hidden_outputs_1, self.use_dropout, 'train_1') 263 | hidden_outputs_1 = tf.nn.relu(hidden_outputs_1) 264 | 265 | W_2 = tf.get_variable( 266 | "W_2", [hparams.hidden_size[0], hparams.hidden_size[1]], initializer=initializer) 267 | b_2 = tf.get_variable( 268 | "b_2", [1, hparams.hidden_size[1]], initializer=initializer) 269 | 270 | hidden_outputs_2 = tf.tensordot(hidden_outputs_1, W_2, [[-1], [0]]) 271 | hidden_outputs_2 = self.batch_norm_layer( 272 | hidden_outputs_2, self.use_dropout, 'train_2') 273 | hidden_outputs_2 = tf.nn.relu(hidden_outputs_2) 274 | 275 | glorot = np.sqrt(2.0 / (hparams.hidden_size[1] + 1)) 276 | W_3 = tf.get_variable( 277 | "W_3", [hparams.hidden_size[1], 1], initializer=initializer) 278 | b_3 = tf.Variable(tf.constant(-3.5), dtype=np.float32) 279 | 280 | w_3 = tf.tensordot(hidden_outputs_2, W_3, [[-1], [0]])+b_3 281 | 282 | score = w1+w_3[:, 0] 283 | self.prob = tf.sigmoid(score) 284 | logit_1 = tf.log(self.prob) 285 | logit_0 = tf.log(1-self.prob) 286 | self.loss = -tf.reduce_mean(self.label*logit_1+(1-self.label)*logit_0) 287 | self.cost = -tf.reduce_mean(self.label*logit_1+(1-self.label)*logit_0) 288 | self.saver_ffm = tf.train.Saver() 289 | 290 | def optimizer(self, hparams): 291 | self.lrate = tf.Variable(hparams.learning_rate, trainable=False) 292 | if hparams.optimizer == "sgd": 293 | opt = tf.train.GradientDescentOptimizer(self.lrate) 294 | elif hparams.optimizer == "adam": 295 | 
opt = tf.train.AdamOptimizer( 296 | self.lrate, beta1=0.9, beta2=0.999, epsilon=1e-8) 297 | elif hparams.optimizer == "ada": 298 | opt = tf.train.AdagradOptimizer( 299 | learning_rate=self.lrate, initial_accumulator_value=1e-8) 300 | params = tf.trainable_variables() 301 | 302 | gradients = tf.gradients( 303 | self.cost, params, colocate_gradients_with_ops=True) 304 | clipped_grads, gradient_norm = tf.clip_by_global_norm(gradients, 5.0) 305 | self.grad_norm = gradient_norm 306 | self.update = opt.apply_gradients(zip(clipped_grads, params)) 307 | 308 | def dey_lrate(self, sess, lrate): 309 | sess.run(tf.assign(self.lrate, lrate)) 310 | 311 | def train(self, sess, iterator): 312 | data = iterator.next() 313 | self.maxlen = len(data[2]['interest2'][0]) 314 | dic = {} 315 | for s in self.single_ids: 316 | dic[self.single_ids[s]] = data[1][s] 317 | 318 | for s in self.mulit_ids: 319 | dic[self.mulit_ids[s]] = data[2][s] 320 | dic[self.length[s]] = data[4][s] 321 | 322 | for s in self.num_ids: 323 | dic[self.num_ids[s]] = data[3][s] 324 | 325 | dic[self.use_dropout] = True 326 | dic[self.label] = data[0] 327 | 328 | return sess.run([self.cost, self.update, self.grad_norm], feed_dict=dic) 329 | 330 | def infer(self, sess, iterator, label, aid): 331 | data = iterator.next() 332 | self.maxlen = len(data[2]['interest1'][0]) 333 | label.extend(data[0]) 334 | aid.extend(data[1]['aid']) 335 | dic = {} 336 | for s in self.single_ids: 337 | dic[self.single_ids[s]] = data[1][s] 338 | 339 | for s in self.mulit_ids: 340 | dic[self.mulit_ids[s]] = data[2][s] 341 | dic[self.length[s]] = data[4][s] 342 | for s in self.num_ids: 343 | dic[self.num_ids[s]] = data[3][s] 344 | 345 | dic[self.use_dropout] = False 346 | return sess.run(self.prob, feed_dict=dic) 347 | 348 | def batch_norm_layer(self, x, train_phase, scope_bn): 349 | bn_train = batch_norm(x, decay=self.batch_norm_decay, center=True, scale=True, updates_collections=None, 350 | is_training=True, reuse=None, trainable=True, scope=scope_bn) 351 | bn_inference = batch_norm(x, decay=self.batch_norm_decay, center=True, scale=True, updates_collections=None, 352 | is_training=False, reuse=True, trainable=True, scope=scope_bn) 353 | z = tf.cond(train_phase, lambda: bn_train, lambda: bn_inference) 354 | return z 355 | 356 | 357 | def train(hparams): 358 | tf.set_random_seed(2018) 359 | random.seed(2018) 360 | train_iterator = TextIterator(hparams, mode="train") 361 | dev_iterator = TextIterator( 362 | hparams, mode="dev", batch_size=hparams.evl_batch_size) 363 | test_iterator = TextIterator( 364 | hparams, mode="test", batch_size=hparams.evl_batch_size) 365 | model = Model(hparams) 366 | config_proto = tf.ConfigProto( 367 | log_device_placement=0, allow_soft_placement=0) 368 | config_proto.gpu_options.allow_growth = True 369 | sess = tf.Session(config=config_proto) 370 | sess.run(tf.global_variables_initializer()) 371 | 372 | global_step = 0 373 | train_loss = 0 374 | train_norm = 0 375 | best_loss = 0 376 | dey_cont = 0 377 | pay_cont = 0 378 | epoch = False 379 | epoch_cont = 0 380 | start_time = time.time() 381 | if hparams.mode != 'train': 382 | model.saver_ffm.restore(sess, os.path.join( 383 | hparams.path, 'model_'+str(hparams.sub_name))) 384 | else: 385 | # if os.path.exists(os.path.join(hparams.path, 'model_'+str(hparams.idx))): 386 | # model.saver_ffm.restore(sess,os.path.join(hparams.path, 'model_'+str(hparams.idx))) 387 | while True: 388 | try: 389 | cost, _, norm = model.train(sess, train_iterator) 390 | except StopIteration: 391 | continue 392 | 
global_step += 1 393 | train_loss += cost 394 | train_norm += norm 395 | if global_step % hparams.num_display_steps == 0 or global_step >= hparams.epoch: 396 | info = {} 397 | info['learning_rate'] = hparams.learning_rate 398 | info["train_ppl"] = train_loss / \ 399 | hparams.num_display_steps 400 | info["avg_grad_norm"] = train_norm / \ 401 | hparams.num_display_steps 402 | train_loss = 0 403 | train_norm = 0 404 | print_step_info(" ", global_step, info) 405 | if global_step % hparams.num_eval_steps == 0 or global_step >= hparams.epoch: 406 | epoch = False 407 | preds = [] 408 | label = [] 409 | aid = [] 410 | while True: 411 | try: 412 | pred = model.infer( 413 | sess, dev_iterator, label, aid) 414 | preds += list(pred) 415 | except StopIteration: 416 | break 417 | res = {} 418 | for i in range(len(aid)): 419 | if aid[i] not in res: 420 | res[aid[i]] = {} 421 | res[aid[i]]['label'] = [] 422 | res[aid[i]]['pred'] = [] 423 | res[aid[i]]['label'].append(label[i]+1) 424 | res[aid[i]]['pred'].append(preds[i]) 425 | auc = [] 426 | for u in res: 427 | fpr, tpr, thresholds = metrics.roc_curve( 428 | res[u]['label'], res[u]['pred'], pos_label=2) 429 | loss_ = metrics.auc(fpr, tpr) 430 | if np.isnan(loss_): 431 | continue 432 | gloab_auc[u] = loss_ 433 | auc.append(loss_) 434 | loss_ = np.mean(auc) 435 | if best_loss <= loss_: 436 | model.saver_ffm.save(sess, os.path.join( 437 | hparams.path, 'model_'+str(hparams.sub_name))) 438 | best_loss = loss_ 439 | T = (time.time()-start_time) 440 | start_time = time.time() 441 | utils.print_out( 442 | "# Epcho-time %.2fs Eval AUC %.6f. Best AUC %.6f." % (T, loss_, best_loss)) 443 | else: 444 | utils.print_out( 445 | "# Epcho-time %.2fs Eval AUC %.6f. Best AUC %.6f." % (T, loss_, best_loss)) 446 | model.saver_ffm.restore(sess, os.path.join( 447 | hparams.path, 'model_'+str(hparams.sub_name))) 448 | 449 | if global_step >= hparams.epoch: 450 | model.saver_ffm.restore(sess, os.path.join( 451 | hparams.path, 'model_'+str(hparams.sub_name))) 452 | break 453 | print("Dev inference ...") 454 | preds = [] 455 | label = [] 456 | aid = [] 457 | while True: 458 | try: 459 | pred = model.infer(sess, dev_iterator, label, aid) 460 | preds += list(pred) 461 | except StopIteration: 462 | break 463 | data = [] 464 | for i in range(len(preds)): 465 | data.append([aid[i], label[i], preds[i]]) 466 | df = pd.DataFrame(data) 467 | df.columns = ['aid', 'label', 'score'] 468 | df.to_csv('submission_dev_' + 469 | str(hparams.sub_name)+'.csv', index=False) 470 | print('Dev inference done!') 471 | res = {} 472 | for i in range(len(aid)): 473 | if aid[i] not in res: 474 | res[aid[i]] = {} 475 | res[aid[i]]['label'] = [] 476 | res[aid[i]]['pred'] = [] 477 | res[aid[i]]['label'].append(label[i]+1) 478 | res[aid[i]]['pred'].append(preds[i]) 479 | auc = [] 480 | for u in res: 481 | fpr, tpr, thresholds = metrics.roc_curve( 482 | res[u]['label'], res[u]['pred'], pos_label=2) 483 | loss_ = metrics.auc(fpr, tpr) 484 | if np.isnan(loss_): 485 | continue 486 | auc.append(loss_) 487 | loss_ = np.mean(auc) 488 | print("Dev auc:", loss_) 489 | print("Test inference ...") 490 | preds = [] 491 | label = [] 492 | aid = [] 493 | while True: 494 | try: 495 | pred = model.infer(sess, test_iterator, label, aid) 496 | preds += list(pred) 497 | except StopIteration: 498 | break 499 | print('Test inference done!') 500 | return preds 501 | -------------------------------------------------------------------------------- /src/train.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import tensorflow as tf 6 | import utils 7 | import nffm 8 | import os 9 | os.environ["CUDA_DEVICE_ORDER"] = 'PCI_BUS_ID' 10 | os.environ["CUDA_VISIBLE_DEVICES"] = '0' 11 | 12 | 13 | def create_hparams(): 14 | return tf.contrib.training.HParams( 15 | k=8, 16 | batch_size=4096, 17 | optimizer="adam", 18 | learning_rate=0.0002, 19 | num_display_steps=100, 20 | num_eval_steps=1000, 21 | l2=0.000002, 22 | hidden_size=[256, 128], 23 | evl_batch_size=4096, 24 | all_process=1, 25 | idx=0, 26 | epoch=int(44628906//4096), 27 | mode='train', 28 | data_path='dataset/', 29 | sub_name='nffm', 30 | single_features=['aid', 'advertiserId', 'campaignId', 'creativeId', 'creativeSize', 'adCategoryId', 'productId', 'productType', 'age', 'gender', 31 | 'education', 'consumptionAbility', 'LBS', 'carrier', 'house', 'uid_aid_nunique', 'aid_uid_nunique', 'campaignId_aid_nunique', 'uid_convert'], 32 | mutil_features=['interest1', 'interest2', 'interest3', 'interest4', 'interest5', 'kw1', 'kw2', 'kw3', 'topic1', 'topic2', 'topic3', 'appIdAction', 33 | 'appIdInstall', 'marriageStatus', 'ct', 'os', 'pos_aid', 'neg_aid'] 34 | ) 35 | 36 | 37 | hparams = create_hparams() 38 | hparams.path = '../model/' 39 | 40 | hparams.aid = ['aid', 'advertiserId', 'campaignId', 'creativeId', 'creativeSize', 'adCategoryId', 'productId', 'productType', 'pos_aid', 41 | 'neg_aid', 'aid_uid_nunique', 'campaignId_aid_nunique'] 42 | hparams.user = ['age', 'gender', 'education', 'consumptionAbility', 'LBS', 'carrier', 'house', 'interest1', 'interest2', 'interest3', 'interest4', 'interest5', 43 | 'kw1', 'kw2', 'kw3', 'topic1', 'topic2', 'topic3', 'appIdAction', 'appIdInstall', 'marriageStatus', 'ct', 'os', 'uid_aid_nunique', 'uid_convert'] 44 | hparams.num_features = [] 45 | preds = nffm.train(hparams) 46 | 47 | 48 | test2_df = pd.read_csv('input/test2.csv') 49 | test2_df['score'] = preds 50 | test2_df['score'] = test2_df['score'].apply(lambda x: round(x, 4)) 51 | test2_df[['aid', 'uid', 'score']].to_csv( 52 | '../submission_'+str(hparams.sub_name)+'.csv', index=False) 53 | -------------------------------------------------------------------------------- /src/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # ============================================================================== 15 | 16 | """Generally useful utility functions.""" 17 | from __future__ import print_function 18 | 19 | import codecs 20 | import collections 21 | import json 22 | import math 23 | import os 24 | import sys 25 | import time 26 | 27 | import numpy as np 28 | import tensorflow as tf 29 | 30 | 31 | def check_tensorflow_version(): 32 | min_tf_version = "1.4.0-dev20171024" 33 | if tf.__version__ < min_tf_version: 34 | raise EnvironmentError("Tensorflow version must >= %s" % min_tf_version) 35 | 36 | 37 | def safe_exp(value): 38 | """Exponentiation with catching of overflow error.""" 39 | try: 40 | ans = math.exp(value) 41 | except OverflowError: 42 | ans = float("inf") 43 | return ans 44 | 45 | 46 | def print_time(s, start_time): 47 | """Take a start time, print elapsed duration, and return a new time.""" 48 | print("%s, time %ds, %s." % (s, (time.time() - start_time), time.ctime())) 49 | sys.stdout.flush() 50 | return time.time() 51 | 52 | 53 | def print_out(s, f=None, new_line=True): 54 | """Similar to print but with support to flush and output to a file.""" 55 | if isinstance(s, bytes): 56 | s = s.decode("utf-8") 57 | 58 | if f: 59 | f.write(s.encode("utf-8")) 60 | if new_line: 61 | f.write(b"\n") 62 | 63 | # stdout 64 | out_s = s.encode("utf-8") 65 | if not isinstance(out_s, str): 66 | out_s = out_s.decode("utf-8") 67 | print(out_s, end="", file=sys.stdout) 68 | 69 | if new_line: 70 | sys.stdout.write("\n") 71 | sys.stdout.flush() 72 | 73 | 74 | def print_hparams(hparams, skip_patterns=None, header=None): 75 | """Print hparams, can skip keys based on pattern.""" 76 | if header: 77 | print_out("%s" % header) 78 | values = hparams.values() 79 | for key in sorted(values.keys()): 80 | if not skip_patterns or all( 81 | [skip_pattern not in key for skip_pattern in skip_patterns]): 82 | print_out(" %s=%s" % (key, str(values[key]))) 83 | 84 | 85 | def load_hparams(model_dir): 86 | """Load hparams from an existing model directory.""" 87 | hparams_file = os.path.join(model_dir, "hparams") 88 | if tf.gfile.Exists(hparams_file): 89 | print_out("# Loading hparams from %s" % hparams_file) 90 | with codecs.getreader("utf-8")(tf.gfile.GFile(hparams_file, "rb")) as f: 91 | try: 92 | hparams_values = json.load(f) 93 | hparams = tf.contrib.training.HParams(**hparams_values) 94 | except ValueError: 95 | print_out(" can't load hparams file") 96 | return None 97 | return hparams 98 | else: 99 | return None 100 | 101 | 102 | def maybe_parse_standard_hparams(hparams, hparams_path): 103 | """Override hparams values with existing standard hparams config.""" 104 | if not hparams_path: 105 | return hparams 106 | 107 | if tf.gfile.Exists(hparams_path): 108 | print_out("# Loading standard hparams from %s" % hparams_path) 109 | with tf.gfile.GFile(hparams_path, "r") as f: 110 | hparams.parse_json(f.read()) 111 | 112 | return hparams 113 | 114 | 115 | def save_hparams(out_dir, hparams): 116 | """Save hparams.""" 117 | hparams_file = os.path.join(out_dir, "hparams") 118 | print_out(" saving hparams to %s" % hparams_file) 119 | with codecs.getwriter("utf-8")(tf.gfile.GFile(hparams_file, "wb")) as f: 120 | f.write(hparams.to_json()) 121 | 122 | 123 | def debug_tensor(s, msg=None, summarize=10): 124 | """Print the shape and value of a tensor at test time. 
Return a new tensor.""" 125 | if not msg: 126 | msg = s.name 127 | return tf.Print(s, [tf.shape(s), s], msg + " ", summarize=summarize) 128 | 129 | 130 | def add_summary(summary_writer, global_step, tag, value): 131 | """Add a new summary to the current summary_writer. 132 | Useful to log things that are not part of the training graph, e.g., tag=BLEU. 133 | """ 134 | summary = tf.Summary(value=[tf.Summary.Value(tag=tag, simple_value=value)]) 135 | summary_writer.add_summary(summary, global_step) 136 | 137 | 138 | def get_config_proto(log_device_placement=False, allow_soft_placement=True, 139 | num_intra_threads=0, num_inter_threads=0): 140 | # GPU options: 141 | # https://www.tensorflow.org/versions/r0.10/how_tos/using_gpu/index.html 142 | config_proto = tf.ConfigProto( 143 | log_device_placement=log_device_placement, 144 | allow_soft_placement=allow_soft_placement) 145 | config_proto.gpu_options.allow_growth = True 146 | 147 | # CPU threads options 148 | if num_intra_threads: 149 | config_proto.intra_op_parallelism_threads = num_intra_threads 150 | if num_inter_threads: 151 | config_proto.inter_op_parallelism_threads = num_inter_threads 152 | 153 | return config_proto 154 | 155 | 156 | def format_text(words): 157 | """Convert a sequence words into sentence.""" 158 | if (not hasattr(words, "__len__") and # for numpy array 159 | not isinstance(words, collections.Iterable)): 160 | words = [words] 161 | return b" ".join(words) 162 | 163 | 164 | def format_bpe_text(symbols, delimiter=b"@@"): 165 | """Convert a sequence of bpe words into sentence.""" 166 | words = [] 167 | word = b"" 168 | if isinstance(symbols, str): 169 | symbols = symbols.encode() 170 | delimiter_len = len(delimiter) 171 | for symbol in symbols: 172 | if len(symbol) >= delimiter_len and symbol[-delimiter_len:] == delimiter: 173 | word += symbol[:-delimiter_len] 174 | else: # end of a word 175 | word += symbol 176 | words.append(word) 177 | word = b"" 178 | return b" ".join(words) 179 | 180 | 181 | def format_spm_text(symbols): 182 | """Decode a text in SPM (https://github.com/google/sentencepiece) format.""" 183 | return u"".join(format_text(symbols).decode("utf-8").split()).replace( 184 | u"\u2581", u" ").strip().encode("utf-8") 185 | --------------------------------------------------------------------------------