├── README.md
├── run_nn.sh
└── src
    ├── __pycache__
    │   └── nffm.cpython-35.pyc
    ├── input
    │   └── README.md
    ├── load_vowpal.py
    ├── make_dataset.py
    ├── make_feature.py
    ├── nffm.py
    ├── train.py
    └── utils.py

/README.md:
--------------------------------------------------------------------------------
1 | ## Code
2 | Rank 10 code for the 2018 Tencent Advertising Algorithm Competition: the deep-learning part.
3 | 
4 | ## Environment
5 | 
6 | - System environment:
7 |     - OS: Ubuntu 16.04 LTS
8 |     - Hardware: 40-core CPU, 128 GB RAM
9 |     - GPU: TITAN Xp, 12 GB memory
10 | 
11 | - Software environment:
12 |     - Python: 3.6.4 (conda 4.5.4)
13 |     - Tensorflow: GPU version 1.7 (built from source), NVIDIA-SMI 390.48, cuda_9.1.85_387.26
14 |     - Other packages: pandas (0.22.0), numpy (1.14.0), scipy (1.0.0), scikit-learn (0.19.1), tqdm (4.23.3)
15 | 
16 | ## Run Steps
17 | - Preprocessing
18 |     - Load the userFeature data: `python3 load_vowpal.py`
19 | - Feature generation
20 |     - Generate the global uid/aid statistical features: `python3 make_feature.py`
21 | - Run the model
22 |     - Train the NFFM model: `python3 train.py`
23 | 
24 | ## Feature Engineering
25 | 
26 | ### Features Used
27 | - Basic features: all raw user and ad features
28 | - Statistical features: uid_aid_nunique (number of distinct aids per uid, equal-frequency binned), aid_uid_nunique (number of distinct uids per aid, equal-frequency binned), campaignId_aid_nunique (number of distinct aids per campaignId, equal-frequency binned), pos_aid (each user's positive aids in the training set), neg_aid (each user's negative aids in the training set), user_convert (user conversion rate)
29 | 
30 | ### Feature Generation
31 | - Global statistical features:
32 |     - Extracted with groupby, e.g. `ad_Feature.groupby(['campaignId']).aid.nunique()`
33 |     - Discretization: binned at the percentiles [0, 20, 35, 50, 65, 85, 100]
34 | - Conversion-rate feature: the user's conversion rate on the training set; to prevent overfitting, it is computed with the label of the current row removed
35 | - Positive/negative aid features:
36 |     - Build a uid: aid-label dictionary on the training set
37 |     - Reorganize it into uid: [aid-pos, aid-neg], where aid-pos is the list of aids with label 1 and aid-neg the list of aids with label -1
38 |     - Test-set feature: join the lists onto the test set by uid directly
39 |     - Training-set feature: join the lists onto the training set by uid, then remove the current row's aid from aid-pos / aid-neg
40 |     - This yields the multi-valued positive/negative aid features
41 | 
42 | ## Model Structure
43 | The model uses the `nffm-v3` code that 郭达雅 (Guo Daya) open-sourced in the competition group, i.e. a deep FFM model.
44 | ### Architecture
45 | - FFM part: linear terms + second-order field-aware interactions
46 | - Deep part: two hidden layers
47 | 
48 | ### Parameters
49 | - Batch size: 4096
50 | - Epochs: 1
51 | - Hidden layers: 256, 128
52 | - Optimizer: adam
53 | - Learning rate: 0.0002
54 | - L2 regularization: 0.000002
55 | - Embedding size: 8
56 | - Random seed: 2018
57 | 
58 | ## Acknowledgements
59 | Thanks to 郭达雅 (Guo Daya) for the [open-source model](https://github.com/guoday/Tencent2018_Lookalike_Rank7th)
60 | 
--------------------------------------------------------------------------------
/run_nn.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | printf '========== Generate NN Result ==========\n'
3 | CURDIR="`pwd`"/"`dirname $0`"
4 | printf "\nCurrent Path:$CURDIR \n"
5 | 
6 | train_file=${CURDIR}"/src/input/train.csv"
7 | test1_file=${CURDIR}"/src/input/test1.csv"
8 | test2_file=${CURDIR}"/src/input/test2.csv"
9 | ad_feature=${CURDIR}"/src/input/adFeature.csv"
10 | user_file=${CURDIR}"/src/input/userFeature.data"
11 | user_feature=${CURDIR}"/src/input/userFeature.csv"
12 | dataset_dir=${CURDIR}"/src/dataset/"
13 | train_dir=${CURDIR}"/src/dataset/train/"
14 | valid_dir=${CURDIR}"/src/dataset/dev/"
15 | test2_dir=${CURDIR}"/src/dataset/test/"
16 | train_aid_fea=${dataset_dir}"/train_uid_aid_bin.csv"
17 | fea_dict=${dataset_dir}"/dic.pkl"
18 | sub_file=${CURDIR}"/submission_nffm.csv"
19 | 
20 | printf '\nStep1: PreProcess UserFeature...\n'
21 | if [ ! -f $user_feature ]; then
22 |     cd src
23 |     python3 load_vowpal.py
24 |     printf 'Save to input/userFeature.csv\n'
25 |     cd ..
26 | else
27 |     printf 'UserFeature exists, Skip this step!\n'
28 | fi
29 | 
30 | printf '\nStep2: Make Features...\n'
31 | if [ ! -f $train_aid_fea ]; then
32 |     cd src
33 |     python3 make_feature.py
34 |     cd ..
35 | else
36 |     printf 'Feature exists, Skip this step!\n'
37 | fi
38 | 
39 | printf '\nStep3: Make Dataset...\n'
40 | if [ ! -f $fea_dict ]; then
41 |     cd src
42 |     python3 make_dataset.py
43 |     cd ..
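    # Editor's note (descriptive comment only): make_dataset.py merges train/test2 with
    # adFeature, userFeature and the features built in Step 2, splits off a 10% dev set,
    # and writes one text file per feature under src/dataset/{train,dev,test}/ plus the
    # dic.pkl vocabulary that this guard checks.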
44 | else
45 |     printf 'Dataset exists, Skip this step!\n'
46 | fi
47 | 
48 | printf '\nStep4: Train Model and Predict Result...\n'
49 | if [ ! -f $sub_file ]; then
50 |     cd src
51 |     python3 train.py
52 |     cd ..
53 | else
54 |     printf 'Train Done, Skip this step!\n'
55 | fi
--------------------------------------------------------------------------------
/src/__pycache__/nffm.cpython-35.pyc:
--------------------------------------------------------------------------------
 https://raw.githubusercontent.com/keyunluo/Tencent2018_Lookalike_Rank10th/ae9227541c472cd0dc91a494369e7c795a31dcae/src/__pycache__/nffm.cpython-35.pyc
--------------------------------------------------------------------------------
/src/input/README.md:
--------------------------------------------------------------------------------
1 | ## Place the raw data here
2 | 
3 | - adFeature.csv
4 | - test1.csv
5 | - test2.csv
6 | - train.csv
7 | - userFeature.data (userFeature.csv is generated automatically by the preprocessing step)
--------------------------------------------------------------------------------
/src/load_vowpal.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | from csv import DictWriter
4 | from tqdm import tqdm
5 | import mmap
6 | 
7 | 
8 | def get_file_lines(file_path):
9 |     '''
10 |     Count the number of lines in the file
11 |     '''
12 |     fp = open(file_path, "r+")
13 |     buf = mmap.mmap(fp.fileno(), 0)
14 |     lines = 0
15 |     while buf.readline():
16 |         lines += 1
17 |     return lines
18 | 
19 | 
20 | def process_vowpal(file_path, out_path):
21 |     '''
22 |     Process the user feature file
23 |     '''
24 |     headers = ['uid', 'age', 'gender', 'marriageStatus', 'education', 'consumptionAbility', 'LBS', 'interest1', 'interest2', 'interest3',
25 |                'interest4', 'interest5', 'kw1', 'kw2', 'kw3', 'topic1', 'topic2', 'topic3', 'appIdInstall', 'appIdAction', 'ct', 'os', 'carrier', 'house']
26 |     fo = open(out_path, 'wt')
27 |     writer = DictWriter(fo, fieldnames=headers, lineterminator='\n')
28 |     writer.writeheader()
29 | 
30 |     with open(file_path, 'rt') as f:
31 |         for line in tqdm(f, total=get_file_lines(file_path)):
32 |             feature_groups = line.strip().split('|')
33 |             fea_dict = {}
34 |             for feas in feature_groups:
35 |                 feas_split = feas.split(' ')
36 |                 fea_dict[feas_split[0]] = ' '.join(feas_split[1:])
37 |             writer.writerow(fea_dict)
38 |     fo.close()
39 | 
40 | 
41 | if __name__ == '__main__':
42 |     file_path = 'input/userFeature.data'
43 |     out_path = 'input/userFeature.csv'
44 |     process_vowpal(file_path, out_path)
45 | 
--------------------------------------------------------------------------------
/src/make_dataset.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | import numpy as np
4 | import pandas as pd
5 | import random
6 | import pickle
7 | import gc
8 | import os
9 | from collections import Counter, OrderedDict
10 | from sklearn.model_selection import train_test_split
11 | 
12 | 
13 | threshold = 1000
14 | random.seed(2018)
15 | 
16 | 
17 | def pre_data():
18 |     user_feature = pd.read_csv('input/userFeature.csv')
19 |     ad_Feature = pd.read_csv('input/adFeature.csv')
20 | 
21 |     train_df = pd.read_csv('input/train.csv')
22 |     test_df = pd.read_csv('input/test2.csv')
23 | 
24 |     train_df = pd.merge(train_df, ad_Feature, on='aid', how='left')
25 |     test_df = pd.merge(test_df, ad_Feature, on='aid', how='left')
26 |     train_df = pd.merge(train_df, user_feature, on='uid', how='left')
27 |     test_df = pd.merge(test_df, user_feature, on='uid', how='left')
28 | 
29 |     del user_feature
30 |     gc.collect()
31 | 
32 |     # ['pos_aid', 'neg_aid']
33 | 
train_aid_fea = pd.read_csv( 34 | 'dataset/train_neg_pos_aid.csv', usecols=['pos_aid', 'neg_aid']) 35 | test2_aid_fea = pd.read_csv( 36 | 'dataset/test2_neg_pos_aid.csv', usecols=['pos_aid', 'neg_aid']) 37 | 38 | # ['campaignId_aid_nunique', 'uid_aid_nunique', 'aid_uid_nunique', 'uid_count', 'uid_convert'] 39 | train_statistic_fea = pd.read_csv('dataset/train_uid_aid_bin.csv') 40 | test2_statistic_fea = pd.read_csv('dataset/test2_uid_aid_bin.csv') 41 | 42 | train_df = pd.concat( 43 | [train_df, train_aid_fea, train_statistic_fea], axis=1) 44 | test_df = pd.concat([test_df, test2_aid_fea, test2_statistic_fea], axis=1) 45 | 46 | train_df = train_df.fillna('0') 47 | test_df = test_df.fillna('0') 48 | 49 | gc.collect() 50 | 51 | train_df.loc[train_df['label'] == -1, 'label'] = 0 52 | test_df['label'] = -1 53 | 54 | train_df, dev_df = train_test_split( 55 | train_df, test_size=0.1, random_state=2018) 56 | 57 | np.save('dataset/train_df.index', np.array(train_df.index)) 58 | np.save('dataset/dev_df.index', np.array(dev_df.index)) 59 | 60 | return train_df, dev_df, test_df 61 | 62 | 63 | def output_label(train_df, dev_df, test_df): 64 | with open('dataset/dev/label', 'w') as f: 65 | for i in list(dev_df['label']): 66 | f.write(str(i)+'\n') 67 | with open('dataset/test/label', 'w') as f: 68 | for i in list(test_df['label']): 69 | f.write(str(i)+'\n') 70 | with open('dataset/train/label', 'w') as f: 71 | for i in list(train_df['label']): 72 | f.write(str(i)+'\n') 73 | 74 | 75 | def single_features(train_df, dev_df, test_df, word2index): 76 | single_ids_features = ['aid', 'advertiserId', 'campaignId', 'creativeId', 'creativeSize', 'adCategoryId', 'productId', 'productType', 'age', 77 | 'gender', 'education', 'consumptionAbility', 'LBS', 'carrier', 'house', 78 | 'campaignId_aid_nunique', 'uid_aid_nunique', 'aid_uid_nunique', 'uid_count', 'uid_convert'] 79 | 80 | for s in single_ids_features: 81 | print(s) 82 | cont = {} 83 | 84 | with open('dataset/train/'+str(s), 'w') as f: 85 | for line in list(train_df[s].values): 86 | f.write(str(line)+'\n') 87 | if str(line) not in cont: 88 | cont[str(line)] = 0 89 | cont[str(line)] += 1 90 | 91 | with open('dataset/dev/'+str(s), 'w') as f: 92 | for line in list(dev_df[s].values): 93 | f.write(str(line)+'\n') 94 | 95 | with open('dataset/test/'+str(s), 'w') as f: 96 | for line in list(test_df[s].values): 97 | f.write(str(line)+'\n') 98 | index = [] 99 | for k in cont: 100 | if s not in ['campaignId_aid_nunique', 'uid_aid_nunique', 'aid_uid_nunique', 'uid_count', 'uid_convert']: 101 | if cont[k] >= threshold: 102 | index.append(k) 103 | else: 104 | index.append(k) 105 | word2index[s] = {} 106 | for idx, val in enumerate(index): 107 | word2index[s][val] = idx+2 108 | print(s+' done!') 109 | 110 | 111 | def mutil_ids(train_df, dev_df, test_df, word2index): 112 | features_mutil = ['interest1', 'interest2', 'interest3', 'interest4', 'interest5', 'kw1', 'kw2', 'kw3', 'topic1', 113 | 'topic2', 'topic3', 'appIdAction', 'appIdInstall', 'marriageStatus', 'ct', 'os', 'pos_aid', 'neg_aid'] 114 | for s in features_mutil: 115 | print(s) 116 | cont = {} 117 | with open('dataset/train/'+str(s), 'w') as f: 118 | for lines in list(train_df[s].values): 119 | lines = str(lines) 120 | f.write(lines+'\n') 121 | for line in lines.split(): 122 | if line not in cont: 123 | cont[line] = 0 124 | cont[line] += 1 125 | 126 | with open('dataset/dev/'+str(s), 'w') as f: 127 | for line in list(dev_df[s].values): 128 | f.write(str(line)+'\n') 129 | 130 | with open('dataset/test/'+str(s), 'w') 
as f: 131 | for line in list(test_df[s].values): 132 | f.write(str(line)+'\n') 133 | index = [] 134 | for k in cont: 135 | if s not in ['pos_aid', 'neg_aid']: 136 | if cont[k] >= threshold: 137 | index.append(k) 138 | else: 139 | index.append(k) 140 | word2index[s] = {} 141 | for idx, val in enumerate(index): 142 | word2index[s][val] = idx+2 143 | print(s+' done!') 144 | 145 | 146 | if __name__ == '__main__': 147 | if os.path.exists('dataset/dic.pkl'): 148 | word2index = pickle.load(open('dataset/dic.pkl', 'rb')) 149 | else: 150 | word2index = {} 151 | print("Loading Data...") 152 | train_df, dev_df, test_df = pre_data() 153 | 154 | print("Output Label...") 155 | output_label(train_df, dev_df, test_df) 156 | 157 | print("Processing Single Feature...") 158 | single_features(train_df, dev_df, test_df, word2index) 159 | pickle.dump(word2index, open('dataset/dic.pkl', 'wb')) 160 | 161 | print("Processing Multiple Feature...") 162 | mutil_ids(train_df, dev_df, test_df, word2index) 163 | pickle.dump(word2index, open('dataset/dic.pkl', 'wb')) 164 | -------------------------------------------------------------------------------- /src/make_feature.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import numpy as np 4 | import pandas as pd 5 | from tqdm import tqdm 6 | from collections import defaultdict 7 | from sklearn.preprocessing import MinMaxScaler 8 | 9 | def gen_pos_neg_aid_fea(): 10 | train_data = pd.read_csv('input/train.csv') 11 | test2_data = pd.read_csv('input/test2.csv') 12 | 13 | train_user = train_data.uid.unique() 14 | 15 | # user-aid dict 16 | uid_dict = defaultdict(list) 17 | for row in tqdm(train_data.itertuples(), total=len(train_data)): 18 | uid_dict[row[2]].append([row[1], row[3]]) 19 | 20 | # user convert 21 | uid_convert = {} 22 | for uid in tqdm(train_user): 23 | pos_aid, neg_aid = [], [] 24 | for data in uid_dict[uid]: 25 | if data[1] > 0: 26 | pos_aid.append(data[0]) 27 | else: 28 | neg_aid.append(data[0]) 29 | uid_convert[uid] = [pos_aid, neg_aid] 30 | 31 | test2_neg_pos_aid = {} 32 | for row in tqdm(test2_data.itertuples(), total=len(test2_data)): 33 | aid = row[1] 34 | uid = row[2] 35 | if uid_convert.get(uid, []) == []: 36 | test2_neg_pos_aid[row[0]] = ['', '', -1] 37 | else: 38 | pos_aid, neg_aid = uid_convert[uid][0].copy(), uid_convert[uid][1].copy() 39 | convert = len(pos_aid) / (len(pos_aid) + len(neg_aid)) if (len(pos_aid) + len(neg_aid)) > 0 else -1 40 | test2_neg_pos_aid[row[0]] = [' '.join(map(str, pos_aid)), ' '.join(map(str, neg_aid)), convert] 41 | df_test2 = pd.DataFrame.from_dict(data=test2_neg_pos_aid, orient='index') 42 | df_test2.columns = ['pos_aid', 'neg_aid', 'uid_convert'] 43 | 44 | train_neg_pos_aid = {} 45 | for row in tqdm(train_data.itertuples(), total=len(train_data)): 46 | aid = row[1] 47 | uid = row[2] 48 | pos_aid, neg_aid = uid_convert[uid][0].copy(), uid_convert[uid][1].copy() 49 | if aid in pos_aid: 50 | pos_aid.remove(aid) 51 | if aid in neg_aid: 52 | neg_aid.remove(aid) 53 | convert = len(pos_aid) / (len(pos_aid) + len(neg_aid)) if (len(pos_aid) + len(neg_aid)) > 0 else -1 54 | train_neg_pos_aid[row[0]] = [' '.join(map(str, pos_aid)), ' '.join(map(str, neg_aid)), convert] 55 | 56 | df_train = pd.DataFrame.from_dict(data=train_neg_pos_aid, orient='index') 57 | df_train.columns = ['pos_aid', 'neg_aid', 'uid_convert'] 58 | 59 | df_train.to_csv("dataset/train_neg_pos_aid.csv", index=False) 60 | df_test2.to_csv("dataset/test2_neg_pos_aid.csv", index=False) 61 | 62 | def 
gen_uid_aid_fea():
63 |     '''
64 |     Load the data and extract global statistical features for aid and uid
65 |     '''
66 |     train_data = pd.read_csv('input/train.csv')
67 |     test1_data = pd.read_csv('input/test1.csv')
68 |     test2_data = pd.read_csv('input/test2.csv')
69 | 
70 |     ad_Feature = pd.read_csv('input/adFeature.csv')
71 | 
72 |     train_len = len(train_data)  # 45539700
73 |     test1_len = len(test1_data)
74 |     test2_len = len(test2_data)  # 11727304
75 | 
76 |     ad_Feature = pd.merge(ad_Feature, ad_Feature.groupby(['campaignId']).aid.nunique().reset_index(
77 |     ).rename(columns={'aid': 'campaignId_aid_nunique'}), how='left', on='campaignId')
78 | 
79 |     df = pd.concat([train_data, test1_data, test2_data], axis=0)
80 |     df = pd.merge(df, df.groupby(['uid'])['aid'].nunique().reset_index().rename(
81 |         columns={'aid': 'uid_aid_nunique'}), how='left', on='uid')
82 | 
83 |     df = pd.merge(df, df.groupby(['aid'])['uid'].nunique().reset_index().rename(
84 |         columns={'uid': 'aid_uid_nunique'}), how='left', on='aid')
85 | 
86 |     df['uid_count'] = df.groupby('uid')['aid'].transform('count')
87 |     df = pd.merge(df, ad_Feature[['aid', 'campaignId_aid_nunique']], how='left', on='aid')
88 | 
89 |     fea_columns = ['campaignId_aid_nunique', 'uid_aid_nunique', 'aid_uid_nunique', 'uid_count', ]
90 | 
91 |     df[fea_columns].iloc[:train_len].to_csv('dataset/train_uid_aid.csv', index=False)
92 |     df[fea_columns].iloc[train_len: train_len+test1_len].to_csv('dataset/test1_uid_aid.csv', index=False)
93 |     df[fea_columns].iloc[-test2_len:].to_csv('dataset/test2_uid_aid.csv', index=False)
94 | 
95 | def digitize():
96 |     uid_aid_train = pd.read_csv('dataset/train_uid_aid.csv')
97 |     uid_aid_test1 = pd.read_csv('dataset/test1_uid_aid.csv')
98 |     uid_aid_test2 = pd.read_csv('dataset/test2_uid_aid.csv')
99 |     uid_aid_df = pd.concat([uid_aid_train, uid_aid_test1, uid_aid_test2], axis=0)
100 |     for col in range(3):
101 |         bins = []
102 |         for percent in [0, 20, 35, 50, 65, 85, 100]:
103 |             bins.append(np.percentile(uid_aid_df.iloc[:, col], percent))
104 |         uid_aid_train.iloc[:, col] = np.digitize(uid_aid_train.iloc[:, col], bins, right=True)
105 |         uid_aid_test1.iloc[:, col] = np.digitize(uid_aid_test1.iloc[:, col], bins, right=True)
106 |         uid_aid_test2.iloc[:, col] = np.digitize(uid_aid_test2.iloc[:, col], bins, right=True)
107 | 
108 |     count_bins = [1, 2, 4, 6, 8, 10, 16, 27, 50]
109 |     uid_aid_train.iloc[:, 3] = np.digitize(uid_aid_train.iloc[:, 3], count_bins, right=True)
110 |     uid_aid_test1.iloc[:, 3] = np.digitize(uid_aid_test1.iloc[:, 3], count_bins, right=True)
111 |     uid_aid_test2.iloc[:, 3] = np.digitize(uid_aid_test2.iloc[:, 3], count_bins, right=True)
112 | 
113 |     uid_convert_train = pd.read_csv("dataset/train_neg_pos_aid.csv", usecols=['uid_convert'])
114 |     uid_convert_test2 = pd.read_csv("dataset/test2_neg_pos_aid.csv", usecols=['uid_convert'])
115 | 
116 |     convert_bins = [-1, 0, 0.1, 0.3, 0.5, 0.7, 1]
117 |     uid_convert_train.iloc[:, 0] = np.digitize(uid_convert_train.iloc[:, 0], convert_bins, right=True)
118 |     uid_convert_test2.iloc[:, 0] = np.digitize(uid_convert_test2.iloc[:, 0], convert_bins, right=True)
119 | 
120 | 
121 |     uid_aid_train = pd.concat([uid_aid_train, uid_convert_train], axis=1)
122 |     uid_aid_test2 = pd.concat([uid_aid_test2, uid_convert_test2], axis=1)
123 | 
124 |     uid_aid_train.to_csv('dataset/train_uid_aid_bin.csv', index=False)
125 |     uid_aid_test2.to_csv('dataset/test2_uid_aid_bin.csv', index=False)
126 | 
127 | if __name__ == '__main__':
128 |     print("Make Feature...")
129 |     print("1. Generate pos_neg_aid_fea")
130 |     gen_pos_neg_aid_fea()
131 |     print("2. 
Generate uid_aid_fea") 132 | gen_uid_aid_fea() 133 | print("3. Digitize numerical feature") 134 | digitize() 135 | print("Make Feature Done") -------------------------------------------------------------------------------- /src/nffm.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import tensorflow as tf 4 | import utils 5 | from tensorflow.python.ops import lookup_ops 6 | from tensorflow.python.layers import core as layers_core 7 | from sklearn.metrics import log_loss 8 | import time 9 | import random 10 | import os 11 | from tensorflow.contrib.layers.python.layers import batch_norm as batch_norm 12 | import numpy as np 13 | from sklearn import metrics 14 | import pickle 15 | gloab_auc = {} 16 | import pandas as pd 17 | 18 | 19 | def print_step_info(prefix, global_step, info): 20 | utils.print_out( 21 | "%sstep %d lr %g logloss %.6f gN %.2f, %s" % 22 | (prefix, global_step, info["learning_rate"], 23 | info["train_ppl"], info["avg_grad_norm"], time.ctime())) 24 | 25 | 26 | class TextIterator: 27 | def __init__(self, hparams, mode, batch_size=None): 28 | self.single_file = {} 29 | self.mulit_file = {} 30 | self.num_file = {} 31 | 32 | for s in hparams.single_features: 33 | self.single_file[s] = open(hparams.data_path+'/'+mode+'/'+s, 'r') 34 | for s in hparams.mutil_features: 35 | self.mulit_file[s] = open(hparams.data_path+'/'+mode+'/'+s, 'r') 36 | for s in hparams.num_features: 37 | self.num_file[s] = open(hparams.data_path+'/'+mode+'/'+s, 'r') 38 | 39 | self.label_file = open(hparams.data_path+'/'+mode+'/label', 'r') 40 | self.word2index = pickle.load(open(hparams.data_path+'/dic.pkl', 'rb')) 41 | hparams.dict = self.word2index 42 | if batch_size: 43 | self.batch_size = batch_size 44 | else: 45 | self.batch_size = hparams.batch_size 46 | 47 | if 'train' in mode: 48 | self.idx = hparams.idx 49 | self.all_process = int(hparams.all_process) 50 | else: 51 | self.idx = 0 52 | self.all_process = 1 53 | self.hparams = hparams 54 | random.seed(2018) 55 | 56 | def reset(self): 57 | self.idx = (self.idx+1) % self.all_process 58 | for s in self.single_file: 59 | self.single_file[s].seek(0) 60 | for s in self.mulit_file: 61 | self.mulit_file[s].seek(0) 62 | for s in self.num_file: 63 | self.num_file[s].seek(0) 64 | self.label_file.seek(0) 65 | 66 | def next(self): 67 | single_ids = {} 68 | mulit_ids = {} 69 | labels = [] 70 | mulit_length = {} 71 | num_ids = {} 72 | for s in self.hparams.single_features: 73 | temp = [] 74 | for i in range(self.batch_size*self.all_process): 75 | ss = self.single_file[s].readline().strip() 76 | if ss == "": 77 | break 78 | if i % self.all_process != self.idx: 79 | continue 80 | try: 81 | if int(ss) in self.hparams.dict[s]: 82 | ss = int(ss) 83 | except: 84 | pass 85 | 86 | try: 87 | temp.append(self.hparams.dict[s][ss]) 88 | except: 89 | temp.append(0) 90 | single_ids[s] = temp 91 | 92 | for s in self.hparams.num_features: 93 | temp = [] 94 | for i in range(self.batch_size*self.all_process): 95 | ss = self.num_file[s].readline().strip() 96 | if ss == "": 97 | break 98 | if i % self.all_process != self.idx: 99 | continue 100 | temp.append(float(ss)) 101 | num_ids[s] = temp 102 | 103 | max_len = 0 104 | for s in self.hparams.mutil_features: 105 | temp = [] 106 | temp_len = [] 107 | for i in range(self.batch_size*self.all_process): 108 | mm = self.mulit_file[s].readline().strip() 109 | if mm == "": 110 | break 111 | if i % self.all_process != self.idx: 112 | continue 113 | t = [] 114 | t_len = [] 115 
| for m in mm.split(): 116 | try: 117 | t.append(self.hparams.dict[s][m]) 118 | except: 119 | t.append(0) 120 | temp.append(t) 121 | temp_len.append(len(t)) 122 | max_len = max(len(t), max_len) 123 | mulit_length[s] = temp_len 124 | mulit_ids[s] = temp 125 | max_len = 100 126 | for t in mulit_ids: 127 | for i in range(len(mulit_ids[t])): 128 | if len(mulit_ids[t][i]) <= max_len: 129 | mulit_ids[t][i] += (max_len-len(mulit_ids[t][i]))*[1] 130 | else: 131 | mulit_ids[t][i] = random.sample(mulit_ids[t][i], max_len) 132 | 133 | for i in range(self.batch_size*self.all_process): 134 | ss = self.label_file.readline().strip() 135 | if ss == "": 136 | break 137 | if i % self.all_process != self.idx: 138 | continue 139 | labels.append(int(ss)) 140 | 141 | if len(single_ids['aid']) == 0: 142 | self.reset() 143 | raise StopIteration 144 | return (labels, single_ids, mulit_ids, num_ids, mulit_length) 145 | 146 | 147 | class Model(object): 148 | def __init__(self, hparams): 149 | self.f1 = hparams.aid.copy() 150 | self.f2 = hparams.user.copy() 151 | self.batch_norm_decay = 0.9 152 | self.single_ids = {} 153 | self.num_ids = {} 154 | self.mulit_ids = {} 155 | self.mulit_mask = {} 156 | self.emb_v1 = {} 157 | self.emb_v2 = {} 158 | self.emb_combine_aid_v2 = {} 159 | self.norm_num = {} 160 | self.cross_params = [] 161 | self.layer_params = [] 162 | self.length = {} 163 | self.bias = tf.Variable(tf.truncated_normal( 164 | shape=[1], mean=0.0, stddev=0.0001), name='bias') 165 | self.use_dropout = tf.placeholder(tf.bool) 166 | initializer = tf.random_uniform_initializer(-0.1, 0.1) 167 | self.feature_all_length = len( 168 | hparams.single_features)+len(hparams.mutil_features) 169 | feature_all_length = self.feature_all_length 170 | self.label = tf.placeholder(shape=(None), dtype=tf.float32) 171 | norm = ['uid_count'] 172 | for s in hparams.num_features: 173 | self.num_ids[s] = tf.placeholder(shape=(None,), dtype=tf.float32) 174 | if s in norm: 175 | self.norm_num[s] = self.batch_norm_layer( 176 | tf.reshape(self.num_ids[s], [-1, 1]), self.use_dropout, s) 177 | else: 178 | self.norm_num[s] = self.num_ids[s][:, None] 179 | 180 | for s in hparams.single_features: 181 | self.single_ids[s] = tf.placeholder(shape=(None,), dtype=tf.int32) 182 | self.emb_v1[s] = tf.Variable(tf.truncated_normal( 183 | shape=[len(hparams.dict[s])+2, 1], mean=0.0, stddev=0.0001), name='emb_v1_'+s) 184 | if s in self.f1: 185 | self.emb_v2[s] = tf.Variable(tf.truncated_normal(shape=[len( 186 | hparams.dict[s])+2, len(self.f2), hparams.k], mean=0.0, stddev=0.0001), name='emb_v2_'+s) 187 | elif s in self.f2: 188 | self.emb_v2[s] = tf.Variable(tf.truncated_normal(shape=[len( 189 | hparams.dict[s])+2, len(self.f1), hparams.k], mean=0.0, stddev=0.0001), name='emb_v2_'+s) 190 | 191 | for s in hparams.mutil_features: 192 | self.mulit_ids[s] = tf.placeholder( 193 | shape=(None, None), dtype=tf.int32) 194 | self.length[s] = tf.placeholder(shape=(None,), dtype=tf.int32) 195 | self.mulit_mask[s] = tf.sequence_mask( 196 | self.length[s], 100, dtype=tf.float32) 197 | self.emb_v1[s] = tf.get_variable( 198 | shape=[len(hparams.dict[s])+2, 1], initializer=initializer, name='emb_v1_'+s) 199 | if s in self.f1: 200 | self.emb_v2[s] = tf.Variable(tf.truncated_normal(shape=[len( 201 | hparams.dict[s])+2, len(self.f2), hparams.k], mean=0.0, stddev=0.0001), name='emb_v2_'+s) 202 | elif s in self.f2: 203 | self.emb_v2[s] = tf.Variable(tf.truncated_normal(shape=[len( 204 | hparams.dict[s])+2, len(self.f1), hparams.k], mean=0.0, stddev=0.0001), name='emb_v2_'+s) 
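        # Editor's note (descriptive comment only): these are the field-aware FFM embeddings.
        # Every ad-side feature in f1 stores one k-dim vector per user-side field in f2
        # (shape [vocab+2, len(f2), k]) and vice versa, so build_graph can form all
        # f1 x f2 second-order interactions by element-wise multiplying the two stacked tensors.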
205 | 206 | self.build_graph(hparams) 207 | self.optimizer(hparams) 208 | params = tf.trainable_variables() 209 | 210 | def build_graph(self, hparams): 211 | initializer = tf.random_uniform_initializer(-0.1, 0.1) 212 | # lr 213 | emb_inp_v1 = {} 214 | for s in hparams.single_features: 215 | emb_inp_v1[s] = tf.gather(self.emb_v1[s], self.single_ids[s]) 216 | for s in hparams.mutil_features: 217 | emb_inp_v1[s] = tf.reduce_sum(tf.gather(self.emb_v1[s], self.mulit_ids[s]) * 218 | self.mulit_mask[s][:, :, None], axis=1)/tf.cast(self.length[s], tf.float32)[:, None] 219 | 220 | emb_inp_v1 = tf.concat([emb_inp_v1[s] for s in emb_inp_v1], -1) 221 | w1 = tf.reduce_sum(emb_inp_v1, [-1]) 222 | 223 | # poly 224 | emb_inp_v2 = {} 225 | for s in hparams.single_features: 226 | emb_inp_v2[s] = tf.gather(self.emb_v2[s], self.single_ids[s]) 227 | 228 | for s in hparams.mutil_features: 229 | emb_inp_v2[s] = tf.reduce_sum(tf.gather(self.emb_v2[s], self.mulit_ids[s])*self.mulit_mask[s] 230 | [:, :, None, None], axis=1) / tf.cast(self.length[s], tf.float32)[:, None, None] 231 | 232 | x = [[], []] 233 | 234 | for s in self.f1: 235 | x[0].append(emb_inp_v2[s][:, None, :, :]) 236 | for s in self.f2: 237 | x[1].append(emb_inp_v2[s][:, :, None, :]) 238 | x[0] = tf.concat(x[0], 1) 239 | x[1] = tf.concat(x[1], 2) 240 | 241 | emb_rep_v = x[0]*x[1] 242 | emb_rep_v = tf.reshape( 243 | emb_rep_v, [-1, len(self.f1)*len(self.f2)*hparams.k]) 244 | emb_rep = emb_rep_v 245 | temp = [] 246 | temp.append(emb_rep) 247 | for s in hparams.num_features: 248 | temp.append(self.norm_num[s]) 249 | emb_rep = tf.concat(temp, -1) 250 | 251 | input_size = len(self.f1)*len(self.f2)*hparams.k + \ 252 | len(hparams.num_features) 253 | glorot = np.sqrt(2.0 / (input_size + hparams.hidden_size[0])) 254 | 255 | W_1 = tf.get_variable("W_1", [emb_rep.get_shape( 256 | )[-1], hparams.hidden_size[0]], initializer=initializer) 257 | b_1 = tf.get_variable( 258 | "b_1", [1, hparams.hidden_size[0]], initializer=initializer) 259 | 260 | hidden_outputs_1 = tf.tensordot(emb_rep, W_1, [[-1], [0]]) 261 | hidden_outputs_1 = self.batch_norm_layer( 262 | hidden_outputs_1, self.use_dropout, 'train_1') 263 | hidden_outputs_1 = tf.nn.relu(hidden_outputs_1) 264 | 265 | W_2 = tf.get_variable( 266 | "W_2", [hparams.hidden_size[0], hparams.hidden_size[1]], initializer=initializer) 267 | b_2 = tf.get_variable( 268 | "b_2", [1, hparams.hidden_size[1]], initializer=initializer) 269 | 270 | hidden_outputs_2 = tf.tensordot(hidden_outputs_1, W_2, [[-1], [0]]) 271 | hidden_outputs_2 = self.batch_norm_layer( 272 | hidden_outputs_2, self.use_dropout, 'train_2') 273 | hidden_outputs_2 = tf.nn.relu(hidden_outputs_2) 274 | 275 | glorot = np.sqrt(2.0 / (hparams.hidden_size[1] + 1)) 276 | W_3 = tf.get_variable( 277 | "W_3", [hparams.hidden_size[1], 1], initializer=initializer) 278 | b_3 = tf.Variable(tf.constant(-3.5), dtype=np.float32) 279 | 280 | w_3 = tf.tensordot(hidden_outputs_2, W_3, [[-1], [0]])+b_3 281 | 282 | score = w1+w_3[:, 0] 283 | self.prob = tf.sigmoid(score) 284 | logit_1 = tf.log(self.prob) 285 | logit_0 = tf.log(1-self.prob) 286 | self.loss = -tf.reduce_mean(self.label*logit_1+(1-self.label)*logit_0) 287 | self.cost = -tf.reduce_mean(self.label*logit_1+(1-self.label)*logit_0) 288 | self.saver_ffm = tf.train.Saver() 289 | 290 | def optimizer(self, hparams): 291 | self.lrate = tf.Variable(hparams.learning_rate, trainable=False) 292 | if hparams.optimizer == "sgd": 293 | opt = tf.train.GradientDescentOptimizer(self.lrate) 294 | elif hparams.optimizer == "adam": 295 | 
opt = tf.train.AdamOptimizer( 296 | self.lrate, beta1=0.9, beta2=0.999, epsilon=1e-8) 297 | elif hparams.optimizer == "ada": 298 | opt = tf.train.AdagradOptimizer( 299 | learning_rate=self.lrate, initial_accumulator_value=1e-8) 300 | params = tf.trainable_variables() 301 | 302 | gradients = tf.gradients( 303 | self.cost, params, colocate_gradients_with_ops=True) 304 | clipped_grads, gradient_norm = tf.clip_by_global_norm(gradients, 5.0) 305 | self.grad_norm = gradient_norm 306 | self.update = opt.apply_gradients(zip(clipped_grads, params)) 307 | 308 | def dey_lrate(self, sess, lrate): 309 | sess.run(tf.assign(self.lrate, lrate)) 310 | 311 | def train(self, sess, iterator): 312 | data = iterator.next() 313 | self.maxlen = len(data[2]['interest2'][0]) 314 | dic = {} 315 | for s in self.single_ids: 316 | dic[self.single_ids[s]] = data[1][s] 317 | 318 | for s in self.mulit_ids: 319 | dic[self.mulit_ids[s]] = data[2][s] 320 | dic[self.length[s]] = data[4][s] 321 | 322 | for s in self.num_ids: 323 | dic[self.num_ids[s]] = data[3][s] 324 | 325 | dic[self.use_dropout] = True 326 | dic[self.label] = data[0] 327 | 328 | return sess.run([self.cost, self.update, self.grad_norm], feed_dict=dic) 329 | 330 | def infer(self, sess, iterator, label, aid): 331 | data = iterator.next() 332 | self.maxlen = len(data[2]['interest1'][0]) 333 | label.extend(data[0]) 334 | aid.extend(data[1]['aid']) 335 | dic = {} 336 | for s in self.single_ids: 337 | dic[self.single_ids[s]] = data[1][s] 338 | 339 | for s in self.mulit_ids: 340 | dic[self.mulit_ids[s]] = data[2][s] 341 | dic[self.length[s]] = data[4][s] 342 | for s in self.num_ids: 343 | dic[self.num_ids[s]] = data[3][s] 344 | 345 | dic[self.use_dropout] = False 346 | return sess.run(self.prob, feed_dict=dic) 347 | 348 | def batch_norm_layer(self, x, train_phase, scope_bn): 349 | bn_train = batch_norm(x, decay=self.batch_norm_decay, center=True, scale=True, updates_collections=None, 350 | is_training=True, reuse=None, trainable=True, scope=scope_bn) 351 | bn_inference = batch_norm(x, decay=self.batch_norm_decay, center=True, scale=True, updates_collections=None, 352 | is_training=False, reuse=True, trainable=True, scope=scope_bn) 353 | z = tf.cond(train_phase, lambda: bn_train, lambda: bn_inference) 354 | return z 355 | 356 | 357 | def train(hparams): 358 | tf.set_random_seed(2018) 359 | random.seed(2018) 360 | train_iterator = TextIterator(hparams, mode="train") 361 | dev_iterator = TextIterator( 362 | hparams, mode="dev", batch_size=hparams.evl_batch_size) 363 | test_iterator = TextIterator( 364 | hparams, mode="test", batch_size=hparams.evl_batch_size) 365 | model = Model(hparams) 366 | config_proto = tf.ConfigProto( 367 | log_device_placement=0, allow_soft_placement=0) 368 | config_proto.gpu_options.allow_growth = True 369 | sess = tf.Session(config=config_proto) 370 | sess.run(tf.global_variables_initializer()) 371 | 372 | global_step = 0 373 | train_loss = 0 374 | train_norm = 0 375 | best_loss = 0 376 | dey_cont = 0 377 | pay_cont = 0 378 | epoch = False 379 | epoch_cont = 0 380 | start_time = time.time() 381 | if hparams.mode != 'train': 382 | model.saver_ffm.restore(sess, os.path.join( 383 | hparams.path, 'model_'+str(hparams.sub_name))) 384 | else: 385 | # if os.path.exists(os.path.join(hparams.path, 'model_'+str(hparams.idx))): 386 | # model.saver_ffm.restore(sess,os.path.join(hparams.path, 'model_'+str(hparams.idx))) 387 | while True: 388 | try: 389 | cost, _, norm = model.train(sess, train_iterator) 390 | except StopIteration: 391 | continue 392 | 
global_step += 1 393 | train_loss += cost 394 | train_norm += norm 395 | if global_step % hparams.num_display_steps == 0 or global_step >= hparams.epoch: 396 | info = {} 397 | info['learning_rate'] = hparams.learning_rate 398 | info["train_ppl"] = train_loss / \ 399 | hparams.num_display_steps 400 | info["avg_grad_norm"] = train_norm / \ 401 | hparams.num_display_steps 402 | train_loss = 0 403 | train_norm = 0 404 | print_step_info(" ", global_step, info) 405 | if global_step % hparams.num_eval_steps == 0 or global_step >= hparams.epoch: 406 | epoch = False 407 | preds = [] 408 | label = [] 409 | aid = [] 410 | while True: 411 | try: 412 | pred = model.infer( 413 | sess, dev_iterator, label, aid) 414 | preds += list(pred) 415 | except StopIteration: 416 | break 417 | res = {} 418 | for i in range(len(aid)): 419 | if aid[i] not in res: 420 | res[aid[i]] = {} 421 | res[aid[i]]['label'] = [] 422 | res[aid[i]]['pred'] = [] 423 | res[aid[i]]['label'].append(label[i]+1) 424 | res[aid[i]]['pred'].append(preds[i]) 425 | auc = [] 426 | for u in res: 427 | fpr, tpr, thresholds = metrics.roc_curve( 428 | res[u]['label'], res[u]['pred'], pos_label=2) 429 | loss_ = metrics.auc(fpr, tpr) 430 | if np.isnan(loss_): 431 | continue 432 | gloab_auc[u] = loss_ 433 | auc.append(loss_) 434 | loss_ = np.mean(auc) 435 | if best_loss <= loss_: 436 | model.saver_ffm.save(sess, os.path.join( 437 | hparams.path, 'model_'+str(hparams.sub_name))) 438 | best_loss = loss_ 439 | T = (time.time()-start_time) 440 | start_time = time.time() 441 | utils.print_out( 442 | "# Epcho-time %.2fs Eval AUC %.6f. Best AUC %.6f." % (T, loss_, best_loss)) 443 | else: 444 | utils.print_out( 445 | "# Epcho-time %.2fs Eval AUC %.6f. Best AUC %.6f." % (T, loss_, best_loss)) 446 | model.saver_ffm.restore(sess, os.path.join( 447 | hparams.path, 'model_'+str(hparams.sub_name))) 448 | 449 | if global_step >= hparams.epoch: 450 | model.saver_ffm.restore(sess, os.path.join( 451 | hparams.path, 'model_'+str(hparams.sub_name))) 452 | break 453 | print("Dev inference ...") 454 | preds = [] 455 | label = [] 456 | aid = [] 457 | while True: 458 | try: 459 | pred = model.infer(sess, dev_iterator, label, aid) 460 | preds += list(pred) 461 | except StopIteration: 462 | break 463 | data = [] 464 | for i in range(len(preds)): 465 | data.append([aid[i], label[i], preds[i]]) 466 | df = pd.DataFrame(data) 467 | df.columns = ['aid', 'label', 'score'] 468 | df.to_csv('submission_dev_' + 469 | str(hparams.sub_name)+'.csv', index=False) 470 | print('Dev inference done!') 471 | res = {} 472 | for i in range(len(aid)): 473 | if aid[i] not in res: 474 | res[aid[i]] = {} 475 | res[aid[i]]['label'] = [] 476 | res[aid[i]]['pred'] = [] 477 | res[aid[i]]['label'].append(label[i]+1) 478 | res[aid[i]]['pred'].append(preds[i]) 479 | auc = [] 480 | for u in res: 481 | fpr, tpr, thresholds = metrics.roc_curve( 482 | res[u]['label'], res[u]['pred'], pos_label=2) 483 | loss_ = metrics.auc(fpr, tpr) 484 | if np.isnan(loss_): 485 | continue 486 | auc.append(loss_) 487 | loss_ = np.mean(auc) 488 | print("Dev auc:", loss_) 489 | print("Test inference ...") 490 | preds = [] 491 | label = [] 492 | aid = [] 493 | while True: 494 | try: 495 | pred = model.infer(sess, test_iterator, label, aid) 496 | preds += list(pred) 497 | except StopIteration: 498 | break 499 | print('Test inference done!') 500 | return preds 501 | -------------------------------------------------------------------------------- /src/train.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import tensorflow as tf 6 | import utils 7 | import nffm 8 | import os 9 | os.environ["CUDA_DEVICE_ORDER"] = 'PCI_BUS_ID' 10 | os.environ["CUDA_VISIBLE_DEVICES"] = '0' 11 | 12 | 13 | def create_hparams(): 14 | return tf.contrib.training.HParams( 15 | k=8, 16 | batch_size=4096, 17 | optimizer="adam", 18 | learning_rate=0.0002, 19 | num_display_steps=100, 20 | num_eval_steps=1000, 21 | l2=0.000002, 22 | hidden_size=[256, 128], 23 | evl_batch_size=4096, 24 | all_process=1, 25 | idx=0, 26 | epoch=int(44628906//4096), 27 | mode='train', 28 | data_path='dataset/', 29 | sub_name='nffm', 30 | single_features=['aid', 'advertiserId', 'campaignId', 'creativeId', 'creativeSize', 'adCategoryId', 'productId', 'productType', 'age', 'gender', 31 | 'education', 'consumptionAbility', 'LBS', 'carrier', 'house', 'uid_aid_nunique', 'aid_uid_nunique', 'campaignId_aid_nunique', 'uid_convert'], 32 | mutil_features=['interest1', 'interest2', 'interest3', 'interest4', 'interest5', 'kw1', 'kw2', 'kw3', 'topic1', 'topic2', 'topic3', 'appIdAction', 33 | 'appIdInstall', 'marriageStatus', 'ct', 'os', 'pos_aid', 'neg_aid'] 34 | ) 35 | 36 | 37 | hparams = create_hparams() 38 | hparams.path = '../model/' 39 | 40 | hparams.aid = ['aid', 'advertiserId', 'campaignId', 'creativeId', 'creativeSize', 'adCategoryId', 'productId', 'productType', 'pos_aid', 41 | 'neg_aid', 'aid_uid_nunique', 'campaignId_aid_nunique'] 42 | hparams.user = ['age', 'gender', 'education', 'consumptionAbility', 'LBS', 'carrier', 'house', 'interest1', 'interest2', 'interest3', 'interest4', 'interest5', 43 | 'kw1', 'kw2', 'kw3', 'topic1', 'topic2', 'topic3', 'appIdAction', 'appIdInstall', 'marriageStatus', 'ct', 'os', 'uid_aid_nunique', 'uid_convert'] 44 | hparams.num_features = [] 45 | preds = nffm.train(hparams) 46 | 47 | 48 | test2_df = pd.read_csv('input/test2.csv') 49 | test2_df['score'] = preds 50 | test2_df['score'] = test2_df['score'].apply(lambda x: round(x, 4)) 51 | test2_df[['aid', 'uid', 'score']].to_csv( 52 | '../submission_'+str(hparams.sub_name)+'.csv', index=False) 53 | -------------------------------------------------------------------------------- /src/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # ============================================================================== 15 | 16 | """Generally useful utility functions.""" 17 | from __future__ import print_function 18 | 19 | import codecs 20 | import collections 21 | import json 22 | import math 23 | import os 24 | import sys 25 | import time 26 | 27 | import numpy as np 28 | import tensorflow as tf 29 | 30 | 31 | def check_tensorflow_version(): 32 | min_tf_version = "1.4.0-dev20171024" 33 | if tf.__version__ < min_tf_version: 34 | raise EnvironmentError("Tensorflow version must >= %s" % min_tf_version) 35 | 36 | 37 | def safe_exp(value): 38 | """Exponentiation with catching of overflow error.""" 39 | try: 40 | ans = math.exp(value) 41 | except OverflowError: 42 | ans = float("inf") 43 | return ans 44 | 45 | 46 | def print_time(s, start_time): 47 | """Take a start time, print elapsed duration, and return a new time.""" 48 | print("%s, time %ds, %s." % (s, (time.time() - start_time), time.ctime())) 49 | sys.stdout.flush() 50 | return time.time() 51 | 52 | 53 | def print_out(s, f=None, new_line=True): 54 | """Similar to print but with support to flush and output to a file.""" 55 | if isinstance(s, bytes): 56 | s = s.decode("utf-8") 57 | 58 | if f: 59 | f.write(s.encode("utf-8")) 60 | if new_line: 61 | f.write(b"\n") 62 | 63 | # stdout 64 | out_s = s.encode("utf-8") 65 | if not isinstance(out_s, str): 66 | out_s = out_s.decode("utf-8") 67 | print(out_s, end="", file=sys.stdout) 68 | 69 | if new_line: 70 | sys.stdout.write("\n") 71 | sys.stdout.flush() 72 | 73 | 74 | def print_hparams(hparams, skip_patterns=None, header=None): 75 | """Print hparams, can skip keys based on pattern.""" 76 | if header: 77 | print_out("%s" % header) 78 | values = hparams.values() 79 | for key in sorted(values.keys()): 80 | if not skip_patterns or all( 81 | [skip_pattern not in key for skip_pattern in skip_patterns]): 82 | print_out(" %s=%s" % (key, str(values[key]))) 83 | 84 | 85 | def load_hparams(model_dir): 86 | """Load hparams from an existing model directory.""" 87 | hparams_file = os.path.join(model_dir, "hparams") 88 | if tf.gfile.Exists(hparams_file): 89 | print_out("# Loading hparams from %s" % hparams_file) 90 | with codecs.getreader("utf-8")(tf.gfile.GFile(hparams_file, "rb")) as f: 91 | try: 92 | hparams_values = json.load(f) 93 | hparams = tf.contrib.training.HParams(**hparams_values) 94 | except ValueError: 95 | print_out(" can't load hparams file") 96 | return None 97 | return hparams 98 | else: 99 | return None 100 | 101 | 102 | def maybe_parse_standard_hparams(hparams, hparams_path): 103 | """Override hparams values with existing standard hparams config.""" 104 | if not hparams_path: 105 | return hparams 106 | 107 | if tf.gfile.Exists(hparams_path): 108 | print_out("# Loading standard hparams from %s" % hparams_path) 109 | with tf.gfile.GFile(hparams_path, "r") as f: 110 | hparams.parse_json(f.read()) 111 | 112 | return hparams 113 | 114 | 115 | def save_hparams(out_dir, hparams): 116 | """Save hparams.""" 117 | hparams_file = os.path.join(out_dir, "hparams") 118 | print_out(" saving hparams to %s" % hparams_file) 119 | with codecs.getwriter("utf-8")(tf.gfile.GFile(hparams_file, "wb")) as f: 120 | f.write(hparams.to_json()) 121 | 122 | 123 | def debug_tensor(s, msg=None, summarize=10): 124 | """Print the shape and value of a tensor at test time. 
Return a new tensor.""" 125 | if not msg: 126 | msg = s.name 127 | return tf.Print(s, [tf.shape(s), s], msg + " ", summarize=summarize) 128 | 129 | 130 | def add_summary(summary_writer, global_step, tag, value): 131 | """Add a new summary to the current summary_writer. 132 | Useful to log things that are not part of the training graph, e.g., tag=BLEU. 133 | """ 134 | summary = tf.Summary(value=[tf.Summary.Value(tag=tag, simple_value=value)]) 135 | summary_writer.add_summary(summary, global_step) 136 | 137 | 138 | def get_config_proto(log_device_placement=False, allow_soft_placement=True, 139 | num_intra_threads=0, num_inter_threads=0): 140 | # GPU options: 141 | # https://www.tensorflow.org/versions/r0.10/how_tos/using_gpu/index.html 142 | config_proto = tf.ConfigProto( 143 | log_device_placement=log_device_placement, 144 | allow_soft_placement=allow_soft_placement) 145 | config_proto.gpu_options.allow_growth = True 146 | 147 | # CPU threads options 148 | if num_intra_threads: 149 | config_proto.intra_op_parallelism_threads = num_intra_threads 150 | if num_inter_threads: 151 | config_proto.inter_op_parallelism_threads = num_inter_threads 152 | 153 | return config_proto 154 | 155 | 156 | def format_text(words): 157 | """Convert a sequence words into sentence.""" 158 | if (not hasattr(words, "__len__") and # for numpy array 159 | not isinstance(words, collections.Iterable)): 160 | words = [words] 161 | return b" ".join(words) 162 | 163 | 164 | def format_bpe_text(symbols, delimiter=b"@@"): 165 | """Convert a sequence of bpe words into sentence.""" 166 | words = [] 167 | word = b"" 168 | if isinstance(symbols, str): 169 | symbols = symbols.encode() 170 | delimiter_len = len(delimiter) 171 | for symbol in symbols: 172 | if len(symbol) >= delimiter_len and symbol[-delimiter_len:] == delimiter: 173 | word += symbol[:-delimiter_len] 174 | else: # end of a word 175 | word += symbol 176 | words.append(word) 177 | word = b"" 178 | return b" ".join(words) 179 | 180 | 181 | def format_spm_text(symbols): 182 | """Decode a text in SPM (https://github.com/google/sentencepiece) format.""" 183 | return u"".join(format_text(symbols).decode("utf-8").split()).replace( 184 | u"\u2581", u" ").strip().encode("utf-8") 185 | --------------------------------------------------------------------------------