├── .gitignore ├── README.md ├── data └── chusai │ └── userFeature.data ├── run.sh └── src ├── DataLoader.py ├── NFFM.py ├── NFFM_concat.py ├── NFFM_concat_dot.py ├── NFFM_concat_filter.py ├── NFFM_concat_triple.py ├── NFM.py ├── args.py ├── avg_submission.py ├── build_conversion_rate.py ├── build_len_max_idx.py ├── build_pos_feature.py ├── build_uid2idx.py ├── change_name_idx.py ├── check_labels.py ├── combine_data.py ├── dynamicEmbedding.py ├── focal_loss.py ├── main.py ├── make_submission.py ├── merge_and_split_csv.py ├── pipeline.sh ├── random_shuf.py ├── staticEmbedding.py └── support_model.py /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | .DS_Store 3 | .cache 4 | *.csv 5 | *.xlsx 6 | *.jpg 7 | *.png 8 | *.model 9 | *.pyc 10 | *.pkl 11 | *.log 12 | *.zip 13 | *.bin 14 | 15 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | ## o98k啊黎 ## 4 | ### 欧文杰 ### 5 | ### https://zhuanlan.zhihu.com/p/38499275 ### 6 | --- 7 | ## How to Run ## 8 | 9 | Requirements 10 | - more than 180 GB of RAM 11 | - more than 500 GB of disk space 12 | - a CUDA GPU with more than 10 GB of memory 13 | - python 2.7 14 | - pytorch 0.2.0 15 | 16 | Usage 17 | ``` 18 | ./run.sh 19 | ``` 20 | 21 | Changing parameters 22 | ``` 23 | vim src/args.py 24 | ``` 25 | 26 | --- 27 | 28 | ## Directory Layout ## 29 | |-- data (data files) 30 | 31 | |-- chusai (raw data of the preliminary round) 32 | 33 | |-- fusai (raw data of the final round) 34 | 35 | (the folders below under data are all generated automatically by the scripts) 36 | 37 | |-- bin\_files (numpy binary format) 38 | 39 | |-- feature2idx (dictionary data) 40 | 41 | |-- infos (auxiliary data) 42 | 43 | |-- src (all source files; they are described in the next section) 44 | 45 | |-- logs (log output files) 46 | 47 | |-- models (saved pytorch models) 48 | 49 | |-- result (result files) 50 | 51 | 52 | --- 53 | 54 | ## Source Files ## 55 | (1) args.py arguments and a few constants 56 | (2) pipeline.sh the entry point inside src 57 | (3) combine_data.py joins the raw feature tables 58 | (4) merge_and_split_csv.py concatenates the final-round and preliminary-round data and splits out a validation set 59 | (5) build_uid2idx.py builds the uid indices in a special way, using frequency counts instead of conventional LabelEncoding 60 | (6) DataLoader.py the data loading class; when run as an entry point it builds the feature ids 61 | (7) build_len_max_idx.py computes the maximum idx of the length features 62 | (8) build_pos_feature.py builds features related to the positive samples 63 | (9) build_conversion_rate.py counts the occurrences and the conversion rate of every id of each feature 64 | (10) main.py the entry point for training and testing 65 | (11) avg_submission.py the result-ensembling script 66 | (12) make_submission.py formats the results for submission 67 | 68 | --- 69 | 70 | ## Features ## 71 | Our team used basic features, length features, statistical features, and positive-sample statistical features. They are briefly introduced below; see args.py for the details. 72 | (1) Basic features 73 | - ad_static_features : fixed-length basic features on the ad side 74 | - user_static_features : fixed-length basic features on the user side 75 | - user_dynamic_features : variable-length basic features on the user side 76 | - ignore_features : unusable features that are ignored 77 | 78 | (2) Length features 79 | - user_dynamic_len_features : for each variable-length feature, an extra feature encoding its length 80 | 81 | (3) Statistical features 82 | - uid : since uid is rather sparse, uid is modeled with statistics 83 | 84 | (4) Positive-sample statistical features 85 | - uid|ad_static_features : since modeling uid with statistics alone loses a lot of information, we add features that cross uid with the ad features and count the corresponding positive-sample occurrences, modeling uid further 86 | 87 | In the actual experiments we built several versions of feature subsets; we did not train and predict with all of the features. 88 | --- 89 | 90 | ## Models ## 91 | Our team mainly builds on the NFFM model; we adapted the structure of the basic modules for variable-length features and also made some innovations to the model structure. 92 | There are three main model variants: NFFM, NFFM_concat, NFFM_concat_triple 93 | (1) Basic modules 94 | - StaticEmbedding : embeds the fixed-length features 95 | - DynamicEmbedding : embeds the variable-length features, followed by avg_pooling 96 | (2) Model structures 97 | - NFFM : the conventional NFFM structure 98 | - NFFM_concat : based on NFFM, removes the lr part and concatenates the embedding matrices output by StaticEmbedding and DynamicEmbedding onto the output of the bi-interaction layer as the input of the NN layers. 99 | - NFFM_concat_triple : based on NFFM_concat, introduces third-order interactions: the output of the bi-interaction is further multiplied element-wise by an embedding matrix output by StaticEmbedding and DynamicEmbedding, and used as the input of the NN layers. 100 | 101 | --- 102 | 103 | ## Ensembling ## 104 | We built several versions of feature subsets, trained and validated them with the three models, picked the top N models on the validation set, and averaged them with weights to obtain the final result. 105 | -------------------------------------------------------------------------------- /run.sh: -------------------------------------------------------------------------------- 1 | #! /bin/sh 2 | # 3 | # run.sh 4 | # Copyright (C) 2018 niezhaochang 5 | # 6 | # Distributed under terms of the MIT license. 7 | # 8 | 9 | 10 | cd src 11 | ./pipeline.sh 12 | cd .. 
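For reference, the field-aware bi-interaction that the NFFM variants above build on (the upper/lower-triangular mask trick in src/NFFM.py) can be sketched with plain numpy. This is a toy sketch, not repo code: the sizes `B`, `F`, `E` and the random embeddings are made up for illustration.

```python
import numpy as np

# Toy sizes (illustrative only): B examples, F fields, E-dim embeddings.
B, F, E = 2, 3, 4
rng = np.random.default_rng(0)

# Field-aware embeddings: each field keeps one E-dim vector per field,
# mirroring field_embedding_size = embedding_size * n_fnames in the models.
all_emb = rng.normal(size=(B, F, F, E))

# Upper-triangular mask selects each unordered field pair (i, j), i < j, once.
mask = np.triu(np.ones((F, F), dtype=bool), k=1)

# bi for pair (i, j) = <embedding of field i toward j> * <embedding of field j toward i>
bi = all_emb[:, mask] * all_emb.transpose(0, 2, 1, 3)[:, mask]

# F*(F-1)/2 pairwise interaction vectors per example (n_bi in the code).
print(bi.shape)  # (2, 3, 4)
```

In the repo the same selection is done with `torch.masked_select` on a ByteTensor mask; NFFM feeds `bi` flattened into the MLP, and NFFM_concat additionally concatenates the flattened `all_emb`.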
13 | -------------------------------------------------------------------------------- /src/DataLoader.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # vim:fenc=utf-8 4 | # 5 | # Copyright © 2018 ouwj 6 | # 7 | # Distributed under terms of the MIT license. 8 | 9 | """ 10 | data loader 11 | """ 12 | 13 | import pickle 14 | import sys, os 15 | import numpy as np 16 | import time 17 | 18 | 19 | from args import * 20 | 21 | 22 | class DataLoader: 23 | def __init__(self, type_name, is_train=True, has_cr=False, parts=1024*5000): 24 | self.type_name = type_name 25 | self.is_train = is_train 26 | self.has_cr = has_cr 27 | self.parts = parts 28 | 29 | self.max_lens_file_path = args.root_data_path + "/infos/{}/max_lens/{}.pkl" 30 | self.max_idxs_file_path = args.root_data_path + "/infos/max_idxs/{}.pkl" 31 | 32 | # for preload 33 | self.id_data = {} 34 | self.len_data = {} 35 | self.feature2idx = {} 36 | self.max_idxs = {} 37 | 38 | self.label_counts = [0, 0] 39 | self.fname_counts = [{}, {}] 40 | 41 | self.feature_counts = {} 42 | 43 | # info 44 | self.n_all_data = -1 45 | self.next_idx = 0 46 | 47 | # for batch load 48 | self.ids = {} 49 | self.id_lens = {} 50 | self.max_lens = {} 51 | self.labels = [] 52 | self.conversion_rates = {} 53 | 54 | # for preload 55 | def clear(self): 56 | del self.id_data, self.len_data, self.feature2idx, self.max_idxs, self.ids, self.id_lens, self.max_lens, self.labels 57 | self.id_data = {} 58 | self.len_data = {} 59 | self.feature2idx = {} 60 | self.max_idxs = {} 61 | 62 | self.n_all_data = -1 63 | self.next_idx = 0 64 | 65 | self.ids = {} 66 | self.id_lens = {} 67 | self.max_lens = {} 68 | self.labels = [] 69 | 70 | 71 | def load_data(self, file_name, fnames): 72 | fnames.append("label") 73 | with open(file_name, 'r') as fin: 74 | features = fin.readline().strip().split(',') 75 | num = 0 76 | for line in fin: 77 | datas = 
line.strip().split(',') 78 | for i, d in enumerate(datas): 79 | if features[i] in args.ignore_features or features[i] not in fnames: 80 | continue 81 | if features[i] not in self.id_data: 82 | self.id_data[features[i]] = [] 83 | self.id_data[features[i]].append(d) 84 | num += 1 85 | if num % 1000000 == 0: 86 | sys.stderr.write("loading {} data...\n".format(num)) 87 | # if num >= self.part_num: 88 | # break 89 | self.n_all_data = num 90 | sys.stderr.write("total {} data...\n".format(num)) 91 | 92 | # for save 93 | self.n_parts = self.n_all_data // self.parts 94 | if self.n_all_data % self.parts != 0: 95 | self.n_parts += 1 96 | 97 | def count_features(self, fname): 98 | if fname in args.dynamic_features: 99 | sys.stderr.write("counting {}...\n".format(fname)) 100 | for data in self.id_data[fname]: 101 | if fname not in self.len_data: 102 | self.len_data[fname] = [] 103 | self.len_data[fname].append(len(data.split(' '))) 104 | self.len_data[fname] = np.asarray(self.len_data[fname]) 105 | else: 106 | sys.stderr.write("Warning: {} is static feature!\n".format(fname)) 107 | 108 | def build_feature2idx(self, fname): 109 | sys.stderr.write("building {}2idx dict...\n".format(fname)) 110 | fname2idx_path = os.path.join(args.root_data_path, "feature2idx/{}2idx.pkl".format(fname)) 111 | if os.path.exists(fname2idx_path): 112 | fname2idx = pickle.load(open(fname2idx_path, 'rb')) 113 | else: 114 | fname2idx = {"<pad>": 0, '-1': 1, '': 2}  # distinct reserved idxs: 0 padding placeholder, 1 missing ('-1'), 2 empty string 115 | now_idx = len(fname2idx) 116 | for data in self.id_data[fname]: 117 | for d in data.split(' '): 118 | if d not in fname2idx: 119 | fname2idx[d] = now_idx 120 | now_idx += 1 121 | fname2idx["max_idx"] = now_idx 122 | pickle.dump(fname2idx, open(fname2idx_path, 'wb')) 123 | self.feature2idx[fname] = fname2idx 124 | 125 | # save max_idxs 126 | self.max_idxs[fname] = self.feature2idx[fname]["max_idx"] 127 | 128 | def combine_features(self, fname): 129 | cfs = fname.split('|') 130 | if len(cfs) != 2: 131 | return 132 | fname1 = cfs[0] 133 | fname2 = 
cfs[1] 134 | self.id_data[fname] = [] 135 | sys.stderr.write("combining {} | {}...\n".format(fname1, fname2)) 136 | for data1, data2 in zip(self.id_data[fname1], self.id_data[fname2]): 137 | combine_data = [] 138 | for d1 in data1.split(' '): 139 | for d2 in data2.split(' '): 140 | combine_data.append(d1+'|'+d2) 141 | self.id_data[fname].append(' '.join(combine_data)) 142 | 143 | def get_max_idxs(self, fnames): 144 | # load max_idxs 145 | max_idxs = [] 146 | for fname in fnames: 147 | max_idx = pickle.load(open(self.max_idxs_file_path.format(fname), 'r')) 148 | self.max_idxs[fname] = max_idx 149 | max_idxs.append(max_idx) 150 | return max_idxs 151 | 152 | def load_max_idxs(self, fnames): 153 | for fname in fnames: 154 | max_idx = pickle.load(open(self.max_idxs_file_path.format(fname), 'r')) 155 | self.max_idxs[fname] = max_idx 156 | 157 | def load_max_lens(self, fnames, part): 158 | for fname in fnames: 159 | if fname in args.static_features: 160 | continue 161 | max_len = pickle.load(open(self.max_lens_file_path.format(self.type_name, fname), 'r')) 162 | self.max_lens[fname] = max_len 163 | 164 | def save_max_lens(self, fname): 165 | if fname in args.static_features: 166 | return 167 | pickle.dump(self.max_lens[fname], open(self.max_lens_file_path.format(self.type_name, fname), 'w')) 168 | 169 | def save_max_idxs(self, fnames): 170 | for fname in fnames: 171 | pickle.dump(self.max_idxs[fname], open(self.max_idxs_file_path.format(fname), 'w')) 172 | 173 | 174 | # preload api 175 | def prepare_for_final_data(self, fnames): 176 | """ 177 | preload data pipeline api 178 | """ 179 | tmp_fnames = [] 180 | for fname in fnames: 181 | if fname not in tmp_fnames: 182 | tmp_fnames.append(fname) 183 | cf = fname.split('|') 184 | if len(cf) == 2: 185 | if cf[0] not in tmp_fnames: 186 | tmp_fnames.append(cf[0]) 187 | if cf[1] not in tmp_fnames: 188 | tmp_fnames.append(cf[1]) 189 | 190 | # load csv 191 | stime = time.time() 192 | self.load_data(os.path.join(args.root_data_path, 
"combine_{}.csv".format(self.type_name)), tmp_fnames) 193 | etime = time.time() 194 | sys.stderr.write("load_data cost {} s...\n".format(etime-stime)) 195 | 196 | # combine 197 | stime = time.time() 198 | for cf in fnames: 199 | self.combine_features(cf) 200 | etime = time.time() 201 | sys.stderr.write("combine_features cost {} s...\n".format(etime-stime)) 202 | 203 | # count dynamic features 204 | stime = time.time() 205 | for udf in fnames: 206 | if udf in args.dynamic_features: 207 | self.count_features(udf) 208 | etime = time.time() 209 | sys.stderr.write("count_feature cost {} s...\n".format(etime-stime)) 210 | 211 | # build feature2idx dict 212 | stime = time.time() 213 | for f in fnames: 214 | self.build_feature2idx(f) 215 | etime = time.time() 216 | sys.stderr.write("build_feature2idx cost {} s...\n".format(etime-stime)) 217 | 218 | # save infos 219 | if self.is_train: 220 | self.save_max_idxs(fnames) 221 | 222 | 223 | # for batch load 224 | 225 | def build_nn_data(self, fnames, to_bin=False): 226 | stime = time.time() 227 | 228 | for key in fnames: 229 | if key == 'label': 230 | continue 231 | sys.stderr.write("building idx of {} for nn...\n".format(key)) 232 | if key not in self.ids: 233 | self.ids[key] = [] 234 | self.id_lens[key] = [] 235 | self.max_lens[key] = 1 236 | for data in self.id_data[key]: 237 | tmp_ids = [] 238 | for d in data.split(' '): 239 | if d in self.feature2idx[key]: 240 | tmp_ids.append(self.feature2idx[key][d]) 241 | else: 242 | tmp_ids.append(1) 243 | dyl = len(tmp_ids) 244 | self.id_lens[key].append(dyl) 245 | self.ids[key].append(tmp_ids) 246 | if dyl > self.max_lens[key]: 247 | self.max_lens[key] = dyl 248 | # padding 249 | for i in range(len(self.ids[key])): 250 | self.ids[key][i] = self.ids[key][i] + ([0]*(self.max_lens[key]-self.id_lens[key][i])) 251 | self.ids[key] = np.asarray(self.ids[key]) 252 | self.id_lens[key] = np.asarray(self.id_lens[key]) 253 | 254 | if to_bin: 255 | self.to_bin(key, self.n_parts) 256 | if to_bin: 
257 | self.save_max_lens(key) 258 | 259 | if self.is_train: 260 | sys.stderr.write("building label for nn...\n") 261 | self.labels = [int(x) for x in self.id_data["label"]] 262 | self.labels = np.asarray(self.labels) 263 | if to_bin: 264 | self.to_bin("label", self.n_parts) 265 | 266 | etime = time.time() 267 | sys.stderr.write("build_nn_feature cost {} s...\n".format(etime-stime)) 268 | 269 | 270 | def to_bin(self, fname, n_parts): 271 | for p in range(n_parts): 272 | sp = p * self.parts 273 | ep = (p+1) * self.parts 274 | if fname != "label": 275 | bin_file_path = os.path.join(args.root_data_path, 'bin_files/{}/{}_{}.bin'.format(self.type_name, fname, p)) 276 | self.ids[fname][sp:ep].tofile(bin_file_path, format="%d") 277 | if fname in args.dynamic_features: 278 | bin_file_path = os.path.join(args.root_data_path, 'bin_files/{}/{}_feature_cnt_{}.bin'.format(self.type_name, fname, p)) 279 | self.len_data[fname][sp:ep].tofile(bin_file_path, format="%d") 280 | bin_file_path = os.path.join(args.root_data_path, 'bin_files/{}/{}_len_{}.bin'.format(self.type_name, fname, p)) 281 | self.id_lens[fname][sp:ep].tofile(bin_file_path, format="%d") 282 | else: 283 | bin_file_path = os.path.join(args.root_data_path, 'bin_files/{}/{}_{}.bin'.format(self.type_name, fname, p)) 284 | self.labels[sp:ep].tofile(bin_file_path, format="%d") 285 | 286 | def load_bin(self, fnames, part): 287 | stime = time.time() 288 | self.load_max_lens(fnames, part) 289 | for key in fnames: 290 | self.from_bin(key, part) 291 | if self.is_train: 292 | self.from_bin("label", part) 293 | self.n_all_data = self.ids[fnames[0]].shape[0] 294 | etime = time.time() 295 | sys.stderr.write("load {} bin cost {} s\n".format(part, etime-stime)) 296 | 297 | def from_bin(self, fname, part): 298 | if fname != "label": 299 | bin_file_path = os.path.join(args.root_data_path, 'bin_files/{}/{}_{}.bin'.format(self.type_name, fname, part)) 300 | self.ids[fname] = np.fromfile(bin_file_path, dtype=int) 301 | if fname in 
args.dynamic_features: 302 | bin_file_path = os.path.join(args.root_data_path, 'bin_files/{}/{}_len_{}.bin'.format(self.type_name, fname, part)) 303 | self.id_lens[fname] = np.fromfile(bin_file_path, dtype=int) 304 | self.ids[fname] = self.ids[fname].reshape(-1, self.max_lens[fname]) 305 | else: 306 | self.ids[fname] = self.ids[fname].reshape(-1, 1) 307 | ''' 308 | if fname in args.dynamic_features: 309 | bin_file_path = os.path.join(args.root_data_path, 'bin_files/{}/{}_feature_cnt_{}.bin'.format(self.type_name, fname, part)) 310 | self.len_data[fname] = np.fromfile(bin_file_path, dtype=int) 311 | ''' 312 | else: 313 | bin_file_path = os.path.join(args.root_data_path, 'bin_files/{}/{}_{}.bin'.format(self.type_name, fname, part)) 314 | self.labels = np.fromfile(bin_file_path, dtype=int) 315 | 316 | 317 | def build_conversion_rate(self, fname): 318 | label_counts_file_path = os.path.join(args.root_data_path, 'infos/conversion_infos/label_counts.pkl') 319 | fname_counts_file_path = os.path.join(args.root_data_path, 'infos/conversion_infos/{}_counts.pkl'.format(fname)) 320 | if os.path.exists(label_counts_file_path) and os.path.exists(fname_counts_file_path): 321 | self.label_counts = pickle.load(open(label_counts_file_path, 'r')) 322 | self.fname_counts = pickle.load(open(fname_counts_file_path, 'r')) 323 | else: 324 | self.label_counts = [0, 0] 325 | self.fname_counts = [{}, {}] 326 | for p in range(args.n_train_parts): 327 | self.load_bin([fname], p) 328 | data = self.ids[fname] 329 | label = self.labels 330 | # count fname in label==1 and label==0 331 | # count labels: label_counts[0] for label==0 and label_counts[1] for label==1 332 | for ds, l in zip(data,label): 333 | idx = int(l) 334 | self.label_counts[idx] += 1.0 335 | for d in ds: 336 | if d not in self.fname_counts[idx]: 337 | self.fname_counts[idx][d] = 0 338 | self.fname_counts[idx][d] += 1.0 339 | # save conversion_rate dict 340 | if self.is_train: 341 | pickle.dump(self.label_counts, 
open(label_counts_file_path, 'w')) 342 | pickle.dump(self.fname_counts, open(fname_counts_file_path, 'w')) 343 | 344 | def save_conversion_rate(self, fname, part): 345 | sys.stderr.write("saving {} conversion_rates part {}\n".format(fname, part)) 346 | self.load_bin([fname], part) 347 | data = self.ids[fname] 348 | self.conversion_rates[fname] = [] 349 | for ds in data: 350 | cr = [0.0, 0.0] 351 | cn = [0.00001, 0.00001] 352 | for d in ds: 353 | # sum(tf/tl * tf) / sum(tf) 354 | if d not in self.fname_counts[0]: 355 | cr[0] += 0 356 | cn[0] += 0 357 | else: 358 | cr[0] += self.fname_counts[0][d] / self.label_counts[0] * self.fname_counts[0][d] 359 | cn[0] += self.fname_counts[0][d] 360 | if d not in self.fname_counts[1]: 361 | cr[1] += 0 362 | cn[1] += 0 363 | else: 364 | cr[1] += self.fname_counts[1][d] / self.label_counts[1] * self.fname_counts[1][d] 365 | cn[1] += self.fname_counts[1][d] 366 | self.conversion_rates[fname].append([cr[0]/cn[0], cr[1]/cn[1]]) 367 | 368 | # save to bin 369 | self.conversion_rates[fname] = np.asarray(self.conversion_rates[fname]) 370 | 371 | bin_file_path = os.path.join(args.root_data_path, 'bin_files/{}/{}_conversion_rate_{}.bin'.format(self.type_name, fname, part)) 372 | self.conversion_rates[fname].tofile(bin_file_path, format='%f') 373 | 374 | def load_counts(self, fnames): 375 | for fname in fnames: 376 | if fname in args.combine_features + args.len_static_features: 377 | continue 378 | fname_counts_file_path = os.path.join(args.root_data_path, 'infos/conversion_infos/{}_counts.pkl'.format(fname)) 379 | self.feature_counts[fname] = pickle.load(open(fname_counts_file_path, 'r')) 380 | 381 | 382 | def load_conversion_rate_from_bin(self, fnames, part): 383 | for fname in fnames: 384 | bin_file_path = os.path.join(args.root_data_path, 'bin_files/{}/{}_conversion_rate_{}.bin'.format(self.type_name, fname, part)) 385 | self.conversion_rates[fname] = np.fromfile(bin_file_path, dtype=float) 386 | self.conversion_rates[fname] = 
self.conversion_rates[fname].reshape(-1, 2) 387 | 388 | def random_shuffle(self, fnames): 389 | stime = time.time() 390 | rng_state = np.random.get_state() 391 | for key in fnames: 392 | np.random.set_state(rng_state) 393 | np.random.shuffle(self.ids[key]) 394 | if key in args.dynamic_features: 395 | np.random.set_state(rng_state) 396 | np.random.shuffle(self.id_lens[key]) 397 | if self.has_cr: 398 | np.random.set_state(rng_state) 399 | np.random.shuffle(self.conversion_rates[key]) 400 | np.random.set_state(rng_state) 401 | np.random.shuffle(self.labels) 402 | etime = time.time() 403 | sys.stderr.write("random_shuffle cost {} s...\n".format(etime-stime)) 404 | 405 | def reset(self): 406 | self.next_idx = 0 407 | 408 | def cut_threshold(self, fnames, ids, id_lens, threshold=500): 409 | for key in fnames: 410 | if key in args.combine_features+ args.len_static_features: 411 | continue 412 | new_ids = [] 413 | for xs, ls in zip(ids[key], id_lens[key]): 414 | for i in range(ls): 415 | tmp_cnt = 0 416 | if xs[i] in self.feature_counts[key][0]: 417 | tmp_cnt += self.feature_counts[key][0][xs[i]] 418 | if xs[i] in self.feature_counts[key][1]: 419 | tmp_cnt += self.feature_counts[key][1][xs[i]] 420 | if tmp_cnt < threshold: 421 | xs[i] = 1 422 | new_ids.append(xs) 423 | ids[key] = np.asarray(new_ids) 424 | return ids 425 | 426 | def fix_0to1(self, fnames, ids, id_lens): 427 | for key in fnames: 428 | new_ids = [] 429 | for xs, ls in zip(ids[key], id_lens[key]): 430 | for i in range(ls): 431 | # if xs[i] == 0 or xs[i] >= self.max_idxs[key]: 432 | if xs[i] == 0: 433 | xs[i] = 1 434 | elif xs[i] >= self.max_idxs[key]: 435 | if 'len' in key: 436 | xs[i] = self.max_idxs[key] - 1 437 | else: 438 | xs[i] = 1 439 | new_ids.append(xs) 440 | ids[key] = np.asarray(new_ids) 441 | return ids 442 | 443 | 444 | def next_batch(self, fnames_arr, batch_size): 445 | if self.next_idx >= self.n_all_data: 446 | return None 447 | end_idx = self.next_idx + batch_size 448 | st_ids = {} 449 | 
st_lens = {} 450 | dy_ids = {} 451 | dy_lens = {} 452 | conversion_rates = {} 453 | labels = self.labels[self.next_idx:end_idx] 454 | for key in fnames_arr[0]: 455 | st_ids[key] = self.ids[key][self.next_idx:end_idx] 456 | st_lens[key] = np.asarray([1]*len(st_ids[key])) 457 | for key in fnames_arr[1]: 458 | dy_ids[key] = self.ids[key][self.next_idx:end_idx] 459 | dy_lens[key] = self.id_lens[key][self.next_idx:end_idx] 460 | if self.has_cr: 461 | for key in fnames_arr[0]+fnames_arr[1]: 462 | conversion_rates[key] = self.conversion_rates[key][self.next_idx:end_idx] 463 | self.next_idx = end_idx 464 | # cut 465 | threshold=args.cut_threshold 466 | st_ids = self.cut_threshold(fnames_arr[0], st_ids, st_lens, threshold) 467 | dy_ids = self.cut_threshold(fnames_arr[1], dy_ids, dy_lens, threshold) 468 | # fix 0 to 1 469 | st_ids = self.fix_0to1(fnames_arr[0], st_ids, st_lens) 470 | dy_ids = self.fix_0to1(fnames_arr[1], dy_ids, dy_lens) 471 | return [st_ids, dy_ids, dy_lens, labels, conversion_rates] 472 | 473 | if __name__ == "__main__": 474 | import argparse 475 | parser = argparse.ArgumentParser() 476 | parser.add_argument("--type_name", type=str, default="train_shuf", help="train_set name") 477 | parser_args = parser.parse_args() 478 | assert parser_args.type_name in ["train_shuf", "test1", "test2", 'chusai_train_shuf', 'valid', 'train_all'] 479 | 480 | is_train = 'train' in parser_args.type_name or 'valid' in parser_args.type_name 481 | if is_train: 482 | args.root_data_path = args.root_train_data_path 483 | dataLoader = DataLoader(type_name=parser_args.type_name, is_train=is_train) 484 | 485 | # now_features = args.user_static_features 486 | # now_features = args.ad_static_features 487 | # now_features = args.user_dynamic_features 488 | now_features = args.ad_static_features + args.user_static_features + args.user_dynamic_features + ['uid'] 489 | # now_features = args.combine_features 490 | # now_features = ['uid'] 491 | # now_features = ['label'] 492 | 493 | # build 
all data from the beginning 494 | dataLoader.prepare_for_final_data(now_features) 495 | dataLoader.build_nn_data(now_features, to_bin=True) 496 | 497 | -------------------------------------------------------------------------------- /src/NFFM.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # vim:fenc=utf-8 4 | # 5 | # Copyright © 2018 ouwj 6 | # 7 | # Distributed under terms of the MIT license. 8 | 9 | """ 10 | model: 11 | input: [single feature id] + [multi feature ids] 12 | embedding_layer: single : embedding multi: avg_embedding 13 | lr_layer: one-hot lr + embeddings_concat lr 14 | activate_layer: sigmoid 15 | """ 16 | 17 | import torch 18 | import torch.nn as nn 19 | import torch.nn.functional as F 20 | 21 | import time, sys 22 | 23 | from staticEmbedding import * 24 | from dynamicEmbedding import * 25 | from focal_loss import * 26 | from support_model import * 27 | 28 | 29 | class DyNFFM(nn.Module): 30 | def __init__(self, fnames, max_idxs, embedding_size=4, dropout_rate=None, batch_norm=True, use_cuda=True): 31 | """ 32 | fnames: feature names: [static feature names, dynamic feature names] 33 | max_idxs: max_idxs: [static max_idxs, dynamic max_idxs] 34 | embedding_sizes: size of embedding, [n_single_embedding, n_multi_embedding] 35 | dropout_rate: prob for dropout, set None if no dropout, 36 | use_cuda: bool, True for gpu or False for cpu 37 | """ 38 | 39 | super(DyNFFM, self).__init__() 40 | self.fnames = fnames 41 | self.max_idxs = max_idxs 42 | self.n_fnames = len(fnames[0]) + len(fnames[1]) 43 | self.embedding_size = embedding_size 44 | self.field_embedding_size = embedding_size * self.n_fnames 45 | self.dropout_rate = dropout_rate 46 | self.batch_norm = batch_norm 47 | self.use_cuda = use_cuda 48 | 49 | self.stEmb = StEmb( 50 | self.fnames[0], 51 | self.max_idxs[0], 52 | embedding_size=self.field_embedding_size, 53 | dropout_rate=self.dropout_rate, 54 | 
use_cuda=self.use_cuda 55 | ) 56 | self.dyEmb = DyEmb( 57 | self.fnames[1], 58 | self.max_idxs[1], 59 | embedding_size=self.field_embedding_size, 60 | dropout_rate=self.dropout_rate, 61 | method='avg', 62 | use_cuda=self.use_cuda 63 | ) 64 | 65 | self.stLr = StEmb( 66 | self.fnames[0], 67 | self.max_idxs[0], 68 | embedding_size=1, 69 | dropout_rate=self.dropout_rate, 70 | use_cuda=self.use_cuda 71 | ) 72 | 73 | self.dyLr = DyEmb( 74 | self.fnames[1], 75 | self.max_idxs[1], 76 | embedding_size=1, 77 | dropout_rate=self.dropout_rate, 78 | method='avg', 79 | use_cuda=self.use_cuda 80 | ) 81 | 82 | self.bias = torch.nn.Parameter(torch.zeros(1)) 83 | 84 | # focal loss layer 85 | self.fc_loss = FocalLoss(gamma=2) 86 | 87 | # mask for combination 88 | self.mask = [] 89 | all_fnames = fnames[0] + fnames[1] 90 | self.n_bi = 0 91 | for i in range(self.n_fnames): 92 | tmp = [] 93 | for j in range(self.n_fnames): 94 | if i >= j: 95 | # if i >= j or filter_same_features(all_fnames[i], all_fnames[j]): 96 | tmp.append(0) 97 | else: 98 | tmp.append(1) 99 | self.n_bi += 1 100 | self.mask.append(tmp) 101 | self.mask = torch.autograd.Variable(torch.ByteTensor(self.mask)) 102 | 103 | # self.mask_len = [x + 1 for x in range(self.n_fnames)] 104 | # self.mask = make_mask(self.mask_len, self.n_fnames, fill_val=False) 105 | if self.use_cuda: 106 | self.mask = self.mask.cuda() 107 | self.fc_loss = self.fc_loss.cuda() 108 | 109 | # embedding lr layer, 3 layers 110 | # self.embLr_input_dim = self.n_fnames * (self.n_fnames - 1) // 2 * self.embedding_size 111 | self.embLr_input_dim = self.n_bi * self.embedding_size 112 | self.hidden_size_1 = 512 113 | self.batch_norm_1 = nn.BatchNorm1d(self.embLr_input_dim) 114 | self.embLr1 = nn.Linear(self.embLr_input_dim, self.hidden_size_1) 115 | self.batch_norm_2 = nn.BatchNorm1d(self.hidden_size_1) 116 | self.hidden_size_2 = 256 117 | self.embLr2 = nn.Linear(self.hidden_size_1, self.hidden_size_2) 118 | self.batch_norm_3 = 
nn.BatchNorm1d(self.hidden_size_2) 119 | self.hidden_size_3 = 128 120 | self.embLr3 = nn.Linear(self.hidden_size_2, self.hidden_size_3) 121 | 122 | self.batch_norm_out = nn.BatchNorm1d(self.hidden_size_3) 123 | self.embLr_out = nn.Linear(self.hidden_size_3, 1) 124 | 125 | self.embLr_is_dropout = False 126 | if self.dropout_rate is not None: 127 | self.embLr_is_dropout = True 128 | self.emb_dropout = nn.Dropout(self.dropout_rate) 129 | 130 | 131 | def load(self, model_path): 132 | self.load_state_dict(torch.load(model_path)) 133 | 134 | def save(self, model_path): 135 | torch.save(self.state_dict(), model_path) 136 | 137 | 138 | def forward(self, static_ids, dynamic_ids, dynamic_lengths, conversion_rates): 139 | """ 140 | input: relative id 141 | static_ids: Batch_size * Field_size 142 | dynamic_ids: Batch_size * Field_size * Max_feature_size 143 | dynamic_lengths: Batch_size * Field_size 144 | return: Batch_size * 1, probs 145 | """ 146 | 147 | # embedding layers 148 | dynamic_embeddings = self.dyEmb(dynamic_ids, dynamic_lengths) 149 | 150 | static_embeddings = self.stEmb(static_ids) 151 | 152 | batch_size = static_embeddings.size()[0] 153 | 154 | # B*F*E 155 | all_embeddings = torch.cat([static_embeddings, dynamic_embeddings], 1) 156 | all_embeddings = all_embeddings.view(batch_size, self.n_fnames, self.n_fnames, self.embedding_size) 157 | 158 | # combine feature by multi 159 | all_mask = self.mask.view(1, self.n_fnames, self.n_fnames, 1).expand_as(all_embeddings) 160 | all_embeddings_ur = torch.masked_select(all_embeddings, all_mask) 161 | all_embeddings = all_embeddings.transpose(2, 1) 162 | all_embeddings_ll = torch.masked_select(all_embeddings, all_mask) 163 | 164 | bi_embeddings = all_embeddings_ur * all_embeddings_ll 165 | 166 | # lr layer 167 | static_lr_out = self.stLr(static_ids).view(batch_size, -1) 168 | dynamic_lr_out = self.dyLr(dynamic_ids, dynamic_lengths).view(batch_size, -1) 169 | 170 | # embedding lr layer 171 | # B*F1*E + B*F2*E -> B*[F1+F2]*E 
-> B*[F*E] 172 | embedding_lr_in_1 = bi_embeddings.view(batch_size, -1) 173 | if self.embLr_is_dropout: 174 | embedding_lr_in_1 = self.emb_dropout(embedding_lr_in_1) 175 | if self.batch_norm: 176 | embedding_lr_in_1 = self.batch_norm_1(embedding_lr_in_1) 177 | embedding_lr_out_1 = self.embLr1(embedding_lr_in_1) 178 | 179 | embedding_lr_in_2 = F.relu(embedding_lr_out_1) 180 | if self.embLr_is_dropout: 181 | embedding_lr_in_2 = self.emb_dropout(embedding_lr_in_2) 182 | if self.batch_norm: 183 | embedding_lr_in_2 = self.batch_norm_2(embedding_lr_in_2) 184 | embedding_lr_out_2 = self.embLr2(embedding_lr_in_2) 185 | 186 | embedding_lr_in_3 = F.relu(embedding_lr_out_2) 187 | if self.embLr_is_dropout: 188 | embedding_lr_in_3 = self.emb_dropout(embedding_lr_in_3) 189 | if self.batch_norm: 190 | embedding_lr_in_3 = self.batch_norm_3(embedding_lr_in_3) 191 | embedding_lr_out_3 = self.embLr3(embedding_lr_in_3) 192 | 193 | embedding_lr_in = F.relu(embedding_lr_out_3) 194 | if self.embLr_is_dropout: 195 | embedding_lr_in = self.emb_dropout(embedding_lr_in) 196 | if self.batch_norm: 197 | embedding_lr_in = self.batch_norm_out(embedding_lr_in) 198 | embedding_lr_out = self.embLr_out(embedding_lr_in) 199 | 200 | # output 201 | # print self.static_lr_out 202 | # print self.dynamic_lr_out 203 | # print self.embedding_lr_out 204 | scores = self.bias + torch.sum(static_lr_out, -1) + torch.sum(dynamic_lr_out, -1) + torch.sum(embedding_lr_out, -1) 205 | 206 | # activate layer 207 | # self.probs = F.sigmoid(self.scores) 208 | 209 | return scores 210 | 211 | def get_loss(self, scores, labels): 212 | """ 213 | binary cross entropy loss 214 | """ 215 | labels = torch.autograd.Variable(torch.FloatTensor(labels), requires_grad=False) 216 | if self.use_cuda: 217 | labels = labels.cuda() 218 | 219 | # BCE loss 220 | loss = F.binary_cross_entropy_with_logits(scores, labels) 221 | 222 | # weighted BCE loss 223 | # weights = labels * 10.0 224 | # weights = weights.masked_fill_(labels.le(0.5), 1.0) 
225 | # loss = F.binary_cross_entropy_with_logits(scores, labels, weights) 226 | 227 | # margin loss 228 | # labels = labels.masked_fill_(labels.le(0.5), -1) 229 | # loss = F.soft_margin_loss(scores, labels) 230 | 231 | # focal loss 232 | # scores = torch.sigmoid(scores).view(-1, 1) 233 | # scores = torch.cat([1.0-scores, scores], -1) 234 | # loss = self.fc_loss(scores, labels.long()) 235 | 236 | return loss 237 | 238 | 239 | if __name__ == '__main__': 240 | st_max_idxs = [4, 6] 241 | st_fnames = ["1", "2"] 242 | st_ids = {"1":[[2], [3]], "2":[[5],[1]]} 243 | 244 | dy_max_idxs = [4, 6] 245 | dy_fnames = ["1", "2"] 246 | dy_ids = {"1":[[2,1,3,0,0], [2,2,0,0,0]], "2":[[5,0,0,0,0],[5,5,5,5,5]]} 247 | dy_lengths = {"1":[3,2], "2":[1,5]} 248 | 249 | reals = [1, 0] 250 | 251 | dyNffm = DyNFFM([st_fnames, dy_fnames], [st_max_idxs, dy_max_idxs], use_cuda=True) 252 | dyNffm.cuda() 253 | 254 | probs = dyNffm(st_ids, dy_ids, dy_lengths, {})  # forward also expects conversion_rates (unused), pass an empty dict 255 | print(probs) 256 | 257 | loss = dyNffm.get_loss(probs, reals) 258 | print(loss) 259 | 260 | -------------------------------------------------------------------------------- /src/NFFM_concat.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # vim:fenc=utf-8 4 | # 5 | # Copyright © 2018 ouwj 6 | # 7 | # Distributed under terms of the MIT license. 
8 | 9 | """ 10 | model: 11 | input: [single feature id] + [multi feature ids] 12 | embedding_layer: single : embedding multi: avg_embedding 13 | lr_layer: one-hot lr + embeddings_concat lr 14 | activate_layer: sigmoid 15 | """ 16 | 17 | import torch 18 | import torch.nn as nn 19 | import torch.nn.functional as F 20 | 21 | import time, sys 22 | 23 | from staticEmbedding import * 24 | from dynamicEmbedding import * 25 | from focal_loss import * 26 | from support_model import * 27 | 28 | 29 | class DyNFFM_concat(nn.Module): 30 | def __init__(self, fnames, max_idxs, embedding_size=4, dropout_rate=None, batch_norm=True, use_cuda=True): 31 | """ 32 | fnames: feature names: [static feature names, dynamic feature names] 33 | max_idxs: max_idxs: [static max_idxs, dynamic max_idxs] 34 | embedding_sizes: size of embedding, [n_single_embedding, n_multi_embedding] 35 | dropout_rate: prob for dropout, set None if no dropout, 36 | use_cuda: bool, True for gpu or False for cpu 37 | """ 38 | 39 | super(DyNFFM_concat, self).__init__() 40 | self.fnames = fnames 41 | self.max_idxs = max_idxs 42 | self.n_fnames = len(fnames[0]) + len(fnames[1]) 43 | self.embedding_size = embedding_size 44 | self.field_embedding_size = embedding_size * self.n_fnames 45 | self.dropout_rate = dropout_rate 46 | self.batch_norm = batch_norm 47 | self.use_cuda = use_cuda 48 | 49 | self.stEmb = StEmb( 50 | self.fnames[0], 51 | self.max_idxs[0], 52 | embedding_size=self.field_embedding_size, 53 | dropout_rate=self.dropout_rate, 54 | use_cuda=self.use_cuda 55 | ) 56 | self.dyEmb = DyEmb( 57 | self.fnames[1], 58 | self.max_idxs[1], 59 | embedding_size=self.field_embedding_size, 60 | dropout_rate=self.dropout_rate, 61 | method='avg', 62 | use_cuda=self.use_cuda 63 | ) 64 | 65 | # self.stLr = StEmb( 66 | # self.fnames[0], 67 | # self.max_idxs[0], 68 | # embedding_size=1, 69 | # dropout_rate=self.dropout_rate, 70 | # use_cuda=self.use_cuda 71 | # ) 72 | 73 | # self.dyLr = DyEmb( 74 | # self.fnames[1], 75 | # 
self.max_idxs[1], 76 | # embedding_size=1, 77 | # dropout_rate=self.dropout_rate, 78 | # method='avg', 79 | # use_cuda=self.use_cuda 80 | # ) 81 | 82 | self.bias = torch.nn.Parameter(torch.zeros(1)) 83 | 84 | # focal loss layer 85 | self.fc_loss = FocalLoss(gamma=2) 86 | 87 | # mask for combination 88 | self.mask = [] 89 | all_fnames = fnames[0] + fnames[1] 90 | self.n_bi = 0 91 | for i in range(self.n_fnames): 92 | tmp = [] 93 | for j in range(self.n_fnames): 94 | if i >= j: 95 | # if i >= j or filter_same_features(all_fnames[i], all_fnames[j]): 96 | tmp.append(0) 97 | else: 98 | tmp.append(1) 99 | self.n_bi += 1 100 | self.mask.append(tmp) 101 | self.mask = torch.autograd.Variable(torch.ByteTensor(self.mask)) 102 | 103 | # embedding lr layer 104 | # bi-interaction + embedding concat 105 | self.embLr_input_dim = self.n_bi * self.embedding_size + self.n_fnames * self.n_fnames * self.embedding_size 106 | self.hidden_sizes = [self.embLr_input_dim, 512, 256, 256, 128] 107 | self.n_linear_layers = len(self.hidden_sizes) - 1 108 | self.batch_norms = nn.ModuleList([nn.BatchNorm1d(self.hidden_sizes[i]) for i in range(self.n_linear_layers)]) 109 | self.embLrs = nn.ModuleList([nn.Linear(self.hidden_sizes[i], self.hidden_sizes[i+1]) for i in range(self.n_linear_layers)]) 110 | 111 | self.batch_norm_out = nn.BatchNorm1d(self.hidden_sizes[-1]) 112 | self.embLr_out = nn.Linear(self.hidden_sizes[-1], 1) 113 | 114 | self.embLr_is_dropout = False 115 | if self.dropout_rate is not None: 116 | self.embLr_is_dropout = True 117 | self.emb_dropout = nn.Dropout(self.dropout_rate) 118 | 119 | 120 | # self.mask_len = [x + 1 for x in range(self.n_fnames)] 121 | if self.use_cuda: 122 | self.mask = self.mask.cuda() 123 | self.fc_loss = self.fc_loss.cuda() 124 | 125 | def load(self, model_path): 126 | self.load_state_dict(torch.load(model_path)) 127 | 128 | def save(self, model_path): 129 | torch.save(self.state_dict(), model_path) 130 | 131 | 132 | def forward(self, static_ids, dynamic_ids,
dynamic_lengths, conversion_rates): 133 | """ 134 | input: relative id 135 | static_ids: Batch_size * Field_size 136 | dynamic_ids: Batch_size * Field_size * Max_feature_size 137 | dynamic_lengths: Batch_size * Field_size 138 | return: Batch_size * 1, probs 139 | """ 140 | 141 | # embedding layers 142 | dynamic_embeddings = self.dyEmb(dynamic_ids, dynamic_lengths) 143 | 144 | static_embeddings = self.stEmb(static_ids) 145 | 146 | batch_size = static_embeddings.size()[0] 147 | 148 | # B*F*E 149 | all_embeddings = torch.cat([static_embeddings, dynamic_embeddings], 1) 150 | all_embeddings = all_embeddings.view(batch_size, self.n_fnames, self.n_fnames, self.embedding_size) 151 | 152 | # combine feature by multi 153 | all_mask = self.mask.view(1, self.n_fnames, self.n_fnames, 1).expand_as(all_embeddings) 154 | all_embeddings_ur = torch.masked_select(all_embeddings, all_mask) 155 | all_embeddings_1 = all_embeddings.transpose(2, 1) 156 | all_embeddings_ll = torch.masked_select(all_embeddings_1, all_mask) 157 | 158 | bi_embeddings = all_embeddings_ur * all_embeddings_ll 159 | 160 | # lr layer 161 | # static_lr_out = self.stLr(static_ids).view(batch_size, -1) 162 | # dynamic_lr_out = self.dyLr(dynamic_ids, dynamic_lengths).view(batch_size, -1) 163 | 164 | # embedding lr layer 165 | # B*F1*E + B*F2*E -> B*[F1+F2]*E -> B*[F*E] 166 | 167 | lr_out = torch.cat([bi_embeddings.view(batch_size, -1), all_embeddings.view(batch_size, -1)], -1) 168 | for i in range(self.n_linear_layers): 169 | lr_in = lr_out 170 | if self.batch_norm: 171 | lr_in = self.batch_norms[i](lr_in) 172 | if self.embLr_is_dropout: 173 | lr_in = self.emb_dropout(lr_in)  # keep the result, otherwise dropout is a no-op 174 | lr_out = self.embLrs[i](lr_in) 175 | lr_out = F.relu(lr_out) 176 | embedding_lr_in = lr_out 177 | 178 | if self.embLr_is_dropout: 179 | embedding_lr_in = self.emb_dropout(embedding_lr_in) 180 | if self.batch_norm: 181 | embedding_lr_in = self.batch_norm_out(embedding_lr_in) 182 | embedding_lr_out = self.embLr_out(embedding_lr_in) 183 | 184 | #
output 185 | # print self.static_lr_out 186 | # print self.dynamic_lr_out 187 | # print self.embedding_lr_out 188 | # scores = self.bias + torch.sum(static_lr_out, -1) + torch.sum(dynamic_lr_out, -1) + torch.sum(embedding_lr_out, -1) 189 | scores = self.bias + torch.sum(embedding_lr_out, -1) 190 | 191 | # activate layer 192 | # self.probs = F.sigmoid(self.scores) 193 | 194 | return scores 195 | 196 | def get_loss(self, scores, labels): 197 | """ 198 | binary cross entropy loss 199 | """ 200 | labels = torch.autograd.Variable(torch.FloatTensor(labels), requires_grad=False) 201 | if self.use_cuda: 202 | labels = labels.cuda() 203 | 204 | # BCE loss 205 | loss = F.binary_cross_entropy_with_logits(scores, labels) 206 | 207 | # weighted BCE loss 208 | # weights = labels * 10.0 209 | # weights = weights.masked_fill_(labels.le(0.5), 1.0) 210 | # loss = F.binary_cross_entropy_with_logits(scores, labels, weights) 211 | 212 | # margin loss 213 | # labels = labels.masked_fill_(labels.le(0.5), -1) 214 | # loss = F.soft_margin_loss(scores, labels) 215 | 216 | # focal loss 217 | # scores = torch.sigmoid(scores).view(-1, 1) 218 | # scores = torch.cat([1.0-scores, scores], -1) 219 | # loss = self.fc_loss(scores, labels.long()) 220 | 221 | return loss 222 | 223 | 224 | if __name__ == '__main__': 225 | st_max_idxs = [4, 6] 226 | st_fnames = ["1", "2"] 227 | st_ids = {"1":[[2], [3]], "2":[[5],[1]]} 228 | 229 | dy_max_idxs = [4, 6] 230 | dy_fnames = ["1", "2"] 231 | dy_ids = {"1":[[2,1,3,0,0], [2,2,0,0,0]], "2":[[5,0,0,0,0],[5,5,5,5,5]]} 232 | dy_lengths = {"1":[3,2], "2":[1,5]}  # lengths match the non-padding id counts above 233 | 234 | reals = [1, 0] 235 | 236 | dyNffm = DyNFFM_concat([st_fnames, dy_fnames], [st_max_idxs, dy_max_idxs], use_cuda=True) 237 | dyNffm.cuda() 238 | 239 | probs = dyNffm(st_ids, dy_ids, dy_lengths, None)  # conversion_rates is unused by forward 240 | print(probs) 241 | 242 | loss = dyNffm.get_loss(probs, reals) 243 | print(loss) 244 | 245 | -------------------------------------------------------------------------------- /src/NFFM_concat_dot.py:
-------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # vim:fenc=utf-8 4 | # 5 | # Copyright © 2018 ouwj 6 | # 7 | # Distributed under terms of the MIT license. 8 | 9 | """ 10 | model: 11 | input: [single feature id] + [multi feature ids] 12 | embedding_layer: single : embedding multi: avg_embedding 13 | lr_layer: one-hot lr + embeddings_concat lr 14 | activate_layer: sigmoid 15 | """ 16 | 17 | import torch 18 | import torch.nn as nn 19 | import torch.nn.functional as F 20 | 21 | import time, sys 22 | 23 | from staticEmbedding import * 24 | from dynamicEmbedding import * 25 | from focal_loss import * 26 | from support_model import * 27 | 28 | 29 | class DyNFFM_concat_dot(nn.Module): 30 | def __init__(self, fnames, max_idxs, embedding_size=4, dropout_rate=None, batch_norm=True, use_cuda=True): 31 | """ 32 | fnames: feature names: [static feature names, dynamic feature names] 33 | max_idxs: max_idxs: [static max_idxs, dynamic max_idxs] 34 | embedding_sizes: size of embedding, [n_single_embedding, n_multi_embedding] 35 | dropout_rate: prob for dropout, set None if no dropout, 36 | use_cuda: bool, True for gpu or False for cpu 37 | """ 38 | 39 | super(DyNFFM_concat_dot, self).__init__() 40 | self.fnames = fnames 41 | self.max_idxs = max_idxs 42 | self.n_fnames = len(fnames[0]) + len(fnames[1]) 43 | self.embedding_size = embedding_size 44 | self.field_embedding_size = embedding_size * self.n_fnames 45 | self.dropout_rate = dropout_rate 46 | self.batch_norm = batch_norm 47 | self.use_cuda = use_cuda 48 | 49 | self.stEmb = StEmb( 50 | self.fnames[0], 51 | self.max_idxs[0], 52 | embedding_size=self.field_embedding_size, 53 | dropout_rate=self.dropout_rate, 54 | use_cuda=self.use_cuda 55 | ) 56 | self.dyEmb = DyEmb( 57 | self.fnames[1], 58 | self.max_idxs[1], 59 | embedding_size=self.field_embedding_size, 60 | dropout_rate=self.dropout_rate, 61 | method='avg', 62 | use_cuda=self.use_cuda 
63 | ) 64 | 65 | self.bias = torch.nn.Parameter(torch.zeros(1)) 66 | 67 | # focal loss layer 68 | self.fc_loss = FocalLoss(gamma=2) 69 | 70 | # mask for combination 71 | self.mask = [] 72 | all_fnames = fnames[0] + fnames[1] 73 | self.n_bi = 0 74 | for i in range(self.n_fnames): 75 | tmp = [] 76 | for j in range(self.n_fnames): 77 | if i >= j: 78 | # if i >= j or filter_same_features(all_fnames[i], all_fnames[j]): 79 | tmp.append(0) 80 | else: 81 | tmp.append(1) 82 | self.n_bi += 1 83 | self.mask.append(tmp) 84 | self.mask = torch.autograd.Variable(torch.ByteTensor(self.mask)) 85 | 86 | # embedding lr layer 87 | # bi-interaction + embedding concat + bi_dot 88 | self.embLr_input_dim = self.n_bi * self.embedding_size + self.n_fnames * self.n_fnames * self.embedding_size + self.n_bi 89 | self.hidden_sizes = [self.embLr_input_dim, 512, 256, 256, 128] 90 | self.n_linear_layers = len(self.hidden_sizes) - 1 91 | self.batch_norms = nn.ModuleList([nn.BatchNorm1d(self.hidden_sizes[i]) for i in range(self.n_linear_layers)]) 92 | self.embLrs = nn.ModuleList([nn.Linear(self.hidden_sizes[i], self.hidden_sizes[i+1]) for i in range(self.n_linear_layers)]) 93 | 94 | self.batch_norm_out = nn.BatchNorm1d(self.hidden_sizes[-1]) 95 | self.embLr_out = nn.Linear(self.hidden_sizes[-1], 1) 96 | 97 | self.embLr_is_dropout = False 98 | if self.dropout_rate is not None: 99 | self.embLr_is_dropout = True 100 | self.emb_dropout = nn.Dropout(self.dropout_rate) 101 | 102 | 103 | # self.mask_len = [x + 1 for x in range(self.n_fnames)] 104 | if self.use_cuda: 105 | self.mask = self.mask.cuda() 106 | self.fc_loss = self.fc_loss.cuda() 107 | 108 | def load(self, model_path): 109 | self.load_state_dict(torch.load(model_path)) 110 | 111 | def save(self, model_path): 112 | torch.save(self.state_dict(), model_path) 113 | 114 | 115 | def forward(self, static_ids, dynamic_ids, dynamic_lengths, conversion_rates): 116 | """ 117 | input: relative id 118 | static_ids: Batch_size * Field_size 119 | dynamic_ids:
Batch_size * Field_size * Max_feature_size 120 | dynamic_lengths: Batch_size * Field_size 121 | return: Batch_size * 1, probs 122 | """ 123 | 124 | # embedding layers 125 | dynamic_embeddings = self.dyEmb(dynamic_ids, dynamic_lengths) 126 | 127 | static_embeddings = self.stEmb(static_ids) 128 | 129 | batch_size = static_embeddings.size()[0] 130 | 131 | # B*F*E 132 | all_embeddings = torch.cat([static_embeddings, dynamic_embeddings], 1) 133 | all_embeddings = all_embeddings.view(batch_size, self.n_fnames, self.n_fnames, self.embedding_size) 134 | 135 | # combine feature by multi 136 | all_mask = self.mask.view(1, self.n_fnames, self.n_fnames, 1).expand_as(all_embeddings) 137 | all_embeddings_ur = torch.masked_select(all_embeddings, all_mask) 138 | all_embeddings_1 = all_embeddings.transpose(2, 1) 139 | all_embeddings_ll = torch.masked_select(all_embeddings_1, all_mask) 140 | 141 | bi_embeddings = all_embeddings_ur * all_embeddings_ll 142 | 143 | # batch dot layer 144 | dot_embeddings = torch.sum(bi_embeddings.view(batch_size, self.n_bi, self.embedding_size), -1) 145 | 146 | # embedding lr layer 147 | # B*F1*E + B*F2*E -> B*[F1+F2]*E -> B*[F*E] 148 | lr_out = torch.cat([bi_embeddings.view(batch_size, -1), all_embeddings.view(batch_size, -1), dot_embeddings.view(batch_size, -1)], -1) 149 | for i in range(self.n_linear_layers): 150 | lr_in = lr_out 151 | if self.batch_norm: 152 | lr_in = self.batch_norms[i](lr_in) 153 | if self.embLr_is_dropout: 154 | lr_in = self.emb_dropout(lr_in)  # keep the result, otherwise dropout is a no-op 155 | lr_out = self.embLrs[i](lr_in) 156 | lr_out = F.relu(lr_out) 157 | 158 | embedding_lr_in = lr_out 159 | if self.embLr_is_dropout: 160 | embedding_lr_in = self.emb_dropout(embedding_lr_in) 161 | if self.batch_norm: 162 | embedding_lr_in = self.batch_norm_out(embedding_lr_in) 163 | embedding_lr_out = self.embLr_out(embedding_lr_in) 164 | 165 | # output 166 | # print self.static_lr_out 167 | # print self.dynamic_lr_out 168 | # print self.embedding_lr_out 169 | # scores = self.bias +
torch.sum(static_lr_out, -1) + torch.sum(dynamic_lr_out, -1) + torch.sum(embedding_lr_out, -1) 170 | scores = self.bias + torch.sum(embedding_lr_out, -1) 171 | 172 | # activate layer 173 | # self.probs = F.sigmoid(self.scores) 174 | 175 | return scores 176 | 177 | def get_loss(self, scores, labels): 178 | """ 179 | binary cross entropy loss 180 | """ 181 | labels = torch.autograd.Variable(torch.FloatTensor(labels), requires_grad=False) 182 | if self.use_cuda: 183 | labels = labels.cuda() 184 | 185 | # BCE loss 186 | loss = F.binary_cross_entropy_with_logits(scores, labels) 187 | 188 | # weighted BCE loss 189 | # weights = labels * 10.0 190 | # weights = weights.masked_fill_(labels.le(0.5), 1.0) 191 | # loss = F.binary_cross_entropy_with_logits(scores, labels, weights) 192 | 193 | # margin loss 194 | # labels = labels.masked_fill_(labels.le(0.5), -1) 195 | # loss = F.soft_margin_loss(scores, labels) 196 | 197 | # focal loss 198 | # scores = torch.sigmoid(scores).view(-1, 1) 199 | # scores = torch.cat([1.0-scores, scores], -1) 200 | # loss = self.fc_loss(scores, labels.long()) 201 | 202 | return loss 203 | 204 | 205 | if __name__ == '__main__': 206 | st_max_idxs = [4, 6] 207 | st_fnames = ["1", "2"] 208 | st_ids = {"1":[[2], [3]], "2":[[5],[1]]} 209 | 210 | dy_max_idxs = [4, 6] 211 | dy_fnames = ["1", "2"] 212 | dy_ids = {"1":[[2,1,3,0,0], [2,2,0,0,0]], "2":[[5,0,0,0,0],[5,5,5,5,5]]} 213 | dy_lengths = {"1":[3,2], "2":[1,5]}  # lengths match the non-padding id counts above 214 | 215 | reals = [1, 0] 216 | 217 | dyNffm = DyNFFM_concat_dot([st_fnames, dy_fnames], [st_max_idxs, dy_max_idxs], use_cuda=True) 218 | dyNffm.cuda() 219 | 220 | probs = dyNffm(st_ids, dy_ids, dy_lengths, None)  # conversion_rates is unused by forward 221 | print(probs) 222 | 223 | loss = dyNffm.get_loss(probs, reals) 224 | print(loss) 225 | 226 | -------------------------------------------------------------------------------- /src/NFFM_concat_filter.py: -------------------------------------------------------------------------------- 1 | #!
/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # vim:fenc=utf-8 4 | # 5 | # Copyright © 2018 ouwj 6 | # 7 | # Distributed under terms of the MIT license. 8 | 9 | """ 10 | model: 11 | input: [single feature id] + [multi feature ids] 12 | embedding_layer: single : embedding multi: avg_embedding 13 | lr_layer: one-hot lr + embeddings_concat lr 14 | activate_layer: sigmoid 15 | """ 16 | 17 | import torch 18 | import torch.nn as nn 19 | import torch.nn.functional as F 20 | 21 | import time, sys 22 | 23 | from staticEmbedding import * 24 | from dynamicEmbedding import * 25 | from focal_loss import * 26 | from support_model import * 27 | 28 | 29 | class DyNFFM_concat_filter(nn.Module): 30 | def __init__(self, fnames, max_idxs, embedding_size=4, dropout_rate=None, batch_norm=True, use_cuda=True): 31 | """ 32 | fnames: feature names: [static feature names, dynamic feature names] 33 | max_idxs: max_idxs: [static max_idxs, dynamic max_idxs] 34 | embedding_sizes: size of embedding, [n_single_embedding, n_multi_embedding] 35 | dropout_rate: prob for dropout, set None if no dropout, 36 | use_cuda: bool, True for gpu or False for cpu 37 | """ 38 | 39 | super(DyNFFM_concat_filter, self).__init__() 40 | self.fnames = fnames 41 | self.max_idxs = max_idxs 42 | self.n_fnames = len(fnames[0]) + len(fnames[1]) 43 | self.embedding_size = embedding_size 44 | self.field_embedding_size = embedding_size * self.n_fnames 45 | self.dropout_rate = dropout_rate 46 | self.batch_norm = batch_norm 47 | self.use_cuda = use_cuda 48 | 49 | self.stEmb = StEmb( 50 | self.fnames[0], 51 | self.max_idxs[0], 52 | embedding_size=self.field_embedding_size, 53 | dropout_rate=self.dropout_rate, 54 | use_cuda=self.use_cuda 55 | ) 56 | self.dyEmb = DyEmb( 57 | self.fnames[1], 58 | self.max_idxs[1], 59 | embedding_size=self.field_embedding_size, 60 | dropout_rate=self.dropout_rate, 61 | method='avg', 62 | use_cuda=self.use_cuda 63 | ) 64 | 65 | # self.stLr = StEmb( 66 | # self.fnames[0], 67 | # 
self.max_idxs[0], 68 | # embedding_size=1, 69 | # dropout_rate=self.dropout_rate, 70 | # use_cuda=self.use_cuda 71 | # ) 72 | 73 | # self.dyLr = DyEmb( 74 | # self.fnames[1], 75 | # self.max_idxs[1], 76 | # embedding_size=1, 77 | # dropout_rate=self.dropout_rate, 78 | # method='avg', 79 | # use_cuda=self.use_cuda 80 | # ) 81 | 82 | self.bias = torch.nn.Parameter(torch.zeros(1)) 83 | 84 | # focal loss layer 85 | self.fc_loss = FocalLoss(gamma=2) 86 | 87 | # mask for combination 88 | self.mask = [] 89 | all_fnames = fnames[0] + fnames[1] 90 | self.n_bi = 0 91 | for i in range(self.n_fnames): 92 | tmp = [] 93 | for j in range(self.n_fnames): 94 | if i >= j or filter_same_features(all_fnames[i], all_fnames[j]): 95 | tmp.append(0) 96 | else: 97 | tmp.append(1) 98 | self.n_bi += 1 99 | self.mask.append(tmp) 100 | self.mask = torch.autograd.Variable(torch.ByteTensor(self.mask)) 101 | 102 | self.concat_mask = [] 103 | for i in range(self.n_fnames): 104 | tmp = [0] * self.n_fnames 105 | tmp[i] = 1 106 | self.concat_mask.append(tmp) 107 | self.concat_mask = torch.autograd.Variable(torch.ByteTensor(self.concat_mask)) 108 | 109 | # embedding lr layer, 3 layers 110 | # bi-interaction + embedding concat 111 | self.embLr_input_dim = (self.n_bi + self.n_fnames) * self.embedding_size 112 | self.hidden_size_1 = 512 113 | self.batch_norm_1 = nn.BatchNorm1d(self.embLr_input_dim) 114 | self.embLr1 = nn.Linear(self.embLr_input_dim, self.hidden_size_1) 115 | self.batch_norm_2 = nn.BatchNorm1d(self.hidden_size_1) 116 | self.hidden_size_2 = 256 117 | self.embLr2 = nn.Linear(self.hidden_size_1, self.hidden_size_2) 118 | self.batch_norm_3 = nn.BatchNorm1d(self.hidden_size_2) 119 | self.hidden_size_3 = 128 120 | self.embLr3 = nn.Linear(self.hidden_size_2, self.hidden_size_3) 121 | 122 | # concat input and output 123 | self.concat_input_dim = self.hidden_size_3 + self.embLr_input_dim 124 | 125 | self.batch_norm_out = nn.BatchNorm1d(self.concat_input_dim) 126 | self.embLr_out =
nn.Linear(self.concat_input_dim, 1) 127 | 128 | self.embLr_is_dropout = False 129 | if self.dropout_rate is not None: 130 | self.embLr_is_dropout = True 131 | self.emb_dropout = nn.Dropout(self.dropout_rate) 132 | 133 | 134 | # self.mask_len = [x + 1 for x in range(self.n_fnames)] 135 | if self.use_cuda: 136 | self.mask = self.mask.cuda() 137 | self.concat_mask = self.concat_mask.cuda() 138 | self.fc_loss = self.fc_loss.cuda() 139 | 140 | def load(self, model_path): 141 | self.load_state_dict(torch.load(model_path)) 142 | 143 | def save(self, model_path): 144 | torch.save(self.state_dict(), model_path) 145 | 146 | 147 | def forward(self, static_ids, dynamic_ids, dynamic_lengths, conversion_rates): 148 | """ 149 | input: relative id 150 | static_ids: Batch_size * Field_size 151 | dynamic_ids: Batch_size * Field_size * Max_feature_size 152 | dynamic_lengths: Batch_size * Field_size 153 | return: Batch_size * 1, probs 154 | """ 155 | 156 | # embedding layers 157 | dynamic_embeddings = self.dyEmb(dynamic_ids, dynamic_lengths) 158 | 159 | static_embeddings = self.stEmb(static_ids) 160 | 161 | batch_size = static_embeddings.size()[0] 162 | 163 | # B*F*E 164 | all_embeddings = torch.cat([static_embeddings, dynamic_embeddings], 1) 165 | all_embeddings = all_embeddings.view(batch_size, self.n_fnames, self.n_fnames, self.embedding_size) 166 | 167 | # combine feature by multi 168 | all_mask = self.mask.view(1, self.n_fnames, self.n_fnames, 1).expand_as(all_embeddings) 169 | all_embeddings_ur = torch.masked_select(all_embeddings, all_mask) 170 | all_embeddings_1 = all_embeddings.transpose(2, 1) 171 | all_embeddings_ll = torch.masked_select(all_embeddings_1, all_mask) 172 | 173 | bi_embeddings = all_embeddings_ur * all_embeddings_ll 174 | 175 | # lr layer 176 | # static_lr_out = self.stLr(static_ids).view(batch_size, -1) 177 | # dynamic_lr_out = self.dyLr(dynamic_ids, dynamic_lengths).view(batch_size, -1) 178 | 179 | # one order 180 | all_concat_mask = self.concat_mask.view(1, 
self.n_fnames, self.n_fnames, 1).expand_as(all_embeddings) 181 | one_embeddings = torch.masked_select(all_embeddings, all_concat_mask) 182 | 183 | # embedding lr layer 184 | # B*F1*E + B*F2*E -> B*[F1+F2]*E -> B*[F*E] 185 | embedding_lr_in_1 = torch.cat([bi_embeddings.view(batch_size, -1), one_embeddings.view(batch_size, -1)], -1) 186 | if self.embLr_is_dropout: 187 | embedding_lr_in_1 = self.emb_dropout(embedding_lr_in_1) 188 | if self.batch_norm: 189 | embedding_lr_in_1 = self.batch_norm_1(embedding_lr_in_1) 190 | embedding_lr_out_1 = self.embLr1(embedding_lr_in_1) 191 | 192 | embedding_lr_in_2 = F.relu(embedding_lr_out_1) 193 | if self.embLr_is_dropout: 194 | embedding_lr_in_2 = self.emb_dropout(embedding_lr_in_2) 195 | if self.batch_norm: 196 | embedding_lr_in_2 = self.batch_norm_2(embedding_lr_in_2) 197 | embedding_lr_out_2 = self.embLr2(embedding_lr_in_2) 198 | 199 | embedding_lr_in_3 = F.relu(embedding_lr_out_2) 200 | if self.embLr_is_dropout: 201 | embedding_lr_in_3 = self.emb_dropout(embedding_lr_in_3) 202 | if self.batch_norm: 203 | embedding_lr_in_3 = self.batch_norm_3(embedding_lr_in_3) 204 | embedding_lr_out_3 = self.embLr3(embedding_lr_in_3) 205 | 206 | embedding_lr_in = F.relu(embedding_lr_out_3) 207 | # concat input and output 208 | embedding_lr_in = torch.cat([embedding_lr_in, embedding_lr_in_1], -1) 209 | if self.embLr_is_dropout: 210 | embedding_lr_in = self.emb_dropout(embedding_lr_in) 211 | if self.batch_norm: 212 | embedding_lr_in = self.batch_norm_out(embedding_lr_in) 213 | embedding_lr_out = self.embLr_out(embedding_lr_in) 214 | 215 | # output 216 | # print self.static_lr_out 217 | # print self.dynamic_lr_out 218 | # print self.embedding_lr_out 219 | # scores = self.bias + torch.sum(static_lr_out, -1) + torch.sum(dynamic_lr_out, -1) + torch.sum(embedding_lr_out, -1) 220 | scores = self.bias + torch.sum(embedding_lr_out, -1) 221 | 222 | # activate layer 223 | # self.probs = F.sigmoid(self.scores) 224 | 225 | return scores 226 | 227 | def 
get_loss(self, scores, labels): 228 | """ 229 | binary cross entropy loss 230 | """ 231 | labels = torch.autograd.Variable(torch.FloatTensor(labels), requires_grad=False) 232 | if self.use_cuda: 233 | labels = labels.cuda() 234 | 235 | # BCE loss 236 | loss = F.binary_cross_entropy_with_logits(scores, labels) 237 | 238 | # weighted BCE loss 239 | # weights = labels * 10.0 240 | # weights = weights.masked_fill_(labels.le(0.5), 1.0) 241 | # loss = F.binary_cross_entropy_with_logits(scores, labels, weights) 242 | 243 | # margin loss 244 | # labels = labels.masked_fill_(labels.le(0.5), -1) 245 | # loss = F.soft_margin_loss(scores, labels) 246 | 247 | # focal loss 248 | # scores = torch.sigmoid(scores).view(-1, 1) 249 | # scores = torch.cat([1.0-scores, scores], -1) 250 | # loss = self.fc_loss(scores, labels.long()) 251 | 252 | return loss 253 | 254 | 255 | if __name__ == '__main__': 256 | st_max_idxs = [4, 6] 257 | st_fnames = ["1", "2"] 258 | st_ids = {"1":[[2], [3]], "2":[[5],[1]]} 259 | 260 | dy_max_idxs = [4, 6] 261 | dy_fnames = ["1", "2"] 262 | dy_ids = {"1":[[2,1,3,0,0], [2,2,0,0,0]], "2":[[5,0,0,0,0],[5,5,5,5,5]]} 263 | dy_lengths = {"1":[3,2], "2":[1,5]}  # lengths match the non-padding id counts above 264 | 265 | reals = [1, 0] 266 | 267 | dyNffm = DyNFFM_concat_filter([st_fnames, dy_fnames], [st_max_idxs, dy_max_idxs], use_cuda=True) 268 | dyNffm.cuda() 269 | 270 | probs = dyNffm(st_ids, dy_ids, dy_lengths, None)  # conversion_rates is unused by forward 271 | print(probs) 272 | 273 | loss = dyNffm.get_loss(probs, reals) 274 | print(loss) 275 | 276 | -------------------------------------------------------------------------------- /src/NFFM_concat_triple.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # vim:fenc=utf-8 4 | # 5 | # Copyright © 2018 ouwj 6 | # 7 | # Distributed under terms of the MIT license.
8 | 9 | """ 10 | model: 11 | input: [single feature id] + [multi feature ids] 12 | embedding_layer: single : embedding multi: avg_embedding 13 | lr_layer: one-hot lr + embeddings_concat lr 14 | activate_layer: sigmoid 15 | """ 16 | 17 | import torch 18 | import torch.nn as nn 19 | import torch.nn.functional as F 20 | 21 | import time, sys 22 | 23 | from staticEmbedding import * 24 | from dynamicEmbedding import * 25 | from focal_loss import * 26 | from support_model import * 27 | 28 | 29 | class DyNFFM_concat_triple(nn.Module): 30 | def __init__(self, fnames, max_idxs, embedding_size=4, dropout_rate=None, batch_norm=True, use_cuda=True): 31 | """ 32 | fnames: feature names: [static feature names, dynamic feature names] 33 | max_idxs: max_idxs: [static max_idxs, dynamic max_idxs] 34 | embedding_sizes: size of embedding, [n_single_embedding, n_multi_embedding] 35 | dropout_rate: prob for dropout, set None if no dropout, 36 | use_cuda: bool, True for gpu or False for cpu 37 | """ 38 | 39 | super(DyNFFM_concat_triple, self).__init__() 40 | self.fnames = fnames 41 | self.max_idxs = max_idxs 42 | self.n_fnames = len(fnames[0]) + len(fnames[1]) 43 | self.embedding_size = embedding_size 44 | self.field_embedding_size = embedding_size * self.n_fnames 45 | self.dropout_rate = dropout_rate 46 | self.batch_norm = batch_norm 47 | self.use_cuda = use_cuda 48 | 49 | self.stEmb = StEmb( 50 | self.fnames[0], 51 | self.max_idxs[0], 52 | embedding_size=self.field_embedding_size, 53 | dropout_rate=self.dropout_rate, 54 | use_cuda=self.use_cuda 55 | ) 56 | self.dyEmb = DyEmb( 57 | self.fnames[1], 58 | self.max_idxs[1], 59 | embedding_size=self.field_embedding_size, 60 | dropout_rate=self.dropout_rate, 61 | method='avg', 62 | use_cuda=self.use_cuda 63 | ) 64 | 65 | # for one order 66 | self.one_stEmb = StEmb( 67 | self.fnames[0], 68 | self.max_idxs[0], 69 | embedding_size=self.embedding_size, 70 | dropout_rate=self.dropout_rate, 71 | use_cuda=self.use_cuda 72 | ) 73 | self.one_dyEmb = 
DyEmb( 74 | self.fnames[1], 75 | self.max_idxs[1], 76 | embedding_size=self.embedding_size, 77 | dropout_rate=self.dropout_rate, 78 | method='avg', 79 | use_cuda=self.use_cuda 80 | ) 81 | 82 | self.bias = torch.nn.Parameter(torch.zeros(1)) 83 | 84 | # focal loss layer 85 | self.fc_loss = FocalLoss(gamma=2) 86 | 87 | # mask for combination 88 | self.mask = [] 89 | self.triple_masks = [] 90 | all_fnames = fnames[0] + fnames[1] 91 | self.n_bi = 0 92 | for i in range(self.n_fnames): 93 | tmp = [] 94 | for j in range(self.n_fnames): 95 | if i >= j: 96 | # if i >= j or filter_same_features(all_fnames[i], all_fnames[j]): 97 | tmp.append(0) 98 | else: 99 | tmp.append(1) 100 | self.n_bi += 1 101 | 102 | tmp_triple_mask = [1] * self.n_fnames 103 | tmp_triple_mask[i] = 0 104 | tmp_triple_mask[j] = 0 105 | self.triple_masks.append(tmp_triple_mask) 106 | self.mask.append(tmp) 107 | self.mask = torch.autograd.Variable(torch.ByteTensor(self.mask)) 108 | # N_BI*N 109 | self.triple_masks = torch.autograd.Variable(torch.ByteTensor(self.triple_masks)) 110 | 111 | # embedding lr layer, 3 layers 112 | # bi-interaction + one-order-embedding + triple-interaction 113 | # self.embLr_input_dim = self.n_bi * self.embedding_size + self.n_fnames * (self.n_fnames + 1) * self.embedding_size + self.n_bi * (self.n_fnames - 2) 114 | self.embLr_input_dim = self.n_bi * self.embedding_size + self.n_fnames * (self.n_fnames + 1) * self.embedding_size + self.n_bi * self.n_fnames 115 | self.hidden_sizes = [self.embLr_input_dim, 512, 256, 128] 116 | self.n_linear_layers = len(self.hidden_sizes) - 1 117 | self.batch_norms = nn.ModuleList([nn.BatchNorm1d(self.hidden_sizes[i]) for i in range(self.n_linear_layers)]) 118 | self.embLrs = nn.ModuleList([nn.Linear(self.hidden_sizes[i], self.hidden_sizes[i+1]) for i in range(self.n_linear_layers)]) 119 | 120 | self.batch_norm_out = nn.BatchNorm1d(self.hidden_sizes[-1]) 121 | self.embLr_out = nn.Linear(self.hidden_sizes[-1], 1) 122 | 123 | self.embLr_is_dropout =
False 124 | if self.dropout_rate is not None: 125 | self.embLr_is_dropout = True 126 | self.emb_dropout = nn.Dropout(self.dropout_rate) 127 | 128 | 129 | # self.mask_len = [x + 1 for x in range(self.n_fnames)] 130 | if self.use_cuda: 131 | self.mask = self.mask.cuda() 132 | self.triple_masks = self.triple_masks.cuda() 133 | self.fc_loss = self.fc_loss.cuda() 134 | 135 | def load(self, model_path): 136 | self.load_state_dict(torch.load(model_path)) 137 | 138 | def save(self, model_path): 139 | torch.save(self.state_dict(), model_path) 140 | 141 | 142 | def forward(self, static_ids, dynamic_ids, dynamic_lengths, conversion_rates): 143 | """ 144 | input: relative id 145 | static_ids: Batch_size * Field_size 146 | dynamic_ids: Batch_size * Field_size * Max_feature_size 147 | dynamic_lengths: Batch_size * Field_size 148 | return: Batch_size * 1, probs 149 | """ 150 | 151 | # embedding layers 152 | dynamic_embeddings = self.dyEmb(dynamic_ids, dynamic_lengths) 153 | 154 | static_embeddings = self.stEmb(static_ids) 155 | 156 | batch_size = static_embeddings.size()[0] 157 | 158 | # B*F*E 159 | all_embeddings = torch.cat([static_embeddings, dynamic_embeddings], 1) 160 | all_embeddings = all_embeddings.view(batch_size, self.n_fnames, self.n_fnames, self.embedding_size) 161 | 162 | # combine feature by multi 163 | all_mask = self.mask.view(1, self.n_fnames, self.n_fnames, 1).expand_as(all_embeddings) 164 | all_embeddings_ur = torch.masked_select(all_embeddings, all_mask) 165 | all_embeddings_1 = all_embeddings.transpose(2, 1) 166 | all_embeddings_ll = torch.masked_select(all_embeddings_1, all_mask) 167 | 168 | bi_embeddings = all_embeddings_ur * all_embeddings_ll 169 | bi_embeddings = bi_embeddings.view(batch_size, self.n_bi, -1) 170 | 171 | # one-order embedding 172 | one_dynamic_embeddings = self.one_dyEmb(dynamic_ids, dynamic_lengths) 173 | one_static_embeddings = self.one_stEmb(static_ids) 174 | one_all_embeddings = torch.cat([one_static_embeddings, 
one_dynamic_embeddings], 1) 175 | 176 | # triple combine embeddings 177 | ''' 178 | # too slow 179 | tri_embeddings = [] 180 | for i in range(self.n_bi): 181 | tmp_triple_mask = self.triple_masks[i].view(1, self.n_fnames, 1).expand_as(one_all_embeddings) 182 | tmp_triple_embeddings = torch.masked_select(one_all_embeddings, tmp_triple_mask).view(batch_size, (self.n_fnames-2), -1) 183 | # B*(N-2)*E 184 | tmp_bi_embeddings = bi_embeddings[:, i, :].contiguous().view(batch_size, 1, -1) 185 | tmp_bi_embeddings = tmp_bi_embeddings.expand_as(tmp_triple_embeddings) 186 | # triple dot B*1*(N-2) 187 | tri_embeddings.append(torch.sum(tmp_bi_embeddings * tmp_triple_embeddings, -1).view(batch_size, 1, (self.n_fnames-2))) 188 | # B * N_BI * (N-2) 189 | tri_embeddings = torch.cat(tri_embeddings, 1) 190 | ''' 191 | 192 | ''' 193 | # out of memory because of [expand and masked_select] 194 | tmp_all_embeddings = one_all_embeddings.view(batch_size, 1, self.n_fnames, self.embedding_size).expand(batch_size, self.n_bi, self.n_fnames, self.embedding_size) 195 | tmp_triple_mask = self.triple_masks.view(1, self.n_bi, self.n_fnames, 1).expand_as(tmp_all_embeddings) 196 | tmp_triple_embeddings = torch.masked_select(tmp_all_embeddings, tmp_triple_mask).view(batch_size, self.n_bi, (self.n_fnames-2), -1) 197 | tmp_bi_embeddings = bi_embeddings.view(batch_size, self.n_bi, 1, -1).expand_as(tmp_triple_embeddings) 198 | # B * N_BI * (N-2) 199 | tri_embeddings = torch.sum(tmp_bi_embeddings * tmp_triple_embeddings, -1) 200 | ''' 201 | 202 | # multi all without masked_select 203 | tmp_triple_embeddings = one_all_embeddings.view(batch_size, 1, self.n_fnames, self.embedding_size).expand(batch_size, self.n_bi, self.n_fnames, self.embedding_size) 204 | tmp_bi_embeddings = bi_embeddings.view(batch_size, self.n_bi, 1, -1).expand_as(tmp_triple_embeddings) 205 | # B * N_BI * N (no mask here, so pair-self products are kept; matches embLr_input_dim) 206 | tri_embeddings = torch.sum(tmp_bi_embeddings * tmp_triple_embeddings, -1) 207 | 208 | 209 | # embedding lr layer 210 | #
B*F1*E + B*F2*E -> B*[F1+F2]*E -> B*[F*E] 211 | lr_out = torch.cat([bi_embeddings.view(batch_size, -1), all_embeddings.view(batch_size, -1), one_all_embeddings.view(batch_size, -1), tri_embeddings.view(batch_size, -1)], -1) 212 | for i in range(self.n_linear_layers): 213 | lr_in = lr_out 214 | if self.batch_norm: 215 | lr_in = self.batch_norms[i](lr_in) 216 | if self.embLr_is_dropout: 217 | lr_in = self.emb_dropout(lr_in)  # keep the result, otherwise dropout is a no-op 218 | lr_out = self.embLrs[i](lr_in) 219 | lr_out = F.relu(lr_out) 220 | 221 | embedding_lr_in = lr_out 222 | if self.embLr_is_dropout: 223 | embedding_lr_in = self.emb_dropout(embedding_lr_in) 224 | if self.batch_norm: 225 | embedding_lr_in = self.batch_norm_out(embedding_lr_in) 226 | embedding_lr_out = self.embLr_out(embedding_lr_in) 227 | 228 | # output 229 | # print self.static_lr_out 230 | # print self.dynamic_lr_out 231 | # print self.embedding_lr_out 232 | # scores = self.bias + torch.sum(static_lr_out, -1) + torch.sum(dynamic_lr_out, -1) + torch.sum(embedding_lr_out, -1) 233 | scores = self.bias + torch.sum(embedding_lr_out, -1) 234 | 235 | # activate layer 236 | # self.probs = F.sigmoid(self.scores) 237 | 238 | return scores 239 | 240 | def get_loss(self, scores, labels): 241 | """ 242 | binary cross entropy loss 243 | """ 244 | labels = torch.autograd.Variable(torch.FloatTensor(labels), requires_grad=False) 245 | if self.use_cuda: 246 | labels = labels.cuda() 247 | 248 | # BCE loss 249 | loss = F.binary_cross_entropy_with_logits(scores, labels) 250 | 251 | # weighted BCE loss 252 | # weights = labels * 10.0 253 | # weights = weights.masked_fill_(labels.le(0.5), 1.0) 254 | # loss = F.binary_cross_entropy_with_logits(scores, labels, weights) 255 | 256 | # margin loss 257 | # labels = labels.masked_fill_(labels.le(0.5), -1) 258 | # loss = F.soft_margin_loss(scores, labels) 259 | 260 | # focal loss 261 | # scores = torch.sigmoid(scores).view(-1, 1) 262 | # scores = torch.cat([1.0-scores, scores], -1) 263 | # loss = self.fc_loss(scores,
labels.long()) 264 | 265 | return loss 266 | 267 | 268 | if __name__ == '__main__': 269 | st_max_idxs = [4, 6] 270 | st_fnames = ["1", "2"] 271 | st_ids = {"1":[[2], [3]], "2":[[5],[1]]} 272 | 273 | dy_max_idxs = [4, 6] 274 | dy_fnames = ["1", "2"] 275 | dy_ids = {"1":[[2,1,3,0,0], [2,2,0,0,0]], "2":[[5,0,0,0,0],[5,5,5,5,5]]} 276 | dy_lengths = {"1":[3,1], "2":[2,5]} 277 | 278 | reals = [1, 0] 279 | 280 | dyNffm = DyNFFM_concat([st_fnames, dy_fnames], [st_max_idxs, dy_max_idxs], use_cuda=True) 281 | dyNffm.cuda() 282 | 283 | probs = dyNffm(st_ids, dy_ids, dy_lengths) 284 | print(probs) 285 | 286 | loss = dyNffm.get_loss(probs, reals) 287 | print(loss) 288 | 289 | -------------------------------------------------------------------------------- /src/NFM.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # vim:fenc=utf-8 4 | # 5 | # Copyright © 2018 ouwj 6 | # 7 | # Distributed under terms of the MIT license. 
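Both the masked triple-interaction code above and the pairwise layer in DyNFM below avoid an explicit Python double loop by tensor expansion: give the B×F×E embedding tensor two broadcast axes and a single multiply yields every ordered pair product at once. A NumPy sketch of the equivalence (toy shapes, not the repo's actual sizes):

```python
import numpy as np

# Toy shapes for illustration: B=batch, F=fields, E=embedding dim
B, F_, E = 2, 3, 4
rng = np.random.RandomState(0)
emb = rng.rand(B, F_, E)

# Double-loop version, as in the commented-out "too slow" code paths
loop_pairs = np.stack(
    [emb[:, i, :] * emb[:, j, :] for i in range(F_) for j in range(F_)],
    axis=1,
).reshape(B, F_, F_, E)

# Expansion version: insert two broadcast axes, one fused multiply,
# mirroring all_embeddings_row * all_embeddings_col in DyNFM.forward
row = emb[:, None, :, :]   # B x 1 x F x E (varies along j)
col = emb[:, :, None, :]   # B x F x 1 x E (varies along i)
pairs = row * col          # B x F x F x E, pairs[b, i, j] = emb_i * emb_j

assert np.allclose(loop_pairs, pairs)
```

The models then apply a triangular mask (`masked_select`) to keep only the i < j combinations, since the F×F grid contains each unordered pair twice plus the self-products on the diagonal.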
8 | 9 | """ 10 | model: 11 | input: [single feature id] + [multi feature ids] 12 | embedding_layer: single : embedding multi: avg_embedding 13 | lr_layer: one-hot lr + embeddings_concat lr 14 | activate_layer: sigmoid 15 | """ 16 | 17 | import torch 18 | import torch.nn as nn 19 | import torch.nn.functional as F 20 | 21 | import time, sys 22 | 23 | from staticEmbedding import * 24 | from dynamicEmbedding import * 25 | from support_model import * 26 | 27 | 28 | class DyNFM(nn.Module): 29 | def __init__(self, batch_size, field_sizes, total_feature_sizes, embedding_size=4, 30 | dropout_rate=None, batch_norm=True, use_cuda=True): 31 | """ 32 | batch_size: batch_size 33 | field_sizes: length of feature_sizes, [n_single_field, n_multi_field] 34 | total_feature_sizes: total feature size, [n_single_feature, n_multi_feature] 35 | embedding_sizes: size of embedding, [n_single_embedding, n_multi_embedding] 36 | dropout_rate: prob for dropout, set None if no dropout, 37 | use_cuda: bool, True for gpu or False for cpu 38 | """ 39 | if batch_norm: 40 | dropout_rate = None 41 | 42 | super(DyNFM, self).__init__() 43 | 44 | self.batch_size = batch_size 45 | self.field_sizes = field_sizes 46 | self.total_feature_sizes = total_feature_sizes 47 | self.embedding_size = embedding_size 48 | self.dropout_rate = dropout_rate 49 | self.batch_norm = batch_norm 50 | self.use_cuda = use_cuda 51 | 52 | self.stEmb = StEmb(self.batch_size, self.field_sizes[0], self.total_feature_sizes[0], embedding_size=self.embedding_size, dropout_rate=self.dropout_rate, use_cuda=self.use_cuda) 53 | self.dyEmb = DyEmb(self.batch_size, self.field_sizes[1], self.total_feature_sizes[1], embedding_size=self.embedding_size, dropout_rate=self.dropout_rate, method='avg', use_cuda=self.use_cuda) 54 | 55 | self.stLr = StEmb(self.batch_size, self.field_sizes[0], self.total_feature_sizes[0], embedding_size=1, dropout_rate=self.dropout_rate, use_cuda=self.use_cuda) 56 | self.dyLr = DyEmb(self.batch_size, 
self.field_sizes[1], self.total_feature_sizes[1], embedding_size=1, dropout_rate=self.dropout_rate, method='sum', use_cuda=self.use_cuda) 57 | 58 | self.bias = torch.nn.Parameter(torch.randn(1)) 59 | 60 | ''' 61 | if self.use_cuda: 62 | self.stEmb = self.stEmb.cuda() 63 | self.dyEmb = self.dyEmb.cuda() 64 | self.stLr = self.stLr.cuda() 65 | self.dyLr = self.dyLr.cuda() 66 | ''' 67 | 68 | # embedding lr layer, 3 layers 69 | self.all_field_size = self.field_sizes[0] + self.field_sizes[1] 70 | self.embLr_input_dim = self.all_field_size * (self.all_field_size - 1) // 2 * self.embedding_size 71 | self.hidden_size_1 = 512 72 | self.batch_norm_1 = nn.BatchNorm1d(self.embLr_input_dim) 73 | self.embLr1 = nn.Linear(self.embLr_input_dim, self.hidden_size_1) 74 | self.batch_norm_2 = nn.BatchNorm1d(self.hidden_size_1) 75 | self.hidden_size_2 = 512 76 | self.embLr2 = nn.Linear(self.hidden_size_1, self.hidden_size_2) 77 | self.batch_norm_3 = nn.BatchNorm1d(self.hidden_size_2) 78 | self.hidden_size_3 = 512 79 | self.embLr3 = nn.Linear(self.hidden_size_2, self.hidden_size_3) 80 | 81 | self.batch_norm_out = nn.BatchNorm1d(self.hidden_size_3) 82 | self.embLr_out = nn.Linear(self.hidden_size_3, 1) 83 | 84 | self.embLr_is_dropout = False 85 | if self.dropout_rate is not None: 86 | self.embLr_is_dropout = True 87 | self.emb_dropout = nn.Dropout(self.dropout_rate) 88 | 89 | # mask for combination 90 | self.mask_len = [x + 1 for x in range(self.all_field_size)] 91 | self.mask = make_mask(self.mask_len, self.all_field_size, fill_val=False) 92 | if self.use_cuda: 93 | self.mask = self.mask.cuda() 94 | 95 | def forward(self, static_ids, dynamic_ids, dynamic_lengths): 96 | """ 97 | input: relative id 98 | static_ids: Batch_size * Field_size 99 | dynamic_ids: Batch_size * Field_size * Max_feature_size 100 | dynamic_lengths: Batch_size * Field_size 101 | return: Batch_size * 1, probs 102 | """ 103 | 104 | # embedding layers 105 | dynamic_embeddings = self.dyEmb(dynamic_ids, dynamic_lengths) 106 
| 107 | static_embeddings = self.stEmb(static_ids) 108 | 109 | batch_size = static_embeddings.size()[0] 110 | 111 | # B*F*E 112 | all_embeddings = torch.cat([static_embeddings, dynamic_embeddings], 1) 113 | # field_size = all_embeddings.size()[1] 114 | field_size = self.field_sizes[0] + self.field_sizes[1] 115 | 116 | # combine feature by multi 117 | ''' 118 | self.combine_embeddings = [] 119 | for i in range(self.all_embeddings.size()[1]): 120 | for j in range(i+1, self.all_embeddings.size()[1]): 121 | new_emb = self.all_embeddings[:,i,:] * self.all_embeddings[:,j,:] 122 | self.combine_embeddings.append(new_emb) 123 | 124 | self.combine_embeddings = torch.cat(self.combine_embeddings, 1) 125 | ''' 126 | all_embeddings_row = all_embeddings.view(batch_size, 1, field_size, self.embedding_size).expand(batch_size, field_size, field_size, self.embedding_size) 127 | all_embeddings_col = all_embeddings.view(batch_size, field_size, 1, self.embedding_size).expand(batch_size, field_size, field_size, self.embedding_size) 128 | all_embeddings_combine = all_embeddings_col * all_embeddings_row 129 | all_mask = self.mask.view(1, field_size, field_size, 1).expand(batch_size, field_size, field_size, self.embedding_size) 130 | combine_embeddings = torch.masked_select(all_embeddings_combine, all_mask) 131 | 132 | # lr layer 133 | static_lr_out = self.stLr(static_ids).view(batch_size, -1) 134 | 135 | dynamic_lr_out = self.dyLr(dynamic_ids, dynamic_lengths).view(batch_size, -1) 136 | 137 | # embedding lr layer 138 | # B*F1*E + B*F2*E -> B*[F1+F2]*E -> B*[F*E] 139 | embedding_lr_in_1 = combine_embeddings.view(batch_size, -1) 140 | if self.embLr_is_dropout: 141 | embedding_lr_in_1 = self.emb_dropout(embedding_lr_in_1) 142 | if self.batch_norm: 143 | embedding_lr_in_1 = self.batch_norm_1(embedding_lr_in_1) 144 | embedding_lr_out_1 = self.embLr1(embedding_lr_in_1) 145 | 146 | embedding_lr_in_2 = F.relu(embedding_lr_out_1) 147 | if self.embLr_is_dropout: 148 | embedding_lr_in_2 = 
self.emb_dropout(embedding_lr_in_2) 149 | if self.batch_norm: 150 | embedding_lr_in_2 = self.batch_norm_2(embedding_lr_in_2) 151 | embedding_lr_out_2 = self.embLr2(embedding_lr_in_2) 152 | 153 | embedding_lr_in_3 = F.relu(embedding_lr_out_2) 154 | if self.embLr_is_dropout: 155 | embedding_lr_in_3 = self.emb_dropout(embedding_lr_in_3) 156 | if self.batch_norm: 157 | embedding_lr_in_3 = self.batch_norm_3(embedding_lr_in_3) 158 | embedding_lr_out_3 = self.embLr3(embedding_lr_in_3) 159 | 160 | embedding_lr_in = F.relu(embedding_lr_out_3) 161 | if self.embLr_is_dropout: 162 | embedding_lr_in = self.emb_dropout(embedding_lr_in) 163 | if self.batch_norm: 164 | embedding_lr_in = self.batch_norm_out(embedding_lr_in) 165 | embedding_lr_out = self.embLr_out(embedding_lr_in) 166 | 167 | # output 168 | # print self.static_lr_out 169 | # print self.dynamic_lr_out 170 | # print self.embedding_lr_out 171 | scores = self.bias + torch.sum(static_lr_out, -1) + torch.sum(dynamic_lr_out, -1) + torch.sum(embedding_lr_out, -1) 172 | 173 | # activate layer 174 | # self.probs = F.sigmoid(self.scores) 175 | 176 | return scores 177 | 178 | def get_loss(self, scores, labels): 179 | """ 180 | binary cross entropy loss 181 | """ 182 | labels = torch.autograd.Variable(labels, requires_grad=False) 183 | loss = F.binary_cross_entropy_with_logits(scores, labels) 184 | return loss 185 | 186 | 187 | if __name__ == '__main__': 188 | batch_size = 2 189 | 190 | dy_field_size = 2 191 | dy_total_feature_size = 8 192 | dy_ids = torch.LongTensor([[[2, 1, 3, 0, 0], [5, 0, 0, 0, 0]], [[2, 2, 0, 0, 0], [5, 5, 5, 5, 5]]]) 193 | dy_lengths = torch.LongTensor([[3, 1], [2, 5]]) 194 | 195 | st_field_size = 2 196 | st_total_feature_size = 6 197 | st_ids = torch.LongTensor([[1, 5], [2, 5]]) 198 | 199 | reals = torch.FloatTensor([1, 0]) 200 | 201 | dyNfm = DyNFM(batch_size, [st_field_size, dy_field_size], [st_total_feature_size, dy_total_feature_size], use_cuda=False) 202 | # dyNfm = dyNfm.cuda() 203 | 204 | probs = 
dyNfm(st_ids, dy_ids, dy_lengths) 205 | print(probs) 206 | 207 | loss = dyNfm.get_loss(probs, reals) 208 | print(loss) 209 | 210 | -------------------------------------------------------------------------------- /src/args.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # vim:fenc=utf-8 4 | # 5 | # Copyright © 2018 ouwj 6 | # 7 | # Distributed under terms of the MIT license. 8 | 9 | """ 10 | args 11 | """ 12 | 13 | class args: 14 | 15 | ignore_features = ['appIdInstall', 'appIdAction', 'marriageStatus', 'kw3', 'interest3', 'interest4', 'topic3'] 16 | 17 | user_static_features = ['house', 'education', 'LBS', 'consumptionAbility', 'gender', 'age', 'carrier'] 18 | ad_static_features = ['aid', 'advertiserId', 'campaignId', 'creativeId', 'creativeSize', 'adCategoryId','productId', 'productType'] 19 | user_dynamic_features = ['interest1', 'interest2', 'interest5', 'kw1', 'kw2', 'topic1', 'topic2', 'ct', 'os'] 20 | # user_dynamic_features = ['interest1'] 21 | # len_static_features = [x+'_len' for x in user_dynamic_features] + ['uid'] 22 | len_static_features = [x+'_len' for x in user_dynamic_features] + ['uid'] + ['uid_pos'] 23 | # len_static_features = [x+'_len' for x in user_dynamic_features] + ['uid'] + ['uid_pos'] + ['uid|{}_pos'.format(x) for x in ad_static_features[1:]] 24 | # len_static_features = [x+'_len' for x in user_dynamic_features] + ['uid'] + ['uid_pos'] + ['uid|{}_pos'.format(x) for x in ['advertiserId', 'campaignId', 'adCategoryId']] 25 | # len_static_features = ['interest1_len', 'interest2_len', 'interest5_len'] 26 | # len_static_features = [] # + ['uid'] 27 | 28 | # combine_features_1 = ['aid', 'productId', 'productType', 'advertiserId'] 29 | # combine_features_2 = ['LBS', 'gender', 'age', 'education', 'consumptionAbility'] 30 | combine_features_1 = [] 31 | combine_features_2 = [] 32 | combine_features = [x+'|'+y for x in combine_features_1 for y in 
combine_features_2] 33 | 34 | static_features = ad_static_features + user_static_features + len_static_features 35 | # static_features = ad_static_features + user_static_features 36 | dynamic_features = user_dynamic_features + combine_features 37 | 38 | all_features = static_features + dynamic_features 39 | 40 | root_train_data_path = '../data/' 41 | root_test_data_path = '../data/' 42 | root_data_path = '../data/' 43 | 44 | lr = 0.0015 45 | momentum = 0.9 46 | weight_decay = 0.00000 47 | dropout_rate = None 48 | batch_norm = True 49 | embedding_size = 16 50 | cut_threshold = 100 51 | 52 | use_cuda = True 53 | 54 | has_cr = False # has conversion rate features 55 | 56 | epochs = 1 57 | batch_size = 1024 58 | 59 | n_train_parts = 11 # fusai + chusai 60 | n_test1_parts = 3 61 | n_test2_parts = 3 62 | n_valid_parts = 1 63 | -------------------------------------------------------------------------------- /src/avg_submission.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # vim:fenc=utf-8 4 | # 5 | # Copyright © 2018 ouwj 6 | # 7 | # Distributed under terms of the MIT license. 8 | 9 | """ 10 | avg submission 11 | """ 12 | 13 | import sys 14 | 15 | total_cnt = len(sys.argv[1:]) 16 | total_res = [] 17 | 18 | cnt = 0 19 | for f in sys.argv[1:]: 20 | with open(f, 'r') as fin: 21 | idx = 0 22 | for line in fin: 23 | line = line.strip() 24 | if cnt == 0: 25 | total_res.append(float(line)) 26 | else: 27 | total_res[idx] += float(line) 28 | idx += 1 29 | cnt += 1 30 | 31 | for res in total_res: 32 | print('%.6f' % (res/total_cnt)) 33 | 34 | -------------------------------------------------------------------------------- /src/build_conversion_rate.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # vim:fenc=utf-8 4 | # 5 | # Copyright © 2018 ouwj 6 | # 7 | # Distributed under terms of the MIT license. 8 | 9 | """ 10 | 11 | """ 12 | 13 | import argparse 14 | import sys, os 15 | 16 | from DataLoader import * 17 | from args import * 18 | 19 | parser = argparse.ArgumentParser() 20 | parser.add_argument("--type_name", type=str, default="train_shuf", help="[train_shuf, test1, test2]") 21 | parser_args = parser.parse_args() 22 | assert parser_args.type_name in ["train", "train_shuf", "test1", "test2", "valid", "train_all"] 23 | 24 | is_train = 'train' in parser_args.type_name 25 | if is_train: 26 | args.root_data_path = args.root_train_data_path 27 | dataLoader = DataLoader(type_name=parser_args.type_name, is_train=is_train) 28 | 29 | n_parts = args.n_train_parts 30 | if parser_args.type_name == 'test1': 31 | n_parts = args.n_test1_parts 32 | elif parser_args.type_name == 'test2': 33 | n_parts = args.n_test2_parts 34 | elif parser_args.type_name == 'valid': 35 | n_parts = args.n_valid_parts 36 | 37 | 38 | # now_features = args.user_static_features 39 | now_features = args.ad_static_features + args.user_static_features + args.user_dynamic_features 40 | # now_features = args.dynamic_features 41 | file_path = os.path.join(args.root_data_path, "combine_{}.csv".format(parser_args.type_name)) 42 | for fname in now_features: 43 | sys.stderr.write('saving {} conversion rate\n'.format(fname)) 44 | dataLoader.build_conversion_rate(fname) 45 | # for p in range(n_parts): 46 | # dataLoader.save_conversion_rate(fname, p) 47 | -------------------------------------------------------------------------------- /src/build_len_max_idx.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # vim:fenc=utf-8 4 | # 5 | # Copyright © 2018 ouwj 6 | # 7 | # Distributed under terms of the MIT license. 
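build_conversion_rate.py delegates the actual statistic to `DataLoader.build_conversion_rate`, whose implementation is not shown in this excerpt. Conversion-rate features of this kind are typically a smoothed positive rate per feature value; the sketch below is an illustration of that idea only — the function name and the `alpha`/`beta` prior values are assumptions, not the repo's code:

```python
from collections import defaultdict

def conversion_rate(values, labels, alpha=1.0, beta=20.0):
    """Smoothed positive rate per feature value: (pos + alpha) / (total + beta).

    alpha/beta act as a prior, so rarely seen values shrink toward
    alpha/beta instead of snapping to exactly 0.0 or 1.0.
    """
    pos = defaultdict(float)
    total = defaultdict(float)
    for v, y in zip(values, labels):
        total[v] += 1.0
        if y == 1:
            pos[v] += 1.0
    return {v: (pos[v] + alpha) / (total[v] + beta) for v in total}

rates = conversion_rate(["a", "a", "b", "a"], [1, 0, 1, 1])
```

Smoothing matters because values seen only a handful of times would otherwise produce extreme rates and leak label noise into the model.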
8 | 9 | """ 10 | 11 | """ 12 | 13 | from args import * 14 | import numpy as np 15 | import sys, os 16 | import pickle 17 | 18 | args.root_data_path = args.root_train_data_path 19 | max_idxs_file_path = args.root_data_path + "infos/max_idxs/{}.pkl" 20 | 21 | 22 | def save_max_idx(fname, n_parts): 23 | max_idx = 0 24 | for name, n_parts in zip(['train_all'], [args.n_train_parts]): 25 | ### 26 | for p in range(n_parts): 27 | sys.stderr.write('loading {} part {}...\n'.format(fname, p)) 28 | bin_file_path = os.path.join(args.root_data_path, 'bin_files/{}/{}_{}.bin'.format(name, fname, p)) 29 | len_data = np.fromfile(bin_file_path, dtype=int) 30 | tmp = np.max(len_data) 31 | if tmp > max_idx: 32 | max_idx = tmp 33 | # pickle.dump(max_idx, open(max_idxs_file_path.format(fname+'_len'), 'w')) 34 | pickle.dump(max_idx+1, open(max_idxs_file_path.format(fname), 'w')) 35 | 36 | all_fnames = ['{}_len'.format(x) for x in args.user_dynamic_features] 37 | for fname in all_fnames: 38 | save_max_idx(fname, args.n_train_parts) 39 | -------------------------------------------------------------------------------- /src/build_pos_feature.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # vim:fenc=utf-8 4 | # 5 | # Copyright © 2018 ouwj 6 | # 7 | # Distributed under terms of the MIT license. 
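build_pos_feature.py below builds a per-value positive-occurrence count feature: the training rows are split into k folds, and each fold is encoded with counts computed on the *other* folds only (the `begin`/`end` skip in `build_count2idx`), so a row never counts its own label. A simplified sketch of that out-of-fold counting, without the reserved-index offsets the real code adds:

```python
def out_of_fold_pos_counts(values, labels, k=2):
    # fold boundaries, mirroring parts[i] = i * (nums // k) in count_pos_feature
    n = len(values)
    step = n // k
    bounds = [i * step for i in range(k)] + [n]

    result = []
    for i in range(k):
        lo, hi = bounds[i], bounds[i + 1]
        # count positives per value over everything OUTSIDE the current fold
        counts = {}
        for j, (v, y) in enumerate(zip(values, labels)):
            if lo <= j < hi:
                continue
            if y == 1:
                counts[v] = counts.get(v, 0) + 1
        # encode the held-out fold with those counts
        result.extend(counts.get(values[j], 0) for j in range(lo, hi))
    return result

feat = out_of_fold_pos_counts(["u1", "u1", "u2", "u1"], [1, 1, 0, 1], k=2)
# -> [1, 1, 0, 2]
```

Test rows are then encoded with counts from the full training set (the `build_count2idx(..., 1, 0)` call, whose empty skip range excludes nothing).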
8 | 9 | """ 10 | 11 | """ 12 | 13 | import numpy as np 14 | import sys, os 15 | import pickle 16 | from DataLoader import * 17 | from args import * 18 | 19 | def build_count2idx(data, labels, begin, end): 20 | total_count2idx = {"":0, -1:1, 0:2} 21 | pos_count2idx = {"":0, -1:1, 0:2} 22 | for i, d in enumerate(data): 23 | if i >= begin and i < end: 24 | continue 25 | if d not in total_count2idx: 26 | total_count2idx[d] = 2 27 | if d not in pos_count2idx: 28 | pos_count2idx[d] = 2 29 | total_count2idx[d] += 1 30 | if labels[i] == '1': 31 | pos_count2idx[d] += 1 32 | return total_count2idx, pos_count2idx 33 | 34 | def build_test_data(test_data, pos_count2idx): 35 | test_res = [] 36 | for d in test_data: 37 | if d not in pos_count2idx: 38 | test_res.append(1) 39 | else: 40 | test_res.append(pos_count2idx[d]) 41 | return test_res 42 | 43 | 44 | def count_pos_feature(train_data, labels, k=5, test1_data=None, test2_data=None): 45 | nums = len(train_data) 46 | last = nums 47 | interval = last // k 48 | parts = [] 49 | for i in range(k): 50 | parts.append(i * interval) 51 | parts.append(last) 52 | count_train_data = train_data[0:last] 53 | count_labels = labels[0:last] 54 | 55 | train_res = [] 56 | for i in range(k): 57 | sys.stderr.write("{}, part counting\n".format(i)) 58 | sys.stderr.write("{}, {}\n".format(parts[i], parts[i+1])) 59 | tmp = [] 60 | total_count2idx, pos_count2idx = build_count2idx(count_train_data, count_labels, parts[i], parts[i+1]) 61 | for j in range(parts[i],parts[i+1]): 62 | d = train_data[j] 63 | if d not in pos_count2idx: 64 | tmp.append(1) 65 | else: 66 | tmp.append(pos_count2idx[d]) 67 | train_res.extend(tmp) 68 | train_res = np.asarray(train_res) 69 | 70 | total_count2idx, pos_count2idx = build_count2idx(count_train_data, count_labels, 1, 0) 71 | 72 | test1_res = None 73 | if test1_data is not None: 74 | test1_res = build_test_data(test1_data, pos_count2idx) 75 | test1_res = np.asarray(test1_res) 76 | 77 | test2_res = None 78 | if test2_data is 
not None: 79 | test2_res = build_test_data(test2_data, pos_count2idx) 80 | test2_res = np.asarray(test2_res) 81 | 82 | max_idx = 0 83 | for key in pos_count2idx: 84 | if max_idx < pos_count2idx[key]: 85 | max_idx = pos_count2idx[key] 86 | max_idx += 1 87 | 88 | return train_res, test1_res, test2_res, max_idx 89 | 90 | def save_bin(data, dl, name, fname): 91 | for p in range(dl.n_parts): 92 | sys.stderr.write("saving {} part {}\n".format(name, p)) 93 | sp = p * dl.parts 94 | ep = (p+1) * dl.parts 95 | bin_file_path = os.path.join(args.root_data_path, 'bin_files/{}/{}_pos_{}.bin'.format(name, fname, p)) 96 | data[sp:ep].tofile(bin_file_path, format="%d") 97 | 98 | 99 | if __name__ == "__main__": 100 | train_dl = DataLoader(type_name="train_all", is_train=True) 101 | test1_dl = DataLoader(type_name="valid", is_train=False) 102 | test2_dl = DataLoader(type_name="test2", is_train=False) 103 | 104 | fnames = ['uid'] 105 | fnames += ['uid|'+x for x in args.ad_static_features[1:]] 106 | 107 | tmp_fnames = [] 108 | for fname in fnames: 109 | if fname not in tmp_fnames: 110 | tmp_fnames.append(fname) 111 | cf = fname.split('|') 112 | if len(cf) == 2: 113 | if cf[0] not in tmp_fnames: 114 | tmp_fnames.append(cf[0]) 115 | if cf[1] not in tmp_fnames: 116 | tmp_fnames.append(cf[1]) 117 | 118 | train_dl.load_data("../data/combine_train_all.csv", tmp_fnames) 119 | test1_dl.load_data("../data/combine_valid.csv", tmp_fnames) 120 | test2_dl.load_data("../data/combine_test2.csv", tmp_fnames) 121 | 122 | for fname in fnames: 123 | train_dl.combine_features(fname) 124 | test1_dl.combine_features(fname) 125 | test2_dl.combine_features(fname) 126 | 127 | for fname in fnames: 128 | if fname == 'label': 129 | continue 130 | train_data, test1_data, test2_data, max_idx = count_pos_feature(train_dl.id_data[fname], train_dl.id_data['label'], 5, test1_dl.id_data[fname], test2_dl.id_data[fname]) 131 | # train_data, test1_data, test2_data, max_idx = count_pos_feature(train_dl.id_data[fname], 
train_dl.id_data['label'], 5, test1_dl.id_data[fname], None) 132 | 133 | # save 134 | save_bin(train_data, train_dl, 'train_all', fname) 135 | save_bin(test1_data, test1_dl, 'valid', fname) 136 | save_bin(test2_data, test2_dl, 'test2', fname) 137 | 138 | # save_max_idx 139 | sys.stderr.write('max_idx: {}\n'.format(max_idx)) 140 | max_idxs_file_path = args.root_data_path + "infos/max_idxs/{}_pos.pkl".format(fname) 141 | pickle.dump(max_idx, open(max_idxs_file_path, 'w')) 142 | 143 | -------------------------------------------------------------------------------- /src/build_uid2idx.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # vim:fenc=utf-8 4 | # 5 | # Copyright © 2018 ouwj 6 | # 7 | # Distributed under terms of the MIT license. 8 | 9 | """ 10 | 11 | """ 12 | 13 | import os, sys 14 | import pickle 15 | import numpy as np 16 | 17 | from args import * 18 | 19 | file_format = os.path.join(args.root_data_path, "combine_{}.csv") 20 | # files = ['train_all', 'test1', 'test2', 'valid'] 21 | files = ['train_all'] 22 | 23 | uid2cnt = {'':0, '-1':1, '':2} 24 | for f in files: 25 | cnt = 1 26 | file_name = file_format.format(f) 27 | with open(file_name, 'r') as fin: 28 | fnames = fin.readline().strip().split(',') 29 | # find uid 30 | idx = 0 31 | for i in range(len(fnames)): 32 | if fnames[i] == 'uid': 33 | idx = i 34 | break 35 | 36 | for line in fin: 37 | datas = line.strip().split(',') 38 | uid = datas[idx] 39 | if uid not in uid2cnt: 40 | uid2cnt[uid] = 2 41 | uid2cnt[uid] += 1 42 | 43 | if cnt % 1000000 == 0: 44 | sys.stderr.write('loading {} part {}...\n'.format(f, cnt)) 45 | cnt += 1 46 | 47 | uid2idx_file = os.path.join(args.root_data_path, "feature2idx/uid2idx.pkl") 48 | # uid2cnt = pickle.load(open(uid2idx_file, 'rb')) 49 | 50 | max_idx = 0 51 | for uid in uid2cnt: 52 | # uid2cnt[uid] = int(np.log(uid2cnt[uid] ** 2 + 1)) 53 | if uid2cnt[uid] > max_idx: 54 | max_idx = 
uid2cnt[uid] 55 | sys.stderr.write('max_idx:{}\n'.format(max_idx)) 56 | uid2cnt['max_idx'] = max_idx + 1 57 | 58 | # save 59 | pickle.dump(uid2cnt, open(uid2idx_file, 'wb')) 60 | 61 | -------------------------------------------------------------------------------- /src/change_name_idx.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # vim:fenc=utf-8 4 | # 5 | # Copyright © 2018 ouwj 6 | # 7 | # Distributed under terms of the MIT license. 8 | 9 | """ 10 | 11 | """ 12 | 13 | import sys, os 14 | 15 | dir_path = sys.argv[1] 16 | names = os.listdir(dir_path) 17 | for name in names: 18 | pre_name, pos_name = name.split('.') 19 | parts = pre_name.split('_') 20 | parts[-1] = str(int(parts[-1]) + 10) 21 | new_name = '_'.join(parts) + '.' + pos_name 22 | # new_name = name.replace('bin', '.bin') 23 | print(new_name) 24 | 25 | os.system('mv {} {}'.format(dir_path+'/'+name, dir_path+'/'+new_name)) 26 | -------------------------------------------------------------------------------- /src/check_labels.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # vim:fenc=utf-8 4 | # 5 | # Copyright © 2018 ouwj 6 | # 7 | # Distributed under terms of the MIT license. 
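build_uid2idx.py above implements the README's idea of indexing `uid` by its occurrence frequency instead of a conventional LabelEncoder: every uid maps to how often it appears, so users with the same activity level share one embedding row instead of each raw id getting its own (too sparse to learn). Stripped of the reserved low indices and `max_idx` bookkeeping, the core is:

```python
def frequency_encode(uids):
    # map each uid to its occurrence count; in build_uid2idx.py the
    # counts start from a reserved offset and special ids take 0..2
    counts = {}
    for uid in uids:
        counts[uid] = counts.get(uid, 0) + 1
    return [counts[uid] for uid in uids]

codes = frequency_encode(["a", "b", "a", "c", "a"])
# -> [3, 1, 3, 1, 3]  ("b" and "c" collapse onto the same index)
```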
8 | 9 | """ 10 | 11 | """ 12 | 13 | import sys 14 | 15 | valid_file = sys.argv[1] 16 | 17 | label_cnt = [0, 0] 18 | with open(valid_file, 'r') as fin: 19 | fin.readline() 20 | for line in fin: 21 | # find label 22 | parts = line.strip().split(',') 23 | label = parts[2] 24 | label_cnt[int(label)] += 1 25 | 26 | print label_cnt 27 | -------------------------------------------------------------------------------- /src/combine_data.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """ 3 | 拼表和生成csv文件 4 | 5 | """ 6 | 7 | import numpy as np 8 | import pandas as pd 9 | import os 10 | import sys 11 | 12 | from args import * 13 | from time import time 14 | 15 | # usage: python combine_data.py [chusai/fusai] [train/test1/test2] 16 | 17 | assert sys.argv[1] in ['chusai', 'fusai'] 18 | assert sys.argv[2] in ['train', 'test1', 'test2'] 19 | 20 | is_train = 'train' in sys.argv[2] 21 | 22 | ad_feature_path = '../data/{}/adFeature.csv'.format(sys.argv[1]) 23 | user_feature_path = '../data/{}/userFeature.data'.format(sys.argv[1]) 24 | raw_path = '../data/{}/{}.csv'.format(sys.argv[1], sys.argv[2]) 25 | 26 | ad_feature=pd.read_csv(ad_feature_path) 27 | 28 | userFeature_data = [] 29 | user_feature = None 30 | with open(user_feature_path, 'r') as f: 31 | for i, line in enumerate(f): 32 | line = line.strip().split('|') 33 | userFeature_dict = {} 34 | for each in line: 35 | each_list = each.split(' ') 36 | userFeature_dict[each_list[0]] = ' '.join(each_list[1:]) 37 | userFeature_data.append(userFeature_dict) 38 | if i % 100000 == 0: 39 | print(i) 40 | user_feature = pd.DataFrame(userFeature_data) 41 | user_feature['uid'] = user_feature['uid'].apply(int) 42 | 43 | raw_data = pd.read_csv(raw_path) 44 | if is_train: 45 | raw_data.loc[raw_data['label']==-1,'label']=0 46 | else: 47 | raw_data['label']=-1 48 | 49 | data=pd.merge(raw_data,ad_feature,on='aid',how='left') 50 | data=pd.merge(data,user_feature,on='uid',how='left') 51 | 
data=data.fillna('-1') 52 | 53 | if sys.argv[1] == 'fusai': 54 | data.to_csv(args.root_data_path + '../data/combine_{}.csv'.format(sys.argv[2]), index=False) 55 | else: 56 | data.to_csv(args.root_data_path + '../data/combine_{}_{}.csv'.format(sys.argv[1], sys.argv[2]), index=False) 57 | 58 | 59 | 60 | 61 | 62 | -------------------------------------------------------------------------------- /src/dynamicEmbedding.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # vim:fenc=utf-8 4 | # 5 | # Copyright © 2018 ouwj 6 | # 7 | # Distributed under terms of the MIT license. 8 | 9 | """ 10 | build the embedding of dynamic matrix [Batch*Field_size*Dynamic_Feature_Size] 11 | """ 12 | 13 | import torch 14 | import torch.nn as nn 15 | import torch.nn.functional as F 16 | 17 | from args import * 18 | 19 | 20 | class DyEmb(nn.Module): 21 | def __init__(self, fnames, max_idxs, embedding_size=4, dropout_rate=None, method='avg', use_cuda=True): 22 | """ 23 | fnames: feature names 24 | max_idxs: array of max_idx of each feature 25 | embedding_size: size of embedding 26 | dropout: prob for dropout, set None if no dropout 27 | method: 'avg' or 'sum' 28 | use_cuda: bool, True for gpu or False for cpu 29 | """ 30 | super(DyEmb, self).__init__() 31 | 32 | assert method in ['avg', 'sum'] 33 | 34 | self.fnames = fnames 35 | self.max_idxs = max_idxs 36 | self.embedding_size = embedding_size 37 | self.dropout_rate = dropout_rate 38 | self.method = method 39 | self.use_cuda = use_cuda 40 | 41 | # initial layer 42 | self.embeddings = nn.ModuleList([nn.Embedding(max_idx, self.embedding_size, padding_idx=0) for max_idx in self.max_idxs]) 43 | 44 | self.is_dropout = False 45 | if self.dropout_rate is not None: 46 | self.is_dropout = True 47 | self.dropout = nn.Dropout(p=self.dropout_rate) 48 | 49 | def forward(self, dynamic_ids, dynamic_lengths): 50 | """ 51 | input: relative id 52 | dynamic_ids: 
Batch_size * Field_size * Max_feature_size 53 | dynamic_lengths: Batch_size * Field_size 54 | return: Batch_size * Field_size * Embedding_size 55 | """ 56 | 57 | concat_embeddings = [] 58 | for i, key in enumerate(self.fnames): 59 | # B*M 60 | dynamic_ids_tensor = torch.autograd.Variable(torch.LongTensor(dynamic_ids[key])) 61 | dynamic_lengths_tensor = torch.autograd.Variable(torch.FloatTensor(dynamic_lengths[key])) 62 | if self.use_cuda: 63 | dynamic_ids_tensor = dynamic_ids_tensor.cuda() 64 | dynamic_lengths_tensor = dynamic_lengths_tensor.cuda() 65 | 66 | batch_size = dynamic_ids_tensor.size()[0] 67 | max_feature_size = dynamic_ids_tensor.size()[-1] 68 | 69 | # embedding layer B*M*E 70 | dynamic_embeddings_tensor = self.embeddings[i](dynamic_ids_tensor) 71 | 72 | # dropout 73 | if self.is_dropout: 74 | dynamic_embeddings_tensor = self.dropout(dynamic_embeddings_tensor) 75 | 76 | # average B*M*E --AVG--> B*E 77 | dynamic_embedding = torch.sum(dynamic_embeddings_tensor, 1) 78 | 79 | if self.method == 'avg': 80 | # B*E -> B*1*E 81 | dynamic_lengths_tensor = dynamic_lengths_tensor.view(-1, 1).expand_as(dynamic_embedding) 82 | dynamic_embedding = dynamic_embedding / dynamic_lengths_tensor 83 | concat_embeddings.append(dynamic_embedding.view(batch_size, 1, self.embedding_size)) 84 | # B*F*E 85 | concat_embeddings = torch.cat(concat_embeddings, 1) 86 | return concat_embeddings 87 | 88 | 89 | if __name__ == '__main__': 90 | # test 91 | max_idxs = [4, 6] 92 | fnames = ["1", "2"] 93 | ids = {"1":[[2,1,3,0,0], [2,2,0,0,0]], "2":[[5,0,0,0,0],[5,5,5,5,5]]} 94 | lengths = {"1":[3,2], "2":[1,5]} 95 | 96 | 97 | dyEmb = DyEmb(fnames, max_idxs, use_cuda=False) 98 | 99 | avg_embeddings = dyEmb(ids, lengths) 100 | 101 | print avg_embeddings 102 | 103 | 104 | 105 | -------------------------------------------------------------------------------- /src/focal_loss.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # vim:fenc=utf-8 4 | # 5 | # Copyright © https://github.com/clcarwin/focal_loss_pytorch 6 | # 7 | # Distributed under terms of the MIT license. 8 | 9 | 10 | import torch 11 | import torch.nn as nn 12 | import torch.nn.functional as F 13 | from torch.autograd import Variable 14 | 15 | class FocalLoss(nn.Module): 16 | def __init__(self, gamma=0, alpha=None, size_average=True): 17 | super(FocalLoss, self).__init__() 18 | self.gamma = gamma 19 | self.alpha = alpha 20 | if isinstance(alpha,(float,int,long)): self.alpha = torch.Tensor([alpha,1-alpha]) 21 | if isinstance(alpha,list): self.alpha = torch.Tensor(alpha) 22 | self.size_average = size_average 23 | 24 | def forward(self, input, target): 25 | if input.dim()>2: 26 | input = input.view(input.size(0),input.size(1),-1) # N,C,H,W => N,C,H*W 27 | input = input.transpose(1,2) # N,C,H*W => N,H*W,C 28 | input = input.contiguous().view(-1,input.size(2)) # N,H*W,C => N*H*W,C 29 | target = target.view(-1,1) 30 | 31 | logpt = torch.log(input) 32 | logpt = logpt.gather(1,target) 33 | logpt = logpt.view(-1) 34 | pt = Variable(logpt.data.exp()) 35 | 36 | if self.alpha is not None: 37 | if self.alpha.type()!=input.data.type(): 38 | self.alpha = self.alpha.type_as(input.data) 39 | at = self.alpha.gather(0,target.data.view(-1)) 40 | logpt = logpt * Variable(at) 41 | 42 | loss = -1 * (1-pt)**self.gamma * logpt 43 | if self.size_average: return loss.mean() 44 | else: return loss.sum() 45 | -------------------------------------------------------------------------------- /src/main.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # vim:fenc=utf-8 4 | # 5 | # Copyright © 2018 ouwj 6 | # 7 | # Distributed under terms of the MIT license. 
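focal_loss.py above (adapted from clcarwin's focal_loss_pytorch) takes class *probabilities*, not logits — NFFM's commented-out caller first applies sigmoid and stacks `[1-p, p]`. Per example the binary loss is -(1-p_t)^γ·log(p_t): with γ=0 it reduces to ordinary cross-entropy, and larger γ down-weights well-classified examples. A dependency-free sketch of the same formula:

```python
import math

def binary_focal_loss(p, y, gamma=2.0, alpha=None):
    """p: predicted P(y=1); y: label in {0, 1}."""
    pt = p if y == 1 else 1.0 - p
    loss = -((1.0 - pt) ** gamma) * math.log(pt)
    if alpha is not None:
        # FocalLoss above stores [alpha, 1-alpha] and gathers by target,
        # so class 0 is weighted by alpha and class 1 by 1-alpha
        loss *= (1.0 - alpha) if y == 1 else alpha
    return loss

easy = binary_focal_loss(0.9, 1)  # confident and correct: heavily down-weighted
hard = binary_focal_loss(0.1, 1)  # confident and wrong: close to full CE
```

With the competition's heavily imbalanced labels, the focusing term keeps the abundant easy negatives from dominating the gradient, which is presumably why it sits alongside the weighted-BCE and margin-loss variants commented out in the models.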
8 | 9 | """ 10 | train and test 11 | """ 12 | 13 | import torch 14 | import torch.nn as nn 15 | 16 | from sklearn.metrics import roc_auc_score 17 | 18 | import numpy as np 19 | import sys 20 | import time, datetime 21 | import pickle 22 | import random 23 | 24 | from DataLoader import * 25 | from NFFM import * 26 | from NFFM_concat import * 27 | from NFFM_concat_dot import * 28 | from NFFM_concat_triple import * 29 | from args import * 30 | from support_model import * 31 | 32 | def fit(model, data_loader, st_fnames, dy_fnames, parts, model_name, valid_data_loader=None): 33 | model.train() 34 | optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay) 35 | for epoch in range(1, args.epochs+1): 36 | stime = time.time() 37 | cnt = 0 38 | total_loss = 0.0 39 | random_parts = parts 40 | random.shuffle(random_parts) 41 | for part in random_parts: 42 | data_loader.load_bin(st_fnames+dy_fnames, part) 43 | if args.has_cr: 44 | data_loader.load_conversion_rate_from_bin(st_fnames+dy_fnames, part) 45 | data_loader.reset() 46 | data_loader.random_shuffle(st_fnames+dy_fnames) 47 | while True: 48 | data = data_loader.next_batch([st_fnames, dy_fnames], args.batch_size) 49 | if data is None: 50 | break 51 | 52 | st_ids = data[0] 53 | dy_ids = data[1] 54 | dy_lens = data[2] 55 | labels = data[3] 56 | 57 | # sys.stderr.write('{}\n'.format(st_ids)) 58 | # sys.stderr.write('{}\n'.format(dy_ids)) 59 | 60 | conversion_rates = data[4] 61 | 62 | optimizer.zero_grad() 63 | scores = model(st_ids, dy_ids, dy_lens, conversion_rates) 64 | loss = model.get_loss(scores, labels) 65 | total_loss += loss.data[0] 66 | loss.backward() 67 | optimizer.step() 68 | cnt += 1 69 | 70 | if cnt % 100 == 0: 71 | etime = time.time() 72 | sys.stderr.write("epoch:{} | batch:{} | avg loss:{} | cost:{}s\n".format(epoch, cnt, total_loss/100, etime-stime)) 73 | total_loss = 0.0 74 | stime = etime 75 | 76 | if valid_data_loader is not None: 77 | predict(model, valid_data_loader, 
st_fnames, dy_fnames, range(args.n_valid_parts), is_valid=True) 78 | model.train()  # predict() switches the model to eval mode; restore train mode for the next epoch 79 | args.lr = args.lr * 0.9 80 | optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay) 81 | 82 | # now_time_str = datetime.datetime.now().strftime("%Y-%m-%d~%H:%M:%S") 83 | # model_name = '../models/{}_{}_epoch_{}.pkl'.format(model_name, now_time_str, epoch) 84 | model_name = '../models/{}.pkl'.format(model_name) 85 | sys.stderr.write("saving model in {}...\n".format(model_name)) 86 | model.save(model_name) 87 | 88 | 89 | def predict(model, data_loader, st_fnames, dy_fnames, parts, is_valid=False): 90 | model.eval() 91 | total_loss = 0.0 92 | cnt = 1 93 | total_y_true = [] 94 | total_y_predict = [] 95 | for part in parts: 96 | data_loader.load_bin(st_fnames+dy_fnames, part) 97 | if args.has_cr: 98 | data_loader.load_conversion_rate_from_bin(st_fnames+dy_fnames, part) 99 | data_loader.reset() 100 | while True: 101 | data = data_loader.next_batch([st_fnames, dy_fnames], args.batch_size) 102 | if data is None: 103 | break 104 | 105 | st_ids = data[0] 106 | dy_ids = data[1] 107 | dy_lens = data[2] 108 | 109 | conversion_rates = data[4] 110 | 111 | scores = model(st_ids, dy_ids, dy_lens, conversion_rates) 112 | probs = torch.sigmoid(scores) 113 | 114 | if not is_valid: 115 | for p in probs.data: 116 | print("%.6f" % p) 117 | else: 118 | labels = data[3] 119 | total_y_true += [x for x in labels] 120 | total_y_predict += [x for x in scores.data] 121 | loss = model.get_loss(scores, labels) 122 | total_loss += loss.data[0] * len(labels) 123 | cnt += len(labels) 124 | if is_valid: 125 | total_auc = roc_auc_score(np.asarray(total_y_true), np.asarray(total_y_predict)) 126 | sys.stderr.write("valid loss:{} | valid auc:{}\n".format(total_loss/cnt, total_auc)) 127 | 128 | 129 | if __name__ == "__main__": 130 | import argparse 131 | parser = argparse.ArgumentParser() 132 | parser.add_argument("--type_name", type=str, default="train_shuf", help="[train, train_shuf, train_all, valid, test1, test2]") 
133 | parser.add_argument("--is_valid", type=int, default=0, help="args for train: 1 for valid, 0 for not valid") 134 | parser.add_argument("--model_name", type=str, default="", help="model_name") 135 | parser.add_argument("--model_path", type=str, default="", help="model_path for test load") 136 | parser_args = parser.parse_args() 137 | 138 | assert parser_args.type_name in ["train", "train_shuf", "test1", "test2", "train_all", "valid"] 139 | 140 | is_train = 'train' in parser_args.type_name 141 | if is_train: 142 | args.root_data_path = args.root_train_data_path 143 | data_loader = DataLoader(type_name=parser_args.type_name, is_train=is_train, has_cr=args.has_cr) 144 | 145 | st_fnames = args.static_features 146 | st_max_idxs = data_loader.get_max_idxs(st_fnames) 147 | dy_fnames = args.dynamic_features 148 | dy_max_idxs = data_loader.get_max_idxs(dy_fnames) 149 | 150 | # for cut 151 | data_loader.load_counts(st_fnames+dy_fnames) 152 | 153 | # for valid 154 | is_valid = parser_args.is_valid == 1 155 | valid_data_loader = None 156 | if is_valid: 157 | valid_data_loader = DataLoader(type_name="valid", is_train=is_train, has_cr=args.has_cr) 158 | valid_data_loader.load_max_idxs(st_fnames+dy_fnames) 159 | valid_data_loader.load_counts(st_fnames+dy_fnames) 160 | 161 | 162 | 163 | sys.stderr.write("st_features:{}\n".format(st_fnames)) 164 | sys.stderr.write("dy_features:{}\n".format(dy_fnames)) 165 | 166 | sys.stderr.write("building model...\n") 167 | if parser_args.model_name == 'NFFM': 168 | model = DyNFFM( 169 | [st_fnames, dy_fnames], 170 | [st_max_idxs, dy_max_idxs], 171 | embedding_size=args.embedding_size, 172 | dropout_rate = args.dropout_rate, 173 | batch_norm=args.batch_norm, 174 | use_cuda=args.use_cuda 175 | ) 176 | elif parser_args.model_name == 'NFFM_concat': 177 | model = DyNFFM_concat( 178 | [st_fnames, dy_fnames], 179 | [st_max_idxs, dy_max_idxs], 180 | embedding_size=args.embedding_size, 181 | dropout_rate = args.dropout_rate, 182 | 
batch_norm=args.batch_norm, 183 | use_cuda=args.use_cuda 184 | ) 185 | elif parser_args.model_name == 'NFFM_concat_triple': 186 | model = DyNFFM_concat_triple( 187 | [st_fnames, dy_fnames], 188 | [st_max_idxs, dy_max_idxs], 189 | embedding_size=args.embedding_size, 190 | dropout_rate = args.dropout_rate, 191 | batch_norm=args.batch_norm, 192 | use_cuda=args.use_cuda 193 | ) 194 | elif parser_args.model_name == 'NFFM_concat_dot': 195 | model = DyNFFM_concat_dot( 196 | [st_fnames, dy_fnames], 197 | [st_max_idxs, dy_max_idxs], 198 | embedding_size=args.embedding_size, 199 | dropout_rate = args.dropout_rate, 200 | batch_norm=args.batch_norm, 201 | use_cuda=args.use_cuda 202 | ) 203 | else: 204 | raise ValueError("unknown model_name: {}".format(parser_args.model_name)) 205 | 206 | model.apply(weight_init) 207 | if args.use_cuda: 208 | model.cuda() 209 | 210 | if "train" in parser_args.type_name: 211 | 212 | fit(model, data_loader, st_fnames, dy_fnames, range(args.n_train_parts), parser_args.model_name, valid_data_loader) 213 | 214 | elif "test" in parser_args.type_name: 215 | sys.stderr.write("loading model in {}...\n".format(parser_args.model_path)) 216 | model.load(parser_args.model_path) 217 | n_test_parts = args.n_test1_parts 218 | if parser_args.type_name == 'test2': 219 | n_test_parts = args.n_test2_parts 220 | predict(model, data_loader, st_fnames, dy_fnames, range(n_test_parts), is_valid=False) 221 | 222 | 223 | 224 | 225 | -------------------------------------------------------------------------------- /src/make_submission.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # vim:fenc=utf-8 4 | # 5 | # Copyright © 2018 ouwj 6 | # 7 | # Distributed under terms of the MIT license. 
8 | 9 | import sys 10 | 11 | 12 | f1 = open('../data/test2.csv') 13 | f2 = open(sys.argv[1]) 14 | f = open(sys.argv[2],'w') 15 | 16 | f.write('aid,uid,score\n') 17 | f1.readline() 18 | for line in f1: 19 | line = line.strip() +','+ f2.readline() 20 | f.write(line) 21 | f.close() -------------------------------------------------------------------------------- /src/merge_and_split_csv.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # vim:fenc=utf-8 4 | # 5 | # Copyright © 2018 ouwj 6 | # 7 | # Distributed under terms of the MIT license. 8 | 9 | """ 10 | merge the chusai and fusai csvs, shuffle them, and split out a validation set 11 | """ 12 | 13 | import sys, random 14 | 15 | is_shuf = True 16 | split_valid = 0.02 17 | 18 | merge_data = [] 19 | 20 | csvs = sys.argv[1:] 21 | for csv in csvs: 22 | with open(csv, 'r') as fin: 23 | head = fin.readline().strip() 24 | for line in fin: 25 | merge_data.append(line.strip()) 26 | 27 | # random shuffle 28 | total_len = len(merge_data) 29 | split = min(int(total_len * split_valid), 1000000) 30 | idxs = range(total_len) 31 | if is_shuf: 32 | random.shuffle(idxs) 33 | 34 | if split_valid > 0: 35 | train_idxs = idxs[0:(total_len-split)] 36 | valid_idxs = idxs[(total_len-split):] 37 | else: 38 | train_idxs = idxs 39 | valid_idxs = [] 40 | 41 | train_file_name = "../data/combine_train_all.csv" 42 | with open(train_file_name, 'w') as fout: 43 | fout.write(head+'\n') 44 | for i in train_idxs: 45 | fout.write(merge_data[i]+'\n') 46 | 47 | if split_valid > 0: 48 | valid_file_name = "../data/combine_valid.csv" 49 | with open(valid_file_name, 'w') as fout: 50 | fout.write(head+'\n') 51 | for i in valid_idxs: 52 | fout.write(merge_data[i]+'\n') 53 | 54 | -------------------------------------------------------------------------------- /src/pipeline.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/sh 2 | # 3 | # pipeline.sh 4 | # Copyright (C) 2018 ouwj 5 | # 6 | # Distributed under terms of the MIT license. 7 | # 8 | 9 | # create the directories the pipeline writes to 10 | mkdir -p ../data/bin_files 11 | mkdir -p ../data/bin_files/train_all 12 | mkdir -p ../data/bin_files/valid 13 | mkdir -p ../data/bin_files/test2 14 | mkdir -p ../data/infos 15 | mkdir -p ../data/infos/max_idxs 16 | mkdir -p ../data/infos/conversion_infos 17 | mkdir -p ../data/infos/train_all/max_lens 18 | mkdir -p ../data/infos/valid/max_lens 19 | mkdir -p ../data/infos/test2/max_lens 20 | mkdir -p ../data/feature2idx 21 | mkdir -p ../models 22 | mkdir -p ../result 23 | mkdir -p logs 24 | 25 | # combine the three raw data files 26 | python combine_data.py chusai train 27 | python combine_data.py fusai train 28 | python combine_data.py fusai test2 29 | 30 | # merge and split dataset 31 | python merge_and_split_csv.py ../data/combine_train.csv ../data/combine_chusai_train.csv 32 | 33 | # build uid2idx 34 | python build_uid2idx.py 35 | 36 | # build index data 37 | # type_name = [train_all, valid, test1, test2], but first must be train_all to build index dict 38 | python DataLoader.py --type_name train_all 39 | python DataLoader.py --type_name valid 40 | python DataLoader.py --type_name test2 41 | 42 | # build other max_idxs 43 | python build_len_max_idx.py 44 | 45 | # build pos count feature for train_all, valid and test2 46 | python build_pos_feature.py 47 | 48 | # count conversion rates and save them 49 | python build_conversion_rate.py --type_name train_all 50 | 51 | # train model 52 | CUDA_VISIBLE_DEVICES=0 python main.py --type_name train_all --is_valid 1 --model_name NFFM_concat 53 | 54 | # predict with the trained model 55 | CUDA_VISIBLE_DEVICES=0 python main.py --type_name test2 --is_valid 0 --model_name NFFM_concat --model_path ../models/NFFM_concat.pkl > ../result/NFFM_concat.csv 56 | 57 | # make submission 58 | python make_submission.py ../result/NFFM_concat.csv ../result/submission.csv 59 | 60 | 61 | 62 | 
-------------------------------------------------------------------------------- /src/random_shuf.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # vim:fenc=utf-8 4 | # 5 | # Copyright © 2018 ouwj 6 | # 7 | # Distributed under terms of the MIT license. 8 | 9 | """ 10 | shuffle the data lines of a csv file, keeping the header line first 11 | """ 12 | 13 | import sys 14 | import random 15 | 16 | file_name = sys.argv[1] 17 | with open(file_name, 'r') as fin: 18 | head = fin.readline() 19 | data = fin.readlines() 20 | idxs = range(len(data)) 21 | random.shuffle(idxs) 22 | 23 | print head.strip()  # head keeps its trailing newline; strip it so print does not emit a blank line 24 | for i in idxs: 25 | print data[i].strip() 26 | -------------------------------------------------------------------------------- /src/staticEmbedding.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # vim:fenc=utf-8 4 | # 5 | # Copyright © 2018 ouwj 6 | # 7 | # Distributed under terms of the MIT license. 
8 | 9 | """ 10 | build the embedding of static matrix [Batch*Field_size] 11 | """ 12 | 13 | import sys 14 | 15 | import torch 16 | import torch.nn as nn 17 | import torch.nn.functional as F 18 | 19 | class StEmb(nn.Module): 20 | def __init__(self, fnames, max_idxs, embedding_size=4, dropout_rate=None, use_cuda=True): 21 | """ 22 | fnames: feature names 23 | max_idxs: array of max_idx of each feature 24 | embedding_size: size of embedding 25 | dropout_rate: prob for dropout, set None if no dropout 26 | use_cuda: bool, True for gpu or False for cpu 27 | """ 28 | super(StEmb, self).__init__() 29 | self.fnames = fnames 30 | self.max_idxs = max_idxs 31 | self.embedding_size = embedding_size 32 | self.dropout_rate = dropout_rate 33 | self.use_cuda = use_cuda 34 | 35 | # initial layer 36 | self.embeddings = nn.ModuleList([nn.Embedding(max_idx, self.embedding_size, padding_idx=0) for max_idx in self.max_idxs]) 37 | 38 | self.is_dropout = False 39 | if self.dropout_rate is not None: 40 | self.is_dropout = True 41 | self.dropout = nn.Dropout(p=self.dropout_rate) 42 | 43 | def forward(self, static_ids): 44 | """ 45 | input: relative ids 46 | static_ids: dict mapping each fname to a Batch_size * 1 list of ids 47 | return: Batch_size * Field_size * Embedding_size 48 | """ 49 | 50 | concat_embeddings = [] 51 | # sys.stderr.write('{}\n'.format([static_ids[k].shape for k in self.fnames])) 52 | for i, key in enumerate(self.fnames): 53 | # B*1 54 | static_ids_tensor = torch.autograd.Variable(torch.LongTensor(static_ids[key])) 55 | if self.use_cuda: 56 | static_ids_tensor = static_ids_tensor.cuda() 57 | 58 | # embedding layer B*1*E 59 | static_embeddings_tensor = self.embeddings[i](static_ids_tensor) 60 | 61 | # dropout 62 | if self.is_dropout: 63 | static_embeddings_tensor = self.dropout(static_embeddings_tensor) 64 | 65 | concat_embeddings.append(static_embeddings_tensor) 66 | # B*F*E 67 | concat_embeddings = torch.cat(concat_embeddings, 1) 68 | 69 | return concat_embeddings 70 | 71 | 72 | if __name__ == '__main__': 73 
| # test 74 | max_idxs = [4, 6] 75 | fnames = ["1", "2"] 76 | ids = {"1":[[2], [3]], "2":[[5],[1]]} 77 | 78 | stEmb = StEmb(fnames, max_idxs, use_cuda=False) 79 | 80 | st_embeddings = stEmb(ids) 81 | 82 | print st_embeddings 83 | 84 | 85 | 86 | -------------------------------------------------------------------------------- /src/support_model.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # vim:fenc=utf-8 4 | # 5 | # Copyright © 2018 ouwj 6 | # 7 | # Distributed under terms of the MIT license. 8 | 9 | """ 10 | some utils for model 11 | """ 12 | 13 | import torch 14 | import torch.nn as nn 15 | import torch.nn.functional as F 16 | 17 | import pickle 18 | import sys 19 | 20 | from args import * 21 | 22 | def weight_init(m): 23 | # use isinstance to check which layer type m is 24 | if isinstance(m, nn.Linear): 25 | nn.init.xavier_uniform(m.weight.data) 26 | # nn.init.xavier_normal(m.weight.data) 27 | # nn.init.normal(m.weight.data, 0.0, 0.0001) 28 | m.bias.data.fill_(0.) 
29 | # nn.init.normal(m.bias.data, 0.0, 0.001) 30 | elif isinstance(m, nn.Embedding): 31 | nn.init.xavier_uniform(m.weight.data) 32 | # nn.init.xavier_normal(m.weight.data) 33 | # nn.init.normal(m.weight.data, 0.0, 0.0001) 34 | 35 | def make_mask(sen_lens, max_length, fill_val=True): 36 | """ 37 | sen_lens: list of sequence lengths, one per batch element 38 | max_length: the max sequence length (mask width) 39 | fill_val: the value filled in mask 40 | """ 41 | 42 | batch_size = len(sen_lens) 43 | mask = torch.ByteTensor(batch_size, max_length).fill_(not fill_val) 44 | for b in range(batch_size): 45 | mask[b, 0:sen_lens[b]] = fill_val 46 | mask = torch.autograd.Variable(mask, requires_grad=False) 47 | return mask 48 | 49 | def filter_same_features(fname1, fname2): 50 | in_ad_1 = fname1 in args.ad_static_features 51 | in_ad_2 = fname2 in args.ad_static_features 52 | in_user_1 = fname1 in (args.user_static_features + args.user_dynamic_features + args.len_static_features) 53 | in_user_2 = fname2 in (args.user_static_features + args.user_dynamic_features + args.len_static_features) 54 | in_ad = in_ad_1 and in_ad_2 55 | in_user = in_user_1 and in_user_2 56 | return in_ad or in_user 57 | --------------------------------------------------------------------------------
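
`make_mask` in support_model.py builds a per-row padding mask from sequence lengths. As a quick illustration of what it produces, here is a dependency-free sketch of the same logic on plain Python lists — `build_mask` is a hypothetical stand-in, not part of the repo; the real version returns a torch `ByteTensor` wrapped in a `Variable` instead of nested lists:

```python
def build_mask(sen_lens, max_length, fill_val=True):
    """Plain-list sketch of make_mask: fill_val marks real positions, its negation marks padding."""
    mask = []
    for length in sen_lens:
        # the first `length` slots are real tokens, the remaining slots are padding
        mask.append([fill_val] * length + [not fill_val] * (max_length - length))
    return mask

# three sequences of lengths 2, 4 and 1, padded to width 4
print(build_mask([2, 4, 1], 4))
# -> [[True, True, False, False], [True, True, True, True], [True, False, False, False]]
```

Such a mask is typically applied to the dynamic-feature embeddings so that padded positions do not contribute to the pooled representation.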