├── .idea
│   ├── .gitignore
│   ├── Kaggle OTTO – Multi-Objective Recommender System.iml
│   ├── inspectionProfiles
│   │   ├── Project_Default.xml
│   │   └── profiles_settings.xml
│   ├── misc.xml
│   └── modules.xml
├── README.md
├── candidates
│   └── generate_candidates.py
├── features
│   ├── __init__.py
│   ├── co_visitation_features.py
│   ├── item_features.py
│   ├── recall_features.py
│   ├── similarity_features.py
│   ├── user_features.py
│   └── user_item_features.py
├── merge_features.py
├── predict.py
├── preprocess
│   ├── BPRMF_ALSMF_LMF_prepare.py
│   ├── ProNE_prepare.py
│   ├── __init__.py
│   ├── co-visitation_matrix_prepare.py
│   └── deepwalk_prepare.py
└── ranker.py
/.idea/.gitignore:
--------------------------------------------------------------------------------
1 | # Default ignored files
2 | /shelf/
3 | /workspace.xml
4 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Kaggle_OTTO_Multi-Objective_Recommender_System
2 | Source code for the Kaggle OTTO Multi-Objective Recommender System competition. A single model scores 0.594, roughly rank 30 on the LB.
3 |
4 | ## Recall stage
5 | 1. Recall based on the user's historical sequence
6 |
7 | 2. Collaborative-filtering (co-visitation) recall (I2I)
8 |
9 | 3. Rule-based recall
10 |
11 | most clicked / most carted / most ordered / hot items with the highest composite score
12 |
13 | 4. Embedding-based recall (a minimal sketch follows below)
14 |
15 | deepwalk last week (I2I)
16 |
17 | deepwalk last month (I2I)
18 |
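A minimal sketch of the embedding-based I2I recall (the full implementation is in `candidates/generate_candidates.py`; the path and `topn` below are illustrative):

```python
import gensim
from collections import Counter

# DeepWalk vectors saved in word2vec text format, items keyed as "item_<aid>" (illustrative path)
w2v = gensim.models.KeyedVectors.load_word2vec_format("deepwalk_last_week.w2v", binary=False)

def deepwalk_i2i_recall(history_aids, topn=20, n_candidates=80):
    """Accumulate neighbour similarities over the user's history and keep the best aids."""
    counter = Counter()
    for aid in history_aids:
        key = f"item_{aid}"
        if key in w2v:  # skip items missing from the embedding vocabulary
            for neighbour, sim in w2v.similar_by_word(key, topn=topn):
                counter[neighbour] += sim
    return [int(k.split("_")[1]) for k, _ in counter.most_common(n_candidates)]
```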
19 | ## Ranking stage
20 |
21 | Build features for the candidates, use XGBoost as the ranking model, and produce the predictions.
22 |
23 | The features are constructed as follows (they are then joined onto the candidate table, as sketched below):
24 |
25 | Initial recall features (the features carried by the multiple recall strategies)
26 |
27 | item features
28 |
29 | user features
30 |
31 | user-item interaction features
32 |
33 | similarity features (including DeepWalk, ProNE and other similarity features)
34 |
35 | co-visitation features
36 |
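A minimal sketch of how these feature tables are joined onto the candidates (the actual logic lives in `merge_features.py`, which is not shown in this excerpt; file names and columns are illustrative):

```python
import pandas as pd

# Illustrative paths; the real tables are produced by the scripts under features/ and preprocess/
candidates = pd.read_parquet("candidates/candidate_order_0.pqt")   # session, aid, recall type/score
item_feats = pd.read_parquet("preprocess/item_features.pqt")       # indexed by aid
user_feats = pd.read_parquet("preprocess/user_features.pqt")       # indexed by session

merged = (candidates
          .merge(item_feats, left_on="aid", right_index=True, how="left")
          .merge(user_feats, left_on="session", right_index=True, how="left")
          .fillna(-1))  # sentinel for missing features
```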
37 | The training strategy is as follows:
38 |
39 | Train separate XGBoost models for clicks, carts and orders
40 |
41 | The items a user actually clicked/carted/ordered are positives; the remaining candidates are negatives
42 |
43 | Training uses all positives and samples 30 times as many negatives as positives (see the sketch below)
44 |
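A minimal sketch of the per-objective training with 1:30 negative downsampling (`ranker.py` is not shown in this excerpt; whether the original uses a binary or a ranking objective is not visible here, so treat the model choice, parameters and column names as illustrative):

```python
import pandas as pd
import xgboost as xgb

def train_one_objective(df, feature_cols, neg_ratio=30, seed=42):
    """df: one row per (session, candidate aid) with a binary 'label' column."""
    pos = df[df["label"] == 1]
    neg = df[df["label"] == 0]
    neg = neg.sample(n=min(len(neg), len(pos) * neg_ratio), random_state=seed)
    train = pd.concat([pos, neg]).sort_values("session")
    groups = train.groupby("session").size().values  # query group sizes for the ranker

    model = xgb.XGBRanker(objective="rank:pairwise", tree_method="hist",
                          n_estimators=500, learning_rate=0.1)
    model.fit(train[feature_cols], train["label"], group=groups)
    return model

# One model each for clicks, carts and orders.
```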
45 | ## Model improvement history
46 |
47 | 1. Hand-crafted rules with recall@20: LB score 0.577
48 |
49 | 2. Switched to a ranking model, increased the recall size (about 170 items per user on average) and added similarity features (mean and max) to the candidates: LB up to 0.585
50 |
51 | 3. Added embedding-based recall, further increased the recall size (about 220 items per user on average) and added co-visitation weight features (mean and max): LB up to 0.590
52 |
53 | 4. Added more similarity features (similarity between the candidate and each of the last three aids of the user sequence) and more co-visitation weight features (weight between the candidate and each of the last three aids of the user sequence): LB up to 0.594
54 |
55 | ## Approaches that were tried but did not work
56 |
57 | 1. ProNE graph-based user-item similarity features
58 |
59 | 2. BPRMF, ALSMF and LMF matrix-factorization user-item similarity features
60 |
61 | 3. Many item features, e.g. time-trend features and item click-to-buy-rate features
62 |
63 | 4. Many user features, e.g. click-to-buy time-gap features and user click-to-buy-rate features
64 |
65 | 5. Simple grid-search tuning of XGBoost: almost no improvement
66 |
67 | 6. Simple feature crosses: no improvement
68 |
--------------------------------------------------------------------------------
/candidates/generate_candidates.py:
--------------------------------------------------------------------------------
1 | # Recall strategy
2 | # 1. Recall from the user's historical sequence: all aids
3 | # 2. Co-visitation recall (I2I): 100 aids
4 | # 3. Rule-based recall:
5 | #    most clicked
6 | #    most carted
7 | #    most ordered
8 | #    hot items
9 | # 4. Embedding-based recall:
10 | #    deepwalk last week (I2I): 80 aids
11 | #    deepwalk last month (I2I): 80 aids
12 |
13 | # Computing recall@220!!!
14 | # clicks recall = 0.628
15 | # carts recall = 0.519
16 | # orders recall = 0.716
17 | # =============
18 | # Overall Recall = 0.6481
19 | # =============
20 |
21 | import gensim
22 | import pandas as pd, numpy as np
23 | import glob
24 | from collections import Counter
25 | import itertools
26 |
27 | type_labels = {'clicks': 0, 'carts': 1, 'orders': 2}
28 | VER = 6
29 | DISK_PIECES = 4
30 | IS_TRAIN = True
31 |
32 |
33 | def load_data(path):
34 | dfs = []
35 | for e, chunk_file in enumerate(glob.glob(path)):
36 | chunk = pd.read_parquet(chunk_file)
37 | chunk.ts = (chunk.ts / 1000).astype('int32')
38 | chunk['type'] = chunk['type'].map(type_labels).astype('int8')
39 | dfs.append(chunk)
40 | return pd.concat(dfs).reset_index(drop=True)
41 |
42 |
43 | def pqt_to_dict(df):
44 | df = df.loc[df.n < 20].drop('n', axis=1)
45 | # df['sim_aid_and_score'] = df['aid_y'].astype('str') + '#' + df['wgt'].astype('str')
46 | return df.groupby('aid_x').aid_y.apply(list).to_dict()
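# (added note) 'n' is the neighbour rank within each aid_x group, so df.n < 20 keeps the top-20 entries;
# the result is a plain dict {aid_x: [aid_y, ...]} that the suggest_* functions below index into.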
47 |
48 |
49 | if IS_TRAIN:
50 | stage = 'CV'
51 | data = ''
52 | else:
53 | stage = 'LB'
54 | data = 'all_data_'
55 |
56 | print('Loading raw data!!')
57 | test_df = load_data(f'/home/niejianfei/otto/{stage}/data/test_parquet/*')
58 | train_df = load_data(f'/home/niejianfei/otto/{stage}/data/*_parquet/*')
59 |
60 | print("Loading co-visitation matrices!!!")
61 | # LOAD THREE CO-VISITATION MATRICES
62 | top_20_clicks = pqt_to_dict(
63 | pd.read_parquet(f'/home/niejianfei/otto/{stage}/preprocess/{data}top_20_clicks_v{VER}_0.pqt'))
64 | for k in range(1, DISK_PIECES):
65 | top_20_clicks.update(
66 | pqt_to_dict(
67 | pd.read_parquet(f'/home/niejianfei/otto/{stage}/preprocess/{data}top_20_clicks_v{VER}_{k}.pqt')))
68 | top_20_buys = pqt_to_dict(
69 | pd.read_parquet(f'/home/niejianfei/otto/{stage}/preprocess/{data}top_15_carts_orders_v{VER}_0.pqt'))
70 | for k in range(1, DISK_PIECES):
71 | top_20_buys.update(
72 | pqt_to_dict(
73 | pd.read_parquet(f'/home/niejianfei/otto/{stage}/preprocess/{data}top_15_carts_orders_v{VER}_{k}.pqt')))
74 | top_20_buy2buy = pqt_to_dict(
75 | pd.read_parquet(f'/home/niejianfei/otto/{stage}/preprocess/{data}top_15_buy2buy_v{VER}_0.pqt'))
76 |
77 | print('Loading DeepWalk word vectors!!')
78 | word2vec_last_week = gensim.models.KeyedVectors.load_word2vec_format(
79 | f'/home/niejianfei/otto/{stage}/preprocess/deepwalk_last_week.w2v',
80 | binary=False)
81 | word2vec_last_month = gensim.models.KeyedVectors.load_word2vec_format(
82 | f'/home/niejianfei/otto/{stage}/preprocess/deepwalk_last_month.w2v',
83 | binary=False)
84 |
85 | # Rule-based recall: popular items
86 | print("Building top test items!!!")
87 | top_clicks = test_df.loc[test_df['type'] == 0, 'aid'].value_counts()[:200].to_dict()
88 | top_carts = test_df.loc[test_df['type'] == 1, 'aid'].value_counts()[:200].to_dict()
89 | top_orders = test_df.loc[test_df['type'] == 2, 'aid'].value_counts()[:200].to_dict()
90 |
91 | # Event-type weights
92 | type_weight_multipliers = {0: 1, 1: 5, 2: 4}
93 | print("Building test hot items!!!")
94 | test_df['score'] = test_df['type'].map(type_weight_multipliers)
95 | top_hot_items = test_df.groupby('aid')['score'].apply(lambda x: x.sum()) \
96 | .sort_values(ascending=False)[:200].to_dict()
97 | print('Building train hot items!!!')
98 | train_df['score'] = train_df['type'].map(type_weight_multipliers)
99 | top_hot_items_last_month = train_df.groupby('aid')['score'].apply(lambda x: x.sum()) \
100 | .sort_values(ascending=False)[:200].to_dict()
101 | print(top_hot_items_last_month)
102 | print('Building train click hot items!!!')
103 | train_df['score'] = 1
104 | top_clicks_items_last_month = train_df.groupby('aid')['score'].apply(lambda x: x.sum()) \
105 | .sort_values(ascending=False)[:200].to_dict()
106 | print(top_clicks_items_last_month)
107 |
108 |
109 | def suggest_clicks(df):
110 | # USER HISTORY AIDS AND TYPES
111 | aids = df.aid.tolist()
112 | types = df.type.tolist()
113 | # unique_aids = list(dict.fromkeys(aids[::-1]))
114 |
115 | # RERANK CANDIDATES USING WEIGHTS
116 | # Recall aids from the history sequence directly, split by weight
117 | # Geometric progression 2**0.1 - 2**1; is the weight gap too large? 0.07-1? Not great for long sequences
118 | weights = np.logspace(0.1, 1, len(aids), base=2, endpoint=True) - 1
119 | aids_temp = Counter()
120 | # RERANK BASED ON REPEAT ITEMS AND TYPE OF ITEMS
121 | # History-sequence recall: account for recency, prefer recently seen items
122 | for aid, w, t in zip(aids, weights, types):
123 | aids_temp[aid] += w * type_weight_multipliers[t]
124 | # A session length of 40 already covers 90% of the data
125 | history_aids = [k for k, v in aids_temp.most_common()]
126 | type_1 = [1] * len(history_aids)
127 | scores_1 = [v for k, v in aids_temp.most_common()]
128 | if len(set(scores_1)) == 1:
129 | scores_1 = [1] * len(scores_1)
130 | else:
131 | min_ = min(scores_1)
132 | max_ = max(scores_1)
133 | scores_1 = [(j - min_) / (max_ - min_) for j in scores_1]
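# (added note) Min-max normalise the history scores to [0, 1] so the recall score is comparable
# across sessions; when every aid has the same score the list collapses to all 1s in the branch above.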
134 |
135 | # Recall from the co-visitation (similarity) matrices
136 | # USE "CLICKS" CO-VISITATION MATRIX
137 | # The clicks matrix only uses time; the carts-orders matrix also uses the type of the similar item
138 | # Could be changed to recall similar items via sorted_aids ---------------sort <= unique
139 | aids2 = list(itertools.chain(*[top_20_clicks[aid] for aid in history_aids if aid in top_20_clicks]))
140 | aids3 = list(itertools.chain(*[top_20_buys[aid] for aid in history_aids if aid in top_20_buys]))
141 | # RERANK CANDIDATES: filter by Counter counts, ignore scores, the history sequence takes priority
142 | # Merge aids2 and aids3, which accounts for both the time weight and the type weight of similar items
143 | sim_aids_100 = [aid2 for aid2, cnt in Counter(aids2 + aids3).most_common(100)]
144 | type_2 = [2] * len(sim_aids_100)
145 | scores_2 = [cnt for aid2, cnt in Counter(aids2 + aids3).most_common(100)]
146 |
147 | # Rule-based recall (n items)
148 | # Recall 100 hot items, weighted by event type
149 | top_hot_items_100 = list(top_hot_items.keys())[:100]
150 | type_3 = [3] * (len(top_hot_items_100))
151 | score_3 = list(top_hot_items.values())[:100]
152 | # Recall the 100 most-clicked items
153 | top_clicks_100 = list(top_clicks.keys())[:100]
154 | type_4 = [4] * (len(top_clicks_100))
155 | score_4 = list(top_clicks.values())[:100]
156 | # Recall the 100 most-clicked items of the last month
157 | top_clicks_last_month_100 = list(top_clicks_items_last_month.keys())[:100]
158 | type_5 = [5] * (len(top_clicks_last_month_100))
159 | score_5 = list(top_clicks_items_last_month.values())[:100]
160 | # The 100 hottest items of the last month
161 | top_hot_items_one_month_100 = list(top_hot_items_last_month.keys())[:100]
162 | type_6 = [6] * (len(top_hot_items_one_month_100))
163 | score_6 = list(top_hot_items_last_month.values())[:100]
164 |
165 | # Embedding-based recall (160 items)
166 | # Recall 80 items with last-week DeepWalk
167 | temp_counter = Counter()
168 | for i in history_aids:
169 | if f'item_{i}' in word2vec_last_week:
170 | for j in word2vec_last_week.similar_by_word(f'item_{i}', topn=20):
171 | temp_counter[j[0]] += j[1]
172 | item_emb_deepwalk_last_week_80 = [int(aid2.split('_')[1]) for aid2, cnt in temp_counter.most_common(80)]
173 | type_7 = [7] * len(item_emb_deepwalk_last_week_80)
174 | score_7 = [cnt for aid2, cnt in temp_counter.most_common(80)]
175 |
176 | # Recall 80 items with the full-history (last-month) DeepWalk
177 | temp_counter1 = Counter()
178 | for i in history_aids:
179 | for j in word2vec_last_month.similar_by_word(f'item_{i}', topn=20):
180 | temp_counter1[j[0]] += j[1]
181 | item_emb_last_month_80 = [int(aid2.split('_')[1]) for aid2, cnt in temp_counter1.most_common(80)]
182 | type_8 = [8] * len(item_emb_last_month_80)
183 | score_8 = [cnt for aid2, cnt in temp_counter1.most_common(80)]
184 | # print(item_emb_deepwalk_last_week_80[0], score_7[0], item_emb_last_month_80[0], score_8[0])
185 |
186 | result = history_aids + sim_aids_100 + top_hot_items_100 + top_clicks_100 + top_clicks_last_month_100 + \
187 | top_hot_items_one_month_100 + item_emb_deepwalk_last_week_80 + item_emb_last_month_80
188 |
189 | type = type_1 + type_2 + type_3 + type_4 + type_5 + type_6 + type_7 + type_8
190 | score = scores_1 + scores_2 + score_3 + score_4 + score_5 + score_6 + score_7 + score_8
191 |
192 | info = [str(result[i]) + "#" + str(type[i]) + "#" + str(score[i]) for i in range(len(result))]
193 |
194 | return info
195 |
196 |
197 | def suggest_carts(df):
198 | # User history aids and types
199 | aids = df.aid.tolist()
200 | types = df.type.tolist()
201 |
202 | # UNIQUE AIDS AND UNIQUE BUYS
203 | unique_aids = list(dict.fromkeys(aids[::-1]))
204 | df = df.loc[(df['type'] == 0) | (df['type'] == 1)]
205 | unique_buys = list(dict.fromkeys(df.aid.tolist()[::-1]))
206 |
207 | # Rerank candidates using weights (time weights?)
208 | # Geometric progression 2**0.5 [0.414] -- 2**1 - 1; past carts and orders should stand out, so the time weight must not be too small
209 | weights = np.logspace(0.5, 1, len(aids), base=2, endpoint=True) - 1
210 | aids_temp = Counter()
211 |
212 | # Rerank based on repeat items and types of items
213 | # Recall using the aids information
214 | for aid, w, t in zip(aids, weights, types): # w: 0.414-1 types:1,5,4 min 0.414 max 5
215 | aids_temp[aid] += w * type_weight_multipliers[t]
216 | # Do not recall directly; recompute below with the matrix information
217 | # Rerank candidates using "top_20_carts" co-visitation matrix
218 | # Recall carts from buy2buy, using unique_buys
219 | # aids2 = list(itertools.chain(*[top_20_buys[aid] for aid in unique_buys if aid in top_20_buys]))
220 | # aids2 = list(itertools.chain(*[top_20_buy2buy[aid] for aid in unique_buys if aid in top_20_buy2buy]))
221 | # Add the buy2buy matrix output with weight +0.1
222 | # The history sequence still dominates; stay below the magnitude of the history weights, so 0.1 is reasonable
223 | # for aid in aids2: aids_temp[aid] += 0.1
224 | history_aids = [k for k, v in aids_temp.most_common()]
225 | type_1 = [1] * len(history_aids)
226 | scores_1 = [v for k, v in aids_temp.most_common()]
227 | if len(set(scores_1)) == 1:
228 | scores_1 = [1] * len(scores_1)
229 | else:
230 | min_ = min(scores_1)
231 | max_ = max(scores_1)
232 | scores_1 = [(j - min_) / (max_ - min_) for j in scores_1]
233 | # print(scores_1[1])
234 |
235 | # Use "cart order" and "clicks" co-visitation matrices
236 | # Click time-series recall: based on the history session, consider time and recall the most recent
237 | aids1 = list(itertools.chain(*[top_20_clicks[aid] for aid in history_aids if aid in top_20_clicks]))
238 | # carts-orders recall: recalled via aids here; using buys would also be defensible
239 | # Recall carts from clicks
240 | aids2 = list(itertools.chain(*[top_20_buys[aid] for aid in history_aids if aid in top_20_buys]))
241 | # Change 5: recall carts from unique_buys; since carts-orders matter, use the buy2buy matrix
242 | aids3 = list(itertools.chain(*[top_20_buy2buy[aid] for aid in unique_buys if aid in top_20_buy2buy]))
243 |
244 | # RERANK CANDIDATES
245 | sim_aids_100 = [aid2 for aid2, cnt in Counter(aids1 + aids2 + aids3).most_common(100) if aid2 not in history_aids]
246 | type_2 = [2] * len(sim_aids_100)
247 | scores_2 = [cnt for aid2, cnt in Counter(aids1 + aids2 + aids3).most_common(100) if aid2 not in history_aids]
248 |
249 | # Rule-based recall (200 items)
250 | # Recall 100 hot items, weighted by event type
251 | top_hot_items_100 = list(top_hot_items.keys())[:100]
252 | type_3 = [3] * (len(top_hot_items_100))
253 | score_3 = list(top_hot_items.values())[:100]
254 | # Recall the 100 most-ordered items
255 | top_orders_100 = list(top_orders.keys())[:100]
256 | type_4 = [4] * (len(top_orders_100))
257 | score_4 = list(top_orders.values())[:100]
258 | # Recall the 100 most-carted items
259 | top_carts_100 = list(top_carts.keys())[:100]
260 | type_5 = [5] * (len(top_carts_100))
261 | score_5 = list(top_carts.values())[:100]
262 | # The hottest items of the last month (top 150 here)
263 | top_hot_items_one_month_100 = list(top_hot_items_last_month.keys())[:150]
264 | type_6 = [6] * (len(top_hot_items_one_month_100))
265 | score_6 = list(top_hot_items_last_month.values())[:150]
266 |
267 | # Embedding-based recall (160 items)
268 | # Recall 80 items with last-week DeepWalk
269 | temp_counter = Counter()
270 | for i in history_aids:
271 | if f'item_{i}' in word2vec_last_week:
272 | for j in word2vec_last_week.similar_by_word(f'item_{i}', topn=20):
273 | temp_counter[j[0]] += j[1]
274 | item_emb_deepwalk_last_week_80 = [int(aid2.split('_')[1]) for aid2, cnt in temp_counter.most_common(80)]
275 | type_7 = [7] * len(item_emb_deepwalk_last_week_80)
276 | score_7 = [cnt for aid2, cnt in temp_counter.most_common(80)]
277 |
278 | # Recall 80 items with the full-history (last-month) DeepWalk
279 | temp_counter1 = Counter()
280 | for i in history_aids:
281 | for j in word2vec_last_month.similar_by_word(f'item_{i}', topn=20):
282 | temp_counter1[j[0]] += j[1]
283 | item_emb_last_month_80 = [int(aid2.split('_')[1]) for aid2, cnt in temp_counter1.most_common(80)]
284 | type_8 = [8] * len(item_emb_last_month_80)
285 | score_8 = [cnt for aid2, cnt in temp_counter1.most_common(80)]
286 | print(item_emb_deepwalk_last_week_80[0], score_7[0], item_emb_last_month_80[0], score_8[0])
287 |
288 | result = history_aids + sim_aids_100 + top_hot_items_100 + top_orders_100 + top_carts_100 + \
289 | top_hot_items_one_month_100 + item_emb_deepwalk_last_week_80 + item_emb_last_month_80
290 |
291 | type = type_1 + type_2 + type_3 + type_4 + type_5 + type_6 + type_7 + type_8
292 | score = scores_1 + scores_2 + score_3 + score_4 + score_5 + score_6 + score_7 + score_8
293 |
294 | info = [str(result[i]) + "#" + str(type[i]) + "#" + str(score[i]) for i in range(len(result))]
295 |
296 | return info
297 |
298 |
299 | def suggest_buys(df):
300 | # USER HISTORY AIDS AND TYPES
301 | aids = df.aid.tolist()
302 | types = df.type.tolist()
303 | # UNIQUE AIDS AND UNIQUE BUYS
304 | # unique_aids = list(dict.fromkeys(aids[::-1]))
305 | df = df.loc[(df['type'] == 1) | (df['type'] == 2)]
306 | unique_buys = list(dict.fromkeys(df.aid.tolist()[::-1]))
307 |
308 | # Recall 40 items from the history sequence
309 | # RERANK CANDIDATES USING WEIGHTS
310 | # Geometric progression 0.414-1
311 | weights = np.logspace(0.5, 1, len(aids), base=2, endpoint=True) - 1
312 | aids_temp = Counter()
313 | # RERANK BASED ON REPEAT ITEMS AND TYPE OF ITEMS
314 | for aid, w, t in zip(aids, weights, types):
315 | aids_temp[aid] += w * type_weight_multipliers[t]
316 | # Take 40 directly; if there are fewer, so be it
317 | history_aids = [k for k, v in aids_temp.most_common()]
318 | type_1 = [1] * len(history_aids)
319 | scores_1 = [v for k, v in aids_temp.most_common()]
320 | if len(set(scores_1)) == 1:
321 | scores_1 = [1] * len(scores_1)
322 | else:
323 | min_ = min(scores_1)
324 | max_ = max(scores_1)
325 | scores_1 = [(j - min_) / (max_ - min_) for j in scores_1]
326 |
327 | # Co-visitation recall (100 items)
328 | # USE "CART ORDER" CO-VISITATION MATRIX: recall orders from aids, correct!
329 | aids2 = list(itertools.chain(*[top_20_buys[aid] for aid in history_aids if aid in top_20_buys]))
330 | # USE "BUY2BUY" CO-VISITATION MATRIX: recall orders from unique_buys, correct!!
331 | aids3 = list(itertools.chain(*[top_20_buy2buy[aid] for aid in unique_buys if aid in top_20_buy2buy]))
332 | # RERANK CANDIDATES
333 |
334 | sim_aids_100 = [aid2 for aid2, cnt in Counter(aids2 + aids3).most_common(100)]
335 | type_2 = [2] * len(sim_aids_100)
336 | scores_2 = [cnt for aid2, cnt in Counter(aids2 + aids3).most_common(100)]
337 |
338 | # Rule-based recall (n items)
339 | # Recall 100 hot items, weighted by event type
340 | top_hot_items_100 = list(top_hot_items.keys())[:100]
341 | type_3 = [3] * (len(top_hot_items_100))
342 | score_3 = list(top_hot_items.values())[:100]
343 | # Recall the 100 most-ordered items
344 | top_orders_100 = list(top_orders.keys())[:100]
345 | type_4 = [4] * (len(top_orders_100))
346 | score_4 = list(top_orders.values())[:100]
347 | # Recall the 100 most-carted items
348 | top_carts_100 = list(top_carts.keys())[:100]
349 | type_5 = [5] * (len(top_carts_100))
350 | score_5 = list(top_carts.values())[:100]
351 | # The 100 hottest items of the last month
352 | top_hot_items_one_month_100 = list(top_hot_items_last_month.keys())[:100]
353 | type_6 = [6] * (len(top_hot_items_one_month_100))
354 | score_6 = list(top_hot_items_last_month.values())[:100]
355 |
356 | # Embedding-based recall (160 items)
357 | # Recall 80 items with last-week DeepWalk
358 | temp_counter = Counter()
359 | for i in history_aids:
360 | if f'item_{i}' in word2vec_last_week:
361 | for j in word2vec_last_week.similar_by_word(f'item_{i}', topn=20):
362 | temp_counter[j[0]] += j[1]
363 | item_emb_deepwalk_last_week_80 = [int(aid2.split('_')[1]) for aid2, cnt in temp_counter.most_common(80)]
364 | type_7 = [7] * len(item_emb_deepwalk_last_week_80)
365 | score_7 = [cnt for aid2, cnt in temp_counter.most_common(80)]
366 |
367 | # Recall 80 items with the full-history (last-month) DeepWalk
368 | temp_counter1 = Counter()
369 | for i in history_aids:
370 | for j in word2vec_last_month.similar_by_word(f'item_{i}', topn=20):
371 | temp_counter1[j[0]] += j[1]
372 | item_emb_last_month_80 = [int(aid2.split('_')[1]) for aid2, cnt in temp_counter1.most_common(80)]
373 | type_8 = [8] * len(item_emb_last_month_80)
374 | score_8 = [cnt for aid2, cnt in temp_counter1.most_common(80)]
375 | print(item_emb_deepwalk_last_week_80[0], score_7[0], item_emb_last_month_80[0], score_8[0])
376 |
377 | result = history_aids + sim_aids_100 + top_hot_items_100 + top_orders_100 + top_carts_100 + \
378 | top_hot_items_one_month_100 + item_emb_deepwalk_last_week_80 + item_emb_last_month_80
379 |
380 | type = type_1 + type_2 + type_3 + type_4 + type_5 + type_6 + type_7 + type_8
381 | score = scores_1 + scores_2 + score_3 + score_4 + score_5 + score_6 + score_7 + score_8
382 |
383 | info = [str(result[i]) + "#" + str(type[i]) + "#" + str(score[i]) for i in range(len(result))]
384 |
385 | return info
386 |
387 |
388 | print("Generating clicks recommendations!!!")
389 |
390 | pred_df_clicks = test_df.sort_values(["session", "ts"]).groupby(["session"]).apply(
391 | lambda x: suggest_clicks(x)
392 | )
393 | print("Generating carts recommendations!!!")
394 | pred_df_carts = test_df.sort_values(["session", "ts"]).groupby(["session"]).apply(
395 | lambda x: suggest_carts(x)
396 | )
397 | print("Generating buys recommendations!!!")
398 | pred_df_buys = test_df.sort_values(["session", "ts"]).groupby(["session"]).apply(
399 | lambda x: suggest_buys(x)
400 | )
401 |
402 | print("Assembling the recommendations!!!")
403 | clicks_pred_df = pd.DataFrame(pred_df_clicks.add_suffix("_clicks"), columns=["labels"]).reset_index()
404 | orders_pred_df = pd.DataFrame(pred_df_buys.add_suffix("_orders"), columns=["labels"]).reset_index()
405 | carts_pred_df = pd.DataFrame(pred_df_carts.add_suffix("_carts"), columns=["labels"]).reset_index()
406 |
407 | pred_df = pd.concat([clicks_pred_df, orders_pred_df, carts_pred_df])
408 | pred_df.columns = ["session_type", "labels"]
409 | pred_df["labels"] = pred_df.labels.apply(lambda x: " ".join(map(str, x)))
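# (added note) Each label token is encoded as "aid#recall_type#recall_score"; the recall evaluation
# below keeps only the aid part (split('#')[0]), while the recall type and score are presumably
# recovered later when the recall features are built (see features/recall_features.py).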
410 | pred_df.to_parquet(f'/home/niejianfei/otto/{stage}/candidates/candidates.pqt')
411 | print(pred_df)
412 |
413 |
414 | print("Computing recall!!!")
415 | score = 0
416 | recall_score = {}
417 | weights = {'clicks': 0.10, 'carts': 0.30, 'orders': 0.60}
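# (added note) This mirrors the competition weighting: score = 0.10 * R_clicks + 0.30 * R_carts + 0.60 * R_orders,
# with hits and the per-session ground truth both capped at 20 (the min/clip below).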
418 | for t in ['clicks', 'carts', 'orders']:
419 | sub = pred_df.loc[pred_df.session_type.str.contains(t)].copy()
420 | sub['session'] = sub.session_type.apply(lambda x: int(x.split('_')[0]))
421 | # sub.labels = sub.labels.apply(lambda x: [int(i) for i in x.split(' ')])
422 | sub['labels'] = sub['labels'].apply(lambda x: [int(i.split('#')[0]) for i in x.split(' ')])
423 | test_labels = pd.read_parquet(f'/home/niejianfei/otto/CV/preprocess/test_labels.parquet')
424 | test_labels = test_labels.loc[test_labels['type'] == t]
425 | test_labels = test_labels.merge(sub, how='left', on=['session'])
426 | test_labels['hits'] = test_labels.apply(
427 | lambda df: min(20, len(set(df.ground_truth).intersection(set(df.labels)))), axis=1)
428 | # Cap the ground-truth length at 20
429 | test_labels['gt_count'] = test_labels.ground_truth.str.len().clip(0, 20)
430 | recall = test_labels['hits'].sum() / test_labels['gt_count'].sum()
431 | recall_score[t] = recall
432 | score += weights[t] * recall
433 | print(f'{t} recall =', recall)
434 |
435 | print('=============')
436 | print('Overall Recall =', score)
437 | print('=============')
438 |
439 | # handcrafted recall, LB 0.577
440 | # Computing recall!!!
441 | # clicks recall = 0.5257653796508641
442 | # carts recall = 0.41246734503419014
443 | # orders recall = 0.6498501450672353
444 | # =============
445 | # Overall Recall = 0.5662268285156846
446 | # =============
447 |
448 | # Computing recall@170!!!
449 | # clicks recall = 0.6012911171187798
450 | # carts recall = 0.5011587525716328
451 | # orders recall = 0.7053682856531855
452 | # =============
453 | # Overall Recall = 0.6336977088752791
454 | # =============
455 |
--------------------------------------------------------------------------------
/features/__init__.py:
--------------------------------------------------------------------------------
1 | from .recall_features import recall_features
2 | from .user_item_features import user_item_features
3 | from .similarity_features import similarity_features
4 | from .co_visitation_features import co_visitation_features
--------------------------------------------------------------------------------
/features/co_visitation_features.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import pandas as pd
3 |
4 |
5 | def load_validate(path):
6 | type_transform = {"clicks": 0, "carts": 1, "orders": 2}
7 | dfs = []
8 | # Only load the training data
9 | for e, chunk_file in enumerate(glob.glob(path)):
10 | chunk = pd.read_parquet(chunk_file)
11 | chunk.ts = (chunk.ts / 1000).astype('int32')
12 | chunk['type'] = chunk['type'].map(type_transform).astype('int8')
13 | dfs.append(chunk)
14 | return pd.concat(dfs).reset_index(drop=True)
15 |
16 |
17 | def calculate_cf_u2i_similarity(string, dic):
18 | list = string.split(' ')
19 | if int(list[-1]) < 0:
20 | return '-10 -10'
21 | aid = list[0]
22 | score = []
23 | for i in list[1:]:
24 | if aid + ' ' + i in dic:
25 | temp_score = float(dic[aid + ' ' + i])
26 | else:
27 | temp_score = 0
28 | score.append(temp_score)
29 | return str(max(score)) + ' ' + str(sum(score))
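# (added note) calculate_cf_u2i_similarity expects "candidate_aid aid_1 ... aid_N" (the candidate followed
# by the user's time-sorted sequence) and returns "max sum" of the co-visitation weights between the
# candidate and each sequence aid; pairs absent from the dict count as 0, and '-10 -10' is the sentinel
# used when the sequence was filled with -1 (no matching events) by the merge in cf_u2i_similarity below.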
30 |
31 |
32 | # Compute the co-visitation weights between each candidate aid and the user's sequence (max and sum)
33 | def cf_u2i_similarity(stage, candidate_type, start, end):
34 | print('Loading data!!!')
35 | valid = load_validate(f'/home/niejianfei/otto/{stage}/data/test_parquet/*')
36 | print(valid)
37 | print('Filtering')
38 |
39 | valid1 = valid[valid['type'] != 0]
40 | print(valid1)
41 | print('Sorting')
42 | # Group by session_id and sort by time
43 | df = valid1.sort_values(by=["session", "ts"], ascending=True)
44 | print('Building lists')
45 | sentences_df = pd.DataFrame(df.groupby('session')['aid'].agg(list))
46 | sentences_df.columns = ['carts_and_orders']
47 | sentences_df["carts_and_orders_str"] = sentences_df.carts_and_orders.apply(lambda x: " ".join(map(str, x)))
48 | sentences_df = sentences_df.drop(columns='carts_and_orders')
49 | print(sentences_df)
50 |
51 | valid2 = valid
52 | print(valid2)
53 | print('Sorting')
54 | # Group by session_id and sort by time
55 | df1 = valid2.sort_values(by=["session", "ts"], ascending=True)
56 | print('Building lists')
57 | sentences_df1 = pd.DataFrame(df1.groupby('session')['aid'].agg(list))
58 | sentences_df1.columns = ['clicks']
59 | sentences_df1["clicks_str"] = sentences_df1.clicks.apply(lambda x: " ".join(map(str, x)))
60 | sentences_df1 = sentences_df1.drop(columns='clicks')
61 | print(sentences_df1)
62 |
63 | print('Loading dictionaries!!')
64 | print('click')
65 | VER = 6
66 | print(VER)
67 | dic_click = pd.read_parquet(f'/home/niejianfei/otto/{stage}/preprocess/top_20_clicks_v{VER}_0.pqt')
68 | DISK_PIECES = 4
69 | for k in range(1, DISK_PIECES):
70 | dic_click = dic_click.append(
71 | pd.read_parquet(f'/home/niejianfei/otto/{stage}/preprocess/top_20_clicks_v{VER}_{k}.pqt'))
72 |
73 | dic_click['aids1'] = dic_click['aid_x'].astype('str') + ' ' + dic_click['aid_y'].astype('str')
74 | dic_click['aids2'] = dic_click['aid_y'].astype('str') + ' ' + dic_click['aid_x'].astype('str')
75 |
76 | dic_click = dic_click.drop(columns=['aid_x', 'aid_y'])
77 | dic_click1 = dic_click[['aids1', 'wgt']]
78 | print(dic_click1)
79 | dic_click2 = dic_click[['aids2', 'wgt']]
80 | dic_click2.columns = ['aids1', 'wgt']
81 | print(dic_click2)
82 | dic_click = dic_click1.append(dic_click2)
83 | print(dic_click)
84 | dic_click.index = dic_click['aids1']
85 | print(dic_click)
86 | dic_click = dic_click['wgt'].to_dict()
87 | print('0 532042' in dic_click)
88 | print('532042 0' in dic_click)
89 | print('0 532022242' in dic_click)
90 |
91 | print('hot')
92 | dic_hot = pd.read_parquet(f'/home/niejianfei/otto/{stage}/preprocess/top_15_carts_orders_v{VER}_0.pqt')
93 | DISK_PIECES = 4
94 | for k in range(1, DISK_PIECES):
95 | dic_hot = dic_hot.append(
96 | pd.read_parquet(f'/home/niejianfei/otto/{stage}/preprocess/top_15_carts_orders_v{VER}_{k}.pqt'))
97 |
98 | dic_hot['aids1'] = dic_hot['aid_x'].astype('str') + ' ' + dic_hot['aid_y'].astype('str')
99 | dic_hot['aids2'] = dic_hot['aid_y'].astype('str') + ' ' + dic_hot['aid_x'].astype('str')
100 |
101 | dic_hot = dic_hot.drop(columns=['aid_x', 'aid_y'])
102 | dic_click1 = dic_hot[['aids1', 'wgt']]
103 | print(dic_click1)
104 | dic_click2 = dic_hot[['aids2', 'wgt']]
105 | dic_click2.columns = ['aids1', 'wgt']
106 | print(dic_click2)
107 | dic_hot = dic_click1.append(dic_click2)
108 | print(dic_hot)
109 | dic_hot.index = dic_hot['aids1']
110 | print(dic_hot)
111 | dic_hot = dic_hot['wgt'].to_dict()
112 | print('0 532042' in dic_hot)
113 | print('532042 0' in dic_hot)
114 | print('0 532022242' in dic_hot)
115 |
116 | print('buys')
117 | dic_buys = pd.read_parquet(f'/home/niejianfei/otto/{stage}/preprocess/top_15_buy2buy_v{VER}_0.pqt')
118 | print(dic_buys)
119 |
120 | dic_buys['aids1'] = dic_buys['aid_x'].astype('str') + ' ' + dic_buys['aid_y'].astype('str')
121 | dic_buys['aids2'] = dic_buys['aid_y'].astype('str') + ' ' + dic_buys['aid_x'].astype('str')
122 |
123 | dic_buys = dic_buys.drop(columns=['aid_x', 'aid_y'])
124 | dic_click1 = dic_buys[['aids1', 'wgt']]
125 | print(dic_click1)
126 | dic_click2 = dic_buys[['aids2', 'wgt']]
127 | dic_click2.columns = ['aids1', 'wgt']
128 | print(dic_click2)
129 | dic_buys = dic_click1.append(dic_click2)
130 | print(dic_buys)
131 | dic_buys.index = dic_buys['aids1']
132 | print(dic_buys)
133 | dic_buys = dic_buys['wgt'].to_dict()
134 | print('0 532042' in dic_buys)
135 | print('532042 0' in dic_buys)
136 | print('0 532022242' in dic_buys)
137 |
138 | for t in candidate_type:
139 | # Only load the training data
140 | print('Loading data')
141 | for i in range(start, end):
142 | path = f"/home/niejianfei/otto/{stage}/candidates/candidates_{t[0:-1]}_features_data/candidate_{t[0:-1]}_{i}.pqt"
143 | print(f'Chunk {i + 1}')
144 | chunk = pd.read_parquet(path)
145 | print(path)
146 | print(chunk.columns)
147 | chunk = chunk.astype("float32")
148 | chunk['session'] = chunk['session'].astype('int32')
149 | chunk['aid'] = chunk['aid'].astype('int32')
150 | print(chunk)
151 | print(chunk.columns)
152 |
153 | chunk = chunk.merge(sentences_df, left_on='session', right_index=True, how='left').fillna(value=-1)
154 | print(chunk)
155 | chunk['sim_list'] = chunk['aid'].astype('str') + ' ' + chunk['carts_and_orders_str'].astype('str')
156 | print('Computing similarity!!!')
157 | chunk['sim_score_str'] = chunk['sim_list'].apply(lambda x: calculate_cf_u2i_similarity(x, dic_buys))
158 | print(chunk[['carts_and_orders_str', 'sim_list', 'sim_score_str']])
159 | chunk['buys_CF_sim_max'] = chunk['sim_score_str'].apply(lambda x: float(x.split(' ')[0]))
160 | chunk['buys_CF_sim_sum'] = chunk['sim_score_str'].apply(lambda x: float(x.split(' ')[1]))
161 | print(chunk[(chunk['buys_CF_sim_max'] != -10) & (chunk['buys_CF_sim_max'] != 0)])
162 |
163 | chunk = chunk.merge(sentences_df1, left_on='session', right_index=True, how='left').fillna(value=-1)
164 | print(chunk)
165 | chunk['clicks_sim_list'] = chunk['aid'].astype('str') + ' ' + chunk['clicks_str'].astype('str')
166 | print('Computing click similarity!!!')
167 | chunk['clicks_sim_score_str'] = chunk['clicks_sim_list'].apply(
168 | lambda x: calculate_cf_u2i_similarity(x, dic_click))
169 | print(chunk[['clicks_str', 'clicks_sim_list', 'clicks_sim_score_str']])
170 | chunk['clicks_CF_sim_max'] = chunk['clicks_sim_score_str'].apply(lambda x: float(x.split(' ')[0]))
171 | chunk['clicks_CF_sim_sum'] = chunk['clicks_sim_score_str'].apply(lambda x: float(x.split(' ')[1]))
172 | print(chunk[(chunk['clicks_CF_sim_max'] != -10) & (chunk['clicks_CF_sim_max'] != 0)])
173 |
174 | print('Computing hot (carts-orders) similarity!!!')
175 | chunk['clicks_sim_score_str1'] = chunk['clicks_sim_list'].apply(
176 | lambda x: calculate_cf_u2i_similarity(x, dic_hot))
177 | print(chunk[['clicks_str', 'clicks_sim_list', 'clicks_sim_score_str1']])
178 | chunk['hot_CF_sim_max'] = chunk['clicks_sim_score_str1'].apply(lambda x: float(x.split(' ')[0]))
179 | chunk['hot_CF_sim_sum'] = chunk['clicks_sim_score_str1'].apply(lambda x: float(x.split(' ')[1]))
180 | print(chunk[(chunk['hot_CF_sim_max'] != -10) & (chunk['hot_CF_sim_max'] != 0)])
181 |
182 | chunk = chunk.drop(
183 | columns=['carts_and_orders_str', 'sim_list', 'sim_score_str', 'clicks_str', 'clicks_sim_list',
184 | 'clicks_sim_score_str', 'clicks_sim_score_str1'])
185 | print(chunk[['buys_CF_sim_max', 'buys_CF_sim_sum', 'hot_CF_sim_max', 'hot_CF_sim_sum', 'clicks_CF_sim_max',
186 | 'clicks_CF_sim_sum']])
187 | print(chunk.columns)
188 | print(chunk)
189 | chunk.to_parquet(path)
190 |
191 |
192 | def calculate_cf_u2i_similarity_tail(string, dic):
193 | list = string.split(' ')
194 | if int(list[-1]) < 0:
195 | return '-10 -10'
196 | aid = list[0]
197 | score = []
198 | for i in list[1:]:
199 | if aid + ' ' + i in dic:
200 | temp_score = float(dic[aid + ' ' + i])
201 | else:
202 | temp_score = 0
203 | score.append(temp_score)
204 | return str(sum(score) / len(score)) + ' ' + str(score[-1])
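# (added note) Same input contract as calculate_cf_u2i_similarity, but returns "mean last": the mean
# weight over the whole sequence and the weight against the most recent aid (the final element of the
# time-sorted sequence).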
205 |
206 |
207 | # Compute the co-visitation weights between each candidate aid and the user's sequence (mean and last)
208 | def cf_u2i_similarity_tail(stage, candidate_type, start, end):
209 | print('Loading data!!!')
210 | valid = load_validate(f'/home/niejianfei/otto/{stage}/data/test_parquet/*')
211 | print(valid)
212 | print('Filtering')
213 |
214 | valid1 = valid[valid['type'] != 0]
215 | print(valid1)
216 | print('Sorting')
217 | # Group by session_id and sort by time
218 | df = valid1.sort_values(by=["session", "ts"], ascending=True)
219 | print('Building lists')
220 | sentences_df = pd.DataFrame(df.groupby('session')['aid'].agg(list))
221 | sentences_df.columns = ['carts_and_orders']
222 | sentences_df["carts_and_orders_str"] = sentences_df.carts_and_orders.apply(lambda x: " ".join(map(str, x)))
223 | sentences_df = sentences_df.drop(columns='carts_and_orders')
224 | print(sentences_df)
225 |
226 | valid2 = valid
227 | print(valid2)
228 | print('Sorting')
229 | # Group by session_id and sort by time
230 | df1 = valid2.sort_values(by=["session", "ts"], ascending=True)
231 | print('Building lists')
232 | sentences_df1 = pd.DataFrame(df1.groupby('session')['aid'].agg(list))
233 | sentences_df1.columns = ['clicks']
234 | sentences_df1["clicks_str"] = sentences_df1.clicks.apply(lambda x: " ".join(map(str, x)))
235 | sentences_df1 = sentences_df1.drop(columns='clicks')
236 | print(sentences_df1)
237 |
238 | print('Loading dictionaries!!')
239 | print('click')
240 | VER = 6
241 | print(VER)
242 | dic_click = pd.read_parquet(f'/home/niejianfei/otto/{stage}/preprocess/top_20_clicks_v{VER}_0.pqt')
243 | DISK_PIECES = 4
244 | for k in range(1, DISK_PIECES):
245 | dic_click = dic_click.append(
246 | pd.read_parquet(f'/home/niejianfei/otto/{stage}/preprocess/top_20_clicks_v{VER}_{k}.pqt'))
247 |
248 | dic_click['aids1'] = dic_click['aid_x'].astype('str') + ' ' + dic_click['aid_y'].astype('str')
249 | dic_click['aids2'] = dic_click['aid_y'].astype('str') + ' ' + dic_click['aid_x'].astype('str')
250 |
251 | dic_click = dic_click.drop(columns=['aid_x', 'aid_y'])
252 | dic_click1 = dic_click[['aids1', 'wgt']]
253 | print(dic_click1)
254 | dic_click2 = dic_click[['aids2', 'wgt']]
255 | dic_click2.columns = ['aids1', 'wgt']
256 | print(dic_click2)
257 | dic_click = dic_click1.append(dic_click2)
258 | print(dic_click)
259 | dic_click.index = dic_click['aids1']
260 | print(dic_click)
261 | dic_click = dic_click['wgt'].to_dict()
262 | print('0 532042' in dic_click)
263 | print('532042 0' in dic_click)
264 | print('0 532022242' in dic_click)
265 |
266 | print('hot')
267 | dic_hot = pd.read_parquet(f'/home/niejianfei/otto/{stage}/preprocess/top_15_carts_orders_v{VER}_0.pqt')
268 | DISK_PIECES = 4
269 | for k in range(1, DISK_PIECES):
270 | dic_hot = dic_hot.append(
271 | pd.read_parquet(f'/home/niejianfei/otto/{stage}/preprocess/top_15_carts_orders_v{VER}_{k}.pqt'))
272 |
273 | dic_hot['aids1'] = dic_hot['aid_x'].astype('str') + ' ' + dic_hot['aid_y'].astype('str')
274 | dic_hot['aids2'] = dic_hot['aid_y'].astype('str') + ' ' + dic_hot['aid_x'].astype('str')
275 |
276 | dic_hot = dic_hot.drop(columns=['aid_x', 'aid_y'])
277 | dic_click1 = dic_hot[['aids1', 'wgt']]
278 | print(dic_click1)
279 | dic_click2 = dic_hot[['aids2', 'wgt']]
280 | dic_click2.columns = ['aids1', 'wgt']
281 | print(dic_click2)
282 | dic_hot = dic_click1.append(dic_click2)
283 | print(dic_hot)
284 | dic_hot.index = dic_hot['aids1']
285 | print(dic_hot)
286 | dic_hot = dic_hot['wgt'].to_dict()
287 | print('0 532042' in dic_hot)
288 | print('532042 0' in dic_hot)
289 | print('0 532022242' in dic_hot)
290 |
291 | print('buys')
292 | dic_buys = pd.read_parquet(f'/home/niejianfei/otto/{stage}/preprocess/top_15_buy2buy_v{VER}_0.pqt')
293 | print(dic_buys)
294 |
295 | dic_buys['aids1'] = dic_buys['aid_x'].astype('str') + ' ' + dic_buys['aid_y'].astype('str')
296 | dic_buys['aids2'] = dic_buys['aid_y'].astype('str') + ' ' + dic_buys['aid_x'].astype('str')
297 |
298 | dic_buys = dic_buys.drop(columns=['aid_x', 'aid_y'])
299 | dic_click1 = dic_buys[['aids1', 'wgt']]
300 | print(dic_click1)
301 | dic_click2 = dic_buys[['aids2', 'wgt']]
302 | dic_click2.columns = ['aids1', 'wgt']
303 | print(dic_click2)
304 | dic_buys = dic_click1.append(dic_click2)
305 | print(dic_buys)
306 | dic_buys.index = dic_buys['aids1']
307 | print(dic_buys)
308 | dic_buys = dic_buys['wgt'].to_dict()
309 | print('0 532042' in dic_buys)
310 | print('532042 0' in dic_buys)
311 | print('0 532022242' in dic_buys)
312 |
313 | for t in candidate_type:
314 | # Only load the training data
315 | print('Loading data')
316 | for i in range(start, end):
317 | path = f"/home/niejianfei/otto/{stage}/candidates/candidates_{t[0:-1]}_features_data/candidate_{t[0:-1]}_{i}.pqt"
318 | print(f'Chunk {i + 1}')
319 | chunk = pd.read_parquet(path)
320 | print(path)
321 | print(chunk.columns)
322 | chunk = chunk.astype("float32")
323 | chunk['session'] = chunk['session'].astype('int32')
324 | chunk['aid'] = chunk['aid'].astype('int32')
325 | print(chunk)
326 | print(chunk.columns)
327 |
328 | chunk = chunk.merge(sentences_df, left_on='session', right_index=True, how='left').fillna(value=-1)
329 | print(chunk)
330 | chunk['sim_list'] = chunk['aid'].astype('str') + ' ' + chunk['carts_and_orders_str'].astype('str')
331 | print('Computing similarity!!!')
332 | chunk['sim_score_str'] = chunk['sim_list'].apply(lambda x: calculate_cf_u2i_similarity_tail(x, dic_buys))
333 | print(chunk[['carts_and_orders_str', 'sim_list', 'sim_score_str']])
334 | chunk['buys_CF_sim_mean'] = chunk['sim_score_str'].apply(lambda x: float(x.split(' ')[0]))
335 | chunk['buys_CF_sim_-1'] = chunk['sim_score_str'].apply(lambda x: float(x.split(' ')[1]))
336 | print(chunk[(chunk['buys_CF_sim_mean'] != -10) & (chunk['buys_CF_sim_-1'] != 0)])
337 |
338 | chunk = chunk.merge(sentences_df1, left_on='session', right_index=True, how='left').fillna(value=-1)
339 | print(chunk)
340 | chunk['clicks_sim_list'] = chunk['aid'].astype('str') + ' ' + chunk['clicks_str'].astype('str')
341 | print('Computing click similarity!!!')
342 | chunk['clicks_sim_score_str'] = chunk['clicks_sim_list'].apply(
343 | lambda x: calculate_cf_u2i_similarity_tail(x, dic_click))
344 | print(chunk[['clicks_str', 'clicks_sim_list', 'clicks_sim_score_str']])
345 | chunk['clicks_CF_sim_mean'] = chunk['clicks_sim_score_str'].apply(lambda x: float(x.split(' ')[0]))
346 | chunk['clicks_CF_sim_-1'] = chunk['clicks_sim_score_str'].apply(lambda x: float(x.split(' ')[1]))
347 | print(chunk[(chunk['clicks_CF_sim_mean'] != -10) & (chunk['clicks_CF_sim_-1'] != 0)])
348 |
349 | print('Computing hot (carts-orders) similarity!!!')
350 | chunk['clicks_sim_score_str1'] = chunk['clicks_sim_list'].apply(
351 | lambda x: calculate_cf_u2i_similarity_tail(x, dic_hot))
352 | print(chunk[['clicks_str', 'clicks_sim_list', 'clicks_sim_score_str1']])
353 | chunk['hot_CF_sim_mean'] = chunk['clicks_sim_score_str1'].apply(lambda x: float(x.split(' ')[0]))
354 | chunk['hot_CF_sim_-1'] = chunk['clicks_sim_score_str1'].apply(lambda x: float(x.split(' ')[1]))
355 | print(chunk[(chunk['hot_CF_sim_mean'] != -10) & (chunk['hot_CF_sim_-1'] != 0)])
356 |
357 | chunk = chunk.drop(
358 | columns=['carts_and_orders_str', 'sim_list', 'sim_score_str', 'clicks_str', 'clicks_sim_list',
359 | 'clicks_sim_score_str', 'clicks_sim_score_str1'])
360 | print(chunk[['buys_CF_sim_mean', 'buys_CF_sim_-1', 'hot_CF_sim_mean', 'hot_CF_sim_-1', 'clicks_CF_sim_mean',
361 | 'clicks_CF_sim_-1']])
362 | print(chunk.columns)
363 | print(chunk)
364 | chunk.to_parquet(path)
365 |
366 |
367 | # Features from the three co-visitation matrices
368 | def co_visitation_features(stage, candidate_type, start, end):
369 | cf_u2i_similarity(stage, candidate_type, start, end)
370 | cf_u2i_similarity_tail(stage, candidate_type, start, end)
371 |
372 |
373 | if __name__ == '__main__':
374 | IS_TRAIN = True
375 | if IS_TRAIN:
376 | stage = 'CV'
377 | else:
378 | stage = 'LB'
379 | candidate_type = ['clicks', 'carts', 'orders']
380 | co_visitation_features(stage, candidate_type, 0, 8)
381 |
--------------------------------------------------------------------------------
/features/item_features.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import math
3 | import pandas as pd
4 | import numpy as np
5 |
6 |
7 | def load_data(path):
8 | type_transform = {"clicks": 0, "carts": 1, "orders": 2}
9 | dfs = []
10 | # Only load the training data
11 | for e, chunk_file in enumerate(glob.glob(path)):
12 | chunk = pd.read_parquet(chunk_file)
13 | chunk.ts = (chunk.ts / 1000).astype('int32')
14 | chunk['type'] = chunk['type'].map(type_transform).astype('int8')
15 | if not IS_TRAIN:
16 | # Drop the first week of data
17 | chunk = chunk[chunk['ts'] >= 1659909599]
18 | dfs.append(chunk)
19 | return pd.concat(dfs).reset_index(drop=True)
20 |
21 |
22 | def item_features(input_path, output_path):
23 | print("Loading data!!!")
24 | train = load_data(input_path)
25 |
26 | print("Building item features!!!")
27 | # Step 2: build item_features
28 | # item_features uses both train data and valid data
29 | print("Aggregating by aid (agg)!!!")
30 | item_features = train.groupby('aid').agg({'aid': 'count', 'session': 'nunique', 'type': ['mean', 'skew'],
31 | 'ts': ['min', 'max', 'skew']})
32 | # Number of times the aid appears, i.e. the number of events on the aid: defines a hot item; number of users who touched the aid: indirectly defines a hot item; mean type: how readily the item is bought
33 | item_features.columns = ['item_item_count', 'item_user_count', 'item_buy_ratio', 'item_buy_skew', 'item_min_ts',
34 | 'item_max_ts', 'item_skew_ts']
35 | print("Building ts skewness/kurtosis!!!")
36 | # Time skewness and kurtosis; pandas Series.kurt() uses Fisher's definition of kurtosis (normal kurtosis == 0.0)
37 | item_features['item_skew_ts'] = item_features['item_skew_ts'].fillna(value=0)
38 | item_features['item_kurt_ts'] = train.groupby('aid')['ts'].apply(lambda x: pd.DataFrame.kurt(x)).fillna(value=0)
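# (added note) Groups with too few events come back as NaN (kurtosis needs at least 4 observations,
# skewness at least 3), which is why both the skew and kurt columns are filled with 0 here.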
39 |
40 | print("Building type skewness/kurtosis!!!")
41 | # Type skewness and kurtosis; pandas Series.kurt() uses Fisher's definition of kurtosis (normal kurtosis == 0.0)
42 | item_features['item_buy_skew'] = item_features['item_buy_skew'].fillna(value=0)
43 | item_features['item_buy_kurt'] = train.groupby('aid')['type'].apply(lambda x: pd.DataFrame.kurt(x)).fillna(value=0)
44 | # Time span covered by the aid's event sequence (seconds here; converted to days below)
45 | print("Computing the ts time span!!!")
46 | item_features['item_long_ts'] = item_features['item_max_ts'] - item_features['item_min_ts']
47 | print(item_features)
48 | item_features = item_features.drop(columns=['item_min_ts', 'item_max_ts'])
49 |
50 | print("Computing the three aid ratio features!!!")
51 | # Average number of views of the aid per day
52 | item_features["item_avg_visit_per_day"] = item_features['item_item_count'] / (item_features['item_long_ts'] / (60 *
53 | 60 * 24)).clip(
54 | 1, 60).apply(lambda x: math.ceil(x))
55 | item_features["item_repeat_visit_num"] = item_features['item_item_count'] - item_features['item_user_count']
56 | # Average number of times each user views the item
57 | item_features["item_ave_visit_num"] = item_features['item_item_count'] / item_features['item_user_count']
58 | # Re-visit ratio of the aid
59 | item_features["item_re_visit_rate"] = item_features['item_repeat_visit_num'] / item_features['item_item_count']
60 |
61 | # The raw train ts is in milliseconds, not divided by 1000
62 | print("Loading data!!!")
63 | # The first three weeks of training data
64 |
65 | time = (train['ts'].max() - train['ts'].min()) / (60 * 60 * 24)
66 | print('days', time)
67 | # Keep only the last few weeks of data
68 | train['ts_minus'] = (train['ts'] - train['ts'].min()) / (60 * 60 * 24)
69 | # Last week
70 | print('last week')
71 | train1 = train[train['ts_minus'] >= 21].drop(columns='ts_minus')
72 | print(train1)
73 | item_item_count_last_week = train1.groupby('aid').agg({'aid': 'count', 'type': 'mean'})
74 | item_item_count_last_week.columns = ['item_item_count_last_week', 'item_buy_ratio_last_week']
75 | print(item_item_count_last_week)
76 | # Last two weeks
77 | print('last two weeks')
78 | train2 = train[train['ts_minus'] >= 14].drop(columns='ts_minus')
79 | print(train2)
80 | item_item_count_last_two_week = train2.groupby('aid').agg({'aid': 'count', 'type': 'mean'})
81 | item_item_count_last_two_week.columns = ['item_item_count_last_two_week', 'item_buy_ratio_last_two_week']
82 | print(item_item_count_last_two_week)
83 | # Last three weeks
84 | print('last three weeks')
85 | train3 = train[train['ts_minus'] >= 7].drop(columns='ts_minus')
86 | print(train3)
87 | item_item_count_last_three_week = train3.groupby('aid').agg({'aid': 'count', 'type': 'mean'})
88 | item_item_count_last_three_week.columns = ['item_item_count_last_three_week', 'item_buy_ratio_last_three_week']
89 | print(item_item_count_last_three_week)
90 |
91 | item_features = item_features.merge(item_item_count_last_week, left_index=True, right_index=True,
92 | how='left').fillna(value=-1000)
93 | item_features = item_features.merge(item_item_count_last_two_week, left_index=True, right_index=True,
94 | how='left').fillna(value=-1000)
95 | item_features = item_features.merge(item_item_count_last_three_week, left_index=True, right_index=True,
96 | how='left').fillna(value=-1000)
97 |
98 | print(item_features)
99 | print(item_features.columns)
100 |
101 | # Fix the storage dtype
102 | item_features = item_features.astype('float32')
103 | print("Saving features to file!!!")
104 | item_features.to_parquet(output_path)
105 |
106 |
107 | def add_item_features(input_path1, input_path2, output_path):
108 | # item features
109 | # item_feature: click-to-order rate: total item_item count / cart/order count
110 | # click-to-cart rate
111 | # cart-to-order rate
112 | # click share (item clicks as a share of all clicks)
113 | # cart share
114 | # order share
115 | # last_week vs last_month trend (slope change)
116 | # re-order rate (concentration)
117 | # re-cart rate
118 | # re-click rate: item_item - item_user
119 | print("Loading data!!!")
120 | train = load_data(input_path1)
121 |
122 | train_click = train[train['type'] == 0]
123 | train_cart = train[train['type'] == 1]
124 | train_order = train[train['type'] == 2]
125 |
126 | print("Building item features!!!")
127 | # Last month
128 | print("Aggregating by aid (agg)!!!")
129 | click_item_features = train_click.groupby('aid').agg({'aid': 'count', 'session': 'nunique'})
130 | # Number of events on the aid: defines a hot item; number of users who touched the aid: indirectly defines a hot item
131 | click_item_features.columns = ['click_item_item_count', 'click_item_user_count']
132 |
133 | cart_item_features = train_cart.groupby('aid').agg({'aid': 'count', 'session': 'nunique'})
134 | # Number of events on the aid; number of users who touched the aid
135 | cart_item_features.columns = ['cart_item_item_count', 'cart_item_user_count']
136 |
137 | order_item_features = train_order.groupby('aid').agg({'aid': 'count', 'session': 'nunique'})
138 | # Number of events on the aid; number of users who touched the aid
139 | order_item_features.columns = ['order_item_item_count', 'order_item_user_count']
140 |
141 | click_item_features = click_item_features.merge(cart_item_features, left_index=True, right_index=True,
142 | how='left').fillna(value=0)
143 | click_item_features = click_item_features.merge(order_item_features, left_index=True, right_index=True,
144 | how='left').fillna(value=0)
145 |
146 | # click_item_item_count, click_item_user_count
147 | # Click-to-buy rates * 3
148 | click_item_features['click_cart_rate'] = click_item_features['cart_item_item_count'] / click_item_features[
149 | 'click_item_item_count']
150 | click_item_features['click_order_rate'] = click_item_features['order_item_item_count'] / click_item_features[
151 | 'click_item_item_count']
152 | click_item_features['cart_order_rate'] = (
153 | click_item_features['order_item_item_count'] / click_item_features['cart_item_item_count'])
154 | print(click_item_features['cart_order_rate'].max())
155 | print(click_item_features['cart_order_rate'].min())
156 | features = click_item_features[
157 | (click_item_features['order_item_item_count'] == 0) & (click_item_features['cart_item_item_count'] == 0)]
158 | print(features[['cart_item_item_count', 'order_item_item_count', 'cart_order_rate']])
159 | # Click share
160 | click_item_features['click_percentage'] = click_item_features['click_item_item_count'] / click_item_features[
161 | 'click_item_item_count'].sum()
162 | click_item_features['cart_percentage'] = click_item_features['cart_item_item_count'] / click_item_features[
163 | 'cart_item_item_count'].sum()
164 | click_item_features['order_percentage'] = click_item_features['order_item_item_count'] / click_item_features[
165 | 'order_item_item_count'].sum()
166 | # Repeat rates
167 | click_item_features['re_click_rate'] = (click_item_features['click_item_item_count'] - click_item_features[
168 | 'click_item_user_count']) / click_item_features['click_item_item_count']
169 | click_item_features['re_cart_rate'] = (click_item_features['cart_item_item_count'] - click_item_features[
170 | 'cart_item_user_count']) / click_item_features['cart_item_item_count']
171 | click_item_features['re_order_rate'] = (click_item_features['order_item_item_count'] - click_item_features[
172 | 'order_item_user_count']) / click_item_features['order_item_item_count']
173 |
174 | click_item_features = click_item_features.replace(np.inf, 100)
175 |
176 | print("Loading valid data!!!")
177 | valid = load_data(input_path2)
178 |
179 | valid_click = valid[valid['type'] == 0]
180 | valid_cart = valid[valid['type'] == 1]
181 | valid_order = valid[valid['type'] == 2]
182 |
183 | print("Building item features!!!")
184 | # Last month
185 | print("Aggregating by aid (agg)!!!")
186 | valid_click_item_features = valid_click.groupby('aid').agg({'aid': 'count', 'session': 'nunique'})
187 | # Number of events on the aid: defines a hot item; number of users who touched the aid: indirectly defines a hot item
188 | valid_click_item_features.columns = ['click_item_item_count1', 'click_item_user_count1']
189 |
190 | valid_cart_item_features = valid_cart.groupby('aid').agg({'aid': 'count', 'session': 'nunique'})
191 | # Number of events on the aid; number of users who touched the aid
192 | valid_cart_item_features.columns = ['cart_item_item_count1', 'cart_item_user_count1']
193 |
194 | valid_order_item_features = valid_order.groupby('aid').agg({'aid': 'count', 'session': 'nunique'})
195 | # Number of events on the aid; number of users who touched the aid
196 | valid_order_item_features.columns = ['order_item_item_count1', 'order_item_user_count1']
197 |
198 | valid_click_item_features = valid_click_item_features.merge(valid_cart_item_features, left_index=True,
199 | right_index=True,
200 | how='left').fillna(value=0)
201 | valid_click_item_features = valid_click_item_features.merge(valid_order_item_features, left_index=True,
202 | right_index=True,
203 | how='left').fillna(value=0)
204 | # click_item_item_count, click_item_user_count
205 | # Click-to-buy rates * 3
206 | valid_click_item_features['click_cart_rate1'] = valid_click_item_features['cart_item_item_count1'] / \
207 | valid_click_item_features[
208 | 'click_item_item_count1']
209 | valid_click_item_features['click_order_rate1'] = valid_click_item_features['order_item_item_count1'] / \
210 | valid_click_item_features[
211 | 'click_item_item_count1']
212 | valid_click_item_features['cart_order_rate1'] = valid_click_item_features['order_item_item_count1'] / \
213 | valid_click_item_features[
214 | 'cart_item_item_count1']
215 | # Click share
216 | valid_click_item_features['click_percentage1'] = valid_click_item_features['click_item_item_count1'] / \
217 | valid_click_item_features[
218 | 'click_item_item_count1'].sum()
219 | valid_click_item_features['cart_percentage1'] = valid_click_item_features['cart_item_item_count1'] / \
220 | valid_click_item_features[
221 | 'cart_item_item_count1'].sum()
222 | valid_click_item_features['order_percentage1'] = valid_click_item_features['order_item_item_count1'] / \
223 | valid_click_item_features[
224 | 'order_item_item_count1'].sum()
225 | # Repeat rates
226 | valid_click_item_features['re_click_rate1'] = (valid_click_item_features['click_item_item_count1'] -
227 | valid_click_item_features[
228 | 'click_item_user_count1']) / valid_click_item_features[
229 | 'click_item_item_count1']
230 | valid_click_item_features['re_cart_rate1'] = (valid_click_item_features['cart_item_item_count1'] -
231 | valid_click_item_features[
232 | 'cart_item_user_count1']) / valid_click_item_features[
233 | 'cart_item_item_count1']
234 | valid_click_item_features['re_order_rate1'] = (valid_click_item_features['order_item_item_count1'] -
235 | valid_click_item_features[
236 | 'order_item_user_count1']) / valid_click_item_features[
237 | 'order_item_item_count1']
238 | valid_click_item_features = valid_click_item_features.replace(np.inf, 100)
239 |
240 | # Missing values are filled with a negative sentinel (-10); after the subtraction the trend is still negative and is clipped at -10
241 | click_item_features = click_item_features.merge(valid_click_item_features, left_index=True, right_index=True,
242 | how='left').fillna(value=-10)
243 | # Click-to-cart rate trends
244 | click_item_features['click_cart_rate_trend'] = (
245 | click_item_features['click_cart_rate1'] - click_item_features['click_cart_rate']).clip(-10)
246 | click_item_features['click_order_rate_trend'] = (
247 | click_item_features['click_order_rate1'] - click_item_features['click_order_rate']).clip(-10)
248 | click_item_features['cart_order_rate_trend'] = (
249 | click_item_features['cart_order_rate1'] - click_item_features['cart_order_rate']).clip(-10)
250 | # Share trends
251 | click_item_features['click_percentage_trend'] = (
252 | click_item_features['click_percentage1'] - click_item_features['click_percentage']).clip(-10)
253 | click_item_features['cart_percentage_trend'] = (
254 | click_item_features['cart_percentage1'] - click_item_features['cart_percentage']).clip(-10)
255 | click_item_features['order_percentage_trend'] = (
256 | click_item_features['order_percentage1'] - click_item_features['order_percentage']).clip(-10)
257 | # Repeat-rate trends
258 | click_item_features['re_click_rate_trend'] = (
259 | click_item_features['re_click_rate1'] - click_item_features['re_click_rate']).clip(-10)
260 | click_item_features['re_cart_rate_trend'] = (
261 | click_item_features['re_cart_rate1'] - click_item_features['re_cart_rate']).clip(-10)
262 | click_item_features['re_order_rate_trend'] = (
263 | click_item_features['re_order_rate1'] - click_item_features['re_order_rate']).clip(-10)
264 |
265 | print(click_item_features)
266 | print(click_item_features.describe())
267 |
268 | print("Saving features to file!!!")
269 | click_item_features.to_parquet(output_path)
270 |
271 |
272 | def trans_time_span_item_features(input_path, output_path1, output_path2, output_path3):
273 | train = load_data(input_path)
274 |
275 | train_clicks = train[train['type'] == 0].drop(columns='type')
276 | train_clicks = train_clicks.rename(columns={'ts': 'ts_click'})
277 | train_carts = train[train['type'] == 1].drop(columns='type')
278 | train_carts = train_carts.rename(columns={'ts': 'ts_cart'})
279 | train_orders = train[train['type'] == 2].drop(columns='type')
280 | train_orders = train_orders.rename(columns={'ts': 'ts_order'})
281 |
282 | print('click_cart_span')
283 | click_cart_span = train_clicks.merge(train_carts, on=['session', 'aid'], how='inner')
284 | print(click_cart_span)
285 | click_cart_span['min'] = click_cart_span['ts_click'] - click_cart_span['ts_cart']
286 | click_cart_span = click_cart_span[click_cart_span['min'] <= 0].drop(columns='min')
287 | print(click_cart_span)
288 | click_cart_span_feature = click_cart_span.groupby(['session', 'aid']).agg({'ts_click': 'min', 'ts_cart': 'min'})
289 | click_cart_span_feature.columns = ['ts_click_min', 'ts_cart_min']
290 | print(click_cart_span_feature)
291 | click_cart_span_feature['click_cart_span'] = click_cart_span_feature['ts_cart_min'] - click_cart_span_feature[
292 | 'ts_click_min']
293 | print(click_cart_span_feature)
294 | click_cart_span_feature['aids'] = click_cart_span_feature.index.get_level_values('aid')
295 | print(click_cart_span_feature)
296 | print(click_cart_span_feature.index.get_level_values('aid')[:10])
297 | click_cart_span_feature = click_cart_span_feature.groupby('aids').agg({'aids': 'count', 'click_cart_span': 'mean'})
298 | click_cart_span_feature.columns = ['trans_click_cart_count', 'trans_click_cart_span_avg']
299 | print(click_cart_span_feature.describe())
300 | print(click_cart_span_feature)
301 | click_cart_span_feature.to_parquet(output_path1)
302 |
303 | print('click_order_span')
304 | click_order_span = train_clicks.merge(train_orders, on=['session', 'aid'], how='inner')
305 | print(click_order_span)
306 | click_order_span['min'] = click_order_span['ts_click'] - click_order_span['ts_order']
307 | click_order_span = click_order_span[click_order_span['min'] <= 0].drop(columns='min')
308 | print(click_order_span)
309 | click_order_span_feature = click_order_span.groupby(['session', 'aid']).agg({'ts_click': 'min', 'ts_order': 'min'})
310 | click_order_span_feature.columns = ['ts_click_min', 'ts_order_min']
311 | print(click_order_span_feature)
312 | click_order_span_feature['click_order_span'] = click_order_span_feature['ts_order_min'] - click_order_span_feature[
313 | 'ts_click_min']
314 | print(click_order_span_feature)
315 | click_order_span_feature['aids'] = click_order_span_feature.index.get_level_values('aid')
316 | print(click_order_span_feature)
317 | print(click_order_span_feature.index.get_level_values('aid')[:10])
318 | click_order_span_feature = click_order_span_feature.groupby('aids').agg(
319 | {'aids': 'count', 'click_order_span': 'mean'})
320 | click_order_span_feature.columns = ['trans_click_order_count', 'trans_click_order_span_avg']
321 | print(click_order_span_feature.describe())
322 | print(click_order_span_feature)
323 | click_order_span_feature.to_parquet(output_path2)
324 |
325 | print('cart_order_span')
326 | carts_order_span = train_carts.merge(train_orders, on=['session', 'aid'], how='inner')
327 | print(carts_order_span)
328 | carts_order_span['min'] = carts_order_span['ts_cart'] - carts_order_span['ts_order']
329 | carts_order_span = carts_order_span[carts_order_span['min'] <= 0].drop(columns='min')
330 | print(carts_order_span)
331 | cart_order_span_feature = carts_order_span.groupby(['session', 'aid']).agg({'ts_cart': 'min', 'ts_order': 'min'})
332 | cart_order_span_feature.columns = ['ts_cart_min', 'ts_order_min']
333 | print(cart_order_span_feature)
334 | cart_order_span_feature['cart_order_span'] = cart_order_span_feature['ts_order_min'] - cart_order_span_feature[
335 | 'ts_cart_min']
336 | print(cart_order_span_feature)
337 | cart_order_span_feature['aids'] = cart_order_span_feature.index.get_level_values('aid')
338 | print(cart_order_span_feature)
339 | print(cart_order_span_feature.index.get_level_values('aid')[:10])
340 | cart_order_span_feature = cart_order_span_feature.groupby('aids').agg({'aids': 'count', 'cart_order_span': 'mean'})
341 | cart_order_span_feature.columns = ['trans_cart_order_count', 'trans_cart_order_span_avg']
342 | print(cart_order_span_feature.describe())
343 | print(cart_order_span_feature)
344 | cart_order_span_feature.to_parquet(output_path3)
345 |
346 |
347 | if __name__ == '__main__':
348 | IS_TRAIN = True
349 | if IS_TRAIN:
350 | stage = 'CV'
351 | else:
352 | stage = 'LB'
353 | input_path = f'/home/niejianfei/otto/{stage}/data/*_parquet/*'
354 | input_path2 = f'/home/niejianfei/otto/{stage}/data/test_parquet/*'
355 | output_path = f'/home/niejianfei/otto/{stage}/preprocess/item_features.pqt'
356 | output_path1 = f'/home/niejianfei/otto/{stage}/preprocess/add_item_features.pqt'
357 | item_features(input_path, output_path)
358 | add_item_features(input_path, input_path2, output_path1)
359 |
360 | input_path3 = f'/home/niejianfei/otto/{stage}/data/train_parquet/*'
361 | output_path2 = f'/home/niejianfei/otto/{stage}/preprocess/click_cart_item_features.pqt'
362 | output_path3 = f'/home/niejianfei/otto/{stage}/preprocess/click_order_item_features.pqt'
363 | output_path4 = f'/home/niejianfei/otto/{stage}/preprocess/cart_order_item_features.pqt'
364 | trans_time_span_item_features(input_path3, output_path2, output_path3, output_path4)
365 |
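Note on trans_time_span_item_features above: for every (session, aid) pair where a click precedes a cart (or order), it takes the earliest click and the earliest cart/order and then averages the resulting span per aid. A minimal sketch of the click->cart case on a toy frame, assuming the same (session, aid, ts, type) layout used throughout these scripts:

    import pandas as pd

    events = pd.DataFrame({'session': [1, 1, 1], 'aid': [42, 42, 42],
                           'ts': [100, 160, 400], 'type': [0, 1, 2]})  # click -> cart -> order
    clicks = events[events.type == 0].drop(columns='type').rename(columns={'ts': 'ts_click'})
    carts = events[events.type == 1].drop(columns='type').rename(columns={'ts': 'ts_cart'})
    pairs = clicks.merge(carts, on=['session', 'aid'], how='inner')
    pairs = pairs[pairs.ts_click <= pairs.ts_cart]                 # keep click-before-cart pairs only
    spans = pairs.groupby(['session', 'aid']).agg({'ts_click': 'min', 'ts_cart': 'min'})
    spans['click_cart_span'] = spans.ts_cart - spans.ts_click      # 60 for this toy session
    per_aid = spans.groupby(level='aid')['click_cart_span'].agg(['count', 'mean'])
    print(per_aid)                                                 # analogue of trans_click_cart_count / _span_avg

Also note that the *_trend columns near the top of the file are differences between the two versions of each rate (the *1 columns computed earlier in the file minus the base columns), clipped to a lower bound of -10 via clip(-10).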
--------------------------------------------------------------------------------
/features/recall_features.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 |
3 |
4 | # 导入candidates数据
5 | def recall_features(stage, candidate_type):
6 | type_transform = {"clicks": 0, "carts": 1, "orders": 2}
7 | print("开始导入数据!!!")
8 | candidates = pd.read_parquet(f'/home/niejianfei/otto/{stage}/candidates/candidates.pqt')
9 | print('candidate的长度为', len(candidates))
10 |
11 | print("开始处理candidates数据!!!")
12 | # 标记类型
13 | print("转换!!!")
14 | candidates["type"] = candidates.session_type.apply(lambda x: x.split("_")[1])
15 | candidates["type"] = candidates["type"].map(type_transform).astype('int8')
16 | for t in candidate_type:
17 | print(f"只要{t}!!!")
18 |         candidates = all_candidates[all_candidates['type'] == type_transform[t]].copy()
19 | print("推荐长度:", len(candidates))
20 | # 裂开 session_type, labels, type
21 | print("裂开!!!")
22 | candidates["labels"] = candidates["labels"].apply(lambda x: x.split(" "))
23 | candidates = candidates.explode("labels")
24 | # 开始计算类型 session_type, labels, type, candidate_type
25 | print("candidate_type")
26 | candidates["candidate_type"] = candidates["labels"].apply(lambda x: x.split('#')[1]).astype('float32').astype(
27 | 'int32')
28 | # 开始计算得分 session_type, labels, type, candidate_type, candidate_type_scores
29 | print("candidate_type_scores")
30 | candidates["candidate_type_scores"] = candidates["labels"].apply(lambda x: x.split('#')[2]).astype('float32')
31 | # 开始标签 session_type, labels, type, candidate_type, candidate_type_scores
32 | print("labels")
33 | candidates["labels"] = candidates["labels"].apply(lambda x: x.split('#')[0]).astype('int32')
34 | candidates["session_type"] = candidates.session_type.apply(lambda x: x.split("_")[0]).astype("int32")
35 | candidates.rename(columns={'session_type': 'session', 'labels': 'aid'}, inplace=True)
36 | print(candidates)
37 |
38 | # 'session', 'aid', 'type', 'candidate_type', 'candidate_type_scores'
39 | # history_aid, sim_aid, top_hot_aid, top_orders_aid
40 | candidate_type_dic = {1: 'history_aid', 2: 'sim_aid', 3: 'top_hot_aid', 4: 'top_orders_aid',
41 | 5: 'top_carts_aid', 6: 'top_hot_aid_last_month', 7: 'deepwalk', 8: 'word2vec'}
42 | candidate_type_scores_dic = {1: 'history_aid_score', 2: 'sim_aid_score', 3: 'top_hot_aid_score',
43 | 4: 'top_orders_aid_score', 5: 'top_carts_aid_score',
44 | 6: 'top_hot_aid_last_month_score', 7: 'deepwalk_score',
45 | 8: 'word2vec_score'}
46 | print('开始merge!!!')
47 |     candidates1 = candidates[candidates['candidate_type'] == 1].copy()
48 | candidates1.columns = ['session', 'aid', 'type', 'history_aid', 'history_aid_score']
49 | candidates1 = candidates1.sort_values(['session', 'history_aid_score'], ascending=[True, False])
50 | candidates1['history_aid_rank'] = candidates1.groupby('session')['aid'].cumcount()
51 | print(candidates1)
52 | for i in range(7):
53 |         temp_df = candidates[candidates['candidate_type'] == i + 2].copy()
54 |         temp_df['candidate_type'] = 1
55 | temp_df.rename(columns={'candidate_type': f'{candidate_type_dic[i + 2]}',
56 | 'candidate_type_scores': f'{candidate_type_scores_dic[i + 2]}'}, inplace=True)
57 | temp_df = temp_df.sort_values(['session', f'{candidate_type_scores_dic[i + 2]}'],
58 | ascending=[True, False])
59 | temp_df[f'{candidate_type_dic[i + 2]}_rank'] = temp_df.groupby('session')['aid'].cumcount()
60 | print(temp_df)
61 | candidates1 = candidates1.merge(temp_df, on=['session', 'aid', 'type'], how='outer').fillna(value=-1)
62 | print(candidates1)
63 | candidates1.to_parquet(f'/home/niejianfei/otto/{stage}/candidates/candidates_{t}.pqt')
64 | print('保存完毕')
65 |
66 |
67 | if __name__ == '__main__':
68 | IS_TRAIN = True
69 | candidate_type = ['clicks', 'carts', 'orders']
70 | if IS_TRAIN:
71 | stage = 'CV'
72 | else:
73 | stage = 'LB'
74 |
75 | recall_features(stage, candidate_type)
76 |
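Note on the labels encoding assumed by recall_features above: each space-separated token in the candidates file packs the recalled item id, the recall-strategy id and the recall score as aid#candidate_type#score, and the apply/split chain above simply unpacks those three fields. A minimal sketch of the same parsing with vectorized string ops and toy values (the session id, aids and scores below are made up, not taken from the real candidate files):

    import pandas as pd

    row = pd.DataFrame({'session_type': ['12899779_clicks'],
                        'labels': ['59625#1#3.0 1142000#7#0.57']})   # toy values
    row['labels'] = row['labels'].str.split(' ')
    row = row.explode('labels')
    row['aid'] = row['labels'].str.split('#').str[0].astype('int32')
    row['candidate_type'] = row['labels'].str.split('#').str[1].astype('float32').astype('int32')
    row['candidate_type_scores'] = row['labels'].str.split('#').str[2].astype('float32')
    row['session'] = row['session_type'].str.split('_').str[0].astype('int32')
    print(row[['session', 'aid', 'candidate_type', 'candidate_type_scores']])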
--------------------------------------------------------------------------------
/features/similarity_features.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import pickle
3 | import gensim
4 | import pandas as pd
5 | import numpy as np
6 | from sklearn.metrics.pairwise import cosine_similarity
7 |
8 |
9 | def load_validate(path):
10 | type_transform = {"clicks": 0, "carts": 1, "orders": 2}
11 | dfs = []
12 | # 只导入训练数据
13 | for e, chunk_file in enumerate(glob.glob(path)):
14 | chunk = pd.read_parquet(chunk_file)
15 | chunk.ts = (chunk.ts / 1000).astype('int32')
16 | chunk['type'] = chunk['type'].map(type_transform).astype('int8')
17 | dfs.append(chunk)
18 | return pd.concat(dfs).reset_index(drop=True)
19 |
20 |
21 | def calculate_deepwalk_similarity(string, model):
22 | list = string.split(' ')
23 | if int(list[-1]) < 0:
24 | return '-10 -10'
25 | sim = []
26 | aid = 'item_' + list[0]
27 | for i in list[1:]:
28 | simm = model.similarity(f'item_{i}', aid)
29 | sim.append(simm)
30 | sim_mean = sum(sim) / len(sim)
31 | sim_max = max(sim)
32 | return str(sim_mean) + ' ' + str(sim_max)
33 |
34 |
35 | # deepwalk,i2i相似度buys和clicks相似度的mean和max
36 | def deepwalk_i2i_similarity1(stage, candidate_type, start, end):
37 | print('开始读取数据!!!')
38 | valid = load_validate(f'/home/niejianfei/otto/{stage}/data/test_parquet/*')
39 | print(valid)
40 | print('开始筛选buys')
41 | valid1 = valid[valid['type'] != 0]
42 | print(valid1)
43 | print('开始排序')
44 | # 分别对session_id聚合,对时间进行排序
45 | df = valid1.sort_values(by=["session", "ts"], ascending=True)
46 | print('生成list')
47 | sentences_df = pd.DataFrame(df.groupby('session')['aid'].agg(list))
48 | sentences_df.columns = ['carts_and_orders']
49 | sentences_df["carts_and_orders_str"] = sentences_df.carts_and_orders.apply(lambda x: " ".join(map(str, x)))
50 | sentences_df = sentences_df.drop(columns='carts_and_orders')
51 | print(sentences_df)
52 | print('开始筛选clicks')
53 | valid2 = valid[valid['type'] == 0]
54 | print(valid2)
55 | print('开始排序')
56 | # 分别对session_id聚合,对时间进行排序
57 | df1 = valid2.sort_values(by=["session", "ts"], ascending=True)
58 | print('生成list')
59 | sentences_df1 = pd.DataFrame(df1.groupby('session')['aid'].agg(list))
60 | sentences_df1.columns = ['clicks']
61 | sentences_df1["clicks_str"] = sentences_df1.clicks.apply(lambda x: " ".join(map(str, x)))
62 | sentences_df1 = sentences_df1.drop(columns='clicks')
63 | print(sentences_df1)
64 |
65 | print('开始读取词向量!!')
66 | word2vec_last_month = gensim.models.KeyedVectors.load_word2vec_format(
67 | f'/home/niejianfei/otto/{stage}/preprocess/deepwalk_last_month.w2v',
68 | binary=False)
69 | for t in candidate_type:
70 | # 只导入训练数据
71 | print('开始导入数据')
72 | for i in range(start, end):
73 | path = f"/home/niejianfei/otto/{stage}/candidates/candidates_{t[0:-1]}_features_data/candidate_{t[0:-1]}_{i}.pqt"
74 | print(f'第{i + 1}块数据')
75 | chunk = pd.read_parquet(path)
76 | print(path)
77 | print(chunk.columns)
78 | chunk = chunk.astype("float32")
79 | chunk['session'] = chunk['session'].astype('int32')
80 | chunk['aid'] = chunk['aid'].astype('int32')
81 | print(chunk)
82 |
83 | chunk = chunk.merge(sentences_df, left_on='session', right_index=True, how='left').fillna(value=-1)
84 | print(chunk)
85 | chunk['sim_list'] = chunk['aid'].astype('str') + ' ' + chunk['carts_and_orders_str'].astype('str')
86 | print('开始计算相似度!!!')
87 | chunk['sim_score_str'] = chunk['sim_list'].apply(
88 | lambda x: calculate_deepwalk_similarity(x, word2vec_last_month))
89 | print(chunk[['carts_and_orders_str', 'sim_list', 'sim_score_str']])
90 | chunk['buys_sim_mean'] = chunk['sim_score_str'].apply(lambda x: float(x.split(' ')[0]))
91 | chunk['buys_sim_max'] = chunk['sim_score_str'].apply(lambda x: float(x.split(' ')[1]))
92 | print(chunk[chunk['buys_sim_mean'] != -10])
93 |
94 | chunk = chunk.merge(sentences_df1, left_on='session', right_index=True, how='left').fillna(value=-1)
95 | print(chunk)
96 | chunk['clicks_sim_list'] = chunk['aid'].astype('str') + ' ' + chunk['clicks_str'].astype('str')
97 | print('click开始计算相似度!!!')
98 | chunk['clicks_sim_score_str'] = chunk['clicks_sim_list'].apply(
99 | lambda x: calculate_deepwalk_similarity(x, word2vec_last_month))
100 | print(chunk[['clicks_str', 'clicks_sim_list', 'clicks_sim_score_str']])
101 | chunk['clicks_sim_mean'] = chunk['clicks_sim_score_str'].apply(lambda x: float(x.split(' ')[0]))
102 | chunk['clicks_sim_max'] = chunk['clicks_sim_score_str'].apply(lambda x: float(x.split(' ')[1]))
103 | print(chunk[chunk['clicks_sim_mean'] != -10])
104 |
105 | chunk = chunk.drop(
106 | columns=['carts_and_orders_str', 'sim_list', 'sim_score_str', 'clicks_str', 'clicks_sim_list',
107 | 'clicks_sim_score_str'])
108 | print(chunk[['buys_sim_mean', 'buys_sim_max', 'clicks_sim_mean', 'clicks_sim_max']])
109 | print(chunk)
110 | chunk.to_parquet(path)
111 |
112 |
113 | # deepwalk,i2i相似度orders和carts相似度的mean和max
114 | def deepwalk_i2i_similarity2(stage, candidate_type, start, end):
115 | print('开始读取数据!!!')
116 | valid = load_validate(f'/home/niejianfei/otto/{stage}/data/test_parquet/*')
117 | print(valid)
118 |     print('开始筛选orders')
119 | valid1 = valid[valid['type'] == 2]
120 | print(valid1)
121 | print('开始排序')
122 | # 分别对session_id聚合,对时间进行排序
123 | df = valid1.sort_values(by=["session", "ts"], ascending=True)
124 | print('生成list')
125 | sentences_df = pd.DataFrame(df.groupby('session')['aid'].agg(list))
126 | sentences_df.columns = ['carts_and_orders']
127 | sentences_df["carts_and_orders_str"] = sentences_df.carts_and_orders.apply(lambda x: " ".join(map(str, x)))
128 | sentences_df = sentences_df.drop(columns='carts_and_orders')
129 | print(sentences_df)
130 |     print('开始筛选carts')
131 | valid2 = valid[valid['type'] == 1]
132 | print(valid2)
133 | print('开始排序')
134 | # 分别对session_id聚合,对时间进行排序
135 | df1 = valid2.sort_values(by=["session", "ts"], ascending=True)
136 | print('生成list')
137 | sentences_df1 = pd.DataFrame(df1.groupby('session')['aid'].agg(list))
138 | sentences_df1.columns = ['clicks']
139 | sentences_df1["clicks_str"] = sentences_df1.clicks.apply(lambda x: " ".join(map(str, x)))
140 | sentences_df1 = sentences_df1.drop(columns='clicks')
141 | print(sentences_df1)
142 |
143 | print('开始读取词向量!!')
144 | word2vec_last_month = gensim.models.KeyedVectors.load_word2vec_format(
145 | f'/home/niejianfei/otto/{stage}/preprocess/deepwalk_last_month.w2v',
146 | binary=False)
147 | for t in candidate_type:
148 | # 只导入训练数据
149 | print('开始导入数据')
150 | for i in range(start, end):
151 | path = f"/home/niejianfei/otto/{stage}/candidates/candidates_{t[0:-1]}_features_data/candidate_{t[0:-1]}_{i}.pqt"
152 | print(f'第{i + 1}块数据')
153 | chunk = pd.read_parquet(path)
154 | print(path)
155 | print(chunk.columns)
156 | chunk = chunk.astype("float32")
157 | chunk['session'] = chunk['session'].astype('int32')
158 | chunk['aid'] = chunk['aid'].astype('int32')
159 | print(chunk)
160 |
161 | chunk = chunk.merge(sentences_df, left_on='session', right_index=True, how='left').fillna(value=-1)
162 | print(chunk)
163 | chunk['sim_list'] = chunk['aid'].astype('str') + ' ' + chunk['carts_and_orders_str'].astype('str')
164 | print('开始计算相似度!!!')
165 | chunk['sim_score_str'] = chunk['sim_list'].apply(
166 | lambda x: calculate_deepwalk_similarity(x, word2vec_last_month))
167 | print(chunk[['carts_and_orders_str', 'sim_list', 'sim_score_str']])
168 | chunk['orders_sim_mean'] = chunk['sim_score_str'].apply(lambda x: float(x.split(' ')[0]))
169 | chunk['orders_sim_max'] = chunk['sim_score_str'].apply(lambda x: float(x.split(' ')[1]))
170 | print(chunk[chunk['orders_sim_mean'] != -10])
171 |
172 | chunk = chunk.merge(sentences_df1, left_on='session', right_index=True, how='left').fillna(value=-1)
173 | print(chunk)
174 | chunk['clicks_sim_list'] = chunk['aid'].astype('str') + ' ' + chunk['clicks_str'].astype('str')
175 |             print('carts开始计算相似度!!!')
176 | chunk['clicks_sim_score_str'] = chunk['clicks_sim_list'].apply(
177 | lambda x: calculate_deepwalk_similarity(x, word2vec_last_month))
178 | print(chunk[['clicks_str', 'clicks_sim_list', 'clicks_sim_score_str']])
179 | chunk['carts_sim_mean'] = chunk['clicks_sim_score_str'].apply(lambda x: float(x.split(' ')[0]))
180 | chunk['carts_sim_max'] = chunk['clicks_sim_score_str'].apply(lambda x: float(x.split(' ')[1]))
181 | print(chunk[chunk['carts_sim_mean'] != -10])
182 |
183 | chunk = chunk.drop(
184 | columns=['carts_and_orders_str', 'sim_list', 'sim_score_str', 'clicks_str', 'clicks_sim_list',
185 | 'clicks_sim_score_str'])
186 | print(chunk[['orders_sim_mean', 'orders_sim_max', 'carts_sim_mean', 'carts_sim_max']])
187 | print(chunk)
188 | chunk.to_parquet(path)
189 |
190 |
191 | def calculate_deepwalk_similarity_tail(string, model):
192 | list = string.split(' ')
193 | if int(list[-1]) < 0:
194 | return '-10 -10 -10'
195 | sim = []
196 | aid = 'item_' + list[0]
197 | for i in list[1:]:
198 | simm = model.similarity(f'item_{i}', aid)
199 | sim.append(simm)
200 | if len(sim) >= 3:
201 | return str(sim[-1]) + ' ' + str(sim[-2]) + ' ' + str(sim[-3])
202 | elif len(sim) == 2:
203 | return str(sim[-1]) + ' ' + str(sim[-2]) + ' -10'
204 | else:
205 | return str(sim[-1]) + ' -10 -10'
206 |
207 |
208 | def deepwalk_i2i_similarity_tail(stage, candidate_type, start, end):
209 | print('开始读取数据!!!')
210 | valid = load_validate(f'/home/niejianfei/otto/{stage}/data/test_parquet/*')
211 | print(valid)
212 | print('开始筛选')
213 |
214 | valid1 = valid[valid['type'] != 0]
215 | print(valid1)
216 | print('开始排序')
217 | # 分别对session_id聚合,对时间进行排序
218 | df = valid1.sort_values(by=["session", "ts"], ascending=True)
219 | print('生成list')
220 | sentences_df = pd.DataFrame(df.groupby('session')['aid'].agg(list))
221 | sentences_df.columns = ['carts_and_orders']
222 | sentences_df["carts_and_orders_str"] = sentences_df.carts_and_orders.apply(lambda x: " ".join(map(str, x)))
223 | sentences_df = sentences_df.drop(columns='carts_and_orders')
224 | print(sentences_df)
225 |
226 | valid2 = valid[valid['type'] == 0]
227 | print(valid2)
228 | print('开始排序')
229 | # 分别对session_id聚合,对时间进行排序
230 | df1 = valid2.sort_values(by=["session", "ts"], ascending=True)
231 | print('生成list')
232 | sentences_df1 = pd.DataFrame(df1.groupby('session')['aid'].agg(list))
233 | sentences_df1.columns = ['clicks']
234 | sentences_df1["clicks_str"] = sentences_df1.clicks.apply(lambda x: " ".join(map(str, x)))
235 | sentences_df1 = sentences_df1.drop(columns='clicks')
236 | print(sentences_df1)
237 |
238 | print('开始读取词向量!!')
239 | word2vec_last_month = gensim.models.KeyedVectors.load_word2vec_format(f'/home/niejianfei/otto/{stage}/preprocess/deepwalk_last_month.w2v',
240 | binary=False)
241 | for t in candidate_type:
242 | # 只导入训练数据
243 | print('开始导入数据')
244 | for i in range(start, end):
245 | path = f"/home/niejianfei/otto/{stage}/candidates/candidates_{t[0:-1]}_features_data/candidate_{t[0:-1]}_{i}.pqt"
246 | print(f'第{i + 1}块数据')
247 | chunk = pd.read_parquet(path)
248 | print(path)
249 | chunk = chunk.astype("float32")
250 | chunk['session'] = chunk['session'].astype('int32')
251 | chunk['aid'] = chunk['aid'].astype('int32')
252 | print(chunk)
253 | print(chunk.columns)
254 |
255 | print('merge')
256 | chunk = chunk.merge(sentences_df, left_on='session', right_index=True, how='left').fillna(value=-1)
257 | print(chunk)
258 | chunk['sim_list'] = chunk['aid'].astype('str') + ' ' + chunk['carts_and_orders_str'].astype('str')
259 | print('开始计算相似度!!!')
260 | chunk['sim_score_str'] = chunk['sim_list'].apply(lambda x: calculate_deepwalk_similarity_tail(x, word2vec_last_month))
261 | print(chunk[['carts_and_orders_str', 'sim_list', 'sim_score_str']])
262 | chunk['buys_sim_-1'] = chunk['sim_score_str'].apply(lambda x: float(x.split(' ')[0]))
263 | chunk['buys_sim_-2'] = chunk['sim_score_str'].apply(lambda x: float(x.split(' ')[1]))
264 | chunk['buys_sim_-3'] = chunk['sim_score_str'].apply(lambda x: float(x.split(' ')[2]))
265 | print(chunk[chunk['buys_sim_-1'] != -10])
266 |
267 | chunk = chunk.merge(sentences_df1, left_on='session', right_index=True, how='left').fillna(value=-1)
268 | print(chunk)
269 | chunk['clicks_sim_list'] = chunk['aid'].astype('str') + ' ' + chunk['clicks_str'].astype('str')
270 | print('click开始计算相似度!!!')
271 | chunk['clicks_sim_score_str'] = chunk['clicks_sim_list'].apply(
272 | lambda x: calculate_deepwalk_similarity_tail(x, word2vec_last_month))
273 | print(chunk[['clicks_str', 'clicks_sim_list', 'clicks_sim_score_str']])
274 | chunk['clicks_sim_-1'] = chunk['clicks_sim_score_str'].apply(lambda x: float(x.split(' ')[0]))
275 | chunk['clicks_sim_-2'] = chunk['clicks_sim_score_str'].apply(lambda x: float(x.split(' ')[1]))
276 | chunk['clicks_sim_-3'] = chunk['clicks_sim_score_str'].apply(lambda x: float(x.split(' ')[2]))
277 | print(chunk[chunk['clicks_sim_-1'] != -10])
278 |
279 | chunk = chunk.drop(
280 | columns=['carts_and_orders_str', 'sim_list', 'sim_score_str', 'clicks_str', 'clicks_sim_list',
281 | 'clicks_sim_score_str'])
282 | print(chunk[['buys_sim_-1', 'buys_sim_-2', 'clicks_sim_-1', 'clicks_sim_-2']])
283 | print(chunk.columns)
284 | print(chunk)
285 | chunk.to_parquet(path)
286 |
287 |
288 | def calculate_deepwalk_u2i_similarity(string, model):
289 | list = string.split(' ')
290 | if int(list[-1]) < 0:
291 | return '-10'
292 | aid_emb = np.array(model[f'item_{list[0]}'])
293 | user_emb = np.zeros(64)
294 | for i in list[1:]:
295 | user_emb += np.array(model[f'item_{i}']) / (len(list) - 1)
296 |
297 | cos_sim = cosine_similarity(aid_emb.reshape(1, -1), user_emb.reshape(1, -1))
298 |
299 | return str(cos_sim[0][0])
300 |
301 |
302 | # deepwalk u2i similarity: cosine between the candidate vector and user vectors built from the orders / carts / clicks sequences
303 | def deepwalk_u2i_similarity(stage, candidate_type, start, end):
304 | print('开始读取数据!!!')
305 | valid = load_validate(f'/home/niejianfei/otto/{stage}/data/test_parquet/*')
306 | print(valid)
307 |
308 | print('开始筛选order')
309 | valid1 = valid[valid['type'] == 2]
310 | print(valid1)
311 | print('开始排序')
312 | # 分别对session_id聚合,对时间进行排序
313 | df = valid1.sort_values(by=["session", "ts"], ascending=True)
314 | print(df.head(10))
315 | print('生成list')
316 | sentences_df = pd.DataFrame(df.groupby('session')['aid'].agg(list))
317 | sentences_df.columns = ['carts_and_orders']
318 | print(sentences_df)
319 | sentences_df["carts_and_orders_str"] = sentences_df.carts_and_orders.apply(lambda x: " ".join(map(str, x)))
320 | sentences_df = sentences_df.drop(columns='carts_and_orders')
321 | print(sentences_df)
322 |
323 | print('开始筛选cart')
324 | valid2 = valid[valid['type'] == 1]
325 | print(valid2)
326 | print('开始排序')
327 | # 分别对session_id聚合,对时间进行排序
328 | df1 = valid2.sort_values(by=["session", "ts"], ascending=True)
329 | print(df1.head(10))
330 | print('生成list')
331 | sentences_df1 = pd.DataFrame(df1.groupby('session')['aid'].agg(list))
332 | sentences_df1.columns = ['clicks']
333 | print(sentences_df1)
334 | sentences_df1["clicks_str"] = sentences_df1.clicks.apply(lambda x: " ".join(map(str, x)))
335 | sentences_df1 = sentences_df1.drop(columns='clicks')
336 | print(sentences_df1)
337 |
338 | print('开始筛选click')
339 | valid3 = valid[valid['type'] == 0]
340 | print(valid3)
341 | print('开始排序')
342 | # 分别对session_id聚合,对时间进行排序
343 | df2 = valid3.sort_values(by=["session", "ts"], ascending=True)
344 | print(df2.head(10))
345 | print('生成list')
346 | sentences_df2 = pd.DataFrame(df2.groupby('session')['aid'].agg(list))
347 | sentences_df2.columns = ['clicks']
348 | print(sentences_df2)
349 | sentences_df2["clicks_str1"] = sentences_df2.clicks.apply(lambda x: " ".join(map(str, x)))
350 | sentences_df2 = sentences_df2.drop(columns='clicks')
351 | print(sentences_df2)
352 |
353 | print('开始读取词向量!!')
354 |     word2vec_last_month = gensim.models.KeyedVectors.load_word2vec_format(f'/home/niejianfei/otto/{stage}/preprocess/deepwalk_last_month.w2v',
355 | binary=False)
356 | for t in candidate_type:
357 | # 只导入训练数据
358 | print('开始导入数据')
359 | for i in range(start, end):
360 | path = f"/home/niejianfei/otto/{stage}/candidates/candidates_{t[0:-1]}_features_data/candidate_{t[0:-1]}_{i}.pqt"
361 | print(f'第{i + 1}块数据')
362 | chunk = pd.read_parquet(path)
363 | print(path)
364 | print(chunk.columns)
365 | chunk = chunk.astype("float32")
366 | chunk['session'] = chunk['session'].astype('int32')
367 | chunk['aid'] = chunk['aid'].astype('int32')
368 | print(chunk)
369 |
370 | chunk = chunk.merge(sentences_df, left_on='session', right_index=True, how='left').fillna(value=-1)
371 | print(chunk)
372 | chunk['sim_list'] = chunk['aid'].astype('str') + ' ' + chunk['carts_and_orders_str'].astype('str')
373 | print('order开始计算相似度!!!')
374 | chunk['sim_score_str'] = chunk['sim_list'].apply(
375 | lambda x: calculate_deepwalk_u2i_similarity(x, word2vec_last_month))
376 | print(chunk[['carts_and_orders_str', 'sim_list', 'sim_score_str']])
377 | chunk['orders_user_item_sim'] = chunk['sim_score_str'].astype('float32')
378 | print(chunk[chunk['orders_user_item_sim'] != -10])
379 |
380 | chunk = chunk.merge(sentences_df1, left_on='session', right_index=True, how='left').fillna(value=-1)
381 | print(chunk)
382 | chunk['clicks_sim_list'] = chunk['aid'].astype('str') + ' ' + chunk['clicks_str'].astype('str')
383 | print('cart开始计算相似度!!!')
384 | chunk['clicks_sim_score_str'] = chunk['clicks_sim_list'].apply(
385 | lambda x: calculate_deepwalk_u2i_similarity(x, word2vec_last_month))
386 | print(chunk[['clicks_str', 'clicks_sim_list', 'clicks_sim_score_str']])
387 | chunk['carts_user_item_sim'] = chunk['clicks_sim_score_str'].astype('float32')
388 | print(chunk[chunk['carts_user_item_sim'] != -10])
389 |
390 | chunk = chunk.merge(sentences_df2, left_on='session', right_index=True, how='left').fillna(value=-1)
391 | print(chunk)
392 | chunk['clicks_sim_list1'] = chunk['aid'].astype('str') + ' ' + chunk['clicks_str1'].astype('str')
393 | print('click开始计算相似度!!!')
394 | chunk['clicks_sim_score_str1'] = chunk['clicks_sim_list1'].apply(
395 | lambda x: calculate_deepwalk_u2i_similarity(x, word2vec_last_month))
396 | print(chunk[['clicks_str1', 'clicks_sim_list1', 'clicks_sim_score_str1']])
397 | chunk['clicks_user_item_sim'] = chunk['clicks_sim_score_str1'].astype('float32')
398 | print(chunk[chunk['clicks_user_item_sim'] != -10])
399 |
400 | chunk = chunk.drop(
401 | columns=['carts_and_orders_str', 'sim_list', 'sim_score_str', 'clicks_str', 'clicks_sim_list',
402 | 'clicks_sim_score_str', 'clicks_str1', 'clicks_sim_list1',
403 | 'clicks_sim_score_str1'])
404 | print(chunk[['orders_user_item_sim', 'carts_user_item_sim', 'clicks_user_item_sim']])
405 | print(chunk.columns)
406 | print(chunk)
407 | chunk.to_parquet(path)
408 |
409 |
410 | def calculate_prone_similarity(string, model, aid_num_dict):
411 | list = string.split(' ')
412 | if int(list[-1]) < 0:
413 | return '-10 -10'
414 | sim = []
415 | aid = list[0]
416 | if int(aid) in aid_num_dict:
417 | for i in list[1:]:
418 | if int(i) in aid_num_dict:
419 | simm = model.similarity(str(aid_num_dict[int(i)]), str(aid_num_dict[int(aid)]))
420 | sim.append(simm)
421 | if len(sim) == 0:
422 | return '-10 -10'
423 | sim_mean = sum(sim) / len(sim)
424 | sim_max = max(sim)
425 | return str(sim_mean) + ' ' + str(sim_max)
426 |
427 |
428 | # prone,i2i相似度buys和clicks相似度的mean和max
429 | def prone_i2i_similarity(stage, candidate_type, start, end):
430 | print('开始读取数据!!!')
431 | valid = load_validate(f'/home/niejianfei/otto/{stage}/data/test_parquet/*')
432 | print(valid)
433 | print('开始筛选')
434 | valid1 = valid[valid['type'] != 0]
435 | print(valid1)
436 | print('开始排序')
437 | # 分别对session_id聚合,对时间进行排序
438 | df = valid1.sort_values(by=["session", "ts"], ascending=True)
439 | print('生成list')
440 | sentences_df = pd.DataFrame(df.groupby('session')['aid'].agg(list))
441 | sentences_df.columns = ['carts_and_orders']
442 | sentences_df["carts_and_orders_str"] = sentences_df.carts_and_orders.apply(lambda x: " ".join(map(str, x)))
443 | sentences_df = sentences_df.drop(columns='carts_and_orders')
444 | print(sentences_df)
445 |
446 | valid2 = valid[valid['type'] == 0]
447 | print(valid2)
448 | print('开始排序')
449 | # 分别对session_id聚合,对时间进行排序
450 | df1 = valid2.sort_values(by=["session", "ts"], ascending=True)
451 | print('生成list')
452 | sentences_df1 = pd.DataFrame(df1.groupby('session')['aid'].agg(list))
453 | sentences_df1.columns = ['clicks']
454 | sentences_df1["clicks_str"] = sentences_df1.clicks.apply(lambda x: " ".join(map(str, x)))
455 | sentences_df1 = sentences_df1.drop(columns='clicks')
456 | print(sentences_df1)
457 |
458 | print('开始读取词向量!!')
459 | proNE_last_month = gensim.models.KeyedVectors.load_word2vec_format(
460 | f"/home/niejianfei/otto/{stage}/preprocess/proNE_last_month_enhanced.emb",
461 | binary=False)
462 |
463 | print("开始读取aim_num映射文件!!!")
464 | f_read = open(f'/home/niejianfei/otto/{stage}/preprocess/aid_num_dict.pkl', 'rb')
465 | aid_num_dict = pickle.load(f_read)
466 | f_read.close()
467 | print('输出', aid_num_dict[0])
468 | print("aim_num映射文件读取完毕!!!")
469 |
470 | for t in candidate_type:
471 | # 只导入训练数据
472 | print('开始导入数据')
473 | for i in range(start, end):
474 | path = f"/home/niejianfei/otto/{stage}/candidates/candidates_{t[0:-1]}_features_data/candidate_{t[0:-1]}_{i}.pqt"
475 | print(f'第{i + 1}块数据')
476 | chunk = pd.read_parquet(path)
477 | print(path)
478 | print(chunk.columns)
479 |
480 | chunk = chunk.astype("float32")
481 | chunk['session'] = chunk['session'].astype('int32')
482 | chunk['aid'] = chunk['aid'].astype('int32')
483 | print(chunk)
484 |
485 | chunk = chunk.merge(sentences_df, left_on='session', right_index=True, how='left').fillna(value=-1)
486 | print(chunk)
487 | chunk['sim_list'] = chunk['aid'].astype('str') + ' ' + chunk['carts_and_orders_str'].astype('str')
488 | print('开始计算相似度!!!')
489 | chunk['sim_score_str'] = chunk['sim_list'].apply(
490 | lambda x: calculate_prone_similarity(x, proNE_last_month, aid_num_dict))
491 | print(chunk[['carts_and_orders_str', 'sim_list', 'sim_score_str']])
492 | chunk['proNE_buys_sim_mean'] = chunk['sim_score_str'].apply(lambda x: float(x.split(' ')[0]))
493 | chunk['proNE_buys_sim_max'] = chunk['sim_score_str'].apply(lambda x: float(x.split(' ')[1]))
494 | print(chunk[chunk['proNE_buys_sim_mean'] != -10])
495 |
496 | chunk = chunk.merge(sentences_df1, left_on='session', right_index=True, how='left').fillna(value=-1)
497 | print(chunk)
498 | chunk['clicks_sim_list'] = chunk['aid'].astype('str') + ' ' + chunk['clicks_str'].astype('str')
499 | print('click开始计算相似度!!!')
500 | chunk['clicks_sim_score_str'] = chunk['clicks_sim_list'].apply(
501 | lambda x: calculate_prone_similarity(x, proNE_last_month, aid_num_dict))
502 | print(chunk[['clicks_str', 'clicks_sim_list', 'clicks_sim_score_str']])
503 | chunk['proNE_clicks_sim_mean'] = chunk['clicks_sim_score_str'].apply(lambda x: float(x.split(' ')[0]))
504 | chunk['proNE_clicks_sim_max'] = chunk['clicks_sim_score_str'].apply(lambda x: float(x.split(' ')[1]))
505 | print(chunk[chunk['proNE_clicks_sim_mean'] != -10])
506 |
507 | chunk = chunk.drop(
508 | columns=['carts_and_orders_str', 'sim_list', 'sim_score_str', 'clicks_str', 'clicks_sim_list',
509 | 'clicks_sim_score_str'])
510 | print(chunk[['proNE_buys_sim_mean', 'proNE_buys_sim_max', 'proNE_clicks_sim_mean', 'proNE_clicks_sim_max']])
511 | print(chunk)
512 | chunk.to_parquet(path)
513 |
514 |
515 | def calculate_prone_similarity_tail(string, model, aid_num_dict):
516 | list = string.split(' ')
517 | if int(list[-1]) < 0:
518 | return '-10 -10 -10'
519 | sim = []
520 | aid = list[0]
521 | if int(aid) in aid_num_dict:
522 | for i in list[1:]:
523 | if int(i) in aid_num_dict:
524 | simm = model.similarity(str(aid_num_dict[int(i)]), str(aid_num_dict[int(aid)]))
525 | sim.append(simm)
526 | if len(sim) >= 3:
527 | return str(sim[-1]) + ' ' + str(sim[-2]) + ' ' + str(sim[-3])
528 | elif len(sim) == 2:
529 | return str(sim[-1]) + ' ' + str(sim[-2]) + ' -10'
530 | elif len(sim) == 1:
531 | return str(sim[-1]) + ' -10 -10'
532 | else:
533 | return '-10 -10 -10'
534 |
535 |
536 | def prone_i2i_similarity_tail(stage, candidate_type, start, end):
537 | print('开始读取数据!!!')
538 | valid = load_validate(f'/home/niejianfei/otto/{stage}/data/test_parquet/*')
539 | print(valid)
540 | print('开始筛选')
541 |
542 | valid1 = valid[valid['type'] != 0]
543 | print(valid1)
544 | print('开始排序')
545 | # 分别对session_id聚合,对时间进行排序
546 | df = valid1.sort_values(by=["session", "ts"], ascending=True)
547 | print('生成list')
548 | sentences_df = pd.DataFrame(df.groupby('session')['aid'].agg(list))
549 | sentences_df.columns = ['carts_and_orders']
550 | sentences_df["carts_and_orders_str"] = sentences_df.carts_and_orders.apply(lambda x: " ".join(map(str, x)))
551 | sentences_df = sentences_df.drop(columns='carts_and_orders')
552 | print(sentences_df)
553 |
554 | valid2 = valid[valid['type'] == 0]
555 | print(valid2)
556 | print('开始排序')
557 | # 分别对session_id聚合,对时间进行排序
558 | df1 = valid2.sort_values(by=["session", "ts"], ascending=True)
559 | print('生成list')
560 | sentences_df1 = pd.DataFrame(df1.groupby('session')['aid'].agg(list))
561 | sentences_df1.columns = ['clicks']
562 | sentences_df1["clicks_str"] = sentences_df1.clicks.apply(lambda x: " ".join(map(str, x)))
563 | sentences_df1 = sentences_df1.drop(columns='clicks')
564 | print(sentences_df1)
565 |
566 | print('开始读取词向量!!')
567 | proNE_last_month = gensim.models.KeyedVectors.load_word2vec_format(
568 | f"/home/niejianfei/otto/{stage}/preprocess/proNE_last_month_enhanced.emb",
569 | binary=False)
570 |
571 | print("开始读取aim_num映射文件!!!")
572 | f_read = open(f'/home/niejianfei/otto/{stage}/preprocess/aid_num_dict.pkl', 'rb')
573 | aid_num_dict = pickle.load(f_read)
574 | f_read.close()
575 | print('输出', aid_num_dict[0])
576 | print("aim_num映射文件读取完毕!!!")
577 |
578 | for t in candidate_type:
579 | # 只导入训练数据
580 | print('开始导入数据')
581 | for i in range(start, end):
582 | path = f"/home/niejianfei/otto/{stage}/candidates/candidates_{t[0:-1]}_features_data/candidate_{t[0:-1]}_{i}.pqt"
583 | print(f'第{i + 1}块数据')
584 | chunk = pd.read_parquet(path)
585 | print(path)
586 | print(chunk.columns)
587 | chunk = chunk.astype("float32")
588 | chunk['session'] = chunk['session'].astype('int32')
589 | chunk['aid'] = chunk['aid'].astype('int32')
590 | print(chunk)
591 | print(chunk.columns)
592 |
593 | print('merge')
594 | chunk = chunk.merge(sentences_df, left_on='session', right_index=True, how='left').fillna(value=-1)
595 | print(chunk)
596 | chunk['sim_list'] = chunk['aid'].astype('str') + ' ' + chunk['carts_and_orders_str'].astype('str')
597 | print('开始计算相似度!!!')
598 | chunk['sim_score_str'] = chunk['sim_list'].apply(
599 | lambda x: calculate_prone_similarity_tail(x, proNE_last_month, aid_num_dict))
600 | print(chunk[['carts_and_orders_str', 'sim_list', 'sim_score_str']])
601 | chunk['proNE_buys_sim_-1'] = chunk['sim_score_str'].apply(lambda x: float(x.split(' ')[0]))
602 | chunk['proNE_buys_sim_-2'] = chunk['sim_score_str'].apply(lambda x: float(x.split(' ')[1]))
603 | chunk['proNE_buys_sim_-3'] = chunk['sim_score_str'].apply(lambda x: float(x.split(' ')[2]))
604 | print(chunk[chunk['proNE_buys_sim_-1'] != -10])
605 |
606 | chunk = chunk.merge(sentences_df1, left_on='session', right_index=True, how='left').fillna(value=-1)
607 | print(chunk)
608 | chunk['clicks_sim_list'] = chunk['aid'].astype('str') + ' ' + chunk['clicks_str'].astype('str')
609 | print('click开始计算相似度!!!')
610 | chunk['clicks_sim_score_str'] = chunk['clicks_sim_list'].apply(
611 | lambda x: calculate_prone_similarity_tail(x, proNE_last_month, aid_num_dict))
612 | print(chunk[['clicks_str', 'clicks_sim_list', 'clicks_sim_score_str']])
613 | chunk['proNE_clicks_sim_-1'] = chunk['clicks_sim_score_str'].apply(lambda x: float(x.split(' ')[0]))
614 | chunk['proNE_clicks_sim_-2'] = chunk['clicks_sim_score_str'].apply(lambda x: float(x.split(' ')[1]))
615 | chunk['proNE_clicks_sim_-3'] = chunk['clicks_sim_score_str'].apply(lambda x: float(x.split(' ')[2]))
616 | print(chunk[chunk['proNE_clicks_sim_-1'] != -10])
617 |
618 | chunk = chunk.drop(
619 | columns=['carts_and_orders_str', 'sim_list', 'sim_score_str', 'clicks_str', 'clicks_sim_list',
620 | 'clicks_sim_score_str'])
621 | print(chunk[['proNE_buys_sim_-1', 'proNE_buys_sim_-2', 'proNE_clicks_sim_-1', 'proNE_clicks_sim_-2']])
622 | print(chunk.columns)
623 | print(chunk)
624 | chunk.to_parquet(path)
625 |
626 |
627 | def calculate_MF_similarity(string, array):
628 | list = string.split(' ')
629 | if int(list[-1]) < 0:
630 |         return '-10' + ' -10' * 5  # six fields, matching the normal return below
631 | sim = []
632 | aid = int(list[0])
633 | for i in list[1:]:
634 | simm = cosine_similarity(array[aid].reshape(1, -1), array[int(i)].reshape(1, -1))[0][0]
635 | sim.append(simm)
636 | sim_sum = sum(sim)
637 | sim_mean = sim_sum / len(sim)
638 | sim_max = max(sim)
639 |     tail = sim[-1:-4:-1] + [-10.0] * max(0, 3 - len(sim))  # last three similarities (newest first), padded with -10
640 |     return ' '.join(str(x) for x in [sim_mean, sim_max, sim_sum] + tail)
641 |
642 |
643 | # bpr,als,lmf,u2i相似度
644 | def bpr_als_lmf_u2i_similarity(stage, candidate_type, start, end):
645 | print('bpr')
646 | bpr_user_emb = np.load(f'/home/niejianfei/otto/{stage}/preprocess/bpr_user_emb.npy')
647 | bpr_item_emb = np.load(f'/home/niejianfei/otto/{stage}/preprocess/bpr_item_emb.npy')
648 | print('als')
649 | als_user_emb = np.load(f'/home/niejianfei/otto/{stage}/preprocess/als_user_emb.npy')
650 | als_item_emb = np.load(f'/home/niejianfei/otto/{stage}/preprocess/als_item_emb.npy')
651 | print('lmf')
652 | lmf_user_emb = np.load(f'/home/niejianfei/otto/{stage}/preprocess/lmf_user_emb.npy')
653 | lmf_item_emb = np.load(f'/home/niejianfei/otto/{stage}/preprocess/lmf_item_emb.npy')
654 |
655 | for t in candidate_type:
656 | print('开始导入数据')
657 | for i in range(start, end):
658 | path = f"/home/niejianfei/otto/{stage}/candidates/candidates_{t[0:-1]}_features_data/candidate_{t[0:-1]}_{i}.pqt"
659 | print(f'第{i + 1}块数据')
660 | chunk = pd.read_parquet(path)
661 | print(path)
662 | print(chunk.columns)
663 |
664 | chunk = chunk.astype("float32")
665 | chunk['session'] = chunk['session'].astype('int32')
666 | chunk['aid'] = chunk['aid'].astype('int32')
667 | print(chunk)
668 |
669 | chunk['list'] = chunk['session'].astype('str') + ' ' + chunk['aid'].astype('str')
670 | print(chunk)
671 | chunk['bpr_user_item_sim'] = chunk['list'].map(
672 | lambda x: np.dot(bpr_user_emb[int(x.split(' ')[0])], bpr_item_emb[int(x.split(' ')[1])]))
673 | print(chunk['bpr_user_item_sim'].describe())
674 |
675 | chunk['als_user_item_sim'] = chunk['list'].map(
676 | lambda x: np.dot(als_user_emb[int(x.split(' ')[0])], als_item_emb[int(x.split(' ')[1])]))
677 | print(chunk['als_user_item_sim'].describe())
678 |
679 | chunk['lmf_user_item_sim'] = chunk['list'].map(
680 | lambda x: np.dot(lmf_user_emb[int(x.split(' ')[0])], lmf_item_emb[int(x.split(' ')[1])]))
681 | print(chunk['lmf_user_item_sim'].describe())
682 |
683 | print(chunk)
684 | chunk.to_parquet(path)
685 |
686 |
687 | def bpr_als_lmf_i2i_similarity(stage, candidate_type, start, end):
688 | print('开始读取数据!!!')
689 | valid = load_validate(f'/home/niejianfei/otto/{stage}/data/test_parquet/*')
690 | print(valid)
691 | print('开始筛选')
692 |
693 | valid1 = valid[valid['type'] != 0]
694 | print(valid1)
695 | print('开始排序')
696 | # 分别对session_id聚合,对时间进行排序
697 | df = valid1.sort_values(by=["session", "ts"], ascending=True)
698 | print(df.head(10))
699 | print('生成list')
700 | sentences_df = pd.DataFrame(df.groupby('session')['aid'].agg(list))
701 | sentences_df.columns = ['carts_and_orders']
702 | print(sentences_df)
703 | sentences_df["carts_and_orders_str"] = sentences_df.carts_and_orders.apply(lambda x: " ".join(map(str, x)))
704 | sentences_df = sentences_df.drop(columns='carts_and_orders')
705 | print(sentences_df)
706 |
707 | valid2 = valid[valid['type'] == 0]
708 | print(valid2)
709 | print('开始排序')
710 | # 分别对session_id聚合,对时间进行排序
711 | df1 = valid2.sort_values(by=["session", "ts"], ascending=True)
712 | print(df1.head(10))
713 | print('生成list')
714 | sentences_df1 = pd.DataFrame(df1.groupby('session')['aid'].agg(list))
715 | sentences_df1.columns = ['clicks']
716 | print(sentences_df1)
717 | sentences_df1["clicks_str"] = sentences_df1.clicks.apply(lambda x: " ".join(map(str, x)))
718 | sentences_df1 = sentences_df1.drop(columns='clicks')
719 | print(sentences_df1)
720 |
721 | print('bpr')
722 | bpr_item_emb = np.load(f'/home/niejianfei/otto/{stage}/preprocess/bpr_item_emb.npy')
723 | print('als')
724 | als_item_emb = np.load(f'/home/niejianfei/otto/{stage}/preprocess/als_item_emb.npy')
725 | print('lmf')
726 | lmf_item_emb = np.load(f'/home/niejianfei/otto/{stage}/preprocess/lmf_item_emb.npy')
727 |
728 | for t in candidate_type:
729 | # 只导入训练数据
730 | print('开始导入数据')
731 | for i in range(start, end):
732 | path = f"/home/niejianfei/otto/{stage}/candidates/candidates_{t[0:-1]}_features_data/candidate_{t[0:-1]}_{i}.pqt"
733 | print(f'第{i + 1}块数据')
734 | chunk = pd.read_parquet(path)
735 | print(path)
736 | print(chunk.columns)
737 |
738 | chunk = chunk.astype("float32")
739 | chunk['session'] = chunk['session'].astype('int32')
740 | chunk['aid'] = chunk['aid'].astype('int32')
741 | print(chunk)
742 |
743 | chunk = chunk.merge(sentences_df, left_on='session', right_index=True, how='left').fillna(value=-1)
744 | print(chunk)
745 | chunk['sim_list'] = chunk['aid'].astype('str') + ' ' + chunk['carts_and_orders_str'].astype('str')
746 | print('开始计算相似度!!!')
747 | chunk['sim_score_str'] = chunk['sim_list'].apply(
748 | lambda x: calculate_MF_similarity(x, bpr_item_emb))
749 | print(chunk[['carts_and_orders_str', 'sim_list', 'sim_score_str']])
750 | chunk['bpr_buys_sim_mean'] = chunk['sim_score_str'].apply(lambda x: float(x.split(' ')[0])).astype('float32')
751 | chunk['bpr_buys_sim_max'] = chunk['sim_score_str'].apply(lambda x: float(x.split(' ')[1])).astype('float32')
752 | chunk['bpr_buys_sim_sum'] = chunk['sim_score_str'].apply(lambda x: float(x.split(' ')[2])).astype('float32')
753 | chunk['bpr_buys_sim_-1'] = chunk['sim_score_str'].apply(lambda x: float(x.split(' ')[3])).astype('float32')
754 | chunk['bpr_buys_sim_-2'] = chunk['sim_score_str'].apply(lambda x: float(x.split(' ')[4])).astype('float32')
755 | chunk['bpr_buys_sim_-3'] = chunk['sim_score_str'].apply(lambda x: float(x.split(' ')[5])).astype('float32')
756 | print(chunk[chunk['bpr_buys_sim_-3'] != -10])
757 | print(chunk)
758 |
759 | print('开始计算相似度!!!')
760 | chunk['sim_score_str'] = chunk['sim_list'].apply(
761 | lambda x: calculate_MF_similarity(x, als_item_emb))
762 | print(chunk[['carts_and_orders_str', 'sim_list', 'sim_score_str']])
763 | chunk['als_buys_sim_mean'] = chunk['sim_score_str'].apply(lambda x: float(x.split(' ')[0])).astype('float32')
764 | chunk['als_buys_sim_max'] = chunk['sim_score_str'].apply(lambda x: float(x.split(' ')[1])).astype('float32')
765 | chunk['als_buys_sim_sum'] = chunk['sim_score_str'].apply(lambda x: float(x.split(' ')[2])).astype('float32')
766 | chunk['als_buys_sim_-1'] = chunk['sim_score_str'].apply(lambda x: float(x.split(' ')[3])).astype('float32')
767 | chunk['als_buys_sim_-2'] = chunk['sim_score_str'].apply(lambda x: float(x.split(' ')[4])).astype('float32')
768 | chunk['als_buys_sim_-3'] = chunk['sim_score_str'].apply(lambda x: float(x.split(' ')[5])).astype('float32')
769 |
770 | print(chunk[chunk['als_buys_sim_-3'] != -10])
771 | print(chunk)
772 |
773 | print('开始计算相似度!!!')
774 | chunk['sim_score_str'] = chunk['sim_list'].apply(
775 | lambda x: calculate_MF_similarity(x, lmf_item_emb))
776 | print(chunk[['carts_and_orders_str', 'sim_list', 'sim_score_str']])
777 | chunk['lmf_buys_sim_mean'] = chunk['sim_score_str'].apply(lambda x: float(x.split(' ')[0])).astype('float32')
778 | chunk['lmf_buys_sim_max'] = chunk['sim_score_str'].apply(lambda x: float(x.split(' ')[1])).astype('float32')
779 | chunk['lmf_buys_sim_sum'] = chunk['sim_score_str'].apply(lambda x: float(x.split(' ')[2])).astype('float32')
780 | chunk['lmf_buys_sim_-1'] = chunk['sim_score_str'].apply(lambda x: float(x.split(' ')[3])).astype('float32')
781 | chunk['lmf_buys_sim_-2'] = chunk['sim_score_str'].apply(lambda x: float(x.split(' ')[4])).astype('float32')
782 | chunk['lmf_buys_sim_-3'] = chunk['sim_score_str'].apply(lambda x: float(x.split(' ')[5])).astype('float32')
783 |
784 | print(chunk[chunk['lmf_buys_sim_-3'] != -10])
785 | print(chunk)
786 |
787 | chunk = chunk.merge(sentences_df1, left_on='session', right_index=True, how='left').fillna(value=-1)
788 | print(chunk)
789 | chunk['clicks_sim_list'] = chunk['aid'].astype('str') + ' ' + chunk['clicks_str'].astype('str')
790 | print('click开始计算相似度!!!')
791 | chunk['clicks_sim_score_str'] = chunk['clicks_sim_list'].apply(
792 | lambda x: calculate_MF_similarity(x, bpr_item_emb))
793 | print(chunk[['clicks_str', 'clicks_sim_list', 'clicks_sim_score_str']])
794 | chunk['bpr_clicks_sim_mean'] = chunk['clicks_sim_score_str'].apply(lambda x: float(x.split(' ')[0])).astype('float32')
795 | chunk['bpr_clicks_sim_max'] = chunk['clicks_sim_score_str'].apply(lambda x: float(x.split(' ')[1])).astype('float32')
796 | chunk['bpr_clicks_sim_sum'] = chunk['clicks_sim_score_str'].apply(lambda x: float(x.split(' ')[2])).astype('float32')
797 | chunk['bpr_clicks_sim_-1'] = chunk['clicks_sim_score_str'].apply(lambda x: float(x.split(' ')[3])).astype('float32')
798 | chunk['bpr_clicks_sim_-2'] = chunk['clicks_sim_score_str'].apply(lambda x: float(x.split(' ')[4])).astype('float32')
799 | chunk['bpr_clicks_sim_-3'] = chunk['clicks_sim_score_str'].apply(lambda x: float(x.split(' ')[5])).astype('float32')
800 | print(chunk[chunk['bpr_clicks_sim_-3'] != -10])
801 | print(chunk)
802 |
803 | print('click开始计算相似度!!!')
804 | chunk['clicks_sim_score_str'] = chunk['clicks_sim_list'].apply(
805 | lambda x: calculate_MF_similarity(x, als_item_emb))
806 | print(chunk[['clicks_str', 'clicks_sim_list', 'clicks_sim_score_str']])
807 | chunk['als_clicks_sim_mean'] = chunk['clicks_sim_score_str'].apply(lambda x: float(x.split(' ')[0])).astype(
808 | 'float32')
809 | chunk['als_clicks_sim_max'] = chunk['clicks_sim_score_str'].apply(lambda x: float(x.split(' ')[1])).astype(
810 | 'float32')
811 | chunk['als_clicks_sim_sum'] = chunk['clicks_sim_score_str'].apply(lambda x: float(x.split(' ')[2])).astype(
812 | 'float32')
813 | chunk['als_clicks_sim_-1'] = chunk['clicks_sim_score_str'].apply(lambda x: float(x.split(' ')[3])).astype(
814 | 'float32')
815 | print(chunk[chunk['als_clicks_sim_-1'] != -10])
816 | print(chunk)
817 |
818 | print('click开始计算相似度!!!')
819 | chunk['clicks_sim_score_str'] = chunk['clicks_sim_list'].apply(
820 | lambda x: calculate_MF_similarity(x, lmf_item_emb))
821 | print(chunk[['clicks_str', 'clicks_sim_list', 'clicks_sim_score_str']])
822 | chunk['lmf_clicks_sim_mean'] = chunk['clicks_sim_score_str'].apply(lambda x: float(x.split(' ')[0])).astype('float32')
823 | chunk['lmf_clicks_sim_max'] = chunk['clicks_sim_score_str'].apply(lambda x: float(x.split(' ')[1])).astype('float32')
824 | chunk['lmf_clicks_sim_sum'] = chunk['clicks_sim_score_str'].apply(lambda x: float(x.split(' ')[2])).astype('float32')
825 | chunk['lmf_clicks_sim_-1'] = chunk['clicks_sim_score_str'].apply(lambda x: float(x.split(' ')[3])).astype('float32')
826 | chunk['lmf_clicks_sim_-2'] = chunk['clicks_sim_score_str'].apply(lambda x: float(x.split(' ')[4])).astype('float32')
827 | chunk['lmf_clicks_sim_-3'] = chunk['clicks_sim_score_str'].apply(lambda x: float(x.split(' ')[5])).astype('float32')
828 | print(chunk[chunk['lmf_clicks_sim_-3'] != -10])
829 | print(chunk)
830 |
831 | chunk = chunk.drop(
832 | columns=['carts_and_orders_str', 'sim_list', 'sim_score_str', 'clicks_str', 'clicks_sim_list',
833 | 'clicks_sim_score_str'])
834 | print(chunk['als_clicks_sim_max'])
835 | print(chunk.columns)
836 | print(chunk)
837 | chunk.to_parquet(path)
838 |
839 |
840 | def similarity_features(stage, candidate_type, start, end):
841 | # buys&clicks * 4
842 | deepwalk_i2i_similarity1(stage, candidate_type, start, end)
843 | # orders&carts * 4
844 | deepwalk_i2i_similarity2(stage, candidate_type, start, end)
845 | # buys&clicks * 6
846 | deepwalk_i2i_similarity_tail(stage, candidate_type, start, end)
847 | # buys&clicks * 3
848 | deepwalk_u2i_similarity(stage, candidate_type, start, end)
849 |
850 | # buys&clicks * 4
851 | prone_i2i_similarity(stage, candidate_type, start, end)
852 | # buys&clicks * 6
853 | prone_i2i_similarity_tail(stage, candidate_type, start, end)
854 |
855 |
856 | if __name__ == '__main__':
857 | IS_TRAIN = True
858 | if IS_TRAIN:
859 | stage = 'CV'
860 | else:
861 | stage = 'LB'
862 | candidate_type = ['clicks', 'carts', 'orders']
863 | similarity_features(stage, candidate_type, 0, 8)
864 |
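Two conventions shared by the similarity helpers above: (1) after the left merge, sessions with no history of the required type are filled with -1, the helpers detect that through int(list[-1]) < 0 and return -10 for every expected field, so a -10 downstream means "no history", not a real similarity; (2) the u2i variants build the user vector as the plain average of the deepwalk item vectors in the session and compare it to the candidate with cosine similarity. A minimal sketch of that averaging with made-up 4-dimensional vectors (the real vectors come from deepwalk_last_month.w2v and are keyed as 'item_{aid}'):

    import numpy as np
    from sklearn.metrics.pairwise import cosine_similarity

    vecs = {'item_1': np.array([1., 0., 0., 0.]),   # toy embeddings, not real deepwalk output
            'item_2': np.array([0., 1., 0., 0.]),
            'item_7': np.array([1., 1., 0., 0.])}
    history = ['item_1', 'item_2']                  # the session's carts/orders, oldest to newest
    candidate = 'item_7'
    user_emb = np.mean([vecs[a] for a in history], axis=0)
    sim = cosine_similarity(vecs[candidate].reshape(1, -1), user_emb.reshape(1, -1))[0][0]
    print(sim)                                      # 1.0 for this toy example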
--------------------------------------------------------------------------------
/features/user_features.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import math
3 | import pandas as pd
4 | import numpy as np
5 |
6 |
7 | def load_data(path):
8 | type_transform = {"clicks": 0, "carts": 1, "orders": 2}
9 | dfs = []
10 | # 只导入训练数据
11 | for e, chunk_file in enumerate(glob.glob(path)):
12 | chunk = pd.read_parquet(chunk_file)
13 | chunk.ts = (chunk.ts / 1000).astype('int32')
14 | chunk['type'] = chunk['type'].map(type_transform).astype('int8')
15 | if not IS_TRAIN:
16 | # 除去第一周的数据
17 | chunk = chunk[chunk['ts'] >= 1659909599]
18 | dfs.append(chunk)
19 | return pd.concat(dfs).reset_index(drop=True)
20 |
21 |
22 | def user_features(input_path, output_path):
23 | print('开始读取数据!!!')
24 | valid = load_data(input_path)
25 | print("开始构造user_feature!!!")
26 | # 类别型变量分析:计数,分布
27 | # 连续性变量分析:最小值,最大值,离差,平均数,中位数,众数,标准差,变异系数,偏度,峰度
28 | print("开始聚合user:agg中!!!")
29 | user_features = valid.groupby('session').agg({'session': 'count', 'aid': 'nunique', 'type': ['mean', 'skew'],
30 | 'ts': ['min', 'max', 'skew']})
31 | user_features.columns = ['user_user_count', 'user_item_count', 'user_buy_ratio', 'user_buy_skew',
32 | 'user_min_ts', 'user_max_ts', 'user_skew_ts']
33 | print("开始计算ts偏态峰态系数!!!")
34 | # 计算时间偏态系数,计算时间峰态系数,Pandas Series.kurt()函数使用Fisher对峰度的定义(正常的峰度==0.0)
35 | user_features['user_skew_ts'] = user_features['user_skew_ts'].fillna(value=0)
36 | user_features['user_kurt_ts'] = valid.groupby('session')['ts'].apply(lambda x: pd.DataFrame.kurt(x)).fillna(value=0)
37 |
38 | print("开始计算type偏态峰态系数!!!")
39 | # 计算类型偏态系数,计算类型峰态系数,Pandas Series.kurt()函数使用Fisher对峰度的定义(正常的峰度==0.0)
40 | user_features['user_buy_skew'] = user_features['user_buy_skew'].fillna(value=0)
41 | user_features['user_buy_kurt'] = valid.groupby('session')['type'].apply(lambda x: pd.DataFrame.kurt(x)).fillna(
42 | value=0)
43 |
44 | print("开始计算ts天数!!!")
45 | # 序列持续的时间(天)
46 | user_features['user_long_ts'] = user_features['user_max_ts'] - user_features['user_min_ts']
47 |
48 | print("开始计算user三个比例特征!!!")
49 | # 平均每天观看几个商品
50 | user_features["user_avg_visit_per_day"] = user_features['user_user_count'] / (
51 | user_features['user_long_ts'] / (60 * 60 * 24)).clip(1, 60).apply(
52 | lambda x: math.ceil(x))
53 | # user重复观看的商品次数
54 | user_features["user_repeat_visit_num"] = user_features['user_user_count'] - user_features['user_item_count']
55 | # 平均每个商品观看的次数
56 | user_features["user_ave_visit_num"] = user_features['user_user_count'] / user_features['user_item_count']
57 | # session里面aids的re_watch比例
58 | user_features["user_re_visit_rate"] = user_features['user_repeat_visit_num'] / user_features['user_user_count']
59 | print(user_features.head())
60 | print(user_features.columns)
61 | print(user_features.shape)
62 | # 规定保存格式
63 | user_features = user_features.astype('float32')
64 | print("开始保存特征到文件!!!")
65 | user_features.to_parquet(output_path)
66 |
67 |
68 | def add_user_features(input_path, output_path):
69 | # user feature 7
70 | # 平均购买/加购/点击间隔 max - min / num
71 | # 点击购买率
72 | # 点击加购率
73 | # 加购购买率
74 | # 点击占比
75 | # 加购占比
76 | # 购买占比 user特征比较稀疏,加上可能效果不好
77 | # 复购率
78 | # 复加购率
79 | # 复点击率 item_item - item_user
80 | print('开始读取数据!!!')
81 | train = load_data(input_path)
82 | print("开始构造user_feature!!!")
83 | # 类别型变量分析:计数,分布
84 | # 连续性变量分析:最小值,最大值,离差,平均数,中位数,众数,标准差,变异系数,偏度,峰度
85 | print("开始聚合user:agg中!!!")
86 | train_click = train[train['type'] == 0]
87 | train_cart = train[train['type'] == 1]
88 | train_order = train[train['type'] == 2]
89 |
90 | print("开始构造item_feature!!!")
91 | click_user_features = train_click.groupby('session').agg({'aid': ['count', 'nunique'], 'ts': ['min', 'max']})
92 | # aid出现的次数,也就是aid发生的events数量:定义热门商品;操作aid的用户数量:简介定义热门商品;类型均值:这个商品易购程度
93 | click_user_features.columns = ['click_user_user_count', 'click_user_item_count', 'ts_min', 'ts_max']
94 | click_user_features['click_time'] = click_user_features['ts_max'] - click_user_features['ts_min']
95 | click_user_features['avg_click_span'] = click_user_features['click_time'] / click_user_features['click_user_user_count']
96 | click_user_features = click_user_features.drop(columns=['ts_min', 'ts_max', 'click_time'])
97 | print(click_user_features)
98 |
99 | cart_user_features = train_cart.groupby('session').agg({'aid': ['count', 'nunique'], 'ts': ['min', 'max']})
100 | # aid出现的次数,也就是aid发生的events数量:定义热门商品;操作aid的用户数量:简介定义热门商品;类型均值:这个商品易购程度
101 | cart_user_features.columns = ['cart_user_user_count', 'cart_user_item_count', 'ts_min', 'ts_max']
102 | cart_user_features['cart_time'] = cart_user_features['ts_max'] - cart_user_features['ts_min']
103 | cart_user_features['avg_cart_span'] = cart_user_features['cart_time'] / cart_user_features['cart_user_user_count']
104 | cart_user_features = cart_user_features.drop(columns=['ts_min', 'ts_max', 'cart_time'])
105 | print(cart_user_features)
106 |
107 | order_user_features = train_order.groupby('session').agg({'aid': ['count', 'nunique'], 'ts': ['min', 'max']})
108 | # aid出现的次数,也就是aid发生的events数量:定义热门商品;操作aid的用户数量:简介定义热门商品;类型均值:这个商品易购程度
109 | order_user_features.columns = ['order_user_user_count', 'order_user_item_count', 'ts_min', 'ts_max']
110 | order_user_features['order_time'] = order_user_features['ts_max'] - order_user_features['ts_min']
111 | order_user_features['avg_order_span'] = order_user_features['order_time'] / order_user_features['order_user_user_count']
112 | order_user_features = order_user_features.drop(columns=['ts_min', 'ts_max', 'order_time'])
113 | print(order_user_features)
114 |
115 | click_user_features = click_user_features.merge(cart_user_features, left_index=True, right_index=True,
116 | how='left').fillna(value=0)
117 | click_user_features = click_user_features.merge(order_user_features, left_index=True, right_index=True,
118 | how='left').fillna(value=0)
119 |
120 | # click_item_item_count, click_item_user_count
121 | # 点击购买率 * 3
122 | click_user_features['user_click_cart_rate'] = click_user_features['cart_user_user_count'] / click_user_features[
123 | 'click_user_user_count']
124 | click_user_features['user_click_order_rate'] = click_user_features['order_user_user_count'] / click_user_features[
125 | 'click_user_user_count']
126 | click_user_features['user_cart_order_rate'] = click_user_features['order_user_user_count'] / click_user_features['cart_user_user_count']
127 |
128 | # 点击占比
129 | click_user_features['user_click_percentage'] = click_user_features['click_user_user_count'] / click_user_features[
130 | 'click_user_user_count'].sum()
131 | click_user_features['user_cart_percentage'] = click_user_features['cart_user_user_count'] / click_user_features[
132 | 'cart_user_user_count'].sum()
133 | click_user_features['user_order_percentage'] = click_user_features['order_user_user_count'] / click_user_features[
134 | 'order_user_user_count'].sum()
135 | # 复购率
136 | click_user_features['user_re_click_rate'] = (click_user_features['click_user_user_count'] - click_user_features[
137 | 'click_user_item_count']) / click_user_features['click_user_user_count']
138 | click_user_features['user_re_cart_rate'] = (click_user_features['cart_user_user_count'] - click_user_features[
139 | 'cart_user_item_count']) / click_user_features['cart_user_user_count']
140 | click_user_features['user_re_order_rate'] = (click_user_features['order_user_user_count'] - click_user_features[
141 | 'order_user_item_count']) / click_user_features['order_user_user_count']
142 |
143 | click_user_features = click_user_features.replace(np.inf, 100)
144 | click_user_features = click_user_features.fillna(value=-10)
145 | print(click_user_features)
146 |
147 | print("开始保存特征到文件!!!")
148 | click_user_features.to_parquet(output_path)
149 |
150 |
151 | def trans_time_span_features(input_path, output_path1, output_path2, output_path3):
152 | train = load_data(input_path)
153 |
154 | train_clicks = train[train['type'] == 0].drop(columns='type')
155 | train_clicks = train_clicks.rename(columns={'ts': 'ts_click'})
156 | train_carts = train[train['type'] == 1].drop(columns='type')
157 | train_carts = train_carts.rename(columns={'ts': 'ts_cart'})
158 | train_orders = train[train['type'] == 2].drop(columns='type')
159 | train_orders = train_orders.rename(columns={'ts': 'ts_order'})
160 |
161 | print('click_cart_span')
162 | click_cart_span = train_clicks.merge(train_carts, on=['session', 'aid'], how='inner')
163 | print(click_cart_span)
164 | click_cart_span['min'] = click_cart_span['ts_click'] - click_cart_span['ts_cart']
165 | click_cart_span = click_cart_span[click_cart_span['min'] <= 0].drop(columns='min')
166 | print(click_cart_span)
167 | click_cart_span_feature = click_cart_span.groupby(['session', 'aid']).agg({'ts_click': 'min', 'ts_cart': 'min'})
168 | click_cart_span_feature.columns = ['ts_click_min', 'ts_cart_min']
169 | print(click_cart_span_feature)
170 | click_cart_span_feature['click_cart_span'] = click_cart_span_feature['ts_cart_min'] - click_cart_span_feature['ts_click_min']
171 | print(click_cart_span_feature)
172 | click_cart_span_feature['aids'] = click_cart_span_feature.index.get_level_values('aid')
173 | print(click_cart_span_feature)
174 | print(click_cart_span_feature.index.get_level_values('aid')[:10])
175 | click_cart_span_feature = click_cart_span_feature.groupby('aids').agg({'aids': 'count', 'click_cart_span': 'mean'})
176 | click_cart_span_feature.columns = ['trans_click_cart_count', 'trans_click_cart_span_avg']
177 | print(click_cart_span_feature.describe())
178 | print(click_cart_span_feature)
179 | click_cart_span_feature.to_parquet(output_path1)
180 |
181 | print('click_order_span')
182 | click_order_span = train_clicks.merge(train_orders, on=['session', 'aid'], how='inner')
183 | print(click_order_span)
184 | click_order_span['min'] = click_order_span['ts_click'] - click_order_span['ts_order']
185 | click_order_span = click_order_span[click_order_span['min'] <= 0].drop(columns='min')
186 | print(click_order_span)
187 | click_order_span_feature = click_order_span.groupby(['session', 'aid']).agg({'ts_click': 'min', 'ts_order': 'min'})
188 | click_order_span_feature.columns = ['ts_click_min', 'ts_order_min']
189 | print(click_order_span_feature)
190 | click_order_span_feature['click_order_span'] = click_order_span_feature['ts_order_min'] - click_order_span_feature['ts_click_min']
191 | print(click_order_span_feature)
192 | click_order_span_feature['aids'] = click_order_span_feature.index.get_level_values('aid')
193 | print(click_order_span_feature)
194 | print(click_order_span_feature.index.get_level_values('aid')[:10])
195 | click_order_span_feature = click_order_span_feature.groupby('aids').agg({'aids': 'count', 'click_order_span': 'mean'})
196 | click_order_span_feature.columns = ['trans_click_order_count', 'trans_click_order_span_avg']
197 | print(click_order_span_feature.describe())
198 | print(click_order_span_feature)
199 | click_order_span_feature.to_parquet(output_path2)
200 |
201 |
202 | print('cart_order_span')
203 | carts_order_span = train_carts.merge(train_orders, on=['session', 'aid'], how='inner')
204 | print(carts_order_span)
205 | carts_order_span['min'] = carts_order_span['ts_cart'] - carts_order_span['ts_order']
206 | carts_order_span = carts_order_span[carts_order_span['min'] <= 0].drop(columns='min')
207 | print(carts_order_span)
208 | cart_order_span_feature = carts_order_span.groupby(['session', 'aid']).agg({'ts_cart': 'min', 'ts_order': 'min'})
209 | cart_order_span_feature.columns = ['ts_cart_min', 'ts_order_min']
210 | print(cart_order_span_feature)
211 | cart_order_span_feature['cart_order_span'] = cart_order_span_feature['ts_order_min'] - cart_order_span_feature['ts_cart_min']
212 | print(cart_order_span_feature)
213 | cart_order_span_feature['aids'] = cart_order_span_feature.index.get_level_values('aid')
214 | print(cart_order_span_feature)
215 | print(cart_order_span_feature.index.get_level_values('aid')[:10])
216 | cart_order_span_feature = cart_order_span_feature.groupby('aids').agg({'aids': 'count', 'cart_order_span': 'mean'})
217 | cart_order_span_feature.columns = ['trans_cart_order_count', 'trans_cart_order_span_avg']
218 | print(cart_order_span_feature.describe())
219 | print(cart_order_span_feature)
220 | cart_order_span_feature.to_parquet(output_path3)
221 |
222 |
223 | if __name__ == '__main__':
224 | IS_TRAIN = True
225 | if IS_TRAIN:
226 | stage = 'CV'
227 | else:
228 | stage = 'LB'
229 | input_path = f'/home/niejianfei/otto/{stage}/data/test_parquet/*'
230 | output_path = f'/home/niejianfei/otto/{stage}/preprocess/user_features.pqt'
231 | output_path1 = f'/home/niejianfei/otto/{stage}/preprocess/add_user_features.pqt'
232 | user_features(input_path, output_path)
233 | add_user_features(input_path, output_path1)
234 |
235 | input_path1 = f'/home/niejianfei/otto/{stage}/data/train_parquet/*'
236 | output_path2 = f'/home/niejianfei/otto/{stage}/preprocess/click_cart_span_features.pqt'
237 | output_path3 = f'/home/niejianfei/otto/{stage}/preprocess/click_order_span_features.pqt'
238 | output_path4 = f'/home/niejianfei/otto/{stage}/preprocess/cart_order_span_features.pqt'
239 | trans_time_span_features(input_path1, output_path2, output_path3, output_path4)
--------------------------------------------------------------------------------
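
A minimal, self-contained sketch of the click-to-cart span logic in trans_time_span_features above, run on a toy dataframe (the session/aid/ts values are made up for illustration): keep click-before-cart pairs, take the earliest click and earliest cart per (session, aid), then aggregate the delay per aid.

import pandas as pd

# Toy events: session 1 clicks aid 10 twice (t=100, 130) and carts it at t=160.
clicks = pd.DataFrame({'session': [1, 1], 'aid': [10, 10], 'ts_click': [100, 130]})
carts = pd.DataFrame({'session': [1], 'aid': [10], 'ts_cart': [160]})

span = clicks.merge(carts, on=['session', 'aid'], how='inner')
span = span[span['ts_click'] <= span['ts_cart']]              # click happened before the cart
span = span.groupby(['session', 'aid']).agg({'ts_click': 'min', 'ts_cart': 'min'})
span['click_cart_span'] = span['ts_cart'] - span['ts_click']  # 60 seconds for this toy pair

# Per-aid aggregation: how many sessions converted and the mean click->cart delay.
aid_feature = span.reset_index().groupby('aid')['click_cart_span'].agg(['count', 'mean'])
aid_feature.columns = ['trans_click_cart_count', 'trans_click_cart_span_avg']
print(aid_feature)
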
/features/user_item_features.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import pandas as pd
3 | type_transform = {"clicks": 0, "carts": 1, "orders": 2}
4 | IS_TRAIN = True
5 |
6 |
7 | def load_data(path):
8 | dfs = []
9 |     # read every parquet chunk under the given path
10 | for e, chunk_file in enumerate(glob.glob(path)):
11 | chunk = pd.read_parquet(chunk_file)
12 | chunk.ts = (chunk.ts / 1000).astype('int32')
13 | chunk['type'] = chunk['type'].map(type_transform).astype('int8')
14 | dfs.append(chunk)
15 | return pd.concat(dfs).reset_index(drop=True)
16 |
17 |
18 | def user_item_features(stage, candidate_type):
19 | valid = load_data(f'/home/niejianfei/otto/{stage}/data/test_parquet/*')
20 | for t in candidate_type:
21 |         print('Reading candidates!!!')
22 | candidates = pd.read_parquet(f'/home/niejianfei/otto/{stage}/candidates/candidates_{t}.pqt').reset_index(
23 | drop=True)
24 | candidates = candidates.sort_values('session', ascending=True)
25 | print(candidates)
26 |
27 |         print("Building user_item interaction features!!!")
28 |         # build user_item interaction features
29 |         print("click!!!")
30 |         # whether this session has clicked the item
31 | item_clicked = valid[valid["type"] == 0].drop(columns="ts").drop_duplicates(["session", "aid"])
32 | item_clicked["type"] = 1
33 | item_clicked.columns = ["session", "aid", "item_clicked"]
34 |         # item_clicked count feature
35 | item_clicked_features = valid[valid["type"] == 0].groupby(['session', 'aid']).agg(
36 | {'aid': 'count'})
37 | item_clicked_features.columns = ['item_clicked_num']
38 | item_clicked_features = item_clicked_features.astype('float32')
39 |
40 | print("cart!!!")
41 |         # whether this session has carted the item
42 | item_carted = valid[valid["type"] == 1].drop(columns="ts").drop_duplicates(["session", "aid"])
43 | item_carted["type"] = 1
44 | item_carted.columns = ["session", "aid", "item_carted"]
45 |         # item_carted count feature
46 | item_carted_features = valid[valid["type"] == 1].groupby(['session', 'aid']).agg(
47 | {'aid': 'count'})
48 | item_carted_features.columns = ['item_carted_num']
49 | item_carted_features = item_carted_features.astype('float32')
50 | print("order!!!")
51 |
52 |         # whether this session has ordered the item
53 | item_ordered = valid[valid["type"] == 2].drop(columns="ts").drop_duplicates(["session", "aid"])
54 | item_ordered["type"] = 1
55 | item_ordered.columns = ["session", "aid", "item_ordered"]
56 |         # item_ordered count feature
57 | item_ordered_features = valid[valid["type"] == 2].groupby(['session', 'aid']).agg(
58 | {'aid': 'count'})
59 | item_ordered_features.columns = ['item_ordered_num']
60 | item_ordered_features = item_ordered_features.astype('float32')
61 |
62 |         print("Aggregating data!!!")
63 | item_features = pd.read_parquet(f'/home/niejianfei/otto/{stage}/preprocess/item_features.pqt')
64 |
65 | chunk = 8
66 | size = candidates.shape[0] + 200
67 |         print(f"candidates has {candidates.shape[0]} rows!!!")
68 |         # seconds to the start/end of the session: merge ts onto the candidates, then subtract the session min/max ts
69 |         # deduplicate keeping the last ts per (session, aid); work on a copy (valid_last) so the raw valid frame stays intact for the next candidate type
70 |         valid_last = valid.drop_duplicates(['session', 'aid'], keep='last').drop(columns='type')
71 |         # valid_last['user_item_within'] = 1
72 |         print(valid_last)
73 |
74 | user_features = pd.read_parquet(f'/home/niejianfei/otto/{stage}/preprocess/user_features.pqt')
75 |         valid_last = valid_last.merge(user_features, left_on='session', right_index=True, how='left').fillna(-1000)
76 |
77 |         valid_last['sec_to_session_start'] = valid_last['ts'] - valid_last['user_min_ts']
78 |         valid_last['sec_to_session_end'] = valid_last['user_max_ts'] - valid_last['ts']
79 |         valid_last = valid_last.drop(columns=['user_min_ts', 'user_max_ts', 'ts'])
80 |
81 |         val_session = valid_last[['sec_to_session_start', 'sec_to_session_end', 'user_long_ts']]
82 | print(val_session)
83 | print((val_session['sec_to_session_start'] + val_session['sec_to_session_end'] - val_session['user_long_ts']).max())
84 |
85 | k = size // chunk
86 |         total = 0
87 | for i in range(chunk):
88 |             print(f"Chunk {i + 1}!!!")
89 | print("1!!!")
90 | temp_candidates = candidates.iloc[k * i:k * (i + 1), :]
91 | print(temp_candidates)
92 | # merge user_item interaction features
93 | temp_candidates = temp_candidates.merge(item_clicked, how="left", on=["session", "aid"]).fillna(value=-1)
94 | temp_candidates = temp_candidates.merge(item_clicked_features, how="left", on=["session", "aid"]).fillna(
95 | value=-1)
96 | print(temp_candidates)
97 | print("2!!!")
98 | temp_candidates = temp_candidates.merge(item_carted, how="left", on=["session", "aid"]).fillna(value=-1)
99 | temp_candidates = temp_candidates.merge(item_carted_features, how="left", on=["session", "aid"]).fillna(
100 | value=-1)
101 | print("3!!!")
102 | temp_candidates = temp_candidates.merge(item_ordered, how="left", on=["session", "aid"]).fillna(value=-1)
103 | temp_candidates = temp_candidates.merge(item_ordered_features, how="left", on=["session", "aid"]).fillna(
104 | value=-1)
105 | print(temp_candidates)
106 |             print("Merging item_features!!!")
107 | # Step 5:add features to our candidate dataframe
108 | temp_candidates = temp_candidates.merge(item_features, left_on='aid', right_index=True, how='left').fillna(
109 | -1000)
110 |
111 |             # add the user-item interaction features (seconds to session start/end, last interaction)
112 |             temp_candidates = temp_candidates.merge(valid_last, on=["session", "aid"], how='left').fillna(-1)
113 | print(temp_candidates)
114 |
115 |             temp_candidates.to_parquet(
116 |                 f"/home/niejianfei/otto/{stage}/candidates/candidates_{t[0:-1]}_features_data/candidate_{t[0:-1]}_{i}.pqt")
117 | print(temp_candidates)
118 |             total += len(temp_candidates)
119 |             print(f'Rows in chunk {i + 1}:', len(temp_candidates))
120 |             print('Total rows:', total)
121 |
122 |
123 | if __name__ == '__main__':
124 | IS_TRAIN = True
125 | candidate_type = ['clicks', 'carts', 'orders']
126 | if IS_TRAIN:
127 | stage = 'CV'
128 | else:
129 | stage = 'LB'
130 |
131 | user_item_features(stage, candidate_type)
--------------------------------------------------------------------------------
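
A small illustration (toy values, not from the dataset) of the per-chunk merges above: candidates are left-joined with the per-(session, aid) interaction counts, and fillna(-1) marks candidates the session never interacted with.

import pandas as pd

# Hypothetical candidates and per-(session, aid) click counts.
candidates = pd.DataFrame({'session': [1, 1, 2], 'aid': [10, 11, 10]})
clicked = pd.DataFrame({'session': [1], 'aid': [10], 'item_clicked_num': [3.0]})

# Left merge + fillna(-1) mirrors the per-chunk merges: candidates the session
# never clicked get the sentinel -1 instead of NaN.
out = candidates.merge(clicked, on=['session', 'aid'], how='left').fillna(-1)
print(out)
#    session  aid  item_clicked_num
# 0        1   10               3.0
# 1        1   11              -1.0
# 2        2   10              -1.0
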
/merge_features.py:
--------------------------------------------------------------------------------
1 | from features.recall_features import recall_features
2 | from features.user_item_features import user_item_features
3 | from features.similarity_features import similarity_features
4 | from features.co_visitation_features import co_visitation_features
5 | import pandas as pd
6 |
7 |
8 | def add_labels(candidate_type):
9 | targets = pd.read_parquet('/home/niejianfei/otto/CV/preprocess/test_labels.parquet')
10 | for t in candidate_type:
11 |         print("Adding labels to the data!!!")
12 |         # add labels
13 | temp_target = targets[targets['type'] == t].drop(columns="type")
14 | temp_target = temp_target.explode("ground_truth").astype("int32")
15 | temp_target.columns = ['session', 'aid']
16 | temp_target[t[0:-1]] = 1
17 |
18 |         # load the CV candidate chunks only
19 |         print('Loading data')
20 | for i in range(0, 8):
21 | path = f"/home/niejianfei/otto/CV/candidates/candidates_{t[0:-1]}_features_data/candidate_{t[0:-1]}_{i}.pqt"
22 |             print(f'Chunk {i + 1}')
23 | chunk = pd.read_parquet(path)
24 | print(path)
25 | print(chunk.columns)
26 |             # add labels; candidates without a ground-truth hit get 0
27 |             chunk = chunk.merge(temp_target, on=['session', 'aid'], how='left').fillna(value=0)
28 | print(chunk)
29 | chunk.to_parquet(path)
30 |
31 |
32 | if __name__ == '__main__':
33 | IS_TRAIN = True
34 | candidate_type = ['clicks', 'carts', 'orders']
35 | if IS_TRAIN:
36 | stage = 'CV'
37 | else:
38 | stage = 'LB'
39 |
40 | recall_features(stage, candidate_type)
41 | user_item_features(stage, candidate_type)
42 | similarity_features(stage, candidate_type, 0, 8)
43 | co_visitation_features(stage, candidate_type, 0, 8)
44 |     # add labels for the CV candidates
45 | if IS_TRAIN:
46 | add_labels(candidate_type)
--------------------------------------------------------------------------------
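
A toy walk-through (made-up session and aids) of what add_labels does: the ground_truth lists from test_labels are exploded to one row per aid, labelled 1, and left-joined onto the candidates so every other candidate becomes a 0-labelled negative.

import pandas as pd

# Hypothetical test_labels rows: one list of ground-truth aids per session and type.
targets = pd.DataFrame({'session': [1], 'type': ['carts'], 'ground_truth': [[10, 12]]})

t = 'carts'
temp = targets[targets['type'] == t].drop(columns='type')
temp = temp.explode('ground_truth').astype('int32')   # one row per ground-truth aid
temp.columns = ['session', 'aid']
temp[t[0:-1]] = 1                                      # positive label column named "cart"

candidates = pd.DataFrame({'session': [1, 1], 'aid': [10, 11]})
labelled = candidates.merge(temp, on=['session', 'aid'], how='left').fillna(value=0)
print(labelled)   # aid 10 -> cart = 1, aid 11 -> cart = 0
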
/predict.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from ranker import generate_submission
3 | from ranker import user_sample
4 |
5 |
6 | def submission(candidate_type):
7 | sub = pd.DataFrame()
8 | for t in candidate_type:
9 | df = pd.read_parquet(f'/home/niejianfei/otto/LB/submission/sub_{t}.pqt')
10 | df = df.loc[df.session_type.str.contains(t)]
11 | sub = sub.append(df)
12 | return sub
13 |
14 |
15 | if __name__ == '__main__':
16 | candidate_type = ['clicks', 'carts', 'orders']
17 |     generate_submission('test', 'LB', candidate_type, None, 'final_all_data')  # the LB stage does not split sessions
18 |
19 | submission_final = submission(candidate_type)
20 | submission_final.to_csv(f'/home/niejianfei/otto/LB/submission/submission_final.csv', index=False)
21 |
--------------------------------------------------------------------------------
/preprocess/BPRMF_ALSMF_LMF_prepare.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import numpy as np
3 | import pandas as pd
4 | import scipy.sparse as sparse
5 | import implicit
6 | IS_TRAIN = True
7 | type_transform = {"clicks": 0, "carts": 1, "orders": 2}
8 |
9 |
10 | def load_data(path):
11 | dfs = []
12 |     # read every parquet chunk under the given path
13 | for e, chunk_file in enumerate(glob.glob(path)):
14 | chunk = pd.read_parquet(chunk_file)
15 | chunk.ts = (chunk.ts / 1000).astype('int32')
16 | chunk['type'] = chunk['type'].map(type_transform).astype('int8')
17 | dfs.append(chunk)
18 | return pd.concat(dfs).reset_index(drop=True)
19 |
20 |
21 | if IS_TRAIN:
22 | test_df = load_data('/home/niejianfei/otto/CV/data/*_parquet/*')
23 | else:
24 | test_df = load_data('/home/niejianfei/otto/LB/data/*_parquet/*')
25 |
26 |
27 | dic1 = {0: 1, 1: 5, 2: 4}
28 | test_df['type'] = test_df['type'].map(dic1)
29 | grouped_df = test_df.groupby(['session', 'aid']).sum().reset_index()
30 |
31 | # sparse_content_person = sparse.csr_matrix(
32 | # (grouped_df['type'].astype(float), (grouped_df['aid'], grouped_df['session'])))
33 | sparse_person_content = sparse.csr_matrix(
34 | (grouped_df['type'].astype(float), (grouped_df['session'], grouped_df['aid'])))
35 |
36 | print(sparse_person_content.shape)
37 | # print(sparse_person_content.shape)
38 |
39 | alpha = 15
40 | sparse_person_content = (sparse_person_content * alpha).astype('double')
41 |
42 | # from implicit.nearest_neighbours import bm25_weight
43 | # # weight the matrix, both to reduce impact of users that have played the same artist thousands of times
44 | # # and to reduce the weight given to popular items
45 | # artist_user_plays = bm25_weight(sparse_person_content, K1=100, B=0.8)
46 |
47 | model1 = implicit.bpr.BayesianPersonalizedRanking(factors=64, regularization=0.1)
48 | model2 = implicit.als.AlternatingLeastSquares(factors=64, regularization=0.1)
49 | model3 = implicit.lmf.LogisticMatrixFactorization(factors=64, regularization=0.6)
50 |
51 | models = [model1, model2, model3]
52 | names = ['bpr', 'als', 'lmf']
53 |
54 | for model, name in zip(models, names):
55 | model.fit(sparse_person_content)
56 | user_emb = model.user_factors.to_numpy()
57 | print("user")
58 | print(user_emb[0], len(user_emb))
59 | print("item")
60 | item_emb = model.item_factors.to_numpy()
61 | print(item_emb[0], len(item_emb))
62 | print('save')
63 | if IS_TRAIN:
64 | stage = 'CV'
65 | else:
66 | stage = 'LB'
67 | np.save(f'/home/niejianfei/otto/{stage}/preprocess/{name}_user_emb', user_emb)
68 | np.save(f'/home/niejianfei/otto/{stage}/preprocess/{name}_item_emb', item_emb)
69 |
70 |
71 |
72 |
73 |
--------------------------------------------------------------------------------
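
The saved factors are what the BPRMF/ALSMF/LMF user-item similarity features (listed in the README as tried but not helpful) were built from. A minimal sketch of turning them into a score, assuming the CV paths written by the np.save calls above (np.save appends the .npy suffix) and using session/aid ids directly as row indices, which holds here because the sparse matrix is indexed by raw session and aid ids:

import numpy as np

stage, name = 'CV', 'bpr'   # assumed stage and model name, matching the np.save calls above
user_emb = np.load(f'/home/niejianfei/otto/{stage}/preprocess/{name}_user_emb.npy')
item_emb = np.load(f'/home/niejianfei/otto/{stage}/preprocess/{name}_item_emb.npy')

def mf_similarity(session, aid):
    """Cosine similarity between one session factor and one item factor."""
    u, v = user_emb[session], item_emb[aid]
    denom = np.linalg.norm(u) * np.linalg.norm(v)
    return float(np.dot(u, v) / denom) if denom > 0 else 0.0

# Hypothetical ids; real feature code would score every (session, candidate aid) pair.
print(mf_similarity(0, 0))
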
/preprocess/ProNE_prepare.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import pickle
3 | import pandas as pd
4 | import numpy as np
5 | IS_TRAIN = True
6 |
7 |
8 | def load_data(path):
9 | type_transform = {"clicks": 0, "carts": 1, "orders": 2}
10 | dfs = []
11 |     # read every parquet chunk under the given path
12 | for e, chunk_file in enumerate(glob.glob(path)):
13 | chunk = pd.read_parquet(chunk_file)
14 | chunk.ts = (chunk.ts / 1000).astype('int32')
15 | chunk['type'] = chunk['type'].map(type_transform).astype('int8')
16 | dfs.append(chunk)
17 | return pd.concat(dfs).reset_index(drop=True)
18 |
19 |
20 | # load the data
21 | print('Loading data')
22 |
23 | if IS_TRAIN:
24 | train_sessions = load_data('/home/niejianfei/otto/CV/data/*_parquet/*')
25 | else:
26 | train_sessions = load_data('/home/niejianfei/otto/LB/data/*_parquet/*')
27 | print(train_sessions)
28 |
29 | dic = pd.DataFrame(train_sessions.drop_duplicates(['aid']).sort_values(by='aid', ascending=True)['aid'])
30 | dic['num'] = range(len(dic))
31 | dic.index = dic['aid']
32 | dic = dic.drop(columns='aid').to_dict()['num']
33 | # print(dic)
34 |
35 | # save the aid -> node-id mapping to disk
36 | if IS_TRAIN:
37 | f_save = open('/home/niejianfei/otto/CV/preprocess/aid_num_dict.pkl', 'wb')
38 | pickle.dump(dic, f_save)
39 | f_save.close()
40 | else:
41 | f_save = open('/home/niejianfei/otto/LB/preprocess/aid_num_dict.pkl', 'wb')
42 | pickle.dump(dic, f_save)
43 | f_save.close()
44 | print("aid_num mapping saved!!!")
45 |
46 |
47 | def generate_pairs(df):
48 | df = df.sort_values(by=['session', 'ts'])
49 | print(df)
50 | df['aid'] = df['aid'].map(dic)
51 | print(df)
52 |
53 | print('count 1')
54 | df['session_count'] = df['session'].map(df['session'].value_counts())
55 | print(df)
56 | df1 = df[df['session_count'] == 1]
57 | df = df.append(df1)
58 | print('count 2')
59 | df['session_count'] = df['session'].map(df['session'].value_counts())
60 | print(df['session_count'].min())
61 | print(df)
62 |
63 | df = df.sort_values(by=['session', 'ts'])
64 | df['ranking'] = df.groupby(['session'])['ts'].rank(method='first', ascending=True)
65 | print(df)
66 | df['aid_next'] = df['aid'].shift(-1)
67 | print(df)
68 | df = df.query('session_count!=ranking').reset_index(drop=True)
69 |
70 | df['aid_next'] = df['aid_next'].astype('int32')
71 | print(df)
72 | df = df[['aid', 'aid_next']]
73 | print(df)
74 | pairs_list = np.array(df)
75 | return pairs_list
76 |
77 |
78 | pairs_list = generate_pairs(train_sessions).tolist()
79 | print(pairs_list[:10])
80 |
81 | if IS_TRAIN:
82 | f = open('/home/niejianfei/otto/CV/preprocess/session_pairs.ungraph', "w")
83 | for line in pairs_list:
84 | f.write(str(line[0]) + ' ' + str(line[1]) + '\n')
85 | f.close()
86 | else:
87 | f = open('/home/niejianfei/otto/LB/preprocess/session_pairs.ungraph', "w")
88 | for line in pairs_list:
89 | f.write(str(line[0]) + ' ' + str(line[1]) + '\n')
90 | f.close()
91 |
--------------------------------------------------------------------------------
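
The two files written above are the inputs for the ProNE graph-embedding step: a pickle mapping each original aid to a contiguous node id, and a plain-text edge list with one "src dst" pair per line. A minimal sketch of reading them back (CV paths assumed, mirroring the files written above):

import pickle

with open('/home/niejianfei/otto/CV/preprocess/aid_num_dict.pkl', 'rb') as f:
    aid2node = pickle.load(f)

# Reverse mapping: node id -> original aid, needed to look embeddings up later.
node2aid = {v: k for k, v in aid2node.items()}

with open('/home/niejianfei/otto/CV/preprocess/session_pairs.ungraph') as f:
    for i, line in enumerate(f):
        src, dst = map(int, line.split())
        print(node2aid[src], '->', node2aid[dst])   # edge expressed in original aids
        if i >= 4:
            break
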
/preprocess/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/niejianfei/Kaggle_OTTO_Multi-Objective_Recommender_System/3d1f7bff40891628f7a2edd2b31d6a40011aa38a/preprocess/__init__.py
--------------------------------------------------------------------------------
/preprocess/co-visitation_matrix_prepare.py:
--------------------------------------------------------------------------------
1 | import gc
2 | import glob
3 | import cudf
4 | import numpy as np
5 | import pandas as pd
6 |
7 | print('We will use RAPIDS version', cudf.__version__)
8 | VER = 6
9 | type_weight = {0: 1, 1: 5, 2: 4}
10 | IS_TRAIN = True
11 | use_all_data = True
12 |
13 |
14 | # CACHE FUNCTIONS
15 | # given a file path, move the cached CPU dataframe onto the GPU
16 | def read_file(f):
17 | return cudf.DataFrame(data_cache[f])
18 |
19 |
20 | def read_file_to_cache(f):
21 | df = pd.read_parquet(f)
22 | df.ts = (df.ts / 1000).astype('int32')
23 | if not use_all_data:
24 |         # drop the first week of data
25 | df = df[df['ts'] >= 1659909599]
26 | df['type'] = df['type'].map(type_labels).astype('int8')
27 | return df
28 |
29 |
30 | # CACHE THE DATA ON CPU BEFORE PROCESSING ON GPU
31 | # dictionary cached in CPU RAM
32 | data_cache = {}
33 | type_labels = {'clicks': 0, 'carts': 1, 'orders': 2}
34 | if IS_TRAIN:
35 |     # glob collects the matching file paths into a list
36 |     files = glob.glob('/home/niejianfei/otto/CV/data/*_parquet/*')
37 |     # cache each file in CPU RAM: key = file path, value = its dataframe
38 |     for f in files: data_cache[f] = read_file_to_cache(f)
39 | else:
40 |     # glob collects the matching file paths into a list
41 |     files = glob.glob('/home/niejianfei/otto/LB/data/*_parquet/*')
42 |     # cache each file in CPU RAM: key = file path, value = its dataframe
43 |     for f in files: data_cache[f] = read_file_to_cache(f)
44 |
45 | # CHUNK PARAMETERS
46 | # read the files in groups of 5
47 | READ_CT = 5
48 | # ceil splits the files into 6 outer chunks of CHUNK files each
49 | CHUNK = int(np.ceil(len(files) / 6))
50 | print(f'We will process {len(files)} files, in groups of {READ_CT} and chunks of {CHUNK}.')
51 |
52 | # USE SMALLEST DISK_PIECES POSSIBLE WITHOUT MEMORY ERROR
53 | DISK_PIECES = 4
54 | # number of items divided by the number of disk pieces
55 | SIZE = 1.86e6 / DISK_PIECES
56 |
57 | # "Carts Orders" Co-visitation Matrix - Type Weighted
58 | # COMPUTE IN PARTS FOR MEMORY MANGEMENT
59 | # compute in parts with a for loop
60 | for PART in range(DISK_PIECES):  # each pass covers one quarter of the ~1.8M items across all files
61 | print()
62 | print('### DISK PART', PART + 1)
63 |
64 | # MERGE IS FASTEST PROCESSING CHUNKS WITHIN CHUNKS
65 | # => OUTER CHUNKS
66 |     # the ~150 files are split into 6 outer chunks of ~25 files each
67 | for j in range(6): # 6 * 5 *5 = 30 * 5 = 150
68 | a = j * CHUNK
69 | b = min((j + 1) * CHUNK, len(files))
70 | print(f'Processing files {a} thru {b - 1} in groups of {READ_CT}...')
71 |
72 | # => INNER CHUNKS
73 |         # each outer chunk is read in groups of READ_CT = 5 files; start with the first group
74 | for k in range(a, b, READ_CT):
75 | # READ FILE
76 |             # read onto the GPU; df starts as a list of dataframes
77 |             df = [read_file(files[k])]
78 |             for i in range(1, READ_CT):  # append up to 4 more files to the GPU list
79 |                 if k + i < b: df.append(read_file(files[k + i]))
80 |             # concatenate the dataframes
81 | df = cudf.concat(df, ignore_index=True, axis=0)
82 |             # sort session ascending, ts descending
83 | df = df.sort_values(['session', 'ts'], ascending=[True, False])
84 |
85 | # USE TAIL OF SESSION
86 | df = df.reset_index(drop=True)
87 |             # number the events within each session from 0 to count-1 (ts is descending, so 0 is the most recent)
88 |             df['n'] = df.groupby('session').cumcount()
89 |             # keep only the 30 most recent events of each session
90 |             df = df.loc[df.n < 30].drop('n', axis=1)
91 |
92 | # CREATE PAIRS
93 | df = df.merge(df, on='session')
94 |             # build item pairs viewed by the same user less than one day apart
95 | df = df.loc[((df.ts_x - df.ts_y).abs() < 24 * 60 * 60) & (df.aid_x != df.aid_y)]
96 |
97 | # MEMORY MANAGEMENT COMPUTE IN PARTS
98 |             # memory management: filter to the current PART of the ~1,800,000 items (SIZE = items / DISK_PIECES)
99 | df = df.loc[(df.aid_x >= PART * SIZE) & (df.aid_x < (PART + 1) * SIZE)]
100 |
101 | # ASSIGN WEIGHTS
102 |             # keep only session, the item pair and type, then deduplicate
103 |             df = df[['session', 'aid_x', 'aid_y', 'type_y']].drop_duplicates(['session', 'aid_x', 'aid_y', 'type_y'])
104 |             # weight by the type of the merged aid_y (the type of aid_x is not used)
105 |             df['wgt'] = df.type_y.map(type_weight)
106 |             # drop the session and type columns
107 |             df = df[['aid_x', 'aid_y', 'wgt']]
108 |             df.wgt = df.wgt.astype('float32')
109 |             # sum the weights per item pair (clicks/carts/orders weighted 1/5/4)
110 | df = df.groupby(['aid_x', 'aid_y']).wgt.sum()
111 | # print(df)
112 | # COMBINE INNER CHUNKS
113 | if k == a:
114 | tmp2 = df
115 | else:
116 | tmp2 = tmp2.add(df, fill_value=0)
117 | print(k, ', ', end='')
118 |
119 | print()
120 |
121 | # COMBINE OUTER CHUNKS
122 | if a == 0:
123 | tmp = tmp2
124 | else:
125 | tmp = tmp.add(tmp2, fill_value=0)
126 | del tmp2, df
127 | gc.collect()
128 |
129 | # CONVERT MATRIX TO DICTIONARY
130 | tmp = tmp.reset_index()
131 | tmp = tmp.sort_values(['aid_x', 'wgt'], ascending=[True, False])
132 | print(tmp)
133 |     # SAVE TOP 50
134 | tmp = tmp.reset_index(drop=True)
135 | tmp['n'] = tmp.groupby('aid_x').aid_y.cumcount()
136 | print(tmp)
137 | tmp = tmp.loc[tmp.n < 50]
138 | print(tmp)
139 | # SAVE PART TO DISK (convert to pandas first uses less memory)
140 | if IS_TRAIN:
141 | tmp.to_pandas().to_parquet(f'/home/niejianfei/otto/CV/preprocess/top_15_carts_orders_v{VER}_{PART}.pqt')
142 | else:
143 | if use_all_data:
144 | tmp.to_pandas().to_parquet(
145 | f'/home/niejianfei/otto/LB/preprocess/all_data_top_15_carts_orders_v{VER}_{PART}.pqt')
146 | else:
147 | tmp.to_pandas().to_parquet(
148 | f'/home/niejianfei/otto/LB/preprocess/top_15_carts_orders_v{VER}_{PART}.pqt')
149 |
150 | # 2."Buy2Buy" Co-visitation Matrix
151 | # USE SMALLEST DISK_PIECES POSSIBLE WITHOUT MEMORY ERROR
152 | DISK_PIECES = 1
153 | SIZE = 1.86e6 / DISK_PIECES
154 |
155 | # COMPUTE IN PARTS FOR MEMORY MANGEMENT
156 | for PART in range(DISK_PIECES):
157 | print()
158 | print('### DISK PART', PART + 1)
159 |
160 | # MERGE IS FASTEST PROCESSING CHUNKS WITHIN CHUNKS
161 | # => OUTER CHUNKS
162 | for j in range(6):
163 | a = j * CHUNK
164 | b = min((j + 1) * CHUNK, len(files))
165 | print(f'Processing files {a} thru {b - 1} in groups of {READ_CT}...')
166 |
167 | # => INNER CHUNKS
168 | for k in range(a, b, READ_CT):
169 |
170 | # READ FILE
171 | df = [read_file(files[k])]
172 | for i in range(1, READ_CT):
173 | if k + i < b: df.append(read_file(files[k + i]))
174 | df = cudf.concat(df, ignore_index=True, axis=0)
175 | df = df.loc[df['type'].isin([1, 2])] # ONLY WANT CARTS AND ORDERS
176 | df = df.sort_values(['session', 'ts'], ascending=[True, False])
177 |
178 | # USE TAIL OF SESSION
179 | df = df.reset_index(drop=True)
180 | df['n'] = df.groupby('session').cumcount()
181 | df = df.loc[df.n < 30].drop('n', axis=1)
182 |
183 | # CREATE PAIRS
184 | df = df.merge(df, on='session')
185 | df = df.loc[((df.ts_x - df.ts_y).abs() < 14 * 24 * 60 * 60) & (df.aid_x != df.aid_y)] # 14 DAYS
186 |
187 | # MEMORY MANAGEMENT COMPUTE IN PARTS
188 | df = df.loc[(df.aid_x >= PART * SIZE) & (df.aid_x < (PART + 1) * SIZE)]
189 |
190 | # ASSIGN WEIGHTS
191 | df = df[['session', 'aid_x', 'aid_y', 'type_y']].drop_duplicates(['session', 'aid_x', 'aid_y', 'type_y'])
192 | df['wgt'] = 1
193 | df = df[['aid_x', 'aid_y', 'wgt']]
194 | df.wgt = df.wgt.astype('float32')
195 | df = df.groupby(['aid_x', 'aid_y']).wgt.sum()
196 |
197 | # COMBINE INNER CHUNKS
198 | if k == a:
199 | tmp2 = df
200 | else:
201 | tmp2 = tmp2.add(df, fill_value=0)
202 | print(k, ', ', end='')
203 |
204 | print()
205 |
206 | # COMBINE OUTER CHUNKS
207 | if a == 0:
208 | tmp = tmp2
209 | else:
210 | tmp = tmp.add(tmp2, fill_value=0)
211 | del tmp2, df
212 | gc.collect()
213 |
214 | # CONVERT MATRIX TO DICTIONARY
215 | tmp = tmp.reset_index()
216 | tmp = tmp.sort_values(['aid_x', 'wgt'], ascending=[True, False])
217 |
218 |     # SAVE TOP 50
219 | tmp = tmp.reset_index(drop=True)
220 | tmp['n'] = tmp.groupby('aid_x').aid_y.cumcount()
221 | tmp = tmp.loc[tmp.n < 50]
222 | # SAVE PART TO DISK (convert to pandas first uses less memory)
223 | if IS_TRAIN:
224 | tmp.to_pandas().to_parquet(f'/home/niejianfei/otto/CV/preprocess/top_15_buy2buy_v{VER}_{PART}.pqt')
225 | else:
226 | if use_all_data:
227 | tmp.to_pandas().to_parquet(
228 | f'/home/niejianfei/otto/LB/preprocess/all_data_top_15_buy2buy_v{VER}_{PART}.pqt')
229 | else:
230 | tmp.to_pandas().to_parquet(f'/home/niejianfei/otto/LB/preprocess/top_15_buy2buy_v{VER}_{PART}.pqt')
231 |
232 | # 3."Clicks" Co-visitation Matrix - Time Weighted
233 | # USE SMALLEST DISK_PIECES POSSIBLE WITHOUT MEMORY ERROR
234 | DISK_PIECES = 4
235 | SIZE = 1.86e6 / DISK_PIECES
236 |
237 | # COMPUTE IN PARTS FOR MEMORY MANGEMENT
238 | for PART in range(DISK_PIECES):
239 | print()
240 | print('### DISK PART', PART + 1)
241 |
242 | # MERGE IS FASTEST PROCESSING CHUNKS WITHIN CHUNKS
243 | # => OUTER CHUNKS
244 | for j in range(6):
245 | a = j * CHUNK
246 | b = min((j + 1) * CHUNK, len(files))
247 | print(f'Processing files {a} thru {b - 1} in groups of {READ_CT}...')
248 |
249 | # => INNER CHUNKS
250 | for k in range(a, b, READ_CT):
251 | # READ FILE
252 | df = [read_file(files[k])]
253 | for i in range(1, READ_CT):
254 | if k + i < b: df.append(read_file(files[k + i]))
255 | df = cudf.concat(df, ignore_index=True, axis=0)
256 | df = df.sort_values(['session', 'ts'], ascending=[True, False])
257 |
258 | # USE TAIL OF SESSION
259 | df = df.reset_index(drop=True)
260 | df['n'] = df.groupby('session').cumcount()
261 | df = df.loc[df.n < 30].drop('n', axis=1)
262 |
263 | # CREATE PAIRS
264 | df = df.merge(df, on='session')
265 | df = df.loc[((df.ts_x - df.ts_y).abs() < 24 * 60 * 60) & (df.aid_x != df.aid_y)]
266 |
267 | # MEMORY MANAGEMENT COMPUTE IN PARTS
268 | df = df.loc[(df.aid_x >= PART * SIZE) & (df.aid_x < (PART + 1) * SIZE)]
269 |
270 | # ASSIGN WEIGHTS
271 | df = df[['session', 'aid_x', 'aid_y', 'ts_x']].drop_duplicates(['session', 'aid_x', 'aid_y'])
272 |             df['wgt'] = 1 + 3 * (df.ts_x - 1659304800) / (1662328791 - 1659304800)  # normalize so more recent events get larger weights
273 | # 1659304800 : minimum timestamp
274 | # 1662328791 : maximum timestamp
275 | df = df[['aid_x', 'aid_y', 'wgt']]
276 | df.wgt = df.wgt.astype('float32')
277 | df = df.groupby(['aid_x', 'aid_y']).wgt.sum()
278 |
279 | # COMBINE INNER CHUNKS
280 | if k == a:
281 | tmp2 = df
282 | else:
283 | tmp2 = tmp2.add(df, fill_value=0)
284 | print(k, ', ', end='')
285 | print()
286 |
287 | # COMBINE OUTER CHUNKS
288 | if a == 0:
289 | tmp = tmp2
290 | else:
291 | tmp = tmp.add(tmp2, fill_value=0)
292 | del tmp2, df
293 | gc.collect()
294 |
295 | # CONVERT MATRIX TO DICTIONARY
296 | tmp = tmp.reset_index()
297 | tmp = tmp.sort_values(['aid_x', 'wgt'], ascending=[True, False])
298 |
299 |     # SAVE TOP 50
300 | tmp = tmp.reset_index(drop=True)
301 | tmp['n'] = tmp.groupby('aid_x').aid_y.cumcount()
302 | tmp = tmp.loc[tmp.n < 50]
303 |
304 | # SAVE PART TO DISK (convert to pandas first uses less memory)
305 | if IS_TRAIN:
306 | tmp.to_pandas().to_parquet(f'/home/niejianfei/otto/CV/preprocess/top_20_clicks_v{VER}_{PART}.pqt')
307 | else:
308 | if use_all_data:
309 | tmp.to_pandas().to_parquet(
310 | f'/home/niejianfei/otto/LB/preprocess/all_data_top_20_clicks_v{VER}_{PART}.pqt')
311 | else:
312 | tmp.to_pandas().to_parquet(f'/home/niejianfei/otto/LB/preprocess/top_20_clicks_v{VER}_{PART}.pqt')
313 |
--------------------------------------------------------------------------------
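
Downstream recall and the co-visitation weight features consume these parquet parts as a top-k lookup. A minimal sketch of loading them back into an aid_x -> list-of-aid_y dictionary (CV paths, VER = 6 and DISK_PIECES = 4 as in the "carts orders" matrix above are assumed):

import pandas as pd

VER, DISK_PIECES = 6, 4
parts = [pd.read_parquet(f'/home/niejianfei/otto/CV/preprocess/top_15_carts_orders_v{VER}_{k}.pqt')
         for k in range(DISK_PIECES)]
co_vis = pd.concat(parts, ignore_index=True)

# Each part is already sorted by (aid_x, wgt desc) and truncated to n < 50, so a
# simple groupby yields the neighbour lists used at recall time.
top_k = co_vis.groupby('aid_x').aid_y.apply(list).to_dict()
some_aid = next(iter(top_k))
print(some_aid, top_k[some_aid][:10])
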
/preprocess/deepwalk_prepare.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import pandas as pd
3 | from tqdm import tqdm
4 | from collections import defaultdict
5 | from gensim.models import Word2Vec
6 | import numpy as np
7 |
8 | type_transform = {"clicks": 0, "carts": 1, "orders": 2}
9 | IS_TRAIN = True
10 | IS_Last_Month = True
11 |
12 |
13 | def load_data(path):
14 | dfs = []
15 |     # read every parquet chunk under the given path
16 | for e, chunk_file in enumerate(glob.glob(path)):
17 | chunk = pd.read_parquet(chunk_file)
18 | chunk.ts = (chunk.ts / 1000).astype('int32')
19 | # if not IS_TRAIN:
20 |         #     # drop the first week of data
21 | # chunk = chunk[chunk['ts'] >= 1659909599]
22 | chunk['type'] = chunk['type'].map(type_transform).astype('int8')
23 | dfs.append(chunk)
24 | return pd.concat(dfs).reset_index(drop=True)
25 |
26 |
27 | # load the data
28 | print('Loading data')
29 | if IS_TRAIN:
30 | if IS_Last_Month:
31 | train_sessions = load_data('/home/niejianfei/otto/CV/data/*_parquet/*')
32 | print(train_sessions)
33 | else:
34 | train_sessions = load_data('/home/niejianfei/otto/CV/data/test_parquet/*')
35 | print(train_sessions)
36 | else:
37 | if IS_Last_Month:
38 | train_sessions = load_data('/home/niejianfei/otto/LB/data/*_parquet/*')
39 | print(train_sessions)
40 | else:
41 | train_sessions = load_data('/home/niejianfei/otto/LB/data/test_parquet/*')
42 | print(train_sessions)
43 |
44 | print('Sorting')
45 | # sort the events of each session by timestamp
46 | df = train_sessions.sort_values(by=["session", "ts"], ascending=True)
47 | print(df.head(10))
48 |
49 | print('Building the graph')
50 | # build the bipartite user-item graph
51 | dic = defaultdict(list)  # defaultdict returns an empty list for unseen keys
52 | # the prefixes distinguish items from users
53 | for x in tqdm(df[["session", "aid"]].values):
54 |     dic[f"user_{x[0]}"].append(f"item_{x[1]}")  # the lists preserve the interaction order
55 | dic[f"item_{x[1]}"].append(f"user_{x[0]}")
56 |
57 | # random walks
58 | print('Starting the random walks')
59 | # from the current item, pick one of its sessions, then step to the item that follows it in that session
60 | # precompute the adjacency-list length of every user and item
61 | dic_count = {}
62 | for key in dic:
63 | dic_count[key] = len(dic[key])
64 |
65 | item_list = df["aid"].unique()
66 | user_list = df["session"].unique()
67 | print('number of items', len(item_list))
68 | print('number of users', len(user_list))
69 |
70 | path_length = 20
71 | sentences = []
72 | num_sentences = 20000000  # 500k+ walks are recommended in practice (for ~20k items)
73 | '''
74 | bad case:
75 | item_a : session_1
76 | session_1 : [item_b, item_a]
77 | a max_repeat_time is needed to avoid an infinite loop
78 | '''
79 |
80 | max_repeat_nums = path_length * 2
81 | for _ in tqdm(range(num_sentences)):
82 | start_item = "item_{}".format(item_list[np.random.randint(0, len(item_list))])
83 | sentence = [start_item]
84 | repeat_time = 0
85 | while len(sentence) < path_length:
86 | last_item = sentence[-1]
87 |         random_user = dic[last_item][np.random.randint(0, dic_count[last_item])]  # pick a random user who interacted with the last item
88 |         # if the same item appears twice in a row, moving one position forward skips past it; re-sampling a session can still escape, though closed loops do occur in the graph
89 |         next_item_index = np.where(np.array(dic[random_user]) == last_item)[0][
90 |             0] + 1  # index of last_item in random_user's items, plus one
91 |         # if the item is not the user's last one, append the item that follows it
92 |         # if it is the last one, do nothing and keep looping (this is the bad case above)
93 | if next_item_index <= dic_count[random_user] - 1:
94 | next_item = dic[random_user][next_item_index]
95 | sentence.append(next_item)
96 | repeat_time += 1
97 | if repeat_time > max_repeat_nums:
98 | break
99 | sentences.append(sentence)
100 |
101 | # embedding_dimensions = number_of_categories**0.25
102 | model = Word2Vec(sentences, vector_size=64, sg=1, window=5, min_count=1, hs=1, negative=5, sample=0.001, workers=4)
103 | # save the model
104 | if IS_TRAIN:
105 | if IS_Last_Month:
106 | model.wv.save_word2vec_format('/home/niejianfei/otto/CV/preprocess/deepwalk_last_month.w2v', binary=False)
107 | else:
108 | model.wv.save_word2vec_format('/home/niejianfei/otto/CV/preprocess/deepwalk_last_week.w2v', binary=False)
109 | else:
110 | if IS_Last_Month:
111 | model.wv.save_word2vec_format('/home/niejianfei/otto/LB/preprocess/deepwalk_last_month.w2v', binary=False)
112 | else:
113 | model.wv.save_word2vec_format('/home/niejianfei/otto/LB/preprocess/deepwalk_last_week.w2v', binary=False)
114 |
--------------------------------------------------------------------------------
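
The README's similarity features compare a candidate with the user's most recent aids in this embedding space. A minimal sketch of how the saved vectors can be queried with gensim (the CV last-month path from the save call above is assumed; the aids are made up):

from gensim.models import KeyedVectors

wv = KeyedVectors.load_word2vec_format(
    '/home/niejianfei/otto/CV/preprocess/deepwalk_last_month.w2v', binary=False)

def deepwalk_similarity(candidate_aid, recent_aids):
    """Mean cosine similarity between the candidate and the user's last aids."""
    cand = f'item_{candidate_aid}'
    if cand not in wv:
        return 0.0
    sims = [wv.similarity(cand, f'item_{a}') for a in recent_aids if f'item_{a}' in wv]
    return float(sum(sims) / len(sims)) if sims else 0.0

# Hypothetical aids; real calls would use the tail of a session's aid sequence.
print(deepwalk_similarity(12345, [11111, 22222, 33333]))
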
/ranker.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import pandas as pd
3 | import xgboost as xgb
4 | import numpy as np
5 | from sklearn.model_selection import GroupKFold
6 | from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
7 |
8 |
9 | def load_data(path):
10 | type_transform = {"clicks": 0, "carts": 1, "orders": 2}
11 | dfs = []
12 |     # read every parquet chunk under the given path
13 | for e, chunk_file in enumerate(glob.glob(path)):
14 | chunk = pd.read_parquet(chunk_file)
15 | chunk.ts = (chunk.ts / 1000).astype('int32')
16 | chunk['type'] = chunk['type'].map(type_transform).astype('int8')
17 | dfs.append(chunk)
18 | return pd.concat(dfs).reset_index(drop=True)
19 |
20 |
21 | def load_train_data(t, semi_sessions):
22 | dfs = []
23 |     # load the candidate feature chunks used for training
24 |     k = 0
25 |     print('Loading data')
26 | for i in range(0, 8):
27 | path = f"/home/niejianfei/otto/CV/candidates/candidates_{t[0:-1]}_features_data/candidate_{t[0:-1]}_{i}.pqt"
28 |         print(f'Chunk {i + 1}')
29 | chunk = pd.read_parquet(path)
30 | print(path)
31 | print(chunk)
32 |
33 | train = chunk[chunk.session.isin(semi_sessions)]
34 |
35 | chunk = train.astype("float32")
36 | chunk['session'] = chunk['session'].astype('int32')
37 | chunk['aid'] = chunk['aid'].astype('int32')
38 |
39 | chunk_pos = chunk[chunk[t[0:-1]] == 1].sort_values(by='session', ascending=True)
40 |         print('positives', len(chunk_pos))
41 | chunk_neg = chunk[chunk[t[0:-1]] == 0].sample(len(chunk_pos) * 30, random_state=random_state)
42 | chunk = chunk_neg.append(chunk_pos).sort_values(by='session', ascending=True)
43 | dfs.append(chunk)
44 | print(len(chunk))
45 | k += len(chunk_pos)
46 |     print('total positives:', k)
47 | return pd.concat(dfs).reset_index(drop=True)
48 |
49 |
50 | # training
51 | def train_xgb(candidate_type, semi_sessions, describe):
52 | for t in candidate_type:
53 | candidates = load_train_data(t, semi_sessions)
54 | print(candidates)
55 |
56 |         # train
57 |         candidates = candidates.sort_values(by='session', ascending=True).reset_index(drop=True)
58 | FEATURES = candidates.columns[0:-1]
59 | print(FEATURES)
60 |
61 | skf = GroupKFold(n_splits=5)
62 | for fold, (train_idx, valid_idx) in enumerate(
63 | skf.split(candidates, candidates[t[0:-1]], groups=candidates['session'])):
64 |             # loc uses label-based indexing; the index was reset after sorting, so labels match the fold positions
65 | X_train_ = candidates.loc[train_idx, FEATURES]
66 | X_train = X_train_.drop(columns=['session', 'aid'])
67 | y_train = candidates.loc[train_idx, t[0:-1]]
68 |
69 | X_valid_ = candidates.loc[valid_idx, FEATURES]
70 | X_valid = X_valid_.drop(columns=['session', 'aid'])
71 | y_valid = candidates.loc[valid_idx, t[0:-1]]
72 |
73 | groups1 = X_train_.groupby('session').aid.agg('count').values
74 | groups2 = X_valid_.groupby('session').aid.agg('count').values
75 |             # group the rows so that each user's candidates are ranked together
76 |             # DMatrix is XGBoost's internal data structure, optimized for memory efficiency and training speed
77 |             dtrain = xgb.DMatrix(X_train, y_train, group=groups1)
78 |             dvalid = xgb.DMatrix(X_valid, y_valid, group=groups2)
79 |             # alternatively, treat everything as a single group
80 | # dtrain = xgb.DMatrix(X_train, y_train)
81 | # dvalid = xgb.DMatrix(X_valid, y_valid)
82 |
83 | xgb_parms = {'booster': 'gbtree',
84 | 'tree_method': 'gpu_hist',
85 | 'objective': 'binary:logistic',
86 | 'eta': 0.01,
87 | 'eval_metric': 'logloss',
88 | 'seed': 0,
89 | # 'early_stopping_rounds': 300,
90 | # 'subsample': 0.5,
91 | # 'colsample_bytree': 0.5,
92 | # 'max_depth': 3,
93 | # 'reg_alpha': 1,
94 | 'reg_lambda': 20,
95 | 'scale_pos_weight': 30}
96 |
97 | model = xgb.train(xgb_parms,
98 | dtrain=dtrain,
99 | evals=[(dtrain, 'train'), (dvalid, 'valid')],
100 | num_boost_round=3000,
101 | verbose_eval=100,
102 | )
103 |
104 |             print(f"Fold {fold + 1}: reporting model metrics!!!")
105 | name = 'XGB'
106 | dtrain1 = xgb.DMatrix(X_train)
107 | dtest1 = xgb.DMatrix(X_valid)
108 |
109 | def sigmoid(x):
110 | return 1. / (1 + np.exp(-x))
111 |
112 | y_train_pred_pre = np.array(model.predict(dtrain1))
113 | # y_train_pred_pre = sigmoid(y_train_pred_pre)
114 | print(y_train_pred_pre[:10])
115 | y_train_pred = np.array(y_train_pred_pre)
116 |
117 | y_train_pred[y_train_pred >= 0.5] = int(1)
118 | y_train_pred[y_train_pred < 0.5] = int(0)
119 | print(y_train_pred[:10])
120 |
121 | y_test_pred_pre = np.array(model.predict(dtest1))
122 | # y_test_pred_pre = sigmoid(y_test_pred_pre)
123 | y_test_pred = np.array(y_test_pred_pre)
124 |
125 | y_test_pred[y_test_pred >= 0.5] = int(1)
126 | y_test_pred[y_test_pred < 0.5] = int(0)
127 |
128 | # accuracy
129 | train_accuracy = accuracy_score(y_train, y_train_pred)
130 | test_accuracy = accuracy_score(y_valid, y_test_pred)
131 |
132 | # precision
133 | train_precision = precision_score(y_train, y_train_pred)
134 | test_precision = precision_score(y_valid, y_test_pred)
135 | # recall
136 | train_recall = recall_score(y_train, y_train_pred)
137 | test_recall = recall_score(y_valid, y_test_pred)
138 | # f1
139 | train_f1 = f1_score(y_train, y_train_pred)
140 | test_f1 = f1_score(y_valid, y_test_pred)
141 |             # AUC must be computed from the predicted probabilities, not from the thresholded 0/1 predictions
142 |
143 | train_auc = roc_auc_score(y_train, y_train_pred_pre)
144 | test_auc = roc_auc_score(y_valid, y_test_pred_pre)
145 |
146 |             print('{} train set: accuracy:{:.3}, precision:{:.3}, recall:{:.3}, f1:{:.3}, auc:{:.3}'.format(name,
147 |                                                                                                             train_accuracy,
148 |                                                                                                             train_precision,
149 |                                                                                                             train_recall,
150 |                                                                                                             train_f1,
151 |                                                                                                             train_auc))
152 |             print(
153 |                 '{} valid set: accuracy:{:.3}, precision:{:.3}, recall:{:.3}, f1:{:.3}, auc:{:.3}'.format(name, test_accuracy,
154 |                                                                                                           test_precision,
155 |                                                                                                           test_recall,
156 |                                                                                                           test_f1,
157 |                                                                                                           test_auc))
158 | importance_weight = model.get_score(fmap='', importance_type='weight')
159 | print('weight', importance_weight)
160 | importance_gain = model.get_score(fmap='', importance_type='gain')
161 | print('gain', importance_gain)
162 |
163 | model.save_model(f'/home/niejianfei/otto/CV/models/xgb_fold{fold}_{t[0:-1]}_{describe}.xgb')
164 |
165 |
166 | # inference
167 | def xgb_inference(key, stage, t, semi_sessions, describe):
168 | fold_num = 5
169 | dfs = []
170 |     # load the candidate feature chunks for prediction
171 | for e, chunk_file in enumerate(
172 | glob.glob(f"/home/niejianfei/otto/{stage}/candidates/candidates_{t[0:-1]}_features_data/*")):
173 |         print(f"Chunk {e + 1}!!!")
174 |
175 | chunk = pd.read_parquet(chunk_file)
176 | print(chunk)
177 | print(chunk.columns)
178 |
179 |         if stage == 'CV':
180 |             # split the CV sessions into the training half and the held-out half
181 |             x_train = chunk[chunk.session.isin(semi_sessions)].astype("float32")
182 |             x_test = chunk[~chunk.session.isin(semi_sessions)].astype("float32")
183 |             chunk = x_train if key == 'train' else x_test
184 |         else:
185 |             # LB stage: no session split, score every candidate
186 |             chunk = chunk.astype("float32")
187 |
188 |         # skip chunks left empty by the session split
189 |         if len(chunk) == 0:
190 |             continue
191 |         print(f'{key} length:', len(chunk))
192 | FEATURES = chunk.columns[2:-1]
193 | chunk['session'] = chunk['session'].astype('int32')
194 | chunk['aid'] = chunk['aid'].astype('int32')
195 |
196 | preds = np.zeros(len(chunk))
197 | for fold in range(fold_num):
198 |             print(f"Predicting with fold {fold + 1}!!!")
199 |
200 | model = xgb.Booster()
201 | model.load_model(
202 | f'/home/niejianfei/otto/CV/models/xgb_fold{fold}_{t[0:-1]}_{describe}.xgb')
203 | model.set_param({'predictor': 'gpu_predictor'})
204 |             print("Building the test DMatrix!!!")
205 | dtest = xgb.DMatrix(data=chunk[FEATURES])
206 |             print("Predicting!!!")
207 | preds += model.predict(dtest) / fold_num
208 | print(preds.max())
209 |         print(f"Building predictions for chunk {e + 1}!!!")
210 | predictions = chunk[['session', 'aid']].copy()
211 | predictions['pred'] = preds
212 | print(predictions[:10])
213 | dfs.append(predictions)
214 | return pd.concat(dfs, axis=0).reset_index(drop=True)
215 |
216 |
217 | def generate_submission(key, stage, candidate_type, semi_sessions, describe):
218 | for t in candidate_type:
219 | predictions = xgb_inference(key, stage, t, semi_sessions, describe)
220 |
221 |         print("Building the submission!!!")
222 | predictions = predictions.sort_values(['session', 'pred'], ascending=[True, False]).reset_index(
223 | drop=True).drop_duplicates(['session', 'aid'], keep='first')
224 | predictions['n'] = predictions.groupby('session').aid.cumcount().astype('int32')
225 | print(predictions[:200])
226 |         print("Keeping the top 20 predictions per session!!!")
227 | predictions1 = predictions[predictions['n'] < 20]
228 | print(predictions1[:20])
229 |
230 | sub = predictions1.groupby('session').aid.apply(list)
231 | sub = sub.to_frame().reset_index()
232 | sub.aid = sub.aid.apply(lambda x: " ".join(map(str, x)))
233 | sub.columns = ['session_type', 'labels']
234 | sub.session_type = sub.session_type.astype('str') + f'_{t}'
235 | print(len(sub))
236 |         print("Writing to disk!!!")
237 | sub.to_parquet(f'/home/niejianfei/otto/{stage}/submission/sub_{t}.pqt')
238 |
239 |
240 | def get_recall(key, candidate_type):
241 | for t in candidate_type:
242 |         print("Reading data!!!")
243 | pred_df = pd.read_parquet(f'/home/niejianfei/otto/CV/submission/sub_{t}.pqt')
244 | print(len(pred_df))
245 |
246 | sub = pred_df.loc[pred_df.session_type.str.contains(t)].copy()
247 | sub['session'] = sub.session_type.apply(lambda x: int(x.split('_')[0]))
248 | sub.labels = sub.labels.apply(lambda x: [int(i) for i in x.split(' ')])
249 |         print("Reading labels!!!")
250 | test_labels = pd.read_parquet(f'/home/niejianfei/otto/CV/preprocess/test_labels.parquet')
251 | print(len(test_labels))
252 | print(len(pred_df) - len(pred_df))
253 | test_labels = test_labels.loc[test_labels['type'] == t]
254 | test_labels = test_labels.merge(sub, how='left', on=['session'])
255 | test_labels['hits'] = test_labels.apply(
256 | lambda df: min(20, len(set(df.ground_truth).intersection(set(df.labels)))), axis=1)
257 |         # cap the ground-truth length at 20
258 | test_labels['gt_count'] = test_labels.ground_truth.str.len().clip(0, 20)
259 |         print(f"Computing {key} recall!!!")
260 | recall = test_labels['hits'].sum() / test_labels['gt_count'].sum()
261 | print(f'{key} {t} recall@20 =', recall)
262 |
263 |
264 | def user_sample(frac):
265 | return valid.drop_duplicates(['session']).sample(frac=frac, random_state=random_state)['session']
266 |
267 |
268 | if __name__ == '__main__':
269 |     # sample half of the sessions for training; recall is computed on the held-out half
270 | random_state = 33
271 | valid = load_data(f'/home/niejianfei/otto/CV/data/test_parquet/*')
272 |
273 | candidate_type = ['clicks', 'carts', 'orders']
274 | describe = 'final'
275 |
276 | train_xgb(candidate_type, user_sample(0.5), describe)
277 | generate_submission('test', 'CV', candidate_type, user_sample(0.5), describe)
278 | get_recall('test', candidate_type)
279 |
280 |     # train on all sessions for the final prediction
281 | train_xgb(candidate_type, user_sample(1), 'final_all_data')
282 |
--------------------------------------------------------------------------------
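
For reference, the recall@20 metric that get_recall computes, reproduced on a toy example (two sessions with made-up ground truth and predicted labels): hits are predicted aids that appear in the ground truth, and the ground-truth count is capped at 20.

import pandas as pd

df = pd.DataFrame({
    'ground_truth': [[1, 2, 3], [4]],
    'labels': [[2, 3, 9], [5]],
})
df['hits'] = df.apply(lambda r: min(20, len(set(r.ground_truth) & set(r.labels))), axis=1)
df['gt_count'] = df.ground_truth.str.len().clip(0, 20)
print(df['hits'].sum() / df['gt_count'].sum())   # (2 + 0) / (3 + 1) = 0.5
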