├── .idea
│   ├── .gitignore
│   ├── Kaggle OTTO – Multi-Objective Recommender System.iml
│   ├── inspectionProfiles
│   │   ├── Project_Default.xml
│   │   └── profiles_settings.xml
│   ├── misc.xml
│   └── modules.xml
├── README.md
├── candidates
│   └── generate_candidates.py
├── features
│   ├── __init__.py
│   ├── co_visitation_features.py
│   ├── item_features.py
│   ├── recall_features.py
│   ├── similarity_features.py
│   ├── user_features.py
│   └── user_item_features.py
├── merge_features.py
├── predict.py
├── preprocess
│   ├── BPRMF_ALSMF_LMF_prepare.py
│   ├── ProNE_prepare.py
│   ├── __init__.py
│   ├── co-visitation_matrix_prepare.py
│   └── deepwalk_prepare.py
└── ranker.py

/.idea/.gitignore:
--------------------------------------------------------------------------------
# Default ignored files
/shelf/
/workspace.xml
--------------------------------------------------------------------------------
/.idea/Kaggle OTTO – Multi-Objective Recommender System.iml:
--------------------------------------------------------------------------------
(XML content not preserved in this export)
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/Project_Default.xml:
--------------------------------------------------------------------------------
(XML content not preserved in this export)
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/profiles_settings.xml:
--------------------------------------------------------------------------------
(XML content not preserved in this export)
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
(XML content not preserved in this export)
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
(XML content not preserved in this export)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Kaggle_OTTO_Multi-Objective_Recommender_System
Source code for the Kaggle OTTO Multi-Objective Recommender System competition. A single model scores 0.594, around rank 30 on the LB.

## Recall stage
1. Recall from the user's historical sequence

2. Collaborative-filtering (co-visitation) recall (I2I)

3. Rule-based recall

   Most-clicked / most-carted / most-ordered / popular items, ranked by the highest combined score

4. Embedding-based recall

   deepwalk last week (I2I)

   deepwalk last month (I2I)

## Ranking stage

Build features for the candidates and use XGBoost as the ranking model to produce predictions.

The features are:

Recall features (the recall type and score carried over from each recall strategy)

Item features

User features

User-item interaction features

Similarity features (including DeepWalk, ProNE and other similarity features)

Co-visitation features

The training strategy is:

Train separate XGBoost models for clicks / carts / orders

Items the user actually clicked / carted / ordered are positives; the remaining candidates are negatives

All positives are kept; negatives are sampled at 30 times the number of positives (a minimal sketch of this setup is shown below, before the source listings)

## Model improvement history

1. Hand-crafted rules with recall@20: LB score 0.577

2. Added a ranking model, increased recall (about 170 items per user on average) and added similarity features (mean and max) to the candidates: LB rose to 0.585

3. Added embedding-based recall, further increased recall (about 220 items per user) and added co-visitation weight features (mean and max): LB rose to 0.590

4. Added more similarity features (candidate vs. each of the last three aids in the user's sequence) and the corresponding co-visitation weight features: LB rose to 0.594

## Approaches tried that did not work

1. ProNE graph-based user-item similarity features

2. BPRMF, ALSMF and LMF matrix-factorization user-item similarity features

3. Many item features, e.g. time-trend features and item click-to-purchase-rate features

4. Many user features, e.g. click-to-purchase time-gap features and user click-to-purchase-rate features

5. Simple grid-search tuning of XGBoost: almost no improvement

6. Simple feature crosses: no improvement
--------------------------------------------------------------------------------
/candidates/generate_candidates.py:
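The ranker-training strategy described in the README (one XGBoost ranking model per objective, all positives kept, negatives downsampled to 30 times the positives, grouped by session) can be summarised with the short sketch below. It is an illustrative reconstruction, not the repository's ranker.py: the column names (`session`, `aid`, `label`), the feature list and the hyperparameters are assumptions.

```python
import pandas as pd
import xgboost as xgb


def train_objective_ranker(cands: pd.DataFrame, feature_cols, seed: int = 42) -> xgb.XGBRanker:
    """Train a ranker for one objective (clicks, carts or orders).

    `cands` is assumed to hold one row per (session, aid) candidate with a
    binary `label` column: 1 if the user actually clicked/carted/ordered the aid.
    """
    pos = cands[cands["label"] == 1]
    neg = cands[cands["label"] == 0]
    # keep every positive, sample negatives at 30x the number of positives
    neg = neg.sample(n=min(len(neg), 30 * len(pos)), random_state=seed)
    train = pd.concat([pos, neg]).sort_values("session")  # rows of one group must be contiguous

    # group sizes tell the pairwise objective which rows belong to the same session
    group_sizes = train.groupby("session").size().to_numpy()

    ranker = xgb.XGBRanker(
        objective="rank:pairwise",
        n_estimators=300,       # illustrative hyperparameters, not the competition values
        max_depth=6,
        learning_rate=0.1,
        subsample=0.8,
    )
    ranker.fit(train[feature_cols], train["label"], group=group_sizes)
    return ranker


# Usage (hypothetical): one model per objective, then score every candidate of a
# session with ranker.predict(...) and submit the top-20 aids per session, e.g.:
# clicks_model = train_objective_ranker(click_candidates, FEATURE_COLS)
```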
-------------------------------------------------------------------------------- 1 | # 召回策略 2 | # 1.基于历史序列召回 全部aids 3 | # 2.基于co—visitation召回(I2I) 100aids 4 | # 3.基于规则召回 5 | # 点击最多 6 | # 加购最多 7 | # 购买最多 8 | # 热门商品 9 | # 4.基于embedding召回 10 | # deepwalk last week(I2I) 80aids 11 | # deepwalk last month(I2I) 80aids 12 | 13 | # 开始计算recall@220!!! 14 | # clicks recall = 0.628 15 | # carts recall = 0.519 16 | # orders recall = 0.716 17 | # ============= 18 | # Overall Recall = 0.6481 19 | # ============= 20 | 21 | import gensim 22 | import pandas as pd, numpy as np 23 | import glob 24 | from collections import Counter 25 | import itertools 26 | 27 | type_labels = {'clicks': 0, 'carts': 1, 'orders': 2} 28 | VER = 6 29 | DISK_PIECES = 4 30 | IS_TRAIN = True 31 | 32 | 33 | def load_data(path): 34 | dfs = [] 35 | for e, chunk_file in enumerate(glob.glob(path)): 36 | chunk = pd.read_parquet(chunk_file) 37 | chunk.ts = (chunk.ts / 1000).astype('int32') 38 | chunk['type'] = chunk['type'].map(type_labels).astype('int8') 39 | dfs.append(chunk) 40 | return pd.concat(dfs).reset_index(drop=True) 41 | 42 | 43 | def pqt_to_dict(df): 44 | df = df.loc[df.n < 20].drop('n', axis=1) 45 | # df['sim_aid_and_score'] = df['aid_y'].astype('str') + '#' + df['wgt'].astype('str') 46 | return df.groupby('aid_x').aid_y.apply(list).to_dict() 47 | 48 | 49 | if IS_TRAIN: 50 | stage = 'CV' 51 | data = '' 52 | else: 53 | stage = 'LB' 54 | data = 'all_data_' 55 | 56 | print('加载原始数据!!') 57 | test_df = load_data(f'/home/niejianfei/otto/{stage}/data/test_parquet/*') 58 | train_df = load_data(f'/home/niejianfei/otto/{stage}/data/*_parquet/*') 59 | 60 | print("开始读取co_visitation矩阵数据!!!") 61 | # LOAD THREE CO-VISITATION MATRICES 62 | top_20_clicks = pqt_to_dict( 63 | pd.read_parquet(f'/home/niejianfei/otto/{stage}/preprocess/{data}top_20_clicks_v{VER}_0.pqt')) 64 | for k in range(1, DISK_PIECES): 65 | top_20_clicks.update( 66 | pqt_to_dict( 67 | pd.read_parquet(f'/home/niejianfei/otto/{stage}/preprocess/{data}top_20_clicks_v{VER}_{k}.pqt'))) 68 | top_20_buys = pqt_to_dict( 69 | pd.read_parquet(f'/home/niejianfei/otto/{stage}/preprocess/{data}top_15_carts_orders_v{VER}_0.pqt')) 70 | for k in range(1, DISK_PIECES): 71 | top_20_buys.update( 72 | pqt_to_dict( 73 | pd.read_parquet(f'/home/niejianfei/otto/{stage}/preprocess/{data}top_15_carts_orders_v{VER}_{k}.pqt'))) 74 | top_20_buy2buy = pqt_to_dict( 75 | pd.read_parquet(f'/home/niejianfei/otto/{stage}/preprocess/{data}top_15_buy2buy_v{VER}_0.pqt')) 76 | 77 | print('开始读取deepwalk词向量!!') 78 | word2vec_last_week = gensim.models.KeyedVectors.load_word2vec_format( 79 | f'/home/niejianfei/otto/{stage}/preprocess/deepwalk_last_week.w2v', 80 | binary=False) 81 | word2vec_last_month = gensim.models.KeyedVectors.load_word2vec_format( 82 | f'/home/niejianfei/otto/{stage}/preprocess/deepwalk_last_month.w2v', 83 | binary=False) 84 | 85 | # 基于规则,热门商品 86 | print("开始生成test热门商品!!!") 87 | top_clicks = test_df.loc[test_df['type'] == 0, 'aid'].value_counts()[:200].to_dict() 88 | top_carts = test_df.loc[test_df['type'] == 1, 'aid'].value_counts()[:200].to_dict() 89 | top_orders = test_df.loc[test_df['type'] == 2, 'aid'].value_counts()[:200].to_dict() 90 | 91 | # 修改权重 92 | type_weight_multipliers = {0: 1, 1: 5, 2: 4} 93 | print("开始生成test hot商品!!!") 94 | test_df['score'] = test_df['type'].map(type_weight_multipliers) 95 | top_hot_items = test_df.groupby('aid')['score'].apply(lambda x: x.sum()) \ 96 | .sort_values(ascending=False)[:200].to_dict() 97 | print('开始生成train hot商品!!!') 98 | train_df['score'] = 
train_df['type'].map(type_weight_multipliers) 99 | top_hot_items_last_month = train_df.groupby('aid')['score'].apply(lambda x: x.sum()) \ 100 | .sort_values(ascending=False)[:200].to_dict() 101 | print(top_hot_items_last_month) 102 | print('开始生成train click hot商品!!!') 103 | train_df['score'] = 1 104 | top_clicks_items_last_month = train_df.groupby('aid')['score'].apply(lambda x: x.sum()) \ 105 | .sort_values(ascending=False)[:200].to_dict() 106 | print(top_clicks_items_last_month) 107 | 108 | 109 | def suggest_clicks(df): 110 | # USER HISTORY AIDS AND TYPES 111 | aids = df.aid.tolist() 112 | types = df.type.tolist() 113 | # unique_aids = list(dict.fromkeys(aids[::-1])) 114 | 115 | # RERANK CANDIDATES USING WEIGHTS 116 | # 直接召回历史序列按权重划分的aids 117 | # 等比数列 2**0.1 - 2**1 权重差的过大? 0.07-1?对于大序列,这点不好 118 | weights = np.logspace(0.1, 1, len(aids), base=2, endpoint=True) - 1 119 | aids_temp = Counter() 120 | # RERANK BASED ON REPEAT ITEMS AND TYPE OF ITEMS 121 | # 历史序列召回,考虑时间效应,优先召回时间近的 122 | for aid, w, t in zip(aids, weights, types): 123 | aids_temp[aid] += w * type_weight_multipliers[t] 124 | # session长度40已经可以涵盖90%的数据了 125 | history_aids = [k for k, v in aids_temp.most_common()] 126 | type_1 = [1] * len(history_aids) 127 | scores_1 = [v for k, v in aids_temp.most_common()] 128 | if len(set(scores_1)) == 1: 129 | scores_1 = [1] * len(scores_1) 130 | else: 131 | min_ = min(scores_1) 132 | max_ = max(scores_1) 133 | scores_1 = [(j - min_) / (max_ - min_) for j in scores_1] 134 | 135 | # 相似度矩阵召回 136 | # USE "CLICKS" CO-VISITATION MATRIX 137 | # click矩阵只考虑了时间的因素,cart-orders还考虑了相似商品的类别 138 | # 这里可以修改,通过sorted_aids召回相似物品 ---------------sort <= unique 139 | aids2 = list(itertools.chain(*[top_20_clicks[aid] for aid in history_aids if aid in top_20_clicks])) 140 | aids3 = list(itertools.chain(*[top_20_buys[aid] for aid in history_aids if aid in top_20_buys])) 141 | # RERANK CANDIDATES Counter计数筛选,不管得分,历史序列优先 142 | # 融合aids2和aids3的信息,同时考虑了相似item的时间权重和类型权重 143 | sim_aids_100 = [aid2 for aid2, cnt in Counter(aids2 + aids3).most_common(100)] 144 | type_2 = [2] * len(sim_aids_100) 145 | scores_2 = [cnt for aid2, cnt in Counter(aids2 + aids3).most_common(100)] 146 | 147 | # 基于规则召回n个 148 | # 热门商品召回100个,类别加权 149 | top_hot_items_100 = list(top_hot_items.keys())[:100] 150 | type_3 = [3] * (len(top_hot_items_100)) 151 | score_3 = list(top_hot_items.values())[:100] 152 | # 点击最多的商品召回100个 153 | top_clicks_100 = list(top_clicks.keys())[:100] 154 | type_4 = [4] * (len(top_clicks_100)) 155 | score_4 = list(top_clicks.values())[:100] 156 | # 过去一个月点击最多的商品召回100个 157 | top_clicks_last_month_100 = list(top_clicks_items_last_month.keys())[:100] 158 | type_5 = [5] * (len(top_clicks_last_month_100)) 159 | score_5 = list(top_clicks_items_last_month.values())[:100] 160 | # 过去一个月热度最高的100个商品 161 | top_hot_items_one_month_100 = list(top_hot_items_last_month.keys())[:100] 162 | type_6 = [6] * (len(top_hot_items_one_month_100)) 163 | score_6 = list(top_hot_items_last_month.values())[:100] 164 | 165 | # 基于向量embedding召回160个 166 | # 基于最后一周deepwalk召回80个 167 | temp_counter = Counter() 168 | for i in history_aids: 169 | if f'item_{i}' in word2vec_last_week: 170 | for j in word2vec_last_week.similar_by_word(f'item_{i}', topn=20): 171 | temp_counter[j[0]] += j[1] 172 | item_emb_deepwalk_last_week_80 = [int(aid2.split('_')[1]) for aid2, cnt in temp_counter.most_common(80)] 173 | type_7 = [7] * len(item_emb_deepwalk_last_week_80) 174 | score_7 = [cnt for aid2, cnt in temp_counter.most_common(80)] 175 | 176 | # 基于全局deepwalk召回80个 177 | temp_counter1 
= Counter() 178 | for i in history_aids: 179 | for j in word2vec_last_month.similar_by_word(f'item_{i}', topn=20): 180 | temp_counter1[j[0]] += j[1] 181 | item_emb_last_month_80 = [int(aid2.split('_')[1]) for aid2, cnt in temp_counter1.most_common(80)] 182 | type_8 = [8] * len(item_emb_last_month_80) 183 | score_8 = [cnt for aid2, cnt in temp_counter1.most_common(80)] 184 | # print(item_emb_deepwalk_last_week_80[0], score_7[0], item_emb_last_month_80[0], score_8[0]) 185 | 186 | result = history_aids + sim_aids_100 + top_hot_items_100 + top_clicks_100 + top_clicks_last_month_100 + \ 187 | top_hot_items_one_month_100 + item_emb_deepwalk_last_week_80 + item_emb_last_month_80 188 | 189 | type = type_1 + type_2 + type_3 + type_4 + type_5 + type_6 + type_7 + type_8 190 | score = scores_1 + scores_2 + score_3 + score_4 + score_5 + score_6 + score_7 + score_8 191 | 192 | info = [str(result[i]) + "#" + str(type[i]) + "#" + str(score[i]) for i in range(len(result))] 193 | 194 | return info 195 | 196 | 197 | def suggest_carts(df): 198 | # User history aids and types 199 | aids = df.aid.tolist() 200 | types = df.type.tolist() 201 | 202 | # UNIQUE AIDS AND UNIQUE BUYS 203 | unique_aids = list(dict.fromkeys(aids[::-1])) 204 | df = df.loc[(df['type'] == 0) | (df['type'] == 1)] 205 | unique_buys = list(dict.fromkeys(df.aid.tolist()[::-1])) 206 | 207 | # Rerank candidates using weights,时间weight? 208 | # 等比数列 2**0.5[0.414] -- 2**1-1,要突出以往carts和orders的权重,时间权重不能过小 209 | weights = np.logspace(0.5, 1, len(aids), base=2, endpoint=True) - 1 210 | aids_temp = Counter() 211 | 212 | # Rerank based on repeat items and types of items 213 | # 使用aids信息召回 214 | for aid, w, t in zip(aids, weights, types): # w: 0.414-1 types:1,5,4 min 0.414 max 5 215 | aids_temp[aid] += w * type_weight_multipliers[t] 216 | # 不直接召回,下面利用矩阵信息再算一次 217 | # Rerank candidates using"top_20_carts" co-visitation matrix 218 | # 基于buy2buys召回carts 用unique_buys召回carts 219 | # aids2 = list(itertools.chain(*[top_20_buys[aid] for aid in unique_buys if aid in top_20_buys])) 220 | # aids2 = list(itertools.chain(*[top_20_buy2buy[aid] for aid in unique_buys if aid in top_20_buy2buy])) 221 | # 将buy2buy矩阵输出 +0.1, 222 | # 还是以历史序列为主,尽量不要超过历史权重的量级,0.1算是合理 223 | # for aid in aids2: aids_temp[aid] += 0.1 224 | history_aids = [k for k, v in aids_temp.most_common()] 225 | type_1 = [1] * len(history_aids) 226 | scores_1 = [v for k, v in aids_temp.most_common()] 227 | if len(set(scores_1)) == 1: 228 | scores_1 = [1] * len(scores_1) 229 | else: 230 | min_ = min(scores_1) 231 | max_ = max(scores_1) 232 | scores_1 = [(j - min_) / (max_ - min_) for j in scores_1] 233 | # print(scores_1[1]) 234 | 235 | # Use "cart order" and "clicks" co-visitation matrices 236 | # click时间序列召回 基于历史session,要考虑时间,召回最新的 237 | aids1 = list(itertools.chain(*[top_20_clicks[aid] for aid in history_aids if aid in top_20_clicks])) 238 | # carts-orders召回 这里通过aids召回,使用buys也情有可原 239 | # 使用点击召回carts 240 | aids2 = list(itertools.chain(*[top_20_buys[aid] for aid in history_aids if aid in top_20_buys])) 241 | # 修改5:基于unique_buys召回carts,要考虑carts-orders,那么使用buy2buy 242 | aids3 = list(itertools.chain(*[top_20_buy2buy[aid] for aid in unique_buys if aid in top_20_buy2buy])) 243 | 244 | # RERANK CANDIDATES 245 | sim_aids_100 = [aid2 for aid2, cnt in Counter(aids1 + aids2 + aids3).most_common(100) if aid2 not in history_aids] 246 | type_2 = [2] * len(sim_aids_100) 247 | scores_2 = [cnt for aid2, cnt in Counter(aids1 + aids2 + aids3).most_common(100) if aid2 not in history_aids] 248 | 249 | # 基于规则召回200个 250 | # 
热门商品召回100个,类别加权 251 | top_hot_items_100 = list(top_hot_items.keys())[:100] 252 | type_3 = [3] * (len(top_hot_items_100)) 253 | score_3 = list(top_hot_items.values())[:100] 254 | # 购买最多的商品召回100个 255 | top_orders_100 = list(top_orders.keys())[:100] 256 | type_4 = [4] * (len(top_orders_100)) 257 | score_4 = list(top_orders.values())[:100] 258 | # 加购最多的商品召回100个 259 | top_carts_100 = list(top_carts.keys())[:100] 260 | type_5 = [5] * (len(top_carts_100)) 261 | score_5 = list(top_carts.values())[:100] 262 | # 过去一个月热度最高的100个商品 263 | top_hot_items_one_month_100 = list(top_hot_items_last_month.keys())[:150] 264 | type_6 = [6] * (len(top_hot_items_one_month_100)) 265 | score_6 = list(top_hot_items_last_month.values())[:150] 266 | 267 | # 基于向量embedding召回160个 268 | # 基于最后一周deepwalk召回80个 269 | temp_counter = Counter() 270 | for i in history_aids: 271 | if f'item_{i}' in word2vec_last_week: 272 | for j in word2vec_last_week.similar_by_word(f'item_{i}', topn=20): 273 | temp_counter[j[0]] += j[1] 274 | item_emb_deepwalk_last_week_80 = [int(aid2.split('_')[1]) for aid2, cnt in temp_counter.most_common(80)] 275 | type_7 = [7] * len(item_emb_deepwalk_last_week_80) 276 | score_7 = [cnt for aid2, cnt in temp_counter.most_common(80)] 277 | 278 | # 基于全局deepwalk召回80个 279 | temp_counter1 = Counter() 280 | for i in history_aids: 281 | for j in word2vec_last_month.similar_by_word(f'item_{i}', topn=20): 282 | temp_counter1[j[0]] += j[1] 283 | item_emb_last_month_80 = [int(aid2.split('_')[1]) for aid2, cnt in temp_counter1.most_common(80)] 284 | type_8 = [8] * len(item_emb_last_month_80) 285 | score_8 = [cnt for aid2, cnt in temp_counter1.most_common(80)] 286 | print(item_emb_deepwalk_last_week_80[0], score_7[0], item_emb_last_month_80[0], score_8[0]) 287 | 288 | result = history_aids + sim_aids_100 + top_hot_items_100 + top_orders_100 + top_carts_100 + \ 289 | top_hot_items_one_month_100 + item_emb_deepwalk_last_week_80 + item_emb_last_month_80 290 | 291 | type = type_1 + type_2 + type_3 + type_4 + type_5 + type_6 + type_7 + type_8 292 | score = scores_1 + scores_2 + score_3 + score_4 + score_5 + score_6 + score_7 + score_8 293 | 294 | info = [str(result[i]) + "#" + str(type[i]) + "#" + str(score[i]) for i in range(len(result))] 295 | 296 | return info 297 | 298 | 299 | def suggest_buys(df): 300 | # USER HISTORY AIDS AND TYPES 301 | aids = df.aid.tolist() 302 | types = df.type.tolist() 303 | # UNIQUE AIDS AND UNIQUE BUYS 304 | # unique_aids = list(dict.fromkeys(aids[::-1])) 305 | df = df.loc[(df['type'] == 1) | (df['type'] == 2)] 306 | unique_buys = list(dict.fromkeys(df.aid.tolist()[::-1])) 307 | 308 | # 基于历史序列召回40个 309 | # RERANK CANDIDATES USING WEIGHTS 310 | # 等比数列 0.414-1 311 | weights = np.logspace(0.5, 1, len(aids), base=2, endpoint=True) - 1 312 | aids_temp = Counter() 313 | # RERANK BASED ON REPEAT ITEMS AND TYPE OF ITEMS 314 | for aid, w, t in zip(aids, weights, types): 315 | aids_temp[aid] += w * type_weight_multipliers[t] 316 | # 直接取40,不管够不够,不够的话就这样 317 | history_aids = [k for k, v in aids_temp.most_common()] 318 | type_1 = [1] * len(history_aids) 319 | scores_1 = [v for k, v in aids_temp.most_common()] 320 | if len(set(scores_1)) == 1: 321 | scores_1 = [1] * len(scores_1) 322 | else: 323 | min_ = min(scores_1) 324 | max_ = max(scores_1) 325 | scores_1 = [(j - min_) / (max_ - min_) for j in scores_1] 326 | 327 | # 基于co—visitation召回100个 328 | # USE "CART ORDER" CO-VISITATION MATRIX 用aids召回orders,对的! 
329 | aids2 = list(itertools.chain(*[top_20_buys[aid] for aid in history_aids if aid in top_20_buys])) 330 | # USE "BUY2BUY" CO-VISITATION MATRIX 用unique_buys召回orders,对的!! 331 | aids3 = list(itertools.chain(*[top_20_buy2buy[aid] for aid in unique_buys if aid in top_20_buy2buy])) 332 | # RERANK CANDIDATES 333 | 334 | sim_aids_100 = [aid2 for aid2, cnt in Counter(aids2 + aids3).most_common(100)] 335 | type_2 = [2] * len(sim_aids_100) 336 | scores_2 = [cnt for aid2, cnt in Counter(aids2 + aids3).most_common(100)] 337 | 338 | # 基于规则召回n个 339 | # 热门商品召回100个,类别加权 340 | top_hot_items_100 = list(top_hot_items.keys())[:100] 341 | type_3 = [3] * (len(top_hot_items_100)) 342 | score_3 = list(top_hot_items.values())[:100] 343 | # 购买最多的商品召回100个 344 | top_orders_100 = list(top_orders.keys())[:100] 345 | type_4 = [4] * (len(top_orders_100)) 346 | score_4 = list(top_orders.values())[:100] 347 | # 加购最多的商品召回100个 348 | top_carts_100 = list(top_carts.keys())[:100] 349 | type_5 = [5] * (len(top_carts_100)) 350 | score_5 = list(top_carts.values())[:100] 351 | # 过去一个月热度最高的100个商品 352 | top_hot_items_one_month_100 = list(top_hot_items_last_month.keys())[:100] 353 | type_6 = [6] * (len(top_hot_items_one_month_100)) 354 | score_6 = list(top_hot_items_last_month.values())[:100] 355 | 356 | # 基于向量embedding召回160个 357 | # 基于最后一周deepwalk召回80个 358 | temp_counter = Counter() 359 | for i in history_aids: 360 | if f'item_{i}' in word2vec_last_week: 361 | for j in word2vec_last_week.similar_by_word(f'item_{i}', topn=20): 362 | temp_counter[j[0]] += j[1] 363 | item_emb_deepwalk_last_week_80 = [int(aid2.split('_')[1]) for aid2, cnt in temp_counter.most_common(80)] 364 | type_7 = [7] * len(item_emb_deepwalk_last_week_80) 365 | score_7 = [cnt for aid2, cnt in temp_counter.most_common(80)] 366 | 367 | # 基于全局deepwalk召回80个 368 | temp_counter1 = Counter() 369 | for i in history_aids: 370 | for j in word2vec_last_month.similar_by_word(f'item_{i}', topn=20): 371 | temp_counter1[j[0]] += j[1] 372 | item_emb_last_month_80 = [int(aid2.split('_')[1]) for aid2, cnt in temp_counter1.most_common(80)] 373 | type_8 = [8] * len(item_emb_last_month_80) 374 | score_8 = [cnt for aid2, cnt in temp_counter1.most_common(80)] 375 | print(item_emb_deepwalk_last_week_80[0], score_7[0], item_emb_last_month_80[0], score_8[0]) 376 | 377 | result = history_aids + sim_aids_100 + top_hot_items_100 + top_orders_100 + top_carts_100 + \ 378 | top_hot_items_one_month_100 + item_emb_deepwalk_last_week_80 + item_emb_last_month_80 379 | 380 | type = type_1 + type_2 + type_3 + type_4 + type_5 + type_6 + type_7 + type_8 381 | score = scores_1 + scores_2 + score_3 + score_4 + score_5 + score_6 + score_7 + score_8 382 | 383 | info = [str(result[i]) + "#" + str(type[i]) + "#" + str(score[i]) for i in range(len(result))] 384 | 385 | return info 386 | 387 | 388 | print("开始进行clicks推荐!!!") 389 | 390 | pred_df_clicks = test_df.sort_values(["session", "ts"]).groupby(["session"]).apply( 391 | lambda x: suggest_clicks(x) 392 | ) 393 | print("开始进行carts推荐!!!") 394 | pred_df_carts = test_df.sort_values(["session", "ts"]).groupby(["session"]).apply( 395 | lambda x: suggest_carts(x) 396 | ) 397 | print("开始进行buys推荐!!!") 398 | pred_df_buys = test_df.sort_values(["session", "ts"]).groupby(["session"]).apply( 399 | lambda x: suggest_buys(x) 400 | ) 401 | 402 | print("开始进行推荐!!!") 403 | clicks_pred_df = pd.DataFrame(pred_df_clicks.add_suffix("_clicks"), columns=["labels"]).reset_index() 404 | orders_pred_df = pd.DataFrame(pred_df_buys.add_suffix("_orders"), columns=["labels"]).reset_index() 
405 | carts_pred_df = pd.DataFrame(pred_df_carts.add_suffix("_carts"), columns=["labels"]).reset_index() 406 | 407 | pred_df = pd.concat([clicks_pred_df, orders_pred_df, carts_pred_df]) 408 | pred_df.columns = ["session_type", "labels"] 409 | pred_df["labels"] = pred_df.labels.apply(lambda x: " ".join(map(str, x))) 410 | pred_df.to_parquet(f'/home/niejianfei/otto/{stage}/candidates/candidates.pqt') 411 | print(pred_df) 412 | 413 | 414 | print("开始计算recall!!!") 415 | score = 0 416 | recall_score = {} 417 | weights = {'clicks': 0.10, 'carts': 0.30, 'orders': 0.60} 418 | for t in ['clicks', 'carts', 'orders']: 419 | sub = pred_df.loc[pred_df.session_type.str.contains(t)].copy() 420 | sub['session'] = sub.session_type.apply(lambda x: int(x.split('_')[0])) 421 | # sub.labels = sub.labels.apply(lambda x: [int(i) for i in x.split(' ')]) 422 | sub['labels'] = sub['labels'].apply(lambda x: [int(i.split('#')[0]) for i in x.split(' ')]) 423 | test_labels = pd.read_parquet(f'/home/niejianfei/otto/CV/preprocess/test_labels.parquet') 424 | test_labels = test_labels.loc[test_labels['type'] == t] 425 | test_labels = test_labels.merge(sub, how='left', on=['session']) 426 | test_labels['hits'] = test_labels.apply( 427 | lambda df: min(20, len(set(df.ground_truth).intersection(set(df.labels)))), axis=1) 428 | # 设定阈值 长度多于20,定为20 429 | test_labels['gt_count'] = test_labels.ground_truth.str.len().clip(0, 20) 430 | recall = test_labels['hits'].sum() / test_labels['gt_count'].sum() 431 | recall_score[t] = recall 432 | score += weights[t] * recall 433 | print(f'{t} recall =', recall) 434 | 435 | print('=============') 436 | print('Overall Recall =', score) 437 | print('=============') 438 | 439 | # handcraft recall LB,0.577 440 | # 开始计算recall!!! 441 | # clicks recall = 0.5257653796508641 442 | # carts recall = 0.41246734503419014 443 | # orders recall = 0.6498501450672353 444 | # ============= 445 | # Overall Recall = 0.5662268285156846 446 | # ============= 447 | 448 | # 开始计算recall@170!!! 
449 | # clicks recall = 0.6012911171187798 450 | # carts recall = 0.5011587525716328 451 | # orders recall = 0.7053682856531855 452 | # ============= 453 | # Overall Recall = 0.6336977088752791 454 | # ============= 455 | -------------------------------------------------------------------------------- /features/__init__.py: -------------------------------------------------------------------------------- 1 | from .recall_features import recall_features 2 | from .user_item_features import user_item_features 3 | from .similarity_features import similarity_features 4 | from .co_visitation_features import co_visitation_features -------------------------------------------------------------------------------- /features/co_visitation_features.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import pandas as pd 3 | 4 | 5 | def load_validate(path): 6 | type_transform = {"clicks": 0, "carts": 1, "orders": 2} 7 | dfs = [] 8 | # 只导入训练数据 9 | for e, chunk_file in enumerate(glob.glob(path)): 10 | chunk = pd.read_parquet(chunk_file) 11 | chunk.ts = (chunk.ts / 1000).astype('int32') 12 | chunk['type'] = chunk['type'].map(type_transform).astype('int8') 13 | dfs.append(chunk) 14 | return pd.concat(dfs).reset_index(drop=True) 15 | 16 | 17 | def calculate_cf_u2i_similarity(string, dic): 18 | list = string.split(' ') 19 | if int(list[-1]) < 0: 20 | return '-10 -10' 21 | aid = list[0] 22 | score = [] 23 | for i in list[1:]: 24 | if aid + ' ' + i in dic: 25 | temp_score = float(dic[aid + ' ' + i]) 26 | else: 27 | temp_score = 0 28 | score.append(temp_score) 29 | return str(max(score)) + ' ' + str(sum(score)) 30 | 31 | 32 | # 计算候选aid与user序列co_visitation矩阵的权重之和 33 | def cf_u2i_similarity(stage, candidate_type, start, end): 34 | print('开始读取数据!!!') 35 | valid = load_validate(f'/home/niejianfei/otto/{stage}/data/test_parquet/*') 36 | print(valid) 37 | print('开始筛选') 38 | 39 | valid1 = valid[valid['type'] != 0] 40 | print(valid1) 41 | print('开始排序') 42 | # 分别对session_id聚合,对时间进行排序 43 | df = valid1.sort_values(by=["session", "ts"], ascending=True) 44 | print('生成list') 45 | sentences_df = pd.DataFrame(df.groupby('session')['aid'].agg(list)) 46 | sentences_df.columns = ['carts_and_orders'] 47 | sentences_df["carts_and_orders_str"] = sentences_df.carts_and_orders.apply(lambda x: " ".join(map(str, x))) 48 | sentences_df = sentences_df.drop(columns='carts_and_orders') 49 | print(sentences_df) 50 | 51 | valid2 = valid 52 | print(valid2) 53 | print('开始排序') 54 | # 分别对session_id聚合,对时间进行排序 55 | df1 = valid2.sort_values(by=["session", "ts"], ascending=True) 56 | print('生成list') 57 | sentences_df1 = pd.DataFrame(df1.groupby('session')['aid'].agg(list)) 58 | sentences_df1.columns = ['clicks'] 59 | sentences_df1["clicks_str"] = sentences_df1.clicks.apply(lambda x: " ".join(map(str, x))) 60 | sentences_df1 = sentences_df1.drop(columns='clicks') 61 | print(sentences_df1) 62 | 63 | print('开始读取字典!!') 64 | print('click') 65 | VER = 6 66 | print(VER) 67 | dic_click = pd.read_parquet(f'/home/niejianfei/otto/{stage}/preprocess/top_20_clicks_v{VER}_0.pqt') 68 | DISK_PIECES = 4 69 | for k in range(1, DISK_PIECES): 70 | dic_click = dic_click.append( 71 | pd.read_parquet(f'/home/niejianfei/otto/{stage}/preprocess/top_20_clicks_v{VER}_{k}.pqt')) 72 | 73 | dic_click['aids1'] = dic_click['aid_x'].astype('str') + ' ' + dic_click['aid_y'].astype('str') 74 | dic_click['aids2'] = dic_click['aid_y'].astype('str') + ' ' + dic_click['aid_x'].astype('str') 75 | 76 | dic_click = 
dic_click.drop(columns=['aid_x', 'aid_y']) 77 | dic_click1 = dic_click[['aids1', 'wgt']] 78 | print(dic_click1) 79 | dic_click2 = dic_click[['aids2', 'wgt']] 80 | dic_click2.columns = ['aids1', 'wgt'] 81 | print(dic_click2) 82 | dic_click = dic_click1.append(dic_click2) 83 | print(dic_click) 84 | dic_click.index = dic_click['aids1'] 85 | print(dic_click) 86 | dic_click = dic_click['wgt'].to_dict() 87 | print('0 532042' in dic_click) 88 | print('532042 0' in dic_click) 89 | print('0 532022242' in dic_click) 90 | 91 | print('hot') 92 | dic_hot = pd.read_parquet(f'/home/niejianfei/otto/{stage}/preprocess/top_15_carts_orders_v{VER}_0.pqt') 93 | DISK_PIECES = 4 94 | for k in range(1, DISK_PIECES): 95 | dic_hot = dic_hot.append( 96 | pd.read_parquet(f'/home/niejianfei/otto/{stage}/preprocess/top_15_carts_orders_v{VER}_{k}.pqt')) 97 | 98 | dic_hot['aids1'] = dic_hot['aid_x'].astype('str') + ' ' + dic_hot['aid_y'].astype('str') 99 | dic_hot['aids2'] = dic_hot['aid_y'].astype('str') + ' ' + dic_hot['aid_x'].astype('str') 100 | 101 | dic_hot = dic_hot.drop(columns=['aid_x', 'aid_y']) 102 | dic_click1 = dic_hot[['aids1', 'wgt']] 103 | print(dic_click1) 104 | dic_click2 = dic_hot[['aids2', 'wgt']] 105 | dic_click2.columns = ['aids1', 'wgt'] 106 | print(dic_click2) 107 | dic_hot = dic_click1.append(dic_click2) 108 | print(dic_hot) 109 | dic_hot.index = dic_hot['aids1'] 110 | print(dic_hot) 111 | dic_hot = dic_hot['wgt'].to_dict() 112 | print('0 532042' in dic_hot) 113 | print('532042 0' in dic_hot) 114 | print('0 532022242' in dic_hot) 115 | 116 | print('buys') 117 | dic_buys = pd.read_parquet(f'/home/niejianfei/otto/{stage}/preprocess/top_15_buy2buy_v{VER}_0.pqt') 118 | print(dic_buys) 119 | 120 | dic_buys['aids1'] = dic_buys['aid_x'].astype('str') + ' ' + dic_buys['aid_y'].astype('str') 121 | dic_buys['aids2'] = dic_buys['aid_y'].astype('str') + ' ' + dic_buys['aid_x'].astype('str') 122 | 123 | dic_buys = dic_buys.drop(columns=['aid_x', 'aid_y']) 124 | dic_click1 = dic_buys[['aids1', 'wgt']] 125 | print(dic_click1) 126 | dic_click2 = dic_buys[['aids2', 'wgt']] 127 | dic_click2.columns = ['aids1', 'wgt'] 128 | print(dic_click2) 129 | dic_buys = dic_click1.append(dic_click2) 130 | print(dic_buys) 131 | dic_buys.index = dic_buys['aids1'] 132 | print(dic_buys) 133 | dic_buys = dic_buys['wgt'].to_dict() 134 | print('0 532042' in dic_buys) 135 | print('532042 0' in dic_buys) 136 | print('0 532022242' in dic_buys) 137 | 138 | for t in candidate_type: 139 | # 只导入训练数据 140 | print('开始导入数据') 141 | for i in range(start, end): 142 | path = f"/home/niejianfei/otto/{stage}/candidates/candidates_{t[0:-1]}_features_data/candidate_{t[0:-1]}_{i}.pqt" 143 | print(f'第{i + 1}块数据') 144 | chunk = pd.read_parquet(path) 145 | print(path) 146 | print(chunk.columns) 147 | chunk = chunk.astype("float32") 148 | chunk['session'] = chunk['session'].astype('int32') 149 | chunk['aid'] = chunk['aid'].astype('int32') 150 | print(chunk) 151 | print(chunk.columns) 152 | 153 | chunk = chunk.merge(sentences_df, left_on='session', right_index=True, how='left').fillna(value=-1) 154 | print(chunk) 155 | chunk['sim_list'] = chunk['aid'].astype('str') + ' ' + chunk['carts_and_orders_str'].astype('str') 156 | print('开始计算相似度!!!') 157 | chunk['sim_score_str'] = chunk['sim_list'].apply(lambda x: calculate_cf_u2i_similarity(x, dic_buys)) 158 | print(chunk[['carts_and_orders_str', 'sim_list', 'sim_score_str']]) 159 | chunk['buys_CF_sim_max'] = chunk['sim_score_str'].apply(lambda x: float(x.split(' ')[0])) 160 | chunk['buys_CF_sim_sum'] = 
chunk['sim_score_str'].apply(lambda x: float(x.split(' ')[1])) 161 | print(chunk[(chunk['buys_CF_sim_max'] != -10) & (chunk['buys_CF_sim_max'] != 0)]) 162 | 163 | chunk = chunk.merge(sentences_df1, left_on='session', right_index=True, how='left').fillna(value=-1) 164 | print(chunk) 165 | chunk['clicks_sim_list'] = chunk['aid'].astype('str') + ' ' + chunk['clicks_str'].astype('str') 166 | print('click开始计算相似度!!!') 167 | chunk['clicks_sim_score_str'] = chunk['clicks_sim_list'].apply( 168 | lambda x: calculate_cf_u2i_similarity(x, dic_click)) 169 | print(chunk[['clicks_str', 'clicks_sim_list', 'clicks_sim_score_str']]) 170 | chunk['clicks_CF_sim_max'] = chunk['clicks_sim_score_str'].apply(lambda x: float(x.split(' ')[0])) 171 | chunk['clicks_CF_sim_sum'] = chunk['clicks_sim_score_str'].apply(lambda x: float(x.split(' ')[1])) 172 | print(chunk[(chunk['clicks_CF_sim_max'] != -10) & (chunk['clicks_CF_sim_max'] != 0)]) 173 | 174 | print('click开始计算相似度!!!') 175 | chunk['clicks_sim_score_str1'] = chunk['clicks_sim_list'].apply( 176 | lambda x: calculate_cf_u2i_similarity(x, dic_hot)) 177 | print(chunk[['clicks_str', 'clicks_sim_list', 'clicks_sim_score_str1']]) 178 | chunk['hot_CF_sim_max'] = chunk['clicks_sim_score_str1'].apply(lambda x: float(x.split(' ')[0])) 179 | chunk['hot_CF_sim_sum'] = chunk['clicks_sim_score_str1'].apply(lambda x: float(x.split(' ')[1])) 180 | print(chunk[(chunk['hot_CF_sim_max'] != -10) & (chunk['hot_CF_sim_max'] != 0)]) 181 | 182 | chunk = chunk.drop( 183 | columns=['carts_and_orders_str', 'sim_list', 'sim_score_str', 'clicks_str', 'clicks_sim_list', 184 | 'clicks_sim_score_str', 'clicks_sim_score_str1']) 185 | print(chunk[['buys_CF_sim_max', 'buys_CF_sim_sum', 'hot_CF_sim_max', 'hot_CF_sim_sum', 'clicks_CF_sim_max', 186 | 'clicks_CF_sim_sum']]) 187 | print(chunk.columns) 188 | print(chunk) 189 | chunk.to_parquet(path) 190 | 191 | 192 | def calculate_cf_u2i_similarity_tail(string, dic): 193 | list = string.split(' ') 194 | if int(list[-1]) < 0: 195 | return '-10 -10' 196 | aid = list[0] 197 | score = [] 198 | for i in list[1:]: 199 | if aid + ' ' + i in dic: 200 | temp_score = float(dic[aid + ' ' + i]) 201 | else: 202 | temp_score = 0 203 | score.append(temp_score) 204 | return str(sum(score) / len(score)) + ' ' + str(score[-1]) 205 | 206 | 207 | # 计算候选aid与user序列co_visitation矩阵的权重之和 208 | def cf_u2i_similarity_tail(stage, candidate_type, start, end): 209 | print('开始读取数据!!!') 210 | valid = load_validate(f'/home/niejianfei/otto/{stage}/data/test_parquet/*') 211 | print(valid) 212 | print('开始筛选') 213 | 214 | valid1 = valid[valid['type'] != 0] 215 | print(valid1) 216 | print('开始排序') 217 | # 分别对session_id聚合,对时间进行排序 218 | df = valid1.sort_values(by=["session", "ts"], ascending=True) 219 | print('生成list') 220 | sentences_df = pd.DataFrame(df.groupby('session')['aid'].agg(list)) 221 | sentences_df.columns = ['carts_and_orders'] 222 | sentences_df["carts_and_orders_str"] = sentences_df.carts_and_orders.apply(lambda x: " ".join(map(str, x))) 223 | sentences_df = sentences_df.drop(columns='carts_and_orders') 224 | print(sentences_df) 225 | 226 | valid2 = valid 227 | print(valid2) 228 | print('开始排序') 229 | # 分别对session_id聚合,对时间进行排序 230 | df1 = valid2.sort_values(by=["session", "ts"], ascending=True) 231 | print('生成list') 232 | sentences_df1 = pd.DataFrame(df1.groupby('session')['aid'].agg(list)) 233 | sentences_df1.columns = ['clicks'] 234 | sentences_df1["clicks_str"] = sentences_df1.clicks.apply(lambda x: " ".join(map(str, x))) 235 | sentences_df1 = 
sentences_df1.drop(columns='clicks') 236 | print(sentences_df1) 237 | 238 | print('开始读取字典!!') 239 | print('click') 240 | VER = 6 241 | print(VER) 242 | dic_click = pd.read_parquet(f'/home/niejianfei/otto/{stage}/preprocess/top_20_clicks_v{VER}_0.pqt') 243 | DISK_PIECES = 4 244 | for k in range(1, DISK_PIECES): 245 | dic_click = dic_click.append( 246 | pd.read_parquet(f'/home/niejianfei/otto/{stage}/preprocess/top_20_clicks_v{VER}_{k}.pqt')) 247 | 248 | dic_click['aids1'] = dic_click['aid_x'].astype('str') + ' ' + dic_click['aid_y'].astype('str') 249 | dic_click['aids2'] = dic_click['aid_y'].astype('str') + ' ' + dic_click['aid_x'].astype('str') 250 | 251 | dic_click = dic_click.drop(columns=['aid_x', 'aid_y']) 252 | dic_click1 = dic_click[['aids1', 'wgt']] 253 | print(dic_click1) 254 | dic_click2 = dic_click[['aids2', 'wgt']] 255 | dic_click2.columns = ['aids1', 'wgt'] 256 | print(dic_click2) 257 | dic_click = dic_click1.append(dic_click2) 258 | print(dic_click) 259 | dic_click.index = dic_click['aids1'] 260 | print(dic_click) 261 | dic_click = dic_click['wgt'].to_dict() 262 | print('0 532042' in dic_click) 263 | print('532042 0' in dic_click) 264 | print('0 532022242' in dic_click) 265 | 266 | print('hot') 267 | dic_hot = pd.read_parquet(f'/home/niejianfei/otto/{stage}/preprocess/top_15_carts_orders_v{VER}_0.pqt') 268 | DISK_PIECES = 4 269 | for k in range(1, DISK_PIECES): 270 | dic_hot = dic_hot.append( 271 | pd.read_parquet(f'/home/niejianfei/otto/{stage}/preprocess/top_15_carts_orders_v{VER}_{k}.pqt')) 272 | 273 | dic_hot['aids1'] = dic_hot['aid_x'].astype('str') + ' ' + dic_hot['aid_y'].astype('str') 274 | dic_hot['aids2'] = dic_hot['aid_y'].astype('str') + ' ' + dic_hot['aid_x'].astype('str') 275 | 276 | dic_hot = dic_hot.drop(columns=['aid_x', 'aid_y']) 277 | dic_click1 = dic_hot[['aids1', 'wgt']] 278 | print(dic_click1) 279 | dic_click2 = dic_hot[['aids2', 'wgt']] 280 | dic_click2.columns = ['aids1', 'wgt'] 281 | print(dic_click2) 282 | dic_hot = dic_click1.append(dic_click2) 283 | print(dic_hot) 284 | dic_hot.index = dic_hot['aids1'] 285 | print(dic_hot) 286 | dic_hot = dic_hot['wgt'].to_dict() 287 | print('0 532042' in dic_hot) 288 | print('532042 0' in dic_hot) 289 | print('0 532022242' in dic_hot) 290 | 291 | print('buys') 292 | dic_buys = pd.read_parquet(f'/home/niejianfei/otto/{stage}/preprocess/top_15_buy2buy_v{VER}_0.pqt') 293 | print(dic_buys) 294 | 295 | dic_buys['aids1'] = dic_buys['aid_x'].astype('str') + ' ' + dic_buys['aid_y'].astype('str') 296 | dic_buys['aids2'] = dic_buys['aid_y'].astype('str') + ' ' + dic_buys['aid_x'].astype('str') 297 | 298 | dic_buys = dic_buys.drop(columns=['aid_x', 'aid_y']) 299 | dic_click1 = dic_buys[['aids1', 'wgt']] 300 | print(dic_click1) 301 | dic_click2 = dic_buys[['aids2', 'wgt']] 302 | dic_click2.columns = ['aids1', 'wgt'] 303 | print(dic_click2) 304 | dic_buys = dic_click1.append(dic_click2) 305 | print(dic_buys) 306 | dic_buys.index = dic_buys['aids1'] 307 | print(dic_buys) 308 | dic_buys = dic_buys['wgt'].to_dict() 309 | print('0 532042' in dic_buys) 310 | print('532042 0' in dic_buys) 311 | print('0 532022242' in dic_buys) 312 | 313 | for t in candidate_type: 314 | # 只导入训练数据 315 | print('开始导入数据') 316 | for i in range(start, end): 317 | path = f"/home/niejianfei/otto/{stage}/candidates/candidates_{t[0:-1]}_features_data/candidate_{t[0:-1]}_{i}.pqt" 318 | print(f'第{i + 1}块数据') 319 | chunk = pd.read_parquet(path) 320 | print(path) 321 | print(chunk.columns) 322 | chunk = chunk.astype("float32") 323 | chunk['session'] = 
chunk['session'].astype('int32') 324 | chunk['aid'] = chunk['aid'].astype('int32') 325 | print(chunk) 326 | print(chunk.columns) 327 | 328 | chunk = chunk.merge(sentences_df, left_on='session', right_index=True, how='left').fillna(value=-1) 329 | print(chunk) 330 | chunk['sim_list'] = chunk['aid'].astype('str') + ' ' + chunk['carts_and_orders_str'].astype('str') 331 | print('开始计算相似度!!!') 332 | chunk['sim_score_str'] = chunk['sim_list'].apply(lambda x: calculate_cf_u2i_similarity_tail(x, dic_buys)) 333 | print(chunk[['carts_and_orders_str', 'sim_list', 'sim_score_str']]) 334 | chunk['buys_CF_sim_mean'] = chunk['sim_score_str'].apply(lambda x: float(x.split(' ')[0])) 335 | chunk['buys_CF_sim_-1'] = chunk['sim_score_str'].apply(lambda x: float(x.split(' ')[1])) 336 | print(chunk[(chunk['buys_CF_sim_mean'] != -10) & (chunk['buys_CF_sim_-1'] != 0)]) 337 | 338 | chunk = chunk.merge(sentences_df1, left_on='session', right_index=True, how='left').fillna(value=-1) 339 | print(chunk) 340 | chunk['clicks_sim_list'] = chunk['aid'].astype('str') + ' ' + chunk['clicks_str'].astype('str') 341 | print('click开始计算相似度!!!') 342 | chunk['clicks_sim_score_str'] = chunk['clicks_sim_list'].apply( 343 | lambda x: calculate_cf_u2i_similarity_tail(x, dic_click)) 344 | print(chunk[['clicks_str', 'clicks_sim_list', 'clicks_sim_score_str']]) 345 | chunk['clicks_CF_sim_mean'] = chunk['clicks_sim_score_str'].apply(lambda x: float(x.split(' ')[0])) 346 | chunk['clicks_CF_sim_-1'] = chunk['clicks_sim_score_str'].apply(lambda x: float(x.split(' ')[1])) 347 | print(chunk[(chunk['clicks_CF_sim_mean'] != -10) & (chunk['clicks_CF_sim_-1'] != 0)]) 348 | 349 | print('click开始计算相似度!!!') 350 | chunk['clicks_sim_score_str1'] = chunk['clicks_sim_list'].apply( 351 | lambda x: calculate_cf_u2i_similarity_tail(x, dic_hot)) 352 | print(chunk[['clicks_str', 'clicks_sim_list', 'clicks_sim_score_str1']]) 353 | chunk['hot_CF_sim_mean'] = chunk['clicks_sim_score_str1'].apply(lambda x: float(x.split(' ')[0])) 354 | chunk['hot_CF_sim_-1'] = chunk['clicks_sim_score_str1'].apply(lambda x: float(x.split(' ')[1])) 355 | print(chunk[(chunk['hot_CF_sim_mean'] != -10) & (chunk['hot_CF_sim_-1'] != 0)]) 356 | 357 | chunk = chunk.drop( 358 | columns=['carts_and_orders_str', 'sim_list', 'sim_score_str', 'clicks_str', 'clicks_sim_list', 359 | 'clicks_sim_score_str', 'clicks_sim_score_str1']) 360 | print(chunk[['buys_CF_sim_max', 'buys_CF_sim_sum', 'hot_CF_sim_max', 'hot_CF_sim_sum', 'clicks_CF_sim_max', 361 | 'clicks_CF_sim_sum']]) 362 | print(chunk.columns) 363 | print(chunk) 364 | chunk.to_parquet(path) 365 | 366 | 367 | # 三个矩阵的特征 368 | def co_visitation_features(stage, candidate_type, start, end): 369 | cf_u2i_similarity(stage, candidate_type, start, end) 370 | cf_u2i_similarity_tail(stage, candidate_type, start, end) 371 | 372 | 373 | if __name__ == '__main__': 374 | IS_TRAIN = True 375 | if IS_TRAIN: 376 | stage = 'CV' 377 | else: 378 | stage = 'LB' 379 | candidate_type = ['clicks', 'carts', 'orders'] 380 | co_visitation_features(stage, candidate_type, 0, 8) 381 | -------------------------------------------------------------------------------- /features/item_features.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import math 3 | import pandas as pd 4 | import numpy as np 5 | 6 | 7 | def load_data(path): 8 | type_transform = {"clicks": 0, "carts": 1, "orders": 2} 9 | dfs = [] 10 | # 只导入训练数据 11 | for e, chunk_file in enumerate(glob.glob(path)): 12 | chunk = pd.read_parquet(chunk_file) 13 | 
chunk.ts = (chunk.ts / 1000).astype('int32') 14 | chunk['type'] = chunk['type'].map(type_transform).astype('int8') 15 | if not IS_TRAIN: 16 | # 除去第一周的数据 17 | chunk = chunk[chunk['ts'] >= 1659909599] 18 | dfs.append(chunk) 19 | return pd.concat(dfs).reset_index(drop=True) 20 | 21 | 22 | def item_features(input_path, output_path): 23 | print("开始导入数据!!!") 24 | train = load_data(input_path) 25 | 26 | print("开始构造item_feature!!!") 27 | # Step 2:构造item_features 28 | # item_features,使用train data 和valid data 29 | print("开始聚合aid:agg中!!!") 30 | item_features = train.groupby('aid').agg({'aid': 'count', 'session': 'nunique', 'type': ['mean', 'skew'], 31 | 'ts': ['min', 'max', 'skew']}) 32 | # aid出现的次数,也就是aid发生的events数量:定义热门商品;操作aid的用户数量:简介定义热门商品;类型均值:这个商品易购程度 33 | item_features.columns = ['item_item_count', 'item_user_count', 'item_buy_ratio', 'item_buy_skew', 'item_min_ts', 34 | 'item_max_ts', 'item_skew_ts'] 35 | print("开始构造ts偏态峰态中!!!") 36 | # 计算时间偏态系数,计算时间峰态系数,Pandas Series.kurt()函数使用Fisher对峰度的定义(正常的峰度==0.0) 37 | item_features['item_skew_ts'] = item_features['item_skew_ts'].fillna(value=0) 38 | item_features['item_kurt_ts'] = train.groupby('aid')['ts'].apply(lambda x: pd.DataFrame.kurt(x)).fillna(value=0) 39 | 40 | print("开始构造type偏态峰态中!!!") 41 | # 计算类型偏态系数,计算类型峰态系数,Pandas Series.kurt()函数使用Fisher对峰度的定义(正常的峰度==0.0) 42 | item_features['item_buy_skew'] = item_features['item_buy_skew'].fillna(value=0) 43 | item_features['item_buy_kurt'] = train.groupby('aid')['type'].apply(lambda x: pd.DataFrame.kurt(x)).fillna(value=0) 44 | # aids序列持续的时间(天) 45 | print("开始计算ts时间s!!!") 46 | item_features['item_long_ts'] = item_features['item_max_ts'] - item_features['item_min_ts'] 47 | print(item_features) 48 | item_features = item_features.drop(columns=['item_min_ts', 'item_max_ts']) 49 | 50 | print("开始计算aid三个比例特征!!!") 51 | # aid平均每天被观看几次 52 | item_features["item_avg_visit_per_day"] = item_features['item_item_count'] / (item_features['item_long_ts'] / (60 * 53 | 60 * 24)).clip( 54 | 1, 60).apply(lambda x: math.ceil(x)) 55 | item_features["item_repeat_visit_num"] = item_features['item_item_count'] - item_features['item_user_count'] 56 | # 平均每个商品被每个用户观看的次数 57 | item_features["item_ave_visit_num"] = item_features['item_item_count'] / item_features['item_user_count'] 58 | # aids的re_watch比例 59 | item_features["item_re_visit_rate"] = item_features['item_repeat_visit_num'] / item_features['item_item_count'] 60 | 61 | # train 的ts是毫秒,没有除以1000 62 | print("开始导入数据!!!") 63 | # 前三周的训练数据 64 | 65 | time = (train['ts'].max() - train['ts'].min()) / (60 * 60 * 24) 66 | print('天', time) 67 | # 只要后几周的数据 68 | train['ts_minus'] = (train['ts'] - train['ts'].min()) / (60 * 60 * 24) 69 | # 最后一周 70 | print('最后一周') 71 | train1 = train[train['ts_minus'] >= 21].drop(columns='ts_minus') 72 | print(train1) 73 | item_item_count_last_week = train1.groupby('aid').agg({'aid': 'count', 'type': 'mean'}) 74 | item_item_count_last_week.columns = ['item_item_count_last_week', 'item_buy_ratio_last_week'] 75 | print(item_item_count_last_week) 76 | # 最后两周 77 | print('最后两周') 78 | train2 = train[train['ts_minus'] >= 14].drop(columns='ts_minus') 79 | print(train2) 80 | item_item_count_last_two_week = train2.groupby('aid').agg({'aid': 'count', 'type': 'mean'}) 81 | item_item_count_last_two_week.columns = ['item_item_count_last_two_week', 'item_buy_ratio_last_two_week'] 82 | print(item_item_count_last_two_week) 83 | # 最后三周 84 | print('最后三周') 85 | train3 = train[train['ts_minus'] >= 7].drop(columns='ts_minus') 86 | print(train3) 87 | item_item_count_last_three_week = 
train3.groupby('aid').agg({'aid': 'count', 'type': 'mean'}) 88 | item_item_count_last_three_week.columns = ['item_item_count_last_three_week', 'item_buy_ratio_last_three_week'] 89 | print(item_item_count_last_three_week) 90 | 91 | item_features = item_features.merge(item_item_count_last_week, left_index=True, right_index=True, 92 | how='left').fillna(value=-1000) 93 | item_features = item_features.merge(item_item_count_last_two_week, left_index=True, right_index=True, 94 | how='left').fillna(value=-1000) 95 | item_features = item_features.merge(item_item_count_last_three_week, left_index=True, right_index=True, 96 | how='left').fillna(value=-1000) 97 | 98 | print(item_features) 99 | print(item_features.columns) 100 | 101 | # 规定保存格式 102 | item_features = item_features.astype('float32') 103 | print("开始保存特征到文件!!!") 104 | item_features.to_parquet(output_path) 105 | 106 | 107 | def add_item_features(input_path1, input_path2, output_path): 108 | # item feature 109 | # item_feature:点击购买率 item_item 总count / cart/order count 110 | # 点击加购率 111 | # 加购购买率 112 | # 点击占比(点击占全部点击之比) 113 | # 加购占比 114 | # 购买占比 115 | # last_week和last_month 趋势 斜率变化 116 | # 复购率 集中度 117 | # 复加购率 118 | # 复点击率 item_item - item_user 119 | print("开始导入数据!!!") 120 | train = load_data(input_path1) 121 | 122 | train_click = train[train['type'] == 0] 123 | train_cart = train[train['type'] == 1] 124 | train_order = train[train['type'] == 2] 125 | 126 | print("开始构造item_feature!!!") 127 | # 最后一个月 128 | print("开始聚合aid:agg中!!!") 129 | click_item_features = train_click.groupby('aid').agg({'aid': 'count', 'session': 'nunique'}) 130 | # aid出现的次数,也就是aid发生的events数量:定义热门商品;操作aid的用户数量:简介定义热门商品;类型均值:这个商品易购程度 131 | click_item_features.columns = ['click_item_item_count', 'click_item_user_count'] 132 | 133 | cart_item_features = train_cart.groupby('aid').agg({'aid': 'count', 'session': 'nunique'}) 134 | # aid出现的次数,也就是aid发生的events数量:定义热门商品;操作aid的用户数量:简介定义热门商品;类型均值:这个商品易购程度 135 | cart_item_features.columns = ['cart_item_item_count', 'cart_item_user_count'] 136 | 137 | order_item_features = train_order.groupby('aid').agg({'aid': 'count', 'session': 'nunique'}) 138 | # aid出现的次数,也就是aid发生的events数量:定义热门商品;操作aid的用户数量:简介定义热门商品;类型均值:这个商品易购程度 139 | order_item_features.columns = ['order_item_item_count', 'order_item_user_count'] 140 | 141 | click_item_features = click_item_features.merge(cart_item_features, left_index=True, right_index=True, 142 | how='left').fillna(value=0) 143 | click_item_features = click_item_features.merge(order_item_features, left_index=True, right_index=True, 144 | how='left').fillna(value=0) 145 | 146 | # click_item_item_count, click_item_user_count 147 | # 点击购买率 * 3 148 | click_item_features['click_cart_rate'] = click_item_features['cart_item_item_count'] / click_item_features[ 149 | 'click_item_item_count'] 150 | click_item_features['click_order_rate'] = click_item_features['order_item_item_count'] / click_item_features[ 151 | 'click_item_item_count'] 152 | click_item_features['cart_order_rate'] = ( 153 | click_item_features['order_item_item_count'] / click_item_features['cart_item_item_count']) 154 | print(click_item_features['cart_order_rate'].max()) 155 | print(click_item_features['cart_order_rate'].min()) 156 | features = click_item_features[ 157 | (click_item_features['order_item_item_count'] == 0) & (click_item_features['cart_item_item_count'] == 0)] 158 | print(features[['cart_item_item_count', 'order_item_item_count', 'cart_order_rate']]) 159 | # 点击占比 160 | click_item_features['click_percentage'] = 
click_item_features['click_item_item_count'] / click_item_features[ 161 | 'click_item_item_count'].sum() 162 | click_item_features['cart_percentage'] = click_item_features['cart_item_item_count'] / click_item_features[ 163 | 'cart_item_item_count'].sum() 164 | click_item_features['order_percentage'] = click_item_features['order_item_item_count'] / click_item_features[ 165 | 'order_item_item_count'].sum() 166 | # 复购率 167 | click_item_features['re_click_rate'] = (click_item_features['click_item_item_count'] - click_item_features[ 168 | 'click_item_user_count']) / click_item_features['click_item_item_count'] 169 | click_item_features['re_cart_rate'] = (click_item_features['cart_item_item_count'] - click_item_features[ 170 | 'cart_item_user_count']) / click_item_features['cart_item_item_count'] 171 | click_item_features['re_order_rate'] = (click_item_features['order_item_item_count'] - click_item_features[ 172 | 'order_item_user_count']) / click_item_features['order_item_item_count'] 173 | 174 | click_item_features = click_item_features.replace(np.inf, 100) 175 | 176 | print("开始导入valid数据!!!") 177 | valid = load_data(input_path2) 178 | 179 | valid_click = valid[valid['type'] == 0] 180 | valid_cart = valid[valid['type'] == 1] 181 | valid_order = valid[valid['type'] == 2] 182 | 183 | print("开始构造item_feature!!!") 184 | # 最后一个月 185 | print("开始聚合aid:agg中!!!") 186 | valid_click_item_features = valid_click.groupby('aid').agg({'aid': 'count', 'session': 'nunique'}) 187 | # aid出现的次数,也就是aid发生的events数量:定义热门商品;操作aid的用户数量:简介定义热门商品;类型均值:这个商品易购程度 188 | valid_click_item_features.columns = ['click_item_item_count1', 'click_item_user_count1'] 189 | 190 | valid_cart_item_features = valid_cart.groupby('aid').agg({'aid': 'count', 'session': 'nunique'}) 191 | # aid出现的次数,也就是aid发生的events数量:定义热门商品;操作aid的用户数量:简介定义热门商品;类型均值:这个商品易购程度 192 | valid_cart_item_features.columns = ['cart_item_item_count1', 'cart_item_user_count1'] 193 | 194 | valid_order_item_features = valid_order.groupby('aid').agg({'aid': 'count', 'session': 'nunique'}) 195 | # aid出现的次数,也就是aid发生的events数量:定义热门商品;操作aid的用户数量:简介定义热门商品;类型均值:这个商品易购程度 196 | valid_order_item_features.columns = ['order_item_item_count1', 'order_item_user_count1'] 197 | 198 | valid_click_item_features = valid_click_item_features.merge(valid_cart_item_features, left_index=True, 199 | right_index=True, 200 | how='left').fillna(value=0) 201 | valid_click_item_features = valid_click_item_features.merge(valid_order_item_features, left_index=True, 202 | right_index=True, 203 | how='left').fillna(value=0) 204 | # click_item_item_count, click_item_user_count 205 | # 点击购买率 * 3 206 | valid_click_item_features['click_cart_rate1'] = valid_click_item_features['cart_item_item_count1'] / \ 207 | valid_click_item_features[ 208 | 'click_item_item_count1'] 209 | valid_click_item_features['click_order_rate1'] = valid_click_item_features['order_item_item_count1'] / \ 210 | valid_click_item_features[ 211 | 'click_item_item_count1'] 212 | valid_click_item_features['cart_order_rate1'] = valid_click_item_features['order_item_item_count1'] / \ 213 | valid_click_item_features[ 214 | 'cart_item_item_count1'] 215 | # 点击占比 216 | valid_click_item_features['click_percentage1'] = valid_click_item_features['click_item_item_count1'] / \ 217 | valid_click_item_features[ 218 | 'click_item_item_count1'].sum() 219 | valid_click_item_features['cart_percentage1'] = valid_click_item_features['cart_item_item_count1'] / \ 220 | valid_click_item_features[ 221 | 'cart_item_item_count1'].sum() 222 | 
valid_click_item_features['order_percentage1'] = valid_click_item_features['order_item_item_count1'] / \ 223 | valid_click_item_features[ 224 | 'order_item_item_count1'].sum() 225 | # 复购率 226 | valid_click_item_features['re_click_rate1'] = (valid_click_item_features['click_item_item_count1'] - 227 | valid_click_item_features[ 228 | 'click_item_user_count1']) / valid_click_item_features[ 229 | 'click_item_item_count1'] 230 | valid_click_item_features['re_cart_rate1'] = (valid_click_item_features['cart_item_item_count1'] - 231 | valid_click_item_features[ 232 | 'cart_item_user_count1']) / valid_click_item_features[ 233 | 'cart_item_item_count1'] 234 | valid_click_item_features['re_order_rate1'] = (valid_click_item_features['order_item_item_count1'] - 235 | valid_click_item_features[ 236 | 'order_item_user_count1']) / valid_click_item_features[ 237 | 'order_item_item_count1'] 238 | valid_click_item_features = valid_click_item_features.replace(np.inf, 100) 239 | 240 | # 缺失值用-1填补,相减后也是负数,小于等于-1 241 | click_item_features = click_item_features.merge(valid_click_item_features, left_index=True, right_index=True, 242 | how='left').fillna(value=-10) 243 | # 点击加购率 244 | click_item_features['click_cart_rate_trend'] = ( 245 | click_item_features['click_cart_rate1'] - click_item_features['click_cart_rate']).clip(-10) 246 | click_item_features['click_order_rate_trend'] = ( 247 | click_item_features['click_order_rate1'] - click_item_features['click_order_rate']).clip(-10) 248 | click_item_features['cart_order_rate_trend'] = ( 249 | click_item_features['cart_order_rate1'] - click_item_features['cart_order_rate']).clip(-10) 250 | # 点击占比 251 | click_item_features['click_percentage_trend'] = ( 252 | click_item_features['click_percentage1'] - click_item_features['click_percentage']).clip(-10) 253 | click_item_features['cart_percentage_trend'] = ( 254 | click_item_features['cart_percentage1'] - click_item_features['cart_percentage']).clip(-10) 255 | click_item_features['order_percentage_trend'] = ( 256 | click_item_features['order_percentage1'] - click_item_features['order_percentage']).clip(-10) 257 | # 复购率 258 | click_item_features['re_click_rate_trend'] = ( 259 | click_item_features['re_click_rate1'] - click_item_features['re_click_rate']).clip(-10) 260 | click_item_features['re_cart_rate_trend'] = ( 261 | click_item_features['re_cart_rate1'] - click_item_features['re_cart_rate']).clip(-10) 262 | click_item_features['re_order_rate_trend'] = ( 263 | click_item_features['re_order_rate1'] - click_item_features['re_order_rate']).clip(-10) 264 | 265 | print(click_item_features) 266 | print(click_item_features.describe()) 267 | 268 | print("开始保存特征到文件!!!") 269 | click_item_features.to_parquet(output_path) 270 | 271 | 272 | def trans_time_span_item_features(input_path, output_path1, output_path2, output_path3): 273 | train = load_data(input_path) 274 | 275 | train_clicks = train[train['type'] == 0].drop(columns='type') 276 | train_clicks = train_clicks.rename(columns={'ts': 'ts_click'}) 277 | train_carts = train[train['type'] == 1].drop(columns='type') 278 | train_carts = train_carts.rename(columns={'ts': 'ts_cart'}) 279 | train_orders = train[train['type'] == 2].drop(columns='type') 280 | train_orders = train_orders.rename(columns={'ts': 'ts_order'}) 281 | 282 | print('click_cart_span') 283 | click_cart_span = train_clicks.merge(train_carts, on=['session', 'aid'], how='inner') 284 | print(click_cart_span) 285 | click_cart_span['min'] = click_cart_span['ts_click'] - click_cart_span['ts_cart'] 286 | click_cart_span = 
click_cart_span[click_cart_span['min'] <= 0].drop(columns='min') 287 | print(click_cart_span) 288 | click_cart_span_feature = click_cart_span.groupby(['session', 'aid']).agg({'ts_click': 'min', 'ts_cart': 'min'}) 289 | click_cart_span_feature.columns = ['ts_click_min', 'ts_cart_min'] 290 | print(click_cart_span_feature) 291 | click_cart_span_feature['click_cart_span'] = click_cart_span_feature['ts_cart_min'] - click_cart_span_feature[ 292 | 'ts_click_min'] 293 | print(click_cart_span_feature) 294 | click_cart_span_feature['aids'] = click_cart_span_feature.index.get_level_values('aid') 295 | print(click_cart_span_feature) 296 | print(click_cart_span_feature.index.get_level_values('aid')[:10]) 297 | click_cart_span_feature = click_cart_span_feature.groupby('aids').agg({'aids': 'count', 'click_cart_span': 'mean'}) 298 | click_cart_span_feature.columns = ['trans_click_cart_count', 'trans_click_cart_span_avg'] 299 | print(click_cart_span_feature.describe()) 300 | print(click_cart_span_feature) 301 | click_cart_span_feature.to_parquet(output_path1) 302 | 303 | print('click_order_span') 304 | click_order_span = train_clicks.merge(train_orders, on=['session', 'aid'], how='inner') 305 | print(click_order_span) 306 | click_order_span['min'] = click_order_span['ts_click'] - click_order_span['ts_order'] 307 | click_order_span = click_order_span[click_order_span['min'] <= 0].drop(columns='min') 308 | print(click_order_span) 309 | click_order_span_feature = click_order_span.groupby(['session', 'aid']).agg({'ts_click': 'min', 'ts_order': 'min'}) 310 | click_order_span_feature.columns = ['ts_click_min', 'ts_order_min'] 311 | print(click_order_span_feature) 312 | click_order_span_feature['click_order_span'] = click_order_span_feature['ts_order_min'] - click_order_span_feature[ 313 | 'ts_click_min'] 314 | print(click_order_span_feature) 315 | click_order_span_feature['aids'] = click_order_span_feature.index.get_level_values('aid') 316 | print(click_order_span_feature) 317 | print(click_order_span_feature.index.get_level_values('aid')[:10]) 318 | click_order_span_feature = click_order_span_feature.groupby('aids').agg( 319 | {'aids': 'count', 'click_order_span': 'mean'}) 320 | click_order_span_feature.columns = ['trans_click_order_count', 'trans_click_order_span_avg'] 321 | print(click_order_span_feature.describe()) 322 | print(click_order_span_feature) 323 | click_order_span_feature.to_parquet(output_path2) 324 | 325 | print('cart_order_span') 326 | carts_order_span = train_carts.merge(train_orders, on=['session', 'aid'], how='inner') 327 | print(carts_order_span) 328 | carts_order_span['min'] = carts_order_span['ts_cart'] - carts_order_span['ts_order'] 329 | carts_order_span = carts_order_span[carts_order_span['min'] <= 0].drop(columns='min') 330 | print(carts_order_span) 331 | cart_order_span_feature = carts_order_span.groupby(['session', 'aid']).agg({'ts_cart': 'min', 'ts_order': 'min'}) 332 | cart_order_span_feature.columns = ['ts_cart_min', 'ts_order_min'] 333 | print(cart_order_span_feature) 334 | cart_order_span_feature['cart_order_span'] = cart_order_span_feature['ts_order_min'] - cart_order_span_feature[ 335 | 'ts_cart_min'] 336 | print(cart_order_span_feature) 337 | cart_order_span_feature['aids'] = cart_order_span_feature.index.get_level_values('aid') 338 | print(cart_order_span_feature) 339 | print(cart_order_span_feature.index.get_level_values('aid')[:10]) 340 | cart_order_span_feature = cart_order_span_feature.groupby('aids').agg({'aids': 'count', 'cart_order_span': 'mean'}) 341 | 
cart_order_span_feature.columns = ['trans_cart_order_count', 'trans_cart_order_span_avg']
342 |     print(cart_order_span_feature.describe())
343 |     print(cart_order_span_feature)
344 |     cart_order_span_feature.to_parquet(output_path3)
345 | 
346 | 
347 | if __name__ == '__main__':
348 |     IS_TRAIN = True
349 |     if IS_TRAIN:
350 |         stage = 'CV'
351 |     else:
352 |         stage = 'LB'
353 |     input_path = f'/home/niejianfei/otto/{stage}/data/*_parquet/*'
354 |     input_path2 = f'/home/niejianfei/otto/{stage}/data/test_parquet/*'
355 |     output_path = f'/home/niejianfei/otto/{stage}/preprocess/item_features.pqt'
356 |     output_path1 = f'/home/niejianfei/otto/{stage}/preprocess/add_item_features.pqt'
357 |     item_features(input_path, output_path)
358 |     add_item_features(input_path, input_path2, output_path1)
359 | 
360 |     input_path3 = f'/home/niejianfei/otto/{stage}/data/train_parquet/*'
361 |     output_path2 = f'/home/niejianfei/otto/{stage}/preprocess/click_cart_item_features.pqt'
362 |     output_path3 = f'/home/niejianfei/otto/{stage}/preprocess/click_order_item_features.pqt'
363 |     output_path4 = f'/home/niejianfei/otto/{stage}/preprocess/cart_order_item_features.pqt'
364 |     trans_time_span_item_features(input_path3, output_path2, output_path3, output_path4)
365 | 
--------------------------------------------------------------------------------
/features/recall_features.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | 
3 | 
4 | # load the candidates data produced by the recall stage
5 | def recall_features(stage, candidate_type):
6 |     type_transform = {"clicks": 0, "carts": 1, "orders": 2}
7 |     print("Loading data!!!")
8 |     all_candidates = pd.read_parquet(f'/home/niejianfei/otto/{stage}/candidates/candidates.pqt')
9 |     print('Number of candidate rows:', len(all_candidates))
10 | 
11 |     print("Processing candidates!!!")
12 |     # tag each row with its objective type
13 |     print("Mapping types!!!")
14 |     all_candidates["type"] = all_candidates.session_type.apply(lambda x: x.split("_")[1])
15 |     all_candidates["type"] = all_candidates["type"].map(type_transform).astype('int8')
16 |     for t in candidate_type:
17 |         print(f"Keeping only {t}!!!")
18 |         candidates = all_candidates[all_candidates['type'] == type_transform[t]].copy()
19 |         print("Candidate rows for this objective:", len(candidates))
20 |         # explode labels: session_type, labels, type
21 |         print("Exploding!!!")
22 |         candidates["labels"] = candidates["labels"].apply(lambda x: x.split(" "))
23 |         candidates = candidates.explode("labels")
24 |         # recall-source id: session_type, labels, type, candidate_type
25 |         print("candidate_type")
26 |         candidates["candidate_type"] = candidates["labels"].apply(lambda x: x.split('#')[1]).astype('float32').astype(
27 |             'int32')
28 |         # recall score: session_type, labels, type, candidate_type, candidate_type_scores
29 |         print("candidate_type_scores")
30 |         candidates["candidate_type_scores"] = candidates["labels"].apply(lambda x: x.split('#')[2]).astype('float32')
31 |         # candidate aid: session_type, labels, type, candidate_type, candidate_type_scores
32 |         print("labels")
33 |         candidates["labels"] = candidates["labels"].apply(lambda x: x.split('#')[0]).astype('int32')
34 |         candidates["session_type"] = candidates.session_type.apply(lambda x: x.split("_")[0]).astype("int32")
35 |         candidates.rename(columns={'session_type': 'session', 'labels': 'aid'}, inplace=True)
36 |         print(candidates)
37 | 
38 |         # 'session', 'aid', 'type', 'candidate_type', 'candidate_type_scores'
39 |         # history_aid, sim_aid, top_hot_aid, top_orders_aid
40 |         candidate_type_dic = {1: 'history_aid', 2: 'sim_aid', 3: 'top_hot_aid', 4: 'top_orders_aid',
41 |                               5: 'top_carts_aid', 6: 'top_hot_aid_last_month', 7: 'deepwalk', 8: 'word2vec'}
42 |         candidate_type_scores_dic = 
{1: 'history_aid_score', 2: 'sim_aid_score', 3: 'top_hot_aid_score', 43 | 4: 'top_orders_aid_score', 5: 'top_carts_aid_score', 44 | 6: 'top_hot_aid_last_month_score', 7: 'deepwalk_score', 45 | 8: 'word2vec_score'} 46 | print('开始merge!!!') 47 | candidates1 = candidates[candidates['candidate_type'] == 1] 48 | candidates1.columns = ['session', 'aid', 'type', 'history_aid', 'history_aid_score'] 49 | candidates1 = candidates1.sort_values(['session', 'history_aid_score'], ascending=[True, False]) 50 | candidates1['history_aid_rank'] = candidates1.groupby('session')['aid'].cumcount() 51 | print(candidates1) 52 | for i in range(7): 53 | temp_df = candidates[candidates['candidate_type'] == i + 2] 54 | temp_df['candidate_type'] = 1 55 | temp_df.rename(columns={'candidate_type': f'{candidate_type_dic[i + 2]}', 56 | 'candidate_type_scores': f'{candidate_type_scores_dic[i + 2]}'}, inplace=True) 57 | temp_df = temp_df.sort_values(['session', f'{candidate_type_scores_dic[i + 2]}'], 58 | ascending=[True, False]) 59 | temp_df[f'{candidate_type_dic[i + 2]}_rank'] = temp_df.groupby('session')['aid'].cumcount() 60 | print(temp_df) 61 | candidates1 = candidates1.merge(temp_df, on=['session', 'aid', 'type'], how='outer').fillna(value=-1) 62 | print(candidates1) 63 | candidates1.to_parquet(f'/home/niejianfei/otto/{stage}/candidates/candidates_{t}.pqt') 64 | print('保存完毕') 65 | 66 | 67 | if __name__ == '__main__': 68 | IS_TRAIN = True 69 | candidate_type = ['clicks', 'carts', 'orders'] 70 | if IS_TRAIN: 71 | stage = 'CV' 72 | else: 73 | stage = 'LB' 74 | 75 | recall_features(stage, candidate_type) 76 | -------------------------------------------------------------------------------- /features/similarity_features.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import pickle 3 | import gensim 4 | import pandas as pd 5 | import numpy as np 6 | from sklearn.metrics.pairwise import cosine_similarity 7 | 8 | 9 | def load_validate(path): 10 | type_transform = {"clicks": 0, "carts": 1, "orders": 2} 11 | dfs = [] 12 | # 只导入训练数据 13 | for e, chunk_file in enumerate(glob.glob(path)): 14 | chunk = pd.read_parquet(chunk_file) 15 | chunk.ts = (chunk.ts / 1000).astype('int32') 16 | chunk['type'] = chunk['type'].map(type_transform).astype('int8') 17 | dfs.append(chunk) 18 | return pd.concat(dfs).reset_index(drop=True) 19 | 20 | 21 | def calculate_deepwalk_similarity(string, model): 22 | list = string.split(' ') 23 | if int(list[-1]) < 0: 24 | return '-10 -10' 25 | sim = [] 26 | aid = 'item_' + list[0] 27 | for i in list[1:]: 28 | simm = model.similarity(f'item_{i}', aid) 29 | sim.append(simm) 30 | sim_mean = sum(sim) / len(sim) 31 | sim_max = max(sim) 32 | return str(sim_mean) + ' ' + str(sim_max) 33 | 34 | 35 | # deepwalk,i2i相似度buys和clicks相似度的mean和max 36 | def deepwalk_i2i_similarity1(stage, candidate_type, start, end): 37 | print('开始读取数据!!!') 38 | valid = load_validate(f'/home/niejianfei/otto/{stage}/data/test_parquet/*') 39 | print(valid) 40 | print('开始筛选buys') 41 | valid1 = valid[valid['type'] != 0] 42 | print(valid1) 43 | print('开始排序') 44 | # 分别对session_id聚合,对时间进行排序 45 | df = valid1.sort_values(by=["session", "ts"], ascending=True) 46 | print('生成list') 47 | sentences_df = pd.DataFrame(df.groupby('session')['aid'].agg(list)) 48 | sentences_df.columns = ['carts_and_orders'] 49 | sentences_df["carts_and_orders_str"] = sentences_df.carts_and_orders.apply(lambda x: " ".join(map(str, x))) 50 | sentences_df = sentences_df.drop(columns='carts_and_orders') 51 | 
print(sentences_df) 52 | print('开始筛选clicks') 53 | valid2 = valid[valid['type'] == 0] 54 | print(valid2) 55 | print('开始排序') 56 | # 分别对session_id聚合,对时间进行排序 57 | df1 = valid2.sort_values(by=["session", "ts"], ascending=True) 58 | print('生成list') 59 | sentences_df1 = pd.DataFrame(df1.groupby('session')['aid'].agg(list)) 60 | sentences_df1.columns = ['clicks'] 61 | sentences_df1["clicks_str"] = sentences_df1.clicks.apply(lambda x: " ".join(map(str, x))) 62 | sentences_df1 = sentences_df1.drop(columns='clicks') 63 | print(sentences_df1) 64 | 65 | print('开始读取词向量!!') 66 | word2vec_last_month = gensim.models.KeyedVectors.load_word2vec_format( 67 | f'/home/niejianfei/otto/{stage}/preprocess/deepwalk_last_month.w2v', 68 | binary=False) 69 | for t in candidate_type: 70 | # 只导入训练数据 71 | print('开始导入数据') 72 | for i in range(start, end): 73 | path = f"/home/niejianfei/otto/{stage}/candidates/candidates_{t[0:-1]}_features_data/candidate_{t[0:-1]}_{i}.pqt" 74 | print(f'第{i + 1}块数据') 75 | chunk = pd.read_parquet(path) 76 | print(path) 77 | print(chunk.columns) 78 | chunk = chunk.astype("float32") 79 | chunk['session'] = chunk['session'].astype('int32') 80 | chunk['aid'] = chunk['aid'].astype('int32') 81 | print(chunk) 82 | 83 | chunk = chunk.merge(sentences_df, left_on='session', right_index=True, how='left').fillna(value=-1) 84 | print(chunk) 85 | chunk['sim_list'] = chunk['aid'].astype('str') + ' ' + chunk['carts_and_orders_str'].astype('str') 86 | print('开始计算相似度!!!') 87 | chunk['sim_score_str'] = chunk['sim_list'].apply( 88 | lambda x: calculate_deepwalk_similarity(x, word2vec_last_month)) 89 | print(chunk[['carts_and_orders_str', 'sim_list', 'sim_score_str']]) 90 | chunk['buys_sim_mean'] = chunk['sim_score_str'].apply(lambda x: float(x.split(' ')[0])) 91 | chunk['buys_sim_max'] = chunk['sim_score_str'].apply(lambda x: float(x.split(' ')[1])) 92 | print(chunk[chunk['buys_sim_mean'] != -10]) 93 | 94 | chunk = chunk.merge(sentences_df1, left_on='session', right_index=True, how='left').fillna(value=-1) 95 | print(chunk) 96 | chunk['clicks_sim_list'] = chunk['aid'].astype('str') + ' ' + chunk['clicks_str'].astype('str') 97 | print('click开始计算相似度!!!') 98 | chunk['clicks_sim_score_str'] = chunk['clicks_sim_list'].apply( 99 | lambda x: calculate_deepwalk_similarity(x, word2vec_last_month)) 100 | print(chunk[['clicks_str', 'clicks_sim_list', 'clicks_sim_score_str']]) 101 | chunk['clicks_sim_mean'] = chunk['clicks_sim_score_str'].apply(lambda x: float(x.split(' ')[0])) 102 | chunk['clicks_sim_max'] = chunk['clicks_sim_score_str'].apply(lambda x: float(x.split(' ')[1])) 103 | print(chunk[chunk['clicks_sim_mean'] != -10]) 104 | 105 | chunk = chunk.drop( 106 | columns=['carts_and_orders_str', 'sim_list', 'sim_score_str', 'clicks_str', 'clicks_sim_list', 107 | 'clicks_sim_score_str']) 108 | print(chunk[['buys_sim_mean', 'buys_sim_max', 'clicks_sim_mean', 'clicks_sim_max']]) 109 | print(chunk) 110 | chunk.to_parquet(path) 111 | 112 | 113 | # deepwalk,i2i相似度orders和carts相似度的mean和max 114 | def deepwalk_i2i_similarity2(stage, candidate_type, start, end): 115 | print('开始读取数据!!!') 116 | valid = load_validate(f'/home/niejianfei/otto/{stage}/data/test_parquet/*') 117 | print(valid) 118 | print('开始筛选buys') 119 | valid1 = valid[valid['type'] == 2] 120 | print(valid1) 121 | print('开始排序') 122 | # 分别对session_id聚合,对时间进行排序 123 | df = valid1.sort_values(by=["session", "ts"], ascending=True) 124 | print('生成list') 125 | sentences_df = pd.DataFrame(df.groupby('session')['aid'].agg(list)) 126 | sentences_df.columns = ['carts_and_orders'] 
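# Illustrative note on the intermediate format built above (toy aids, not from the data): at this
# point valid1 holds *orders only*, even though the aggregated column is still named
# 'carts_and_orders'. Each session's order history is flattened into one space-separated string,
# e.g. a session whose ordered aids are [11, 42, 11] becomes "11 42 11". Downstream, the candidate
# aid is prefixed to that string and scored roughly like
#     calculate_deepwalk_similarity('7 11 42 11', word2vec_last_month)
# which returns "<mean sim> <max sim>" over sim(item_7, item_11), sim(item_7, item_42),
# sim(item_7, item_11); candidates whose session has no order history fall through to the
# '-10 -10' sentinel after the fillna(-1) merge below.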
127 | sentences_df["carts_and_orders_str"] = sentences_df.carts_and_orders.apply(lambda x: " ".join(map(str, x))) 128 | sentences_df = sentences_df.drop(columns='carts_and_orders') 129 | print(sentences_df) 130 | print('开始筛选clicks') 131 | valid2 = valid[valid['type'] == 1] 132 | print(valid2) 133 | print('开始排序') 134 | # 分别对session_id聚合,对时间进行排序 135 | df1 = valid2.sort_values(by=["session", "ts"], ascending=True) 136 | print('生成list') 137 | sentences_df1 = pd.DataFrame(df1.groupby('session')['aid'].agg(list)) 138 | sentences_df1.columns = ['clicks'] 139 | sentences_df1["clicks_str"] = sentences_df1.clicks.apply(lambda x: " ".join(map(str, x))) 140 | sentences_df1 = sentences_df1.drop(columns='clicks') 141 | print(sentences_df1) 142 | 143 | print('开始读取词向量!!') 144 | word2vec_last_month = gensim.models.KeyedVectors.load_word2vec_format( 145 | f'/home/niejianfei/otto/{stage}/preprocess/deepwalk_last_month.w2v', 146 | binary=False) 147 | for t in candidate_type: 148 | # 只导入训练数据 149 | print('开始导入数据') 150 | for i in range(start, end): 151 | path = f"/home/niejianfei/otto/{stage}/candidates/candidates_{t[0:-1]}_features_data/candidate_{t[0:-1]}_{i}.pqt" 152 | print(f'第{i + 1}块数据') 153 | chunk = pd.read_parquet(path) 154 | print(path) 155 | print(chunk.columns) 156 | chunk = chunk.astype("float32") 157 | chunk['session'] = chunk['session'].astype('int32') 158 | chunk['aid'] = chunk['aid'].astype('int32') 159 | print(chunk) 160 | 161 | chunk = chunk.merge(sentences_df, left_on='session', right_index=True, how='left').fillna(value=-1) 162 | print(chunk) 163 | chunk['sim_list'] = chunk['aid'].astype('str') + ' ' + chunk['carts_and_orders_str'].astype('str') 164 | print('开始计算相似度!!!') 165 | chunk['sim_score_str'] = chunk['sim_list'].apply( 166 | lambda x: calculate_deepwalk_similarity(x, word2vec_last_month)) 167 | print(chunk[['carts_and_orders_str', 'sim_list', 'sim_score_str']]) 168 | chunk['orders_sim_mean'] = chunk['sim_score_str'].apply(lambda x: float(x.split(' ')[0])) 169 | chunk['orders_sim_max'] = chunk['sim_score_str'].apply(lambda x: float(x.split(' ')[1])) 170 | print(chunk[chunk['orders_sim_mean'] != -10]) 171 | 172 | chunk = chunk.merge(sentences_df1, left_on='session', right_index=True, how='left').fillna(value=-1) 173 | print(chunk) 174 | chunk['clicks_sim_list'] = chunk['aid'].astype('str') + ' ' + chunk['clicks_str'].astype('str') 175 | print('click开始计算相似度!!!') 176 | chunk['clicks_sim_score_str'] = chunk['clicks_sim_list'].apply( 177 | lambda x: calculate_deepwalk_similarity(x, word2vec_last_month)) 178 | print(chunk[['clicks_str', 'clicks_sim_list', 'clicks_sim_score_str']]) 179 | chunk['carts_sim_mean'] = chunk['clicks_sim_score_str'].apply(lambda x: float(x.split(' ')[0])) 180 | chunk['carts_sim_max'] = chunk['clicks_sim_score_str'].apply(lambda x: float(x.split(' ')[1])) 181 | print(chunk[chunk['carts_sim_mean'] != -10]) 182 | 183 | chunk = chunk.drop( 184 | columns=['carts_and_orders_str', 'sim_list', 'sim_score_str', 'clicks_str', 'clicks_sim_list', 185 | 'clicks_sim_score_str']) 186 | print(chunk[['orders_sim_mean', 'orders_sim_max', 'carts_sim_mean', 'carts_sim_max']]) 187 | print(chunk) 188 | chunk.to_parquet(path) 189 | 190 | 191 | def calculate_deepwalk_similarity_tail(string, model): 192 | list = string.split(' ') 193 | if int(list[-1]) < 0: 194 | return '-10 -10 -10' 195 | sim = [] 196 | aid = 'item_' + list[0] 197 | for i in list[1:]: 198 | simm = model.similarity(f'item_{i}', aid) 199 | sim.append(simm) 200 | if len(sim) >= 3: 201 | return str(sim[-1]) + ' ' + 
str(sim[-2]) + ' ' + str(sim[-3]) 202 | elif len(sim) == 2: 203 | return str(sim[-1]) + ' ' + str(sim[-2]) + ' -10' 204 | else: 205 | return str(sim[-1]) + ' -10 -10' 206 | 207 | 208 | def deepwalk_i2i_similarity_tail(stage, candidate_type, start, end): 209 | print('开始读取数据!!!') 210 | valid = load_validate(f'/home/niejianfei/otto/{stage}/data/test_parquet/*') 211 | print(valid) 212 | print('开始筛选') 213 | 214 | valid1 = valid[valid['type'] != 0] 215 | print(valid1) 216 | print('开始排序') 217 | # 分别对session_id聚合,对时间进行排序 218 | df = valid1.sort_values(by=["session", "ts"], ascending=True) 219 | print('生成list') 220 | sentences_df = pd.DataFrame(df.groupby('session')['aid'].agg(list)) 221 | sentences_df.columns = ['carts_and_orders'] 222 | sentences_df["carts_and_orders_str"] = sentences_df.carts_and_orders.apply(lambda x: " ".join(map(str, x))) 223 | sentences_df = sentences_df.drop(columns='carts_and_orders') 224 | print(sentences_df) 225 | 226 | valid2 = valid[valid['type'] == 0] 227 | print(valid2) 228 | print('开始排序') 229 | # 分别对session_id聚合,对时间进行排序 230 | df1 = valid2.sort_values(by=["session", "ts"], ascending=True) 231 | print('生成list') 232 | sentences_df1 = pd.DataFrame(df1.groupby('session')['aid'].agg(list)) 233 | sentences_df1.columns = ['clicks'] 234 | sentences_df1["clicks_str"] = sentences_df1.clicks.apply(lambda x: " ".join(map(str, x))) 235 | sentences_df1 = sentences_df1.drop(columns='clicks') 236 | print(sentences_df1) 237 | 238 | print('开始读取词向量!!') 239 | word2vec_last_month = gensim.models.KeyedVectors.load_word2vec_format(f'/home/niejianfei/otto/{stage}/preprocess/deepwalk_last_month.w2v', 240 | binary=False) 241 | for t in candidate_type: 242 | # 只导入训练数据 243 | print('开始导入数据') 244 | for i in range(start, end): 245 | path = f"/home/niejianfei/otto/{stage}/candidates/candidates_{t[0:-1]}_features_data/candidate_{t[0:-1]}_{i}.pqt" 246 | print(f'第{i + 1}块数据') 247 | chunk = pd.read_parquet(path) 248 | print(path) 249 | chunk = chunk.astype("float32") 250 | chunk['session'] = chunk['session'].astype('int32') 251 | chunk['aid'] = chunk['aid'].astype('int32') 252 | print(chunk) 253 | print(chunk.columns) 254 | 255 | print('merge') 256 | chunk = chunk.merge(sentences_df, left_on='session', right_index=True, how='left').fillna(value=-1) 257 | print(chunk) 258 | chunk['sim_list'] = chunk['aid'].astype('str') + ' ' + chunk['carts_and_orders_str'].astype('str') 259 | print('开始计算相似度!!!') 260 | chunk['sim_score_str'] = chunk['sim_list'].apply(lambda x: calculate_deepwalk_similarity_tail(x, word2vec_last_month)) 261 | print(chunk[['carts_and_orders_str', 'sim_list', 'sim_score_str']]) 262 | chunk['buys_sim_-1'] = chunk['sim_score_str'].apply(lambda x: float(x.split(' ')[0])) 263 | chunk['buys_sim_-2'] = chunk['sim_score_str'].apply(lambda x: float(x.split(' ')[1])) 264 | chunk['buys_sim_-3'] = chunk['sim_score_str'].apply(lambda x: float(x.split(' ')[2])) 265 | print(chunk[chunk['buys_sim_-1'] != -10]) 266 | 267 | chunk = chunk.merge(sentences_df1, left_on='session', right_index=True, how='left').fillna(value=-1) 268 | print(chunk) 269 | chunk['clicks_sim_list'] = chunk['aid'].astype('str') + ' ' + chunk['clicks_str'].astype('str') 270 | print('click开始计算相似度!!!') 271 | chunk['clicks_sim_score_str'] = chunk['clicks_sim_list'].apply( 272 | lambda x: calculate_deepwalk_similarity_tail(x, word2vec_last_month)) 273 | print(chunk[['clicks_str', 'clicks_sim_list', 'clicks_sim_score_str']]) 274 | chunk['clicks_sim_-1'] = chunk['clicks_sim_score_str'].apply(lambda x: float(x.split(' ')[0])) 275 | 
chunk['clicks_sim_-2'] = chunk['clicks_sim_score_str'].apply(lambda x: float(x.split(' ')[1])) 276 | chunk['clicks_sim_-3'] = chunk['clicks_sim_score_str'].apply(lambda x: float(x.split(' ')[2])) 277 | print(chunk[chunk['clicks_sim_-1'] != -10]) 278 | 279 | chunk = chunk.drop( 280 | columns=['carts_and_orders_str', 'sim_list', 'sim_score_str', 'clicks_str', 'clicks_sim_list', 281 | 'clicks_sim_score_str']) 282 | print(chunk[['buys_sim_-1', 'buys_sim_-2', 'clicks_sim_-1', 'clicks_sim_-2']]) 283 | print(chunk.columns) 284 | print(chunk) 285 | chunk.to_parquet(path) 286 | 287 | 288 | def calculate_deepwalk_u2i_similarity(string, model): 289 | list = string.split(' ') 290 | if int(list[-1]) < 0: 291 | return '-10' 292 | aid_emb = np.array(model[f'item_{list[0]}']) 293 | user_emb = np.zeros(64) 294 | for i in list[1:]: 295 | user_emb += np.array(model[f'item_{i}']) / (len(list) - 1) 296 | 297 | cos_sim = cosine_similarity(aid_emb.reshape(1, -1), user_emb.reshape(1, -1)) 298 | 299 | return str(cos_sim[0][0]) 300 | 301 | 302 | # deepwalk,u2i相似度orders和carts相似度的mean和max 303 | def deepwalk_u2i_similarity(stage, candidate_type, start, end): 304 | print('开始读取数据!!!') 305 | valid = load_validate(f'/home/niejianfei/otto/{stage}/data/test_parquet/*') 306 | print(valid) 307 | 308 | print('开始筛选order') 309 | valid1 = valid[valid['type'] == 2] 310 | print(valid1) 311 | print('开始排序') 312 | # 分别对session_id聚合,对时间进行排序 313 | df = valid1.sort_values(by=["session", "ts"], ascending=True) 314 | print(df.head(10)) 315 | print('生成list') 316 | sentences_df = pd.DataFrame(df.groupby('session')['aid'].agg(list)) 317 | sentences_df.columns = ['carts_and_orders'] 318 | print(sentences_df) 319 | sentences_df["carts_and_orders_str"] = sentences_df.carts_and_orders.apply(lambda x: " ".join(map(str, x))) 320 | sentences_df = sentences_df.drop(columns='carts_and_orders') 321 | print(sentences_df) 322 | 323 | print('开始筛选cart') 324 | valid2 = valid[valid['type'] == 1] 325 | print(valid2) 326 | print('开始排序') 327 | # 分别对session_id聚合,对时间进行排序 328 | df1 = valid2.sort_values(by=["session", "ts"], ascending=True) 329 | print(df1.head(10)) 330 | print('生成list') 331 | sentences_df1 = pd.DataFrame(df1.groupby('session')['aid'].agg(list)) 332 | sentences_df1.columns = ['clicks'] 333 | print(sentences_df1) 334 | sentences_df1["clicks_str"] = sentences_df1.clicks.apply(lambda x: " ".join(map(str, x))) 335 | sentences_df1 = sentences_df1.drop(columns='clicks') 336 | print(sentences_df1) 337 | 338 | print('开始筛选click') 339 | valid3 = valid[valid['type'] == 0] 340 | print(valid3) 341 | print('开始排序') 342 | # 分别对session_id聚合,对时间进行排序 343 | df2 = valid3.sort_values(by=["session", "ts"], ascending=True) 344 | print(df2.head(10)) 345 | print('生成list') 346 | sentences_df2 = pd.DataFrame(df2.groupby('session')['aid'].agg(list)) 347 | sentences_df2.columns = ['clicks'] 348 | print(sentences_df2) 349 | sentences_df2["clicks_str1"] = sentences_df2.clicks.apply(lambda x: " ".join(map(str, x))) 350 | sentences_df2 = sentences_df2.drop(columns='clicks') 351 | print(sentences_df2) 352 | 353 | print('开始读取词向量!!') 354 | word2vec_last_month = gensim.models.KeyedVectors.load_word2vec_format('/home/niejianfei/deepwalk_last_month.w2v', 355 | binary=False) 356 | for t in candidate_type: 357 | # 只导入训练数据 358 | print('开始导入数据') 359 | for i in range(start, end): 360 | path = f"/home/niejianfei/otto/{stage}/candidates/candidates_{t[0:-1]}_features_data/candidate_{t[0:-1]}_{i}.pqt" 361 | print(f'第{i + 1}块数据') 362 | chunk = pd.read_parquet(path) 363 | print(path) 364 | 
print(chunk.columns) 365 | chunk = chunk.astype("float32") 366 | chunk['session'] = chunk['session'].astype('int32') 367 | chunk['aid'] = chunk['aid'].astype('int32') 368 | print(chunk) 369 | 370 | chunk = chunk.merge(sentences_df, left_on='session', right_index=True, how='left').fillna(value=-1) 371 | print(chunk) 372 | chunk['sim_list'] = chunk['aid'].astype('str') + ' ' + chunk['carts_and_orders_str'].astype('str') 373 | print('order开始计算相似度!!!') 374 | chunk['sim_score_str'] = chunk['sim_list'].apply( 375 | lambda x: calculate_deepwalk_u2i_similarity(x, word2vec_last_month)) 376 | print(chunk[['carts_and_orders_str', 'sim_list', 'sim_score_str']]) 377 | chunk['orders_user_item_sim'] = chunk['sim_score_str'].astype('float32') 378 | print(chunk[chunk['orders_user_item_sim'] != -10]) 379 | 380 | chunk = chunk.merge(sentences_df1, left_on='session', right_index=True, how='left').fillna(value=-1) 381 | print(chunk) 382 | chunk['clicks_sim_list'] = chunk['aid'].astype('str') + ' ' + chunk['clicks_str'].astype('str') 383 | print('cart开始计算相似度!!!') 384 | chunk['clicks_sim_score_str'] = chunk['clicks_sim_list'].apply( 385 | lambda x: calculate_deepwalk_u2i_similarity(x, word2vec_last_month)) 386 | print(chunk[['clicks_str', 'clicks_sim_list', 'clicks_sim_score_str']]) 387 | chunk['carts_user_item_sim'] = chunk['clicks_sim_score_str'].astype('float32') 388 | print(chunk[chunk['carts_user_item_sim'] != -10]) 389 | 390 | chunk = chunk.merge(sentences_df2, left_on='session', right_index=True, how='left').fillna(value=-1) 391 | print(chunk) 392 | chunk['clicks_sim_list1'] = chunk['aid'].astype('str') + ' ' + chunk['clicks_str1'].astype('str') 393 | print('click开始计算相似度!!!') 394 | chunk['clicks_sim_score_str1'] = chunk['clicks_sim_list1'].apply( 395 | lambda x: calculate_deepwalk_u2i_similarity(x, word2vec_last_month)) 396 | print(chunk[['clicks_str1', 'clicks_sim_list1', 'clicks_sim_score_str1']]) 397 | chunk['clicks_user_item_sim'] = chunk['clicks_sim_score_str1'].astype('float32') 398 | print(chunk[chunk['clicks_user_item_sim'] != -10]) 399 | 400 | chunk = chunk.drop( 401 | columns=['carts_and_orders_str', 'sim_list', 'sim_score_str', 'clicks_str', 'clicks_sim_list', 402 | 'clicks_sim_score_str', 'clicks_str1', 'clicks_sim_list1', 403 | 'clicks_sim_score_str1']) 404 | print(chunk[['orders_user_item_sim', 'carts_user_item_sim', 'clicks_user_item_sim']]) 405 | print(chunk.columns) 406 | print(chunk) 407 | chunk.to_parquet(path) 408 | 409 | 410 | def calculate_prone_similarity(string, model, aid_num_dict): 411 | list = string.split(' ') 412 | if int(list[-1]) < 0: 413 | return '-10 -10' 414 | sim = [] 415 | aid = list[0] 416 | if int(aid) in aid_num_dict: 417 | for i in list[1:]: 418 | if int(i) in aid_num_dict: 419 | simm = model.similarity(str(aid_num_dict[int(i)]), str(aid_num_dict[int(aid)])) 420 | sim.append(simm) 421 | if len(sim) == 0: 422 | return '-10 -10' 423 | sim_mean = sum(sim) / len(sim) 424 | sim_max = max(sim) 425 | return str(sim_mean) + ' ' + str(sim_max) 426 | 427 | 428 | # prone,i2i相似度buys和clicks相似度的mean和max 429 | def prone_i2i_similarity(stage, candidate_type, start, end): 430 | print('开始读取数据!!!') 431 | valid = load_validate(f'/home/niejianfei/otto/{stage}/data/test_parquet/*') 432 | print(valid) 433 | print('开始筛选') 434 | valid1 = valid[valid['type'] != 0] 435 | print(valid1) 436 | print('开始排序') 437 | # 分别对session_id聚合,对时间进行排序 438 | df = valid1.sort_values(by=["session", "ts"], ascending=True) 439 | print('生成list') 440 | sentences_df = 
pd.DataFrame(df.groupby('session')['aid'].agg(list)) 441 | sentences_df.columns = ['carts_and_orders'] 442 | sentences_df["carts_and_orders_str"] = sentences_df.carts_and_orders.apply(lambda x: " ".join(map(str, x))) 443 | sentences_df = sentences_df.drop(columns='carts_and_orders') 444 | print(sentences_df) 445 | 446 | valid2 = valid[valid['type'] == 0] 447 | print(valid2) 448 | print('开始排序') 449 | # 分别对session_id聚合,对时间进行排序 450 | df1 = valid2.sort_values(by=["session", "ts"], ascending=True) 451 | print('生成list') 452 | sentences_df1 = pd.DataFrame(df1.groupby('session')['aid'].agg(list)) 453 | sentences_df1.columns = ['clicks'] 454 | sentences_df1["clicks_str"] = sentences_df1.clicks.apply(lambda x: " ".join(map(str, x))) 455 | sentences_df1 = sentences_df1.drop(columns='clicks') 456 | print(sentences_df1) 457 | 458 | print('开始读取词向量!!') 459 | proNE_last_month = gensim.models.KeyedVectors.load_word2vec_format( 460 | f"/home/niejianfei/otto/{stage}/preprocess/proNE_last_month_enhanced.emb", 461 | binary=False) 462 | 463 | print("开始读取aim_num映射文件!!!") 464 | f_read = open(f'/home/niejianfei/otto/{stage}/preprocess/aid_num_dict.pkl', 'rb') 465 | aid_num_dict = pickle.load(f_read) 466 | f_read.close() 467 | print('输出', aid_num_dict[0]) 468 | print("aim_num映射文件读取完毕!!!") 469 | 470 | for t in candidate_type: 471 | # 只导入训练数据 472 | print('开始导入数据') 473 | for i in range(start, end): 474 | path = f"/home/niejianfei/otto/{stage}/candidates/candidates_{t[0:-1]}_features_data/candidate_{t[0:-1]}_{i}.pqt" 475 | print(f'第{i + 1}块数据') 476 | chunk = pd.read_parquet(path) 477 | print(path) 478 | print(chunk.columns) 479 | 480 | chunk = chunk.astype("float32") 481 | chunk['session'] = chunk['session'].astype('int32') 482 | chunk['aid'] = chunk['aid'].astype('int32') 483 | print(chunk) 484 | 485 | chunk = chunk.merge(sentences_df, left_on='session', right_index=True, how='left').fillna(value=-1) 486 | print(chunk) 487 | chunk['sim_list'] = chunk['aid'].astype('str') + ' ' + chunk['carts_and_orders_str'].astype('str') 488 | print('开始计算相似度!!!') 489 | chunk['sim_score_str'] = chunk['sim_list'].apply( 490 | lambda x: calculate_prone_similarity(x, proNE_last_month, aid_num_dict)) 491 | print(chunk[['carts_and_orders_str', 'sim_list', 'sim_score_str']]) 492 | chunk['proNE_buys_sim_mean'] = chunk['sim_score_str'].apply(lambda x: float(x.split(' ')[0])) 493 | chunk['proNE_buys_sim_max'] = chunk['sim_score_str'].apply(lambda x: float(x.split(' ')[1])) 494 | print(chunk[chunk['proNE_buys_sim_mean'] != -10]) 495 | 496 | chunk = chunk.merge(sentences_df1, left_on='session', right_index=True, how='left').fillna(value=-1) 497 | print(chunk) 498 | chunk['clicks_sim_list'] = chunk['aid'].astype('str') + ' ' + chunk['clicks_str'].astype('str') 499 | print('click开始计算相似度!!!') 500 | chunk['clicks_sim_score_str'] = chunk['clicks_sim_list'].apply( 501 | lambda x: calculate_prone_similarity(x, proNE_last_month, aid_num_dict)) 502 | print(chunk[['clicks_str', 'clicks_sim_list', 'clicks_sim_score_str']]) 503 | chunk['proNE_clicks_sim_mean'] = chunk['clicks_sim_score_str'].apply(lambda x: float(x.split(' ')[0])) 504 | chunk['proNE_clicks_sim_max'] = chunk['clicks_sim_score_str'].apply(lambda x: float(x.split(' ')[1])) 505 | print(chunk[chunk['proNE_clicks_sim_mean'] != -10]) 506 | 507 | chunk = chunk.drop( 508 | columns=['carts_and_orders_str', 'sim_list', 'sim_score_str', 'clicks_str', 'clicks_sim_list', 509 | 'clicks_sim_score_str']) 510 | print(chunk[['proNE_buys_sim_mean', 'proNE_buys_sim_max', 'proNE_clicks_sim_mean', 
'proNE_clicks_sim_max']]) 511 | print(chunk) 512 | chunk.to_parquet(path) 513 | 514 | 515 | def calculate_prone_similarity_tail(string, model, aid_num_dict): 516 | list = string.split(' ') 517 | if int(list[-1]) < 0: 518 | return '-10 -10 -10' 519 | sim = [] 520 | aid = list[0] 521 | if int(aid) in aid_num_dict: 522 | for i in list[1:]: 523 | if int(i) in aid_num_dict: 524 | simm = model.similarity(str(aid_num_dict[int(i)]), str(aid_num_dict[int(aid)])) 525 | sim.append(simm) 526 | if len(sim) >= 3: 527 | return str(sim[-1]) + ' ' + str(sim[-2]) + ' ' + str(sim[-3]) 528 | elif len(sim) == 2: 529 | return str(sim[-1]) + ' ' + str(sim[-2]) + ' -10' 530 | elif len(sim) == 1: 531 | return str(sim[-1]) + ' -10 -10' 532 | else: 533 | return '-10 -10 -10' 534 | 535 | 536 | def prone_i2i_similarity_tail(stage, candidate_type, start, end): 537 | print('开始读取数据!!!') 538 | valid = load_validate(f'/home/niejianfei/otto/{stage}/data/test_parquet/*') 539 | print(valid) 540 | print('开始筛选') 541 | 542 | valid1 = valid[valid['type'] != 0] 543 | print(valid1) 544 | print('开始排序') 545 | # 分别对session_id聚合,对时间进行排序 546 | df = valid1.sort_values(by=["session", "ts"], ascending=True) 547 | print('生成list') 548 | sentences_df = pd.DataFrame(df.groupby('session')['aid'].agg(list)) 549 | sentences_df.columns = ['carts_and_orders'] 550 | sentences_df["carts_and_orders_str"] = sentences_df.carts_and_orders.apply(lambda x: " ".join(map(str, x))) 551 | sentences_df = sentences_df.drop(columns='carts_and_orders') 552 | print(sentences_df) 553 | 554 | valid2 = valid[valid['type'] == 0] 555 | print(valid2) 556 | print('开始排序') 557 | # 分别对session_id聚合,对时间进行排序 558 | df1 = valid2.sort_values(by=["session", "ts"], ascending=True) 559 | print('生成list') 560 | sentences_df1 = pd.DataFrame(df1.groupby('session')['aid'].agg(list)) 561 | sentences_df1.columns = ['clicks'] 562 | sentences_df1["clicks_str"] = sentences_df1.clicks.apply(lambda x: " ".join(map(str, x))) 563 | sentences_df1 = sentences_df1.drop(columns='clicks') 564 | print(sentences_df1) 565 | 566 | print('开始读取词向量!!') 567 | proNE_last_month = gensim.models.KeyedVectors.load_word2vec_format( 568 | f"/home/niejianfei/otto/{stage}/preprocess/proNE_last_month_enhanced.emb", 569 | binary=False) 570 | 571 | print("开始读取aim_num映射文件!!!") 572 | f_read = open(f'/home/niejianfei/otto/{stage}/preprocess/aid_num_dict.pkl', 'rb') 573 | aid_num_dict = pickle.load(f_read) 574 | f_read.close() 575 | print('输出', aid_num_dict[0]) 576 | print("aim_num映射文件读取完毕!!!") 577 | 578 | for t in candidate_type: 579 | # 只导入训练数据 580 | print('开始导入数据') 581 | for i in range(start, end): 582 | path = f"/home/niejianfei/otto/{stage}/candidates/candidates_{t[0:-1]}_features_data/candidate_{t[0:-1]}_{i}.pqt" 583 | print(f'第{i + 1}块数据') 584 | chunk = pd.read_parquet(path) 585 | print(path) 586 | print(chunk.columns) 587 | chunk = chunk.astype("float32") 588 | chunk['session'] = chunk['session'].astype('int32') 589 | chunk['aid'] = chunk['aid'].astype('int32') 590 | print(chunk) 591 | print(chunk.columns) 592 | 593 | print('merge') 594 | chunk = chunk.merge(sentences_df, left_on='session', right_index=True, how='left').fillna(value=-1) 595 | print(chunk) 596 | chunk['sim_list'] = chunk['aid'].astype('str') + ' ' + chunk['carts_and_orders_str'].astype('str') 597 | print('开始计算相似度!!!') 598 | chunk['sim_score_str'] = chunk['sim_list'].apply( 599 | lambda x: calculate_prone_similarity_tail(x, proNE_last_month, aid_num_dict)) 600 | print(chunk[['carts_and_orders_str', 'sim_list', 'sim_score_str']]) 601 | 
chunk['proNE_buys_sim_-1'] = chunk['sim_score_str'].apply(lambda x: float(x.split(' ')[0])) 602 | chunk['proNE_buys_sim_-2'] = chunk['sim_score_str'].apply(lambda x: float(x.split(' ')[1])) 603 | chunk['proNE_buys_sim_-3'] = chunk['sim_score_str'].apply(lambda x: float(x.split(' ')[2])) 604 | print(chunk[chunk['proNE_buys_sim_-1'] != -10]) 605 | 606 | chunk = chunk.merge(sentences_df1, left_on='session', right_index=True, how='left').fillna(value=-1) 607 | print(chunk) 608 | chunk['clicks_sim_list'] = chunk['aid'].astype('str') + ' ' + chunk['clicks_str'].astype('str') 609 | print('click开始计算相似度!!!') 610 | chunk['clicks_sim_score_str'] = chunk['clicks_sim_list'].apply( 611 | lambda x: calculate_prone_similarity_tail(x, proNE_last_month, aid_num_dict)) 612 | print(chunk[['clicks_str', 'clicks_sim_list', 'clicks_sim_score_str']]) 613 | chunk['proNE_clicks_sim_-1'] = chunk['clicks_sim_score_str'].apply(lambda x: float(x.split(' ')[0])) 614 | chunk['proNE_clicks_sim_-2'] = chunk['clicks_sim_score_str'].apply(lambda x: float(x.split(' ')[1])) 615 | chunk['proNE_clicks_sim_-3'] = chunk['clicks_sim_score_str'].apply(lambda x: float(x.split(' ')[2])) 616 | print(chunk[chunk['proNE_clicks_sim_-1'] != -10]) 617 | 618 | chunk = chunk.drop( 619 | columns=['carts_and_orders_str', 'sim_list', 'sim_score_str', 'clicks_str', 'clicks_sim_list', 620 | 'clicks_sim_score_str']) 621 | print(chunk[['proNE_buys_sim_-1', 'proNE_buys_sim_-2', 'proNE_clicks_sim_-1', 'proNE_clicks_sim_-2']]) 622 | print(chunk.columns) 623 | print(chunk) 624 | chunk.to_parquet(path) 625 | 626 | 627 | def calculate_MF_similarity(string, array): 628 | list = string.split(' ') 629 | if int(list[-1]) < 0: 630 | return '-10' + ' -10' * 3 631 | sim = [] 632 | aid = int(list[0]) 633 | for i in list[1:]: 634 | simm = cosine_similarity(array[aid].reshape(1, -1), array[int(i)].reshape(1, -1))[0][0] 635 | sim.append(simm) 636 | sim_sum = sum(sim) 637 | sim_mean = sim_sum / len(sim) 638 | sim_max = max(sim) 639 | 640 | return str(sim_mean) + ' ' + str(sim_max) + ' ' + str(sim_sum) + ' ' + str(sim[-1]) 641 | 642 | 643 | # bpr,als,lmf,u2i相似度 644 | def bpr_als_lmf_u2i_similarity(stage, candidate_type, start, end): 645 | print('bpr') 646 | bpr_user_emb = np.load(f'/home/niejianfei/otto/{stage}/preprocess/bpr_user_emb.npy') 647 | bpr_item_emb = np.load(f'/home/niejianfei/otto/{stage}/preprocess/bpr_item_emb.npy') 648 | print('als') 649 | als_user_emb = np.load(f'/home/niejianfei/otto/{stage}/preprocess/als_user_emb.npy') 650 | als_item_emb = np.load(f'/home/niejianfei/otto/{stage}/preprocess/als_item_emb.npy') 651 | print('lmf') 652 | lmf_user_emb = np.load(f'/home/niejianfei/otto/{stage}/preprocess/lmf_user_emb.npy') 653 | lmf_item_emb = np.load(f'/home/niejianfei/otto/{stage}/preprocess/lmf_item_emb.npy') 654 | 655 | for t in candidate_type: 656 | print('开始导入数据') 657 | for i in range(start, end): 658 | path = f"/home/niejianfei/otto/{stage}/candidates/candidates_{t[0:-1]}_features_data/candidate_{t[0:-1]}_{i}.pqt" 659 | print(f'第{i + 1}块数据') 660 | chunk = pd.read_parquet(path) 661 | print(path) 662 | print(chunk.columns) 663 | 664 | chunk = chunk.astype("float32") 665 | chunk['session'] = chunk['session'].astype('int32') 666 | chunk['aid'] = chunk['aid'].astype('int32') 667 | print(chunk) 668 | 669 | chunk['list'] = chunk['session'].astype('str') + ' ' + chunk['aid'].astype('str') 670 | print(chunk) 671 | chunk['bpr_user_item_sim'] = chunk['list'].map( 672 | lambda x: np.dot(bpr_user_emb[int(x.split(' ')[0])], bpr_item_emb[int(x.split(' ')[1])])) 
673 | print(chunk['bpr_user_item_sim'].describe()) 674 | 675 | chunk['als_user_item_sim'] = chunk['list'].map( 676 | lambda x: np.dot(als_user_emb[int(x.split(' ')[0])], als_item_emb[int(x.split(' ')[1])])) 677 | print(chunk['als_user_item_sim'].describe()) 678 | 679 | chunk['lmf_user_item_sim'] = chunk['list'].map( 680 | lambda x: np.dot(lmf_user_emb[int(x.split(' ')[0])], lmf_item_emb[int(x.split(' ')[1])])) 681 | print(chunk['lmf_user_item_sim'].describe()) 682 | 683 | print(chunk) 684 | chunk.to_parquet(path) 685 | 686 | 687 | def bpr_als_lmf_i2i_similarity(stage, candidate_type, start, end): 688 | print('开始读取数据!!!') 689 | valid = load_validate(f'/home/niejianfei/otto/{stage}/data/test_parquet/*') 690 | print(valid) 691 | print('开始筛选') 692 | 693 | valid1 = valid[valid['type'] != 0] 694 | print(valid1) 695 | print('开始排序') 696 | # 分别对session_id聚合,对时间进行排序 697 | df = valid1.sort_values(by=["session", "ts"], ascending=True) 698 | print(df.head(10)) 699 | print('生成list') 700 | sentences_df = pd.DataFrame(df.groupby('session')['aid'].agg(list)) 701 | sentences_df.columns = ['carts_and_orders'] 702 | print(sentences_df) 703 | sentences_df["carts_and_orders_str"] = sentences_df.carts_and_orders.apply(lambda x: " ".join(map(str, x))) 704 | sentences_df = sentences_df.drop(columns='carts_and_orders') 705 | print(sentences_df) 706 | 707 | valid2 = valid[valid['type'] == 0] 708 | print(valid2) 709 | print('开始排序') 710 | # 分别对session_id聚合,对时间进行排序 711 | df1 = valid2.sort_values(by=["session", "ts"], ascending=True) 712 | print(df1.head(10)) 713 | print('生成list') 714 | sentences_df1 = pd.DataFrame(df1.groupby('session')['aid'].agg(list)) 715 | sentences_df1.columns = ['clicks'] 716 | print(sentences_df1) 717 | sentences_df1["clicks_str"] = sentences_df1.clicks.apply(lambda x: " ".join(map(str, x))) 718 | sentences_df1 = sentences_df1.drop(columns='clicks') 719 | print(sentences_df1) 720 | 721 | print('bpr') 722 | bpr_item_emb = np.load(f'/home/niejianfei/otto/{stage}/preprocess/bpr_item_emb.npy') 723 | print('als') 724 | als_item_emb = np.load(f'/home/niejianfei/otto/{stage}/preprocess/als_item_emb.npy') 725 | print('lmf') 726 | lmf_item_emb = np.load(f'/home/niejianfei/otto/{stage}/preprocess/lmf_item_emb.npy') 727 | 728 | for t in candidate_type: 729 | # 只导入训练数据 730 | print('开始导入数据') 731 | for i in range(start, end): 732 | path = f"/home/niejianfei/otto/{stage}/candidates/candidates_{t[0:-1]}_features_data/candidate_{t[0:-1]}_{i}.pqt" 733 | print(f'第{i + 1}块数据') 734 | chunk = pd.read_parquet(path) 735 | print(path) 736 | print(chunk.columns) 737 | 738 | chunk = chunk.astype("float32") 739 | chunk['session'] = chunk['session'].astype('int32') 740 | chunk['aid'] = chunk['aid'].astype('int32') 741 | print(chunk) 742 | 743 | chunk = chunk.merge(sentences_df, left_on='session', right_index=True, how='left').fillna(value=-1) 744 | print(chunk) 745 | chunk['sim_list'] = chunk['aid'].astype('str') + ' ' + chunk['carts_and_orders_str'].astype('str') 746 | print('开始计算相似度!!!') 747 | chunk['sim_score_str'] = chunk['sim_list'].apply( 748 | lambda x: calculate_MF_similarity(x, bpr_item_emb)) 749 | print(chunk[['carts_and_orders_str', 'sim_list', 'sim_score_str']]) 750 | chunk['bpr_buys_sim_mean'] = chunk['sim_score_str'].apply(lambda x: float(x.split(' ')[0])).astype('float32') 751 | chunk['bpr_buys_sim_max'] = chunk['sim_score_str'].apply(lambda x: float(x.split(' ')[1])).astype('float32') 752 | chunk['bpr_buys_sim_sum'] = chunk['sim_score_str'].apply(lambda x: float(x.split(' ')[2])).astype('float32') 753 | 
chunk['bpr_buys_sim_-1'] = chunk['sim_score_str'].apply(lambda x: float(x.split(' ')[3])).astype('float32') 754 | chunk['bpr_buys_sim_-2'] = chunk['sim_score_str'].apply(lambda x: float(x.split(' ')[4])).astype('float32') 755 | chunk['bpr_buys_sim_-3'] = chunk['sim_score_str'].apply(lambda x: float(x.split(' ')[5])).astype('float32') 756 | print(chunk[chunk['bpr_buys_sim_-3'] != -10]) 757 | print(chunk) 758 | 759 | print('开始计算相似度!!!') 760 | chunk['sim_score_str'] = chunk['sim_list'].apply( 761 | lambda x: calculate_MF_similarity(x, als_item_emb)) 762 | print(chunk[['carts_and_orders_str', 'sim_list', 'sim_score_str']]) 763 | chunk['als_buys_sim_mean'] = chunk['sim_score_str'].apply(lambda x: float(x.split(' ')[0])).astype('float32') 764 | chunk['als_buys_sim_max'] = chunk['sim_score_str'].apply(lambda x: float(x.split(' ')[1])).astype('float32') 765 | chunk['als_buys_sim_sum'] = chunk['sim_score_str'].apply(lambda x: float(x.split(' ')[2])).astype('float32') 766 | chunk['als_buys_sim_-1'] = chunk['sim_score_str'].apply(lambda x: float(x.split(' ')[3])).astype('float32') 767 | chunk['als_buys_sim_-2'] = chunk['sim_score_str'].apply(lambda x: float(x.split(' ')[4])).astype('float32') 768 | chunk['als_buys_sim_-3'] = chunk['sim_score_str'].apply(lambda x: float(x.split(' ')[5])).astype('float32') 769 | 770 | print(chunk[chunk['als_buys_sim_-3'] != -10]) 771 | print(chunk) 772 | 773 | print('开始计算相似度!!!') 774 | chunk['sim_score_str'] = chunk['sim_list'].apply( 775 | lambda x: calculate_MF_similarity(x, lmf_item_emb)) 776 | print(chunk[['carts_and_orders_str', 'sim_list', 'sim_score_str']]) 777 | chunk['lmf_buys_sim_mean'] = chunk['sim_score_str'].apply(lambda x: float(x.split(' ')[0])).astype('float32') 778 | chunk['lmf_buys_sim_max'] = chunk['sim_score_str'].apply(lambda x: float(x.split(' ')[1])).astype('float32') 779 | chunk['lmf_buys_sim_sum'] = chunk['sim_score_str'].apply(lambda x: float(x.split(' ')[2])).astype('float32') 780 | chunk['lmf_buys_sim_-1'] = chunk['sim_score_str'].apply(lambda x: float(x.split(' ')[3])).astype('float32') 781 | chunk['lmf_buys_sim_-2'] = chunk['sim_score_str'].apply(lambda x: float(x.split(' ')[4])).astype('float32') 782 | chunk['lmf_buys_sim_-3'] = chunk['sim_score_str'].apply(lambda x: float(x.split(' ')[5])).astype('float32') 783 | 784 | print(chunk[chunk['lmf_buys_sim_-3'] != -10]) 785 | print(chunk) 786 | 787 | chunk = chunk.merge(sentences_df1, left_on='session', right_index=True, how='left').fillna(value=-1) 788 | print(chunk) 789 | chunk['clicks_sim_list'] = chunk['aid'].astype('str') + ' ' + chunk['clicks_str'].astype('str') 790 | print('click开始计算相似度!!!') 791 | chunk['clicks_sim_score_str'] = chunk['clicks_sim_list'].apply( 792 | lambda x: calculate_MF_similarity(x, bpr_item_emb)) 793 | print(chunk[['clicks_str', 'clicks_sim_list', 'clicks_sim_score_str']]) 794 | chunk['bpr_clicks_sim_mean'] = chunk['clicks_sim_score_str'].apply(lambda x: float(x.split(' ')[0])).astype('float32') 795 | chunk['bpr_clicks_sim_max'] = chunk['clicks_sim_score_str'].apply(lambda x: float(x.split(' ')[1])).astype('float32') 796 | chunk['bpr_clicks_sim_sum'] = chunk['clicks_sim_score_str'].apply(lambda x: float(x.split(' ')[2])).astype('float32') 797 | chunk['bpr_clicks_sim_-1'] = chunk['clicks_sim_score_str'].apply(lambda x: float(x.split(' ')[3])).astype('float32') 798 | chunk['bpr_clicks_sim_-2'] = chunk['clicks_sim_score_str'].apply(lambda x: float(x.split(' ')[4])).astype('float32') 799 | chunk['bpr_clicks_sim_-3'] = chunk['clicks_sim_score_str'].apply(lambda x: 
float(x.split(' ')[5])).astype('float32') 800 | print(chunk[chunk['bpr_clicks_sim_-3'] != -10]) 801 | print(chunk) 802 | 803 | print('click开始计算相似度!!!') 804 | chunk['clicks_sim_score_str'] = chunk['clicks_sim_list'].apply( 805 | lambda x: calculate_MF_similarity(x, als_item_emb)) 806 | print(chunk[['clicks_str', 'clicks_sim_list', 'clicks_sim_score_str']]) 807 | chunk['als_clicks_sim_mean'] = chunk['clicks_sim_score_str'].apply(lambda x: float(x.split(' ')[0])).astype( 808 | 'float32') 809 | chunk['als_clicks_sim_max'] = chunk['clicks_sim_score_str'].apply(lambda x: float(x.split(' ')[1])).astype( 810 | 'float32') 811 | chunk['als_clicks_sim_sum'] = chunk['clicks_sim_score_str'].apply(lambda x: float(x.split(' ')[2])).astype( 812 | 'float32') 813 | chunk['als_clicks_sim_-1'] = chunk['clicks_sim_score_str'].apply(lambda x: float(x.split(' ')[3])).astype( 814 | 'float32') 815 | print(chunk[chunk['als_clicks_sim_-1'] != -10]) 816 | print(chunk) 817 | 818 | print('click开始计算相似度!!!') 819 | chunk['clicks_sim_score_str'] = chunk['clicks_sim_list'].apply( 820 | lambda x: calculate_MF_similarity(x, lmf_item_emb)) 821 | print(chunk[['clicks_str', 'clicks_sim_list', 'clicks_sim_score_str']]) 822 | chunk['lmf_clicks_sim_mean'] = chunk['clicks_sim_score_str'].apply(lambda x: float(x.split(' ')[0])).astype('float32') 823 | chunk['lmf_clicks_sim_max'] = chunk['clicks_sim_score_str'].apply(lambda x: float(x.split(' ')[1])).astype('float32') 824 | chunk['lmf_clicks_sim_sum'] = chunk['clicks_sim_score_str'].apply(lambda x: float(x.split(' ')[2])).astype('float32') 825 | chunk['lmf_clicks_sim_-1'] = chunk['clicks_sim_score_str'].apply(lambda x: float(x.split(' ')[3])).astype('float32') 826 | chunk['lmf_clicks_sim_-2'] = chunk['clicks_sim_score_str'].apply(lambda x: float(x.split(' ')[4])).astype('float32') 827 | chunk['lmf_clicks_sim_-3'] = chunk['clicks_sim_score_str'].apply(lambda x: float(x.split(' ')[5])).astype('float32') 828 | print(chunk[chunk['lmf_clicks_sim_-3'] != -10]) 829 | print(chunk) 830 | 831 | chunk = chunk.drop( 832 | columns=['carts_and_orders_str', 'sim_list', 'sim_score_str', 'clicks_str', 'clicks_sim_list', 833 | 'clicks_sim_score_str']) 834 | print(chunk['als_clicks_sim_max']) 835 | print(chunk.columns) 836 | print(chunk) 837 | chunk.to_parquet(path) 838 | 839 | 840 | def similarity_features(stage, candidate_type, start, end): 841 | # buys&clicks * 4 842 | deepwalk_i2i_similarity1(stage, candidate_type, start, end) 843 | # orders&carts * 4 844 | deepwalk_i2i_similarity2(stage, candidate_type, start, end) 845 | # buys&clicks * 6 846 | deepwalk_i2i_similarity_tail(stage, candidate_type, start, end) 847 | # buys&clicks * 3 848 | deepwalk_u2i_similarity(stage, candidate_type, start, end) 849 | 850 | # buys&clicks * 4 851 | prone_i2i_similarity(stage, candidate_type, start, end) 852 | # buys&clicks * 6 853 | prone_i2i_similarity_tail(stage, candidate_type, start, end) 854 | 855 | 856 | if __name__ == '__main__': 857 | IS_TRAIN = True 858 | if IS_TRAIN: 859 | stage = 'CV' 860 | else: 861 | stage = 'LB' 862 | candidate_type = ['clicks', 'carts', 'orders'] 863 | similarity_features(stage, candidate_type, 0, 8) 864 | -------------------------------------------------------------------------------- /features/user_features.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import math 3 | import pandas as pd 4 | import numpy as np 5 | 6 | 7 | def load_data(path): 8 | type_transform = {"clicks": 0, "carts": 1, "orders": 2} 9 | dfs = [] 10 | # 
只导入训练数据 11 | for e, chunk_file in enumerate(glob.glob(path)): 12 | chunk = pd.read_parquet(chunk_file) 13 | chunk.ts = (chunk.ts / 1000).astype('int32') 14 | chunk['type'] = chunk['type'].map(type_transform).astype('int8') 15 | if not IS_TRAIN: 16 | # 除去第一周的数据 17 | chunk = chunk[chunk['ts'] >= 1659909599] 18 | dfs.append(chunk) 19 | return pd.concat(dfs).reset_index(drop=True) 20 | 21 | 22 | def user_features(input_path, output_path): 23 | print('开始读取数据!!!') 24 | valid = load_data(input_path) 25 | print("开始构造user_feature!!!") 26 | # 类别型变量分析:计数,分布 27 | # 连续性变量分析:最小值,最大值,离差,平均数,中位数,众数,标准差,变异系数,偏度,峰度 28 | print("开始聚合user:agg中!!!") 29 | user_features = valid.groupby('session').agg({'session': 'count', 'aid': 'nunique', 'type': ['mean', 'skew'], 30 | 'ts': ['min', 'max', 'skew']}) 31 | user_features.columns = ['user_user_count', 'user_item_count', 'user_buy_ratio', 'user_buy_skew', 32 | 'user_min_ts', 'user_max_ts', 'user_skew_ts'] 33 | print("开始计算ts偏态峰态系数!!!") 34 | # 计算时间偏态系数,计算时间峰态系数,Pandas Series.kurt()函数使用Fisher对峰度的定义(正常的峰度==0.0) 35 | user_features['user_skew_ts'] = user_features['user_skew_ts'].fillna(value=0) 36 | user_features['user_kurt_ts'] = valid.groupby('session')['ts'].apply(lambda x: pd.DataFrame.kurt(x)).fillna(value=0) 37 | 38 | print("开始计算type偏态峰态系数!!!") 39 | # 计算类型偏态系数,计算类型峰态系数,Pandas Series.kurt()函数使用Fisher对峰度的定义(正常的峰度==0.0) 40 | user_features['user_buy_skew'] = user_features['user_buy_skew'].fillna(value=0) 41 | user_features['user_buy_kurt'] = valid.groupby('session')['type'].apply(lambda x: pd.DataFrame.kurt(x)).fillna( 42 | value=0) 43 | 44 | print("开始计算ts天数!!!") 45 | # 序列持续的时间(天) 46 | user_features['user_long_ts'] = user_features['user_max_ts'] - user_features['user_min_ts'] 47 | 48 | print("开始计算user三个比例特征!!!") 49 | # 平均每天观看几个商品 50 | user_features["user_avg_visit_per_day"] = user_features['user_user_count'] / ( 51 | user_features['user_long_ts'] / (60 * 60 * 24)).clip(1, 60).apply( 52 | lambda x: math.ceil(x)) 53 | # user重复观看的商品次数 54 | user_features["user_repeat_visit_num"] = user_features['user_user_count'] - user_features['user_item_count'] 55 | # 平均每个商品观看的次数 56 | user_features["user_ave_visit_num"] = user_features['user_user_count'] / user_features['user_item_count'] 57 | # session里面aids的re_watch比例 58 | user_features["user_re_visit_rate"] = user_features['user_repeat_visit_num'] / user_features['user_user_count'] 59 | print(user_features.head()) 60 | print(user_features.columns) 61 | print(user_features.shape) 62 | # 规定保存格式 63 | user_features = user_features.astype('float32') 64 | print("开始保存特征到文件!!!") 65 | user_features.to_parquet(output_path) 66 | 67 | 68 | def add_user_features(input_path, output_path): 69 | # user feature 7 70 | # 平均购买/加购/点击间隔 max - min / num 71 | # 点击购买率 72 | # 点击加购率 73 | # 加购购买率 74 | # 点击占比 75 | # 加购占比 76 | # 购买占比 user特征比较稀疏,加上可能效果不好 77 | # 复购率 78 | # 复加购率 79 | # 复点击率 item_item - item_user 80 | print('开始读取数据!!!') 81 | train = load_data(input_path) 82 | print("开始构造user_feature!!!") 83 | # 类别型变量分析:计数,分布 84 | # 连续性变量分析:最小值,最大值,离差,平均数,中位数,众数,标准差,变异系数,偏度,峰度 85 | print("开始聚合user:agg中!!!") 86 | train_click = train[train['type'] == 0] 87 | train_cart = train[train['type'] == 1] 88 | train_order = train[train['type'] == 2] 89 | 90 | print("开始构造item_feature!!!") 91 | click_user_features = train_click.groupby('session').agg({'aid': ['count', 'nunique'], 'ts': ['min', 'max']}) 92 | # aid出现的次数,也就是aid发生的events数量:定义热门商品;操作aid的用户数量:简介定义热门商品;类型均值:这个商品易购程度 93 | click_user_features.columns = ['click_user_user_count', 'click_user_item_count', 'ts_min', 'ts_max'] 94 
| click_user_features['click_time'] = click_user_features['ts_max'] - click_user_features['ts_min'] 95 | click_user_features['avg_click_span'] = click_user_features['click_time'] / click_user_features['click_user_user_count'] 96 | click_user_features = click_user_features.drop(columns=['ts_min', 'ts_max', 'click_time']) 97 | print(click_user_features) 98 | 99 | cart_user_features = train_cart.groupby('session').agg({'aid': ['count', 'nunique'], 'ts': ['min', 'max']}) 100 | # aid出现的次数,也就是aid发生的events数量:定义热门商品;操作aid的用户数量:简介定义热门商品;类型均值:这个商品易购程度 101 | cart_user_features.columns = ['cart_user_user_count', 'cart_user_item_count', 'ts_min', 'ts_max'] 102 | cart_user_features['cart_time'] = cart_user_features['ts_max'] - cart_user_features['ts_min'] 103 | cart_user_features['avg_cart_span'] = cart_user_features['cart_time'] / cart_user_features['cart_user_user_count'] 104 | cart_user_features = cart_user_features.drop(columns=['ts_min', 'ts_max', 'cart_time']) 105 | print(cart_user_features) 106 | 107 | order_user_features = train_order.groupby('session').agg({'aid': ['count', 'nunique'], 'ts': ['min', 'max']}) 108 | # aid出现的次数,也就是aid发生的events数量:定义热门商品;操作aid的用户数量:简介定义热门商品;类型均值:这个商品易购程度 109 | order_user_features.columns = ['order_user_user_count', 'order_user_item_count', 'ts_min', 'ts_max'] 110 | order_user_features['order_time'] = order_user_features['ts_max'] - order_user_features['ts_min'] 111 | order_user_features['avg_order_span'] = order_user_features['order_time'] / order_user_features['order_user_user_count'] 112 | order_user_features = order_user_features.drop(columns=['ts_min', 'ts_max', 'order_time']) 113 | print(order_user_features) 114 | 115 | click_user_features = click_user_features.merge(cart_user_features, left_index=True, right_index=True, 116 | how='left').fillna(value=0) 117 | click_user_features = click_user_features.merge(order_user_features, left_index=True, right_index=True, 118 | how='left').fillna(value=0) 119 | 120 | # click_item_item_count, click_item_user_count 121 | # 点击购买率 * 3 122 | click_user_features['user_click_cart_rate'] = click_user_features['cart_user_user_count'] / click_user_features[ 123 | 'click_user_user_count'] 124 | click_user_features['user_click_order_rate'] = click_user_features['order_user_user_count'] / click_user_features[ 125 | 'click_user_user_count'] 126 | click_user_features['user_cart_order_rate'] = click_user_features['order_user_user_count'] / click_user_features['cart_user_user_count'] 127 | 128 | # 点击占比 129 | click_user_features['user_click_percentage'] = click_user_features['click_user_user_count'] / click_user_features[ 130 | 'click_user_user_count'].sum() 131 | click_user_features['user_cart_percentage'] = click_user_features['cart_user_user_count'] / click_user_features[ 132 | 'cart_user_user_count'].sum() 133 | click_user_features['user_order_percentage'] = click_user_features['order_user_user_count'] / click_user_features[ 134 | 'order_user_user_count'].sum() 135 | # 复购率 136 | click_user_features['user_re_click_rate'] = (click_user_features['click_user_user_count'] - click_user_features[ 137 | 'click_user_item_count']) / click_user_features['click_user_user_count'] 138 | click_user_features['user_re_cart_rate'] = (click_user_features['cart_user_user_count'] - click_user_features[ 139 | 'cart_user_item_count']) / click_user_features['cart_user_user_count'] 140 | click_user_features['user_re_order_rate'] = (click_user_features['order_user_user_count'] - click_user_features[ 141 | 'order_user_item_count']) / 
click_user_features['order_user_user_count'] 142 | 143 | click_user_features = click_user_features.replace(np.inf, 100) 144 | click_user_features = click_user_features.fillna(value=-10) 145 | print(click_user_features) 146 | 147 | print("开始保存特征到文件!!!") 148 | click_user_features.to_parquet(output_path) 149 | 150 | 151 | def trans_time_span_features(input_path, output_path1, output_path2, output_path3): 152 | train = load_data(input_path) 153 | 154 | train_clicks = train[train['type'] == 0].drop(columns='type') 155 | train_clicks = train_clicks.rename(columns={'ts': 'ts_click'}) 156 | train_carts = train[train['type'] == 1].drop(columns='type') 157 | train_carts = train_carts.rename(columns={'ts': 'ts_cart'}) 158 | train_orders = train[train['type'] == 2].drop(columns='type') 159 | train_orders = train_orders.rename(columns={'ts': 'ts_order'}) 160 | 161 | print('click_cart_span') 162 | click_cart_span = train_clicks.merge(train_carts, on=['session', 'aid'], how='inner') 163 | print(click_cart_span) 164 | click_cart_span['min'] = click_cart_span['ts_click'] - click_cart_span['ts_cart'] 165 | click_cart_span = click_cart_span[click_cart_span['min'] <= 0].drop(columns='min') 166 | print(click_cart_span) 167 | click_cart_span_feature = click_cart_span.groupby(['session', 'aid']).agg({'ts_click': 'min', 'ts_cart': 'min'}) 168 | click_cart_span_feature.columns = ['ts_click_min', 'ts_cart_min'] 169 | print(click_cart_span_feature) 170 | click_cart_span_feature['click_cart_span'] = click_cart_span_feature['ts_cart_min'] - click_cart_span_feature['ts_click_min'] 171 | print(click_cart_span_feature) 172 | click_cart_span_feature['aids'] = click_cart_span_feature.index.get_level_values('aid') 173 | print(click_cart_span_feature) 174 | print(click_cart_span_feature.index.get_level_values('aid')[:10]) 175 | click_cart_span_feature = click_cart_span_feature.groupby('aids').agg({'aids': 'count', 'click_cart_span': 'mean'}) 176 | click_cart_span_feature.columns = ['trans_click_cart_count', 'trans_click_cart_span_avg'] 177 | print(click_cart_span_feature.describe()) 178 | print(click_cart_span_feature) 179 | click_cart_span_feature.to_parquet(output_path1) 180 | 181 | print('click_order_span') 182 | click_order_span = train_clicks.merge(train_orders, on=['session', 'aid'], how='inner') 183 | print(click_order_span) 184 | click_order_span['min'] = click_order_span['ts_click'] - click_order_span['ts_order'] 185 | click_order_span = click_order_span[click_order_span['min'] <= 0].drop(columns='min') 186 | print(click_order_span) 187 | click_order_span_feature = click_order_span.groupby(['session', 'aid']).agg({'ts_click': 'min', 'ts_order': 'min'}) 188 | click_order_span_feature.columns = ['ts_click_min', 'ts_order_min'] 189 | print(click_order_span_feature) 190 | click_order_span_feature['click_order_span'] = click_order_span_feature['ts_order_min'] - click_order_span_feature['ts_click_min'] 191 | print(click_order_span_feature) 192 | click_order_span_feature['aids'] = click_order_span_feature.index.get_level_values('aid') 193 | print(click_order_span_feature) 194 | print(click_order_span_feature.index.get_level_values('aid')[:10]) 195 | click_order_span_feature = click_order_span_feature.groupby('aids').agg({'aids': 'count', 'click_order_span': 'mean'}) 196 | click_order_span_feature.columns = ['trans_click_order_count', 'trans_click_order_span_avg'] 197 | print(click_order_span_feature.describe()) 198 | print(click_order_span_feature) 199 | click_order_span_feature.to_parquet(output_path2) 200 | 201 | 202 | 
print('cart_order_span') 203 | carts_order_span = train_carts.merge(train_orders, on=['session', 'aid'], how='inner') 204 | print(carts_order_span) 205 | carts_order_span['min'] = carts_order_span['ts_cart'] - carts_order_span['ts_order'] 206 | carts_order_span = carts_order_span[carts_order_span['min'] <= 0].drop(columns='min') 207 | print(carts_order_span) 208 | cart_order_span_feature = carts_order_span.groupby(['session', 'aid']).agg({'ts_cart': 'min', 'ts_order': 'min'}) 209 | cart_order_span_feature.columns = ['ts_cart_min', 'ts_order_min'] 210 | print(cart_order_span_feature) 211 | cart_order_span_feature['cart_order_span'] = cart_order_span_feature['ts_order_min'] - cart_order_span_feature['ts_cart_min'] 212 | print(cart_order_span_feature) 213 | cart_order_span_feature['aids'] = cart_order_span_feature.index.get_level_values('aid') 214 | print(cart_order_span_feature) 215 | print(cart_order_span_feature.index.get_level_values('aid')[:10]) 216 | cart_order_span_feature = cart_order_span_feature.groupby('aids').agg({'aids': 'count', 'cart_order_span': 'mean'}) 217 | cart_order_span_feature.columns = ['trans_cart_order_count', 'trans_cart_order_span_avg'] 218 | print(cart_order_span_feature.describe()) 219 | print(cart_order_span_feature) 220 | cart_order_span_feature.to_parquet(output_path3) 221 | 222 | 223 | if __name__ == '__main__': 224 | IS_TRAIN = True 225 | if IS_TRAIN: 226 | stage = 'CV' 227 | else: 228 | stage = 'LB' 229 | input_path = f'/home/niejianfei/otto/{stage}/data/test_parquet/*' 230 | output_path = f'/home/niejianfei/otto/{stage}/preprocess/user_features.pqt' 231 | output_path1 = f'/home/niejianfei/otto/{stage}/preprocess/add_user_features.pqt' 232 | user_features(input_path, output_path) 233 | add_user_features(input_path, output_path1) 234 | 235 | input_path1 = f'/home/niejianfei/otto/{stage}/data/train_parquet/*' 236 | output_path2 = f'/home/niejianfei/otto/{stage}/preprocess/click_cart_span_features.pqt' 237 | output_path3 = f'/home/niejianfei/otto/{stage}/preprocess/click_order_span_features.pqt' 238 | output_path4 = f'/home/niejianfei/otto/{stage}/preprocess/cart_order_span_features.pqt' 239 | trans_time_span_features(input_path1, output_path2, output_path3, output_path4) -------------------------------------------------------------------------------- /features/user_item_features.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import pandas as pd 3 | type_transform = {"clicks": 0, "carts": 1, "orders": 2} 4 | IS_TRAIN = True 5 | 6 | 7 | def load_data(path): 8 | dfs = [] 9 | # 只导入训练数据 10 | for e, chunk_file in enumerate(glob.glob(path)): 11 | chunk = pd.read_parquet(chunk_file) 12 | chunk.ts = (chunk.ts / 1000).astype('int32') 13 | chunk['type'] = chunk['type'].map(type_transform).astype('int8') 14 | dfs.append(chunk) 15 | return pd.concat(dfs).reset_index(drop=True) 16 | 17 | 18 | def user_item_features(stage, candidate_type): 19 | valid = load_data(f'/home/niejianfei/otto/{stage}/data/test_parquet/*') 20 | for t in candidate_type: 21 | print('读取candidates!!!') 22 | candidates = pd.read_parquet(f'/home/niejianfei/otto/{stage}/candidates/candidates_{t}.pqt').reset_index( 23 | drop=True) 24 | candidates = candidates.sort_values('session', ascending=True) 25 | print(candidates) 26 | 27 | print("开始构造user_item interaction features!!!") 28 | # 构造user_item interaction features 29 | print("click!!!") 30 | # item是否被点击 31 | item_clicked = valid[valid["type"] == 0].drop(columns="ts").drop_duplicates(["session", 
"aid"]) 32 | item_clicked["type"] = 1 33 | item_clicked.columns = ["session", "aid", "item_clicked"] 34 | # item_clicked 特征 35 | item_clicked_features = valid[valid["type"] == 0].groupby(['session', 'aid']).agg( 36 | {'aid': 'count'}) 37 | item_clicked_features.columns = ['item_clicked_num'] 38 | item_clicked_features = item_clicked_features.astype('float32') 39 | 40 | print("cart!!!") 41 | # item是否被加购 42 | item_carted = valid[valid["type"] == 1].drop(columns="ts").drop_duplicates(["session", "aid"]) 43 | item_carted["type"] = 1 44 | item_carted.columns = ["session", "aid", "item_carted"] 45 | # item_carted 特征 46 | item_carted_features = valid[valid["type"] == 1].groupby(['session', 'aid']).agg( 47 | {'aid': 'count'}) 48 | item_carted_features.columns = ['item_carted_num'] 49 | item_carted_features = item_carted_features.astype('float32') 50 | print("order!!!") 51 | 52 | # item是否被购买 53 | item_ordered = valid[valid["type"] == 2].drop(columns="ts").drop_duplicates(["session", "aid"]) 54 | item_ordered["type"] = 1 55 | item_ordered.columns = ["session", "aid", "item_ordered"] 56 | # item_ordered 特征 57 | item_ordered_features = valid[valid["type"] == 2].groupby(['session', 'aid']).agg( 58 | {'aid': 'count'}) 59 | item_ordered_features.columns = ['item_ordered_num'] 60 | item_ordered_features = item_ordered_features.astype('float32') 61 | 62 | print("开始聚合数据!!!") 63 | item_features = pd.read_parquet(f'/home/niejianfei/otto/{stage}/preprocess/item_features.pqt') 64 | 65 | chunk = 8 66 | size = candidates.shape[0] + 200 67 | print(f"candidates有{candidates.shape[0]}条数据!!!") 68 | # 距离session结束的时间sec, 需要ts merge到candidate上然后减去min_ts 69 | # 去重,保留最后一个ts,merge 相减 加两列特征距离session结束的时间sec,和最后一次和aid交互的类型 70 | valid = valid.drop_duplicates(['session', 'aid'], keep='last').drop(columns='type') 71 | # valid['user_item_within'] = 1 72 | print(valid) 73 | 74 | user_features = pd.read_parquet(f'/home/niejianfei/otto/{stage}/preprocess/user_features.pqt') 75 | valid = valid.merge(user_features, left_on='session', right_index=True, how='left').fillna(-1000) 76 | 77 | valid['sec_to_session_start'] = valid['ts'] - valid['user_min_ts'] 78 | valid['sec_to_session_end'] = valid['user_max_ts'] - valid['ts'] 79 | valid = valid.drop(columns=['user_min_ts', 'user_max_ts', 'ts']) 80 | 81 | val_session = valid[['sec_to_session_start', 'sec_to_session_end', 'user_long_ts']] 82 | print(val_session) 83 | print((val_session['sec_to_session_start'] + val_session['sec_to_session_end'] - val_session['user_long_ts']).max()) 84 | 85 | k = size // chunk 86 | t = 0 87 | for i in range(chunk): 88 | print(f"第{i + 1}块!!!") 89 | print("1!!!") 90 | temp_candidates = candidates.iloc[k * i:k * (i + 1), :] 91 | print(temp_candidates) 92 | # merge user_item interaction features 93 | temp_candidates = temp_candidates.merge(item_clicked, how="left", on=["session", "aid"]).fillna(value=-1) 94 | temp_candidates = temp_candidates.merge(item_clicked_features, how="left", on=["session", "aid"]).fillna( 95 | value=-1) 96 | print(temp_candidates) 97 | print("2!!!") 98 | temp_candidates = temp_candidates.merge(item_carted, how="left", on=["session", "aid"]).fillna(value=-1) 99 | temp_candidates = temp_candidates.merge(item_carted_features, how="left", on=["session", "aid"]).fillna( 100 | value=-1) 101 | print("3!!!") 102 | temp_candidates = temp_candidates.merge(item_ordered, how="left", on=["session", "aid"]).fillna(value=-1) 103 | temp_candidates = temp_candidates.merge(item_ordered_features, how="left", on=["session", "aid"]).fillna( 104 | value=-1) 
105 | print(temp_candidates) 106 | print("Loading item_features for aggregation!!!") 107 | # Step 5: add features to our candidate dataframe 108 | temp_candidates = temp_candidates.merge(item_features, left_on='aid', right_index=True, how='left').fillna( 109 | -1000) 110 | 111 | # add the user-item interaction features 112 | temp_candidates = temp_candidates.merge(valid, on=["session", "aid"], how='left').fillna(-1) 113 | print(temp_candidates) 114 | 115 | temp_candidates.to_parquet( 116 | f"/home/niejianfei/otto/{stage}/candidates/candidates_{t[0:-1]}_features_data/candidate_{t[0:-1]}_{i}.pqt") 117 | print(temp_candidates) 118 | t += len(temp_candidates) 119 | print(f'Rows in chunk {i+1}:', len(temp_candidates)) 120 | print('Total rows so far:', t) 121 | 122 | 123 | if __name__ == '__main__': 124 | IS_TRAIN = True 125 | candidate_type = ['clicks', 'carts', 'orders'] 126 | if IS_TRAIN: 127 | stage = 'CV' 128 | else: 129 | stage = 'LB' 130 | 131 | user_item_features(stage, candidate_type) -------------------------------------------------------------------------------- /merge_features.py: -------------------------------------------------------------------------------- 1 | from features import recall_features 2 | from features import user_item_features 3 | from features import similarity_features 4 | from features import co_visitation_features 5 | import pandas as pd 6 | 7 | 8 | def add_labels(candidate_type): 9 | targets = pd.read_parquet('/home/niejianfei/otto/CV/preprocess/test_labels.parquet') 10 | for t in candidate_type: 11 | print("Adding labels to the data!!!") 12 | # build the label frame 13 | temp_target = targets[targets['type'] == t].drop(columns="type") 14 | temp_target = temp_target.explode("ground_truth").astype("int32") 15 | temp_target.columns = ['session', 'aid'] 16 | temp_target[t[0:-1]] = 1 17 | 18 | # only load the CV data 19 | print('Loading data') 20 | for i in range(0, 8): 21 | path = f"/home/niejianfei/otto/CV/candidates/candidates_{t[0:-1]}_features_data/candidate_{t[0:-1]}_{i}.pqt" 22 | print(f'Chunk {i + 1}') 23 | chunk = pd.read_parquet(path) 24 | print(path) 25 | print(chunk.columns) 26 | # attach labels; candidates without a ground-truth match are the negative class and get 0 27 | chunk = chunk.merge(temp_target, on=['session', 'aid'], how='left').fillna(value=0) 28 | print(chunk) 29 | chunk.to_parquet(path) 30 | 31 | 32 | if __name__ == '__main__': 33 | IS_TRAIN = True 34 | candidate_type = ['clicks', 'carts', 'orders'] 35 | if IS_TRAIN: 36 | stage = 'CV' 37 | else: 38 | stage = 'LB' 39 | 40 | recall_features(stage, candidate_type) 41 | user_item_features(stage, candidate_type) 42 | similarity_features(stage, candidate_type, 0, 8) 43 | co_visitation_features(stage, candidate_type, 0, 8) 44 | # add labels for the CV stage 45 | if IS_TRAIN: 46 | add_labels(candidate_type) -------------------------------------------------------------------------------- /predict.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from ranker import generate_submission 3 | from ranker import user_sample 4 | 5 | 6 | def submission(candidate_type): 7 | sub = pd.DataFrame() 8 | for t in candidate_type: 9 | df = pd.read_parquet(f'/home/niejianfei/otto/LB/submission/sub_{t}.pqt') 10 | df = df.loc[df.session_type.str.contains(t)] 11 | sub = sub.append(df) 12 | return sub 13 | 14 | 15 | if __name__ == '__main__': 16 | candidate_type = ['clicks', 'carts', 'orders'] 17 | generate_submission('test', 'LB', candidate_type, user_sample(0.5), 'final_all_data') 18 | 19 | submission_final = submission(candidate_type) 20 | submission_final.to_csv(f'/home/niejianfei/otto/LB/submission/submission_final.csv', index=False) 21 |
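The script above writes the final CSV in the Kaggle OTTO submission layout: one row per session and event type (a `session_type` such as `1234567_clicks`) and a `labels` column holding up to 20 space-separated aids, which is what `generate_submission` enforces with its `n < 20` filter. Below is a minimal sanity-check sketch for that layout, assuming the CSV path used in predict.py; the check is illustrative and not part of the repository.

import pandas as pd

sub = pd.read_csv('/home/niejianfei/otto/LB/submission/submission_final.csv')
# exactly the two columns the submission format expects
assert list(sub.columns) == ['session_type', 'labels']
# at most 20 recommended aids per row
assert sub['labels'].astype(str).str.split(' ').str.len().max() <= 20
# every row belongs to one of the three event types
assert sub['session_type'].str.split('_').str[-1].isin(['clicks', 'carts', 'orders']).all()
print(sub.head())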
-------------------------------------------------------------------------------- /preprocess/BPRMF_ALSMF_LMF_prepare.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import numpy as np 3 | import pandas as pd 4 | import scipy.sparse as sparse 5 | import implicit 6 | IS_TRAIN = True 7 | type_transform = {"clicks": 0, "carts": 1, "orders": 2} 8 | 9 | 10 | def load_data(path): 11 | dfs = [] 12 | # 只导入训练数据 13 | for e, chunk_file in enumerate(glob.glob(path)): 14 | chunk = pd.read_parquet(chunk_file) 15 | chunk.ts = (chunk.ts / 1000).astype('int32') 16 | chunk['type'] = chunk['type'].map(type_transform).astype('int8') 17 | dfs.append(chunk) 18 | return pd.concat(dfs).reset_index(drop=True) 19 | 20 | 21 | if IS_TRAIN: 22 | test_df = load_data('/home/niejianfei/otto/CV/data/*_parquet/*') 23 | else: 24 | test_df = load_data('/home/niejianfei/otto/LB/data/*_parquet/*') 25 | 26 | 27 | dic1 = {0: 1, 1: 5, 2: 4} 28 | test_df['type'] = test_df['type'].map(dic1) 29 | grouped_df = test_df.groupby(['session', 'aid']).sum().reset_index() 30 | 31 | # sparse_content_person = sparse.csr_matrix( 32 | # (grouped_df['type'].astype(float), (grouped_df['aid'], grouped_df['session']))) 33 | sparse_person_content = sparse.csr_matrix( 34 | (grouped_df['type'].astype(float), (grouped_df['session'], grouped_df['aid']))) 35 | 36 | print(sparse_person_content.shape) 37 | # print(sparse_person_content.shape) 38 | 39 | alpha = 15 40 | sparse_person_content = (sparse_person_content * alpha).astype('double') 41 | 42 | # from implicit.nearest_neighbours import bm25_weight 43 | # # weight the matrix, both to reduce impact of users that have played the same artist thousands of times 44 | # # and to reduce the weight given to popular items 45 | # artist_user_plays = bm25_weight(sparse_person_content, K1=100, B=0.8) 46 | 47 | model1 = implicit.bpr.BayesianPersonalizedRanking(factors=64, regularization=0.1) 48 | model2 = implicit.als.AlternatingLeastSquares(factors=64, regularization=0.1) 49 | model3 = implicit.lmf.LogisticMatrixFactorization(factors=64, regularization=0.6) 50 | 51 | models = [model1, model2, model3] 52 | names = ['bpr', 'als', 'lmf'] 53 | 54 | for model, name in zip(models, names): 55 | model.fit(sparse_person_content) 56 | user_emb = model.user_factors.to_numpy() 57 | print("user") 58 | print(user_emb[0], len(user_emb)) 59 | print("item") 60 | item_emb = model.item_factors.to_numpy() 61 | print(item_emb[0], len(item_emb)) 62 | print('save') 63 | if IS_TRAIN: 64 | stage = 'CV' 65 | else: 66 | stage = 'LB' 67 | np.save(f'/home/niejianfei/otto/{stage}/preprocess/{name}_user_emb', user_emb) 68 | np.save(f'/home/niejianfei/otto/{stage}/preprocess/{name}_item_emb', item_emb) 69 | 70 | 71 | 72 | 73 | -------------------------------------------------------------------------------- /preprocess/ProNE_prepare.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import pickle 3 | import pandas as pd 4 | import numpy as np 5 | IS_TRAIN = True 6 | 7 | 8 | def load_data(path): 9 | type_transform = {"clicks": 0, "carts": 1, "orders": 2} 10 | dfs = [] 11 | # 只导入训练数据 12 | for e, chunk_file in enumerate(glob.glob(path)): 13 | chunk = pd.read_parquet(chunk_file) 14 | chunk.ts = (chunk.ts / 1000).astype('int32') 15 | chunk['type'] = chunk['type'].map(type_transform).astype('int8') 16 | dfs.append(chunk) 17 | return pd.concat(dfs).reset_index(drop=True) 18 | 19 | 20 | # 加载数据 21 | print('加载数据') 22 | 23 | if IS_TRAIN: 24 | 
train_sessions = load_data('/home/niejianfei/otto/CV/data/*_parquet/*') 25 | else: 26 | train_sessions = load_data('/home/niejianfei/otto/LB/data/*_parquet/*') 27 | print(train_sessions) 28 | 29 | dic = pd.DataFrame(train_sessions.drop_duplicates(['aid']).sort_values(by='aid', ascending=True)['aid']) 30 | dic['num'] = range(len(dic)) 31 | dic.index = dic['aid'] 32 | dic = dic.drop(columns='aid').to_dict()['num'] 33 | # print(dic) 34 | 35 | # 保存矩阵到本地 36 | if IS_TRAIN: 37 | f_save = open('/home/niejianfei/otto/CV/preprocess/aid_num_dict.pkl', 'wb') 38 | pickle.dump(dic, f_save) 39 | f_save.close() 40 | else: 41 | f_save = open('/home/niejianfei/otto/LB/preprocess/aid_num_dict.pkl', 'wb') 42 | pickle.dump(dic, f_save) 43 | f_save.close() 44 | print("aid_num映射保存完毕!!!") 45 | 46 | 47 | def generate_pairs(df): 48 | df = df.sort_values(by=['session', 'ts']) 49 | print(df) 50 | df['aid'] = df['aid'].map(dic) 51 | print(df) 52 | 53 | print('count 1') 54 | df['session_count'] = df['session'].map(df['session'].value_counts()) 55 | print(df) 56 | df1 = df[df['session_count'] == 1] 57 | df = df.append(df1) 58 | print('count 2') 59 | df['session_count'] = df['session'].map(df['session'].value_counts()) 60 | print(df['session_count'].min()) 61 | print(df) 62 | 63 | df = df.sort_values(by=['session', 'ts']) 64 | df['ranking'] = df.groupby(['session'])['ts'].rank(method='first', ascending=True) 65 | print(df) 66 | df['aid_next'] = df['aid'].shift(-1) 67 | print(df) 68 | df = df.query('session_count!=ranking').reset_index(drop=True) 69 | 70 | df['aid_next'] = df['aid_next'].astype('int32') 71 | print(df) 72 | df = df[['aid', 'aid_next']] 73 | print(df) 74 | pairs_list = np.array(df) 75 | return pairs_list 76 | 77 | 78 | pairs_list = generate_pairs(train_sessions).tolist() 79 | print(pairs_list[:10]) 80 | 81 | if IS_TRAIN: 82 | f = open('/home/niejianfei/otto/CV/preprocess/session_pairs.ungraph', "w") 83 | for line in pairs_list: 84 | f.write(str(line[0]) + ' ' + str(line[1]) + '\n') 85 | f.close() 86 | else: 87 | f = open('/home/niejianfei/otto/LB/preprocess/session_pairs.ungraph', "w") 88 | for line in pairs_list: 89 | f.write(str(line[0]) + ' ' + str(line[1]) + '\n') 90 | f.close() 91 | -------------------------------------------------------------------------------- /preprocess/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/niejianfei/Kaggle_OTTO_Multi-Objective_Recommender_System/3d1f7bff40891628f7a2edd2b31d6a40011aa38a/preprocess/__init__.py -------------------------------------------------------------------------------- /preprocess/co-visitation_matrix_prepare.py: -------------------------------------------------------------------------------- 1 | import gc 2 | import glob 3 | import cudf 4 | import numpy as np 5 | import pandas as pd 6 | 7 | print('We will use RAPIDS version', cudf.__version__) 8 | VER = 6 9 | type_weight = {0: 1, 1: 5, 2: 4} 10 | IS_TRAIN = True 11 | use_all_data = True 12 | 13 | 14 | # CACHE FUNCTIONS 15 | # 读取文件路径,将cpu RAM上的对应df读取到GPU上 16 | def read_file(f): 17 | return cudf.DataFrame(data_cache[f]) 18 | 19 | 20 | def read_file_to_cache(f): 21 | df = pd.read_parquet(f) 22 | df.ts = (df.ts / 1000).astype('int32') 23 | if not use_all_data: 24 | # 除去第一周的数据 25 | df = df[df['ts'] >= 1659909599] 26 | df['type'] = df['type'].map(type_labels).astype('int8') 27 | return df 28 | 29 | 30 | # CACHE THE DATA ON CPU BEFORE PROCESSING ON GPU 31 | # 存储在cpu上的字典 32 | data_cache = {} 33 | type_labels = {'clicks': 0, 
'carts': 1, 'orders': 2} 34 | if IS_TRAIN: 35 | # glob模块用来查找文件目录和文件,并将搜索的到的结果返回到一个列表中 36 | files = glob.glob('/home/niejianfei/otto/CV/data/*_parquet/*') 37 | # 存到字典里面存到cpu的RAM里面,字典的键是文件路径,值是对应路径文件生成的dataframe 38 | for f in files: data_cache[f] = read_file_to_cache(f) 39 | else: 40 | # glob模块用来查找文件目录和文件,并将搜索的到的结果返回到一个列表中 41 | files = glob.glob('/home/niejianfei/otto/LB/data/*_parquet/*') 42 | # 存到字典里面存到cpu的RAM里面,字典的键是文件路径,值是对应路径文件生成的dataframe 43 | for f in files: data_cache[f] = read_file_to_cache(f) 44 | 45 | # CHUNK PARAMETERS 46 | # 分成5组 47 | READ_CT = 5 48 | # ceil向上取整将文件分成chunk=len/6块,将文件分成6块 49 | CHUNK = int(np.ceil(len(files) / 6)) 50 | print(f'We will process {len(files)} files, in groups of {READ_CT} and chunks of {CHUNK}.') 51 | 52 | # USE SMALLEST DISK_PIECES POSSIBLE WITHOUT MEMORY ERROR 53 | DISK_PIECES = 4 54 | # item的数量/分区数量 55 | SIZE = 1.86e6 / DISK_PIECES 56 | 57 | # "Carts Orders" Co-visitation Matrix - Type Weighted 58 | # COMPUTE IN PARTS FOR MEMORY MANGEMENT 59 | # for循环分块计算 60 | for PART in range(DISK_PIECES): # 一次循环计算150个文件中的1/4 个items(item[0,180w/4]), 61 | print() 62 | print('### DISK PART', PART + 1) 63 | 64 | # MERGE IS FASTEST PROCESSING CHUNKS WITHIN CHUNKS 65 | # => OUTER CHUNKS 66 | # 150个文件分成6大块,每一块25个小文件 67 | for j in range(6): # 6 * 5 *5 = 30 * 5 = 150 68 | a = j * CHUNK 69 | b = min((j + 1) * CHUNK, len(files)) 70 | print(f'Processing files {a} thru {b - 1} in groups of {READ_CT}...') 71 | 72 | # => INNER CHUNKS 73 | # 25个小文件分成5份,每份5个文件,读取最开始的那1份文件 74 | for k in range(a, b, READ_CT): 75 | # READ FILE 76 | # 读到GPU里面去,df为list 77 | df = [read_file(files[k])] 78 | for i in range(1, READ_CT): # 在上述一份文件的基础上,在添加4份文件到GPU 79 | if k + i < b: df.append(read_file(files[k + i])) 80 | # 融合5个dataframe信息 81 | df = cudf.concat(df, ignore_index=True, axis=0) 82 | # 升序排列session,降序排列ts 83 | df = df.sort_values(['session', 'ts'], ascending=[True, False]) 84 | 85 | # USE TAIL OF SESSION 86 | df = df.reset_index(drop=True) 87 | # session分组排序标序号,[0-count-1],df顺序不变,0-n-1的顺序是降序排列,留下session最近的30个item 88 | df['n'] = df.groupby('session').cumcount() 89 | # 过滤数据,筛选出n小于30的session,类似于baseline中的ranking和session_day count 90 | df = df.loc[df.n < 30].drop('n', axis=1) 91 | 92 | # CREATE PAIRS 93 | df = df.merge(df, on='session') 94 | # 构造item对,这两个item被user查看的时间相差不到一天 95 | df = df.loc[((df.ts_x - df.ts_y).abs() < 24 * 60 * 60) & (df.aid_x != df.aid_y)] 96 | 97 | # MEMORY MANAGEMENT COMPUTE IN PARTS 98 | # 内存管理,这里对df的计算继续分区(采用过滤的方式),分part计算,一共有1800000个item,size=sum(items)/ DISK_PIECES 99 | df = df.loc[(df.aid_x >= PART * SIZE) & (df.aid_x < (PART + 1) * SIZE)] 100 | 101 | # ASSIGN WEIGHTS 102 | # 只留下 session ,item pair,type信息并去重 103 | df = df[['session', 'aid_x', 'aid_y', 'type_y']].drop_duplicates(['session', 'aid_x', 'aid_y', 'type_y']) 104 | # 根据merge的aid_y的类型赋予权重,,x的类型不知道?? 
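# Illustrative arithmetic for the weighting below: with type_weight = {0: 1, 1: 5, 2: 4},
# a pair whose aid_y appears once as a cart and once as an order accumulates
# wgt = 5 + 4 = 9 after the groupby-sum a few lines further down.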
105 | df['wgt'] = df.type_y.map(type_weight) 106 | # 去掉session和type信息 107 | df = df[['aid_x', 'aid_y', 'wgt']] 108 | df.wgt = df.wgt.astype('float32') 109 | # items pair groupby分组计算权重 click/carts/orders 1/5/4 110 | df = df.groupby(['aid_x', 'aid_y']).wgt.sum() 111 | # print(df) 112 | # COMBINE INNER CHUNKS 113 | if k == a: 114 | tmp2 = df 115 | else: 116 | tmp2 = tmp2.add(df, fill_value=0) 117 | print(k, ', ', end='') 118 | 119 | print() 120 | 121 | # COMBINE OUTER CHUNKS 122 | if a == 0: 123 | tmp = tmp2 124 | else: 125 | tmp = tmp.add(tmp2, fill_value=0) 126 | del tmp2, df 127 | gc.collect() 128 | 129 | # CONVERT MATRIX TO DICTIONARY 130 | tmp = tmp.reset_index() 131 | tmp = tmp.sort_values(['aid_x', 'wgt'], ascending=[True, False]) 132 | print(tmp) 133 | # SAVE TOP 40 15 134 | tmp = tmp.reset_index(drop=True) 135 | tmp['n'] = tmp.groupby('aid_x').aid_y.cumcount() 136 | print(tmp) 137 | tmp = tmp.loc[tmp.n < 50] 138 | print(tmp) 139 | # SAVE PART TO DISK (convert to pandas first uses less memory) 140 | if IS_TRAIN: 141 | tmp.to_pandas().to_parquet(f'/home/niejianfei/otto/CV/preprocess/top_15_carts_orders_v{VER}_{PART}.pqt') 142 | else: 143 | if use_all_data: 144 | tmp.to_pandas().to_parquet( 145 | f'/home/niejianfei/otto/LB/preprocess/all_data_top_15_carts_orders_v{VER}_{PART}.pqt') 146 | else: 147 | tmp.to_pandas().to_parquet( 148 | f'/home/niejianfei/otto/LB/preprocess/top_15_carts_orders_v{VER}_{PART}.pqt') 149 | 150 | # 2."Buy2Buy" Co-visitation Matrix 151 | # USE SMALLEST DISK_PIECES POSSIBLE WITHOUT MEMORY ERROR 152 | DISK_PIECES = 1 153 | SIZE = 1.86e6 / DISK_PIECES 154 | 155 | # COMPUTE IN PARTS FOR MEMORY MANGEMENT 156 | for PART in range(DISK_PIECES): 157 | print() 158 | print('### DISK PART', PART + 1) 159 | 160 | # MERGE IS FASTEST PROCESSING CHUNKS WITHIN CHUNKS 161 | # => OUTER CHUNKS 162 | for j in range(6): 163 | a = j * CHUNK 164 | b = min((j + 1) * CHUNK, len(files)) 165 | print(f'Processing files {a} thru {b - 1} in groups of {READ_CT}...') 166 | 167 | # => INNER CHUNKS 168 | for k in range(a, b, READ_CT): 169 | 170 | # READ FILE 171 | df = [read_file(files[k])] 172 | for i in range(1, READ_CT): 173 | if k + i < b: df.append(read_file(files[k + i])) 174 | df = cudf.concat(df, ignore_index=True, axis=0) 175 | df = df.loc[df['type'].isin([1, 2])] # ONLY WANT CARTS AND ORDERS 176 | df = df.sort_values(['session', 'ts'], ascending=[True, False]) 177 | 178 | # USE TAIL OF SESSION 179 | df = df.reset_index(drop=True) 180 | df['n'] = df.groupby('session').cumcount() 181 | df = df.loc[df.n < 30].drop('n', axis=1) 182 | 183 | # CREATE PAIRS 184 | df = df.merge(df, on='session') 185 | df = df.loc[((df.ts_x - df.ts_y).abs() < 14 * 24 * 60 * 60) & (df.aid_x != df.aid_y)] # 14 DAYS 186 | 187 | # MEMORY MANAGEMENT COMPUTE IN PARTS 188 | df = df.loc[(df.aid_x >= PART * SIZE) & (df.aid_x < (PART + 1) * SIZE)] 189 | 190 | # ASSIGN WEIGHTS 191 | df = df[['session', 'aid_x', 'aid_y', 'type_y']].drop_duplicates(['session', 'aid_x', 'aid_y', 'type_y']) 192 | df['wgt'] = 1 193 | df = df[['aid_x', 'aid_y', 'wgt']] 194 | df.wgt = df.wgt.astype('float32') 195 | df = df.groupby(['aid_x', 'aid_y']).wgt.sum() 196 | 197 | # COMBINE INNER CHUNKS 198 | if k == a: 199 | tmp2 = df 200 | else: 201 | tmp2 = tmp2.add(df, fill_value=0) 202 | print(k, ', ', end='') 203 | 204 | print() 205 | 206 | # COMBINE OUTER CHUNKS 207 | if a == 0: 208 | tmp = tmp2 209 | else: 210 | tmp = tmp.add(tmp2, fill_value=0) 211 | del tmp2, df 212 | gc.collect() 213 | 214 | # CONVERT MATRIX TO DICTIONARY 215 | tmp = 
tmp.reset_index() 216 | tmp = tmp.sort_values(['aid_x', 'wgt'], ascending=[True, False]) 217 | 218 | # SAVE TOP 15 219 | tmp = tmp.reset_index(drop=True) 220 | tmp['n'] = tmp.groupby('aid_x').aid_y.cumcount() 221 | tmp = tmp.loc[tmp.n < 50] 222 | # SAVE PART TO DISK (convert to pandas first uses less memory) 223 | if IS_TRAIN: 224 | tmp.to_pandas().to_parquet(f'/home/niejianfei/otto/CV/preprocess/top_15_buy2buy_v{VER}_{PART}.pqt') 225 | else: 226 | if use_all_data: 227 | tmp.to_pandas().to_parquet( 228 | f'/home/niejianfei/otto/LB/preprocess/all_data_top_15_buy2buy_v{VER}_{PART}.pqt') 229 | else: 230 | tmp.to_pandas().to_parquet(f'/home/niejianfei/otto/LB/preprocess/top_15_buy2buy_v{VER}_{PART}.pqt') 231 | 232 | # 3."Clicks" Co-visitation Matrix - Time Weighted 233 | # USE SMALLEST DISK_PIECES POSSIBLE WITHOUT MEMORY ERROR 234 | DISK_PIECES = 4 235 | SIZE = 1.86e6 / DISK_PIECES 236 | 237 | # COMPUTE IN PARTS FOR MEMORY MANGEMENT 238 | for PART in range(DISK_PIECES): 239 | print() 240 | print('### DISK PART', PART + 1) 241 | 242 | # MERGE IS FASTEST PROCESSING CHUNKS WITHIN CHUNKS 243 | # => OUTER CHUNKS 244 | for j in range(6): 245 | a = j * CHUNK 246 | b = min((j + 1) * CHUNK, len(files)) 247 | print(f'Processing files {a} thru {b - 1} in groups of {READ_CT}...') 248 | 249 | # => INNER CHUNKS 250 | for k in range(a, b, READ_CT): 251 | # READ FILE 252 | df = [read_file(files[k])] 253 | for i in range(1, READ_CT): 254 | if k + i < b: df.append(read_file(files[k + i])) 255 | df = cudf.concat(df, ignore_index=True, axis=0) 256 | df = df.sort_values(['session', 'ts'], ascending=[True, False]) 257 | 258 | # USE TAIL OF SESSION 259 | df = df.reset_index(drop=True) 260 | df['n'] = df.groupby('session').cumcount() 261 | df = df.loc[df.n < 30].drop('n', axis=1) 262 | 263 | # CREATE PAIRS 264 | df = df.merge(df, on='session') 265 | df = df.loc[((df.ts_x - df.ts_y).abs() < 24 * 60 * 60) & (df.aid_x != df.aid_y)] 266 | 267 | # MEMORY MANAGEMENT COMPUTE IN PARTS 268 | df = df.loc[(df.aid_x >= PART * SIZE) & (df.aid_x < (PART + 1) * SIZE)] 269 | 270 | # ASSIGN WEIGHTS 271 | df = df[['session', 'aid_x', 'aid_y', 'ts_x']].drop_duplicates(['session', 'aid_x', 'aid_y']) 272 | df['wgt'] = 1 + 3 * (df.ts_x - 1659304800) / (1662328791 - 1659304800) # 归一化数据,离得时间越近,权重越大 273 | # 1659304800 : minimum timestamp 274 | # 1662328791 : maximum timestamp 275 | df = df[['aid_x', 'aid_y', 'wgt']] 276 | df.wgt = df.wgt.astype('float32') 277 | df = df.groupby(['aid_x', 'aid_y']).wgt.sum() 278 | 279 | # COMBINE INNER CHUNKS 280 | if k == a: 281 | tmp2 = df 282 | else: 283 | tmp2 = tmp2.add(df, fill_value=0) 284 | print(k, ', ', end='') 285 | print() 286 | 287 | # COMBINE OUTER CHUNKS 288 | if a == 0: 289 | tmp = tmp2 290 | else: 291 | tmp = tmp.add(tmp2, fill_value=0) 292 | del tmp2, df 293 | gc.collect() 294 | 295 | # CONVERT MATRIX TO DICTIONARY 296 | tmp = tmp.reset_index() 297 | tmp = tmp.sort_values(['aid_x', 'wgt'], ascending=[True, False]) 298 | 299 | # SAVE TOP 20 300 | tmp = tmp.reset_index(drop=True) 301 | tmp['n'] = tmp.groupby('aid_x').aid_y.cumcount() 302 | tmp = tmp.loc[tmp.n < 50] 303 | 304 | # SAVE PART TO DISK (convert to pandas first uses less memory) 305 | if IS_TRAIN: 306 | tmp.to_pandas().to_parquet(f'/home/niejianfei/otto/CV/preprocess/top_20_clicks_v{VER}_{PART}.pqt') 307 | else: 308 | if use_all_data: 309 | tmp.to_pandas().to_parquet( 310 | f'/home/niejianfei/otto/LB/preprocess/all_data_top_20_clicks_v{VER}_{PART}.pqt') 311 | else: 312 | 
tmp.to_pandas().to_parquet(f'/home/niejianfei/otto/LB/preprocess/top_20_clicks_v{VER}_{PART}.pqt') 313 | -------------------------------------------------------------------------------- /preprocess/deepwalk_prepare.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import pandas as pd 3 | from tqdm import tqdm 4 | from collections import defaultdict 5 | from gensim.models import Word2Vec 6 | import numpy as np 7 | 8 | type_transform = {"clicks": 0, "carts": 1, "orders": 2} 9 | IS_TRAIN = True 10 | IS_Last_Month = True 11 | 12 | 13 | def load_data(path): 14 | dfs = [] 15 | # only load the training data 16 | for e, chunk_file in enumerate(glob.glob(path)): 17 | chunk = pd.read_parquet(chunk_file) 18 | chunk.ts = (chunk.ts / 1000).astype('int32') 19 | # if not IS_TRAIN: 20 | # # drop the first week of data 21 | # chunk = chunk[chunk['ts'] >= 1659909599] 22 | chunk['type'] = chunk['type'].map(type_transform).astype('int8') 23 | dfs.append(chunk) 24 | return pd.concat(dfs).reset_index(drop=True) 25 | 26 | 27 | # load the data 28 | print('Loading data') 29 | if IS_TRAIN: 30 | if IS_Last_Month: 31 | train_sessions = load_data('/home/niejianfei/otto/CV/data/*_parquet/*') 32 | print(train_sessions) 33 | else: 34 | train_sessions = load_data('/home/niejianfei/otto/CV/data/test_parquet/*') 35 | print(train_sessions) 36 | else: 37 | if IS_Last_Month: 38 | train_sessions = load_data('/home/niejianfei/otto/LB/data/*_parquet/*') 39 | print(train_sessions) 40 | else: 41 | train_sessions = load_data('/home/niejianfei/otto/LB/data/test_parquet/*') 42 | print(train_sessions) 43 | 44 | print('Sorting') 45 | # group by session and sort each session's events by timestamp 46 | df = train_sessions.sort_values(by=["session", "ts"], ascending=True) 47 | print(df.head(10)) 48 | 49 | print('Building the graph') 50 | # build the user-item bipartite graph 51 | dic = defaultdict(list) # defaultdict supplies a default value for keys not yet in the dict 52 | # the prefixes distinguish items from users 53 | for x in tqdm(df[["session", "aid"]].values): 54 | dic[f"user_{x[0]}"].append(f"item_{x[1]}") # list elements keep their insertion order 55 | dic[f"item_{x[1]}"].append(f"user_{x[0]}") 56 | 57 | # random walks 58 | print('Starting random walks') 59 | # starting from an item, pick one of its sessions, then step to the item that follows it in that session 60 | # precompute the adjacency-list length of every user and item 61 | dic_count = {} 62 | for key in dic: 63 | dic_count[key] = len(dic[key]) 64 | 65 | item_list = df["aid"].unique() 66 | user_list = df["session"].unique() 67 | print('number of items', len(item_list)) 68 | print('number of users', len(user_list)) 69 | 70 | path_length = 20 71 | sentences = [] 72 | num_sentences = 20000000 # 500k+ walks are recommended for a real run (with ~20k items) 73 | ''' 74 | bad case: 75 | item_a : session_1 76 | session_1 : [item_b,item_a] 77 | a max_repeat_time is needed to avoid an infinite loop 78 | ''' 79 | 80 | max_repeat_nums = path_length * 2 81 | for _ in tqdm(range(num_sentences)): 82 | start_item = "item_{}".format(item_list[np.random.randint(0, len(item_list))]) 83 | sentence = [start_item] 84 | repeat_time = 0 85 | while len(sentence) < path_length: 86 | last_item = sentence[-1] 87 | random_user = dic[last_item][np.random.randint(0, dic_count[last_item])] # from the last item, fetch its user list and sample one user 88 | # if two identical items sit next to each other, the +1 jumps past the repeat; re-sampling a session at random can still escape, and such closed loops can also exist in the graph 89 | next_item_index = np.where(np.array(dic[random_user]) == last_item)[0][ 90 | 0] + 1 # index of last_item inside random_user's item list, plus one 91 | # if the item is not the last one in that user's sequence, append the following item 92 | # if it is the last one, do nothing and keep looping; this is where the bad case can occur 93 | if next_item_index <= dic_count[random_user] - 1: 94 | next_item = dic[random_user][next_item_index] 95 | sentence.append(next_item) 96 | repeat_time += 1 97 | if repeat_time > max_repeat_nums: 98 | break 99 | sentences.append(sentence) 100 | 101 | # embedding_dimensions = number_of_categories**0.25
102 | model = Word2Vec(sentences, vector_size=64, sg=1, window=5, min_count=1, hs=1, negative=5, sample=0.001, workers=4) 103 | # 保存模型 104 | if IS_TRAIN: 105 | if IS_Last_Month: 106 | model.wv.save_word2vec_format('/home/niejianfei/otto/CV/preprocess/deepwalk_last_month.w2v', binary=False) 107 | else: 108 | model.wv.save_word2vec_format('/home/niejianfei/otto/CV/preprocess/deepwalk_last_week.w2v', binary=False) 109 | else: 110 | if IS_Last_Month: 111 | model.wv.save_word2vec_format('/home/niejianfei/otto/LB/preprocess/deepwalk_last_month.w2v', binary=False) 112 | else: 113 | model.wv.save_word2vec_format('/home/niejianfei/otto/LB/preprocess/deepwalk_last_week.w2v', binary=False) 114 | -------------------------------------------------------------------------------- /ranker.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import pandas as pd 3 | import xgboost as xgb 4 | import numpy as np 5 | from sklearn.model_selection import GroupKFold 6 | from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score 7 | 8 | 9 | def load_data(path): 10 | type_transform = {"clicks": 0, "carts": 1, "orders": 2} 11 | dfs = [] 12 | # 只导入训练数据 13 | for e, chunk_file in enumerate(glob.glob(path)): 14 | chunk = pd.read_parquet(chunk_file) 15 | chunk.ts = (chunk.ts / 1000).astype('int32') 16 | chunk['type'] = chunk['type'].map(type_transform).astype('int8') 17 | dfs.append(chunk) 18 | return pd.concat(dfs).reset_index(drop=True) 19 | 20 | 21 | def load_train_data(t, semi_sessions): 22 | dfs = [] 23 | # 只导入训练数据 24 | k = 0 25 | print('开始导入数据') 26 | for i in range(0, 8): 27 | path = f"/home/niejianfei/otto/CV/candidates/candidates_{t[0:-1]}_features_data/candidate_{t[0:-1]}_{i}.pqt" 28 | print(f'第{i + 1}块数据') 29 | chunk = pd.read_parquet(path) 30 | print(path) 31 | print(chunk) 32 | 33 | train = chunk[chunk.session.isin(semi_sessions)] 34 | 35 | chunk = train.astype("float32") 36 | chunk['session'] = chunk['session'].astype('int32') 37 | chunk['aid'] = chunk['aid'].astype('int32') 38 | 39 | chunk_pos = chunk[chunk[t[0:-1]] == 1].sort_values(by='session', ascending=True) 40 | print('正类', len(chunk_pos)) 41 | chunk_neg = chunk[chunk[t[0:-1]] == 0].sample(len(chunk_pos) * 30, random_state=random_state) 42 | chunk = chunk_neg.append(chunk_pos).sort_values(by='session', ascending=True) 43 | dfs.append(chunk) 44 | print(len(chunk)) 45 | k += len(chunk_pos) 46 | print(f'正类一共有:', k) 47 | return pd.concat(dfs).reset_index(drop=True) 48 | 49 | 50 | # 训练 51 | def train_xgb(candidate_type, semi_sessions, describe): 52 | for t in candidate_type: 53 | candidates = load_train_data(t, semi_sessions) 54 | print(candidates) 55 | 56 | # 训练 57 | candidates = candidates.sort_values(by='session', ascending=True) 58 | FEATURES = candidates.columns[0:-1] 59 | print(FEATURES) 60 | 61 | skf = GroupKFold(n_splits=5) 62 | for fold, (train_idx, valid_idx) in enumerate( 63 | skf.split(candidates, candidates[t[0:-1]], groups=candidates['session'])): 64 | # loc: 标签索引 65 | X_train_ = candidates.loc[train_idx, FEATURES] 66 | X_train = X_train_.drop(columns=['session', 'aid']) 67 | y_train = candidates.loc[train_idx, t[0:-1]] 68 | 69 | X_valid_ = candidates.loc[valid_idx, FEATURES] 70 | X_valid = X_valid_.drop(columns=['session', 'aid']) 71 | y_valid = candidates.loc[valid_idx, t[0:-1]] 72 | 73 | groups1 = X_train_.groupby('session').aid.agg('count').values 74 | groups2 = X_valid_.groupby('session').aid.agg('count').values 75 | # 读取数据,每个user一起训练 76 | # 
DMatrix是XGBoost使用的内部数据结构,它针对内存效率和训练速度进行了优化 77 | dtrain = xgb.DMatrix(X_train, y_train, group=groups1) 78 | dvalid = xgb.DMatrix(X_valid, y_valid, group=groups2) 79 | # 就当成是一组 80 | # dtrain = xgb.DMatrix(X_train, y_train) 81 | # dvalid = xgb.DMatrix(X_valid, y_valid) 82 | 83 | xgb_parms = {'booster': 'gbtree', 84 | 'tree_method': 'gpu_hist', 85 | 'objective': 'binary:logistic', 86 | 'eta': 0.01, 87 | 'eval_metric': 'logloss', 88 | 'seed': 0, 89 | # 'early_stopping_rounds': 300, 90 | # 'subsample': 0.5, 91 | # 'colsample_bytree': 0.5, 92 | # 'max_depth': 3, 93 | # 'reg_alpha': 1, 94 | 'reg_lambda': 20, 95 | 'scale_pos_weight': 30} 96 | 97 | model = xgb.train(xgb_parms, 98 | dtrain=dtrain, 99 | evals=[(dtrain, 'train'), (dvalid, 'valid')], 100 | num_boost_round=3000, 101 | verbose_eval=100, 102 | ) 103 | 104 | print(f"第{fold + 1}次开始输出模型指标!!!") 105 | name = 'XGB' 106 | dtrain1 = xgb.DMatrix(X_train) 107 | dtest1 = xgb.DMatrix(X_valid) 108 | 109 | def sigmoid(x): 110 | return 1. / (1 + np.exp(-x)) 111 | 112 | y_train_pred_pre = np.array(model.predict(dtrain1)) 113 | # y_train_pred_pre = sigmoid(y_train_pred_pre) 114 | print(y_train_pred_pre[:10]) 115 | y_train_pred = np.array(y_train_pred_pre) 116 | 117 | y_train_pred[y_train_pred >= 0.5] = int(1) 118 | y_train_pred[y_train_pred < 0.5] = int(0) 119 | print(y_train_pred[:10]) 120 | 121 | y_test_pred_pre = np.array(model.predict(dtest1)) 122 | # y_test_pred_pre = sigmoid(y_test_pred_pre) 123 | y_test_pred = np.array(y_test_pred_pre) 124 | 125 | y_test_pred[y_test_pred >= 0.5] = int(1) 126 | y_test_pred[y_test_pred < 0.5] = int(0) 127 | 128 | # accuracy 129 | train_accuracy = accuracy_score(y_train, y_train_pred) 130 | test_accuracy = accuracy_score(y_valid, y_test_pred) 131 | 132 | # precision 133 | train_precision = precision_score(y_train, y_train_pred) 134 | test_precision = precision_score(y_valid, y_test_pred) 135 | # recall 136 | train_recall = recall_score(y_train, y_train_pred) 137 | test_recall = recall_score(y_valid, y_test_pred) 138 | # f1 139 | train_f1 = f1_score(y_train, y_train_pred) 140 | test_f1 = f1_score(y_valid, y_test_pred) 141 | # auc 计算时,计算的应该是不同的概率画出来的曲线下的面积,而不是预测值对应的曲线下的面积 142 | 143 | train_auc = roc_auc_score(y_train, y_train_pred_pre) 144 | test_auc = roc_auc_score(y_valid, y_test_pred_pre) 145 | 146 | print('{} 训练集: accuracy:{:.3},precision:{:.3}, recall:{:.3}, f1:{:.3}, auc:{:.3}'.format(name, 147 | train_accuracy, 148 | train_precision, 149 | train_recall, 150 | train_f1, 151 | train_auc)) 152 | print( 153 | '{} 验证集: accuracy:{:.3},precision:{:.3}, recall:{:.3}, f1:{:.3}, auc:{:.3}'.format(name, test_accuracy, 154 | test_precision, 155 | test_recall, 156 | test_f1, 157 | test_auc)) 158 | importance_weight = model.get_score(fmap='', importance_type='weight') 159 | print('weight', importance_weight) 160 | importance_gain = model.get_score(fmap='', importance_type='gain') 161 | print('gain', importance_gain) 162 | 163 | model.save_model(f'/home/niejianfei/otto/CV/models/xgb_fold{fold}_{t[0:-1]}_{describe}.xgb') 164 | 165 | 166 | # 预测 167 | def xgb_inference(key, stage, t, semi_sessions, describe): 168 | fold_num = 5 169 | dfs = [] 170 | # 只导入训练数据 171 | for e, chunk_file in enumerate( 172 | glob.glob(f"/home/niejianfei/otto/{stage}/candidates/candidates_{t[0:-1]}_features_data/*")): 173 | print(f"第{e + 1}块数据!!!") 174 | 175 | chunk = pd.read_parquet(chunk_file) 176 | print(chunk) 177 | print(chunk.columns) 178 | 179 | if stage == 'CV': 180 | x_train = chunk[chunk.session.isin(semi_sessions)].astype("float32") 181 | x_test 
= chunk[~chunk.session.isin(semi_sessions)].astype("float32") 182 | 183 | if key == 'train': 184 | chunk = x_train.astype("float32") 185 | if len(chunk) == 0: 186 | continue 187 | else: 188 | chunk = x_test.astype("float32") 189 | if len(chunk) == 0: 190 | continue 191 | print(f'{key}长度为', len(chunk)) 192 | FEATURES = chunk.columns[2:-1] 193 | chunk['session'] = chunk['session'].astype('int32') 194 | chunk['aid'] = chunk['aid'].astype('int32') 195 | 196 | preds = np.zeros(len(chunk)) 197 | for fold in range(fold_num): 198 | print(f"第{fold + 1}次预测!!!") 199 | 200 | model = xgb.Booster() 201 | model.load_model( 202 | f'/home/niejianfei/otto/CV/models/xgb_fold{fold}_{t[0:-1]}_{describe}.xgb') 203 | model.set_param({'predictor': 'gpu_predictor'}) 204 | print("开始构建test数据集!!!") 205 | dtest = xgb.DMatrix(data=chunk[FEATURES]) 206 | print("开始预测!!!") 207 | preds += model.predict(dtest) / fold_num 208 | print(preds.max()) 209 | print(f"第{e + 1}次构建predictions!!!") 210 | predictions = chunk[['session', 'aid']].copy() 211 | predictions['pred'] = preds 212 | print(predictions[:10]) 213 | dfs.append(predictions) 214 | return pd.concat(dfs, axis=0).reset_index(drop=True) 215 | 216 | 217 | def generate_submission(key, stage, candidate_type, semi_sessions, describe): 218 | for t in candidate_type: 219 | predictions = xgb_inference(key, stage, t, semi_sessions, describe) 220 | 221 | print("开始构造submission!!!") 222 | predictions = predictions.sort_values(['session', 'pred'], ascending=[True, False]).reset_index( 223 | drop=True).drop_duplicates(['session', 'aid'], keep='first') 224 | predictions['n'] = predictions.groupby('session').aid.cumcount().astype('int32') 225 | print(predictions[:200]) 226 | print(f"开始筛选<20!!!") 227 | predictions1 = predictions[predictions['n'] < 20] 228 | print(predictions1[:20]) 229 | 230 | sub = predictions1.groupby('session').aid.apply(list) 231 | sub = sub.to_frame().reset_index() 232 | sub.aid = sub.aid.apply(lambda x: " ".join(map(str, x))) 233 | sub.columns = ['session_type', 'labels'] 234 | sub.session_type = sub.session_type.astype('str') + f'_{t}' 235 | print(len(sub)) 236 | print("开始写入本地!!!") 237 | sub.to_parquet(f'/home/niejianfei/otto/{stage}/submission/sub_{t}.pqt') 238 | 239 | 240 | def get_recall(key, candidate_type): 241 | for t in candidate_type: 242 | print("开始读取数据!!!") 243 | pred_df = pd.read_parquet(f'/home/niejianfei/otto/CV/submission/sub_{t}.pqt') 244 | print(len(pred_df)) 245 | 246 | sub = pred_df.loc[pred_df.session_type.str.contains(t)].copy() 247 | sub['session'] = sub.session_type.apply(lambda x: int(x.split('_')[0])) 248 | sub.labels = sub.labels.apply(lambda x: [int(i) for i in x.split(' ')]) 249 | print("开始读取labels!!!") 250 | test_labels = pd.read_parquet(f'/home/niejianfei/otto/CV/preprocess/test_labels.parquet') 251 | print(len(test_labels)) 252 | print(len(pred_df) - len(pred_df)) 253 | test_labels = test_labels.loc[test_labels['type'] == t] 254 | test_labels = test_labels.merge(sub, how='left', on=['session']) 255 | test_labels['hits'] = test_labels.apply( 256 | lambda df: min(20, len(set(df.ground_truth).intersection(set(df.labels)))), axis=1) 257 | # 设定阈值 长度多于20,定为20 258 | test_labels['gt_count'] = test_labels.ground_truth.str.len().clip(0, 20) 259 | print(f"开始计算{key}recall!!!") 260 | recall = test_labels['hits'].sum() / test_labels['gt_count'].sum() 261 | print(f'{key} {t} recall@20 =', recall) 262 | 263 | 264 | def user_sample(frac): 265 | return valid.drop_duplicates(['session']).sample(frac=frac, random_state=random_state)['session'] 266 | 267 
| 268 | if __name__ == '__main__': 269 | # sample half of the sessions for recall evaluation 270 | random_state = 33 271 | valid = load_data(f'/home/niejianfei/otto/CV/data/test_parquet/*') 272 | 273 | candidate_type = ['clicks', 'carts', 'orders'] 274 | describe = 'final' 275 | 276 | train_xgb(candidate_type, user_sample(0.5), describe) 277 | generate_submission('test', 'CV', candidate_type, user_sample(0.5), describe) 278 | get_recall('test', candidate_type) 279 | 280 | # train on all sessions to build the models used for the final prediction 281 | train_xgb(candidate_type, user_sample(1), 'final_all_data') 282 | --------------------------------------------------------------------------------
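The per-type recall@20 values printed by `get_recall` can be folded into a single validation score. A small helper sketch, assuming the competition's clicks/carts/orders weighting of 0.10/0.30/0.60; the recall values passed in below are placeholders, not measured results.

def weighted_score(recalls):
    # recalls: dict of recall@20 per type, e.g. collected from get_recall's output
    weights = {'clicks': 0.10, 'carts': 0.30, 'orders': 0.60}
    return sum(weights[t] * recalls[t] for t in weights)

# illustrative placeholder values only
print(weighted_score({'clicks': 0.50, 'carts': 0.40, 'orders': 0.65}))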