├── LICENSE
├── Qualification
    └── Qualification.py
├── README.txt
├── README_CN.md
├── Semi-Finals
    ├── online_recommendation
    │   ├── dockerfile
    │   ├── downward_map.zip
    │   ├── lgb_0924_1652
    │   ├── model0924_base.file
    │   ├── read_me.txt
    │   ├── run.sh
    │   ├── test.py
    │   └── upward_map.zip
    └── underline_trainning
    │   ├── Readme.pdf
    │   ├── Step1 itemCF_based_on_Apriori
    │       ├── 1_generate_user_logs.ipynb
    │       ├── 2_generate_hot_table.ipynb
    │       ├── 3_generate_original_matrix.ipynb
    │       ├── 4_Merge.ipynb
    │       ├── 5_Save_sparse_to_dense.ipynb
    │       ├── 6_Sta_for_SparseMatrix.ipynb
    │       └── 7_generate_recall.ipynb
    │   ├── Step2 Generate_feature_for_Ranking
    │       ├── 1_generate_static_features.ipynb
    │       ├── 2_generate_dynamic_feature.ipynb
    │       └── 3_generate_time_feature.ipynb
    │   └── Step3 Ranking
    │       ├── 1_build_model.ipynb
    │       └── 2_recommendation.ipynb
├── 初赛方案简介.pdf
├── 复赛方案简介.pdf
└── 答辩ppt.pptx


/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2019 Chuanyu Xue
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/Qualification/Qualification.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import pandas as pd
  3 | import datetime
  4 | from multiprocessing import Pool as ProcessPool
  5 | 
  6 | import warnings 
  7 | warnings.filterwarnings('ignore')
  8 | 
  9 | 
 10 | df_user_testA = pd.read_csv('../data/ECommAI_EUIR_round1_testA_20190701/user.csv',header=None)
 11 | df_user_testA.columns = ['userID','gender','age','purchaseLevel']
 12 | df_item_testA = pd.read_csv('../data/ECommAI_EUIR_round1_testA_20190701/item.csv',header=None)
 13 | df_item_testA.columns = ['itemID','categoryID','shopID','brandID']
 14 | df_log_testA = pd.read_csv('../data/ECommAI_EUIR_round1_testA_20190701/user_behavior.csv',header=None)
 15 | df_log_testA.columns = ['userID','itemID','behavior','timestap']
 16 | 
 17 | df_user_testB = pd.read_csv('../data/ECommAI_EUIR_round1_testB_20190809/user.csv',header=None)
 18 | df_user_testB.columns = ['userID','gender','age','purchaseLevel']
 19 | df_item_testB = pd.read_csv('../data/ECommAI_EUIR_round1_testB_20190809/item.csv',header=None)
 20 | df_item_testB.columns = ['itemID','categoryID','shopID','brandID']
 21 | df_log_testB = pd.read_csv('../data/ECommAI_EUIR_round1_testB_20190809/user_behavior.csv',header=None)
 22 | df_log_testB.columns = ['userID','itemID','behavior','timestap']
 23 | 
 24 | 
 25 | 
 26 | 
 27 | df_log_testA['date'] = df_log_testA['timestap'].apply(lambda x : datetime.datetime(2019,7,5) + datetime.timedelta(seconds=x))
 28 | df_log_testA['day'] = df_log_testA['date'].dt.day
 29 | df_log_testA['weekday'] = df_log_testA['date'].dt.weekday + 1
 30 | 
 31 | df_log_testB['date'] = df_log_testB['timestap'].apply(lambda x : datetime.datetime(2019,7,5) + datetime.timedelta(seconds=x))
 32 | df_log_testB['day'] = df_log_testB['date'].dt.day
 33 | df_log_testB['weekday'] = df_log_testB['date'].dt.weekday + 1
 34 | 
 35 | 
 36 | 
 37 | df_log = pd.concat([df_log_testA,df_log_testB])
 38 | df_log.reset_index(inplace=True,drop=True)
 39 | 
 40 | 
 41 | df_log_selected = df_log.copy()
 42 | 
 43 | 
 44 | df_log_selected.loc[df_log_selected['behavior']=='pv','behavior'] = 1
 45 | df_log_selected.loc[df_log_selected['behavior']=='fav','behavior'] = 2
 46 | df_log_selected.loc[df_log_selected['behavior']=='cart','behavior'] = 3
 47 | df_log_selected.loc[df_log_selected['behavior']=='buy','behavior'] = 4
 48 | 
 49 | df_log_selected['hour'] = df_log_selected['date'].dt.hour
 50 | df_log_selected['day_hour'] = df_log_selected['day'] + df_log_selected['hour']/float(24)
 51 | df_log_selected['behavior'] = (1 - (21-df_log_selected['day_hour']+1)/(21-5+1)) * df_log_selected['behavior']
 52 | 
 53 | 
 54 | 
 55 | 
 56 | df_user = pd.concat([df_user_testA,df_user_testB])
 57 | df_user.reset_index(inplace=True,drop=True)
 58 | 
 59 | df_log_selected = pd.merge(df_log_selected,df_user)
 60 | df_log_selected_male = df_log_selected[ df_log_selected['gender']==0 ]
 61 | df_log_selected_female = df_log_selected[ df_log_selected['gender']==1 ]
 62 | 
 63 | 
 64 | 
 65 | item_statistc_male_w = df_log_selected_male.groupby(['itemID'])[['behavior']].sum()
 66 | item_statistc_male_w.reset_index(inplace=True)
 67 | item_statistc_male_w.columns = ['itemID','itemCount_male_w']
 68 | 
 69 | 
 70 | 
 71 | item_statistc_female_w = df_log_selected_female.groupby(['itemID'])[['behavior']].sum()
 72 | item_statistc_female_w.reset_index(inplace=True)
 73 | item_statistc_female_w.columns = ['itemID','itemCount_female_w']
 74 | 
 75 | 
 76 | 
 77 | df_item = pd.concat([df_item_testA,df_item_testB])
 78 | df_item.drop_duplicates(inplace=True)
 79 | df_item.reset_index(drop=True,inplace=True)
 80 | 
 81 | 
 82 | df_item = pd.merge(df_item,item_statistc_female_w,how='left')
 83 | df_item = pd.merge(df_item,item_statistc_male_w,how='left')
 84 | 
 85 | df_item.loc[np.isnan(df_item['itemCount_male_w']),'itemCount_male_w' ] = 0
 86 | df_item.loc[np.isnan(df_item['itemCount_female_w']),'itemCount_female_w' ] = 0
 87 | 
 88 | 
 89 | 
 90 | df_item_male_sorted = df_item.sort_values(by=['itemCount_male_w'],ascending=False)
 91 | df_item_male_sorted.reset_index(drop=True,inplace=True)
 92 | item_male_w_Top500 = list( df_item_male_sorted.loc[:499,'itemID'] )
 93 | 
 94 | 
 95 | 
 96 | df_item_female_sorted = df_item.sort_values(by=['itemCount_female_w'],ascending=False)
 97 | df_item_female_sorted.reset_index(drop=True,inplace=True)
 98 | item_female_w_Top500 = list( df_item_female_sorted.loc[:499,'itemID'] )
 99 | 
100 | 
101 | 
102 | item_to_category_dict = dict()
103 | for row in df_item.values:
104 |     item_to_category_dict[row[0]] = row[1]
105 | 
106 | 
107 | historicalDict = {}
108 | for each_user in df_user_testB['userID']:
109 |     historicalDict[each_user]= set(df_log_selected.loc[df_log_selected['userID']==each_user,'itemID'])
110 | 
111 | 
112 | his_cat_Dict = {}
113 | 
114 | for each_user in df_user_testB['userID']:
115 |     cat_list = []
116 |     for item_t in historicalDict[each_user]:
117 |         cat_list.append(item_to_category_dict[item_t])
118 | 	
119 |     his_cat_Dict[each_user] = set(cat_list)
120 | 
121 | 
122 | 
123 | 
124 | 
125 | 
126 | 
127 | def process(each_user):
128 | 
129 |     gender_tmp = int(df_user_testB.loc[df_user_testB['userID']==each_user,'gender'])
130 |     
131 |     df_tmp = df_log_selected[df_log_selected['userID']==each_user]
132 |     df_tmp.reset_index(inplace=True,drop=True)
133 |     
134 |     cat_his_tmp = his_cat_Dict[each_user]
135 |     
136 |     itemListTmp = []
137 |     
138 |     if len(df_tmp) > 0:
139 |         
140 |         item_sta = df_tmp.groupby(['itemID'])['behavior'].sum()
141 |         item_sta = item_sta.reset_index()
142 |         item_sta_sorted = item_sta.sort_values(by=['behavior'],ascending=False)
143 |         item_sta_sorted.reset_index(inplace=True,drop=True)
144 |         
145 |         itemListTmp = itemListTmp + list(item_sta_sorted.loc[:27,'itemID'])
146 |         
147 | 
148 |         
149 |         if len(itemListTmp) < 50:
150 |                
151 |             if gender_tmp == 0:
152 | 
153 |                     for item_candidate in item_male_w_Top500:
154 |                         if item_candidate not in itemListTmp and item_to_category_dict[item_candidate] not in cat_his_tmp:
155 |                             itemListTmp.append(item_candidate)
156 |                         
157 |                         if len(itemListTmp) == 50:
158 |                             break
159 |             else:
160 | 
161 |                     for item_candidate in item_female_w_Top500:
162 |                         if item_candidate not in itemListTmp and item_to_category_dict[item_candidate] not in cat_his_tmp:
163 |                             itemListTmp.append(item_candidate)
164 |                         
165 |                         if len(itemListTmp) == 50:
166 |                             break            
167 |         
168 |     else:
169 |         
170 |         if gender_tmp == 0:
171 |             
172 |                 for item_candidate in item_male_w_Top500:
173 |                     if item_candidate not in itemListTmp and item_to_category_dict[item_candidate] not in cat_his_tmp:
174 |                         itemListTmp.append(item_candidate)
175 |                     
176 |                     if len(itemListTmp) == 50:
177 |                         break
178 | 
179 |         else:
180 |                 for item_candidate in item_female_w_Top500:
181 |                     if item_candidate not in itemListTmp and item_to_category_dict[item_candidate] not in cat_his_tmp:
182 |                         itemListTmp.append(item_candidate)
183 |                     
184 |                     if len(itemListTmp) == 50:
185 |                         break
186 | 	
187 |     
188 |     return (each_user,set(itemListTmp))
189 | 
190 | pool = ProcessPool(8)
191 | res = pool.map(process, df_user_testB['userID'])
192 | pool.close()
193 | pool.join()
194 | 
195 | 
196 | file = open('../prediction_result/result.csv','w')
197 | 
198 | 
199 | for element in res:
200 |     strTmp = str(element[0]) + ',' + ','.join(map(lambda x:str(x), list(element[1]) ))
201 |     file.write(strTmp+'\n')
202 | 
203 | file.close()
204 | 
205 | 
206 | 
207 | 


--------------------------------------------------------------------------------
/README.txt:
--------------------------------------------------------------------------------
1 | This repository contains the 1-st solution for the CIKM 2019 EComm AI
2 | - Efficient and Novel Item Retrieval for Large-scale Online Shopping Recommendation Challenge.
3 | 
4 | Email: skewcy@gmail.com
5 | 


--------------------------------------------------------------------------------
/README_CN.md:
--------------------------------------------------------------------------------
 1 | # CIKM-2019-AnalytiCup
 2 | 2019-CIKM挑战赛，超大规模推荐之用户兴趣高效检索赛道 冠军解决方案
 3 | 
 4 | This repository contains the champion solution on CIKM 2019 EComm AI - Efficient and Novel Item Retrieval for Large-scale Online Shopping Recommendation Challenge.
 5 | 
 6 | ## 解决方案blog
 7 | 
 8 | 知乎文章：https://zhuanlan.zhihu.com/p/91506866
 9 | 
10 | ## 文件结构
11 | 
12 |     │  LICENSE
13 |     │  project_structure.txt
14 |     │  README.md
15 |     │  初赛方案简介.pdf
16 |     │  复赛方案简介.pdf
17 |     │  答辩ppt.pptx
18 |     │  
19 |     ├─Qualification                                        # 初赛解决方案
20 |     │      Qualification.py
21 |     │      
22 |     └─Semi-Finals                                          # 复赛解决方案
23 |         ├─online_recommendation                            # 生成线上结果
24 |         │      dockerfile
25 |         │      downward_map.zip
26 |         │      lgb_0924_1652
27 |         │      model0924_base.file
28 |         │      read_me.txt
29 |         │      run.sh
30 |         │      test.py
31 |         │      upward_map.zip
32 |         │      
33 |         └─underline_trainning                              # 生成线下验证结果以及特征
34 |             │  Readme.pdf
35 |             │  
36 |             ├─Step1 itemCF_based_on_Apriori                # 基于Apriori关联规则法生成商品关联矩阵
37 |             │      1_generate_user_logs.ipynb
38 |             │      2_generate_hot_table.ipynb
39 |             │      3_generate_original_matrix.ipynb      # 快速相似度矩阵运算方法
40 |             │      4_Merge.ipynb
41 |             │      5_Save_sparse_to_dense.ipynb
42 |             │      6_Sta_for_SparseMatrix.ipynb            # 将稀疏的关联矩阵转化为Hash结构以加快检索效率
43 |             │      7_generate_recall.ipynb                 # 基于关联矩阵为每个用户生成candidate列表
44 |             │      
45 |             ├─Step2 Generate_feature_for_Ranking           # 为candidate列表生成特征
46 |             │      1_generate_static_features.ipynb
47 |             │      2_generate_dynamic_feature.ipynb
48 |             │      3_generate_time_feature.ipynb
49 |             │      
50 |             └─Step3 Ranking                                # 基于candidate列表与特征做出推荐
51 |                     1_build_model.ipynb
52 |                     2_recommendation.ipynb
53 |                     
54 |     注意！有些文件较大未上传到github，除数据集外，所有缺失文件均可在代码中生成。
55 | 
56 | 
57 | ## Q&A
58 | 
59 | > **Q:** 我是香港中文大学（深圳）数据科学专业的学生，想报名参加这个比赛作为毕业项目的，但是超过了比赛的报名时间，所以想问一下你能不能发我一下比赛的原始数据呢？          
60 | > **A:** 数据集较大无法存储到Github，请在网盘链接: https://pan.baidu.com/s/1Mnp4R27qXt_b367G4EcVaA 提取码: 5ecq下载
61 | 
62 | > **Q:** 看你们的复赛方法介绍，讲到了也试过word2vec学习embedding，然后用faiss来做召回。请问这样的方法的效果和你们最后用的item CF的方法，比较起来如何呢？         
63 | > **A:** 很抱歉，由于时间关系我们没有进行对比，因为itemCF已经取得了比较好的效果。我们尝试过embedding+faiss的方案是可行的。为了后期快速搭建线上pipeline我们选择了更为简单的itemCF。但在2020KDDCUP中，我们分别尝试了ItemCF与word2vec+faiss方案，ItemCF取得了更好的召回效果，但embedding可以作为很好的特征。请查阅我库中2020KDDCUP项目
64 | 
65 | > **Q:** 按照方案中描述：使用了用户活跃度的置信度计算 Item CF，这里的sim(i,j)!=sim(j,i)，但是代码中这样看应该是相等的？另外，改进的相似度方法中，公式和代码对不上。是需要进一步推导嘛？       
66 | > **A:** 谢谢你的邮件！此步类似于统计共现次数，统计的mat\[a, b]并不是最终的相似度，在后续得到a到b的相似度时使用的是mat\[a, b] / f(a)的计算方法。请您再看一下代码和相应的公式。如果还有更多的疑问可以再联系我。
67 | 
68 | > **Q:** 在对行为做临近时间加权的时候，好像这样更好， data['behavior'] = data['behavior'] / (max_day-data['day'])，这个不知道你们有没有调整过。        
69 | > **A:** 我们的时间权重设置是按照11年kdd cup第一名的方式设置的，同时也是SVDFeature中temporal SVD的设置方式。我们没有进行你们的尝试，如果data['behavior'] = data['behavior'] / (max_day-data['day'])效果更好，可能说明在不同数据集上要多尝试几种不同的设置方法，然后选择最好的方法。
70 | 
71 | > **Q:** 最后一个问题是关于线下的novel recall@50的计算，我们用复赛的训练数据除去最后一天也就是第15天的数据做训练，然后用初赛round b的测试数据的第15天做验证，得到的novel recall @50大概是0.039，这个和你们文档中说的只用召回代码就可以得到0.053的结果有点差距。请问你们还又做了些什么处理呢？如果不做排序的话。当然也有可能是我们线下计算的指标和比赛的时候线上算的指标不太一致，不知道你们有没有碰到过类似的问题。      
72 | > **A:** 线上效果0.053对应在testB上的验证效果是0.0385，可能是由于线上线下数据集的大小不同或者分布不同导致，但可以保证的是通过testB验证的结果与线上结果是同步增长。在开源代码中的评价方式与线上的指标是相同的。
73 | 
74 | 
75 | 
76 | 
77 | 
78 | 
79 | 
80 | 
81 | 
82 | ## 声明
83 | 本项目库专门存放CIKM2019挑战赛的相关代码文件，所有代码仅供各位同学学习参考使用。如有任何对代码的问题请邮箱联系：cs_xcy@126.com
84 | 
85 | If you have any issue please feel free to contact me at cs_xcy@126.com
86 | 
87 | 天池ID：BruceQD & 人畜无害小白兔 & **小雨姑娘**
88 | 


--------------------------------------------------------------------------------
/Semi-Finals/online_recommendation/dockerfile:
--------------------------------------------------------------------------------
 1 | FROM registry.cn-qingdao.aliyuncs.com/cikm_bruce/bruceqd:15.0
 2 | 
 3 | WORKDIR /competition
 4 | 
 5 | COPY test.py .
 6 | COPY run.sh .
 7 | COPY upward_map.txt .
 8 | COPY downward_map.txt .
 9 | COPY item_Apriori.txt .
10 | COPY brand_count.csv .
11 | COPY category_count.csv .
12 | COPY itemID_count.csv .
13 | COPY shop_count.csv .
14 | COPY brand_sum.csv .
15 | COPY category_sum.csv .
16 | COPY itemID_sum.csv .
17 | COPY shop_sum.csv .
18 | COPY itemID_higher.csv .
19 | COPY category_higher.csv .
20 | COPY category_lower.csv .
21 | COPY item_rank.csv .
22 | COPY item_to_age_count_online.csv .
23 | COPY item_to_sex_count_online.csv .
24 | COPY item_to_ability_count_online.csv .
25 | COPY itemID_last_time_online.csv .
26 | COPY shop_last_time_online.csv .
27 | COPY category_last_time_online.csv .
28 | COPY brand_last_time_online.csv .
29 | COPY model0924_base.file .
30 | COPY lgb_0924_1652 .
31 | 
32 | RUN chmod a+x  ./run.sh
33 | 
34 | CMD ["sh","run.sh"]
35 | 


--------------------------------------------------------------------------------
/Semi-Finals/online_recommendation/downward_map.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChuanyuXue/CIKM-2019-AnalytiCup/921f88589fac643c700635e8bb32542b240ceac1/Semi-Finals/online_recommendation/downward_map.zip


--------------------------------------------------------------------------------
/Semi-Finals/online_recommendation/model0924_base.file:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChuanyuXue/CIKM-2019-AnalytiCup/921f88589fac643c700635e8bb32542b240ceac1/Semi-Finals/online_recommendation/model0924_base.file


--------------------------------------------------------------------------------
/Semi-Finals/online_recommendation/read_me.txt:
--------------------------------------------------------------------------------
1 | underline: 0.0764
2 | full_list: 0.0939
3 | online: 0.0567


--------------------------------------------------------------------------------
/Semi-Finals/online_recommendation/run.sh:
--------------------------------------------------------------------------------
1 | python ./test.py
2 | 


--------------------------------------------------------------------------------
/Semi-Finals/online_recommendation/test.py:
--------------------------------------------------------------------------------
  1 | import pandas as pd
  2 | import numpy as np
  3 | import time
  4 | from catboost import CatBoostClassifier
  5 | import lightgbm as lgb
  6 | 
  7 | start_time = time.time()
  8 | # Note! 别忘了改线上路径
  9 | path = '/tcdata/'
 10 | #path = './testA/'
 11 | 
 12 | object_path = '/competition/'
 13 | #object_path = './'
 14 | 
 15 | targetday = 16
 16 | 
 17 | static_features_path = ''
 18 | 
 19 | static_features_files = [
 20 | 'brand_count.csv',
 21 | 'brand_sum.csv',
 22 | 'category_count.csv',
 23 | 'category_sum.csv',
 24 | 'itemID_count.csv',
 25 | 'itemID_sum.csv',
 26 | 'shop_count.csv',
 27 | 'shop_sum.csv',
 28 | 'category_lower.csv',
 29 | 'item_rank.csv',
 30 | 'category_higher.csv',
 31 | 'itemID_higher.csv',
 32 | ]
 33 | 
 34 | model_name = 'model0924_base.file'
 35 | 
 36 | time_features_files = [
 37 | 'itemID_last_time_online.csv',
 38 | 'brand_last_time_online.csv',
 39 | 'shop_last_time_online.csv'
 40 | ]
 41 | 
 42 | double_features_files = [
 43 | 'item_to_ability_count_online.csv',
 44 | 'item_to_sex_count_online.csv',
 45 | 'item_to_age_count_online.csv',
 46 | ]
 47 | 
 48 | 
 49 | tempory_flie_path = ''
 50 | 
 51 | 
 52 | def reduce_mem_usage(df):
 53 |     """ iterate through all the columns of a dataframe and modify the data type
 54 |         to reduce memory usage.        
 55 |     """ 
 56 |     
 57 |     for col in df.columns:
 58 |         col_type = df[col].dtype
 59 |         
 60 |         if col_type != object:
 61 |             c_min = df[col].min()
 62 |             c_max = df[col].max()
 63 |             if str(col_type)[:3] == 'int':
 64 |                 if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
 65 |                     df[col] = df[col].astype(np.int8)
 66 |                 elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
 67 |                     df[col] = df[col].astype(np.int16)
 68 |                 elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
 69 |                     df[col] = df[col].astype(np.int32)
 70 |                 elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
 71 |                     df[col] = df[col].astype(np.int64)  
 72 |             else:
 73 |                 if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
 74 |                     df[col] = df[col].astype(np.float16)
 75 |                 elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
 76 |                     df[col] = df[col].astype(np.float32)
 77 |                 else:
 78 |                     df[col] = df[col].astype(np.float64)
 79 |         else:
 80 |             df[col] = df[col].astype('category')
 81 | 
 82 |     
 83 |     return df
 84 | 
 85 | def load_uandi(path):
 86 |     user = reduce_mem_usage(pd.read_csv(path + 'user.csv',header=None, engine='c'))
 87 |     item = reduce_mem_usage(pd.read_csv(path + 'item.csv',header=None, engine='c'))
 88 |     item.columns = ['itemID','category','shop','brand']
 89 |     user.columns = ['userID','sex','age','ability']
 90 | 
 91 |     return user, item
 92 | 
 93 | 
 94 | 
 95 | def load_data(path):
 96 |     '''
 97 |     input: the directory of original dataset
 98 |     output: user, item, data(tapped with item&user attributes and behavior features)
 99 |     '''
100 | 
101 |     user = reduce_mem_usage(pd.read_csv(path + 'user.csv',header=None, engine='c'))
102 |     item = reduce_mem_usage(pd.read_csv(path + 'item.csv',header=None, engine='c'))
103 |     data = pd.read_csv(path + 'user_behavior.csv',header=None, engine='c')
104 | 
105 |     data.columns = ['userID','itemID','behavior','timestamp']
106 |     data['day'] = data['timestamp'] // 86400
107 |     data['hour'] = data['timestamp'] // 3600 % 24
108 |     
109 |     ## 生成behavior的onehot
110 |     for i in ['pv','fav','cart','buy']:
111 |         data[i] = 0
112 |         data.loc[data['behavior'] == i, i] = 1
113 | 
114 |     ## 生成behavior的加权
115 |     
116 |     data['day_hour'] = data['day'] + data['hour'] / float(24)
117 |     data.loc[data['behavior']=='pv','behavior'] = 1
118 |     data.loc[data['behavior']=='fav','behavior'] = 2
119 |     data.loc[data['behavior']=='cart','behavior'] = 3
120 |     data.loc[data['behavior']=='buy','behavior'] = 1
121 |     max_day = max(data['day'])
122 |     min_day = min(data['day'])
123 |     data['behavior'] = (1 - (max_day-data['day_hour']+2)/(max_day-min_day+2)) * data['behavior'] 
124 | 
125 |     item.columns = ['itemID','category','shop','brand']
126 |     user.columns = ['userID','sex','age','ability']
127 |     
128 |     data = reduce_mem_usage(data)
129 | 
130 |     data = pd.merge(left=data, right=item, on='itemID',how='left', sort=False)
131 |     data = pd.merge(left=data, right=user, on='userID',how='left', sort=False)
132 | 
133 |     return user, item, data
134 | 
135 | 
136 | def get_unique_inorder(x, k=50):
137 |     '''
138 |     input: Iterable x, Int k
139 |     return: Iterable x(keep first 50 unique elements)
140 |     '''
141 | 
142 |     result = []
143 |     flag = set()
144 |     for i in x:
145 |         if i[0] not in flag:
146 |             result.append(i)
147 |             flag.add(i[0])
148 |         if len(flag) > k:
149 |             break
150 |     return result
151 | 
152 | 
153 | def get_recall_list(train, targetDay, tempory_flie_path = './', k=300):
154 |     '''
155 |     input: DataFrame train(data used for extracting recall, )
156 |     '''
157 | 
158 |     train_logs = dict()
159 |     f = open(tempory_flie_path + 'upward_map.txt','r')
160 |     upward_map = f.read()
161 |     upward_map = eval(upward_map)
162 |     f.close()
163 |     
164 |     f = open(tempory_flie_path + 'downward_map.txt','r')
165 |     downward_map = f.read()
166 |     downward_map = eval(downward_map)
167 |     f.close()
168 |     
169 | 
170 |     f = open(tempory_flie_path + 'item_Apriori.txt','r')
171 |     tmp = f.read()
172 |     item_dict = eval(tmp)
173 |     f.close()
174 |     
175 |     if targetDay > max(train['day']):
176 |         for row in train[['userID','itemID','behavior']].values:
177 |             train_logs.setdefault(row[0], dict())
178 |             if row[1] in upward_map:
179 |                 train_logs[row[0]].setdefault(upward_map[row[1]],0)
180 |                 train_logs[row[0]][upward_map[row[1]]] = max(train_logs[row[0]][upward_map[row[1]]],row[2])
181 |     else:
182 |         user_List_test = set(train.loc[train['day']==targetDay,'userID'])
183 |         train = train[train['day'] < targetDay]
184 |         
185 |         for row in train[['userID','itemID','behavior']].values:
186 |             if row[0] in user_List_test:
187 |                 train_logs.setdefault(row[0], dict())
188 |                 if row[1] in upward_map:
189 |                     train_logs[row[0]].setdefault(upward_map[row[1]],0)
190 |                     train_logs[row[0]][upward_map[row[1]]] = max(train_logs[row[0]][upward_map[row[1]]],row[2])
191 | 
192 |     for each_user in train_logs:
193 |         sum_value = sum(train_logs[each_user].values())
194 |         if sum_value > 0:
195 |             for each_item in train_logs[each_user]:
196 |                 train_logs[each_user][each_item] /= sum_value            
197 | 
198 |     result_logs = dict()    
199 |     for u in train_logs:
200 |         result_logs.setdefault(u, list())
201 |         for i in set(train_logs[u].keys()):
202 |             if i in item_dict:
203 |                 tmp_list = [ (x[0], train_logs[u][i]*x[1]) for x in item_dict[i]]
204 |                 result_logs[u] += tmp_list
205 |             
206 |     for u in result_logs:
207 |         result_logs[u] = get_unique_inorder([(downward_map[x[0]], x[1]) for x in sorted(result_logs[u], key=lambda x:x[1], reverse=True)
208 |                           if x[0] not in train_logs[u]], k=k)  
209 |     
210 |     return result_logs
211 | 
212 | 
213 | def generate_pairs(recall):
214 |     result = []
215 |     for u in recall:
216 |         for i in recall[u]:
217 |             result.append([u,i[0],i[1]])
218 |     return result
219 | 
220 | def reshape_recall_to_dataframe(recall):
221 |     result = generate_pairs(recall)
222 |     result = pd.DataFrame(result)
223 |     result.columns = ['userID','itemID','apriori']
224 |     return result
225 | 
226 | def recall(dict1, dict2, train_dict):
227 |     '''
228 |     dict1 是真值 dict2 是预测值.
229 |     '''
230 |     
231 |     result = 0
232 |     count = 0
233 |     for i in dict1:
234 |         if i in dict2 and i in train_dict:
235 |             new_item = set()
236 |     
237 |             for k in dict1[i]:
238 |                 if k not in train_dict[i]:
239 |                     new_item.add(k)
240 |             if new_item:
241 |                 result += len(new_item & set(dict2[i])) / len(new_item)
242 |                 count += 1
243 |             
244 |     if count == 0:
245 |         return 0
246 |     else:
247 |         return result / count
248 | 
249 | def generate_online_features(data):
250 |     online_features = []
251 |     for count_feature in ['category','shop','brand']:
252 |         online_features.append(data[['behavior','userID',count_feature]].groupby(['userID', count_feature], as_index=False).agg(
253 |             {'behavior': 'count'}).rename(columns={'behavior':'user_to_'
254 |                                                    + count_feature + '_count'}))
255 |     for count_feature in ['category','shop','brand']:
256 |         online_features.append(data[['behavior','userID',count_feature]].groupby(['userID', count_feature], as_index=False).agg(
257 |             {'behavior': 'sum'}).rename(columns={'behavior':'user_to_' 
258 |                                                  + count_feature + '_sum'}))
259 |     for count_feature in ['category','shop','brand']:
260 |         for behavior_type in ['pv','buy']:
261 |             online_features.append(data[[behavior_type,'userID',count_feature]].groupby(['userID', count_feature], as_index=False).agg(
262 |                 {behavior_type: 'sum'}).rename(columns={behavior_type:'user_to_'
263 |                                                        + count_feature + '_count_' + behavior_type}))
264 | 
265 |     return online_features
266 | 
267 | def generate_yestday_features(data, targetday):
268 |     yestday_features = []
269 |     yestday = data[data['day'] == targetday - 1]
270 |     
271 |     for count_feature in ['category','shop','brand']:
272 |         yestday_features.append(yestday[['behavior','userID',count_feature]].groupby(['userID', count_feature], as_index=False).agg(
273 |             {'behavior': 'count'}).rename(columns={'behavior':'user_to_'
274 |                                                    + count_feature + '_count_yestday'}))
275 | 
276 |     for count_feature in ['category','shop','brand']:
277 |         for behavior_type in ['pv','buy']:
278 |             yestday_features.append(yestday[[behavior_type,'userID',count_feature]].groupby(['userID', count_feature], as_index=False).agg(
279 |                 {behavior_type: 'sum'}).rename(columns={behavior_type:'user_to_'
280 |                                                        + count_feature + '_count_'+behavior_type+'_yestday'}))
281 |     return yestday_features
282 | 
283 | def generate_5days_features(data, targetday):
284 |     a5days = data[(data['day'] > targetday - 1 - 5) & (data['day'] < targetday - 1)]
285 |     five_days_features = []
286 |     
287 |     for count_feature in ['category','shop','brand']:
288 |         five_days_features.append(a5days[['behavior','userID',count_feature]].groupby(['userID', count_feature], as_index=False).agg(
289 |             {'behavior': 'count'}).rename(columns={'behavior':'user_to_'
290 |                                                    + count_feature + '_count_5days'}))
291 | 
292 |     for count_feature in ['category','shop','brand']:
293 |         for behavior_type in ['pv','fav','cart','buy']:
294 |             five_days_features.append(a5days[[behavior_type,'userID',count_feature]].groupby(['userID', count_feature], as_index=False).agg(
295 |                 {behavior_type: 'sum'}).rename(columns={behavior_type:'user_to_'
296 |                                                        + count_feature + '_count_' + behavior_type+'_5days'}))
297 |     return five_days_features
298 |         
299 | def generate_lasttime_features(data, targetday):
300 |     dynamic_time_features = []
301 |     test = data[data['day'] < targetday]
302 |     start_timestamp  = max(test['timestamp'])
303 |     test['lasttime'] = start_timestamp - test['timestamp']
304 |     
305 |     for dynamic_time_feature in ['shop', 'category','brand']:
306 |         dynamic_time_features.append(test[['lasttime','userID',dynamic_time_feature,'day']].groupby(['userID',dynamic_time_feature], as_index=False).agg({'lasttime': 'min', 'day':'max'}).rename(columns={'lasttime': 'user_to_'
307 |                                                        + dynamic_time_feature + '_lasttime', 'day':'user_to_'+ dynamic_time_feature + '_lastday'}))
308 |     return dynamic_time_features
309 | 
310 | 
311 | 
312 | #--------------------------------------------------------------------------
313 | # 流程开始
314 | 
315 | user, item, data = load_data(path)
316 | user['age'] = user['age'] // 10
317 | data['age'] = data['age'] // 10
318 | 
319 | test_recall_logs = get_recall_list(data, targetDay=targetday, tempory_flie_path=tempory_flie_path, k=325)
320 | 
321 | test_recall = reshape_recall_to_dataframe(test_recall_logs)
322 | test_recall = pd.merge(left=test_recall, right=user, on='userID',how='left', sort=False)
323 | test_recall = pd.merge(left=test_recall, right=item, on='itemID',how='left', sort=False)
324 | 
325 | recall_time = time.time()
326 | print(str((recall_time - start_time) // 60) + ' is cost in recall')
327 | 
328 | # Concat time features
329 | time_features = []
330 | for f in time_features_files:
331 |     time_features.append(reduce_mem_usage(pd.read_csv(f, engine='c')))
332 | 
333 | for f in time_features:
334 |     test_recall = pd.merge(left=test_recall, right=f, on=f.columns[0], how='left', sort=False)
335 | time_features = []
336 | 
337 | # Concat static features
338 | static_features = []
339 | for f in static_features_files:
340 |     static_features.append(reduce_mem_usage(pd.read_csv(static_features_path + f, engine='c')))
341 | 
342 | for f in static_features:
343 |     test_recall = pd.merge(left=test_recall, right=f, on=f.columns[0], how='left', sort=False)
344 | static_features = []
345 | 
346 | # Concat double features
347 | 
348 | double_features = []
349 | for f in double_features_files:
350 |     double_features.append(reduce_mem_usage(pd.read_csv(static_features_path + f, engine='c')))
351 | 
352 | for f in double_features:
353 |     test_recall = pd.merge(left=test_recall, right=f, on=list(f.columns[0: 2]), how='left', sort=False)
354 | double_features = []
355 | 
356 | 
357 | # Concat dynamic features
358 | dynamic_features = generate_online_features(data)
359 | 
360 | for f in dynamic_features:
361 |     test_recall = pd.merge(left=test_recall, right=f, on=list(f.columns[0: 2]), how='left', sort=False)
362 | dynamic_features = []
363 | 
364 | yestday_features = generate_yestday_features(data, targetday = targetday)
365 | for f in yestday_features:
366 |     test_recall = pd.merge(left=test_recall, right=f, on=list(f.columns[0: 2]), how='left', sort=False)
367 | yestday_features = []
368 | 
369 | fiveday_features = generate_5days_features(data, targetday = targetday)
370 | for f in fiveday_features:
371 |     test_recall = pd.merge(left=test_recall, right=f, on=list(f.columns[0: 2]), how='left', sort=False)
372 | fiveday_features = []
373 | 
374 | last_time_features = generate_lasttime_features(data, targetday = targetday)
375 | for f in last_time_features:
376 |     test_recall = pd.merge(left=test_recall, right=f, on=list(f.columns[0: 2]), how='left', sort=False)
377 | last_time_features = []
378 | 
379 | feature_time = time.time()
380 | print(str((feature_time - recall_time) // 60) + ' is cost in feature')
381 | 
382 | model = CatBoostClassifier()
383 | 
384 | model = model.load_model(model_name)
385 | 
386 | features = [x for x in test_recall.columns if x not in ['itemID','userID','category','shop','brand','label']]
387 | 
388 | test_recall['label'] = model.predict_proba(test_recall[features])[:,1]
389 | 
390 | features = ['apriori',
391 |  'user_to_category_count_pv_5days',
392 |  'user_to_category_count_buy',
393 |  'itemID_sum',
394 |  'itemID_count',
395 |  'user_to_category_count_pv',
396 |  'user_to_category_count_5days',
397 |  'itemIDlast_time',
398 |  'user_to_category_count_pv_yestday',
399 |  'shop_count',
400 |  'category_count',
401 |  'user_to_category_count_yestday',
402 |  'user_to_category_sum',
403 |  'user_to_brand_count_pv',
404 |  'user_to_shop_sum',
405 |  'user_to_shop_count_pv',
406 |  'age',
407 |  'shop_sum',
408 |  'category_sum',
409 |  'brand_count',
410 |  'user_to_shop_count',
411 |  'user_to_brand_count_pv_5days',
412 |  'rank',
413 |  'rank_percent',
414 |  'ability',
415 |  'user_to_age_count',
416 |  'user_to_sex_count',
417 |  'user_to_shop_count_buy',
418 |  'user_to_shop_count_pv_5days',
419 |  'user_to_shop_count_pv_yestday',
420 |  'user_to_shop_lasttime',
421 |  'user_to_category_count',
422 |  'user_to_category_lasttime',
423 |  'category_median',
424 |  'category_skew',
425 |  'category_std'
426 | ]
427 | 
428 | model_lgb = lgb.Booster(model_file='lgb_0924_1652')
429 | test_recall['label_lgb'] = model_lgb.predict(test_recall[features])
430 | 
431 | 
432 | #0.045965784783714
433 | test_recall['ensemble'] = 10 / ( 5/test_recall['label_lgb'] + 5/test_recall['label'])
434 | 
435 | #0.045943749548558184
436 | test_recall['ensemble_power'] = np.power( test_recall['label_lgb']**4.8 * test_recall['label']**5.2 , 1/10)
437 | 
438 | #0.045996441844155474
439 | test_recall['ensemble_final'] = test_recall['ensemble']*0.5 + test_recall['ensemble_power'] * 0.5
440 | 
441 | model_time = time.time()
442 | print(str((model_time - feature_time) // 60) + ' is cost in model')
443 | 
444 | train_logs = dict()
445 | train_ = data
446 | for row in train_[['userID','itemID']].values:
447 |     train_logs.setdefault(row[0], [])
448 |     train_logs[row[0]].append(row[1])
449 | 
450 | result_logs = dict()
451 | test_recall = test_recall.sort_values('ensemble_final', ascending=False).reset_index(drop=True)
452 | for row in test_recall[['userID','itemID']].values:
453 |     result_logs.setdefault(row[0], [])
454 |     if len(result_logs[row[0]]) < 50:
455 |         result_logs[row[0]].append(row[1])
456 | 
457 | temp = data.groupby(['itemID'], as_index=False).count()[['itemID','userID']]
458 | hot_items = list(temp.sort_values('userID', ascending=False).reset_index(drop=True)['itemID'][:100])
459 | 
460 | rec_dict = dict()
461 | for u in set(data['userID']):
462 |     if u in result_logs:
463 |         lenth = len(result_logs[u])
464 |         if lenth < 50:
465 |             rec_dict[u] = result_logs[u] + [x for x in hot_items if x not in result_logs[u] and x not in train_logs[u]][:50 - lenth]
466 |         else:
467 |             rec_dict[u] = result_logs[u]
468 |     else:
469 |         rec_dict[u] = [x for x in hot_items][:50]
470 | 
471 | 
472 | # Note! 别忘了改线上路径
473 | file = open(object_path + 'result.csv','w')
474 | for element in rec_dict:
475 |     strTmp = str(int(element)) + ',' + ','.join(map(lambda x:str(int(x)), list(rec_dict[element] ) ))
476 |     file.write(strTmp+'\n')
477 | file.close()
478 | 
479 | final_time = time.time()
480 | print(str((final_time - start_time) // 60) + ' is cost in whole process')
481 | 
482 | #--------------------------------------------------------------
483 | # 下面用于线下测试
484 | 
485 | # train_logs = dict()
486 | # train = data[data['day'] < 15]
487 | # for row in train[['userID','itemID']].values:
488 | #     train_logs.setdefault(row[0], [])
489 | #     train_logs[row[0]].append(row[1])
490 | 
491 | # test_logs = dict()
492 | # test = data[data['day'] == 15]
493 | # for row in test[['userID','itemID']].values:
494 | #     test_logs.setdefault(row[0], [])
495 | #     test_logs[row[0]].append(row[1])
496 | 
497 | # recall(test_logs, rec_dict, train_logs)
498 | # recall(test_logs, {x:[x[0] for x in test_recall_logs[x]] for x in test_recall_logs}, train_logs)


--------------------------------------------------------------------------------
/Semi-Finals/online_recommendation/upward_map.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChuanyuXue/CIKM-2019-AnalytiCup/921f88589fac643c700635e8bb32542b240ceac1/Semi-Finals/online_recommendation/upward_map.zip


--------------------------------------------------------------------------------
/Semi-Finals/underline_trainning/Readme.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChuanyuXue/CIKM-2019-AnalytiCup/921f88589fac643c700635e8bb32542b240ceac1/Semi-Finals/underline_trainning/Readme.pdf


--------------------------------------------------------------------------------
/Semi-Finals/underline_trainning/Step1 itemCF_based_on_Apriori/1_generate_user_logs.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# 生成用户日志\n",
  8 |     "\n",
  9 |     "本代码的目的是把用户分组，并在每个组中统计用户的行为日志，以方便后续的并行化处理。\n",
 10 |     "\n",
 11 |     "This code aims to group users into serveral groups, then statistic the user behaivors into each group. This process can simplify the operation in multi-processing  "
 12 |    ]
 13 |   },
 14 |   {
 15 |    "cell_type": "code",
 16 |    "execution_count": 1,
 17 |    "metadata": {},
 18 |    "outputs": [],
 19 |    "source": [
 20 |     "import multiprocessing as mp\n",
 21 |     "import time\n",
 22 |     "import pandas as pd\n",
 23 |     "import numpy as np\n",
 24 |     "\n",
 25 |     "\n",
 26 |     "\n",
 27 |     "def reduce_mem_usage(df):\n",
 28 |     "    \"\"\" iterate through all the columns of a dataframe and modify the data type\n",
 29 |     "        to reduce memory usage.        \n",
 30 |     "    \"\"\"\n",
 31 |     "    start_mem = df.memory_usage().sum() \n",
 32 |     "    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))\n",
 33 |     "    \n",
 34 |     "    for col in df.columns:\n",
 35 |     "        col_type = df[col].dtype\n",
 36 |     "        \n",
 37 |     "        if col_type != object:\n",
 38 |     "            c_min = df[col].min()\n",
 39 |     "            c_max = df[col].max()\n",
 40 |     "            if str(col_type)[:3] == 'int':\n",
 41 |     "                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:\n",
 42 |     "                    df[col] = df[col].astype(np.int8)\n",
 43 |     "                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:\n",
 44 |     "                    df[col] = df[col].astype(np.int16)\n",
 45 |     "                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:\n",
 46 |     "                    df[col] = df[col].astype(np.int32)\n",
 47 |     "                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:\n",
 48 |     "                    df[col] = df[col].astype(np.int64)  \n",
 49 |     "            else:\n",
 50 |     "                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:\n",
 51 |     "                    df[col] = df[col].astype(np.float16)\n",
 52 |     "                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:\n",
 53 |     "                    df[col] = df[col].astype(np.float32)\n",
 54 |     "                else:\n",
 55 |     "                    df[col] = df[col].astype(np.float64)\n",
 56 |     "        else:\n",
 57 |     "            df[col] = df[col].astype('category')\n",
 58 |     "\n",
 59 |     "    end_mem = df.memory_usage().sum() \n",
 60 |     "    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))\n",
 61 |     "    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))\n",
 62 |     "    \n",
 63 |     "    return df\n",
 64 |     "\n",
 65 |     "## 把[user_id, item_id]矩阵 转换成 {user_id: [item_id, item_id, ...., item_id]}字典\n",
 66 |     "def generate_logs_for_each_group(matrix, q):\n",
 67 |     "    user_log = dict()\n",
 68 |     "    for row in matrix:\n",
 69 |     "        user_log.setdefault(row[0], [])\n",
 70 |     "        user_log[row[0]].append(row[1])\n",
 71 |     "    print('This batc is finished')\n",
 72 |     "    ## 把结果放到消息队列里\n",
 73 |     "    q.put(user_log)"
 74 |    ]
 75 |   },
 76 |   {
 77 |    "cell_type": "code",
 78 |    "execution_count": 2,
 79 |    "metadata": {},
 80 |    "outputs": [],
 81 |    "source": [
 82 |     "## 指定用几个CPU处理\n",
 83 |     "CPU_NUMS = 8"
 84 |    ]
 85 |   },
 86 |   {
 87 |    "cell_type": "code",
 88 |    "execution_count": 4,
 89 |    "metadata": {},
 90 |    "outputs": [],
 91 |    "source": [
 92 |     "# round2 train的路径\n",
 93 |     "path = '../ECommAI_EUIR_round2_train_20190816/'"
 94 |    ]
 95 |   },
 96 |   {
 97 |    "cell_type": "code",
 98 |    "execution_count": 5,
 99 |    "metadata": {},
100 |    "outputs": [
101 |     {
102 |      "name": "stdout",
103 |      "output_type": "stream",
104 |      "text": [
105 |       "Memory usage of dataframe is 2575214592.00 MB\n",
106 |       "Memory usage after optimization is: 1046181196.00 MB\n",
107 |       "Decreased by 59.4%\n",
108 |       "This batc is finished\n",
109 |       "This batc is finished\n",
110 |       "This batc is finished\n",
111 |       "This batc is finished\n",
112 |       "This batc is finished\n",
113 |       "This batc is finished\n",
114 |       "This batc is finished\n",
115 |       "This batc is finished\n",
116 |       "Waiting for the son processing\n",
117 |       "This batc is finished\n",
118 |       "Over, the time cost is:6.490148067474365\n"
119 |      ]
120 |     }
121 |    ],
122 |    "source": [
123 |     "data = reduce_mem_usage(pd.read_csv(path+'user_behavior.csv', header=None))\n",
124 |     "user = pd.read_csv(path+'user.csv', header=None)\n",
125 |     "item = pd.read_csv(path+'item.csv', header=None)\n",
126 |     "\n",
127 |     "data['day'] = data[3] // 86400\n",
128 |     "data['hour'] = data[3] // 3600 % 24\n",
129 |     "\n",
130 |     "data = data.drop(3, axis=1)\n",
131 |     "\n",
132 |     "data.columns = ['userID','itemID','behavoir','day','hour']\n",
133 |     "user.columns = ['userID', 'sex', 'age', 'ability']\n",
134 |     "item.columns = ['itemID', 'category', 'shop', 'band']\n",
135 |     "\n",
136 |     "data = data.drop_duplicates(['userID','itemID'],keep=\"last\")\n",
137 |     "data = data.sort_values(['day','hour'], ascending=True).reset_index(drop=True)\n",
138 |     "\n",
139 |     "users = list(set(user['userID']))\n",
140 |     "\n",
141 |     "user_groups = [users[i: i + len(users) // CPU_NUMS] for i in range(0, len(users), len(users) // CPU_NUMS)]\n",
142 |     "\n",
143 |     "## 进程用消息队列沟通\n",
144 |     "q = mp.Queue()\n",
145 |     "for groupID in range(len(user_groups)):\n",
146 |     "    matrix = data[data['userID'].isin(user_groups[groupID])][['userID','itemID']].values\n",
147 |     "    task = mp.Process(target=generate_logs_for_each_group, args=(matrix, q, ))\n",
148 |     "    task.start()\n",
149 |     "    \n",
150 |     "start_time = time.time()\n",
151 |     "print('Waiting for the son processing')\n",
152 |     "while q.qsize() != len(user_groups):\n",
153 |     "    pass\n",
154 |     "end_time = time.time()\n",
155 |     "print(\"Over, the time cost is:\"  + str(end_time - start_time))"
156 |    ]
157 |   },
158 |   {
159 |    "cell_type": "code",
160 |    "execution_count": 6,
161 |    "metadata": {},
162 |    "outputs": [],
163 |    "source": [
164 |     "for i in range(len(user_groups)):\n",
165 |     "    temp = q.get()\n",
166 |     "    ## 把生成的字典保存在文件中\n",
167 |     "    f = open('full_logs/userlogs_group' + str(i) + '.txt','w')\n",
168 |     "    f.write(str(temp))\n",
169 |     "    f.close()"
170 |    ]
171 |   }
172 |  ],
173 |  "metadata": {
174 |   "kernelspec": {
175 |    "display_name": "Python 3",
176 |    "language": "python",
177 |    "name": "python3"
178 |   },
179 |   "language_info": {
180 |    "codemirror_mode": {
181 |     "name": "ipython",
182 |     "version": 3
183 |    },
184 |    "file_extension": ".py",
185 |    "mimetype": "text/x-python",
186 |    "name": "python",
187 |    "nbconvert_exporter": "python",
188 |    "pygments_lexer": "ipython3",
189 |    "version": "3.7.2"
190 |   },
191 |   "toc": {
192 |    "base_numbering": 1,
193 |    "nav_menu": {},
194 |    "number_sections": true,
195 |    "sideBar": true,
196 |    "skip_h1_title": false,
197 |    "title_cell": "Table of Contents",
198 |    "title_sidebar": "Contents",
199 |    "toc_cell": false,
200 |    "toc_position": {},
201 |    "toc_section_display": true,
202 |    "toc_window_display": false
203 |   }
204 |  },
205 |  "nbformat": 4,
206 |  "nbformat_minor": 2
207 | }
208 | 


--------------------------------------------------------------------------------
/Semi-Finals/underline_trainning/Step1 itemCF_based_on_Apriori/2_generate_hot_table.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "## 生成中间文件\n",
  8 |     "\n",
  9 |     "本代码的目的是生成三个中间文件，其中hot_map是统计的商品的出现次数，upwardmap与downward_map是将商品id映射到实数集\\[0, m\\]，其中m代表商品总数。\n",
 10 |     "\n",
 11 |     "This code aims to generate three temporary files. Hot_map statistics the number of appearance of each item. Upward_map and Downward_map can map the ItemID into \\[0, m\\], where m indicates the size of item corpus."
 12 |    ]
 13 |   },
 14 |   {
 15 |    "cell_type": "code",
 16 |    "execution_count": 1,
 17 |    "metadata": {},
 18 |    "outputs": [],
 19 |    "source": [
 20 |     "import pandas as pd\n",
 21 |     "import numpy as np\n",
 22 |     "from sklearn import preprocessing\n",
 23 |     "\n",
 24 |     "# round2 train的路径\n",
 25 |     "path = '../ECommAI_EUIR_round2_train_20190816/'\n",
 26 |     "data = pd.read_csv(path + 'user_behavior.csv',header=None)"
 27 |    ]
 28 |   },
 29 |   {
 30 |    "cell_type": "code",
 31 |    "execution_count": 2,
 32 |    "metadata": {},
 33 |    "outputs": [],
 34 |    "source": [
 35 |     "data.columns = ['userID','itemID','behavior','timestamp']\n",
 36 |     "data['day'] = data['timestamp'] // 86400\n",
 37 |     "data['hour'] = data['timestamp'] // 3600 % 24"
 38 |    ]
 39 |   },
 40 |   {
 41 |    "cell_type": "code",
 42 |    "execution_count": 3,
 43 |    "metadata": {},
 44 |    "outputs": [],
 45 |    "source": [
 46 |     "## 统计每个用户的行为数量\n",
 47 |     "user_times = data[['itemID','userID']].groupby('userID', as_index=False).count()\n",
 48 |     "user_times.columns = ['userID','itemCount']\n",
 49 |     "user_times_map = dict(zip(user_times['userID'], user_times['itemCount']))"
 50 |    ]
 51 |   },
 52 |   {
 53 |    "cell_type": "code",
 54 |    "execution_count": 7,
 55 |    "metadata": {},
 56 |    "outputs": [],
 57 |    "source": [
 58 |     "## 把每个用户的行为数量的字典保存在本地文件\n",
 59 |     "f = open('usersActivity_map.txt', 'w')\n",
 60 |     "f.write(str(user_times_map))\n",
 61 |     "f.close()"
 62 |    ]
 63 |   },
 64 |   {
 65 |    "cell_type": "markdown",
 66 |    "metadata": {},
 67 |    "source": [
 68 |     "生成upward_map 与 downward_map:"
 69 |    ]
 70 |   },
 71 |   {
 72 |    "cell_type": "code",
 73 |    "execution_count": null,
 74 |    "metadata": {},
 75 |    "outputs": [],
 76 |    "source": [
 77 |     "## 建立一个 商品编号(itemID) <----> 商品编码(从0开始顺序编码)的双射\n",
 78 |     "item = pd.read_csv(path + 'item.csv', header=None)\n",
 79 |     "item.columns = ['itemID','sex','age','ability']\n",
 80 |     "le = preprocessing.LabelEncoder()\n",
 81 |     "item['encoding'] = le.fit_transform(item['itemID'])\n",
 82 |     "\n",
 83 |     "upward_map = dict(zip(item['itemID'], item['encoding']))\n",
 84 |     "downward_map = dict(zip(item['encoding'], item['itemID']))"
 85 |    ]
 86 |   },
 87 |   {
 88 |    "cell_type": "markdown",
 89 |    "metadata": {},
 90 |    "source": [
 91 |     "生成hot table"
 92 |    ]
 93 |   },
 94 |   {
 95 |    "cell_type": "code",
 96 |    "execution_count": null,
 97 |    "metadata": {},
 98 |    "outputs": [],
 99 |    "source": [
100 |     "## 统计每个商品的行为数量\n",
101 |     "temp = data[['itemID','behavior']].groupby('itemID',as_index=False).count()\n",
102 |     "hot_map = dict(zip(temp['itemID'], temp['behavior']))"
103 |    ]
104 |   },
105 |   {
106 |    "cell_type": "code",
107 |    "execution_count": null,
108 |    "metadata": {},
109 |    "outputs": [],
110 |    "source": [
111 |     "def save_to_file(trans_map, file_path):\n",
112 |     "    trans_map = str(trans_map)\n",
113 |     "    f = open(file_path, 'w')\n",
114 |     "    f.write(trans_map)\n",
115 |     "    f.close()"
116 |    ]
117 |   },
118 |   {
119 |    "cell_type": "code",
120 |    "execution_count": null,
121 |    "metadata": {},
122 |    "outputs": [],
123 |    "source": [
124 |     "## 把这些文件保存在本地\n",
125 |     "save_to_file(hot_map,'hot_items_map.txt')\n",
126 |     "\n",
127 |     "save_to_file(upward_map,'upward_map.txt')\n",
128 |     "\n",
129 |     "save_to_file(downward_map,'downward_map.txt')"
130 |    ]
131 |   }
132 |  ],
133 |  "metadata": {
134 |   "kernelspec": {
135 |    "display_name": "Python 3",
136 |    "language": "python",
137 |    "name": "python3"
138 |   },
139 |   "language_info": {
140 |    "codemirror_mode": {
141 |     "name": "ipython",
142 |     "version": 3
143 |    },
144 |    "file_extension": ".py",
145 |    "mimetype": "text/x-python",
146 |    "name": "python",
147 |    "nbconvert_exporter": "python",
148 |    "pygments_lexer": "ipython3",
149 |    "version": "3.7.2"
150 |   },
151 |   "toc": {
152 |    "base_numbering": 1,
153 |    "nav_menu": {},
154 |    "number_sections": true,
155 |    "sideBar": true,
156 |    "skip_h1_title": false,
157 |    "title_cell": "Table of Contents",
158 |    "title_sidebar": "Contents",
159 |    "toc_cell": false,
160 |    "toc_position": {},
161 |    "toc_section_display": true,
162 |    "toc_window_display": false
163 |   }
164 |  },
165 |  "nbformat": 4,
166 |  "nbformat_minor": 2
167 | }
168 | 


--------------------------------------------------------------------------------
/Semi-Finals/underline_trainning/Step1 itemCF_based_on_Apriori/3_generate_original_matrix.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "## 统计相似度矩阵\n",
  8 |     "\n",
  9 |     "此代码目的是统计每个group中的相似度矩阵，由于cython和multiprocessing在notebook环境下同时使用会出现一些问题，所以分了很多个文件。\n",
 10 |     "\n",
 11 |     "This code aims to calculate the similarity matrix in each group. Due to cython and multiprocessing cannot work well together in notebook envirment, so we generated many simliar codes to finish the same work for multiprocessing."
 12 |    ]
 13 |   },
 14 |   {
 15 |    "cell_type": "code",
 16 |    "execution_count": 1,
 17 |    "metadata": {},
 18 |    "outputs": [],
 19 |    "source": [
 20 |     "import pandas as pd\n",
 21 |     "import numpy as np\n",
 22 |     "import sys\n",
 23 |     "from scipy.sparse import lil_matrix\n",
 24 |     "import scipy as scp\n",
 25 |     "import time\n",
 26 |     "%load_ext Cython"
 27 |    ]
 28 |   },
 29 |   {
 30 |    "cell_type": "code",
 31 |    "execution_count": 2,
 32 |    "metadata": {},
 33 |    "outputs": [],
 34 |    "source": [
 35 |     "ITEM_NUM = 4318201\n",
 36 |     "\n",
 37 |     "## 用于读取前面生成的用户行为日志\n",
 38 |     "def get_logs_from_hardisk(path):\n",
 39 |     "    f = open(path, 'r')\n",
 40 |     "    a = f.read()\n",
 41 |     "    dict_name = eval(a)\n",
 42 |     "    f.close()\n",
 43 |     "    return dict_name\n",
 44 |     "\n",
 45 |     "## 读取用户的热度\n",
 46 |     "f = open('usersActivity_map.txt', 'r')\n",
 47 |     "m = f.read()\n",
 48 |     "user_times_map = eval(m)\n",
 49 |     "f.close()"
 50 |    ]
 51 |   },
 52 |   {
 53 |    "cell_type": "code",
 54 |    "execution_count": 3,
 55 |    "metadata": {},
 56 |    "outputs": [],
 57 |    "source": [
 58 |     "%%cython\n",
 59 |     "\n",
 60 |     "import datetime\n",
 61 |     "import math\n",
 62 |     "\n",
 63 |     "## 创建Cpython函数，使用底层C语言加速\n",
 64 |     "cpdef calculate_matrix(mat, list user_logs, dict user_times_map):\n",
 65 |     "    cdef int index, i1, i2, count\n",
 66 |     "    cdef list item_log\n",
 67 |     "    cdef tuple u\n",
 68 |     "    \n",
 69 |     "    count = 0\n",
 70 |     "    for u in user_logs:\n",
 71 |     "        count += 1\n",
 72 |     "        if count % 1000 == 0:\n",
 73 |     "            print('The %d'%count + ' users are finished.')\n",
 74 |     "            print(datetime.datetime.now().strftime('%H:%M:%S'))\n",
 75 |     "            \n",
 76 |     "        item_log = u[1]   \n",
 77 |     "        \n",
 78 |     "        ## 此处统计商品的相似度\n",
 79 |     "        for index, i1 in enumerate(item_log):\n",
 80 |     "            for i2 in item_log[(index+1): ]:\n",
 81 |     "                ## 每次两个商品被同一个用户选择时，相似度增加 1 / log(1 + 用户购买商品数量)，参考TF-IDF原理\n",
 82 |     "                weight = 1/(math.log(1+user_times_map[u[0]]))\n",
 83 |     "                mat[i1, i2] += weight\n",
 84 |     "                mat[i2, i1] += weight\n",
 85 |     "    return mat"
 86 |    ]
 87 |   },
 88 |   {
 89 |    "cell_type": "code",
 90 |    "execution_count": 4,
 91 |    "metadata": {},
 92 |    "outputs": [
 93 |     {
 94 |      "name": "stdout",
 95 |      "output_type": "stream",
 96 |      "text": [
 97 |       "The 0  batch is started!\n",
 98 |       "--------------------------------\n",
 99 |       "The 1000 users are finished.\n",
100 |       "17:42:50\n",
101 |       "The 2000 users are finished.\n",
102 |       "17:43:20\n",
103 |       "The 3000 users are finished.\n",
104 |       "17:43:52\n",
105 |       "The 4000 users are finished.\n",
106 |       "17:44:22\n",
107 |       "The 5000 users are finished.\n",
108 |       "17:44:59\n",
109 |       "The 6000 users are finished.\n",
110 |       "17:45:46\n",
111 |       "The 7000 users are finished.\n",
112 |       "17:46:36\n",
113 |       "The 8000 users are finished.\n",
114 |       "17:47:37\n",
115 |       "The 9000 users are finished.\n",
116 |       "17:48:41\n",
117 |       "The 10000 users are finished.\n",
118 |       "17:49:35\n",
119 |       "save successfully\n",
120 |       "--------------------------------\n",
121 |       "The 10000  batch is started!\n",
122 |       "--------------------------------\n",
123 |       "The 1000 users are finished.\n",
124 |       "17:53:02\n",
125 |       "The 2000 users are finished.\n",
126 |       "17:53:50\n",
127 |       "The 3000 users are finished.\n",
128 |       "17:54:37\n",
129 |       "The 4000 users are finished.\n",
130 |       "17:55:29\n",
131 |       "The 5000 users are finished.\n",
132 |       "17:56:14\n",
133 |       "The 6000 users are finished.\n",
134 |       "17:57:02\n",
135 |       "The 7000 users are finished.\n",
136 |       "17:57:52\n",
137 |       "The 8000 users are finished.\n",
138 |       "17:58:46\n",
139 |       "The 9000 users are finished.\n",
140 |       "17:59:42\n",
141 |       "The 10000 users are finished.\n",
142 |       "18:00:31\n",
143 |       "save successfully\n",
144 |       "--------------------------------\n",
145 |       "The 20000  batch is started!\n",
146 |       "--------------------------------\n",
147 |       "The 1000 users are finished.\n",
148 |       "18:04:28\n",
149 |       "The 2000 users are finished.\n",
150 |       "18:05:15\n",
151 |       "The 3000 users are finished.\n",
152 |       "18:05:57\n",
153 |       "The 4000 users are finished.\n",
154 |       "18:06:42\n",
155 |       "The 5000 users are finished.\n",
156 |       "18:07:43\n",
157 |       "The 6000 users are finished.\n",
158 |       "18:08:31\n",
159 |       "The 7000 users are finished.\n",
160 |       "18:09:18\n",
161 |       "The 8000 users are finished.\n",
162 |       "18:10:11\n",
163 |       "The 9000 users are finished.\n",
164 |       "18:11:10\n",
165 |       "The 10000 users are finished.\n",
166 |       "18:12:07\n",
167 |       "save successfully\n",
168 |       "--------------------------------\n",
169 |       "The 30000  batch is started!\n",
170 |       "--------------------------------\n",
171 |       "The 1000 users are finished.\n",
172 |       "18:16:06\n",
173 |       "The 2000 users are finished.\n",
174 |       "18:16:47\n",
175 |       "The 3000 users are finished.\n",
176 |       "18:17:29\n",
177 |       "The 4000 users are finished.\n",
178 |       "18:18:22\n",
179 |       "The 5000 users are finished.\n",
180 |       "18:19:11\n",
181 |       "The 6000 users are finished.\n",
182 |       "18:20:07\n",
183 |       "The 7000 users are finished.\n",
184 |       "18:20:51\n",
185 |       "The 8000 users are finished.\n",
186 |       "18:21:43\n",
187 |       "The 9000 users are finished.\n",
188 |       "18:22:34\n",
189 |       "The 10000 users are finished.\n",
190 |       "18:23:22\n",
191 |       "save successfully\n",
192 |       "--------------------------------\n",
193 |       "The 40000  batch is started!\n",
194 |       "--------------------------------\n",
195 |       "The 1000 users are finished.\n",
196 |       "18:26:30\n",
197 |       "The 2000 users are finished.\n",
198 |       "18:27:04\n",
199 |       "The 3000 users are finished.\n",
200 |       "18:27:45\n",
201 |       "The 4000 users are finished.\n",
202 |       "18:28:23\n",
203 |       "The 5000 users are finished.\n",
204 |       "18:29:07\n",
205 |       "The 6000 users are finished.\n",
206 |       "18:29:53\n",
207 |       "The 7000 users are finished.\n",
208 |       "18:30:35\n",
209 |       "The 8000 users are finished.\n",
210 |       "18:31:27\n",
211 |       "The 9000 users are finished.\n",
212 |       "18:32:06\n",
213 |       "The 10000 users are finished.\n",
214 |       "18:32:55\n",
215 |       "save successfully\n",
216 |       "--------------------------------\n",
217 |       "The 50000  batch is started!\n",
218 |       "--------------------------------\n",
219 |       "The 3000 users are finished.\n",
220 |       "18:37:33\n",
221 |       "The 4000 users are finished.\n",
222 |       "18:38:13\n",
223 |       "The 5000 users are finished.\n",
224 |       "18:38:56\n",
225 |       "The 6000 users are finished.\n",
226 |       "18:39:38\n",
227 |       "The 7000 users are finished.\n",
228 |       "18:40:13\n",
229 |       "The 8000 users are finished.\n",
230 |       "18:40:47\n",
231 |       "The 9000 users are finished.\n",
232 |       "18:41:28\n",
233 |       "The 10000 users are finished.\n",
234 |       "18:42:05\n",
235 |       "save successfully\n",
236 |       "--------------------------------\n",
237 |       "The 60000  batch is started!\n",
238 |       "--------------------------------\n",
239 |       "The 1000 users are finished.\n",
240 |       "18:44:52\n",
241 |       "The 2000 users are finished.\n",
242 |       "18:45:25\n",
243 |       "The 3000 users are finished.\n",
244 |       "18:46:00\n",
245 |       "The 4000 users are finished.\n",
246 |       "18:46:32\n",
247 |       "The 5000 users are finished.\n",
248 |       "18:47:01\n",
249 |       "The 6000 users are finished.\n",
250 |       "18:47:34\n",
251 |       "The 7000 users are finished.\n",
252 |       "18:48:04\n",
253 |       "The 8000 users are finished.\n",
254 |       "18:48:42\n",
255 |       "The 9000 users are finished.\n",
256 |       "18:49:16\n",
257 |       "The 10000 users are finished.\n",
258 |       "18:49:50\n",
259 |       "save successfully\n",
260 |       "--------------------------------\n",
261 |       "The 70000  batch is started!\n",
262 |       "--------------------------------\n",
263 |       "The 1000 users are finished.\n",
264 |       "18:52:13\n",
265 |       "The 2000 users are finished.\n",
266 |       "18:52:43\n",
267 |       "The 3000 users are finished.\n",
268 |       "18:53:12\n",
269 |       "The 4000 users are finished.\n",
270 |       "18:53:43\n",
271 |       "The 5000 users are finished.\n",
272 |       "18:54:11\n",
273 |       "The 6000 users are finished.\n",
274 |       "18:54:42\n",
275 |       "The 7000 users are finished.\n",
276 |       "18:55:12\n",
277 |       "The 8000 users are finished.\n",
278 |       "18:55:39\n",
279 |       "The 9000 users are finished.\n",
280 |       "18:56:12\n",
281 |       "The 10000 users are finished.\n",
282 |       "18:56:50\n",
283 |       "save successfully\n",
284 |       "--------------------------------\n",
285 |       "The 80000  batch is started!\n",
286 |       "--------------------------------\n",
287 |       "The 1000 users are finished.\n",
288 |       "18:59:02\n",
289 |       "The 2000 users are finished.\n",
290 |       "18:59:33\n",
291 |       "The 3000 users are finished.\n",
292 |       "19:00:08\n",
293 |       "The 4000 users are finished.\n",
294 |       "19:00:39\n",
295 |       "The 5000 users are finished.\n",
296 |       "19:01:10\n",
297 |       "The 6000 users are finished.\n",
298 |       "19:01:37\n",
299 |       "The 7000 users are finished.\n",
300 |       "19:02:12\n",
301 |       "The 8000 users are finished.\n",
302 |       "19:02:41\n",
303 |       "The 9000 users are finished.\n",
304 |       "19:03:10\n",
305 |       "The 10000 users are finished.\n",
306 |       "19:03:36\n",
307 |       "save successfully\n",
308 |       "--------------------------------\n",
309 |       "The 90000  batch is started!\n",
310 |       "--------------------------------\n",
311 |       "The 1000 users are finished.\n",
312 |       "19:05:46\n",
313 |       "The 2000 users are finished.\n",
314 |       "19:06:09\n",
315 |       "The 3000 users are finished.\n",
316 |       "19:06:33\n",
317 |       "The 4000 users are finished.\n",
318 |       "19:06:57\n",
319 |       "The 5000 users are finished.\n",
320 |       "19:07:20\n",
321 |       "The 6000 users are finished.\n",
322 |       "19:07:42\n",
323 |       "The 7000 users are finished.\n",
324 |       "19:08:07\n",
325 |       "The 8000 users are finished.\n",
326 |       "19:08:31\n",
327 |       "The 9000 users are finished.\n",
328 |       "19:08:54\n",
329 |       "The 10000 users are finished.\n",
330 |       "19:09:20\n",
331 |       "save successfully\n",
332 |       "--------------------------------\n",
333 |       "The 100000  batch is started!\n",
334 |       "--------------------------------\n",
335 |       "The 1000 users are finished.\n",
336 |       "19:11:13\n",
337 |       "The 2000 users are finished.\n",
338 |       "19:11:36\n",
339 |       "The 3000 users are finished.\n",
340 |       "19:11:57\n",
341 |       "The 4000 users are finished.\n",
342 |       "19:12:16\n",
343 |       "The 5000 users are finished.\n",
344 |       "19:12:35\n",
345 |       "The 6000 users are finished.\n",
346 |       "19:12:57\n",
347 |       "The 7000 users are finished.\n",
348 |       "19:13:17\n",
349 |       "The 8000 users are finished.\n",
350 |       "19:13:39\n",
351 |       "The 9000 users are finished.\n",
352 |       "19:13:57\n",
353 |       "The 10000 users are finished.\n",
354 |       "19:14:18\n",
355 |       "save successfully\n",
356 |       "--------------------------------\n",
357 |       "The 110000  batch is started!\n",
358 |       "--------------------------------\n",
359 |       "The 1000 users are finished.\n",
360 |       "19:15:49\n",
361 |       "The 2000 users are finished.\n",
362 |       "19:16:08\n",
363 |       "The 3000 users are finished.\n",
364 |       "19:16:25\n",
365 |       "The 4000 users are finished.\n",
366 |       "19:16:48\n",
367 |       "The 5000 users are finished.\n",
368 |       "19:17:10\n",
369 |       "The 6000 users are finished.\n",
370 |       "19:17:31\n",
371 |       "The 7000 users are finished.\n",
372 |       "19:17:50\n",
373 |       "The 8000 users are finished.\n",
374 |       "19:18:11\n",
375 |       "The 9000 users are finished.\n",
376 |       "19:18:29\n",
377 |       "The 10000 users are finished.\n",
378 |       "19:18:49\n",
379 |       "save successfully\n",
380 |       "--------------------------------\n",
381 |       "The 120000  batch is started!\n",
382 |       "--------------------------------\n",
383 |       "The 1000 users are finished.\n",
384 |       "19:20:14\n",
385 |       "The 2000 users are finished.\n",
386 |       "19:20:28\n",
387 |       "The 3000 users are finished.\n",
388 |       "19:20:44\n",
389 |       "The 4000 users are finished.\n",
390 |       "19:20:59\n",
391 |       "The 5000 users are finished.\n",
392 |       "19:21:14\n",
393 |       "The 6000 users are finished.\n",
394 |       "19:21:28\n",
395 |       "The 7000 users are finished.\n",
396 |       "19:21:42\n",
397 |       "The 8000 users are finished.\n",
398 |       "19:21:57\n",
399 |       "The 9000 users are finished.\n",
400 |       "19:22:10\n",
401 |       "The 10000 users are finished.\n",
402 |       "19:22:25\n",
403 |       "save successfully\n",
404 |       "--------------------------------\n",
405 |       "The 130000  batch is started!\n",
406 |       "--------------------------------\n",
407 |       "The 1000 users are finished.\n",
408 |       "19:23:36\n",
409 |       "The 2000 users are finished.\n",
410 |       "19:23:50\n",
411 |       "The 3000 users are finished.\n",
412 |       "19:24:01\n",
413 |       "The 4000 users are finished.\n",
414 |       "19:24:15\n",
415 |       "The 5000 users are finished.\n",
416 |       "19:24:31\n",
417 |       "The 6000 users are finished.\n",
418 |       "19:24:44\n",
419 |       "The 7000 users are finished.\n",
420 |       "19:24:56\n",
421 |       "The 8000 users are finished.\n",
422 |       "19:25:08\n",
423 |       "The 9000 users are finished.\n",
424 |       "19:25:20\n",
425 |       "The 10000 users are finished.\n",
426 |       "19:25:31\n",
427 |       "save successfully\n",
428 |       "--------------------------------\n",
429 |       "The 140000  batch is started!\n",
430 |       "--------------------------------\n",
431 |       "The 1000 users are finished.\n",
432 |       "19:26:26\n",
433 |       "The 2000 users are finished.\n",
434 |       "19:26:37\n",
435 |       "The 3000 users are finished.\n",
436 |       "19:26:46\n",
437 |       "The 4000 users are finished.\n",
438 |       "19:26:57\n",
439 |       "The 5000 users are finished.\n",
440 |       "19:27:07\n",
441 |       "The 6000 users are finished.\n",
442 |       "19:27:16\n",
443 |       "The 7000 users are finished.\n",
444 |       "19:27:26\n",
445 |       "The 8000 users are finished.\n",
446 |       "19:27:37\n",
447 |       "The 9000 users are finished.\n",
448 |       "19:27:46\n",
449 |       "The 10000 users are finished.\n",
450 |       "19:27:54\n",
451 |       "save successfully\n",
452 |       "--------------------------------\n",
453 |       "The 150000  batch is started!\n",
454 |       "--------------------------------\n",
455 |       "The 1000 users are finished.\n",
456 |       "19:28:38\n",
457 |       "The 2000 users are finished.\n",
458 |       "19:28:46\n",
459 |       "The 3000 users are finished.\n",
460 |       "19:28:53\n",
461 |       "The 4000 users are finished.\n",
462 |       "19:29:02\n",
463 |       "The 5000 users are finished.\n",
464 |       "19:29:10\n",
465 |       "The 6000 users are finished.\n",
466 |       "19:29:18\n",
467 |       "The 7000 users are finished.\n",
468 |       "19:29:26\n",
469 |       "The 8000 users are finished.\n",
470 |       "19:29:35\n",
471 |       "The 9000 users are finished.\n",
472 |       "19:29:43\n",
473 |       "The 10000 users are finished.\n",
474 |       "19:29:51\n",
475 |       "save successfully\n",
476 |       "--------------------------------\n",
477 |       "The 160000  batch is started!\n",
478 |       "--------------------------------\n",
479 |       "The 1000 users are finished.\n",
480 |       "19:30:32\n",
481 |       "The 2000 users are finished.\n",
482 |       "19:30:38\n",
483 |       "The 3000 users are finished.\n",
484 |       "19:30:44\n",
485 |       "The 4000 users are finished.\n",
486 |       "19:30:50\n",
487 |       "The 5000 users are finished.\n",
488 |       "19:30:54\n",
489 |       "The 6000 users are finished.\n",
490 |       "19:31:00\n",
491 |       "The 7000 users are finished.\n",
492 |       "19:31:05\n",
493 |       "The 8000 users are finished.\n",
494 |       "19:31:09\n"
495 |      ]
496 |     },
497 |     {
498 |      "name": "stdout",
499 |      "output_type": "stream",
500 |      "text": [
501 |       "The 9000 users are finished.\n",
502 |       "19:31:14\n",
503 |       "The 10000 users are finished.\n",
504 |       "19:31:18\n",
505 |       "save successfully\n",
506 |       "--------------------------------\n",
507 |       "The 170000  batch is started!\n",
508 |       "--------------------------------\n",
509 |       "The 1000 users are finished.\n",
510 |       "19:31:46\n",
511 |       "The 2000 users are finished.\n",
512 |       "19:31:50\n",
513 |       "The 3000 users are finished.\n",
514 |       "19:31:53\n",
515 |       "The 4000 users are finished.\n",
516 |       "19:31:56\n",
517 |       "save successfully\n",
518 |       "--------------------------------\n"
519 |      ]
520 |     }
521 |    ],
522 |    "source": [
523 |     "for each_thread in range(9):\n",
524 |     "    ## 读取用户日志\n",
525 |     "    user_logs = get_logs_from_hardisk('full_logs/userlogs_group%d.txt'%each_thread)\n",
526 |     "    f = open('upward_map.txt','r')\n",
527 |     "    ## 读取之前建立的双射函数\n",
528 |     "    upward_map = eval(f.read())\n",
529 |     "    f.close()\n",
530 |     "    for u in user_logs:\n",
531 |     "        user_logs[u] = [int(upward_map[x]) for x in user_logs[u]]\n",
532 |     "    user_logs = list(user_logs.items())\n",
533 |     "    \n",
534 |     "    for i in range(0, len(user_logs), 10000):\n",
535 |     "        print('The %d '%i + ' batch is started!')\n",
536 |     "        print('--------------------------------')\n",
537 |     "        ## 构建一个大小为(商品数量 x 商品数量的)稀疏矩阵\n",
538 |     "        mat = lil_matrix((ITEM_NUM+1, ITEM_NUM+1), dtype=float)\n",
539 |     "        ## 使用这个稀疏矩阵 存储部分商品与商品间相似度\n",
540 |     "        ## 注意，最终的商品与商品相似度，由很多这些小的稀疏矩阵的和够成\n",
541 |     "        ## 这样做有两个好处 1.节省内存空间 2.便于扩展为并行加速处理\n",
542 |     "        mat = calculate_matrix(mat, user_logs[i: i + 10000], user_times_map)\n",
543 |     "        ## 将小的稀疏矩阵保存在本地\n",
544 |     "        scp.sparse.save_npz('tmpData/sparse_matrix_%d_batch_group%d.npz'%(i, each_thread), mat.tocsr())\n",
545 |     "        print('save successfully')\n",
546 |     "        print('--------------------------------')"
547 |    ]
548 |   }
549 |  ],
550 |  "metadata": {
551 |   "kernelspec": {
552 |    "display_name": "Python 3",
553 |    "language": "python",
554 |    "name": "python3"
555 |   },
556 |   "language_info": {
557 |    "codemirror_mode": {
558 |     "name": "ipython",
559 |     "version": 3
560 |    },
561 |    "file_extension": ".py",
562 |    "mimetype": "text/x-python",
563 |    "name": "python",
564 |    "nbconvert_exporter": "python",
565 |    "pygments_lexer": "ipython3",
566 |    "version": "3.7.2"
567 |   },
568 |   "toc": {
569 |    "base_numbering": 1,
570 |    "nav_menu": {},
571 |    "number_sections": true,
572 |    "sideBar": true,
573 |    "skip_h1_title": false,
574 |    "title_cell": "Table of Contents",
575 |    "title_sidebar": "Contents",
576 |    "toc_cell": false,
577 |    "toc_position": {},
578 |    "toc_section_display": true,
579 |    "toc_window_display": false
580 |   }
581 |  },
582 |  "nbformat": 4,
583 |  "nbformat_minor": 2
584 | }
585 | 


--------------------------------------------------------------------------------
/Semi-Finals/underline_trainning/Step1 itemCF_based_on_Apriori/4_Merge.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "## 合并相似性矩阵\n",
  8 |     "\n",
  9 |     "此代码目的是将多个group中的相似性矩阵合并。\n",
 10 |     "\n",
 11 |     "This code aims to merge the similarity matrix from several groups"
 12 |    ]
 13 |   },
 14 |   {
 15 |    "cell_type": "code",
 16 |    "execution_count": 1,
 17 |    "metadata": {},
 18 |    "outputs": [],
 19 |    "source": [
 20 |     "import pandas as pd\n",
 21 |     "import numpy as np\n",
 22 |     "from scipy.sparse import *\n",
 23 |     "import scipy\n",
 24 |     "import os\n"
 25 |    ]
 26 |   },
 27 |   {
 28 |    "cell_type": "code",
 29 |    "execution_count": 2,
 30 |    "metadata": {},
 31 |    "outputs": [],
 32 |    "source": [
 33 |     "path = 'tmpData/'"
 34 |    ]
 35 |   },
 36 |   {
 37 |    "cell_type": "code",
 38 |    "execution_count": 6,
 39 |    "metadata": {},
 40 |    "outputs": [
 41 |     {
 42 |      "name": "stdout",
 43 |      "output_type": "stream",
 44 |      "text": [
 45 |       "save success for 10 batch\n",
 46 |       "-------------------------\n",
 47 |       "save success for 11 batch\n",
 48 |       "-------------------------\n"
 49 |      ]
 50 |     }
 51 |    ],
 52 |    "source": [
 53 |     "lenth = 359850\n",
 54 |     "for i in range(0, 12):\n",
 55 |     "    mat = None\n",
 56 |     "    start = lenth * i\n",
 57 |     "    end = lenth * (i + 1)\n",
 58 |     "    count = 0\n",
 59 |     "    for name in os.listdir('tmpData/'):\n",
 60 |     "        if name[-3:] == 'npz':\n",
 61 |     "            if mat == None:\n",
 62 |     "                mat = load_npz(path + name)[start: end]\n",
 63 |     "            else:\n",
 64 |     "                ## 把分块计算的相似度矩阵合并\n",
 65 |     "                mat += load_npz(path + name)[start: end]\n",
 66 |     "        count += 1\n",
 67 |     "    scipy.sparse.save_npz('commonMatrix/common_matrix_from_%d_to_%d.npz'%(start, end), mat)\n",
 68 |     "    print('save success for %d batch'%i)\n",
 69 |     "    print('-------------------------')"
 70 |    ]
 71 |   },
 72 |   {
 73 |    "cell_type": "code",
 74 |    "execution_count": null,
 75 |    "metadata": {},
 76 |    "outputs": [],
 77 |    "source": []
 78 |   }
 79 |  ],
 80 |  "metadata": {
 81 |   "kernelspec": {
 82 |    "display_name": "Python 3",
 83 |    "language": "python",
 84 |    "name": "python3"
 85 |   },
 86 |   "language_info": {
 87 |    "codemirror_mode": {
 88 |     "name": "ipython",
 89 |     "version": 3
 90 |    },
 91 |    "file_extension": ".py",
 92 |    "mimetype": "text/x-python",
 93 |    "name": "python",
 94 |    "nbconvert_exporter": "python",
 95 |    "pygments_lexer": "ipython3",
 96 |    "version": "3.7.2"
 97 |   },
 98 |   "toc": {
 99 |    "base_numbering": 1,
100 |    "nav_menu": {},
101 |    "number_sections": true,
102 |    "sideBar": true,
103 |    "skip_h1_title": false,
104 |    "title_cell": "Table of Contents",
105 |    "title_sidebar": "Contents",
106 |    "toc_cell": false,
107 |    "toc_position": {},
108 |    "toc_section_display": true,
109 |    "toc_window_display": false
110 |   }
111 |  },
112 |  "nbformat": 4,
113 |  "nbformat_minor": 2
114 | }
115 | 


--------------------------------------------------------------------------------
/Semi-Finals/underline_trainning/Step1 itemCF_based_on_Apriori/5_Save_sparse_to_dense.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "## 构建哈希表\n",
  8 |     "\n",
  9 |     "此代码目的是将相似性矩阵转化为可供快速检索的哈希结构。\n",
 10 |     "\n",
 11 |     "This code aims to tansfer similarity matrix into hash table for efficient retrieval"
 12 |    ]
 13 |   },
 14 |   {
 15 |    "cell_type": "code",
 16 |    "execution_count": 1,
 17 |    "metadata": {},
 18 |    "outputs": [],
 19 |    "source": [
 20 |     "import numpy as np\n",
 21 |     "from scipy.sparse import *\n",
 22 |     "import os"
 23 |    ]
 24 |   },
 25 |   {
 26 |    "cell_type": "code",
 27 |    "execution_count": 2,
 28 |    "metadata": {},
 29 |    "outputs": [
 30 |     {
 31 |      "data": {
 32 |       "text/plain": [
 33 |        "['common_matrix_from_3238650_to_3598500.npz',\n",
 34 |        " 'common_matrix_from_359850_to_719700.npz',\n",
 35 |        " 'common_matrix_from_3598500_to_3958350.npz',\n",
 36 |        " 'common_matrix_from_3958350_to_4318200.npz',\n",
 37 |        " 'common_matrix_from_2878800_to_3238650.npz',\n",
 38 |        " 'common_matrix_from_1439400_to_1799250.npz',\n",
 39 |        " 'common_matrix_from_2518950_to_2878800.npz',\n",
 40 |        " 'common_matrix_from_0_to_359850.npz',\n",
 41 |        " 'common_matrix_from_719700_to_1079550.npz',\n",
 42 |        " 'common_matrix_from_1079550_to_1439400.npz',\n",
 43 |        " 'common_matrix_from_2159100_to_2518950.npz',\n",
 44 |        " 'common_matrix_from_1799250_to_2159100.npz']"
 45 |       ]
 46 |      },
 47 |      "execution_count": 2,
 48 |      "metadata": {},
 49 |      "output_type": "execute_result"
 50 |     }
 51 |    ],
 52 |    "source": [
 53 |     "os.listdir('commonMatrix/')"
 54 |    ]
 55 |   },
 56 |   {
 57 |    "cell_type": "code",
 58 |    "execution_count": null,
 59 |    "metadata": {},
 60 |    "outputs": [
 61 |     {
 62 |      "name": "stdout",
 63 |      "output_type": "stream",
 64 |      "text": [
 65 |       "finished\n",
 66 |       "finished\n",
 67 |       "finished\n",
 68 |       "finished\n",
 69 |       "finished\n",
 70 |       "finished\n",
 71 |       "finished\n",
 72 |       "finished\n"
 73 |      ]
 74 |     }
 75 |    ],
 76 |    "source": [
 77 |     "for name in os.listdir('commonMatrix/'):\n",
 78 |     "    mat = load_npz('commonMatrix/' + name).tolil()\n",
 79 |     "    l = []\n",
 80 |     "    for i in range(mat.shape[0]):\n",
 81 |     "        _, a, b = find(mat[i])\n",
 82 |     "        index = np.where(b > 1.5)\n",
 83 |     "        #l.append(sorted(list(zip(a[index], b[index])),key= lambda x:x[1], reverse=True))\n",
 84 |     "        \n",
 85 |     "        c = np.array( [round(x,3) for x in b] )\n",
 86 |     "        l.append(sorted(list(zip(a[index], c[index])),key= lambda x:x[1], reverse=True))\n",
 87 |     "        \n",
 88 |     "    l = str(l)\n",
 89 |     "    f = open('common_dense_valued_small/' + name, 'w')\n",
 90 |     "    f.write(l)\n",
 91 |     "    f.close()\n",
 92 |     "    print('finished')"
 93 |    ]
 94 |   },
 95 |   {
 96 |    "cell_type": "code",
 97 |    "execution_count": 1,
 98 |    "metadata": {},
 99 |    "outputs": [],
100 |    "source": [
101 |     "\n",
102 |     "#f = open('usersActivity_map.txt', 'r')\n",
103 |     "#m = f.read()\n",
104 |     "#user_times_map = eval(m)\n",
105 |     "#f.close()"
106 |    ]
107 |   },
108 |   {
109 |    "cell_type": "code",
110 |    "execution_count": 4,
111 |    "metadata": {},
112 |    "outputs": [
113 |     {
114 |      "data": {
115 |       "text/plain": [
116 |        "705"
117 |       ]
118 |      },
119 |      "execution_count": 4,
120 |      "metadata": {},
121 |      "output_type": "execute_result"
122 |     }
123 |    ],
124 |    "source": [
125 |     "#max( user_times_map.values() )"
126 |    ]
127 |   },
128 |   {
129 |    "cell_type": "code",
130 |    "execution_count": 5,
131 |    "metadata": {},
132 |    "outputs": [
133 |     {
134 |      "data": {
135 |       "text/plain": [
136 |        "0.15244796589352247"
137 |       ]
138 |      },
139 |      "execution_count": 5,
140 |      "metadata": {},
141 |      "output_type": "execute_result"
142 |     }
143 |    ],
144 |    "source": [
145 |     "#import math\n",
146 |     "#1/math.log(1+705)"
147 |    ]
148 |   },
149 |   {
150 |    "cell_type": "code",
151 |    "execution_count": null,
152 |    "metadata": {},
153 |    "outputs": [],
154 |    "source": []
155 |   }
156 |  ],
157 |  "metadata": {
158 |   "kernelspec": {
159 |    "display_name": "Python 3",
160 |    "language": "python",
161 |    "name": "python3"
162 |   },
163 |   "language_info": {
164 |    "codemirror_mode": {
165 |     "name": "ipython",
166 |     "version": 3
167 |    },
168 |    "file_extension": ".py",
169 |    "mimetype": "text/x-python",
170 |    "name": "python",
171 |    "nbconvert_exporter": "python",
172 |    "pygments_lexer": "ipython3",
173 |    "version": "3.7.1"
174 |   },
175 |   "toc": {
176 |    "base_numbering": 1,
177 |    "nav_menu": {},
178 |    "number_sections": true,
179 |    "sideBar": true,
180 |    "skip_h1_title": false,
181 |    "title_cell": "Table of Contents",
182 |    "title_sidebar": "Contents",
183 |    "toc_cell": false,
184 |    "toc_position": {},
185 |    "toc_section_display": true,
186 |    "toc_window_display": false
187 |   }
188 |  },
189 |  "nbformat": 4,
190 |  "nbformat_minor": 2
191 | }
192 | 


--------------------------------------------------------------------------------
/Semi-Finals/underline_trainning/Step1 itemCF_based_on_Apriori/6_Sta_for_SparseMatrix.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "## 截断 & 考虑用户频率\n",
  8 |     "\n",
  9 |     "此代码目对应解决方案中对于similarity的改进\n",
 10 |     "\n",
 11 |     "This code is related to the improvement for similarity metrics"
 12 |    ]
 13 |   },
 14 |   {
 15 |    "cell_type": "code",
 16 |    "execution_count": 1,
 17 |    "metadata": {},
 18 |    "outputs": [],
 19 |    "source": [
 20 |     "import pandas as pd\n",
 21 |     "import numpy as np\n",
 22 |     "import time\n",
 23 |     "from scipy.sparse import *\n",
 24 |     "import os\n",
 25 |     "import re"
 26 |    ]
 27 |   },
 28 |   {
 29 |    "cell_type": "code",
 30 |    "execution_count": null,
 31 |    "metadata": {},
 32 |    "outputs": [],
 33 |    "source": []
 34 |   },
 35 |   {
 36 |    "cell_type": "code",
 37 |    "execution_count": null,
 38 |    "metadata": {},
 39 |    "outputs": [],
 40 |    "source": []
 41 |   },
 42 |   {
 43 |    "cell_type": "code",
 44 |    "execution_count": 2,
 45 |    "metadata": {},
 46 |    "outputs": [],
 47 |    "source": [
 48 |     "## 统计每个商品的打分次数（用train）\n",
 49 |     "f = open('hot_items_map.txt', 'r')\n",
 50 |     "rating_times_map = eval(f.read())\n",
 51 |     "f.close()"
 52 |    ]
 53 |   },
 54 |   {
 55 |    "cell_type": "code",
 56 |    "execution_count": null,
 57 |    "metadata": {},
 58 |    "outputs": [],
 59 |    "source": []
 60 |   },
 61 |   {
 62 |    "cell_type": "code",
 63 |    "execution_count": null,
 64 |    "metadata": {},
 65 |    "outputs": [],
 66 |    "source": []
 67 |   },
 68 |   {
 69 |    "cell_type": "code",
 70 |    "execution_count": 3,
 71 |    "metadata": {},
 72 |    "outputs": [
 73 |     {
 74 |      "name": "stdout",
 75 |      "output_type": "stream",
 76 |      "text": [
 77 |       "load file: 6 sec\n",
 78 |       "This batch is finished, time cost: 1 sec\n",
 79 |       "load file: 7 sec\n",
 80 |       "This batch is finished, time cost: 1 sec\n",
 81 |       "load file: 7 sec\n",
 82 |       "This batch is finished, time cost: 1 sec\n",
 83 |       "load file: 7 sec\n",
 84 |       "This batch is finished, time cost: 1 sec\n",
 85 |       "load file: 8 sec\n",
 86 |       "This batch is finished, time cost: 1 sec\n",
 87 |       "load file: 8 sec\n",
 88 |       "This batch is finished, time cost: 1 sec\n",
 89 |       "load file: 8 sec\n",
 90 |       "This batch is finished, time cost: 1 sec\n",
 91 |       "load file: 8 sec\n",
 92 |       "This batch is finished, time cost: 1 sec\n",
 93 |       "load file: 8 sec\n",
 94 |       "This batch is finished, time cost: 1 sec\n",
 95 |       "load file: 7 sec\n",
 96 |       "This batch is finished, time cost: 1 sec\n",
 97 |       "load file: 8 sec\n",
 98 |       "This batch is finished, time cost: 1 sec\n",
 99 |       "load file: 8 sec\n",
100 |       "This batch is finished, time cost: 1 sec\n"
101 |      ]
102 |     }
103 |    ],
104 |    "source": [
105 |     "item_dict = {}\n",
106 |     "\n",
107 |     "for name in os.listdir('common_dense_valued_small/'):\n",
108 |     "    start_time = time.time()\n",
109 |     "    f = open('common_dense_valued_small/' + name, 'r')\n",
110 |     "    l = f.read()\n",
111 |     "    l = eval(l)\n",
112 |     "    f.close()\n",
113 |     "    end_time = time.time()\n",
114 |     "    print('load file: %d sec'%((end_time - start_time)))    \n",
115 |     "    \n",
116 |     "    name = re.findall(r'\\d+', name)\n",
117 |     "    start = int(name[0])\n",
118 |     "    end = int(name[1])\n",
119 |     "\n",
120 |     "    \n",
121 |     "    start_time = time.time()\n",
122 |     "    \n",
123 |     "    \n",
124 |     "    for i in range(start, end):\n",
125 |     "        tmp_list = []\n",
126 |     "        [tmp_list.append( (x[0], round(x[1] / rating_times_map[i], 4) ) ) for x in l[i - start] if x[0] != i]\n",
127 |     "        if len(tmp_list) > 0:\n",
128 |     "            item_dict[i] = sorted(tmp_list,key=lambda x:x[1], reverse=True)[:500]\n",
129 |     "            \n",
130 |     "    end_time = time.time()\n",
131 |     "    print('This batch is finished, time cost: %d sec'%((end_time - start_time)))"
132 |    ]
133 |   },
134 |   {
135 |    "cell_type": "code",
136 |    "execution_count": 4,
137 |    "metadata": {},
138 |    "outputs": [
139 |     {
140 |      "data": {
141 |       "text/plain": [
142 |        "440137"
143 |       ]
144 |      },
145 |      "execution_count": 4,
146 |      "metadata": {},
147 |      "output_type": "execute_result"
148 |     }
149 |    ],
150 |    "source": [
151 |     "len(item_dict)"
152 |    ]
153 |   },
154 |   {
155 |    "cell_type": "code",
156 |    "execution_count": null,
157 |    "metadata": {},
158 |    "outputs": [],
159 |    "source": []
160 |   },
161 |   {
162 |    "cell_type": "code",
163 |    "execution_count": null,
164 |    "metadata": {},
165 |    "outputs": [],
166 |    "source": []
167 |   },
168 |   {
169 |    "cell_type": "code",
170 |    "execution_count": 5,
171 |    "metadata": {},
172 |    "outputs": [],
173 |    "source": [
174 |     "f = open('item_Apriori.txt','w')\n",
175 |     "f.write(str(item_dict))\n",
176 |     "f.close()"
177 |    ]
178 |   }
179 |  ],
180 |  "metadata": {
181 |   "kernelspec": {
182 |    "display_name": "Python 3",
183 |    "language": "python",
184 |    "name": "python3"
185 |   },
186 |   "language_info": {
187 |    "codemirror_mode": {
188 |     "name": "ipython",
189 |     "version": 3
190 |    },
191 |    "file_extension": ".py",
192 |    "mimetype": "text/x-python",
193 |    "name": "python",
194 |    "nbconvert_exporter": "python",
195 |    "pygments_lexer": "ipython3",
196 |    "version": "3.7.1"
197 |   },
198 |   "toc": {
199 |    "base_numbering": 1,
200 |    "nav_menu": {},
201 |    "number_sections": true,
202 |    "sideBar": true,
203 |    "skip_h1_title": false,
204 |    "title_cell": "Table of Contents",
205 |    "title_sidebar": "Contents",
206 |    "toc_cell": false,
207 |    "toc_position": {},
208 |    "toc_section_display": true,
209 |    "toc_window_display": false
210 |   }
211 |  },
212 |  "nbformat": 4,
213 |  "nbformat_minor": 2
214 | }
215 | 


--------------------------------------------------------------------------------
/Semi-Finals/underline_trainning/Step1 itemCF_based_on_Apriori/7_generate_recall.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "import pandas as pd\n",
 10 |     "import numpy as np"
 11 |    ]
 12 |   },
 13 |   {
 14 |    "cell_type": "code",
 15 |    "execution_count": 2,
 16 |    "metadata": {},
 17 |    "outputs": [],
 18 |    "source": [
 19 |     "def load_data(path):\n",
 20 |     "    user = pd.read_csv(path + 'user.csv',header=None)\n",
 21 |     "    item = pd.read_csv(path + 'item.csv',header=None)\n",
 22 |     "    data = pd.read_csv(path + 'user_behavior.csv',header=None)\n",
 23 |     "\n",
 24 |     "    data.columns = ['userID','itemID','behavior','timestamp']\n",
 25 |     "    data['day'] = data['timestamp'] // 86400\n",
 26 |     "    data['hour'] = data['timestamp'] // 3600 % 24\n",
 27 |     "\n",
 28 |     "    ## 生成behavior的加权\n",
 29 |     "    data['day_hour'] = data['day'] + data['hour'] / float(24)\n",
 30 |     "    data.loc[data['behavior']=='pv','behavior'] = 1\n",
 31 |     "    data.loc[data['behavior']=='fav','behavior'] = 2\n",
 32 |     "    data.loc[data['behavior']=='cart','behavior'] = 3\n",
 33 |     "    data.loc[data['behavior']=='buy','behavior'] = 1\n",
 34 |     "    max_day = max(data['day'])\n",
 35 |     "    min_day = min(data['day'])\n",
 36 |     "    data['behavior'] = (1 - (max_day-data['day_hour']+2)/(max_day-min_day+2)) * data['behavior'] \n",
 37 |     "\n",
 38 |     "    item.columns = ['itemID','category','shop','brand']\n",
 39 |     "    user.columns = ['userID','sex','age','ability']\n",
 40 |     "\n",
 41 |     "    data = pd.merge(left=data, right=item, on='itemID',how='left')\n",
 42 |     "    data = pd.merge(left=data, right=user, on='userID',how='left')\n",
 43 |     "\n",
 44 |     "    return user, item, data\n",
 45 |     "    \n",
 46 |     "def get_unique_inorder(x, k=50):\n",
 47 |     "    result = []\n",
 48 |     "    flag = set()\n",
 49 |     "    for i in x:\n",
 50 |     "        if i[0] not in flag:\n",
 51 |     "            result.append(i)\n",
 52 |     "            flag.add(i[0])\n",
 53 |     "        if len(flag) > k:\n",
 54 |     "            break\n",
 55 |     "    return result\n",
 56 |     "\n",
 57 |     "def get_recall_list(train, targetDay, k=300):\n",
 58 |     "    train_logs = dict()\n",
 59 |     "    \n",
 60 |     "    if targetDay > max(train['day']):\n",
 61 |     "        for row in train[['userID','itemID','behavior']].values:\n",
 62 |     "            train_logs.setdefault(row[0], dict())\n",
 63 |     "            if row[1] in upward_map:\n",
 64 |     "                train_logs[row[0]].setdefault(upward_map[row[1]],0)\n",
 65 |     "                train_logs[row[0]][upward_map[row[1]]] = max(train_logs[row[0]][upward_map[row[1]]],row[2])\n",
 66 |     "    else:\n",
 67 |     "        user_List_test = set(train.loc[train['day']==targetDay,'userID'])\n",
 68 |     "        train = train[train['day'] < targetDay]\n",
 69 |     "        \n",
 70 |     "        for row in train[['userID','itemID','behavior']].values:\n",
 71 |     "            if row[0] in user_List_test:\n",
 72 |     "                train_logs.setdefault(row[0], dict())\n",
 73 |     "                if row[1] in upward_map:\n",
 74 |     "                    train_logs[row[0]].setdefault(upward_map[row[1]],0)\n",
 75 |     "                    train_logs[row[0]][upward_map[row[1]]] = max(train_logs[row[0]][upward_map[row[1]]],row[2])\n",
 76 |     "\n",
 77 |     "    for each_user in train_logs:\n",
 78 |     "        sum_value = sum(train_logs[each_user].values())\n",
 79 |     "        if sum_value > 0:\n",
 80 |     "            for each_item in train_logs[each_user]:\n",
 81 |     "                train_logs[each_user][each_item] /= sum_value            \n",
 82 |     "\n",
 83 |     "    result_logs = dict()    \n",
 84 |     "    for u in train_logs:\n",
 85 |     "        result_logs.setdefault(u, list())\n",
 86 |     "        for i in set(train_logs[u].keys()):\n",
 87 |     "            if i in item_dict:\n",
 88 |     "                tmp_list = [ (x[0], train_logs[u][i]*x[1]) for x in item_dict[i]]\n",
 89 |     "                result_logs[u] += tmp_list\n",
 90 |     "            \n",
 91 |     "    for u in result_logs:\n",
 92 |     "        result_logs[u] = get_unique_inorder([(downward_map[x[0]], x[1]) for x in sorted(result_logs[u], key=lambda x:x[1], reverse=True)\n",
 93 |     "                          if x[0] not in train_logs[u]], k=300)  \n",
 94 |     "    \n",
 95 |     "    return result_logs\n",
 96 |     "\n",
 97 |     "\n",
 98 |     "def generate_pairs(recall):\n",
 99 |     "    result = []\n",
100 |     "    for u in recall:\n",
101 |     "        for i in recall[u]:\n",
102 |     "            result.append([u,i[0],i[1]])\n",
103 |     "    return result\n",
104 |     "\n",
105 |     "def reshape_recall_to_dataframe(recall):\n",
106 |     "    result = generate_pairs(recall)\n",
107 |     "    result = pd.DataFrame(result)\n",
108 |     "    result.columns = ['userID','itemID','apriori']\n",
109 |     "    return result\n"
110 |    ]
111 |   },
112 |   {
113 |    "cell_type": "code",
114 |    "execution_count": null,
115 |    "metadata": {},
116 |    "outputs": [],
117 |    "source": [
118 |     "#path = './'\n",
119 |     "path = '../ECommAI_EUIR_round2_train_20190816/'\n",
120 |     "\n",
121 |     "## The target date(16 means online, 15 means underline test, 14 means underline train)\n",
122 |     "targetday = 15\n",
123 |     "\n",
124 |     "## The lenth of recall list, the default is 300\n",
125 |     "lenth = 300\n",
126 |     "\n",
127 |     "## The name of generated recall file\n",
128 |     "\n",
129 |     "name = 'recall_list_round2_%dday_%dlenth.csv'%(targetday, lenth)\n",
130 |     "\n",
131 |     "\n",
132 |     "user, item, data = load_data(path = path)   \n",
133 |     "\n",
134 |     "#tempory_path = './tempory_file/'\n",
135 |     "tempory_path = './'\n",
136 |     "f = open(tempory_path + 'upward_map.txt','r')\n",
137 |     "\n",
138 |     "upward_map = f.read()\n",
139 |     "upward_map = eval(upward_map)\n",
140 |     "f.close()\n",
141 |     "    \n",
142 |     "f = open(tempory_path + 'downward_map.txt','r')\n",
143 |     "downward_map = f.read()\n",
144 |     "downward_map = eval(downward_map)\n",
145 |     "f.close()\n",
146 |     "\n",
147 |     "f = open(tempory_path + 'item_Apriori.txt','r')\n",
148 |     "tmp = f.read()\n",
149 |     "item_dict = eval(tmp)\n",
150 |     "f.close()"
151 |    ]
152 |   },
153 |   {
154 |    "cell_type": "markdown",
155 |    "metadata": {},
156 |    "source": [
157 |     "注意，下面这一行代码是在256GB内存的机器上运行的。如果你的机器配置不够256GB，为你提供两种解决方案：\n",
158 |     "1. 使用最近日期的数据运行，依然会占用较大内存，对结果有一定影响：\n",
159 |     "\n",
160 |     "          data = data[data['day'] >= K] ##这里K设置为12/13等靠后的天数\n",
161 |     "          \n",
162 |     "2. 【推荐】在Step 6_Sta_for_SparseMatrix.ipynb 中把下面代码中的参数500调小为300，对结果影响较小，内存需求很小：\n",
163 |     "\n",
164 |     "          item_dict[i] = sorted(tmp_list,key=lambda x:x[1], reverse=True)[:500]\n",
165 |     "          \n",
166 |     "当然这部分是完全可以在不减少精度的情况下批处理优化的，但是由于当时比赛时间不够没有优化，而且有大内存机器。"
167 |    ]
168 |   },
169 |   {
170 |    "cell_type": "code",
171 |    "execution_count": 4,
172 |    "metadata": {},
173 |    "outputs": [],
174 |    "source": [
175 |     "recall_logs = get_recall_list(data, targetDay=targetday, k=lenth)"
176 |    ]
177 |   },
178 |   {
179 |    "cell_type": "code",
180 |    "execution_count": 5,
181 |    "metadata": {},
182 |    "outputs": [],
183 |    "source": [
184 |     "recall_df = reshape_recall_to_dataframe(recall_logs)"
185 |    ]
186 |   },
187 |   {
188 |    "cell_type": "code",
189 |    "execution_count": 6,
190 |    "metadata": {},
191 |    "outputs": [],
192 |    "source": [
193 |     "temp = pd.merge(left=recall_df, right=data[data['day'] == targetday][['userID','itemID','behavior']], \n",
194 |     "         on=['userID','itemID'], how='left').rename(columns={'behavior':'label'})"
195 |    ]
196 |   },
197 |   {
198 |    "cell_type": "code",
199 |    "execution_count": null,
200 |    "metadata": {},
201 |    "outputs": [],
202 |    "source": [
203 |     "len(set(recall_df['userID']) & set(data[data['day'] == targetday]['userID']))"
204 |    ]
205 |   },
206 |   {
207 |    "cell_type": "code",
208 |    "execution_count": null,
209 |    "metadata": {},
210 |    "outputs": [],
211 |    "source": [
212 |     "len(set(recall_df['userID']))"
213 |    ]
214 |   },
215 |   {
216 |    "cell_type": "code",
217 |    "execution_count": null,
218 |    "metadata": {},
219 |    "outputs": [],
220 |    "source": [
221 |     "recall_df.to_csv(name, index=False)"
222 |    ]
223 |   }
224 |  ],
225 |  "metadata": {
226 |   "kernelspec": {
227 |    "display_name": "Python 3",
228 |    "language": "python",
229 |    "name": "python3"
230 |   },
231 |   "language_info": {
232 |    "codemirror_mode": {
233 |     "name": "ipython",
234 |     "version": 3
235 |    },
236 |    "file_extension": ".py",
237 |    "mimetype": "text/x-python",
238 |    "name": "python",
239 |    "nbconvert_exporter": "python",
240 |    "pygments_lexer": "ipython3",
241 |    "version": "3.7.2"
242 |   }
243 |  },
244 |  "nbformat": 4,
245 |  "nbformat_minor": 2
246 | }
247 | 


--------------------------------------------------------------------------------
/Semi-Finals/underline_trainning/Step2 Generate_feature_for_Ranking/1_generate_static_features.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "import pandas as pd\n",
 10 |     "import numpy as np"
 11 |    ]
 12 |   },
 13 |   {
 14 |    "cell_type": "code",
 15 |    "execution_count": 2,
 16 |    "metadata": {},
 17 |    "outputs": [],
 18 |    "source": [
 19 |     "def reduce_mem_usage(df):\n",
 20 |     "    \"\"\" iterate through all the columns of a dataframe and modify the data type\n",
 21 |     "        to reduce memory usage.        \n",
 22 |     "    \"\"\"\n",
 23 |     "    start_mem = df.memory_usage().sum() \n",
 24 |     "    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))\n",
 25 |     "    \n",
 26 |     "    for col in df.columns:\n",
 27 |     "        col_type = df[col].dtype\n",
 28 |     "        \n",
 29 |     "        if col_type != object:\n",
 30 |     "            c_min = df[col].min()\n",
 31 |     "            c_max = df[col].max()\n",
 32 |     "            if str(col_type)[:3] == 'int':\n",
 33 |     "                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:\n",
 34 |     "                    df[col] = df[col].astype(np.int8)\n",
 35 |     "                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:\n",
 36 |     "                    df[col] = df[col].astype(np.int16)\n",
 37 |     "                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:\n",
 38 |     "                    df[col] = df[col].astype(np.int32)\n",
 39 |     "                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:\n",
 40 |     "                    df[col] = df[col].astype(np.int64)  \n",
 41 |     "            else:\n",
 42 |     "                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:\n",
 43 |     "                    df[col] = df[col].astype(np.float16)\n",
 44 |     "                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:\n",
 45 |     "                    df[col] = df[col].astype(np.float32)\n",
 46 |     "                else:\n",
 47 |     "                    df[col] = df[col].astype(np.float64)\n",
 48 |     "        else:\n",
 49 |     "            df[col] = df[col].astype('category')\n",
 50 |     "\n",
 51 |     "    end_mem = df.memory_usage().sum() \n",
 52 |     "    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))\n",
 53 |     "    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))\n",
 54 |     "    \n",
 55 |     "    return df\n",
 56 |     "\n",
 57 |     "def load_data(path):\n",
 58 |     "    user = reduce_mem_usage(pd.read_csv(path + 'user.csv',header=None))\n",
 59 |     "    item = reduce_mem_usage(pd.read_csv(path + 'item.csv',header=None))\n",
 60 |     "    data = pd.read_csv(path + 'user_behavior.csv',header=None)\n",
 61 |     "\n",
 62 |     "    data.columns = ['userID','itemID','behavior','timestamp']\n",
 63 |     "    data['day'] = data['timestamp'] // 86400\n",
 64 |     "    data['hour'] = data['timestamp'] // 3600 % 24\n",
 65 |     "    \n",
 66 |     "    ## 生成behavior的onehot\n",
 67 |     "    for i in ['pv','fav','cart','buy']:\n",
 68 |     "        data[i] = 0\n",
 69 |     "        data.loc[data['behavior'] == i, i] = 1\n",
 70 |     "\n",
 71 |     "    ## 生成behavior的加权\n",
 72 |     "    \n",
 73 |     "    data['day_hour'] = data['day'] + data['hour'] / float(24)\n",
 74 |     "    data.loc[data['behavior']=='pv','behavior'] = 1\n",
 75 |     "    data.loc[data['behavior']=='fav','behavior'] = 2\n",
 76 |     "    data.loc[data['behavior']=='cart','behavior'] = 3\n",
 77 |     "    data.loc[data['behavior']=='buy','behavior'] = 1\n",
 78 |     "    max_day = max(data['day'])\n",
 79 |     "    min_day = min(data['day'])\n",
 80 |     "    data['behavior'] = (1 - (max_day-data['day_hour']+2)/(max_day-min_day+2)) * data['behavior'] \n",
 81 |     "\n",
 82 |     "    item.columns = ['itemID','category','shop','brand']\n",
 83 |     "    user.columns = ['userID','sex','age','ability']\n",
 84 |     "    \n",
 85 |     "    data = reduce_mem_usage(data)\n",
 86 |     "\n",
 87 |     "    data = pd.merge(left=data, right=item, on='itemID',how='left')\n",
 88 |     "    data = pd.merge(left=data, right=user, on='userID',how='left')\n",
 89 |     "\n",
 90 |     "    return user, item, data\n",
 91 |     "    "
 92 |    ]
 93 |   },
 94 |   {
 95 |    "cell_type": "code",
 96 |    "execution_count": 3,
 97 |    "metadata": {},
 98 |    "outputs": [
 99 |     {
100 |      "name": "stdout",
101 |      "output_type": "stream",
102 |      "text": [
103 |       "Memory usage of dataframe is 44702560.00 MB\n",
104 |       "Memory usage after optimization is: 9778785.00 MB\n",
105 |       "Decreased by 78.1%\n",
106 |       "Memory usage of dataframe is 138182592.00 MB\n",
107 |       "Memory usage after optimization is: 60454956.00 MB\n",
108 |       "Decreased by 56.2%\n",
109 |       "Memory usage of dataframe is 7081839904.00 MB\n",
110 |       "Memory usage after optimization is: 1770460072.00 MB\n",
111 |       "Decreased by 75.0%\n"
112 |      ]
113 |     }
114 |    ],
115 |    "source": [
116 |     "path = '../ECommAI_EUIR_round2_train_20190816/'\n",
117 |     "\n",
118 |     "user, item, data = load_data(path = path)"
119 |    ]
120 |   },
121 |   {
122 |    "cell_type": "code",
123 |    "execution_count": null,
124 |    "metadata": {
125 |     "scrolled": true
126 |    },
127 |    "outputs": [],
128 |    "source": [
129 |     "for count_feature in ['itemID', 'shop', 'category','brand']:\n",
130 |     "    data[['behavior', count_feature]].groupby(count_feature, as_index=False).agg(\n",
131 |     "        {'behavior':'count'}).rename(columns={'behavior':count_feature + '_count'}).to_csv(str(count_feature)+'_count.csv', index=False)\n",
132 |     "\n",
133 |     "for count_feature in ['itemID', 'shop', 'category','brand']:\n",
134 |     "    data[['behavior', count_feature]].groupby(count_feature, as_index=False).agg(\n",
135 |     "        {'behavior':'sum'}).rename(columns={'behavior':count_feature + '_sum'}).to_csv(str(count_feature)+'_sum.csv', index=False)"
136 |    ]
137 |   },
138 |   {
139 |    "cell_type": "code",
140 |    "execution_count": null,
141 |    "metadata": {},
142 |    "outputs": [],
143 |    "source": [
144 |     "temp = data[['behavior','category']].groupby('category', as_index=False).agg({'behavior': ['median','std','skew']})\n",
145 |     "temp.columns = ['category','category_median','category_std','category_skew']\n",
146 |     "\n",
147 |     "temp.to_csv('category_higher.csv',index=False)"
148 |    ]
149 |   },
150 |   {
151 |    "cell_type": "code",
152 |    "execution_count": null,
153 |    "metadata": {},
154 |    "outputs": [],
155 |    "source": [
156 |     "temp = data[['behavior','itemID']].groupby('itemID', as_index=False).agg({'behavior': ['median','std','skew']})\n",
157 |     "temp.columns = ['itemID','itemID_median','itemID_std','itemID_skew']\n",
158 |     "\n",
159 |     "temp.to_csv('itemID_higher.csv',index=False)"
160 |    ]
161 |   },
162 |   {
163 |    "cell_type": "code",
164 |    "execution_count": 4,
165 |    "metadata": {},
166 |    "outputs": [],
167 |    "source": [
168 |     "data['age'] = data['age'] // 10\n",
169 |     "train = data[data['day'] < 15]"
170 |    ]
171 |   },
172 |   {
173 |    "cell_type": "code",
174 |    "execution_count": 8,
175 |    "metadata": {},
176 |    "outputs": [],
177 |    "source": [
178 |     "## 注意 这个要生成一个underline版本和一个online版本\n",
179 |     "for count_feature in ['sex','ability','age']:\n",
180 |     "    data[['behavior','itemID',count_feature]].groupby(['itemID', count_feature], as_index=False).agg(\n",
181 |     "        {'behavior': 'count'}).rename(columns={'behavior':'user_to_'\n",
182 |     "                                               + count_feature + '_count'}).to_csv('item_to_' + str(count_feature)+'_count_online.csv', index=False)"
183 |    ]
184 |   },
185 |   {
186 |    "cell_type": "code",
187 |    "execution_count": null,
188 |    "metadata": {},
189 |    "outputs": [],
190 |    "source": [
191 |     "itemcount = pd.read_csv('itemID_count.csv')"
192 |    ]
193 |   },
194 |   {
195 |    "cell_type": "code",
196 |    "execution_count": null,
197 |    "metadata": {},
198 |    "outputs": [],
199 |    "source": [
200 |     "temp = pd.merge(left=item, right=itemcount, how='left', on='itemID')"
201 |    ]
202 |   },
203 |   {
204 |    "cell_type": "code",
205 |    "execution_count": null,
206 |    "metadata": {},
207 |    "outputs": [],
208 |    "source": [
209 |     "item_rank = []\n",
210 |     "for eachcat in temp.groupby('category'):\n",
211 |     "    each_df = eachcat[1].sort_values('itemID_count', ascending=False).reset_index(drop=True)\n",
212 |     "    each_df['rank'] = each_df.index + 1\n",
213 |     "    lenth = each_df.shape[0]\n",
214 |     "    each_df['rank_percent'] = (each_df.index + 1) / lenth\n",
215 |     "    item_rank.append(each_df[['itemID','rank','rank_percent']])"
216 |    ]
217 |   },
218 |   {
219 |    "cell_type": "code",
220 |    "execution_count": null,
221 |    "metadata": {},
222 |    "outputs": [],
223 |    "source": [
224 |     "item_rank = pd.concat(item_rank, sort=False)"
225 |    ]
226 |   },
227 |   {
228 |    "cell_type": "code",
229 |    "execution_count": null,
230 |    "metadata": {},
231 |    "outputs": [],
232 |    "source": [
233 |     "item_rank.to_csv('item_rank.csv',index=False)"
234 |    ]
235 |   },
236 |   {
237 |    "cell_type": "code",
238 |    "execution_count": null,
239 |    "metadata": {},
240 |    "outputs": [],
241 |    "source": [
242 |     "def unique_count(x):\n",
243 |     "    return len(set(x))"
244 |    ]
245 |   },
246 |   {
247 |    "cell_type": "code",
248 |    "execution_count": null,
249 |    "metadata": {},
250 |    "outputs": [],
251 |    "source": [
252 |     "cat1 = item.groupby('category',as_index=False).agg({'itemID': unique_count}).rename(columns={'itemID':'itemnum_undercat'})"
253 |    ]
254 |   },
255 |   {
256 |    "cell_type": "code",
257 |    "execution_count": null,
258 |    "metadata": {},
259 |    "outputs": [],
260 |    "source": [
261 |     "cat2 = item.groupby('category',as_index=False).agg({'brand': unique_count}).rename(columns={'brand':'brandnum_undercat'})"
262 |    ]
263 |   },
264 |   {
265 |    "cell_type": "code",
266 |    "execution_count": null,
267 |    "metadata": {},
268 |    "outputs": [],
269 |    "source": [
270 |     "cat3 = item.groupby('category',as_index=False).agg({'shop': unique_count}).rename(columns={'shop':'shopnum_undercat'})"
271 |    ]
272 |   },
273 |   {
274 |    "cell_type": "code",
275 |    "execution_count": null,
276 |    "metadata": {},
277 |    "outputs": [],
278 |    "source": [
279 |     "pd.concat([cat1, cat2[['brandnum_undercat']], cat3[['shopnum_undercat']]], axis=1).to_csv('category_lower.csv',index=False)"
280 |    ]
281 |   }
282 |  ],
283 |  "metadata": {
284 |   "kernelspec": {
285 |    "display_name": "Python 3",
286 |    "language": "python",
287 |    "name": "python3"
288 |   },
289 |   "language_info": {
290 |    "codemirror_mode": {
291 |     "name": "ipython",
292 |     "version": 3
293 |    },
294 |    "file_extension": ".py",
295 |    "mimetype": "text/x-python",
296 |    "name": "python",
297 |    "nbconvert_exporter": "python",
298 |    "pygments_lexer": "ipython3",
299 |    "version": "3.7.3"
300 |   }
301 |  },
302 |  "nbformat": 4,
303 |  "nbformat_minor": 2
304 | }
305 | 


--------------------------------------------------------------------------------
/Semi-Finals/underline_trainning/Step2 Generate_feature_for_Ranking/2_generate_dynamic_feature.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "import pandas as pd\n",
 10 |     "import numpy as np"
 11 |    ]
 12 |   },
 13 |   {
 14 |    "cell_type": "code",
 15 |    "execution_count": 2,
 16 |    "metadata": {},
 17 |    "outputs": [],
 18 |    "source": [
 19 |     "def reduce_mem_usage(df):\n",
 20 |     "    \"\"\" iterate through all the columns of a dataframe and modify the data type\n",
 21 |     "        to reduce memory usage.        \n",
 22 |     "    \"\"\"\n",
 23 |     "    start_mem = df.memory_usage().sum() \n",
 24 |     "    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))\n",
 25 |     "    \n",
 26 |     "    for col in df.columns:\n",
 27 |     "        col_type = df[col].dtype\n",
 28 |     "        \n",
 29 |     "        if col_type != object:\n",
 30 |     "            c_min = df[col].min()\n",
 31 |     "            c_max = df[col].max()\n",
 32 |     "            if str(col_type)[:3] == 'int':\n",
 33 |     "                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:\n",
 34 |     "                    df[col] = df[col].astype(np.int8)\n",
 35 |     "                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:\n",
 36 |     "                    df[col] = df[col].astype(np.int16)\n",
 37 |     "                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:\n",
 38 |     "                    df[col] = df[col].astype(np.int32)\n",
 39 |     "                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:\n",
 40 |     "                    df[col] = df[col].astype(np.int64)  \n",
 41 |     "            else:\n",
 42 |     "                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:\n",
 43 |     "                    df[col] = df[col].astype(np.float16)\n",
 44 |     "                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:\n",
 45 |     "                    df[col] = df[col].astype(np.float32)\n",
 46 |     "                else:\n",
 47 |     "                    df[col] = df[col].astype(np.float64)\n",
 48 |     "        else:\n",
 49 |     "            df[col] = df[col].astype('category')\n",
 50 |     "\n",
 51 |     "    end_mem = df.memory_usage().sum() \n",
 52 |     "    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))\n",
 53 |     "    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))\n",
 54 |     "    \n",
 55 |     "    return df\n",
 56 |     "\n",
 57 |     "def load_data(path):\n",
 58 |     "    user = reduce_mem_usage(pd.read_csv(path + 'user.csv',header=None))\n",
 59 |     "    item = reduce_mem_usage(pd.read_csv(path + 'item.csv',header=None))\n",
 60 |     "    data = pd.read_csv(path + 'user_behavior.csv',header=None)\n",
 61 |     "\n",
 62 |     "    data.columns = ['userID','itemID','behavior','timestamp']\n",
 63 |     "    data['day'] = data['timestamp'] // 86400\n",
 64 |     "    data['hour'] = data['timestamp'] // 3600 % 24\n",
 65 |     "    \n",
 66 |     "    ## 生成behavior的onehot\n",
 67 |     "    for i in ['pv','fav','cart','buy']:\n",
 68 |     "        data[i] = 0\n",
 69 |     "        data.loc[data['behavior'] == i, i] = 1\n",
 70 |     "\n",
 71 |     "    ## 生成behavior的加权\n",
 72 |     "    \n",
 73 |     "    data['day_hour'] = data['day'] + data['hour'] / float(24)\n",
 74 |     "    data.loc[data['behavior']=='pv','behavior'] = 1\n",
 75 |     "    data.loc[data['behavior']=='fav','behavior'] = 2\n",
 76 |     "    data.loc[data['behavior']=='cart','behavior'] = 3\n",
 77 |     "    data.loc[data['behavior']=='buy','behavior'] = 1\n",
 78 |     "    max_day = max(data['day'])\n",
 79 |     "    min_day = min(data['day'])\n",
 80 |     "    data['behavior'] = (1 - (max_day-data['day_hour']+2)/(max_day-min_day+2)) * data['behavior'] \n",
 81 |     "\n",
 82 |     "    item.columns = ['itemID','category','shop','brand']\n",
 83 |     "    user.columns = ['userID','sex','age','ability']\n",
 84 |     "    \n",
 85 |     "    data = reduce_mem_usage(data)\n",
 86 |     "\n",
 87 |     "    data = pd.merge(left=data, right=item, on='itemID',how='left')\n",
 88 |     "    data = pd.merge(left=data, right=user, on='userID',how='left')\n",
 89 |     "\n",
 90 |     "    return user, item, data\n",
 91 |     "    "
 92 |    ]
 93 |   },
 94 |   {
 95 |    "cell_type": "code",
 96 |    "execution_count": null,
 97 |    "metadata": {},
 98 |    "outputs": [
 99 |     {
100 |      "name": "stdout",
101 |      "output_type": "stream",
102 |      "text": [
103 |       "Memory usage of dataframe is 44702560.00 MB\n",
104 |       "Memory usage after optimization is: 9778785.00 MB\n",
105 |       "Decreased by 78.1%\n",
106 |       "Memory usage of dataframe is 138182592.00 MB\n",
107 |       "Memory usage after optimization is: 60454956.00 MB\n",
108 |       "Decreased by 56.2%\n",
109 |       "Memory usage of dataframe is 7081839904.00 MB\n",
110 |       "Memory usage after optimization is: 1770460072.00 MB\n",
111 |       "Decreased by 75.0%\n"
112 |      ]
113 |     }
114 |    ],
115 |    "source": [
116 |     "#path = '..\\\\data\\\\'\n",
117 |     "path = '../ECommAI_EUIR_round2_train_20190816/'\n",
118 |     "user, item, data = load_data(path = path)"
119 |    ]
120 |   },
121 |   {
122 |    "cell_type": "code",
123 |    "execution_count": null,
124 |    "metadata": {},
125 |    "outputs": [],
126 |    "source": [
127 |     "train = data[data['day'] < 15]"
128 |    ]
129 |   },
130 |   {
131 |    "cell_type": "code",
132 |    "execution_count": null,
133 |    "metadata": {},
134 |    "outputs": [],
135 |    "source": [
136 |     "train = data[data['day'] < 15]\n",
137 |     "\n",
138 |     "online_features = []\n",
139 |     "for count_feature in ['category','shop','brand']:\n",
140 |     "    train[['behavior','userID',count_feature]].groupby(['userID', count_feature], as_index=False).agg(\n",
141 |     "        {'behavior': 'count'}).rename(columns={'behavior':'user_to_'\n",
142 |     "                                               + count_feature + '_count'}).to_csv('user_to_' + str(count_feature)+'_count.csv', index=False)\n",
143 |     "for count_feature in ['category','shop','brand']:\n",
144 |     "    train[['behavior','userID',count_feature]].groupby(['userID', count_feature], as_index=False).agg(\n",
145 |     "        {'behavior': 'sum'}).rename(columns={'behavior':'user_to_' \n",
146 |     "                                             + count_feature + '_sum'}).to_csv('user_to_' + str(count_feature)+'_sum.csv', index=False)\n",
147 |     "\n",
148 |     "for count_feature in ['category','shop','brand']:\n",
149 |     "    for behavior_type in ['pv','fav','cart','buy']:\n",
150 |     "        train[[behavior_type,'userID',count_feature]].groupby(['userID', count_feature], as_index=False).agg(\n",
151 |     "            {behavior_type: 'sum'}).rename(columns={behavior_type:'user_to_'\n",
152 |     "                                                   + count_feature + '_count_' + behavior_type}).to_csv('user_to_' + str(count_feature) + '_count_' + behavior_type + '.csv', index=False)\n",
153 |     "\n"
154 |    ]
155 |   },
156 |   {
157 |    "cell_type": "code",
158 |    "execution_count": null,
159 |    "metadata": {},
160 |    "outputs": [],
161 |    "source": [
162 |     "yestday = data[data['day'] == 14]\n",
163 |     "\n",
164 |     "for count_feature in ['category','shop','brand']:\n",
165 |     "    yestday[['behavior','userID',count_feature]].groupby(['userID', count_feature], as_index=False).agg(\n",
166 |     "        {'behavior': 'count'}).rename(columns={'behavior':'user_to_'\n",
167 |     "                                               + count_feature + '_count_yestday'}).to_csv('user_to_' + str(count_feature)+'_count_yestday.csv', index=False)\n",
168 |     "\n",
169 |     "for count_feature in ['category','shop','brand']:\n",
170 |     "    for behavior_type in ['pv','fav','cart','buy']:\n",
171 |     "        yestday[[behavior_type,'userID',count_feature]].groupby(['userID', count_feature], as_index=False).agg(\n",
172 |     "            {behavior_type: 'sum'}).rename(columns={behavior_type:'user_to_'\n",
173 |     "                                                   + count_feature + '_count_' + behavior_type+'_yestday'}).to_csv('user_to_' + str(count_feature) + '_count_' + behavior_type + '_yestday.csv', index=False)\n"
174 |    ]
175 |   },
176 |   {
177 |    "cell_type": "code",
178 |    "execution_count": null,
179 |    "metadata": {},
180 |    "outputs": [],
181 |    "source": [
182 |     "a5days = data[(data['day'] > 15 - 5) & (data['day'] < 15)]\n",
183 |     "\n",
184 |     "for count_feature in ['category','shop','brand']:\n",
185 |     "    a5days[['behavior','userID',count_feature]].groupby(['userID', count_feature], as_index=False).agg(\n",
186 |     "        {'behavior': 'count'}).rename(columns={'behavior':'user_to_'\n",
187 |     "                                               + count_feature + '_count_5days'}).to_csv('user_to_' + str(count_feature)+'_count_5days.csv', index=False)\n",
188 |     "\n",
189 |     "for count_feature in ['category','shop','brand']:\n",
190 |     "    for behavior_type in ['pv','fav','cart','buy']:\n",
191 |     "        a5days[[behavior_type,'userID',count_feature]].groupby(['userID', count_feature], as_index=False).agg(\n",
192 |     "            {behavior_type: 'sum'}).rename(columns={behavior_type:'user_to_'\n",
193 |     "                                                   + count_feature + '_count_' + behavior_type+'_5days'}).to_csv('user_to_' + str(count_feature) + '_count_' + behavior_type + '_5days.csv', index=False)\n"
194 |    ]
195 |   },
196 |   {
197 |    "cell_type": "code",
198 |    "execution_count": null,
199 |    "metadata": {},
200 |    "outputs": [],
201 |    "source": [
202 |     "start_timestamp  = max(data[data['day'] < 15]['timestamp'])"
203 |    ]
204 |   },
205 |   {
206 |    "cell_type": "code",
207 |    "execution_count": null,
208 |    "metadata": {},
209 |    "outputs": [],
210 |    "source": [
211 |     "time_features = []\n",
212 |     "test = data[data['day'] < 15]\n",
213 |     "for time_feature in ['shop', 'category','brand']:\n",
214 |     "    time_features.append(test[['last_time','userID',time_feature,'day']].groupby(['userID',time_feature], as_index=False).agg({'last_time': 'min', 'day':'max'}).rename(columns={'last_time': 'user_to_'\n",
215 |     "                                                   + time_feature + '_lasttime', 'day':'user_to_'+ time_feature + '_lastday'}))"
216 |    ]
217 |   },
218 |   {
219 |    "cell_type": "code",
220 |    "execution_count": null,
221 |    "metadata": {},
222 |    "outputs": [],
223 |    "source": [
224 |     "for f in time_features:\n",
225 |     "    f.to_csv(str(f.columns[2])+'.csv', index=False)"
226 |    ]
227 |   },
228 |   {
229 |    "cell_type": "code",
230 |    "execution_count": null,
231 |    "metadata": {},
232 |    "outputs": [],
233 |    "source": [
234 |     "for f in time_features:\n",
235 |     "    print(str(f.columns[2])+'.csv')"
236 |    ]
237 |   },
238 |   {
239 |    "cell_type": "code",
240 |    "execution_count": null,
241 |    "metadata": {},
242 |    "outputs": [],
243 |    "source": [
244 |     "for count_feature in ['sex','ability','age']:\n",
245 |     "    train[['behavior','itemID',count_feature]].groupby(['itemID', count_feature], as_index=False).agg(\n",
246 |     "        {'behavior': 'count'}).rename(columns={'behavior':'user_to_'\n",
247 |     "                                               + count_feature + '_count'}).to_csv('item_to_' + str(count_feature)+'_count.csv', index=False)"
248 |    ]
249 |   }
250 |  ],
251 |  "metadata": {
252 |   "kernelspec": {
253 |    "display_name": "Python 3",
254 |    "language": "python",
255 |    "name": "python3"
256 |   },
257 |   "language_info": {
258 |    "codemirror_mode": {
259 |     "name": "ipython",
260 |     "version": 3
261 |    },
262 |    "file_extension": ".py",
263 |    "mimetype": "text/x-python",
264 |    "name": "python",
265 |    "nbconvert_exporter": "python",
266 |    "pygments_lexer": "ipython3",
267 |    "version": "3.7.3"
268 |   }
269 |  },
270 |  "nbformat": 4,
271 |  "nbformat_minor": 2
272 | }
273 | 


--------------------------------------------------------------------------------
/Semi-Finals/underline_trainning/Step2 Generate_feature_for_Ranking/3_generate_time_feature.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "import pandas as pd\n",
 10 |     "import numpy as np"
 11 |    ]
 12 |   },
 13 |   {
 14 |    "cell_type": "code",
 15 |    "execution_count": 2,
 16 |    "metadata": {},
 17 |    "outputs": [],
 18 |    "source": [
 19 |     "def reduce_mem_usage(df):\n",
 20 |     "    \"\"\" iterate through all the columns of a dataframe and modify the data type\n",
 21 |     "        to reduce memory usage.        \n",
 22 |     "    \"\"\"\n",
 23 |     "    start_mem = df.memory_usage().sum() \n",
 24 |     "    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))\n",
 25 |     "    \n",
 26 |     "    for col in df.columns:\n",
 27 |     "        col_type = df[col].dtype\n",
 28 |     "        \n",
 29 |     "        if col_type != object:\n",
 30 |     "            c_min = df[col].min()\n",
 31 |     "            c_max = df[col].max()\n",
 32 |     "            if str(col_type)[:3] == 'int':\n",
 33 |     "                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:\n",
 34 |     "                    df[col] = df[col].astype(np.int8)\n",
 35 |     "                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:\n",
 36 |     "                    df[col] = df[col].astype(np.int16)\n",
 37 |     "                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:\n",
 38 |     "                    df[col] = df[col].astype(np.int32)\n",
 39 |     "                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:\n",
 40 |     "                    df[col] = df[col].astype(np.int64)  \n",
 41 |     "            else:\n",
 42 |     "                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:\n",
 43 |     "                    df[col] = df[col].astype(np.float16)\n",
 44 |     "                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:\n",
 45 |     "                    df[col] = df[col].astype(np.float32)\n",
 46 |     "                else:\n",
 47 |     "                    df[col] = df[col].astype(np.float64)\n",
 48 |     "        else:\n",
 49 |     "            df[col] = df[col].astype('category')\n",
 50 |     "\n",
 51 |     "    end_mem = df.memory_usage().sum() \n",
 52 |     "    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))\n",
 53 |     "    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))\n",
 54 |     "    \n",
 55 |     "    return df\n",
 56 |     "\n",
 57 |     "def load_data(path):\n",
 58 |     "    user = reduce_mem_usage(pd.read_csv(path + 'user.csv',header=None))\n",
 59 |     "    item = reduce_mem_usage(pd.read_csv(path + 'item.csv',header=None))\n",
 60 |     "    data = pd.read_csv(path + 'user_behavior.csv',header=None)\n",
 61 |     "\n",
 62 |     "    data.columns = ['userID','itemID','behavior','timestamp']\n",
 63 |     "    data['day'] = data['timestamp'] // 86400\n",
 64 |     "    data['hour'] = data['timestamp'] // 3600 % 24\n",
 65 |     "    \n",
 66 |     "    ## 生成behavior的onehot\n",
 67 |     "    for i in ['pv','fav','cart','buy']:\n",
 68 |     "        data[i] = 0\n",
 69 |     "        data.loc[data['behavior'] == i, i] = 1\n",
 70 |     "\n",
 71 |     "    ## 生成behavior的加权\n",
 72 |     "    \n",
 73 |     "    data['day_hour'] = data['day'] + data['hour'] / float(24)\n",
 74 |     "    data.loc[data['behavior']=='pv','behavior'] = 1\n",
 75 |     "    data.loc[data['behavior']=='fav','behavior'] = 2\n",
 76 |     "    data.loc[data['behavior']=='cart','behavior'] = 3\n",
 77 |     "    data.loc[data['behavior']=='buy','behavior'] = 1\n",
 78 |     "    max_day = max(data['day'])\n",
 79 |     "    min_day = min(data['day'])\n",
 80 |     "    data['behavior'] = (1 - (max_day-data['day_hour']+2)/(max_day-min_day+2)) * data['behavior'] \n",
 81 |     "\n",
 82 |     "    item.columns = ['itemID','category','shop','brand']\n",
 83 |     "    user.columns = ['userID','sex','age','ability']\n",
 84 |     "    \n",
 85 |     "    data = reduce_mem_usage(data)\n",
 86 |     "\n",
 87 |     "    data = pd.merge(left=data, right=item, on='itemID',how='left')\n",
 88 |     "    data = pd.merge(left=data, right=user, on='userID',how='left')\n",
 89 |     "\n",
 90 |     "    return user, item, data\n",
 91 |     "    "
 92 |    ]
 93 |   },
 94 |   {
 95 |    "cell_type": "code",
 96 |    "execution_count": 3,
 97 |    "metadata": {},
 98 |    "outputs": [
 99 |     {
100 |      "name": "stdout",
101 |      "output_type": "stream",
102 |      "text": [
103 |       "Memory usage of dataframe is 44702560.00 MB\n",
104 |       "Memory usage after optimization is: 9778785.00 MB\n",
105 |       "Decreased by 78.1%\n",
106 |       "Memory usage of dataframe is 138182592.00 MB\n",
107 |       "Memory usage after optimization is: 60454956.00 MB\n",
108 |       "Decreased by 56.2%\n",
109 |       "Memory usage of dataframe is 7081839904.00 MB\n",
110 |       "Memory usage after optimization is: 1770460072.00 MB\n",
111 |       "Decreased by 75.0%\n"
112 |      ]
113 |     }
114 |    ],
115 |    "source": [
116 |     "path = '../ECommAI_EUIR_round2_train_20190816/'\n",
117 |     "user, item, data = load_data(path = path)"
118 |    ]
119 |   },
120 |   {
121 |    "cell_type": "markdown",
122 |    "metadata": {},
123 |    "source": [
124 |     "## 2019/09/21 晚 实验\n",
125 |     "\n",
126 |     "提取某个商品/店铺/类别/品牌 距离第15 and 16天的最后一次点击"
127 |    ]
128 |   },
129 |   {
130 |    "cell_type": "code",
131 |    "execution_count": 5,
132 |    "metadata": {},
133 |    "outputs": [
134 |     {
135 |      "name": "stderr",
136 |      "output_type": "stream",
137 |      "text": [
138 |       "/root/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:5: SettingWithCopyWarning: \n",
139 |       "A value is trying to be set on a copy of a slice from a DataFrame.\n",
140 |       "Try using .loc[row_indexer,col_indexer] = value instead\n",
141 |       "\n",
142 |       "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
143 |       "  \"\"\"\n"
144 |      ]
145 |     }
146 |    ],
147 |    "source": [
148 |     "train = data[data['day'] < 15]\n",
149 |     "\n",
150 |     "start_timestamp  = max(train['timestamp'])\n",
151 |     "\n",
152 |     "train['last_time'] = start_timestamp - train['timestamp']\n",
153 |     "\n",
154 |     "timefeatures = []\n",
155 |     "\n",
156 |     "for time_feature in ['itemID', 'shop', 'category','brand']:\n",
157 |     "    name = time_feature + '_last_time_underline.csv'\n",
158 |     "    tf = train[['last_time', time_feature]].groupby(\n",
159 |     "        time_feature, as_index=False).agg({'last_time':'min'}).rename(columns={'last_time': time_feature + 'last_time'})\n",
160 |     "    tf[time_feature + 'last_time_hour_ed'] = tf[time_feature + 'last_time'] // 3600 % 24\n",
161 |     "    timefeatures.append((name, tf))\n",
162 |     "\n",
163 |     "for f in timefeatures:\n",
164 |     "    f[1].to_csv(f[0], index=False)"
165 |    ]
166 |   }
167 |  ],
168 |  "metadata": {
169 |   "kernelspec": {
170 |    "display_name": "Python 3",
171 |    "language": "python",
172 |    "name": "python3"
173 |   },
174 |   "language_info": {
175 |    "codemirror_mode": {
176 |     "name": "ipython",
177 |     "version": 3
178 |    },
179 |    "file_extension": ".py",
180 |    "mimetype": "text/x-python",
181 |    "name": "python",
182 |    "nbconvert_exporter": "python",
183 |    "pygments_lexer": "ipython3",
184 |    "version": "3.7.3"
185 |   }
186 |  },
187 |  "nbformat": 4,
188 |  "nbformat_minor": 2
189 | }
190 | 


--------------------------------------------------------------------------------
/Semi-Finals/underline_trainning/Step3 Ranking/1_build_model.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "import pandas as pd\n",
 10 |     "import numpy as np\n",
 11 |     "import catboost as cat"
 12 |    ]
 13 |   },
 14 |   {
 15 |    "cell_type": "code",
 16 |    "execution_count": 2,
 17 |    "metadata": {},
 18 |    "outputs": [],
 19 |    "source": [
 20 |     "def reduce_mem_usage(df):\n",
 21 |     "    \"\"\" iterate through all the columns of a dataframe and modify the data type\n",
 22 |     "        to reduce memory usage.        \n",
 23 |     "    \"\"\"\n",
 24 |     "    start_mem = df.memory_usage().sum() \n",
 25 |     "    \n",
 26 |     "    for col in df.columns:\n",
 27 |     "        col_type = df[col].dtype\n",
 28 |     "        \n",
 29 |     "        if col_type != object:\n",
 30 |     "            c_min = df[col].min()\n",
 31 |     "            c_max = df[col].max()\n",
 32 |     "            if str(col_type)[:3] == 'int':\n",
 33 |     "                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:\n",
 34 |     "                    df[col] = df[col].astype(np.int8)\n",
 35 |     "                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:\n",
 36 |     "                    df[col] = df[col].astype(np.int16)\n",
 37 |     "                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:\n",
 38 |     "                    df[col] = df[col].astype(np.int32)\n",
 39 |     "                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:\n",
 40 |     "                    df[col] = df[col].astype(np.int64)  \n",
 41 |     "            else:\n",
 42 |     "                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:\n",
 43 |     "                    df[col] = df[col].astype(np.float16)\n",
 44 |     "                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:\n",
 45 |     "                    df[col] = df[col].astype(np.float32)\n",
 46 |     "                else:\n",
 47 |     "                    df[col] = df[col].astype(np.float64)\n",
 48 |     "        else:\n",
 49 |     "            df[col] = df[col].astype('category')\n",
 50 |     "\n",
 51 |     "    end_mem = df.memory_usage().sum() \n",
 52 |     "\n",
 53 |     "    return df\n",
 54 |     "\n",
 55 |     "def load_data(path):\n",
 56 |     "    user = reduce_mem_usage(pd.read_csv(path + 'user.csv',header=None, engine='c'))\n",
 57 |     "    item = reduce_mem_usage(pd.read_csv(path + 'item.csv',header=None, engine='c'))\n",
 58 |     "    data = pd.read_csv(path + 'user_behavior.csv',header=None, engine='c')\n",
 59 |     "\n",
 60 |     "    data.columns = ['userID','itemID','behavior','timestamp']\n",
 61 |     "    data['day'] = data['timestamp'] // 86400\n",
 62 |     "    data['hour'] = data['timestamp'] // 3600 % 24\n",
 63 |     "    \n",
 64 |     "    ## 生成behavior的onehot\n",
 65 |     "    for i in ['pv','fav','cart','buy']:\n",
 66 |     "        data[i] = 0\n",
 67 |     "        data.loc[data['behavior'] == i, i] = 1\n",
 68 |     "\n",
 69 |     "    ## 生成behavior的加权\n",
 70 |     "    \n",
 71 |     "    data['day_hour'] = data['day'] + data['hour'] / float(24)\n",
 72 |     "    data.loc[data['behavior']=='pv','behavior'] = 1\n",
 73 |     "    data.loc[data['behavior']=='fav','behavior'] = 2\n",
 74 |     "    data.loc[data['behavior']=='cart','behavior'] = 3\n",
 75 |     "    data.loc[data['behavior']=='buy','behavior'] = 1\n",
 76 |     "    max_day = max(data['day'])\n",
 77 |     "    min_day = min(data['day'])\n",
 78 |     "    data['behavior'] = (1 - (max_day-data['day_hour']+2)/(max_day-min_day+2)) * data['behavior'] \n",
 79 |     "\n",
 80 |     "    item.columns = ['itemID','category','shop','brand']\n",
 81 |     "    user.columns = ['userID','sex','age','ability']\n",
 82 |     "    \n",
 83 |     "    data = reduce_mem_usage(data)\n",
 84 |     "\n",
 85 |     "    data = pd.merge(left=data, right=item, on='itemID',how='left', sort=False)\n",
 86 |     "    data = pd.merge(left=data, right=user, on='userID',how='left', sort=False)\n",
 87 |     "\n",
 88 |     "    return user, item, data\n",
 89 |     "    "
 90 |    ]
 91 |   },
 92 |   {
 93 |    "cell_type": "code",
 94 |    "execution_count": 3,
 95 |    "metadata": {
 96 |     "scrolled": true
 97 |    },
 98 |    "outputs": [],
 99 |    "source": [
100 |     "user, item, data = load_data(path = '../ECommAI_EUIR_round2_train_20190816/')\n",
101 |     "user['age'] = user['age'] // 10\n",
102 |     "data['age'] = data['age'] // 10"
103 |    ]
104 |   },
105 |   {
106 |    "cell_type": "code",
107 |    "execution_count": 4,
108 |    "metadata": {},
109 |    "outputs": [],
110 |    "source": [
111 |     "#########需要修改！！！！！！！！\n",
112 |     "###路径也要改\n",
113 |     "recall_train_list = []\n",
114 |     "for i in range(7):\n",
115 |     "    recall_train_list.append(\n",
116 |     "        reduce_mem_usage(pd.read_csv(str(i) + 'recall_list_round2_15day_300lenth-Copy1.csv', engine='c')))"
117 |    ]
118 |   },
119 |   {
120 |    "cell_type": "code",
121 |    "execution_count": 24,
122 |    "metadata": {},
123 |    "outputs": [],
124 |    "source": [
125 |     "recall_train = pd.concat(recall_train_list, sort=False)\n",
126 |     "recall_train = recall_train.fillna(0)"
127 |    ]
128 |   },
129 |   {
130 |    "cell_type": "code",
131 |    "execution_count": 27,
132 |    "metadata": {},
133 |    "outputs": [],
134 |    "source": [
135 |     "def downsample(df, percent=10):\n",
136 |     "    '''\n",
137 |     "    percent:多数类别下采样的数量相对于少数类别样本数量的比例\n",
138 |     "    '''\n",
139 |     "    \n",
140 |     "    data1 = df[df['label'] != 0]\n",
141 |     "    data0 = df[df['label'] == 0]\n",
142 |     "    index = np.random.randint(len(data0), size = percent * len(data1))\n",
143 |     "    lower_data0 = data0.iloc[list(index)]\n",
144 |     "    \n",
145 |     "    return(pd.concat([lower_data0, data1]))"
146 |    ]
147 |   },
148 |   {
149 |    "cell_type": "code",
150 |    "execution_count": null,
151 |    "metadata": {},
152 |    "outputs": [],
153 |    "source": [
154 |     "recall_train = downsample(recall_train,10 )\n",
155 |     "\n",
156 |     "recall_train = pd.merge(left=recall_train, right=item, on='itemID',how='left', sort=False)\n",
157 |     "recall_train = pd.merge(left=recall_train, right=user, on='userID',how='left', sort=False)"
158 |    ]
159 |   },
160 |   {
161 |    "cell_type": "code",
162 |    "execution_count": null,
163 |    "metadata": {},
164 |    "outputs": [],
165 |    "source": []
166 |   },
167 |   {
168 |    "cell_type": "code",
169 |    "execution_count": null,
170 |    "metadata": {},
171 |    "outputs": [],
172 |    "source": [
173 |     "feature_path = '../Step2 Generate_feature_for_Ranking/'"
174 |    ]
175 |   },
176 |   {
177 |    "cell_type": "code",
178 |    "execution_count": 28,
179 |    "metadata": {},
180 |    "outputs": [],
181 |    "source": [
182 |     "underline_features_files = [\n",
183 |     "'brand_count.csv',\n",
184 |     "'brand_sum.csv',\n",
185 |     "'category_count.csv',\n",
186 |     "'category_sum.csv',\n",
187 |     "'itemID_count.csv',\n",
188 |     "'itemID_sum.csv',\n",
189 |     "'shop_count.csv',\n",
190 |     "'shop_sum.csv',\n",
191 |     "'category_lower.csv',\n",
192 |     "'item_rank.csv',\n",
193 |     "'category_higher.csv',\n",
194 |     "'itemID_higher.csv',\n",
195 |     "]\n",
196 |     "\n",
197 |     "underline_features = []\n",
198 |     "for f in underline_features_files:\n",
199 |     "    underline_features.append(pd.read_csv(feature_path+f, engine='c'))"
200 |    ]
201 |   },
202 |   {
203 |    "cell_type": "code",
204 |    "execution_count": 29,
205 |    "metadata": {},
206 |    "outputs": [],
207 |    "source": [
208 |     "for f in underline_features:\n",
209 |     "    recall_train = pd.merge(left=recall_train, right=f, on=f.columns[0], how='left', sort=False)"
210 |    ]
211 |   },
212 |   {
213 |    "cell_type": "code",
214 |    "execution_count": 31,
215 |    "metadata": {},
216 |    "outputs": [],
217 |    "source": [
218 |     "## 注意这个线下训练时 是underline\n",
219 |     "\n",
220 |     "double_underline_features_files = [\n",
221 |     "'item_to_ability_count_underline.csv',\n",
222 |     "'item_to_sex_count_underline.csv',\n",
223 |     "'item_to_age_count_underline.csv',\n",
224 |     "]\n",
225 |     "\n",
226 |     "double_underline_features = []\n",
227 |     "for f in double_underline_features_files:\n",
228 |     "    double_underline_features.append(pd.read_csv(feature_path+f, engine='c'))"
229 |    ]
230 |   },
231 |   {
232 |    "cell_type": "code",
233 |    "execution_count": 32,
234 |    "metadata": {},
235 |    "outputs": [],
236 |    "source": [
237 |     "for f in double_underline_features:\n",
238 |     "    recall_train = pd.merge(left=recall_train, right=f, on=list(f.columns[0: 2]), how='left', sort=False)"
239 |    ]
240 |   },
241 |   {
242 |    "cell_type": "code",
243 |    "execution_count": 33,
244 |    "metadata": {},
245 |    "outputs": [],
246 |    "source": [
247 |     "## 注意这个线下训练时 是underline\n",
248 |     "\n",
249 |     "time_features_files = [\n",
250 |     "'itemID_last_time_underline.csv',\n",
251 |     "'brand_last_time_underline.csv',\n",
252 |     "'shop_last_time_underline.csv'\n",
253 |     "]\n",
254 |     "\n",
255 |     "time_features = []\n",
256 |     "for f in time_features_files:\n",
257 |     "    time_features.append(pd.read_csv(feature_path+f, engine='c'))"
258 |    ]
259 |   },
260 |   {
261 |    "cell_type": "code",
262 |    "execution_count": 34,
263 |    "metadata": {},
264 |    "outputs": [],
265 |    "source": [
266 |     "for f in time_features:\n",
267 |     "    recall_train = pd.merge(left=recall_train, right=f, on=f.columns[0], how='left', sort=False)"
268 |    ]
269 |   },
270 |   {
271 |    "cell_type": "code",
272 |    "execution_count": 35,
273 |    "metadata": {},
274 |    "outputs": [],
275 |    "source": [
276 |     "online_features_files =  ['user_to_brand_count.csv',\n",
277 |     "'user_to_brand_sum.csv',\n",
278 |     "'user_to_category_count.csv',\n",
279 |     "'user_to_category_sum.csv',\n",
280 |     "'user_to_shop_count.csv',\n",
281 |     "'user_to_shop_sum.csv',]\n",
282 |     "\n",
283 |     "\n",
284 |     "online2 = ['user_to_category_count_pv.csv',\n",
285 |     " 'user_to_category_count_buy.csv',\n",
286 |     " 'user_to_shop_count_pv.csv',\n",
287 |     " 'user_to_shop_count_buy.csv',\n",
288 |     " 'user_to_brand_count_pv.csv',\n",
289 |     " 'user_to_brand_count_buy.csv']\n",
290 |     "\n",
291 |     "\n",
292 |     "online3 = ['user_to_category_count_yestday.csv',\n",
293 |     "'user_to_category_count_pv_yestday.csv',\n",
294 |     " 'user_to_category_count_buy_yestday.csv',\n",
295 |     " 'user_to_shop_count_pv_yestday.csv',\n",
296 |     " 'user_to_shop_count_buy_yestday.csv',\n",
297 |     " 'user_to_brand_count_pv_yestday.csv',\n",
298 |     " 'user_to_brand_count_buy_yestday.csv']\n",
299 |     "\n",
300 |     "online4 = [\n",
301 |     " 'user_to_category_count_5days.csv',\n",
302 |     " 'user_to_category_count_pv_5days.csv',\n",
303 |     " 'user_to_category_count_buy_5days.csv',\n",
304 |     " 'user_to_shop_count_pv_5days.csv',\n",
305 |     " 'user_to_shop_count_buy_5days.csv',\n",
306 |     " 'user_to_brand_count_pv_5days.csv',\n",
307 |     " 'user_to_brand_count_buy_5days.csv']\n",
308 |     "\n",
309 |     "online5 = [\n",
310 |     "'user_to_shop_lasttime.csv',\n",
311 |     "'user_to_category_lasttime.csv',\n",
312 |     "'user_to_brand_lasttime.csv' ,\n",
313 |     "]\n",
314 |     "\n",
315 |     "online_features_files = online_features_files + online2 + online3 + online4 + online5\n"
316 |    ]
317 |   },
318 |   {
319 |    "cell_type": "code",
320 |    "execution_count": 36,
321 |    "metadata": {},
322 |    "outputs": [],
323 |    "source": [
324 |     "online_features = []\n",
325 |     "for f in online_features_files:\n",
326 |     "    online_features.append(pd.read_csv(feature_path+f, engine='c'))"
327 |    ]
328 |   },
329 |   {
330 |    "cell_type": "code",
331 |    "execution_count": 37,
332 |    "metadata": {},
333 |    "outputs": [],
334 |    "source": [
335 |     "for f in online_features:\n",
336 |     "    recall_train = pd.merge(left=recall_train, right=f, on=list(f.columns[0: 2]), how='left', sort=False)"
337 |    ]
338 |   },
339 |   {
340 |    "cell_type": "code",
341 |    "execution_count": 38,
342 |    "metadata": {},
343 |    "outputs": [],
344 |    "source": [
345 |     "def transfer_label(x):\n",
346 |     "    if x == 0:\n",
347 |     "        return 0\n",
348 |     "    else:\n",
349 |     "        return 1\n",
350 |     "\n",
351 |     "recall_train['label'] = recall_train['label'].apply(transfer_label)"
352 |    ]
353 |   },
354 |   {
355 |    "cell_type": "code",
356 |    "execution_count": 39,
357 |    "metadata": {},
358 |    "outputs": [],
359 |    "source": [
360 |     "features = [x for x in recall_train.columns if x not in ['itemID','userID','category','shop','brand','label','apriori_rank','apriori_top']]"
361 |    ]
362 |   },
363 |   {
364 |    "cell_type": "code",
365 |    "execution_count": 40,
366 |    "metadata": {
367 |     "scrolled": true
368 |    },
369 |    "outputs": [
370 |     {
371 |      "name": "stdout",
372 |      "output_type": "stream",
373 |      "text": [
374 |       "0:\tlearn: 0.5969336\ttotal: 617ms\tremaining: 3m 4s\n",
375 |       "1:\tlearn: 0.5237747\ttotal: 1.2s\tremaining: 2m 58s\n",
376 |       "2:\tlearn: 0.4676843\ttotal: 1.74s\tremaining: 2m 51s\n",
377 |       "3:\tlearn: 0.4248270\ttotal: 2.24s\tremaining: 2m 46s\n",
378 |       "4:\tlearn: 0.3923313\ttotal: 2.78s\tremaining: 2m 44s\n",
379 |       "5:\tlearn: 0.3675028\ttotal: 3.31s\tremaining: 2m 41s\n",
380 |       "6:\tlearn: 0.3480424\ttotal: 3.85s\tremaining: 2m 41s\n",
381 |       "7:\tlearn: 0.3332812\ttotal: 4.37s\tremaining: 2m 39s\n",
382 |       "8:\tlearn: 0.3221164\ttotal: 4.84s\tremaining: 2m 36s\n",
383 |       "9:\tlearn: 0.3126932\ttotal: 5.33s\tremaining: 2m 34s\n",
384 |       "10:\tlearn: 0.3053910\ttotal: 5.8s\tremaining: 2m 32s\n",
385 |       "11:\tlearn: 0.2996877\ttotal: 6.35s\tremaining: 2m 32s\n",
386 |       "12:\tlearn: 0.2950420\ttotal: 6.92s\tremaining: 2m 32s\n",
387 |       "13:\tlearn: 0.2915425\ttotal: 7.41s\tremaining: 2m 31s\n",
388 |       "14:\tlearn: 0.2886570\ttotal: 7.98s\tremaining: 2m 31s\n",
389 |       "15:\tlearn: 0.2865094\ttotal: 8.53s\tremaining: 2m 31s\n",
390 |       "16:\tlearn: 0.2846760\ttotal: 8.98s\tremaining: 2m 29s\n",
391 |       "17:\tlearn: 0.2833095\ttotal: 9.43s\tremaining: 2m 27s\n",
392 |       "18:\tlearn: 0.2818918\ttotal: 9.99s\tremaining: 2m 27s\n",
393 |       "19:\tlearn: 0.2807000\ttotal: 10.5s\tremaining: 2m 27s\n",
394 |       "20:\tlearn: 0.2797297\ttotal: 11.1s\tremaining: 2m 26s\n",
395 |       "21:\tlearn: 0.2789706\ttotal: 11.5s\tremaining: 2m 25s\n",
396 |       "22:\tlearn: 0.2782515\ttotal: 12s\tremaining: 2m 24s\n",
397 |       "23:\tlearn: 0.2776294\ttotal: 12.5s\tremaining: 2m 23s\n",
398 |       "24:\tlearn: 0.2772333\ttotal: 13s\tremaining: 2m 23s\n",
399 |       "25:\tlearn: 0.2768003\ttotal: 13.6s\tremaining: 2m 22s\n",
400 |       "26:\tlearn: 0.2764516\ttotal: 14s\tremaining: 2m 21s\n",
401 |       "27:\tlearn: 0.2761037\ttotal: 14.5s\tremaining: 2m 20s\n",
402 |       "28:\tlearn: 0.2757995\ttotal: 15s\tremaining: 2m 20s\n",
403 |       "29:\tlearn: 0.2755404\ttotal: 15.5s\tremaining: 2m 19s\n",
404 |       "30:\tlearn: 0.2753335\ttotal: 15.9s\tremaining: 2m 18s\n",
405 |       "31:\tlearn: 0.2751294\ttotal: 16.5s\tremaining: 2m 17s\n",
406 |       "32:\tlearn: 0.2749600\ttotal: 16.9s\tremaining: 2m 16s\n",
407 |       "33:\tlearn: 0.2748096\ttotal: 17.4s\tremaining: 2m 16s\n",
408 |       "34:\tlearn: 0.2746508\ttotal: 17.9s\tremaining: 2m 15s\n",
409 |       "35:\tlearn: 0.2744759\ttotal: 18.5s\tremaining: 2m 15s\n",
410 |       "36:\tlearn: 0.2743000\ttotal: 18.9s\tremaining: 2m 14s\n",
411 |       "37:\tlearn: 0.2741494\ttotal: 19.4s\tremaining: 2m 13s\n",
412 |       "38:\tlearn: 0.2740247\ttotal: 19.9s\tremaining: 2m 12s\n",
413 |       "39:\tlearn: 0.2739329\ttotal: 20.4s\tremaining: 2m 12s\n",
414 |       "40:\tlearn: 0.2738084\ttotal: 20.9s\tremaining: 2m 12s\n",
415 |       "41:\tlearn: 0.2737103\ttotal: 21.5s\tremaining: 2m 11s\n",
416 |       "42:\tlearn: 0.2735962\ttotal: 22s\tremaining: 2m 11s\n",
417 |       "43:\tlearn: 0.2734955\ttotal: 22.4s\tremaining: 2m 10s\n",
418 |       "44:\tlearn: 0.2734077\ttotal: 22.9s\tremaining: 2m 9s\n",
419 |       "45:\tlearn: 0.2733399\ttotal: 23.4s\tremaining: 2m 9s\n",
420 |       "46:\tlearn: 0.2732147\ttotal: 24s\tremaining: 2m 9s\n",
421 |       "47:\tlearn: 0.2731284\ttotal: 24.6s\tremaining: 2m 9s\n",
422 |       "48:\tlearn: 0.2730598\ttotal: 25.1s\tremaining: 2m 8s\n",
423 |       "49:\tlearn: 0.2729952\ttotal: 25.6s\tremaining: 2m 8s\n",
424 |       "50:\tlearn: 0.2729183\ttotal: 26.2s\tremaining: 2m 8s\n",
425 |       "51:\tlearn: 0.2728487\ttotal: 26.8s\tremaining: 2m 7s\n",
426 |       "52:\tlearn: 0.2727670\ttotal: 27.4s\tremaining: 2m 7s\n",
427 |       "53:\tlearn: 0.2727061\ttotal: 27.9s\tremaining: 2m 6s\n",
428 |       "54:\tlearn: 0.2726305\ttotal: 28.4s\tremaining: 2m 6s\n",
429 |       "55:\tlearn: 0.2725697\ttotal: 29s\tremaining: 2m 6s\n",
430 |       "56:\tlearn: 0.2724978\ttotal: 29.5s\tremaining: 2m 5s\n",
431 |       "57:\tlearn: 0.2724327\ttotal: 29.9s\tremaining: 2m 4s\n",
432 |       "58:\tlearn: 0.2723599\ttotal: 30.4s\tremaining: 2m 3s\n",
433 |       "59:\tlearn: 0.2723106\ttotal: 30.9s\tremaining: 2m 3s\n",
434 |       "60:\tlearn: 0.2722603\ttotal: 31.4s\tremaining: 2m 2s\n",
435 |       "61:\tlearn: 0.2722100\ttotal: 31.8s\tremaining: 2m 2s\n",
436 |       "62:\tlearn: 0.2721508\ttotal: 32.3s\tremaining: 2m 1s\n",
437 |       "63:\tlearn: 0.2721004\ttotal: 32.8s\tremaining: 2m\n",
438 |       "64:\tlearn: 0.2720448\ttotal: 33.3s\tremaining: 2m\n",
439 |       "65:\tlearn: 0.2719740\ttotal: 33.7s\tremaining: 1m 59s\n",
440 |       "66:\tlearn: 0.2719150\ttotal: 34.2s\tremaining: 1m 58s\n",
441 |       "67:\tlearn: 0.2718503\ttotal: 34.6s\tremaining: 1m 58s\n",
442 |       "68:\tlearn: 0.2718010\ttotal: 35.1s\tremaining: 1m 57s\n",
443 |       "69:\tlearn: 0.2717310\ttotal: 35.6s\tremaining: 1m 57s\n",
444 |       "70:\tlearn: 0.2716921\ttotal: 36.1s\tremaining: 1m 56s\n",
445 |       "71:\tlearn: 0.2716118\ttotal: 37.4s\tremaining: 1m 58s\n",
446 |       "72:\tlearn: 0.2715527\ttotal: 37.8s\tremaining: 1m 57s\n",
447 |       "73:\tlearn: 0.2715115\ttotal: 38.3s\tremaining: 1m 56s\n",
448 |       "74:\tlearn: 0.2714582\ttotal: 38.7s\tremaining: 1m 56s\n",
449 |       "75:\tlearn: 0.2714228\ttotal: 39.2s\tremaining: 1m 55s\n",
450 |       "76:\tlearn: 0.2713428\ttotal: 39.7s\tremaining: 1m 55s\n",
451 |       "77:\tlearn: 0.2713086\ttotal: 40.2s\tremaining: 1m 54s\n",
452 |       "78:\tlearn: 0.2712682\ttotal: 40.7s\tremaining: 1m 53s\n",
453 |       "79:\tlearn: 0.2712123\ttotal: 41.1s\tremaining: 1m 53s\n",
454 |       "80:\tlearn: 0.2711785\ttotal: 41.6s\tremaining: 1m 52s\n",
455 |       "81:\tlearn: 0.2711087\ttotal: 42.1s\tremaining: 1m 51s\n",
456 |       "82:\tlearn: 0.2710730\ttotal: 42.5s\tremaining: 1m 51s\n",
457 |       "83:\tlearn: 0.2710132\ttotal: 42.9s\tremaining: 1m 50s\n",
458 |       "84:\tlearn: 0.2709664\ttotal: 43.5s\tremaining: 1m 49s\n",
459 |       "85:\tlearn: 0.2709390\ttotal: 43.9s\tremaining: 1m 49s\n",
460 |       "86:\tlearn: 0.2708896\ttotal: 44.4s\tremaining: 1m 48s\n",
461 |       "87:\tlearn: 0.2708565\ttotal: 44.9s\tremaining: 1m 48s\n",
462 |       "88:\tlearn: 0.2708241\ttotal: 45.4s\tremaining: 1m 47s\n",
463 |       "89:\tlearn: 0.2707728\ttotal: 45.9s\tremaining: 1m 47s\n",
464 |       "90:\tlearn: 0.2707403\ttotal: 46.4s\tremaining: 1m 46s\n",
465 |       "91:\tlearn: 0.2706926\ttotal: 46.8s\tremaining: 1m 45s\n",
466 |       "92:\tlearn: 0.2706669\ttotal: 47.3s\tremaining: 1m 45s\n",
467 |       "93:\tlearn: 0.2706357\ttotal: 47.8s\tremaining: 1m 44s\n",
468 |       "94:\tlearn: 0.2705976\ttotal: 48.3s\tremaining: 1m 44s\n",
469 |       "95:\tlearn: 0.2705609\ttotal: 48.8s\tremaining: 1m 43s\n",
470 |       "96:\tlearn: 0.2705360\ttotal: 49.4s\tremaining: 1m 43s\n",
471 |       "97:\tlearn: 0.2704969\ttotal: 49.9s\tremaining: 1m 42s\n",
472 |       "98:\tlearn: 0.2704558\ttotal: 50.4s\tremaining: 1m 42s\n",
473 |       "99:\tlearn: 0.2704185\ttotal: 50.8s\tremaining: 1m 41s\n",
474 |       "100:\tlearn: 0.2703852\ttotal: 51.3s\tremaining: 1m 41s\n",
475 |       "101:\tlearn: 0.2703458\ttotal: 51.9s\tremaining: 1m 40s\n",
476 |       "102:\tlearn: 0.2703192\ttotal: 52.4s\tremaining: 1m 40s\n",
477 |       "103:\tlearn: 0.2703081\ttotal: 52.8s\tremaining: 1m 39s\n",
478 |       "104:\tlearn: 0.2702842\ttotal: 53.3s\tremaining: 1m 38s\n",
479 |       "105:\tlearn: 0.2702547\ttotal: 53.8s\tremaining: 1m 38s\n",
480 |       "106:\tlearn: 0.2702239\ttotal: 54.3s\tremaining: 1m 37s\n",
481 |       "107:\tlearn: 0.2701957\ttotal: 55s\tremaining: 1m 37s\n",
482 |       "108:\tlearn: 0.2701513\ttotal: 55.5s\tremaining: 1m 37s\n",
483 |       "109:\tlearn: 0.2701191\ttotal: 56s\tremaining: 1m 36s\n",
484 |       "110:\tlearn: 0.2700940\ttotal: 56.5s\tremaining: 1m 36s\n",
485 |       "111:\tlearn: 0.2700623\ttotal: 57.1s\tremaining: 1m 35s\n",
486 |       "112:\tlearn: 0.2700381\ttotal: 57.6s\tremaining: 1m 35s\n",
487 |       "113:\tlearn: 0.2700139\ttotal: 58.2s\tremaining: 1m 34s\n",
488 |       "114:\tlearn: 0.2699855\ttotal: 58.7s\tremaining: 1m 34s\n",
489 |       "115:\tlearn: 0.2699472\ttotal: 59.3s\tremaining: 1m 34s\n",
490 |       "116:\tlearn: 0.2699268\ttotal: 59.8s\tremaining: 1m 33s\n",
491 |       "117:\tlearn: 0.2698983\ttotal: 1m\tremaining: 1m 32s\n",
492 |       "118:\tlearn: 0.2698807\ttotal: 1m\tremaining: 1m 32s\n",
493 |       "119:\tlearn: 0.2698600\ttotal: 1m 1s\tremaining: 1m 31s\n",
494 |       "120:\tlearn: 0.2698235\ttotal: 1m 1s\tremaining: 1m 31s\n",
495 |       "121:\tlearn: 0.2698015\ttotal: 1m 2s\tremaining: 1m 30s\n",
496 |       "122:\tlearn: 0.2697776\ttotal: 1m 2s\tremaining: 1m 30s\n",
497 |       "123:\tlearn: 0.2697520\ttotal: 1m 3s\tremaining: 1m 29s\n",
498 |       "124:\tlearn: 0.2697259\ttotal: 1m 3s\tremaining: 1m 29s\n",
499 |       "125:\tlearn: 0.2697031\ttotal: 1m 4s\tremaining: 1m 28s\n",
500 |       "126:\tlearn: 0.2696761\ttotal: 1m 4s\tremaining: 1m 28s\n",
501 |       "127:\tlearn: 0.2696339\ttotal: 1m 5s\tremaining: 1m 27s\n",
502 |       "128:\tlearn: 0.2696114\ttotal: 1m 5s\tremaining: 1m 27s\n",
503 |       "129:\tlearn: 0.2695903\ttotal: 1m 6s\tremaining: 1m 26s\n",
504 |       "130:\tlearn: 0.2695650\ttotal: 1m 6s\tremaining: 1m 26s\n",
505 |       "131:\tlearn: 0.2695402\ttotal: 1m 7s\tremaining: 1m 25s\n",
506 |       "132:\tlearn: 0.2695161\ttotal: 1m 7s\tremaining: 1m 25s\n",
507 |       "133:\tlearn: 0.2694868\ttotal: 1m 8s\tremaining: 1m 24s\n",
508 |       "134:\tlearn: 0.2694629\ttotal: 1m 8s\tremaining: 1m 23s\n",
509 |       "135:\tlearn: 0.2694440\ttotal: 1m 9s\tremaining: 1m 23s\n",
510 |       "136:\tlearn: 0.2694191\ttotal: 1m 9s\tremaining: 1m 22s\n",
511 |       "137:\tlearn: 0.2693964\ttotal: 1m 10s\tremaining: 1m 22s\n",
512 |       "138:\tlearn: 0.2693789\ttotal: 1m 10s\tremaining: 1m 21s\n",
513 |       "139:\tlearn: 0.2693522\ttotal: 1m 11s\tremaining: 1m 21s\n",
514 |       "140:\tlearn: 0.2693318\ttotal: 1m 11s\tremaining: 1m 20s\n",
515 |       "141:\tlearn: 0.2692985\ttotal: 1m 12s\tremaining: 1m 20s\n",
516 |       "142:\tlearn: 0.2692746\ttotal: 1m 12s\tremaining: 1m 19s\n",
517 |       "143:\tlearn: 0.2692549\ttotal: 1m 13s\tremaining: 1m 19s\n",
518 |       "144:\tlearn: 0.2692345\ttotal: 1m 13s\tremaining: 1m 18s\n",
519 |       "145:\tlearn: 0.2692192\ttotal: 1m 14s\tremaining: 1m 18s\n",
520 |       "146:\tlearn: 0.2692030\ttotal: 1m 14s\tremaining: 1m 17s\n",
521 |       "147:\tlearn: 0.2691846\ttotal: 1m 15s\tremaining: 1m 17s\n",
522 |       "148:\tlearn: 0.2691649\ttotal: 1m 15s\tremaining: 1m 16s\n",
523 |       "149:\tlearn: 0.2691351\ttotal: 1m 16s\tremaining: 1m 16s\n",
524 |       "150:\tlearn: 0.2691172\ttotal: 1m 16s\tremaining: 1m 15s\n",
525 |       "151:\tlearn: 0.2690976\ttotal: 1m 17s\tremaining: 1m 15s\n",
526 |       "152:\tlearn: 0.2690744\ttotal: 1m 17s\tremaining: 1m 14s\n",
527 |       "153:\tlearn: 0.2690489\ttotal: 1m 18s\tremaining: 1m 14s\n",
528 |       "154:\tlearn: 0.2690281\ttotal: 1m 18s\tremaining: 1m 13s\n",
529 |       "155:\tlearn: 0.2690069\ttotal: 1m 19s\tremaining: 1m 13s\n",
530 |       "156:\tlearn: 0.2689886\ttotal: 1m 19s\tremaining: 1m 12s\n",
531 |       "157:\tlearn: 0.2689708\ttotal: 1m 20s\tremaining: 1m 12s\n"
532 |      ]
533 |     },
534 |     {
535 |      "name": "stdout",
536 |      "output_type": "stream",
537 |      "text": [
538 |       "158:\tlearn: 0.2689558\ttotal: 1m 20s\tremaining: 1m 11s\n",
539 |       "159:\tlearn: 0.2689389\ttotal: 1m 21s\tremaining: 1m 11s\n",
540 |       "160:\tlearn: 0.2689273\ttotal: 1m 21s\tremaining: 1m 10s\n",
541 |       "161:\tlearn: 0.2689062\ttotal: 1m 22s\tremaining: 1m 10s\n",
542 |       "162:\tlearn: 0.2688837\ttotal: 1m 22s\tremaining: 1m 9s\n",
543 |       "163:\tlearn: 0.2688722\ttotal: 1m 23s\tremaining: 1m 9s\n",
544 |       "164:\tlearn: 0.2688602\ttotal: 1m 23s\tremaining: 1m 8s\n",
545 |       "165:\tlearn: 0.2688419\ttotal: 1m 24s\tremaining: 1m 7s\n",
546 |       "166:\tlearn: 0.2688311\ttotal: 1m 24s\tremaining: 1m 7s\n",
547 |       "167:\tlearn: 0.2688143\ttotal: 1m 25s\tremaining: 1m 6s\n",
548 |       "168:\tlearn: 0.2687947\ttotal: 1m 25s\tremaining: 1m 6s\n",
549 |       "169:\tlearn: 0.2687703\ttotal: 1m 26s\tremaining: 1m 5s\n",
550 |       "170:\tlearn: 0.2687481\ttotal: 1m 26s\tremaining: 1m 5s\n",
551 |       "171:\tlearn: 0.2687167\ttotal: 1m 27s\tremaining: 1m 4s\n",
552 |       "172:\tlearn: 0.2686880\ttotal: 1m 27s\tremaining: 1m 4s\n",
553 |       "173:\tlearn: 0.2686678\ttotal: 1m 28s\tremaining: 1m 3s\n",
554 |       "174:\tlearn: 0.2686523\ttotal: 1m 28s\tremaining: 1m 3s\n",
555 |       "175:\tlearn: 0.2686358\ttotal: 1m 29s\tremaining: 1m 2s\n",
556 |       "176:\tlearn: 0.2686162\ttotal: 1m 29s\tremaining: 1m 2s\n",
557 |       "177:\tlearn: 0.2685984\ttotal: 1m 30s\tremaining: 1m 1s\n",
558 |       "178:\tlearn: 0.2685749\ttotal: 1m 30s\tremaining: 1m 1s\n",
559 |       "179:\tlearn: 0.2685654\ttotal: 1m 30s\tremaining: 1m\n",
560 |       "180:\tlearn: 0.2685436\ttotal: 1m 31s\tremaining: 1m\n",
561 |       "181:\tlearn: 0.2685312\ttotal: 1m 31s\tremaining: 59.6s\n",
562 |       "182:\tlearn: 0.2685169\ttotal: 1m 32s\tremaining: 59.1s\n",
563 |       "183:\tlearn: 0.2684946\ttotal: 1m 32s\tremaining: 58.5s\n",
564 |       "184:\tlearn: 0.2684647\ttotal: 1m 33s\tremaining: 58s\n",
565 |       "185:\tlearn: 0.2684486\ttotal: 1m 33s\tremaining: 57.5s\n",
566 |       "186:\tlearn: 0.2684308\ttotal: 1m 34s\tremaining: 57s\n",
567 |       "187:\tlearn: 0.2684148\ttotal: 1m 34s\tremaining: 56.5s\n",
568 |       "188:\tlearn: 0.2683936\ttotal: 1m 35s\tremaining: 56s\n",
569 |       "189:\tlearn: 0.2683785\ttotal: 1m 35s\tremaining: 55.5s\n",
570 |       "190:\tlearn: 0.2683670\ttotal: 1m 36s\tremaining: 55s\n",
571 |       "191:\tlearn: 0.2683416\ttotal: 1m 36s\tremaining: 54.5s\n",
572 |       "192:\tlearn: 0.2683321\ttotal: 1m 37s\tremaining: 53.9s\n",
573 |       "193:\tlearn: 0.2683155\ttotal: 1m 37s\tremaining: 53.4s\n",
574 |       "194:\tlearn: 0.2682979\ttotal: 1m 38s\tremaining: 52.9s\n",
575 |       "195:\tlearn: 0.2682730\ttotal: 1m 38s\tremaining: 52.5s\n",
576 |       "196:\tlearn: 0.2682627\ttotal: 1m 39s\tremaining: 51.9s\n",
577 |       "197:\tlearn: 0.2682430\ttotal: 1m 39s\tremaining: 51.4s\n",
578 |       "198:\tlearn: 0.2682279\ttotal: 1m 40s\tremaining: 50.9s\n",
579 |       "199:\tlearn: 0.2682177\ttotal: 1m 40s\tremaining: 50.4s\n",
580 |       "200:\tlearn: 0.2681975\ttotal: 1m 41s\tremaining: 49.9s\n",
581 |       "201:\tlearn: 0.2681854\ttotal: 1m 41s\tremaining: 49.3s\n",
582 |       "202:\tlearn: 0.2681557\ttotal: 1m 42s\tremaining: 48.9s\n",
583 |       "203:\tlearn: 0.2681451\ttotal: 1m 42s\tremaining: 48.3s\n",
584 |       "204:\tlearn: 0.2681250\ttotal: 1m 43s\tremaining: 47.8s\n",
585 |       "205:\tlearn: 0.2681095\ttotal: 1m 43s\tremaining: 47.3s\n",
586 |       "206:\tlearn: 0.2681000\ttotal: 1m 44s\tremaining: 46.8s\n",
587 |       "207:\tlearn: 0.2680911\ttotal: 1m 44s\tremaining: 46.2s\n",
588 |       "208:\tlearn: 0.2680806\ttotal: 1m 45s\tremaining: 45.7s\n",
589 |       "209:\tlearn: 0.2680688\ttotal: 1m 45s\tremaining: 45.2s\n",
590 |       "210:\tlearn: 0.2680423\ttotal: 1m 45s\tremaining: 44.7s\n",
591 |       "211:\tlearn: 0.2680267\ttotal: 1m 46s\tremaining: 44.2s\n",
592 |       "212:\tlearn: 0.2680148\ttotal: 1m 47s\tremaining: 43.7s\n",
593 |       "213:\tlearn: 0.2679986\ttotal: 1m 47s\tremaining: 43.2s\n",
594 |       "214:\tlearn: 0.2679761\ttotal: 1m 48s\tremaining: 42.7s\n",
595 |       "215:\tlearn: 0.2679602\ttotal: 1m 48s\tremaining: 42.2s\n",
596 |       "216:\tlearn: 0.2679475\ttotal: 1m 49s\tremaining: 41.7s\n",
597 |       "217:\tlearn: 0.2679325\ttotal: 1m 49s\tremaining: 41.3s\n",
598 |       "218:\tlearn: 0.2679192\ttotal: 1m 50s\tremaining: 40.8s\n",
599 |       "219:\tlearn: 0.2679044\ttotal: 1m 50s\tremaining: 40.3s\n",
600 |       "220:\tlearn: 0.2678915\ttotal: 1m 51s\tremaining: 39.7s\n",
601 |       "221:\tlearn: 0.2678813\ttotal: 1m 51s\tremaining: 39.2s\n",
602 |       "222:\tlearn: 0.2678601\ttotal: 1m 52s\tremaining: 38.7s\n",
603 |       "223:\tlearn: 0.2678457\ttotal: 1m 52s\tremaining: 38.2s\n",
604 |       "224:\tlearn: 0.2678343\ttotal: 1m 53s\tremaining: 37.7s\n",
605 |       "225:\tlearn: 0.2678263\ttotal: 1m 53s\tremaining: 37.2s\n",
606 |       "226:\tlearn: 0.2678135\ttotal: 1m 53s\tremaining: 36.6s\n",
607 |       "227:\tlearn: 0.2677968\ttotal: 1m 54s\tremaining: 36.1s\n",
608 |       "228:\tlearn: 0.2677817\ttotal: 1m 54s\tremaining: 35.6s\n",
609 |       "229:\tlearn: 0.2677639\ttotal: 1m 55s\tremaining: 35.1s\n",
610 |       "230:\tlearn: 0.2677553\ttotal: 1m 55s\tremaining: 34.6s\n",
611 |       "231:\tlearn: 0.2677438\ttotal: 1m 56s\tremaining: 34.1s\n",
612 |       "232:\tlearn: 0.2677354\ttotal: 1m 56s\tremaining: 33.6s\n",
613 |       "233:\tlearn: 0.2677196\ttotal: 1m 57s\tremaining: 33.1s\n",
614 |       "234:\tlearn: 0.2677084\ttotal: 1m 57s\tremaining: 32.6s\n",
615 |       "235:\tlearn: 0.2676964\ttotal: 1m 58s\tremaining: 32.1s\n",
616 |       "236:\tlearn: 0.2676864\ttotal: 1m 58s\tremaining: 31.6s\n",
617 |       "237:\tlearn: 0.2676708\ttotal: 1m 59s\tremaining: 31.1s\n",
618 |       "238:\tlearn: 0.2676580\ttotal: 1m 59s\tremaining: 30.6s\n",
619 |       "239:\tlearn: 0.2676386\ttotal: 2m\tremaining: 30.1s\n",
620 |       "240:\tlearn: 0.2676212\ttotal: 2m\tremaining: 29.6s\n",
621 |       "241:\tlearn: 0.2676087\ttotal: 2m 1s\tremaining: 29.1s\n",
622 |       "242:\tlearn: 0.2676002\ttotal: 2m 1s\tremaining: 28.6s\n",
623 |       "243:\tlearn: 0.2675860\ttotal: 2m 2s\tremaining: 28.1s\n",
624 |       "244:\tlearn: 0.2675760\ttotal: 2m 2s\tremaining: 27.6s\n",
625 |       "245:\tlearn: 0.2675625\ttotal: 2m 3s\tremaining: 27.1s\n",
626 |       "246:\tlearn: 0.2675538\ttotal: 2m 3s\tremaining: 26.6s\n",
627 |       "247:\tlearn: 0.2675432\ttotal: 2m 4s\tremaining: 26.1s\n",
628 |       "248:\tlearn: 0.2675295\ttotal: 2m 4s\tremaining: 25.6s\n",
629 |       "249:\tlearn: 0.2675231\ttotal: 2m 5s\tremaining: 25.1s\n",
630 |       "250:\tlearn: 0.2675140\ttotal: 2m 5s\tremaining: 24.6s\n",
631 |       "251:\tlearn: 0.2675031\ttotal: 2m 6s\tremaining: 24s\n",
632 |       "252:\tlearn: 0.2674841\ttotal: 2m 6s\tremaining: 23.5s\n",
633 |       "253:\tlearn: 0.2674752\ttotal: 2m 7s\tremaining: 23s\n",
634 |       "254:\tlearn: 0.2674671\ttotal: 2m 7s\tremaining: 22.5s\n",
635 |       "255:\tlearn: 0.2674572\ttotal: 2m 8s\tremaining: 22s\n",
636 |       "256:\tlearn: 0.2674466\ttotal: 2m 8s\tremaining: 21.5s\n",
637 |       "257:\tlearn: 0.2674283\ttotal: 2m 9s\tremaining: 21s\n",
638 |       "258:\tlearn: 0.2674150\ttotal: 2m 9s\tremaining: 20.5s\n",
639 |       "259:\tlearn: 0.2674016\ttotal: 2m 10s\tremaining: 20s\n",
640 |       "260:\tlearn: 0.2673913\ttotal: 2m 10s\tremaining: 19.5s\n",
641 |       "261:\tlearn: 0.2673845\ttotal: 2m 11s\tremaining: 19s\n",
642 |       "262:\tlearn: 0.2673767\ttotal: 2m 11s\tremaining: 18.5s\n",
643 |       "263:\tlearn: 0.2673676\ttotal: 2m 12s\tremaining: 18s\n",
644 |       "264:\tlearn: 0.2673518\ttotal: 2m 12s\tremaining: 17.5s\n",
645 |       "265:\tlearn: 0.2673440\ttotal: 2m 12s\tremaining: 17s\n",
646 |       "266:\tlearn: 0.2673313\ttotal: 2m 13s\tremaining: 16.5s\n",
647 |       "267:\tlearn: 0.2673196\ttotal: 2m 13s\tremaining: 16s\n",
648 |       "268:\tlearn: 0.2673136\ttotal: 2m 14s\tremaining: 15.5s\n",
649 |       "269:\tlearn: 0.2672997\ttotal: 2m 14s\tremaining: 15s\n",
650 |       "270:\tlearn: 0.2672870\ttotal: 2m 15s\tremaining: 14.5s\n",
651 |       "271:\tlearn: 0.2672794\ttotal: 2m 15s\tremaining: 14s\n",
652 |       "272:\tlearn: 0.2672730\ttotal: 2m 16s\tremaining: 13.5s\n",
653 |       "273:\tlearn: 0.2672598\ttotal: 2m 16s\tremaining: 13s\n",
654 |       "274:\tlearn: 0.2672510\ttotal: 2m 17s\tremaining: 12.5s\n",
655 |       "275:\tlearn: 0.2672349\ttotal: 2m 17s\tremaining: 12s\n",
656 |       "276:\tlearn: 0.2672298\ttotal: 2m 18s\tremaining: 11.5s\n",
657 |       "277:\tlearn: 0.2672192\ttotal: 2m 18s\tremaining: 11s\n",
658 |       "278:\tlearn: 0.2672115\ttotal: 2m 19s\tremaining: 10.5s\n",
659 |       "279:\tlearn: 0.2672013\ttotal: 2m 19s\tremaining: 9.98s\n",
660 |       "280:\tlearn: 0.2671907\ttotal: 2m 20s\tremaining: 9.48s\n",
661 |       "281:\tlearn: 0.2671842\ttotal: 2m 20s\tremaining: 8.97s\n",
662 |       "282:\tlearn: 0.2671775\ttotal: 2m 21s\tremaining: 8.47s\n",
663 |       "283:\tlearn: 0.2671718\ttotal: 2m 21s\tremaining: 7.97s\n",
664 |       "284:\tlearn: 0.2671522\ttotal: 2m 22s\tremaining: 7.48s\n",
665 |       "285:\tlearn: 0.2671431\ttotal: 2m 22s\tremaining: 6.99s\n",
666 |       "286:\tlearn: 0.2671274\ttotal: 2m 23s\tremaining: 6.49s\n",
667 |       "287:\tlearn: 0.2671200\ttotal: 2m 23s\tremaining: 5.99s\n",
668 |       "288:\tlearn: 0.2671056\ttotal: 2m 24s\tremaining: 5.49s\n",
669 |       "289:\tlearn: 0.2670975\ttotal: 2m 24s\tremaining: 4.99s\n",
670 |       "290:\tlearn: 0.2670885\ttotal: 2m 25s\tremaining: 4.5s\n",
671 |       "291:\tlearn: 0.2670799\ttotal: 2m 25s\tremaining: 4s\n",
672 |       "292:\tlearn: 0.2670658\ttotal: 2m 26s\tremaining: 3.5s\n",
673 |       "293:\tlearn: 0.2670562\ttotal: 2m 26s\tremaining: 3s\n",
674 |       "294:\tlearn: 0.2670460\ttotal: 2m 27s\tremaining: 2.5s\n",
675 |       "295:\tlearn: 0.2670321\ttotal: 2m 27s\tremaining: 2s\n",
676 |       "296:\tlearn: 0.2670180\ttotal: 2m 28s\tremaining: 1.5s\n",
677 |       "297:\tlearn: 0.2670094\ttotal: 2m 28s\tremaining: 998ms\n",
678 |       "298:\tlearn: 0.2670013\ttotal: 2m 29s\tremaining: 500ms\n",
679 |       "299:\tlearn: 0.2669893\ttotal: 2m 30s\tremaining: 0us\n"
680 |      ]
681 |     },
682 |     {
683 |      "data": {
684 |       "text/plain": [
685 |        "<catboost.core.CatBoostClassifier at 0x7f8f45214630>"
686 |       ]
687 |      },
688 |      "execution_count": 40,
689 |      "metadata": {},
690 |      "output_type": "execute_result"
691 |     }
692 |    ],
693 |    "source": [
694 |     "cbt_model = cat.CatBoostClassifier(iterations=300,learning_rate=0.1,depth=5,verbose=True,thread_count=12\n",
695 |     "                                   ,random_seed=1024)\n",
696 |     "cbt_model.fit(recall_train[features], recall_train['label'])"
697 |    ]
698 |   },
699 |   {
700 |    "cell_type": "code",
701 |    "execution_count": 41,
702 |    "metadata": {},
703 |    "outputs": [],
704 |    "source": [
705 |     "cbt_model.save_model('model0924_base.file')"
706 |    ]
707 |   },
708 |   {
709 |    "cell_type": "code",
710 |    "execution_count": 42,
711 |    "metadata": {},
712 |    "outputs": [],
713 |    "source": [
714 |     "importance = dict(zip(features,\n",
715 |     "cbt_model.feature_importances_))"
716 |    ]
717 |   },
718 |   {
719 |    "cell_type": "code",
720 |    "execution_count": 43,
721 |    "metadata": {
722 |     "scrolled": true
723 |    },
724 |    "outputs": [
725 |     {
726 |      "data": {
727 |       "text/plain": [
728 |        "[('apriori', 27.28756987244248),\n",
729 |        " ('itemID_median', 8.290965095145731),\n",
730 |        " ('user_to_category_lastday', 7.07994210784468),\n",
731 |        " ('user_to_category_count_buy', 5.7383039412964445),\n",
732 |        " ('user_to_category_count_pv_5days', 5.523705076092089),\n",
733 |        " ('user_to_category_count_pv', 3.6634786916021396),\n",
734 |        " ('itemID_std', 2.973804811358111),\n",
735 |        " ('user_to_category_count_buy_yestday', 2.9720553018294242),\n",
736 |        " ('user_to_age_count', 2.4646399997374417),\n",
737 |        " ('user_to_sex_count', 2.409206256237985),\n",
738 |        " ('user_to_category_lasttime', 2.319085627011794),\n",
739 |        " ('user_to_category_count_pv_yestday', 1.96984573441514),\n",
740 |        " ('age', 1.757174970832472),\n",
741 |        " ('itemID_skew', 1.5063951577357892),\n",
742 |        " ('user_to_shop_count_pv', 1.4966560077500304),\n",
743 |        " ('user_to_category_count_buy_5days', 1.3092884271919525),\n",
744 |        " ('rank', 1.1516030170793725),\n",
745 |        " ('category_count', 1.147464644646628),\n",
746 |        " ('user_to_category_count_5days', 1.1221689244633937),\n",
747 |        " ('category_std', 1.0918870855320673),\n",
748 |        " ('itemID_count', 1.0201773241728571),\n",
749 |        " ('shop_count', 1.0185020472919601),\n",
750 |        " ('user_to_brand_count_pv', 1.0067128105572412),\n",
751 |        " ('user_to_brand_sum', 0.954042675110285),\n",
752 |        " ('user_to_shop_count', 0.9305873028137963),\n",
753 |        " ('ability', 0.8277866337425583),\n",
754 |        " ('user_to_ability_count', 0.7111652189901285),\n",
755 |        " ('brand_count', 0.7090070158121335),\n",
756 |        " ('category_median', 0.681949062526269),\n",
757 |        " ('user_to_brand_count_buy_5days', 0.6600978426957829),\n",
758 |        " ('user_to_brand_count', 0.6417751744994954),\n",
759 |        " ('user_to_category_count_yestday', 0.6290691411487302),\n",
760 |        " ('itemIDlast_time', 0.622966509690606),\n",
761 |        " ('user_to_shop_lasttime', 0.5539912728824887),\n",
762 |        " ('itemID_sum', 0.54067425249639),\n",
763 |        " ('shop_sum', 0.528248736432499),\n",
764 |        " ('brandnum_undercat', 0.36279276742454236),\n",
765 |        " ('user_to_shop_lastday', 0.345745637579721),\n",
766 |        " ('user_to_brand_lasttime', 0.3392726529975986),\n",
767 |        " ('brand_sum', 0.30243324290767826),\n",
768 |        " ('category_sum', 0.2972347540153292),\n",
769 |        " ('user_to_category_count', 0.28108372105321255),\n",
770 |        " ('user_to_shop_count_buy_yestday', 0.26669841669374117),\n",
771 |        " ('user_to_shop_count_buy', 0.23086734344251975),\n",
772 |        " ('user_to_shop_sum', 0.22227705993173602),\n",
773 |        " ('category_skew', 0.21843835730022557),\n",
774 |        " ('rank_percent', 0.21651442093132778),\n",
775 |        " ('user_to_shop_count_pv_yestday', 0.207861807960456),\n",
776 |        " ('user_to_brand_count_buy_yestday', 0.20075945763028505),\n",
777 |        " ('user_to_brand_lastday', 0.18717325139595967),\n",
778 |        " ('shopnum_undercat', 0.17816407228526177),\n",
779 |        " ('shoplast_time', 0.16195270764915062),\n",
780 |        " ('user_to_brand_count_pv_yestday', 0.15628142224579591),\n",
781 |        " ('itemnum_undercat', 0.14901844877622533),\n",
782 |        " ('user_to_category_sum', 0.11326634266490741),\n",
783 |        " ('user_to_brand_count_buy', 0.0763461944218196),\n",
784 |        " ('shoplast_time_hour_ed', 0.06747842920014958),\n",
785 |        " ('user_to_shop_count_buy_5days', 0.04868805672848317),\n",
786 |        " ('user_to_brand_count_pv_5days', 0.026816613726960085),\n",
787 |        " ('user_to_shop_count_pv_5days', 0.014712047290010712),\n",
788 |        " ('sex', 0.013405393491663675),\n",
789 |        " ('itemIDlast_time_hour_ed', 0.002723609146835795),\n",
790 |        " ('brandlast_time', 0.0),\n",
791 |        " ('brandlast_time_hour_ed', 0.0)]"
792 |       ]
793 |      },
794 |      "execution_count": 43,
795 |      "metadata": {},
796 |      "output_type": "execute_result"
797 |     }
798 |    ],
799 |    "source": [
800 |     "sorted(importance.items(), key=lambda x:x[1], reverse=True)"
801 |    ]
802 |   },
803 |   {
804 |    "cell_type": "code",
805 |    "execution_count": null,
806 |    "metadata": {},
807 |    "outputs": [],
808 |    "source": []
809 |   },
810 |   {
811 |    "cell_type": "code",
812 |    "execution_count": null,
813 |    "metadata": {},
814 |    "outputs": [],
815 |    "source": [
816 |     "#####要有LGB和融合的代码！！！！！！！！"
817 |    ]
818 |   },
819 |   {
820 |    "cell_type": "code",
821 |    "execution_count": null,
822 |    "metadata": {},
823 |    "outputs": [],
824 |    "source": []
825 |   },
826 |   {
827 |    "cell_type": "code",
828 |    "execution_count": null,
829 |    "metadata": {},
830 |    "outputs": [],
831 |    "source": []
832 |   },
833 |   {
834 |    "cell_type": "code",
835 |    "execution_count": null,
836 |    "metadata": {},
837 |    "outputs": [],
838 |    "source": []
839 |   },
840 |   {
841 |    "cell_type": "code",
842 |    "execution_count": null,
843 |    "metadata": {},
844 |    "outputs": [],
845 |    "source": []
846 |   }
847 |  ],
848 |  "metadata": {
849 |   "kernelspec": {
850 |    "display_name": "Python 3",
851 |    "language": "python",
852 |    "name": "python3"
853 |   },
854 |   "language_info": {
855 |    "codemirror_mode": {
856 |     "name": "ipython",
857 |     "version": 3
858 |    },
859 |    "file_extension": ".py",
860 |    "mimetype": "text/x-python",
861 |    "name": "python",
862 |    "nbconvert_exporter": "python",
863 |    "pygments_lexer": "ipython3",
864 |    "version": "3.7.3"
865 |   }
866 |  },
867 |  "nbformat": 4,
868 |  "nbformat_minor": 2
869 | }
870 | 


--------------------------------------------------------------------------------
/Semi-Finals/underline_trainning/Step3 Ranking/2_recommendation.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 32,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "import pandas as pd\n",
 10 |     "import numpy as np\n",
 11 |     "import catboost as cat"
 12 |    ]
 13 |   },
 14 |   {
 15 |    "cell_type": "code",
 16 |    "execution_count": 33,
 17 |    "metadata": {},
 18 |    "outputs": [],
 19 |    "source": [
 20 |     "def reduce_mem_usage(df):\n",
 21 |     "    \"\"\" iterate through all the columns of a dataframe and modify the data type\n",
 22 |     "        to reduce memory usage.        \n",
 23 |     "    \"\"\"\n",
 24 |     "    start_mem = df.memory_usage().sum() \n",
 25 |     "    for col in df.columns:\n",
 26 |     "        col_type = df[col].dtype\n",
 27 |     "        \n",
 28 |     "        if col_type != object:\n",
 29 |     "            c_min = df[col].min()\n",
 30 |     "            c_max = df[col].max()\n",
 31 |     "            if str(col_type)[:3] == 'int':\n",
 32 |     "                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:\n",
 33 |     "                    df[col] = df[col].astype(np.int8)\n",
 34 |     "                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:\n",
 35 |     "                    df[col] = df[col].astype(np.int16)\n",
 36 |     "                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:\n",
 37 |     "                    df[col] = df[col].astype(np.int32)\n",
 38 |     "                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:\n",
 39 |     "                    df[col] = df[col].astype(np.int64)  \n",
 40 |     "            else:\n",
 41 |     "                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:\n",
 42 |     "                    df[col] = df[col].astype(np.float16)\n",
 43 |     "                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:\n",
 44 |     "                    df[col] = df[col].astype(np.float32)\n",
 45 |     "                else:\n",
 46 |     "                    df[col] = df[col].astype(np.float64)\n",
 47 |     "        else:\n",
 48 |     "            df[col] = df[col].astype('category')\n",
 49 |     "\n",
 50 |     "    end_mem = df.memory_usage().sum() \n",
 51 |     "    return df\n",
 52 |     "\n",
 53 |     "def load_data(path):\n",
 54 |     "    user = reduce_mem_usage(pd.read_csv(path + 'user.csv',header=None))\n",
 55 |     "    item = reduce_mem_usage(pd.read_csv(path + 'item.csv',header=None))\n",
 56 |     "    data = pd.read_csv(path + 'user_behavior.csv',header=None)\n",
 57 |     "\n",
 58 |     "    data.columns = ['userID','itemID','behavior','timestamp']\n",
 59 |     "    data['day'] = data['timestamp'] // 86400\n",
 60 |     "    data['hour'] = data['timestamp'] // 3600 % 24\n",
 61 |     "    \n",
 62 |     "    ## 生成behavior的onehot\n",
 63 |     "    for i in ['pv','fav','cart','buy']:\n",
 64 |     "        data[i] = 0\n",
 65 |     "        data.loc[data['behavior'] == i, i] = 1\n",
 66 |     "\n",
 67 |     "    ## 生成behavior的加权\n",
 68 |     "    \n",
 69 |     "    data['day_hour'] = data['day'] + data['hour'] / float(24)\n",
 70 |     "    data.loc[data['behavior']=='pv','behavior'] = 1\n",
 71 |     "    data.loc[data['behavior']=='fav','behavior'] = 2\n",
 72 |     "    data.loc[data['behavior']=='cart','behavior'] = 3\n",
 73 |     "    data.loc[data['behavior']=='buy','behavior'] = 1\n",
 74 |     "    max_day = max(data['day'])\n",
 75 |     "    min_day = min(data['day'])\n",
 76 |     "    data['behavior'] = (1 - (max_day-data['day_hour']+2)/(max_day-min_day+2)) * data['behavior'] \n",
 77 |     "\n",
 78 |     "    item.columns = ['itemID','category','shop','brand']\n",
 79 |     "    user.columns = ['userID','sex','age','ability']\n",
 80 |     "    \n",
 81 |     "    data = reduce_mem_usage(data)\n",
 82 |     "\n",
 83 |     "    data = pd.merge(left=data, right=item, on='itemID',how='left')\n",
 84 |     "    data = pd.merge(left=data, right=user, on='userID',how='left')\n",
 85 |     "\n",
 86 |     "    return user, item, data\n",
 87 |     "    "
 88 |    ]
 89 |   },
 90 |   {
 91 |    "cell_type": "code",
 92 |    "execution_count": 34,
 93 |    "metadata": {},
 94 |    "outputs": [],
 95 |    "source": [
 96 |     "user, item, data = load_data(path = '../ECommAI_EUIR_round1_testA_20190701/')\n",
 97 |     "user['age'] = user['age'] // 10\n",
 98 |     "data['age'] = data['age'] // 10"
 99 |    ]
100 |   },
101 |   {
102 |    "cell_type": "code",
103 |    "execution_count": 35,
104 |    "metadata": {},
105 |    "outputs": [],
106 |    "source": [
107 |     "def get_unique_inorder(x, k=50):\n",
108 |     "    result = []\n",
109 |     "    flag = set()\n",
110 |     "    for i in x:\n",
111 |     "        if i[0] not in flag:\n",
112 |     "            result.append(i)\n",
113 |     "            flag.add(i[0])\n",
114 |     "        if len(flag) > k:\n",
115 |     "            break\n",
116 |     "    return result\n",
117 |     "\n",
118 |     "def get_recall_list(train, targetDay, k=300):\n",
119 |     "    train_logs = dict()\n",
120 |     "    path = './'\n",
121 |     "    f = open(path + 'upward_map.txt','r')\n",
122 |     "    upward_map = f.read()\n",
123 |     "    upward_map = eval(upward_map)\n",
124 |     "    f.close()\n",
125 |     "    \n",
126 |     "    f = open(path + 'downward_map.txt','r')\n",
127 |     "    downward_map = f.read()\n",
128 |     "    downward_map = eval(downward_map)\n",
129 |     "    f.close()\n",
130 |     "    \n",
131 |     "\n",
132 |     "    f = open(path + 'item_Apriori.txt','r')\n",
133 |     "    tmp = f.read()\n",
134 |     "    item_dict = eval(tmp)\n",
135 |     "    f.close()\n",
136 |     "    \n",
137 |     "    if targetDay > max(train['day']):\n",
138 |     "        for row in train[['userID','itemID','behavior']].values:\n",
139 |     "            train_logs.setdefault(row[0], dict())\n",
140 |     "            if row[1] in upward_map:\n",
141 |     "                train_logs[row[0]].setdefault(upward_map[row[1]],0)\n",
142 |     "                train_logs[row[0]][upward_map[row[1]]] = max(train_logs[row[0]][upward_map[row[1]]],row[2])\n",
143 |     "    else:\n",
144 |     "        user_List_test = set(train.loc[train['day']==targetDay,'userID'])\n",
145 |     "        train = train[train['day'] < targetDay]\n",
146 |     "        \n",
147 |     "        for row in train[['userID','itemID','behavior']].values:\n",
148 |     "            if row[0] in user_List_test:\n",
149 |     "                train_logs.setdefault(row[0], dict())\n",
150 |     "                if row[1] in upward_map:\n",
151 |     "                    train_logs[row[0]].setdefault(upward_map[row[1]],0)\n",
152 |     "                    train_logs[row[0]][upward_map[row[1]]] = max(train_logs[row[0]][upward_map[row[1]]],row[2])\n",
153 |     "\n",
154 |     "    for each_user in train_logs:\n",
155 |     "        sum_value = sum(train_logs[each_user].values())\n",
156 |     "        if sum_value > 0:\n",
157 |     "            for each_item in train_logs[each_user]:\n",
158 |     "                train_logs[each_user][each_item] /= sum_value            \n",
159 |     "\n",
160 |     "    result_logs = dict()    \n",
161 |     "    for u in train_logs:\n",
162 |     "        result_logs.setdefault(u, list())\n",
163 |     "        for i in set(train_logs[u].keys()):\n",
164 |     "            if i in item_dict:\n",
165 |     "                tmp_list = [ (x[0], train_logs[u][i]*x[1]) for x in item_dict[i]]\n",
166 |     "                result_logs[u] += tmp_list\n",
167 |     "            \n",
168 |     "    for u in result_logs:\n",
169 |     "        result_logs[u] = get_unique_inorder([(downward_map[x[0]], x[1]) for x in sorted(result_logs[u], key=lambda x:x[1], reverse=True)\n",
170 |     "                          if x[0] not in train_logs[u]], k=k)  \n",
171 |     "    \n",
172 |     "    return result_logs"
173 |    ]
174 |   },
175 |   {
176 |    "cell_type": "code",
177 |    "execution_count": 36,
178 |    "metadata": {},
179 |    "outputs": [],
180 |    "source": [
181 |     "test = data[data['day'] < 15]"
182 |    ]
183 |   },
184 |   {
185 |    "cell_type": "code",
186 |    "execution_count": 37,
187 |    "metadata": {},
188 |    "outputs": [],
189 |    "source": [
190 |     "def generate_pairs(recall):\n",
191 |     "    result = []\n",
192 |     "    for u in recall:\n",
193 |     "        rank = 0\n",
194 |     "        lenth = len(recall[u])\n",
195 |     "        for i in recall[u]:\n",
196 |     "            result.append([u,i[0],i[1], rank, rank/lenth])\n",
197 |     "            rank += 1\n",
198 |     "    return result\n",
199 |     "\n",
200 |     "def reshape_recall_to_dataframe(recall):\n",
201 |     "    result = generate_pairs(recall)\n",
202 |     "    result = pd.DataFrame(result)\n",
203 |     "    result.columns = ['userID','itemID','apriori', 'apriori_rank', 'apriori_top']\n",
204 |     "    return result\n",
205 |     "\n"
206 |    ]
207 |   },
208 |   {
209 |    "cell_type": "code",
210 |    "execution_count": 38,
211 |    "metadata": {},
212 |    "outputs": [],
213 |    "source": [
214 |     "#test_recall_logs = get_recall_list(data, targetDay = 15, k=500)"
215 |    ]
216 |   },
217 |   {
218 |    "cell_type": "code",
219 |    "execution_count": 39,
220 |    "metadata": {},
221 |    "outputs": [],
222 |    "source": [
223 |     "#test_recall = reshape_recall_to_dataframe(test_recall_logs)"
224 |    ]
225 |   },
226 |   {
227 |    "cell_type": "code",
228 |    "execution_count": 40,
229 |    "metadata": {},
230 |    "outputs": [],
231 |    "source": [
232 |     "test_recall = pd.read_csv('recall_list_testA_15day_300lenth.csv')\n",
233 |     "test_recall = pd.merge(left=test_recall, right=user, on='userID',how='left')\n",
234 |     "test_recall = pd.merge(left=test_recall, right=item, on='itemID',how='left')"
235 |    ]
236 |   },
237 |   {
238 |    "cell_type": "code",
239 |    "execution_count": 41,
240 |    "metadata": {},
241 |    "outputs": [],
242 |    "source": [
243 |     "underline_features_files = [\n",
244 |     "'brand_count.csv',\n",
245 |     "'brand_sum.csv',\n",
246 |     "'category_count.csv',\n",
247 |     "'category_sum.csv',\n",
248 |     "'itemID_count.csv',\n",
249 |     "'itemID_sum.csv',\n",
250 |     "'shop_count.csv',\n",
251 |     "'shop_sum.csv',\n",
252 |     "'category_lower.csv',\n",
253 |     "'item_rank.csv',\n",
254 |     "'category_higher.csv',\n",
255 |     "'itemID_higher.csv',\n",
256 |     "]\n",
257 |     "\n",
258 |     "underline_features = []\n",
259 |     "for f in underline_features_files:\n",
260 |     "    underline_features.append(reduce_mem_usage(pd.read_csv(f)))"
261 |    ]
262 |   },
263 |   {
264 |    "cell_type": "code",
265 |    "execution_count": 42,
266 |    "metadata": {},
267 |    "outputs": [],
268 |    "source": [
269 |     "for f in underline_features:\n",
270 |     "    test_recall = pd.merge(left=test_recall, right=f, on=f.columns[0], how='left')"
271 |    ]
272 |   },
273 |   {
274 |    "cell_type": "code",
275 |    "execution_count": 43,
276 |    "metadata": {},
277 |    "outputs": [],
278 |    "source": [
279 |     "double_underline_features_files = [\n",
280 |     "'item_to_ability_count_underline.csv',\n",
281 |     "'item_to_sex_count_underline.csv',\n",
282 |     "'item_to_age_count_underline.csv',\n",
283 |     "]\n",
284 |     "\n",
285 |     "double_underline_features = []\n",
286 |     "for f in double_underline_features_files:\n",
287 |     "    double_underline_features.append(pd.read_csv(f, engine='c'))"
288 |    ]
289 |   },
290 |   {
291 |    "cell_type": "code",
292 |    "execution_count": 44,
293 |    "metadata": {},
294 |    "outputs": [],
295 |    "source": [
296 |     "for f in double_underline_features:\n",
297 |     "    test_recall = pd.merge(left=test_recall, right=f, on=list(f.columns[0: 2]), how='left', sort=False)"
298 |    ]
299 |   },
300 |   {
301 |    "cell_type": "code",
302 |    "execution_count": 45,
303 |    "metadata": {},
304 |    "outputs": [],
305 |    "source": [
306 |     "def generate_online_features(data):\n",
307 |     "    online_features = []\n",
308 |     "    for count_feature in ['category','shop','brand']:\n",
309 |     "        online_features.append(data[['behavior','userID',count_feature]].groupby(['userID', count_feature], as_index=False).agg(\n",
310 |     "            {'behavior': 'count'}).rename(columns={'behavior':'user_to_'\n",
311 |     "                                                   + count_feature + '_count'}))\n",
312 |     "    for count_feature in ['category','shop','brand']:\n",
313 |     "        online_features.append(data[['behavior','userID',count_feature]].groupby(['userID', count_feature], as_index=False).agg(\n",
314 |     "            {'behavior': 'sum'}).rename(columns={'behavior':'user_to_' \n",
315 |     "                                                 + count_feature + '_sum'}))\n",
316 |     "    for count_feature in ['category','shop','brand']:\n",
317 |     "        for behavior_type in ['pv','buy']:\n",
318 |     "            online_features.append(data[[behavior_type,'userID',count_feature]].groupby(['userID', count_feature], as_index=False).agg(\n",
319 |     "                {behavior_type: 'sum'}).rename(columns={behavior_type:'user_to_'\n",
320 |     "                                                       + count_feature + '_count_' + behavior_type}))\n",
321 |     "\n",
322 |     "    return online_features\n",
323 |     "\n"
324 |    ]
325 |   },
326 |   {
327 |    "cell_type": "code",
328 |    "execution_count": 46,
329 |    "metadata": {},
330 |    "outputs": [],
331 |    "source": [
332 |     "online_features = generate_online_features(test)\n",
333 |     "\n",
334 |     "for f in online_features:\n",
335 |     "    test_recall = pd.merge(left=test_recall, right=f, on=list(f.columns[0: 2]), how='left')"
336 |    ]
337 |   },
338 |   {
339 |    "cell_type": "code",
340 |    "execution_count": 47,
341 |    "metadata": {},
342 |    "outputs": [],
343 |    "source": [
344 |     "def generate_yestday_features(data):\n",
345 |     "    yestday_features = []\n",
346 |     "    yestday = data[data['day'] == 14]\n",
347 |     "    \n",
348 |     "    for count_feature in ['category','shop','brand']:\n",
349 |     "        yestday_features.append(yestday[['behavior','userID',count_feature]].groupby(['userID', count_feature], as_index=False).agg(\n",
350 |     "            {'behavior': 'count'}).rename(columns={'behavior':'user_to_'\n",
351 |     "                                                   + count_feature + '_count_yestday'}))\n",
352 |     "\n",
353 |     "    for count_feature in ['category','shop','brand']:\n",
354 |     "        for behavior_type in ['pv','buy']:\n",
355 |     "            yestday_features.append(yestday[[behavior_type,'userID',count_feature]].groupby(['userID', count_feature], as_index=False).agg(\n",
356 |     "                {behavior_type: 'sum'}).rename(columns={behavior_type:'user_to_'\n",
357 |     "                                                       + count_feature + '_count_'+behavior_type+'_yestday'}))\n",
358 |     "    return yestday_features"
359 |    ]
360 |   },
361 |   {
362 |    "cell_type": "code",
363 |    "execution_count": 48,
364 |    "metadata": {},
365 |    "outputs": [],
366 |    "source": [
367 |     "def generate_5days_features(data):\n",
368 |     "    a5days = data[(data['day'] > 15 - 5) & (data['day'] < 15)]\n",
369 |     "    five_days_features = []\n",
370 |     "    \n",
371 |     "    for count_feature in ['category','shop','brand']:\n",
372 |     "        five_days_features.append(a5days[['behavior','userID',count_feature]].groupby(['userID', count_feature], as_index=False).agg(\n",
373 |     "            {'behavior': 'count'}).rename(columns={'behavior':'user_to_'\n",
374 |     "                                                   + count_feature + '_count_5days'}))\n",
375 |     "\n",
376 |     "    for count_feature in ['category','shop','brand']:\n",
377 |     "        for behavior_type in ['pv','fav','cart','buy']:\n",
378 |     "            five_days_features.append(a5days[[behavior_type,'userID',count_feature]].groupby(['userID', count_feature], as_index=False).agg(\n",
379 |     "                {behavior_type: 'sum'}).rename(columns={behavior_type:'user_to_'\n",
380 |     "                                                       + count_feature + '_count_' + behavior_type+'_5days'}))\n",
381 |     "    return five_days_features\n",
382 |     "        \n"
383 |    ]
384 |   },
385 |   {
386 |    "cell_type": "code",
387 |    "execution_count": 49,
388 |    "metadata": {},
389 |    "outputs": [],
390 |    "source": [
391 |     "yestday_features = generate_yestday_features(test)\n",
392 |     "\n",
393 |     "for f in yestday_features:\n",
394 |     "    test_recall = pd.merge(left=test_recall, right=f, on=list(f.columns[0: 2]), how='left')"
395 |    ]
396 |   },
397 |   {
398 |    "cell_type": "code",
399 |    "execution_count": 50,
400 |    "metadata": {},
401 |    "outputs": [],
402 |    "source": [
403 |     "five_days_features = generate_5days_features(test)\n",
404 |     "\n",
405 |     "for f in five_days_features:\n",
406 |     "    test_recall = pd.merge(left=test_recall, right=f, on=list(f.columns[0: 2]), how='left')"
407 |    ]
408 |   },
409 |   {
410 |    "cell_type": "code",
411 |    "execution_count": 51,
412 |    "metadata": {},
413 |    "outputs": [],
414 |    "source": [
415 |     "time_features_files = [\n",
416 |     "'itemID_last_time_underline.csv',\n",
417 |     "'brand_last_time_underline.csv',\n",
418 |     "'shop_last_time_underline.csv'\n",
419 |     "]\n",
420 |     "\n",
421 |     "\n",
422 |     "time_features = []\n",
423 |     "for f in time_features_files:\n",
424 |     "    time_features.append(reduce_mem_usage(pd.read_csv(f)))"
425 |    ]
426 |   },
427 |   {
428 |    "cell_type": "code",
429 |    "execution_count": 52,
430 |    "metadata": {},
431 |    "outputs": [],
432 |    "source": [
433 |     "for f in time_features:\n",
434 |     "    test_recall = pd.merge(left=test_recall, right=f, on=f.columns[0], how='left')"
435 |    ]
436 |   },
437 |   {
438 |    "cell_type": "code",
439 |    "execution_count": 53,
440 |    "metadata": {},
441 |    "outputs": [],
442 |    "source": [
443 |     "def generate_dynamic_features(data):\n",
444 |     "    dynamic_time_features = []\n",
445 |     "    test = data[data['day'] < 15]\n",
446 |     "    start_timestamp  = max(test['timestamp'])\n",
447 |     "    test['lasttime'] = start_timestamp - test['timestamp']\n",
448 |     "    \n",
449 |     "    for dynamic_time_feature in ['shop', 'category','brand']:\n",
450 |     "        dynamic_time_features.append(test[['lasttime','userID',dynamic_time_feature,'day']].groupby(['userID',dynamic_time_feature], as_index=False).agg({'lasttime': 'min', 'day':'max'}).rename(columns={'lasttime': 'user_to_'\n",
451 |     "                                                       + dynamic_time_feature + '_lasttime', 'day':'user_to_'+ dynamic_time_feature + '_lastday'}))\n",
452 |     "    return dynamic_time_features"
453 |    ]
454 |   },
455 |   {
456 |    "cell_type": "code",
457 |    "execution_count": 54,
458 |    "metadata": {},
459 |    "outputs": [],
460 |    "source": [
461 |     "dynamic_time_features = generate_dynamic_features(test)\n",
462 |     "for f in dynamic_time_features:\n",
463 |     "    test_recall = pd.merge(left=test_recall, right=f, on=list(f.columns[0: 2]), how='left')"
464 |    ]
465 |   },
466 |   {
467 |    "cell_type": "code",
468 |    "execution_count": 55,
469 |    "metadata": {},
470 |    "outputs": [],
471 |    "source": [
472 |     "cbt_model = cat.CatBoostClassifier()"
473 |    ]
474 |   },
475 |   {
476 |    "cell_type": "code",
477 |    "execution_count": 56,
478 |    "metadata": {},
479 |    "outputs": [],
480 |    "source": [
481 |     "cbt_model = cbt_model.load_model('model0924_base.file')"
482 |    ]
483 |   },
484 |   {
485 |    "cell_type": "code",
486 |    "execution_count": 1,
487 |    "metadata": {},
488 |    "outputs": [],
489 |    "source": [
490 |     "## About Model Averaging\n",
491 |     "\n",
492 |     "## WE DIDNT TEST MODEL AVERAGING OFFLINE, BUT WE APPLIED IT ONLINE\n",
493 |     "## PLEASE SEE FILE IN /CIKM-2019-AnalytiCup/Semi-Finals/online_recommendation/\n",
494 |     "\n",
495 |     "#0.045965784783714\n",
496 |     "# test_recall['ensemble'] = 10 / ( 5/test_recall['label_lgb'] + 5/test_recall['label'])\n",
497 |     "\n",
498 |     "# #0.045943749548558184\n",
499 |     "# test_recall['ensemble_power'] = np.power( test_recall['label_lgb']**4.8 * test_recall['label']**5.2 , 1/10)\n",
500 |     "\n",
501 |     "# #0.045996441844155474\n",
502 |     "# test_recall['ensemble_final'] = test_recall['ensemble']*0.5 + test_recall['ensemble_power'] * 0.5"
503 |    ]
504 |   },
505 |   {
506 |    "cell_type": "code",
507 |    "execution_count": 57,
508 |    "metadata": {},
509 |    "outputs": [],
510 |    "source": [
511 |     "features = [x for x in test_recall.columns if x not in ['itemID','userID','category','shop','brand','label']]"
512 |    ]
513 |   },
514 |   {
515 |    "cell_type": "code",
516 |    "execution_count": 58,
517 |    "metadata": {},
518 |    "outputs": [],
519 |    "source": [
520 |     "test_recall['label'] = cbt_model.predict_proba(test_recall[features])[:,1]\n",
521 |     "\n",
522 |     "train_logs = dict()\n",
523 |     "train_ = data[data['day'] < 15]\n",
524 |     "for row in train_[['userID','itemID']].values:\n",
525 |     "    train_logs.setdefault(row[0], [])\n",
526 |     "    train_logs[row[0]].append(row[1])\n",
527 |     "\n",
528 |     "test_logs = dict()\n",
529 |     "test_ = data[data['day'] == 15]\n",
530 |     "for row in test_[['userID','itemID']].values:\n",
531 |     "    test_logs.setdefault(row[0], [])\n",
532 |     "    test_logs[row[0]].append(row[1])\n",
533 |     "    \n",
534 |     "\n",
535 |     "result_logs = dict()\n",
536 |     "test_recall = test_recall.sort_values('label', ascending=False).reset_index(drop=True)\n",
537 |     "for row in test_recall[['userID','itemID']].values:\n",
538 |     "    result_logs.setdefault(row[0], [])\n",
539 |     "    if len(result_logs[row[0]]) < 50:\n",
540 |     "        result_logs[row[0]].append(row[1])\n",
541 |     "\n",
542 |     "temp = data.groupby(['itemID'], as_index=False).count()[['itemID','userID']]\n",
543 |     "hot_items = list(temp.sort_values('userID', ascending=False).reset_index(drop=True)['itemID'][:100])"
544 |    ]
545 |   },
546 |   {
547 |    "cell_type": "code",
548 |    "execution_count": 62,
549 |    "metadata": {},
550 |    "outputs": [],
551 |    "source": [
552 |     "import time"
553 |    ]
554 |   },
555 |   {
556 |    "cell_type": "code",
557 |    "execution_count": 69,
558 |    "metadata": {},
559 |    "outputs": [],
560 |    "source": [
561 |     "a = time.time()\n",
562 |     "rec_dict = dict()\n",
563 |     "for u in set(data['userID']):\n",
564 |     "    if u in result_logs:\n",
565 |     "        lenth = len(result_logs[u])\n",
566 |     "        if lenth < 50:\n",
567 |     "            rec_dict[u] = result_logs[u] + [x for x in hot_items if x not in result_logs[u] and x not in train_logs[u]][:50 - lenth]\n",
568 |     "        else:\n",
569 |     "            rec_dict[u] = result_logs[u]\n",
570 |     "    else:\n",
571 |     "        rec_dict[u] = [x for x in hot_items][:50]\n",
572 |     "b=time.time()"
573 |    ]
574 |   },
575 |   {
576 |    "cell_type": "code",
577 |    "execution_count": 70,
578 |    "metadata": {},
579 |    "outputs": [
580 |     {
581 |      "data": {
582 |       "text/plain": [
583 |        "2.221940040588379"
584 |       ]
585 |      },
586 |      "execution_count": 70,
587 |      "metadata": {},
588 |      "output_type": "execute_result"
589 |     }
590 |    ],
591 |    "source": [
592 |     "b - a"
593 |    ]
594 |   },
595 |   {
596 |    "cell_type": "code",
597 |    "execution_count": 71,
598 |    "metadata": {},
599 |    "outputs": [],
600 |    "source": [
601 |     "def recall(dict1, dict2, train_dict):\n",
602 |     "    '''\n",
603 |     "    dict1 是真值 dict2 是预测值.\n",
604 |     "    '''\n",
605 |     "    \n",
606 |     "    result = 0\n",
607 |     "    count = 0\n",
608 |     "    for i in dict1:\n",
609 |     "        if i in dict2 and i in train_dict:\n",
610 |     "            new_item = set()\n",
611 |     "    \n",
612 |     "            for k in dict1[i]:\n",
613 |     "                if k not in train_dict[i]:\n",
614 |     "                    new_item.add(k)\n",
615 |     "            if new_item:\n",
616 |     "                result += len(new_item & set(dict2[i])) / len(new_item)\n",
617 |     "                count += 1\n",
618 |     "            \n",
619 |     "    if count == 0:\n",
620 |     "        return 0\n",
621 |     "    else:\n",
622 |     "        return result / count\n",
623 |     "\n"
624 |    ]
625 |   },
626 |   {
627 |    "cell_type": "code",
628 |    "execution_count": 72,
629 |    "metadata": {},
630 |    "outputs": [
631 |     {
632 |      "data": {
633 |       "text/plain": [
634 |        "0.04542858771389073"
635 |       ]
636 |      },
637 |      "execution_count": 72,
638 |      "metadata": {},
639 |      "output_type": "execute_result"
640 |     }
641 |    ],
642 |    "source": [
643 |     "recall(test_logs, rec_dict, train_logs)"
644 |    ]
645 |   },
646 |   {
647 |    "cell_type": "code",
648 |    "execution_count": 31,
649 |    "metadata": {},
650 |    "outputs": [
651 |     {
652 |      "ename": "NameError",
653 |      "evalue": "name 'test_recall_logs' is not defined",
654 |      "output_type": "error",
655 |      "traceback": [
656 |       "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
657 |       "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
658 |       "\u001b[0;32m<ipython-input-31-a84d35211ea1>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mrecall\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtest_logs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mx\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mtest_recall_logs\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mx\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mtest_recall_logs\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtrain_logs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
659 |       "\u001b[0;31mNameError\u001b[0m: name 'test_recall_logs' is not defined"
660 |      ]
661 |     }
662 |    ],
663 |    "source": [
664 |     "recall(test_logs, {x:[x[0] for x in test_recall_logs[x]] for x in test_recall_logs}, train_logs)"
665 |    ]
666 |   }
667 |  ],
668 |  "metadata": {
669 |   "kernelspec": {
670 |    "display_name": "Python 3",
671 |    "language": "python",
672 |    "name": "python3"
673 |   },
674 |   "language_info": {
675 |    "codemirror_mode": {
676 |     "name": "ipython",
677 |     "version": 3
678 |    },
679 |    "file_extension": ".py",
680 |    "mimetype": "text/x-python",
681 |    "name": "python",
682 |    "nbconvert_exporter": "python",
683 |    "pygments_lexer": "ipython3",
684 |    "version": "3.7.1"
685 |   }
686 |  },
687 |  "nbformat": 4,
688 |  "nbformat_minor": 2
689 | }
690 | 


--------------------------------------------------------------------------------
/初赛方案简介.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChuanyuXue/CIKM-2019-AnalytiCup/921f88589fac643c700635e8bb32542b240ceac1/初赛方案简介.pdf


--------------------------------------------------------------------------------
/复赛方案简介.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChuanyuXue/CIKM-2019-AnalytiCup/921f88589fac643c700635e8bb32542b240ceac1/复赛方案简介.pdf


--------------------------------------------------------------------------------
/答辩ppt.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChuanyuXue/CIKM-2019-AnalytiCup/921f88589fac643c700635e8bb32542b240ceac1/答辩ppt.pptx


--------------------------------------------------------------------------------