├── 1 ├── tool.py ├── rf.py ├── xgb.py └── tianchi.py ├── 2 ├── tool.py ├── rf.py └── tianchi.py └── README.md /2/tool.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | 4 | import os 5 | 6 | 7 | def get_result(dir_name, file_name): 8 | """ 9 | dir_name: 文件夹名称 10 | file_name: 结果文件名 11 | """ 12 | file_list = os.listdir(dir_name) 13 | fw = open(file_name, 'w') 14 | fw.write('row_id,shop_id\n') 15 | for file_name in file_list: 16 | if 'm_' in file_name: 17 | with open(dir_name + file_name, 'r') as f: 18 | for line in f.readlines()[1:]: 19 | if line.strip() != '': 20 | fw.write(line) 21 | fw.close() 22 | 23 | 24 | if __name__ == '__main__': 25 | # 获取总的结果 26 | get_result('./mall_results/', 'result.csv') 27 | -------------------------------------------------------------------------------- /1/tool.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import os 3 | 4 | def get_result(dir_name, file_name): 5 | """ 6 | dir_name: 文件夹名称 7 | file_name: 结果文件名 8 | """ 9 | file_list = os.listdir(dir_name) 10 | fw = open(file_name, 'w') 11 | fw.write('row_id,shop_id\n') 12 | for file_name in file_list: 13 | if 'm_' in file_name: 14 | with open(dir_name + file_name, 'r') as f: 15 | for line in f.readlines()[1:]: 16 | if line.strip() != '': 17 | fw.write(line) 18 | fw.close() 19 | 20 | if __name__ == '__main__': 21 | # 获得随机森林的预测结果 22 | get_result('./rf_result/', 'rf_result.csv') 23 | # 获得XGBoost的预测结果 24 | get_result('./xgb_result/', 'xgb_result.csv') -------------------------------------------------------------------------------- /2/rf.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from sklearn.ensemble import RandomForestClassifier 4 | from sklearn.model_selection import train_test_split 5 | from sklearn import preprocessing 6 | 7 | class RF: 8 | def __init__(self): 9 | # 采用默认参数200,不寻参 10 | self.n_estimators = 200 11 | 12 | def train_prob(self, X, shop_ids, TEST): 13 | """ 14 | 返回预测概率 15 | X: 训练集 vector 16 | shop_ids: 训练集标签 17 | TEST: 测试集 vector 18 | """ 19 | lbl = preprocessing.LabelEncoder() 20 | lbl.fit(shop_ids) 21 | y = lbl.transform(shop_ids) 22 | rf = RandomForestClassifier(n_jobs=-1, n_estimators=self.n_estimators) 23 | rf.fit(X, y) 24 | predict_prob = rf.predict_proba(TEST) 25 | return predict_prob, lbl 26 | 27 | def train(self, X, shop_ids, TEST): 28 | """ 29 | 预测标签 30 | """ 31 | lbl = preprocessing.LabelEncoder() 32 | lbl.fit(shop_ids) 33 | y = lbl.transform(shop_ids) 34 | rf = RandomForestClassifier(n_jobs=-1, n_estimators=self.n_estimators) 35 | rf.fit(X, y) 36 | predict = rf.predict(TEST) 37 | predict_ids = [lbl.inverse_transform(int(x)) for x in predict] 38 | return predict_ids 39 | -------------------------------------------------------------------------------- /1/rf.py: -------------------------------------------------------------------------------- 1 | from sklearn.ensemble import RandomForestClassifier 2 | from sklearn import preprocessing 3 | from sklearn.model_selection import train_test_split 4 | 5 | 6 | class RF: 7 | """ 8 | Random Forest 9 | """ 10 | def __init__(self): 11 | self.n_estimators_options = [100, 120, 140, 160, 180, 200] 12 | self.best_n_estimators = 0 13 | self.best_acc = 0 14 | 15 | def train(self, mall_id, X, shop_ids, TEST, row_ids): 16 | """ 17 | mall_id: 商场 ID 18 | X: 训练集 vector 19 | shop_ids: 训练集标签 20 | TEST: 测试集 vector 21 | row_ids: 测试集行号 22 | """ 23 | lbl = preprocessing.LabelEncoder() 24 | lbl.fit(shop_ids) 25 | y = lbl.transform(shop_ids) 26 | # 划分训练集和验证集 27 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) 28 | # 简单寻参 29 | for n_estimators_size in self.n_estimators_options: 30 | alg = RandomForestClassifier(n_jobs=-1, n_estimators=n_estimators_size) 31 | alg.fit(X_train, y_train) 32 | predict = alg.predict(X_test) 33 | acc = (y_test == predict).mean() 34 | print(n_estimators_size, acc) 35 | if acc >= self.best_acc: 36 | self.best_acc = acc 37 | self.best_n_estimators = n_estimators_size 38 | # 定义模型,训练 39 | rf = RandomForestClassifier(n_jobs=-1, n_estimators=self.best_n_estimators) 40 | rf.fit(X, y) 41 | predict = rf.predict(TEST) 42 | predict_result = [lbl.inverse_transform(int(x)) for x in predict] 43 | with open('./rf_result/' + str(mall_id) + '_result.csv', 'w') as f: 44 | f.write('row_id,shop_id\n') 45 | for i, row_id in enumerate(row_ids): 46 | f.write('%s,%s\n' %(row_id, predict_result[i])) 47 | -------------------------------------------------------------------------------- /1/xgb.py: -------------------------------------------------------------------------------- 1 | import xgboost as xgb 2 | from sklearn.model_selection import train_test_split 3 | from sklearn import preprocessing 4 | 5 | 6 | def analyse(mall_id, X, shop_ids): 7 | """ 8 | 划分训练集和验证集,计算 ACC 9 | """ 10 | lbl = preprocessing.LabelEncoder() 11 | lbl.fit(shop_ids) 12 | y = lbl.transform(shop_ids) 13 | # 计算类别数 14 | num_class = y.max() + 1 15 | # 划分训练集和验证集 16 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) 17 | xg_train = xgb.DMatrix(X_train, label=y_train) 18 | xg_test = xgb.DMatrix(X_test, label=y_test) 19 | watchlist = [(xg_train, 'train'), (xg_test, 'test')] 20 | # 定义参数 21 | params = { 22 | 'objective': 'multi:softmax', 23 | 'eta': 0.1, 24 | 'max_depth': 9, 25 | 'eval_metric': 'merror', 26 | 'seed': 0, 27 | 'missing': -999, 28 | 'num_class': num_class, 29 | 'silent': 1, 30 | } 31 | bst = xgb.train(params, xg_train, 60, watchlist, early_stopping_rounds=15) 32 | pred = bst.predict(xg_test) 33 | acc = (y_test == pred).mean() 34 | print('accuracy: %s' %acc) 35 | 36 | 37 | def train(mall_id, X, shop_ids, TEST, row_ids): 38 | """ 39 | 训练预测 40 | """ 41 | lbl = preprocessing.LabelEncoder() 42 | lbl.fit(shop_ids) 43 | y = lbl.transform(shop_ids) 44 | num_class = y.max() + 1 45 | xg_train = xgb.DMatrix(X, label=y) 46 | xg_test = xgb.DMatrix(TEST) 47 | watchlist = [(xg_train, 'train')] 48 | params = { 49 | 'objective': 'multi:softmax', 50 | 'eta': 0.1, 51 | 'max_depth': 9, 52 | 'eval_metric': 'merror', 53 | 'seed': 0, 54 | 'missing': -999, 55 | 'num_class': num_class, 56 | 'silent': 1, 57 | } 58 | bst = xgb.train(params, xg_train, 60, watchlist, early_stopping_rounds=15) 59 | pred = bst.predict(xg_test) 60 | pred = [lbl.inverse_transform(int(x)) for x in pred] 61 | # 写出结果到文件 62 | with open('./xgb_result/' + str(mall_id) + '_result.csv', 'w') as f: 63 | f.write('row_id,shop_id\n') 64 | for i, row_id in enumerate(row_ids): 65 | f.write('%s,%s\n' %(row_id, pred[i])) 66 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 天池大赛——商场中精确定位用户所在店铺 2 | 3 | 前些日子,受同学“鼓动”,到天池官网注册了一个帐号,“顺理成章”的参加了这个“演技满满”的比赛。“影帝”们的表演还在继续,我等“龙套演员”已经领盒饭走人了。最终B榜成绩:119 / 2845,accuracy为0.9083。特此分享一波自己的解题过程。抛砖(确定是砖)引玉,希望“影帝们”赛后可以稍加点拨。让我们的“演(技)技(术)”也得到一丁点儿的提升,就很感激了。 4 | 5 | 6 | 在这里特别感谢技术圈各位大神之前的开源,让我学到很多。谢谢! 7 | 8 | ## 解题思路 9 | 10 | 这种比赛每个人写代码可能都有自己的风格,所以这里特别说明一下自己的解题过程。 11 | 12 | ### 时间戳 13 | 首先自己对时间戳做过一定的处理。比如分时间段、分周末和工作日,可能是姿势不太正确。并没有起到好的效果,反而“逆上分”。这里仅仅截取时间戳的预处理函数,如果想尝试的,可以自行尝试一下。 14 | 15 | ``` python 16 | @staticmethod 17 | def timestamp_handle(df): 18 | mdays = [] 19 | wdays = [] 20 | hours = [] 21 | mins = [] 22 | for timestamp in df['time_stamp']: 23 | timeArray = time.strptime(timestamp, "%Y-%m-%d %H:%M") 24 | mdays.append(timeArray.tm_mday) 25 | wdays.append(timeArray.tm_wday) 26 | hours.append(timeArray.tm_hour) 27 | mins.append(timeArray.tm_min) 28 | df['time_mday'] = mdays 29 | df['time_wday'] = wdays 30 | df['time_hour'] = hours 31 | df['time_min'] = mins 32 | df = df.drop(['time_stamp'], axis=1) 33 | return df 34 | ``` 35 | 36 | ### 经纬度信息 37 | 首先用户的经纬度存在不准确的情况。这部分用户可以通过一个偏差上限来甄别,只要用户的经纬度偏差大于这个上限,说明这个用户的经纬度是不准确的,在接下来的计算过程中,不考虑这个用户的经纬度。对于经纬度比较准确的用户,自己通过计算欧式距离或者经纬度距离基本都可以排除20%左右(或者更少)的商铺。这里仅仅提到有这个可以缩小范围的方式,但自己在代码中并没有采用。对于经纬度,自己在代码中只使用了原始数据,并没有做任何处理。 38 | 39 | 40 | ### WIFI信息 41 | 整个题目,自己基本都在围绕WIFI信息做文章。之前有大神开源过一份XGBoost的代码(非常感谢)。自己在代码中有尝试融合,自己当时选用的是Random Forest算法。也能取得基本一致的结果(按自己的处理方式,Random Forest的表现可能要好一点儿),很遗憾,这个题自己没有找到XGBoost的调参姿势(时间开销),所以在结果基本一致的情况下,自己选择了Random Forest算法。其中这里开源了两份代码,分别位于**文件夹1**和**文件夹2**,其中**文件夹1**的代码跟大神的思路是一样的,筛选出WIFI作为特征,并保留经纬度,整个的特征向量的结构为[longitude, latitude, wifi1, wifi2, wifi3 ...],可能差别就插在挑选WIFI的方式上,自己是按照TF值来挑选了一部分,然后按照整个商场中WIFI的出现次数补充了一部分。这种方式A榜成绩为0.9075,B榜成绩为0.9054。**文件夹2**的代码是在文件夹1的代码的基础之上做了部分改动。首先利用随机森林预测得到一部分测试集的结果(对某个商铺的预测概率大于等于0.8,则认为随机森林预测结果可靠),之后对于剩下的测试集,计算每个用户的WIFI和每个商铺的WIFI(挑选出来的,假设这个WIFI属于这个商铺)的余弦相似度,然后排序,挑选出**最多**5个(之所以说最多是因为用这种方式,有的用户的候选集里的商铺数不足5个)商铺作为候选集。然后将候选集相同的测试集样例进行合并到一起,然后依照候选集划分出一个个小的测试集。然后提取每个候选集里的商铺对应的训练集数据,构建分类器,然后进行预测(这样就减少了分类数,但本质上还是多分类)。最终B榜成绩0.9083。 42 | 43 | 44 | ### 其他 45 | - 关于WIFI连接,自己统计过一个数据,当一个WIFI被连接的时候,用户在哪个商铺前?发现有部分WIFI,当WIFI被连接次数大于等于10次,用户在某个商铺前的概率为1。但融合到最终结果。结果有了小小的下降。自己不太明白为什么,要么是自己计算错误,要么是前后两个月的WIFI有比较大的差别?(生活所迫,并没有细究下去,哈哈)。 46 | 47 | - TextCNN,这部分只是自己觉得好玩儿,做了个实验。先确定出每个SHOP的WIFI列表,然后给定一个用户的WIFI信息,与每个SHOP的WIFI比对,该SHOP是否有该WIFI,有的话,在对应的位置上填上WIFI强度。这样最后的特征向量的为“**店铺数 × 每个店铺的WIFI数**”样式的矩阵结构,如此构造特征向量是可以收敛的,最终的成绩能在0.9左右。 48 | 49 | - 时间有限(好朋友的项目都给耽误了,sorry)(老师的项目不敢怠慢~),其他信息这里没有再考虑了~,比赛也到此为止了。 50 | - 再说一下收获吧:技(演)术(技)真的非常重要~,自己有很多不足,多读 paper,多多实践吧。 51 | 52 | 53 | 54 | 大致如上,水文一篇,如果大家有什么好的思路,希望多多指导~,谢谢!QQ: 765422195 55 | 56 | 多多加星哦~ 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | -------------------------------------------------------------------------------- /1/tianchi.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import numpy as np 4 | import pandas as pd 5 | from multiprocessing import Pool 6 | from sklearn.preprocessing import MinMaxScaler 7 | import xgb 8 | from rf import RF 9 | 10 | 11 | # 12个进程 12 | SPLITS = 12 13 | 14 | class TianChi: 15 | def __init__(self): 16 | """ 17 | 初始化函数,加载数据,连接数据 18 | """ 19 | self.shop_info = pd.read_csv('./data/shop_info.csv', sep=',', encoding='utf8', engine='c') 20 | self.user_info = pd.read_csv('./data/user_shop_behavior.csv', sep=',', encoding='utf8', engine='c') 21 | self.evl_data = pd.read_csv('./data/evaluation.csv', sep=',', encoding='utf8', engine='c') 22 | self.train_data = pd.merge(self.user_info, self.shop_info, on=['shop_id']) 23 | 24 | def wifi_info_process(self, wifi_info): 25 | """ 26 | 预处理 wifi_infos 字段 27 | """ 28 | wifi_name = wifi_info[0] 29 | wifi_isty = int(wifi_info[1]) 30 | wifi_conn = wifi_info[2] 31 | if wifi_isty <= -100: 32 | wifi_isty = 0 33 | else: 34 | wifi_isty = (wifi_isty + 100) / 100.0 35 | if wifi_conn == 'true': 36 | wifi_conn = True 37 | else: 38 | wifi_conn = False 39 | return [wifi_name, wifi_isty, wifi_conn] 40 | 41 | def mall_init(self, mall_id, train_mall_df, evl_mall_df): 42 | """ 43 | 商场的数据结构初始化 44 | mall_id: 商场ID 45 | train_mall_df: 训练集 DataFrame 46 | evl_mall_df: 测试集 DataFrame 47 | """ 48 | self.shops = {} 49 | shop_list = self.shop_info[self.shop_info.mall_id == mall_id].shop_id.unique() 50 | for shop_id in shop_list: 51 | self.shops[shop_id] = train_mall_df[train_mall_df.shop_id == shop_id] 52 | print('MALL ID: %s\nTRAIN NUM: %s\nEVL_NUM: %s\nSHOP_NUM: %s' %(mall_id, train_mall_df.shape[0], evl_mall_df.shape[0], len(shop_list))) 53 | # 统计每个SHOP的WIFI数和每个SHOP的每个WIFI的强度和 54 | self.shops_wifi_count = {} 55 | self.shops_wifi_isty = {} 56 | for shop_id in self.shops: 57 | shop = self.shops[shop_id] 58 | shop_wifi_count = {} 59 | shop_wifi_isty = {} 60 | for index, row in shop.iterrows(): 61 | for wifi_info in row['wifi_infos']: 62 | wifi_name = wifi_info[0] 63 | wifi_isty = wifi_info[1] 64 | if wifi_name not in shop_wifi_count: 65 | shop_wifi_count[wifi_name] = 1 66 | shop_wifi_isty[wifi_name] = wifi_isty 67 | else: 68 | shop_wifi_count[wifi_name] += 1 69 | shop_wifi_isty[wifi_name] += wifi_isty 70 | self.shops_wifi_count[shop_id] = shop_wifi_count 71 | self.shops_wifi_isty[shop_id] = shop_wifi_isty 72 | 73 | # 求每家商铺的 WIFI 的平均强度 74 | for shop_id in self.shops_wifi_isty: 75 | shop_wifi_isty = self.shops_wifi_isty[shop_id] 76 | shop_wifi_count = self.shops_wifi_count[shop_id] 77 | for wifi_name in shop_wifi_isty: 78 | shop_wifi_isty[wifi_name] = float(shop_wifi_isty[wifi_name]) / (shop_wifi_count[wifi_name]) 79 | self.shops_wifi_isty[shop_id] = shop_wifi_isty 80 | 81 | # 商场中每家商铺的每个WIFI的TF值 82 | self.shops_wifi_tf = {} 83 | for shop_id in self.shops_wifi_count: 84 | shop_wifi_count = self.shops_wifi_count[shop_id] 85 | shop_wifi_tf = {} 86 | total_num = float(sum(shop_wifi_count.values())) 87 | for wifi_name in shop_wifi_count: 88 | shop_wifi_tf[wifi_name] = shop_wifi_count[wifi_name] / total_num 89 | self.shops_wifi_tf[shop_id] = shop_wifi_tf 90 | 91 | # 统计整个商场中,每个WIFI的出现次数 92 | train_wifi_count = {} 93 | for wifi_infos in train_mall_df['wifi_infos']: 94 | for wifi_info in wifi_infos: 95 | wifi_name = wifi_info[0] 96 | if wifi_name not in train_wifi_count: 97 | train_wifi_count[wifi_name] = 1 98 | else: 99 | train_wifi_count[wifi_name] += 1 100 | 101 | # 统计商场中WIFI出现次数大于10的WIFI 102 | train_wifi_gt10 = set() 103 | for wifi_name in train_wifi_count: 104 | if train_wifi_count[wifi_name] >= 10: 105 | train_wifi_gt10.add(wifi_name) 106 | 107 | # 筛选出做特征的WIFI 108 | self.wifi = {} 109 | wifi_num = 0 110 | # 1. 筛选出每个SHOP的WIFI的TF值大于0.02的WIFI 111 | for shop_id in self.shops_wifi_tf: 112 | shop_wifi_tf = self.shops_wifi_tf[shop_id] 113 | for wifi_name in shop_wifi_tf: 114 | if shop_wifi_tf[wifi_name] >= 0.02: 115 | if wifi_name not in self.wifi: 116 | self.wifi[wifi_name] = wifi_num 117 | wifi_num += 1 118 | # 2. 筛选出整个商场中WIFI出现次数大于10的WIFI 119 | for wifi_name in train_wifi_gt10: 120 | if wifi_name not in self.wifi: 121 | self.wifi[wifi_name] = wifi_num 122 | wifi_num += 1 123 | # 3. 筛选出整个商场中WIFI出现次数的TOP10% 124 | wifi_perc10 = sorted(train_wifi_count.items(), key=lambda d: d[1], reverse=True) 125 | wifi_perc10 = [x[0] for x in wifi_perc10[:int(len(train_wifi_count)*0.10)]] 126 | for wifi_name in wifi_perc10: 127 | if wifi_name not in self.wifi: 128 | self.wifi[wifi_name] = wifi_num 129 | wifi_num += 1 130 | self.wifi_num = wifi_num 131 | print('WIFI NUM:', self.wifi_num) 132 | 133 | def f_wifi(self, row): 134 | for wifi_info in row['wifi_infos']: 135 | wifi_name = wifi_info[0] 136 | wifi_intensity = wifi_info[1] 137 | if wifi_name in self.wifi: 138 | row[-1 - self.wifi[wifi_name]] = wifi_intensity 139 | return row 140 | 141 | def apply_f_wifi(self, df): 142 | return df.apply(self.f_wifi, axis=1, raw=True) 143 | 144 | def get_wifi_vector(self, df): 145 | df_temp = pd.DataFrame(columns=['wifi_' + str(i) for i in range(self.wifi_num)]) 146 | df = pd.concat([df, df_temp], axis=1) 147 | df_parts_temp = np.array_split(df, SPLITS) 148 | with Pool(processes=SPLITS) as pool: 149 | df_parts = pool.map(self.apply_f_wifi, df_parts_temp) 150 | df = pd.concat(df_parts) 151 | return df 152 | 153 | def run(self): 154 | 155 | mall_list = self.shop_info.mall_id.unique() 156 | for mall_id in mall_list: 157 | if mall_id != 'm_6803': 158 | continue 159 | # 提取训练集数据和验证集数据 160 | train_mall_df = self.train_data[self.train_data.mall_id == mall_id] 161 | evl_mall_df = self.evl_data[self.evl_data.mall_id == mall_id] 162 | train_mall_df.rename(columns={'longitude_x': 'longitude', 'latitude_x': 'latitude'}, inplace=True) 163 | # wif_infos 字段的预处理 164 | train_mall_df['wifi_infos'] = train_mall_df['wifi_infos'].apply(lambda x: [self.wifi_info_process(wifi.split('|')) for wifi in x.split(';')]) 165 | evl_mall_df['wifi_infos'] = evl_mall_df['wifi_infos'].apply(lambda x: [self.wifi_info_process(wifi.split('|')) for wifi in x.split(';')]) 166 | # 提取训练集标签和测试集行号 167 | row_ids = list(evl_mall_df['row_id']) 168 | shop_ids = list(train_mall_df['shop_id']) 169 | # 提取需要的列 170 | train_columns = ['longitude', 'latitude', 'wifi_infos', 'shop_id'] 171 | evl_columns = ['longitude', 'latitude', 'wifi_infos', 'row_id'] 172 | train_mall_df = train_mall_df[train_columns] 173 | evl_mall_df = evl_mall_df[evl_columns] 174 | # mall 数据结构初始化 175 | self.mall_init(mall_id, train_mall_df,evl_mall_df) 176 | # 连接train_mall_df和evl_mall_df进行预处理 177 | df = pd.concat([train_mall_df, evl_mall_df]) 178 | df = self.get_wifi_vector(df) 179 | columns = ['longitude', 'latitude'] + ['wifi_' + str(i) for i in range(len(self.wifi))] 180 | df = df[columns] 181 | df = df.fillna(0) 182 | X = np.asarray(df, dtype=np.float64) 183 | min_max_scaler = MinMaxScaler() 184 | min_max_scaler.fit(X) 185 | X = min_max_scaler.transform(X) 186 | # 分离出训练集和测试集 187 | X_train = X[:len(shop_ids)] 188 | X_test = X[len(shop_ids):] 189 | rf = RF() 190 | rf.train(mall_id, X_train, shop_ids, X_test, row_ids) 191 | xgb.analyse(mall_id, X_train, shop_ids) 192 | xgb.train(mall_id, X_train, shop_ids, X_test, row_ids) 193 | print('='*120) 194 | 195 | 196 | if __name__ == '__main__': 197 | data = TianChi() 198 | data.run() 199 | -------------------------------------------------------------------------------- /2/tianchi.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import numpy as np 4 | import pandas as pd 5 | from multiprocessing import Pool 6 | from sklearn.preprocessing import MinMaxScaler 7 | import time 8 | from tool import rf 9 | import re 10 | import os 11 | import math 12 | 13 | 14 | # 进程数(将DataFrame划分成 SPLITS 块,每块交给一个进程处理) 15 | SPLITS = 12 16 | # 候选集样例数(用余弦相似度选出候选集) 17 | CANDIDATE_NUM = 5 18 | 19 | 20 | def cos_sim(vector_a, vector_b): 21 | """ 22 | 计算余弦相似度 23 | vector_a: 向量 a 24 | vector_b: 向量 b 25 | """ 26 | vector_a = np.mat(vector_a) 27 | vector_b = np.mat(vector_b) 28 | num = float(vector_a * vector_b.T) 29 | denom = np.linalg.norm(vector_a) * np.linalg.norm(vector_b) 30 | cos = num / denom 31 | sim = 0.5 + 0.5 * cos 32 | return sim 33 | 34 | 35 | class TianChi: 36 | 37 | def __init__(self): 38 | """ 39 | 初始化函数,加载数据,连接数据 40 | """ 41 | self.shop_info = pd.read_csv('./data/shop_info.csv', sep=',', encoding='utf8', engine='c') 42 | self.user_info = pd.read_csv('./data/user_shop_behavior.csv', sep=',', encoding='utf8', engine='c') 43 | self.evl_data = pd.read_csv('./data/evaluation.csv', sep=',', encoding='utf8', engine='c') 44 | self.train_data = pd.merge(self.user_info, self.shop_info, on=['shop_id']) 45 | 46 | def f_cossim_candidate(self, row): 47 | cossim_candidate = {} 48 | user_wifi_name = [x[0] for x in row['wifi_infos']] 49 | user_wifi_vector = [x[1] for x in row['wifi_infos']] 50 | for shop_id in self.shops_wifi: 51 | shop_wifi_vector = [] 52 | for wifi_name in user_wifi_name: 53 | if wifi_name in self.shops_wifi[shop_id]: 54 | shop_wifi_vector.append( self.shops_wifi[shop_id][wifi_name]) 55 | else: 56 | shop_wifi_vector.append(0) 57 | shop_wifi_vector = np.array(shop_wifi_vector) 58 | cossim = cos_sim(user_wifi_vector, shop_wifi_vector) 59 | if np.isnan(cossim): 60 | continue 61 | else: 62 | cossim_candidate[shop_id] = cossim 63 | cossim_candidate = [x[0] for x in sorted(cossim_candidate.items(), key=lambda x: x[1], reverse=True)[0:CANDIDATE_NUM]] 64 | row['cossim_candidate'] = set(cossim_candidate) 65 | return row 66 | 67 | def apply_cossim_candidate(self, df_part): 68 | return df_part.apply(self.f_cossim_candidate, axis=1) 69 | 70 | def cossim_candidate_process(self, df): 71 | df = df.reindex(columns=df.columns.tolist() + ['cossim_candidate']) 72 | df_parts = np.array_split(df, SPLITS) 73 | with Pool(processes=SPLITS) as pool: 74 | df_parts_r = pool.map(self.apply_cossim_candidate, df_parts) 75 | df = pd.concat(df_parts_r) 76 | return df 77 | 78 | def f_wifi(self, row): 79 | for wifi_info in row['wifi_infos']: 80 | wifi_name = wifi_info[0] 81 | wifi_intensity = wifi_info[1] 82 | if wifi_name in self.wifi: 83 | row[-1 - self.wifi[wifi_name]] = wifi_intensity 84 | return row 85 | 86 | def apply_f_wifi(self, df): 87 | return df.apply(self.f_wifi, axis=1, raw=True) 88 | 89 | def get_wifi_vector(self, df): 90 | df_temp = pd.DataFrame(columns=['wifi_' + str(i) for i in range(self.wifi_num)]) 91 | df = pd.concat([df, df_temp], axis=1) 92 | df_parts_temp = np.array_split(df, SPLITS) 93 | with Pool(processes=SPLITS) as pool: 94 | df_parts = pool.map(self.apply_f_wifi, df_parts_temp) 95 | df = pd.concat(df_parts) 96 | return df 97 | 98 | @staticmethod 99 | def wifi_process(df, wifi_dict): 100 | df = df.reindex(columns=df.columns.tolist() + ['wifi_' + str(i) for i in range(len(wifi_dict))]) 101 | for index, row in df.iterrows(): 102 | for wifi_info in row['wifi_infos']: 103 | wifi_name = wifi_info[0] 104 | wifi_isty = wifi_info[1] 105 | if wifi_name in wifi_dict: 106 | df.loc[index, 'wifi_'+str(wifi_dict[wifi_name])] = wifi_isty 107 | return df 108 | 109 | @staticmethod 110 | def wifi_info_process(wifi_info): 111 | """ 112 | 预处理 wifi_infos 字段 113 | """ 114 | wifi_name = wifi_info[0] 115 | wifi_isty = int(wifi_info[1]) 116 | wifi_conn = wifi_info[2] 117 | if wifi_isty <= -100: 118 | wifi_isty = 0 119 | else: 120 | wifi_isty = (wifi_isty + 100) / 100.0 121 | if wifi_conn == 'true': 122 | wifi_conn = True 123 | else: 124 | wifi_conn = False 125 | return [wifi_name, wifi_isty, wifi_conn] 126 | 127 | def mall_init(self, mall_id, train_mall_df, evl_mall_df): 128 | """ 129 | 商场的数据结构初始化 130 | mall_id: 商场ID 131 | train_mall_df: 训练集 DataFrame 132 | evl_mall_df: 测试集 DataFrame 133 | """ 134 | self.shops = {} 135 | shop_list = self.shop_info[self.shop_info.mall_id == mall_id].shop_id.unique() 136 | for shop_id in shop_list: 137 | self.shops[shop_id] = train_mall_df[train_mall_df.shop_id == shop_id] 138 | print('MALL ID: %s\nTRAIN NUM: %s\nEVL_NUM: %s\nSHOP_NUM: %s' %(mall_id, train_mall_df.shape[0], evl_mall_df.shape[0], len(shop_list))) 139 | # 统计每个SHOP的WIFI数和每个SHOP的每个WIFI的强度和 140 | self.shops_wifi_count = {} 141 | self.shops_wifi_isty = {} 142 | for shop_id in self.shops: 143 | shop = self.shops[shop_id] 144 | shop_wifi_count = {} 145 | shop_wifi_isty = {} 146 | for index, row in shop.iterrows(): 147 | for wifi_info in row['wifi_infos']: 148 | wifi_name = wifi_info[0] 149 | wifi_isty = wifi_info[1] 150 | if wifi_name not in shop_wifi_count: 151 | shop_wifi_count[wifi_name] = 1 152 | shop_wifi_isty[wifi_name] = wifi_isty 153 | else: 154 | shop_wifi_count[wifi_name] += 1 155 | shop_wifi_isty[wifi_name] += wifi_isty 156 | self.shops_wifi_count[shop_id] = shop_wifi_count 157 | self.shops_wifi_isty[shop_id] = shop_wifi_isty 158 | 159 | # 求每家商铺的 WIFI 的平均强度 160 | for shop_id in self.shops_wifi_isty: 161 | shop_wifi_isty = self.shops_wifi_isty[shop_id] 162 | shop_wifi_count = self.shops_wifi_count[shop_id] 163 | for wifi_name in shop_wifi_isty: 164 | shop_wifi_isty[wifi_name] = float(shop_wifi_isty[wifi_name]) / (shop_wifi_count[wifi_name]) 165 | self.shops_wifi_isty[shop_id] = shop_wifi_isty 166 | 167 | # 商场中每家商铺的每个WIFI的TF值 168 | self.shops_wifi_tf = {} 169 | for shop_id in self.shops_wifi_count: 170 | shop_wifi_count = self.shops_wifi_count[shop_id] 171 | shop_wifi_tf = {} 172 | total_num = float(sum(shop_wifi_count.values())) 173 | for wifi_name in shop_wifi_count: 174 | shop_wifi_tf[wifi_name] = shop_wifi_count[wifi_name] / total_num 175 | self.shops_wifi_tf[shop_id] = shop_wifi_tf 176 | 177 | # 统计整个商场中,每个WIFI的出现次数 178 | train_wifi_count = {} 179 | for wifi_infos in train_mall_df['wifi_infos']: 180 | for wifi_info in wifi_infos: 181 | wifi_name = wifi_info[0] 182 | if wifi_name not in train_wifi_count: 183 | train_wifi_count[wifi_name] = 1 184 | else: 185 | train_wifi_count[wifi_name] += 1 186 | 187 | # 统计商场中WIFI出现次数大于10的WIFI 188 | train_wifi_gt10 = set() 189 | for wifi_name in train_wifi_count: 190 | if train_wifi_count[wifi_name] >= 10: 191 | train_wifi_gt10.add(wifi_name) 192 | 193 | # 筛选出做特征的WIFI 194 | self.wifi = {} 195 | wifi_num = 0 196 | # 1. 筛选出每个SHOP的WIFI的TF值大于0.02的WIFI 197 | for shop_id in self.shops_wifi_tf: 198 | shop_wifi_tf = self.shops_wifi_tf[shop_id] 199 | for wifi_name in shop_wifi_tf: 200 | if shop_wifi_tf[wifi_name] >= 0.02: 201 | if wifi_name not in self.wifi: 202 | self.wifi[wifi_name] = wifi_num 203 | wifi_num += 1 204 | # 2. 筛选出整个商场中WIFI出现次数大于10的WIFI 205 | for wifi_name in train_wifi_gt10: 206 | if wifi_name not in self.wifi: 207 | self.wifi[wifi_name] = wifi_num 208 | wifi_num += 1 209 | # 3. 筛选出整个商场中WIFI出现次数的TOP10% 210 | wifi_perc10 = sorted(train_wifi_count.items(), key=lambda d: d[1], reverse=True) 211 | wifi_perc10 = [x[0] for x in wifi_perc10[:int(len(train_wifi_count)*0.10)]] 212 | for wifi_name in wifi_perc10: 213 | if wifi_name not in self.wifi: 214 | self.wifi[wifi_name] = wifi_num 215 | wifi_num += 1 216 | self.wifi_num = wifi_num 217 | print('WIFI NUM:', self.wifi_num) 218 | 219 | # 挑选每个商铺的 WIFI(用于余弦相似度的计算) 220 | self.shops_wifi = {} 221 | for shop_id in self.shops_wifi_isty: 222 | shop_wifi_isty = self.shops_wifi_isty[shop_id] 223 | shop_wifi_tf = self.shops_wifi_tf[shop_id] 224 | shop_wifi = {} 225 | for wifi_name in shop_wifi_isty: 226 | if shop_wifi_tf[wifi_name] >= 0.02: 227 | shop_wifi[wifi_name] = shop_wifi_isty[wifi_name] 228 | self.shops_wifi[shop_id] = shop_wifi_isty 229 | 230 | def chunks(self, arr, m): 231 | """ 232 | 将一个列表等分成 m 份 233 | """ 234 | n = int(math.ceil(len(arr) / float(m))) 235 | return [arr[i:i + n] for i in range(0, len(arr), n)] 236 | 237 | def run_proc(self, candidate_chunks, evl_mall_df, file_name): 238 | """ 239 | 对每个样例进行预测调用的函数 240 | """ 241 | result = {} 242 | step = 1 243 | for candidates in candidate_chunks: 244 | # 并没有输出太多信息,如果想看具体的信息,可以自行输出一下,这里删减了 245 | print('step:', step) 246 | step += 1 247 | test_df = evl_mall_df[evl_mall_df['cossim_candidate'] == candidates] 248 | columns = ['longitude', 'latitude', 'wifi_infos'] 249 | row_ids = list(test_df['row_id']) 250 | test_df = test_df[columns] 251 | shop_ids = list(pd.concat([self.shops[shop_id]['shop_id'] for shop_id in candidates])) 252 | shop_df = pd.concat([self.shops[shop_id][columns] for shop_id in candidates]) 253 | shop_df = pd.concat([shop_df, test_df]) 254 | wifi_dict = {} 255 | wifi_num = 0 256 | for shop_id in candidates: 257 | ''' 258 | 构造候选集SHOP的WIFI特征 259 | 这个部分有很多种方式(不同的方式,可以按照TF来构造,也可以只按照出现次数来构造等), 260 | 时间所限,这里并没有全部尝试完成 261 | ''' 262 | shop_wifi_tf = self.shops_wifi_tf[shop_id] 263 | # shop_wifi_count = self.shops_wifi_count[shop_id] 264 | # shop_wifi_count = sorted(shop_wifi_count.items(), key=lambda x: x[1], reverse=True) 265 | # shop_wifi_count = [x[0] for x in shop_wifi_count[:int(len(shop_wifi_count)*0.2)]] 266 | for wifi_name in shop_wifi_tf: 267 | if wifi_name not in wifi_dict and shop_wifi_tf[wifi_name]>=0.02: 268 | wifi_dict[wifi_name] = wifi_num 269 | wifi_num += 1 270 | # for wifi_name in shop_wifi_count: 271 | # if wifi_name not in wifi_dict: 272 | # wifi_dict[wifi_name] = wifi_num 273 | # wifi_num += 1 274 | shop_df = TianChi.wifi_process(shop_df, wifi_dict) 275 | columns = ['longitude', 'latitude'] + ['wifi_' + str(i) for i in range(len(wifi_dict))] 276 | shop_df = shop_df[columns] 277 | shop_df = shop_df.fillna(0) 278 | X = np.asarray(shop_df, dtype=np.float64) 279 | min_max_scaler = MinMaxScaler() 280 | min_max_scaler.fit(X) 281 | X = min_max_scaler.transform(X) 282 | X_train = X[:len(shop_ids)] 283 | X_test = X[len(shop_ids):] 284 | # 预测 shop_ids 285 | shop_ids_predict = rf.train(X_train, shop_ids, X_test) 286 | for i, row_id in enumerate(row_ids): 287 | result[row_id] = shop_ids_predict[i] 288 | 289 | with open('./runs/' + file_name, 'w') as f: 290 | for row_id in result: 291 | f.write('%s,%s\n' %(row_id, result[row_id])) 292 | 293 | 294 | def run(self): 295 | 296 | mall_list = self.shop_info.mall_id.unique() 297 | for mall_id in mall_list: 298 | if mall_id != 'm_6803': 299 | continue 300 | # 提取训练集数据和验证集数据 301 | train_mall_df = self.train_data[self.train_data.mall_id == mall_id] 302 | evl_mall_df = self.evl_data[self.evl_data.mall_id == mall_id] 303 | train_mall_df.rename(columns={'longitude_x': 'longitude', 'latitude_x': 'latitude'}, inplace=True) 304 | # wif_infos 字段的预处理 305 | train_mall_df['wifi_infos'] = train_mall_df['wifi_infos'].apply(lambda x: [TianChi.wifi_info_process(wifi.split('|')) for wifi in x.split(';')]) 306 | evl_mall_df['wifi_infos'] = evl_mall_df['wifi_infos'].apply(lambda x: [TianChi.wifi_info_process(wifi.split('|')) for wifi in x.split(';')]) 307 | # 提取训练集标签和测试集行号 308 | row_ids = list(evl_mall_df['row_id']) 309 | shop_ids = list(train_mall_df['shop_id']) 310 | # 提取需要的列 311 | train_columns = ['longitude', 'latitude', 'wifi_infos', 'shop_id'] 312 | evl_columns = ['longitude', 'latitude', 'wifi_infos', 'row_id'] 313 | train_mall_df = train_mall_df[train_columns] 314 | evl_mall_df = evl_mall_df[evl_columns] 315 | # mall 数据结构初始化 316 | self.mall_init(mall_id, train_mall_df, evl_mall_df) 317 | # 连接train_mall_df和evl_mall_df进行预处理 318 | df = pd.concat([train_mall_df, evl_mall_df]) 319 | df_temp = self.get_wifi_vector(df) 320 | columns = ['longitude', 'latitude'] + ['wifi_' + str(i) for i in range(len(self.wifi))] 321 | df_temp = df_temp[columns] 322 | df_temp = df_temp.fillna(0) 323 | X = np.asarray(df_temp, dtype=np.float64) 324 | min_max_scaler = MinMaxScaler() 325 | min_max_scaler.fit(X) 326 | X = min_max_scaler.transform(X) 327 | # 分离出训练集和测试集 328 | X_train = X[:len(shop_ids)] 329 | X_test = X[len(shop_ids):] 330 | # 第一步预测:用随机森林进行预测,筛选出单个SHOP预测概率大于0.8的样本,并确定预测结果为最终结果 331 | results = {} 332 | results_all = {} 333 | shop_ids_prob, lbl =rf.train_prob(X_train, shop_ids, X_test) 334 | for i, row in enumerate(shop_ids_prob): 335 | # 统计单个店铺概率概率大于0.8结果 336 | for k, prob in enumerate(row): 337 | if prob >= 0.8: 338 | shop_id = lbl.inverse_transform(int(k)) 339 | row_id = row_ids[i] 340 | results[row_id] = shop_id 341 | break 342 | # 统计随机森林预测的所有结果 343 | k_temp = 0 344 | prob_temp = 0 345 | for k, prob in enumerate(row): 346 | if prob > prob_temp: 347 | k_temp = k 348 | prob_temp = prob 349 | shop_id = lbl.inverse_transform(int(k_temp)) 350 | row_id = row_ids[i] 351 | results_all[row_id] = shop_id 352 | 353 | # 筛选出第一步没有确定商铺的测试集样例 354 | for row_id in results: 355 | df.loc[df.row_id == row_id, 'shop_id'] = results[row_id] 356 | evl_mall_df = df[df.shop_id.isnull()] 357 | row_ids = evl_mall_df['row_id'] 358 | 359 | # 计算测试集每个样例的候选集 360 | # 候选集的挑选仅使用了用户的WIFI列表和每个店铺的WIFI列表的余弦相似度 361 | evl_mall_df = self.cossim_candidate_process(evl_mall_df) 362 | 363 | # 统计所有的候选集集合 364 | candidates_list = [] 365 | for candidates in evl_mall_df['cossim_candidate']: 366 | # 其中 candidates 的类型为 set 367 | if candidates not in candidates_list: 368 | candidates_list.append(candidates) 369 | 370 | # 过滤长度为0和长度为1的候选集集合 371 | for candidates in candidates_list: 372 | test_df = evl_mall_df[evl_mall_df['cossim_candidate'] == candidates] 373 | # 长度为0,说明用户的WIFI列表里的WIFI在之前的训练集中没有出现过 374 | if len(candidates) == 0: 375 | for index, row in test_df.iterrows(): 376 | row_id = row['row_id'] 377 | results[row_id] = results_all[row_id] 378 | candidates_list.remove(candidates) 379 | # 长度为1,说明用户的WIFI列表仅与一家商铺的WIFI匹配 380 | elif len(candidates) == 1: 381 | for index, row in test_df.iterrows(): 382 | row_id = row['row_id'] 383 | results[row_id] = list(candidates)[0] 384 | candidates_list.remove(candidates) 385 | print('\n', 'TOTAL STEPS:', len(candidates_list), '\n') 386 | 387 | # 划分候选集集合 388 | candidate_chunks = self.chunks(candidates_list, SPLITS) 389 | file_list = [str(i)+'.csv' for i in range(SPLITS)] 390 | 391 | # 多进程处理各个候选集情况 392 | p = Pool(SPLITS) 393 | for i in range(SPLITS): 394 | p.apply_async(self.run_proc, args=(candidate_chunks[i], evl_mall_df, file_list[i])) 395 | p.close() 396 | # 全部进程结束才执行下面的汇总结果的代码 397 | p.join() 398 | 399 | # 汇总结果 400 | fw = open('./mall_results/' + str(mall_id) + 'result.csv', 'w') 401 | fw.write('row_id,shop_id\n') 402 | for row_id in results: 403 | fw.write('%s,%s\n' %(row_id, results[row_id])) 404 | for filename in file_list: 405 | with open('./runs/' + filename, 'r') as f: 406 | lines = f.readlines() 407 | for line in lines: 408 | if line.strip() != '': 409 | fw.write(line) 410 | fw.close() 411 | 412 | 413 | if __name__ == '__main__': 414 | data = TianChi() 415 | data.run() 416 | --------------------------------------------------------------------------------