├── 1
    ├── tool.py
    ├── rf.py
    ├── xgb.py
    └── tianchi.py
├── 2
    ├── tool.py
    ├── rf.py
    └── tianchi.py
└── README.md


/2/tool.py:
--------------------------------------------------------------------------------
 1 | # coding: utf-8
 2 | 
 3 | 
 4 | import os
 5 | 
 6 | 
 7 | def get_result(dir_name, file_name):
 8 |     """
 9 |     dir_name: 文件夹名称
10 |     file_name: 结果文件名
11 |     """
12 |     file_list = os.listdir(dir_name)
13 |     fw = open(file_name, 'w')
14 |     fw.write('row_id,shop_id\n')
15 |     for file_name in file_list:
16 |         if 'm_' in file_name:
17 |             with open(dir_name + file_name, 'r') as f:
18 |                 for line in f.readlines()[1:]:
19 |                     if line.strip() != '':
20 |                         fw.write(line)
21 |     fw.close()
22 | 
23 | 
if __name__ == '__main__':
    # Merge every per-mall result file into the final submission file.
    get_result(dir_name='./mall_results/', file_name='result.csv')
27 | 


--------------------------------------------------------------------------------
/1/tool.py:
--------------------------------------------------------------------------------
 1 | # coding: utf-8
 2 | import os
 3 | 
 4 | def get_result(dir_name, file_name):
 5 |     """
 6 |     dir_name: 文件夹名称
 7 |     file_name: 结果文件名
 8 |     """
 9 |     file_list = os.listdir(dir_name)
10 |     fw = open(file_name, 'w')
11 |     fw.write('row_id,shop_id\n')
12 |     for file_name in file_list:
13 |         if 'm_' in file_name:
14 |             with open(dir_name + file_name, 'r') as f:
15 |                 for line in f.readlines()[1:]:
16 |                     if line.strip() != '':
17 |                         fw.write(line)
18 |     fw.close()
19 | 
if __name__ == '__main__':
    # Merge the random-forest predictions, then the XGBoost predictions.
    get_result(dir_name='./rf_result/', file_name='rf_result.csv')
    get_result(dir_name='./xgb_result/', file_name='xgb_result.csv')


--------------------------------------------------------------------------------
/2/rf.py:
--------------------------------------------------------------------------------
 1 | # coding: utf-8
 2 | 
 3 | from sklearn.ensemble import RandomForestClassifier
 4 | from sklearn.model_selection import train_test_split
 5 | from sklearn import preprocessing
 6 | 
 7 | class RF:
 8 |     def __init__(self):
 9 |         # 采用默认参数200，不寻参
10 |         self.n_estimators = 200
11 | 
12 |     def train_prob(self, X, shop_ids, TEST):
13 |         """
14 |         返回预测概率
15 |         X: 训练集 vector
16 |         shop_ids: 训练集标签
17 |         TEST: 测试集 vector
18 |         """
19 |         lbl = preprocessing.LabelEncoder()
20 |         lbl.fit(shop_ids)
21 |         y = lbl.transform(shop_ids)
22 |         rf = RandomForestClassifier(n_jobs=-1, n_estimators=self.n_estimators)
23 |         rf.fit(X, y)
24 |         predict_prob = rf.predict_proba(TEST)
25 |         return predict_prob, lbl
26 | 
27 |     def train(self, X, shop_ids, TEST):
28 |         """
29 |         预测标签
30 |         """
31 |         lbl = preprocessing.LabelEncoder()
32 |         lbl.fit(shop_ids)
33 |         y = lbl.transform(shop_ids)
34 |         rf = RandomForestClassifier(n_jobs=-1, n_estimators=self.n_estimators)
35 |         rf.fit(X, y)
36 |         predict = rf.predict(TEST)
37 |         predict_ids = [lbl.inverse_transform(int(x)) for x in predict]
38 |         return predict_ids
39 | 


--------------------------------------------------------------------------------
/1/rf.py:
--------------------------------------------------------------------------------
 1 | from sklearn.ensemble import RandomForestClassifier
 2 | from sklearn import preprocessing
 3 | from sklearn.model_selection import train_test_split
 4 | 
 5 | 
 6 | class RF:
 7 |     """
 8 |     Random Forest
 9 |     """
10 |     def __init__(self):
11 |         self.n_estimators_options = [100, 120, 140, 160, 180, 200]
12 |         self.best_n_estimators = 0
13 |         self.best_acc = 0
14 | 
15 |     def train(self, mall_id, X, shop_ids, TEST, row_ids):
16 |         """
17 |         mall_id: 商场 ID
18 |         X: 训练集 vector
19 |         shop_ids: 训练集标签
20 |         TEST： 测试集 vector
21 |         row_ids: 测试集行号
22 |         """
23 |         lbl = preprocessing.LabelEncoder()
24 |         lbl.fit(shop_ids)
25 |         y = lbl.transform(shop_ids)
26 |         # 划分训练集和验证集
27 |         X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
28 |         # 简单寻参
29 |         for n_estimators_size in self.n_estimators_options:
30 |             alg = RandomForestClassifier(n_jobs=-1, n_estimators=n_estimators_size)
31 |             alg.fit(X_train, y_train)
32 |             predict = alg.predict(X_test)
33 |             acc = (y_test == predict).mean()
34 |             print(n_estimators_size, acc)
35 |             if acc >= self.best_acc:
36 |                 self.best_acc = acc
37 |                 self.best_n_estimators = n_estimators_size
38 |         # 定义模型，训练
39 |         rf = RandomForestClassifier(n_jobs=-1, n_estimators=self.best_n_estimators)
40 |         rf.fit(X, y)
41 |         predict = rf.predict(TEST)
42 |         predict_result = [lbl.inverse_transform(int(x)) for x in predict]
43 |         with open('./rf_result/' + str(mall_id) + '_result.csv', 'w') as f:
44 |             f.write('row_id,shop_id\n')
45 |             for i, row_id in enumerate(row_ids):
46 |                 f.write('%s,%s\n' %(row_id, predict_result[i]))
47 | 


--------------------------------------------------------------------------------
/1/xgb.py:
--------------------------------------------------------------------------------
 1 | import xgboost as xgb
 2 | from sklearn.model_selection import train_test_split
 3 | from sklearn import preprocessing
 4 | 
 5 | 
 6 | def analyse(mall_id, X, shop_ids):
 7 |     """
 8 |     划分训练集和验证集，计算 ACC
 9 |     """
10 |     lbl = preprocessing.LabelEncoder()
11 |     lbl.fit(shop_ids)
12 |     y = lbl.transform(shop_ids)
13 |     # 计算类别数
14 |     num_class = y.max() + 1
15 |     # 划分训练集和验证集
16 |     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
17 |     xg_train = xgb.DMatrix(X_train, label=y_train)
18 |     xg_test = xgb.DMatrix(X_test, label=y_test)
19 |     watchlist = [(xg_train, 'train'), (xg_test, 'test')]
20 |     # 定义参数
21 |     params = {
22 |         'objective': 'multi:softmax',
23 |         'eta': 0.1,
24 |         'max_depth': 9,
25 |         'eval_metric': 'merror',
26 |         'seed': 0,
27 |         'missing': -999,
28 |         'num_class': num_class,
29 |         'silent': 1,
30 |     }
31 |     bst = xgb.train(params, xg_train, 60, watchlist, early_stopping_rounds=15)
32 |     pred = bst.predict(xg_test)
33 |     acc = (y_test == pred).mean()
34 |     print('accuracy: %s' %acc)
35 | 
36 | 
def train(mall_id, X, shop_ids, TEST, row_ids):
    """
    Train XGBoost on all training rows of one mall, predict TEST and write
    the predictions to './xgb_result/<mall_id>_result.csv'.

    mall_id: mall ID (used in the output file name)
    X: training feature vectors
    shop_ids: training labels
    TEST: test feature vectors
    row_ids: row ids of the test set, in the same order as TEST
    """
    lbl = preprocessing.LabelEncoder()
    y = lbl.fit_transform(shop_ids)
    # Encoded labels are 0..k-1, so the class count is the largest label + 1.
    num_class = int(y.max()) + 1
    xg_train = xgb.DMatrix(X, label=y)
    xg_test = xgb.DMatrix(TEST)
    # NOTE(review): 'missing' and 'silent' are kept from the original
    # parameter set; newer xgboost releases may report them as unused.
    params = {
        'objective': 'multi:softmax',
        'eta': 0.1,
        'max_depth': 9,
        'eval_metric': 'merror',
        'seed': 0,
        'missing': -999,
        'num_class': num_class,
        'silent': 1,
    }
    # Only the training set is watched, so early stopping follows the
    # training error. Keyword arguments are used because recent xgboost
    # versions no longer accept `evals` positionally.
    bst = xgb.train(
        params,
        xg_train,
        num_boost_round=60,
        evals=[(xg_train, 'train')],
        early_stopping_rounds=15,
    )
    # multi:softmax returns class indices as floats; cast them and decode the
    # whole array at once (inverse_transform expects a 1-D array).
    pred = lbl.inverse_transform(bst.predict(xg_test).astype(int))
    # Write the result file.
    with open('./xgb_result/' + str(mall_id) + '_result.csv', 'w') as f:
        f.write('row_id,shop_id\n')
        for row_id, shop_id in zip(row_ids, pred):
            f.write('%s,%s\n' % (row_id, shop_id))
66 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # 天池大赛——商场中精确定位用户所在店铺
 2 | 
 3 | 前些日子，受同学“鼓动”，到天池官网注册了一个帐号，“顺理成章”的参加了这个“演技满满”的比赛。“影帝”们的表演还在继续，我等“龙套演员”已经领盒饭走人了。最终B榜成绩：119 / 2845，accuracy为0.9083。特此分享一波自己的解题过程。抛砖（确定是砖）引玉，希望“影帝们”赛后可以稍加点拨。让我们的“演（技）技（术）”也得到一丁点儿的提升，就很感激了。
 4 | 
 5 | 
 6 | 在这里特别感谢技术圈各位大神之前的开源，让我学到很多。谢谢！
 7 | 
 8 | ## 解题思路
 9 | 
10 | 这种比赛每个人写代码可能都有自己的风格，所以这里特别说明一下自己的解题过程。
11 | 
12 | ### 时间戳
13 | 首先自己对时间戳做过一定的处理。比如分时间段、分周末和工作日，可能是姿势不太正确。并没有起到好的效果，反而“逆上分”。这里仅仅截取时间戳的预处理函数，如果想尝试的，可以自行尝试一下。
14 | 
15 | ``` python
@staticmethod
def timestamp_handle(df):
    """
    Split the 'time_stamp' column ("%Y-%m-%d %H:%M" strings) into
    day-of-month, weekday (Monday is 0), hour and minute columns.

    The four new columns are added to the frame that was passed in; the
    returned frame is that frame without the 'time_stamp' column.
    """
    mdays = []
    wdays = []
    hours = []
    mins = []
    for timestamp in df['time_stamp']:
        parsed = time.strptime(timestamp, "%Y-%m-%d %H:%M")
        mdays.append(parsed.tm_mday)
        wdays.append(parsed.tm_wday)
        hours.append(parsed.tm_hour)
        mins.append(parsed.tm_min)
    df['time_mday'] = mdays
    df['time_wday'] = wdays
    df['time_hour'] = hours
    df['time_min'] = mins
    return df.drop(['time_stamp'], axis=1)
34 | ```
35 | 
36 | ### 经纬度信息
37 | 首先用户的经纬度存在不准确的情况。这部分用户可以通过一个偏差上限来甄别，只要用户的经纬度偏差大于这个上限，说明这个用户的经纬度是不准确的，在接下来的计算过程中，不考虑这个用户的经纬度。对于经纬度比较准确的用户，自己通过计算欧式距离或者经纬度距离基本都可以排除20%左右（或者更少）的商铺。这里仅仅提到有这个可以缩小范围的方式，但自己在代码中并没有采用。对于经纬度，自己在代码中只使用了原始数据，并没有做任何处理。
38 | 
39 | 
40 | ### WIFI信息
整个题目，自己基本都在围绕WIFI信息做文章。之前有大神开源过一份XGBoost的代码（非常感谢）。自己在代码中有尝试融合，自己当时选用的是Random Forest算法，也能取得基本一致的结果（按自己的处理方式，Random Forest的表现可能要好一点儿）。很遗憾，这个题自己没有找到XGBoost的调参姿势（时间开销），所以在结果基本一致的情况下，自己选择了Random Forest算法。这里开源了两份代码，分别位于**文件夹1**和**文件夹2**。其中**文件夹1**的代码跟大神的思路是一样的，筛选出WIFI作为特征，并保留经纬度，整个特征向量的结构为[longitude, latitude, wifi1, wifi2, wifi3 ...]，差别可能就差在挑选WIFI的方式上：自己是按照TF值来挑选了一部分，然后按照整个商场中WIFI的出现次数补充了一部分。这种方式A榜成绩为0.9075，B榜成绩为0.9054。**文件夹2**的代码是在文件夹1的代码的基础之上做了部分改动。首先利用随机森林预测得到一部分测试集的结果（对某个商铺的预测概率大于等于0.8，则认为随机森林预测结果可靠），之后对于剩下的测试集，计算每个用户的WIFI和每个商铺的WIFI（挑选出来的，假设这个WIFI属于这个商铺）的余弦相似度，然后排序，挑选出**最多**5个（之所以说最多是因为用这种方式，有的用户的候选集里的商铺数不足5个）商铺作为候选集。然后将候选集相同的测试集样例合并到一起，依照候选集划分出一个个小的测试集。接着提取每个候选集里的商铺对应的训练集数据，构建分类器，然后进行预测（这样就减少了分类数，但本质上还是多分类）。最终B榜成绩0.9083。
42 | 
43 | 
44 | ### 其他
- 关于WIFI连接，自己统计过一个数据：当一个WIFI被连接的时候，用户在哪个商铺前？发现有部分WIFI，当被连接次数大于等于10次时，用户在某个商铺前的概率为1。但融合到最终结果后，结果有了小小的下降。自己不太明白为什么，要么是自己计算错误，要么是前后两个月的WIFI有比较大的差别？（生活所迫，并没有细究下去，哈哈）。
46 | 
- TextCNN，这部分只是自己觉得好玩儿，做了个实验。先确定出每个SHOP的WIFI列表，然后给定一个用户的WIFI信息，与每个SHOP的WIFI比对，看该SHOP是否有该WIFI，有的话，在对应的位置上填上WIFI强度。这样最后的特征向量为“**店铺数 × 每个店铺的WIFI数**”样式的矩阵结构，如此构造特征向量是可以收敛的，最终的成绩能在0.9左右。
48 | 
49 | - 时间有限（好朋友的项目都给耽误了，sorry）（老师的项目不敢怠慢～），其他信息这里没有再考虑了～，比赛也到此为止了。
50 | - 再说一下收获吧：技（演）术（技）真的非常重要～，自己有很多不足，多读 paper，多多实践吧。
51 | 
52 | 
53 | 
54 | 大致如上，水文一篇，如果大家有什么好的思路，希望多多指导～，谢谢！QQ: 765422195
55 | 
56 | 多多加星哦～
57 | 
58 | 
59 | 
60 | 
61 | 
62 | 
63 | 
64 | 
65 | 
66 | 
67 | 
68 | 
69 | 
70 | 
71 | 
72 | 
73 | 


--------------------------------------------------------------------------------
/1/tianchi.py:
--------------------------------------------------------------------------------
  1 | # coding: utf-8
  2 | 
  3 | import numpy as np
  4 | import pandas as pd
  5 | from multiprocessing import Pool
  6 | from sklearn.preprocessing import MinMaxScaler
  7 | import xgb
  8 | from rf import RF
  9 | 
 10 | 
 11 | # 12个进程
 12 | SPLITS = 12
 13 | 
class TianChi:
    """
    Per-mall pipeline: builds [longitude, latitude, wifi_*] feature vectors
    and hands them to the random-forest and XGBoost trainers.
    """

    def __init__(self):
        """
        Load the shop, user-behaviour and evaluation CSV files from ./data/
        and join the behaviour records with the shop table on shop_id.
        """
        self.shop_info = pd.read_csv('./data/shop_info.csv', sep=',', encoding='utf8', engine='c')
        self.user_info = pd.read_csv('./data/user_shop_behavior.csv', sep=',', encoding='utf8', engine='c')
        self.evl_data = pd.read_csv('./data/evaluation.csv', sep=',', encoding='utf8', engine='c')
        self.train_data = pd.merge(self.user_info, self.shop_info, on=['shop_id'])

    def wifi_info_process(self, wifi_info):
        """
        Normalise one wifi record taken from the wifi_infos field.

        wifi_info: sequence [name, strength, connected]; strength is
                   converted with int() and connected is compared to 'true'

        Returns [name, strength, connected] where strength is 0 for values
        <= -100 and (strength + 100) / 100.0 otherwise, and connected is a
        bool.
        """
        wifi_name = wifi_info[0]
        wifi_isty = int(wifi_info[1])
        wifi_conn = wifi_info[2]
        if wifi_isty <= -100:
            wifi_isty = 0
        else:
            wifi_isty = (wifi_isty + 100) / 100.0
        if wifi_conn == 'true':
            wifi_conn = True
        else:
            wifi_conn = False
        return [wifi_name, wifi_isty, wifi_conn]
    
    def mall_init(self, mall_id, train_mall_df, evl_mall_df):
        """
        Build the per-mall data structures.

        mall_id: mall ID
        train_mall_df: training DataFrame of this mall
        evl_mall_df: evaluation (test) DataFrame of this mall

        Sets self.shops, self.shops_wifi_count, self.shops_wifi_isty,
        self.shops_wifi_tf, self.wifi (wifi name -> feature index) and
        self.wifi_num.
        """
        self.shops = {}
        shop_list = self.shop_info[self.shop_info.mall_id == mall_id].shop_id.unique()
        for shop_id in shop_list:
            self.shops[shop_id] = train_mall_df[train_mall_df.shop_id == shop_id]
        print('MALL ID: %s\nTRAIN NUM: %s\nEVL_NUM: %s\nSHOP_NUM: %s' %(mall_id, train_mall_df.shape[0], evl_mall_df.shape[0], len(shop_list)))
        # Per shop: how often each wifi was seen and the sum of its strengths.
        self.shops_wifi_count = {}
        self.shops_wifi_isty = {}
        for shop_id in self.shops:
            shop = self.shops[shop_id]
            shop_wifi_count = {}
            shop_wifi_isty = {}
            for index, row in shop.iterrows():
                for wifi_info in row['wifi_infos']:
                    wifi_name = wifi_info[0]
                    wifi_isty = wifi_info[1]
                    if wifi_name not in shop_wifi_count:
                        shop_wifi_count[wifi_name] = 1
                        shop_wifi_isty[wifi_name] = wifi_isty
                    else:
                        shop_wifi_count[wifi_name] += 1
                        shop_wifi_isty[wifi_name] += wifi_isty
            self.shops_wifi_count[shop_id] = shop_wifi_count
            self.shops_wifi_isty[shop_id] = shop_wifi_isty

        # Turn the strength sums into the average strength per shop and wifi.
        for shop_id in self.shops_wifi_isty:
            shop_wifi_isty = self.shops_wifi_isty[shop_id]
            shop_wifi_count = self.shops_wifi_count[shop_id]
            for wifi_name in shop_wifi_isty:
                shop_wifi_isty[wifi_name] = float(shop_wifi_isty[wifi_name]) / (shop_wifi_count[wifi_name])
            self.shops_wifi_isty[shop_id] = shop_wifi_isty

        # TF value of every wifi within its shop:
        # count of the wifi / total wifi observations of the shop.
        self.shops_wifi_tf = {}
        for shop_id in self.shops_wifi_count:
            shop_wifi_count = self.shops_wifi_count[shop_id]
            shop_wifi_tf = {}
            total_num = float(sum(shop_wifi_count.values()))
            for wifi_name in shop_wifi_count:
                shop_wifi_tf[wifi_name] = shop_wifi_count[wifi_name] / total_num
            self.shops_wifi_tf[shop_id] = shop_wifi_tf
        
        # Occurrences of every wifi over the whole mall.
        train_wifi_count = {}
        for wifi_infos in train_mall_df['wifi_infos']:
            for wifi_info in wifi_infos:
                wifi_name = wifi_info[0]
                if wifi_name not in train_wifi_count:
                    train_wifi_count[wifi_name] = 1
                else:
                    train_wifi_count[wifi_name] += 1

        # Wifis seen at least 10 times in the mall (the test is '>= 10').
        train_wifi_gt10 = set()
        for wifi_name in train_wifi_count:
            if train_wifi_count[wifi_name] >= 10:
                train_wifi_gt10.add(wifi_name)

        # Select the wifis that become features; each gets the next free index.
        self.wifi = {}
        wifi_num = 0
        # 1. Wifis whose TF value within some shop is at least 0.02.
        for shop_id in self.shops_wifi_tf:
            shop_wifi_tf = self.shops_wifi_tf[shop_id]
            for wifi_name in shop_wifi_tf:
                if shop_wifi_tf[wifi_name] >= 0.02:
                    if wifi_name not in self.wifi:
                        self.wifi[wifi_name] = wifi_num
                        wifi_num += 1
        # 2. Wifis seen at least 10 times in the whole mall.
        #    NOTE(review): iterating a set makes the assigned indices depend
        #    on set order.
        for wifi_name in train_wifi_gt10:
            if wifi_name not in self.wifi:
                self.wifi[wifi_name] = wifi_num
                wifi_num += 1
        # 3. The top 10% of wifis by occurrence count in the whole mall.
        wifi_perc10 = sorted(train_wifi_count.items(), key=lambda d: d[1], reverse=True)
        wifi_perc10 = [x[0] for x in wifi_perc10[:int(len(train_wifi_count)*0.10)]]
        for wifi_name in wifi_perc10:
            if wifi_name not in self.wifi:
                self.wifi[wifi_name] = wifi_num
                wifi_num += 1
        self.wifi_num = wifi_num
        print('WIFI NUM:', self.wifi_num) 
    
    def f_wifi(self, row):
        """
        Fill the wifi feature slots of one row: for every wifi of the row
        that is a selected feature, store its strength at position
        -1 - <feature index> (counted from the end of the row).

        NOTE(review): the row is read by the label 'wifi_infos' and written
        by negative position; confirm both work for the row type that
        DataFrame.apply passes with raw=True on the pandas version in use.
        """
        for wifi_info in row['wifi_infos']:
            wifi_name = wifi_info[0]
            wifi_intensity = wifi_info[1]
            if wifi_name in self.wifi:
                row[-1 - self.wifi[wifi_name]] = wifi_intensity
        return row

    def apply_f_wifi(self, df):
        """Apply f_wifi to every row of df (the unit of work of one pool task)."""
        return df.apply(self.f_wifi, axis=1, raw=True)

    def get_wifi_vector(self, df):
        """
        Append empty wifi_0 .. wifi_<wifi_num - 1> columns to df, split the
        frame into SPLITS parts, fill the parts in a process pool with
        apply_f_wifi and return the re-concatenated frame.
        """
        df_temp = pd.DataFrame(columns=['wifi_' + str(i) for i in range(self.wifi_num)])
        df = pd.concat([df, df_temp], axis=1)
        df_parts_temp = np.array_split(df, SPLITS)
        with Pool(processes=SPLITS) as pool:
            df_parts = pool.map(self.apply_f_wifi, df_parts_temp)
        df = pd.concat(df_parts)
        return df
    
    def run(self):
        """
        For every processed mall: build the feature matrix, then train and
        predict with the random forest and with XGBoost.
        """
        
        mall_list = self.shop_info.mall_id.unique()
        for mall_id in mall_list:
            # Only mall 'm_6803' is processed; every other mall is skipped.
            # NOTE(review): looks like a leftover debugging filter — confirm.
            if mall_id != 'm_6803':
                continue
            # Select the training rows and the evaluation rows of this mall.
            train_mall_df = self.train_data[self.train_data.mall_id == mall_id]
            evl_mall_df = self.evl_data[self.evl_data.mall_id == mall_id]
            # presumably the '_x' columns are the user-side coordinates left
            # by the merge with the shop table — verify against the data.
            train_mall_df.rename(columns={'longitude_x': 'longitude', 'latitude_x': 'latitude'}, inplace=True)
            # Parse wifi_infos: records separated by ';', fields by '|'.
            train_mall_df['wifi_infos'] = train_mall_df['wifi_infos'].apply(lambda x: [self.wifi_info_process(wifi.split('|')) for wifi in x.split(';')])
            evl_mall_df['wifi_infos'] = evl_mall_df['wifi_infos'].apply(lambda x: [self.wifi_info_process(wifi.split('|')) for wifi in x.split(';')])
            # Training labels and test row ids.
            row_ids = list(evl_mall_df['row_id'])
            shop_ids = list(train_mall_df['shop_id'])
            # Keep only the columns that are needed.
            train_columns = ['longitude', 'latitude', 'wifi_infos', 'shop_id']
            evl_columns = ['longitude', 'latitude', 'wifi_infos', 'row_id']
            train_mall_df = train_mall_df[train_columns]
            evl_mall_df = evl_mall_df[evl_columns]
            # Build the per-mall data structures.
            self.mall_init(mall_id, train_mall_df,evl_mall_df)
            # Stack training and evaluation rows so both are vectorised and
            # scaled together; training rows come first.
            df = pd.concat([train_mall_df, evl_mall_df])
            df = self.get_wifi_vector(df)
            columns = ['longitude', 'latitude'] + ['wifi_' + str(i) for i in range(len(self.wifi))]  
            df = df[columns]
            df = df.fillna(0)
            X = np.asarray(df, dtype=np.float64)
            min_max_scaler = MinMaxScaler()
            min_max_scaler.fit(X)
            X = min_max_scaler.transform(X)
            # Split back: the first len(shop_ids) rows are the training set.
            X_train = X[:len(shop_ids)]
            X_test = X[len(shop_ids):]
            rf = RF()
            rf.train(mall_id, X_train, shop_ids, X_test, row_ids)
            xgb.analyse(mall_id, X_train, shop_ids)
            xgb.train(mall_id, X_train, shop_ids, X_test, row_ids)
            print('='*120)
194 | 
195 | 
if __name__ == '__main__':
    # Load the data sets and run the whole pipeline.
    TianChi().run()
199 | 


--------------------------------------------------------------------------------
/2/tianchi.py:
--------------------------------------------------------------------------------
  1 | # coding: utf-8
  2 | 
  3 | import numpy as np
  4 | import pandas as pd
  5 | from multiprocessing import Pool
  6 | from sklearn.preprocessing import MinMaxScaler
  7 | import time
  8 | from tool import rf
  9 | import re
 10 | import os
 11 | import math
 12 | 
 13 | 
 14 | # 进程数(将DataFrame划分成 SPLITS 块，每块交给一个进程处理)
 15 | SPLITS = 12
 16 | # 候选集样例数(用余弦相似度选出候选集)
 17 | CANDIDATE_NUM = 5
 18 | 
 19 | 
 20 | def cos_sim(vector_a, vector_b):
 21 |     """
 22 |     计算余弦相似度
 23 |     vector_a: 向量 a
 24 |     vector_b: 向量 b
 25 |     """
 26 |     vector_a = np.mat(vector_a)
 27 |     vector_b = np.mat(vector_b)
 28 |     num = float(vector_a * vector_b.T)
 29 |     denom = np.linalg.norm(vector_a) * np.linalg.norm(vector_b)
 30 |     cos = num / denom
 31 |     sim = 0.5 + 0.5 * cos
 32 |     return sim
 33 | 
 34 | 
 35 | class TianChi:
 36 | 
 37 |     def __init__(self):
 38 |         """
 39 |         初始化函数，加载数据，连接数据
 40 |         """
 41 |         self.shop_info = pd.read_csv('./data/shop_info.csv', sep=',', encoding='utf8', engine='c')
 42 |         self.user_info = pd.read_csv('./data/user_shop_behavior.csv', sep=',', encoding='utf8', engine='c')
 43 |         self.evl_data = pd.read_csv('./data/evaluation.csv', sep=',', encoding='utf8', engine='c')
 44 |         self.train_data = pd.merge(self.user_info, self.shop_info, on=['shop_id'])
 45 |     
 46 |     def f_cossim_candidate(self, row):
 47 |         cossim_candidate = {}
 48 |         user_wifi_name = [x[0] for x in row['wifi_infos']]
 49 |         user_wifi_vector = [x[1] for x in row['wifi_infos']]
 50 |         for shop_id in  self.shops_wifi:
 51 |             shop_wifi_vector = []
 52 |             for wifi_name in user_wifi_name:
 53 |                 if wifi_name in  self.shops_wifi[shop_id]:
 54 |                     shop_wifi_vector.append( self.shops_wifi[shop_id][wifi_name])
 55 |                 else:
 56 |                     shop_wifi_vector.append(0)
 57 |             shop_wifi_vector = np.array(shop_wifi_vector)
 58 |             cossim = cos_sim(user_wifi_vector, shop_wifi_vector)
 59 |             if np.isnan(cossim):
 60 |                 continue
 61 |             else:
 62 |             	cossim_candidate[shop_id] = cossim
 63 |         cossim_candidate = [x[0] for x in sorted(cossim_candidate.items(), key=lambda x: x[1], reverse=True)[0:CANDIDATE_NUM]]   
 64 |         row['cossim_candidate'] = set(cossim_candidate)
 65 |         return row
 66 | 
 67 |     def apply_cossim_candidate(self, df_part):
 68 |         return df_part.apply(self.f_cossim_candidate, axis=1)
 69 | 
 70 |     def cossim_candidate_process(self, df):
 71 |         df = df.reindex(columns=df.columns.tolist() + ['cossim_candidate'])
 72 |         df_parts = np.array_split(df, SPLITS)
 73 |         with Pool(processes=SPLITS) as pool:
 74 |             df_parts_r = pool.map(self.apply_cossim_candidate, df_parts)
 75 |         df = pd.concat(df_parts_r)
 76 |         return df
 77 |     
 78 |     def f_wifi(self, row):
 79 |         for wifi_info in row['wifi_infos']:
 80 |             wifi_name = wifi_info[0]
 81 |             wifi_intensity = wifi_info[1]
 82 |             if wifi_name in self.wifi:
 83 |                 row[-1 - self.wifi[wifi_name]] = wifi_intensity
 84 |         return row
 85 | 
 86 |     def apply_f_wifi(self, df):
 87 |         return df.apply(self.f_wifi, axis=1, raw=True)
 88 | 
 89 |     def get_wifi_vector(self, df):
 90 |         df_temp = pd.DataFrame(columns=['wifi_' + str(i) for i in range(self.wifi_num)])
 91 |         df = pd.concat([df, df_temp], axis=1)
 92 |         df_parts_temp = np.array_split(df, SPLITS)
 93 |         with Pool(processes=SPLITS) as pool:
 94 |             df_parts = pool.map(self.apply_f_wifi, df_parts_temp)
 95 |         df = pd.concat(df_parts)
 96 |         return df
 97 |     
 98 |     @staticmethod
 99 |     def wifi_process(df, wifi_dict):
100 |         df = df.reindex(columns=df.columns.tolist() + ['wifi_' + str(i) for i in range(len(wifi_dict))])
101 |         for index, row in df.iterrows():
102 |             for wifi_info in row['wifi_infos']:
103 |                 wifi_name = wifi_info[0]
104 |                 wifi_isty = wifi_info[1]
105 |                 if wifi_name in wifi_dict:
106 |                     df.loc[index, 'wifi_'+str(wifi_dict[wifi_name])] = wifi_isty
107 |         return df
108 |     
109 |     @staticmethod
110 |     def wifi_info_process(wifi_info):
111 |         """
112 |         预处理 wifi_infos 字段
113 |         """
114 |         wifi_name = wifi_info[0]
115 |         wifi_isty = int(wifi_info[1])
116 |         wifi_conn = wifi_info[2]
117 |         if wifi_isty <= -100:
118 |             wifi_isty = 0
119 |         else:
120 |             wifi_isty = (wifi_isty + 100) / 100.0
121 |         if wifi_conn == 'true':
122 |             wifi_conn = True
123 |         else:
124 |             wifi_conn = False
125 |         return [wifi_name, wifi_isty, wifi_conn]
126 |     
127 |     def mall_init(self, mall_id, train_mall_df, evl_mall_df):
128 |         """
129 |         商场的数据结构初始化
130 |         mall_id: 商场ID
131 |         train_mall_df: 训练集 DataFrame
132 |         evl_mall_df: 测试集 DataFrame
133 |         """
134 |         self.shops = {}
135 |         shop_list = self.shop_info[self.shop_info.mall_id == mall_id].shop_id.unique()
136 |         for shop_id in shop_list:
137 |             self.shops[shop_id] = train_mall_df[train_mall_df.shop_id == shop_id]
138 |         print('MALL ID: %s\nTRAIN NUM: %s\nEVL_NUM: %s\nSHOP_NUM: %s' %(mall_id, train_mall_df.shape[0], evl_mall_df.shape[0], len(shop_list)))
139 |         # 统计每个SHOP的WIFI数和每个SHOP的每个WIFI的强度和
140 |         self.shops_wifi_count = {}
141 |         self.shops_wifi_isty = {}
142 |         for shop_id in self.shops:
143 |             shop = self.shops[shop_id]
144 |             shop_wifi_count = {}
145 |             shop_wifi_isty = {}
146 |             for index, row in shop.iterrows():
147 |                 for wifi_info in row['wifi_infos']:
148 |                     wifi_name = wifi_info[0]
149 |                     wifi_isty = wifi_info[1]
150 |                     if wifi_name not in shop_wifi_count:
151 |                         shop_wifi_count[wifi_name] = 1
152 |                         shop_wifi_isty[wifi_name] = wifi_isty
153 |                     else:
154 |                         shop_wifi_count[wifi_name] += 1
155 |                         shop_wifi_isty[wifi_name] += wifi_isty
156 |             self.shops_wifi_count[shop_id] = shop_wifi_count
157 |             self.shops_wifi_isty[shop_id] = shop_wifi_isty
158 | 
159 |         # 求每家商铺的 WIFI 的平均强度
160 |         for shop_id in self.shops_wifi_isty:
161 |             shop_wifi_isty = self.shops_wifi_isty[shop_id]
162 |             shop_wifi_count = self.shops_wifi_count[shop_id]
163 |             for wifi_name in shop_wifi_isty:
164 |                 shop_wifi_isty[wifi_name] = float(shop_wifi_isty[wifi_name]) / (shop_wifi_count[wifi_name])
165 |             self.shops_wifi_isty[shop_id] = shop_wifi_isty
166 | 
167 |         # 商场中每家商铺的每个WIFI的TF值
168 |         self.shops_wifi_tf = {}
169 |         for shop_id in self.shops_wifi_count:
170 |             shop_wifi_count = self.shops_wifi_count[shop_id]
171 |             shop_wifi_tf = {}
172 |             total_num = float(sum(shop_wifi_count.values()))
173 |             for wifi_name in shop_wifi_count:
174 |                 shop_wifi_tf[wifi_name] = shop_wifi_count[wifi_name] / total_num
175 |             self.shops_wifi_tf[shop_id] = shop_wifi_tf
176 |         
177 |         # 统计整个商场中，每个WIFI的出现次数
178 |         train_wifi_count = {}
179 |         for wifi_infos in train_mall_df['wifi_infos']:
180 |             for wifi_info in wifi_infos:
181 |                 wifi_name = wifi_info[0]
182 |                 if wifi_name not in train_wifi_count:
183 |                     train_wifi_count[wifi_name] = 1
184 |                 else:
185 |                     train_wifi_count[wifi_name] += 1
186 | 
187 |         # 统计商场中WIFI出现次数大于10的WIFI
188 |         train_wifi_gt10 = set()
189 |         for wifi_name in train_wifi_count:
190 |             if train_wifi_count[wifi_name] >= 10:
191 |                 train_wifi_gt10.add(wifi_name)
192 | 
193 |         # 筛选出做特征的WIFI
194 |         self.wifi = {}
195 |         wifi_num = 0
196 |         # 1. 筛选出每个SHOP的WIFI的TF值大于0.02的WIFI
197 |         for shop_id in self.shops_wifi_tf:
198 |             shop_wifi_tf = self.shops_wifi_tf[shop_id]
199 |             for wifi_name in shop_wifi_tf:
200 |                 if shop_wifi_tf[wifi_name] >= 0.02:
201 |                     if wifi_name not in self.wifi:
202 |                         self.wifi[wifi_name] = wifi_num
203 |                         wifi_num += 1
204 |         # 2. 筛选出整个商场中WIFI出现次数大于10的WIFI
205 |         for wifi_name in train_wifi_gt10:
206 |             if wifi_name not in self.wifi:
207 |                 self.wifi[wifi_name] = wifi_num
208 |                 wifi_num += 1
209 |         # 3. 筛选出整个商场中WIFI出现次数的TOP10%
210 |         wifi_perc10 = sorted(train_wifi_count.items(), key=lambda d: d[1], reverse=True)
211 |         wifi_perc10 = [x[0] for x in wifi_perc10[:int(len(train_wifi_count)*0.10)]]
212 |         for wifi_name in wifi_perc10:
213 |             if wifi_name not in self.wifi:
214 |                 self.wifi[wifi_name] = wifi_num
215 |                 wifi_num += 1
216 |         self.wifi_num = wifi_num
217 |         print('WIFI NUM:', self.wifi_num)
218 | 
219 |         # 挑选每个商铺的 WIFI(用于余弦相似度的计算)
220 |         self.shops_wifi = {}
221 |         for shop_id in self.shops_wifi_isty:
222 |             shop_wifi_isty = self.shops_wifi_isty[shop_id]
223 |             shop_wifi_tf = self.shops_wifi_tf[shop_id]
224 |             shop_wifi = {}
225 |             for wifi_name in shop_wifi_isty:
226 |                 if shop_wifi_tf[wifi_name] >= 0.02:
227 |                     shop_wifi[wifi_name] = shop_wifi_isty[wifi_name]
228 |             self.shops_wifi[shop_id] = shop_wifi_isty
229 |         
230 |     def chunks(self, arr, m):
231 |         """
232 |         将一个列表等分成 m 份
233 |         """
234 |         n = int(math.ceil(len(arr) / float(m)))
235 |         return [arr[i:i + n] for i in range(0, len(arr), n)]
236 |     
    def run_proc(self, candidate_chunks, evl_mall_df, file_name):
        """
        Predict the shop of every evaluation row, one candidate set at a time.

        candidate_chunks: iterable of candidate shop-id sets
        evl_mall_df: evaluation DataFrame with a 'cossim_candidate' column
        file_name: name of the result file written under './runs/'

        For each candidate set, the evaluation rows carrying exactly that set
        are classified by a model trained only on the training rows of the
        candidate shops. The 'row_id,shop_id' lines are written without a
        header line.
        """
        result = {}
        step = 1
        for candidates in candidate_chunks:
            # Only the step counter is printed; add more output here if needed.
            print('step:', step)
            step += 1
            test_df = evl_mall_df[evl_mall_df['cossim_candidate'] == candidates]
            columns = ['longitude', 'latitude', 'wifi_infos']
            row_ids = list(test_df['row_id'])
            test_df = test_df[columns]
            shop_ids = list(pd.concat([self.shops[shop_id]['shop_id'] for shop_id in candidates]))
            shop_df = pd.concat([self.shops[shop_id][columns] for shop_id in candidates])
            # Training rows first, test rows after them.
            # NOTE(review): the original index labels are kept, so training
            # and test rows may share labels — confirm wifi_process copes.
            shop_df = pd.concat([shop_df, test_df])
            wifi_dict = {}
            wifi_num = 0
            for shop_id in candidates:
                # The string below is the original author's note: it says the
                # wifi features of the candidate shops could be built in
                # several ways (by TF, by raw count, ...) and that not all of
                # them were tried.
                '''
                构造候选集SHOP的WIFI特征
                这个部分有很多种方式（不同的方式，可以按照TF来构造，也可以只按照出现次数来构造等），
                时间所限，这里并没有全部尝试完成
                '''
                shop_wifi_tf = self.shops_wifi_tf[shop_id]
                # Disabled alternative: take the top 20% of the shop's wifis
                # by occurrence count.
                # shop_wifi_count = self.shops_wifi_count[shop_id]
                # shop_wifi_count = sorted(shop_wifi_count.items(), key=lambda x: x[1], reverse=True)
                # shop_wifi_count = [x[0] for x in shop_wifi_count[:int(len(shop_wifi_count)*0.2)]]
                # Active rule: every wifi whose TF within the shop is >= 0.02
                # gets the next free feature index.
                for wifi_name in shop_wifi_tf:
                    if wifi_name not in wifi_dict and shop_wifi_tf[wifi_name]>=0.02:
                        wifi_dict[wifi_name] = wifi_num
                        wifi_num += 1
                # for wifi_name in shop_wifi_count:
                    # if wifi_name not in wifi_dict:
                        # wifi_dict[wifi_name] = wifi_num
                        # wifi_num += 1
            shop_df = TianChi.wifi_process(shop_df, wifi_dict)
            columns = ['longitude', 'latitude'] + ['wifi_' + str(i) for i in range(len(wifi_dict))]  
            shop_df = shop_df[columns]
            shop_df = shop_df.fillna(0)
            X = np.asarray(shop_df, dtype=np.float64)
            # The scaler is fitted on training and test rows together.
            min_max_scaler = MinMaxScaler()
            min_max_scaler.fit(X)
            X = min_max_scaler.transform(X)
            # The first len(shop_ids) rows are the training rows.
            X_train = X[:len(shop_ids)]
            X_test = X[len(shop_ids):]
            # Predict the shop ids of the test rows.
            # NOTE(review): `rf` comes from `from tool import rf`, but the
            # tool.py shown in this dump defines only get_result, and rf.py
            # exposes train() as a method of class RF — confirm the import.
            shop_ids_predict = rf.train(X_train, shop_ids, X_test)
            for i, row_id in enumerate(row_ids):
                result[row_id] = shop_ids_predict[i]
        
        with open('./runs/' + file_name, 'w') as f:
            for row_id in result:
                f.write('%s,%s\n' %(row_id, result[row_id]))
292 | 
293 |     
294 |     def run(self):
295 |         
296 |         mall_list = self.shop_info.mall_id.unique()
297 |         for mall_id in mall_list:
298 |             if mall_id != 'm_6803':
299 |                 continue
300 |             # 提取训练集数据和验证集数据
301 |             train_mall_df = self.train_data[self.train_data.mall_id == mall_id]
302 |             evl_mall_df = self.evl_data[self.evl_data.mall_id == mall_id]
303 |             train_mall_df.rename(columns={'longitude_x': 'longitude', 'latitude_x': 'latitude'}, inplace=True)
304 |             # wif_infos 字段的预处理
305 |             train_mall_df['wifi_infos'] = train_mall_df['wifi_infos'].apply(lambda x: [TianChi.wifi_info_process(wifi.split('|')) for wifi in x.split(';')])
306 |             evl_mall_df['wifi_infos'] = evl_mall_df['wifi_infos'].apply(lambda x: [TianChi.wifi_info_process(wifi.split('|')) for wifi in x.split(';')])
307 |              # 提取训练集标签和测试集行号
308 |             row_ids = list(evl_mall_df['row_id'])
309 |             shop_ids = list(train_mall_df['shop_id'])
310 |             # 提取需要的列
311 |             train_columns = ['longitude', 'latitude', 'wifi_infos', 'shop_id']
312 |             evl_columns = ['longitude', 'latitude', 'wifi_infos', 'row_id']
313 |             train_mall_df = train_mall_df[train_columns]
314 |             evl_mall_df = evl_mall_df[evl_columns]
315 |             # mall 数据结构初始化
316 |             self.mall_init(mall_id, train_mall_df, evl_mall_df)
317 |             # 连接train_mall_df和evl_mall_df进行预处理
318 |             df = pd.concat([train_mall_df, evl_mall_df])
319 |             df_temp = self.get_wifi_vector(df)
320 |             columns = ['longitude', 'latitude'] + ['wifi_' + str(i) for i in range(len(self.wifi))]  
321 |             df_temp = df_temp[columns]
322 |             df_temp = df_temp.fillna(0)
323 |             X = np.asarray(df_temp, dtype=np.float64)
324 |             min_max_scaler = MinMaxScaler()
325 |             min_max_scaler.fit(X)
326 |             X = min_max_scaler.transform(X)
327 |             # 分离出训练集和测试集
328 |             X_train = X[:len(shop_ids)]
329 |             X_test = X[len(shop_ids):]
330 |             # 第一步预测：用随机森林进行预测，筛选出单个SHOP预测概率大于0.8的样本，并确定预测结果为最终结果
331 |             results = {}
332 |             results_all = {}
333 |             shop_ids_prob, lbl =rf.train_prob(X_train, shop_ids, X_test)
334 |             for i, row in enumerate(shop_ids_prob):
335 |                 # 统计单个店铺概率概率大于0.8结果
336 |                 for k, prob in enumerate(row):
337 |                     if prob >= 0.8:
338 |                         shop_id = lbl.inverse_transform(int(k))
339 |                         row_id = row_ids[i]
340 |                         results[row_id] = shop_id
341 |                         break
342 |                 # 统计随机森林预测的所有结果
343 |                 k_temp = 0
344 |                 prob_temp = 0
345 |                 for k, prob in enumerate(row):
346 |                     if prob > prob_temp:
347 |                         k_temp = k
348 |                         prob_temp = prob
349 |                 shop_id = lbl.inverse_transform(int(k_temp))
350 |                 row_id = row_ids[i]
351 |                 results_all[row_id] = shop_id
352 | 
353 |             # 筛选出第一步没有确定商铺的测试集样例
354 |             for row_id in results:
355 |                 df.loc[df.row_id == row_id, 'shop_id'] = results[row_id]
356 |             evl_mall_df = df[df.shop_id.isnull()]
357 |             row_ids = evl_mall_df['row_id']
358 |             
359 |             # 计算测试集每个样例的候选集
360 |             # 候选集的挑选仅使用了用户的WIFI列表和每个店铺的WIFI列表的余弦相似度
361 |             evl_mall_df = self.cossim_candidate_process(evl_mall_df)
362 |             
363 |             # 统计所有的候选集集合
364 |             candidates_list = []
365 |             for candidates in evl_mall_df['cossim_candidate']:
366 |                 # 其中 candidates 的类型为 set
367 |                 if candidates not in candidates_list:
368 |                     candidates_list.append(candidates)
369 |             
370 |             # 过滤长度为0和长度为1的候选集集合
371 |             for candidates in candidates_list:
372 |                 test_df = evl_mall_df[evl_mall_df['cossim_candidate'] == candidates]
373 |                 # 长度为0，说明用户的WIFI列表里的WIFI在之前的训练集中没有出现过
374 |                 if len(candidates) == 0:
375 |                     for index, row in test_df.iterrows():
376 |                         row_id = row['row_id']
377 |                         results[row_id] = results_all[row_id]
378 |                     candidates_list.remove(candidates)
379 |                 # 长度为1，说明用户的WIFI列表仅与一家商铺的WIFI匹配
380 |                 elif len(candidates) == 1:
381 |                     for index, row in test_df.iterrows():
382 |                         row_id = row['row_id']
383 |                         results[row_id] = list(candidates)[0]
384 |                     candidates_list.remove(candidates)
385 |             print('\n', 'TOTAL STEPS:', len(candidates_list), '\n')
386 |             
387 |             # 划分候选集集合
388 |             candidate_chunks = self.chunks(candidates_list, SPLITS)
389 |             file_list = [str(i)+'.csv' for i in range(SPLITS)]
390 | 
391 |             # 多进程处理各个候选集情况
392 |             p = Pool(SPLITS)
393 |             for i in range(SPLITS):
394 |                 p.apply_async(self.run_proc, args=(candidate_chunks[i], evl_mall_df, file_list[i]))
395 |             p.close()
396 |             # 全部进程结束才执行下面的汇总结果的代码
397 |             p.join()
398 | 
399 |             # 汇总结果
400 |             fw = open('./mall_results/' + str(mall_id) + 'result.csv', 'w')
401 |             fw.write('row_id,shop_id\n')
402 |             for row_id in results:
403 |                 fw.write('%s,%s\n' %(row_id, results[row_id]))
404 |             for filename in file_list:
405 |                 with open('./runs/' + filename, 'r') as f:
406 |                     lines = f.readlines()
407 |                     for line in lines:
408 |                         if line.strip() != '':
409 |                             fw.write(line)
410 |             fw.close()
411 |    
412 |     
if __name__ == '__main__':
    # Script entry point: build the pipeline object and run the prediction.
    TianChi().run()
416 | 


--------------------------------------------------------------------------------