├── .gitignore ├── .idea └── vcs.xml ├── LICENSE ├── README.md ├── __init.py └── hrwhisper ├── __init__.py ├── analysis_mall_location_data.py ├── analysis_user_data.py ├── analysis_wifi_data.py ├── common_helper.py ├── grid_search.py ├── model_stacking.py ├── model_test.py ├── model_voting.py ├── parse_data.py ├── predict_category_pro.py ├── predict_price.py ├── season2 ├── README.md ├── __init__.py ├── create_table │ ├── 1 create_train_table.sql │ ├── 2 create_test_table.sql │ ├── 3 create_test_wifi_top.sql │ ├── 3 create_train_wifi_top.sql │ ├── 4 test_fix_mssing.sql │ ├── 4 train_fix_missing.sql │ ├── 5 final_test_table.sql │ └── 5 final_train_table.sql ├── generate_result.py ├── generate_train_sql.py └── mall_id ├── use_category2.py ├── use_location.py ├── use_price.py ├── use_strong_wifi.py ├── use_time.py ├── use_user.py ├── use_wifi.py ├── use_wifi_kstrong.py ├── visualization_mall_data.py └── visulization_wifi_data.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 HuangRong Yang 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above 
copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 天池 商场中精确定位用户所在店铺 2 | - B榜 0.92115 3 | - Rank 13 4 | 5 | 6 | 7 | - run `analysis_mall_location_data.py` for `./feature_save/mall_center_and_area.csv` 8 | - run `analysis_wifi_data.py` for some wifi table. 
def center_latitudes_and_longitudes(geo_coordinates):
    """
    Return the geographic midpoint of a collection of coordinates.

    Every (latitude, longitude) pair is converted from degrees to a unit
    vector in 3-D Cartesian space, the vectors are averaged, and the mean
    vector is converted back to latitude/longitude in degrees.

    :param geo_coordinates: iterable of (latitude, longitude) pairs in
        degrees, e.g. [[latitude, longitude], ...]; any iterable is accepted,
        including generators and zip objects
    :return: (latitude, longitude) of the midpoint, in degrees
    :raises ValueError: if geo_coordinates yields no point
    """
    x = y = z = 0.0
    count = 0
    for lat, lng in geo_coordinates:
        lat, lng = lat * pi / 180, lng * pi / 180
        x += cos(lat) * cos(lng)
        y += cos(lat) * sin(lng)
        z += sin(lat)
        count += 1

    # Counting while iterating avoids len(), so one-shot iterables work too.
    if count == 0:
        raise ValueError('geo_coordinates must contain at least one point')

    x /= count
    y /= count
    z /= count
    lng = atan2(y, x)
    hyp = sqrt(x ** 2 + y ** 2)
    lat = atan2(z, hyp)
    return lat * 180 / pi, lng * 180 / pi
def many_mall_has_same_bssid():
    """
    Collect the bssids that appear in more than one mall and dump them to disk.

    A bssid seen in several malls may be a mobile hotspot.  The resulting set
    is written to ./feature_save/many_mall_wifi_bssid.pkl with joblib, and the
    elapsed scan time plus a summary line are printed.

    :return: None
    """
    import os  # local import: only needed to create the output directory

    train_data = read_train_join_mall()
    malls_by_bssid = collections.defaultdict(set)
    start = time.time()
    for mall_id, wifi_infos in zip(train_data['mall_id'], train_data['wifi_infos']):
        for wifi in wifi_infos.split(';'):
            # every entry has three '|'-separated fields; only the first (bssid) is used
            _id, _, _ = wifi.split('|')
            malls_by_bssid[_id].add(mall_id)
    print(time.time() - start)
    many_uid = {key for key, malls in malls_by_bssid.items() if len(malls) > 1}
    # make sure the target directory exists before dumping
    os.makedirs('./feature_save', exist_ok=True)
    joblib.dump(many_uid, './feature_save/many_mall_wifi_bssid.pkl')
    print('total: {} repeat in other mall: {}'.format(len(malls_by_bssid), len(many_uid)))


def check_mall(train_data, mall_id='m_6803'):
    """
    Print wifi statistics for one mall and save its signal-strength histogram.

    Original author's observations: the number of transactions does not seem
    to affect the result much, and neither does the number of wifi signals
    received.

    The histogram is written to ./analysis_data/mall_wifi_<mall_id>.csv with
    the columns ``strong,cnt``, sorted by strength.

    :param train_data: DataFrame with at least 'mall_id' and 'wifi_infos' columns
    :param mall_id: the mall to inspect
    :return: None
    """
    import os  # local import: only needed to create the output directory

    train_data = train_data[train_data['mall_id'] == mall_id]

    print('---------{}-------'.format(mall_id))
    print('shape: {}'.format(train_data.shape))
    wifi_bssid = set()
    _id_cnt = 0
    strong_cnt = collections.Counter()
    receive_cnt = []
    for wifi_infos in train_data['wifi_infos']:
        wifis = wifi_infos.split(';')
        receive_cnt.append(len(wifis))
        for wifi in wifis:
            _id, _strong, _ = wifi.split('|')
            wifi_bssid.add(_id)
            strong_cnt[int(_strong)] += 1
            _id_cnt += 1

    os.makedirs('./analysis_data', exist_ok=True)
    with open('./analysis_data/mall_wifi_{}.csv'.format(mall_id), 'w') as f:
        f.write('strong,cnt\n')
        # write the joined text in one call; writelines() on a str would emit it char by char
        f.write('\n'.join('{},{}'.format(strong, cnt) for strong, cnt in sorted(strong_cnt.items())))

    # a mall without any record has no mean; report 0 instead of dividing by zero
    mean_receive = sum(receive_cnt) / len(receive_cnt) if receive_cnt else 0
    print('number of bssid: {}, cnt: {}, mean receive:{}'.format(len(wifi_bssid), _id_cnt, mean_receive))
    print()
def wifi_empty_statics(data=None):
    """
    Count, per mall, the records whose wifi_infos field contains no wifi entry.

    The original note here said that no record has an empty wifi column.
    That conclusion came from testing ``len(wifi_infos.split(';')) == 0``,
    which can never be true because ``''.split(';')`` is ``['']``; the check
    below looks at the actual content instead.

    :param data: optional DataFrame with 'mall_id' and 'wifi_infos' columns;
        when omitted the test data is loaded with read_test_data()
    :return: collections.Counter mapping mall_id -> number of empty records
        (malls without empty records are absent)
    """
    if data is None:
        data = read_test_data()  # read_train_join_mall()

    counter = collections.Counter()
    for mall_id, wifi_infos in zip(data['mall_id'], data['wifi_infos']):
        # A missing value (e.g. NaN) is not a str and counts as empty; so does
        # a string whose ';'-separated parts are all blank.
        if not isinstance(wifi_infos, str) or not any(part.strip() for part in wifi_infos.split(';')):
            counter[mall_id] += 1

    for mall_id, cnt in counter.items():
        print(mall_id, cnt)
    return counter
def _split_rows(data, split_at):
    """Return (data[:split_at], data[split_at:]) using positional indexing."""
    rows = data.iloc if hasattr(data, 'iloc') else data
    return rows[:split_at], rows[split_at:]


def _ensure_parent_dir(path):
    """Create the directory that will contain *path*, if the path names one."""
    dir_name = os.path.dirname(path)
    # dirname is '' for a bare file name, and os.makedirs('') raises
    if dir_name:
        os.makedirs(dir_name, exist_ok=True)


def train_test_split(X, y, test_size=0.2):
    """
    Split X and y by position without shuffling: leading rows are the train
    part, trailing rows the test part.

    X and y are handled independently, so a DataFrame may be paired with a
    numpy array (or the reverse).

    :param X: features; anything with ``.shape`` that supports ``.iloc`` or slicing
    :param y: labels, same length as X; supports ``.iloc`` or slicing
    :param test_size: fraction of rows that goes to the test part
    :return: X_train, X_test, y_train, y_test
    """
    # round before truncating so float error (e.g. (1 - 0.8) * 5 == 0.9999999999999998)
    # does not drop a row from the train part
    train_size = int(round((1 - test_size) * X.shape[0], 8))
    X_train, X_test = _split_rows(X, train_size)
    y_train, y_test = _split_rows(y, train_size)
    return X_train, X_test, y_train, y_test


def train_test_split_by_date(X, y):
    """
    Split X and y on the date '2017-08-18' using X['time_stamp'].

    :param X: DataFrame with a 'time_stamp' column comparable to a date string
    :param y: labels indexed like X
    :return: rows on/after the date of X, rows before the date of X,
        then the same two parts of y
    """
    # NOTE(review): the first element holds the rows ON/AFTER the split date.
    # If callers unpack this like train_test_split (train first), the train and
    # test parts are swapped in time -- confirm the intended order with callers.
    mask = X['time_stamp'] >= '2017-08-18'
    return X.loc[mask], X.loc[~mask], y.loc[mask], y.loc[~mask]


def safe_dump_model(model, save_path, compress=3):
    """
    Dump *model* to *save_path* with joblib, creating the parent directory first.

    :param model: object to persist
    :param save_path: target file path
    :param compress: compression level passed to joblib.dump
    """
    print('save model......')
    _ensure_parent_dir(save_path)
    joblib.dump(model, save_path, compress=compress)
    print('save model done.')


def safe_save_csv_result(res, save_path):
    """
    Write *res* to *save_path* via its ``to_csv`` method, creating the parent
    directory first.

    :param res: object exposing ``to_csv(path)`` (e.g. a DataFrame)
    :param save_path: target file path
    """
    _ensure_parent_dir(save_path)
    res.to_csv(save_path)


def get_recommend_cpu_count():
    """
    Return the number of worker processes to use, always at least 1.

    Windows: use only half of the cores.
    Linux: tuned for the author's test machine or server -- 5/8 of the cores
    minus one when there are 32 or more, otherwise all cores but one.
    """
    # os.cpu_count() may return None when the count cannot be determined
    t = os.cpu_count() or 1
    if os.name == 'nt':
        recommended = t // 2
    elif t >= 32:
        recommended = t // 8 * 5 - 1
    else:
        recommended = t - 1
    # on a single-core machine the formulas above give 0
    return max(1, recommended)
class ModelBase(object):
    """
    Base class for multi-class classification, one model per mall.

    Validation split used by train_test: the data is sorted by time and the
    trailing ``test_ratio`` share (20% by default) is held out.
    """

    def __init__(self, test_ratio=0.2, random_state=42, n_jobs=None, use_multiprocess=False, save_model=False,
                 save_result_proba=False, save_model_base_path='./model_save/', result_save_base_path='./result_save/'):
        """
        :param test_ratio: share of rows held out by train_test
        :param random_state: seed handed to the classifiers
        :param n_jobs: worker count; get_recommend_cpu_count() when None
        :param use_multiprocess: stored on the instance; not read in this class
        :param save_model: dump every fitted classifier when True
        :param save_result_proba: save predict_proba output as csv when True
        :param save_model_base_path: root directory for dumped models
        :param result_save_base_path: root directory for probability csv files
        """
        self._test_ratio = test_ratio
        self._random_state = random_state
        self.n_jobs = get_recommend_cpu_count() if n_jobs is None else n_jobs
        self.use_multiprocess = use_multiprocess
        self.SAVE_MODEL = save_model
        self.SAVE_RESULT_PROBA = save_result_proba
        self.SAVE_MODEL_BASE_PATH = save_model_base_path
        self.RESULT_SAVE_BASE_PATH = result_save_base_path

    def get_name(self):
        """Return the name of the concrete class."""
        return self.__class__.__name__

    def _get_classifiers(self):
        """
        Build fresh, unfitted classifiers.

        :return: dict. {name:classifier}
        """
        return {
            'random forest': RandomForestClassifier(n_jobs=self.n_jobs, n_estimators=400, bootstrap=False,
                                                    random_state=self._random_state, class_weight='balanced'),
        }

    @staticmethod
    def trained_and_predict_location(clf, X_train, y_train, X_test, y_test=None, predicted_proba=False):
        """
        Fit *clf* on the train part and predict the test part.

        :param clf: estimator exposing fit / predict / predict_proba
        :param X_train: training features
        :param y_train: training labels
        :param X_test: features to predict
        :param y_test: unused; kept so existing callers keep working
        :param predicted_proba: return predict_proba output instead of labels
        :return: predicted labels, or probabilities when predicted_proba is True
        """
        print('fitting....')
        clf = clf.fit(X_train, y_train)
        print('predict....')
        if predicted_proba:
            return clf.predict_proba(X_test)
        return clf.predict(X_test)

    def _single_trained_by_mall_and_predict_location(self, vec_func, train_data, train_label, test_data,
                                                     test_label=None):
        """
        Train and predict mall by mall in the current process.

        :param vec_func: list of vectorizers used to build the feature matrices
        :param train_data: training rows for all malls
        :param train_label: labels of train_data
        :param test_data: rows to predict; must have 'mall_id' and 'row_id'
        :param test_label: labels of test_data; when given, accuracy is reported
        :return: dict mapping row_id -> predicted label
        """
        ans = {}
        clf_report = {}
        is_train = test_label is not None
        stage = 'train' if is_train else 'test'
        mall_ids = train_data['mall_id'].unique()

        for ri, mall_id in enumerate(mall_ids):
            X_train, y_train, X_test, y_test = DataVector.train_and_test_to_vec(mall_id, vec_func, train_data,
                                                                                train_label, test_data,
                                                                                test_label)
            # the row ids of this mall do not depend on the classifier; select them once
            test_row_ids = test_data.loc[test_data['mall_id'] == mall_id, 'row_id']

            classifiers = self._get_classifiers()
            for name, clf in classifiers.items():
                predicted = self.trained_and_predict_location(clf, X_train, y_train, X_test, y_test)
                if self.SAVE_MODEL:
                    safe_dump_model(clf, '{}/{}/{}_{}.pkl'.format(self.SAVE_MODEL_BASE_PATH, name,
                                                                  stage, mall_id))

                # with several classifiers the last one wins for each row_id
                for row_id, label in zip(test_row_ids, predicted):
                    ans[row_id] = label

                if self.SAVE_RESULT_PROBA:
                    predicted_pro = clf.predict_proba(X_test)
                    row_ids = pd.DataFrame(test_row_ids.values, columns=['row_id'])
                    predicted_pro = pd.DataFrame(predicted_pro, columns=clf.classes_)

                    safe_save_csv_result(pd.concat([row_ids, predicted_pro], axis=1).set_index('row_id'),
                                         '{}/{}/{}_{}.csv'.format(self.RESULT_SAVE_BASE_PATH, name,
                                                                  stage, mall_id))

                if is_train:
                    score = accuracy_score(y_test, predicted)
                    clf_report[name] = clf_report.get(name, 0) + score
                    print(ri, mall_id, name, score)
                else:
                    print(ri, mall_id)

        if is_train:
            cnt = mall_ids.shape[0]
            # fresh instances are built only to print each classifier's configuration
            classifiers = self._get_classifiers()
            for name, score in clf_report.items():
                print("{} Mean: {}".format(classifiers[name], score / cnt))
        return ans

    def _trained_by_mall_and_predict_location(self, vec_func, train_data, train_label, test_data, test_label=None):
        """
        Train and predict mall by mall; delegates to the single-process version.

        :param vec_func: list of vectorizers
        :param train_data: training rows
        :param train_label: labels of train_data
        :param test_data: rows to predict
        :param test_label: labels of test_data, or None
        :return: dict mapping row_id -> predicted label
        """
        return self._single_trained_by_mall_and_predict_location(vec_func, train_data, train_label, test_data,
                                                                 test_label)

    def train_test(self, vec_func, target_column='shop_id'):
        """
        Evaluate on a hold-out split of the training data.

        :param vec_func: list of vector function
        :param target_column: the target column you want to predict.
        :return: dict mapping row_id -> predicted label for the held-out rows
        """
        # ------input data -----------
        train_data = read_train_join_mall()
        train_data = train_data.sort_values(by='time_stamp')
        train_label = train_data[target_column]
        train_data, test_data, train_label, test_label = train_test_split(train_data, train_label, self._test_ratio)

        # returned so callers can inspect the predictions instead of losing them
        return self._trained_by_mall_and_predict_location(vec_func, train_data, train_label, test_data, test_label)

    def train_and_on_test_data(self, vec_func, target_column='shop_id'):
        """
        Train on all training data, predict the test data and write a result csv.

        :param vec_func: list of vector function
        :param target_column: the target column you want to predict.
        :return: dict mapping row_id -> predicted label
        """
        train_data = read_train_join_mall()
        train_label = train_data[target_column]
        test_data = read_test_data()

        ans = self._trained_by_mall_and_predict_location(vec_func, train_data, train_label, test_data)
        self.result_to_csv(ans, test_data)
        return ans

    @staticmethod
    def result_to_csv(ans, test_data=None):
        """
        Write predictions to ./result/hrwhisper_res_<timestamp>.csv.

        :param ans: dict mapping row_id -> shop_id
        :param test_data: optional DataFrame with a 'row_id' column; when given,
            rows are written in its order and every row_id must be in ans
        :raises KeyError: if test_data holds a row_id that is missing from ans
        """
        _save_path = './result'
        # exist_ok avoids the check-then-create race of exists() + mkdir()
        os.makedirs(_save_path, exist_ok=True)
        with open(_save_path + '/hrwhisper_res_{}.csv'.format(time.strftime("%Y-%m-%d-%H-%M-%S")), 'w') as f:
            f.write('row_id,shop_id\n')
            if test_data is not None:
                for row_id in test_data['row_id']:
                    f.write('{},{}\n'.format(row_id, ans[row_id]))
            else:
                for row_id, shop_id in ans.items():
                    f.write('{},{}\n'.format(row_id, shop_id))
        print('done')
def multiclass_xgboost():
    """
    Build a grid search over a multi-class XGBClassifier.

    :return: an unfitted GridSearchCV wrapping XGBClassifier
    """
    # The estimator is a bare XGBClassifier, so parameter names carry no
    # 'estimator__' prefix (that prefix addresses the inner estimator of a wrapper).
    parameters = {'n_jobs': [1],
                  'objective': ['multi:softmax'],
                  'learning_rate': [0.025, 0.05, 0.1],
                  'max_depth': [6, 7, 8, 10],
                  'min_child_weight': [1, 5, 10],
                  'silent': [1],
                  'subsample': [0.6, 0.8, 0.9, 1],
                  'colsample_bytree': [0.7, 0.8, ],
                  'n_estimators': [100, 200, 500],
                  'missing': [-999],
                  'random_state': [1337, 1080, 1024, 1226]}
    # The folds are not shuffled, so no random_state is passed to KFold:
    # it would have no effect, and recent scikit-learn releases reject it.
    clf = GridSearchCV(XGBClassifier(), parameters, n_jobs=get_recommend_cpu_count() // 2,
                       cv=KFold(n_splits=5),
                       verbose=1, refit=True)
    return clf


def binary_xgboost():
    """
    Build a grid search over one-vs-rest binary XGBClassifier models.

    :return: an unfitted GridSearchCV wrapping OneVsRestClassifier(XGBClassifier())
    """
    parameters = {'estimator__n_jobs': [1],
                  'estimator__objective': ['binary:logistic'],
                  'estimator__learning_rate': [0.01, 0.025, 0.05, 0.1],  # 0.015,
                  'estimator__max_depth': [5, 7, 9, 12, 15],  # 3, 17
                  'estimator__gamma': [0, 0.05, 0.1],  # 0.3 0.75
                  'estimator__min_child_weight': [1, 3, 5],  # 7
                  'estimator__subsample': [0.6, 0.7, 0.8, 1.0],  # 0.9,
                  'estimator__colsample_bytree': [0.6, 0.7, 0.8, 1.0],  # 0.9,
                  'estimator__reg_alpha': [0, 0.1, ],  # 0.5
                  'estimator__reg_lambda': [0.01, 0.05, 1],  # 0.01,
                  'estimator__n_estimators': [600],
                  'estimator__silent': [1],
                  'estimator__missing': [-999],
                  'estimator__random_state': [1080],
                  }

    # Every key above is prefixed with 'estimator__', i.e. it targets the inner
    # estimator of a wrapper; the XGBClassifier therefore has to be wrapped in
    # OneVsRestClassifier for the grid to be valid.
    clf = GridSearchCV(OneVsRestClassifier(XGBClassifier()), parameters, n_jobs=get_recommend_cpu_count(),
                       cv=KFold(n_splits=5),
                       verbose=1, refit=True)
    return clf
def lightgbm():
    """
    Build a grid search over LGBMClassifier.

    :return: an unfitted GridSearchCV wrapping LGBMClassifier
    """
    parameters = {'n_jobs': [1],
                  'n_estimators': [500, 1000],
                  'num_leaves': [60, 70, 80, 90, 110, 120, 140],
                  'learning_rate': [0.01, 0.015, 0.025, 0.05, 0.1],
                  'max_depth': [5, 7, 9, 12, 15, 17],  # 3
                  'min_child_weight': [1, 3, 5],  # 7
                  # 'subsample': [0.6, 0.7, 0.8, 1.0],  # 0.9,
                  # 'colsample_bytree': [0.6, 0.7, 0.8, 1.0],  # 0.9,
                  # 'reg_alpha': [0, 0.1, ],  # 0.5
                  # 'reg_lambda': [0.01, 0.05, 1],  # 0.01,
                  'silent': [1],
                  'random_state': [1080],
                  }

    # The folds are not shuffled, so no random_state is passed to KFold:
    # it would have no effect, and recent scikit-learn releases reject it.
    clf = GridSearchCV(LGBMClassifier(), parameters, n_jobs=get_recommend_cpu_count(),
                       cv=KFold(n_splits=3),
                       verbose=1, refit=True)
    return clf


def grid_search(clf):
    """
    Run the given grid search on the selected mall(s) and record the outcome.

    The best score and parameters are printed and appended to
    ./console_output/grid_search_res.txt.

    :param clf: a search object exposing fit, best_score_ and best_params_
        (e.g. the GridSearchCV returned by lightgbm())
    :return: None
    """
    import os  # local import: this module does not import os at file level

    train_data = read_train_join_mall()
    train_data = train_data.sort_values(by='time_stamp')
    train_label = preprocessing.LabelEncoder().fit_transform(train_data['shop_id'])

    # Create the output directory up front so a missing directory cannot
    # discard the result of a finished search.
    os.makedirs('./console_output', exist_ok=True)

    for mall_id in ['m_7374']:  # train_data['mall_id'].unique():
        X_train, y_train = DataVector.data_to_vec(mall_id,
                                                  [LocationToVec2(), WifiToVec(), WifiStrongToVec(),
                                                   WifiKStrongToVec(), PriceToVec()],
                                                  train_data, train_label)
        clf.fit(X_train, y_train)

        print('{} score: {}'.format(mall_id, clf.best_score_))
        for name, val in clf.best_params_.items():
            print("{} {}".format(name, val))
        print('----------')
        with open('./console_output/grid_search_res.txt', 'a') as f:
            f.write('{} score: {}\n'.format(mall_id, clf.best_score_))
            for name, val in clf.best_params_.items():
                f.write("{} {}\n".format(name, val))
            f.write('------\n\n\n')
class ModelStacking(ModelBase):
    """Two-layer stacking model, trained independently for every mall.

    First layer: every classifier returned by ``_get_classifiers`` produces out-of-fold
    class probabilities for the training rows and fold-averaged probabilities for the
    test rows. Second layer: a random forest fitted on the horizontally stacked
    probabilities.
    """

    def __init__(self, renew=False, save_model=True, save_model_base_path='./model_save/stacking'):
        """
        :param renew: if True the first-layer models are fitted again; otherwise they are
                      loaded from the save path with joblib.
        :param save_model: forwarded to ModelBase.
        :param save_model_base_path: forwarded to ModelBase.
        """
        super().__init__(save_model=save_model, save_model_base_path=save_model_base_path)
        self.renew = renew
        self.new_feature_save_base_path = './feature_save/stacking/'

    def _get_classifiers(self):
        """
        Build fresh, unfitted first-layer classifiers.

        :return: dict. {name:classifier}
        """
        return {
            'random forest': RandomForestClassifier(n_jobs=self.n_jobs,
                                                    n_estimators=400,
                                                    bootstrap=False,
                                                    min_samples_split=4,
                                                    min_samples_leaf=1,
                                                    random_state=self._random_state,
                                                    class_weight='balanced'),
            'binary random forest': OneVsRestClassifier(RandomForestClassifier(n_estimators=400,
                                                                               bootstrap=False,
                                                                               random_state=self._random_state,
                                                                               class_weight='balanced'),
                                                        n_jobs=self.n_jobs),
            'binary xgb': OneVsRestClassifier(XGBClassifier(colsample_bytree=0.7,
                                                            learning_rate=0.025,
                                                            max_depth=6,
                                                            min_child_weight=1,
                                                            missing=-999,
                                                            n_estimators=500,
                                                            objective='binary:logistic',
                                                            random_state=1024,
                                                            _silent=1,
                                                            subsample=0.6),
                                              n_jobs=self.n_jobs),
        }

    def train_test(self, vec_func, target_column='shop_id', fold=5):
        """
        Run the two-layer stacking for every mall, print per-mall and mean accuracy of the
        second layer and write the test-set predictions with ``result_to_csv``.

        :param vec_func: list of vector function
        :param target_column: the target column you want to predict.
        :param fold: the fold of cross-validation.
        :return: None
        """
        # ------input data -----------
        _train_data = read_train_join_mall()
        # Rows are ordered by time and KFold below does not shuffle, so every fold is a
        # contiguous slice of the time-ordered rows.
        _train_data = _train_data.sort_values(by='time_stamp')
        _test_data = read_test_data()

        mall_ids = _train_data['mall_id'].unique()
        ans = {}
        total_score = 0
        for mall_id in mall_ids:
            train_data = _train_data.loc[_train_data['mall_id'] == mall_id]
            test_data = _test_data.loc[_test_data['mall_id'] == mall_id]

            # Labels are encoded to 0..n-1 so that clf.classes_ can be used directly as
            # column indices of the out-of-fold probability matrices.
            label_encoder = preprocessing.LabelEncoder()
            train_label = label_encoder.fit_transform(train_data[target_column].values)
            n_classes = len(label_encoder.classes_)

            # random_state is deliberately not passed: without shuffle=True it has no
            # effect, and recent scikit-learn versions reject the combination.
            kf = KFold(n_splits=fold)

            # ------- first layer: out-of-fold probabilities of every base classifier.
            new_train_feature, new_test_feature = [], []
            for clf_name in self._get_classifiers():
                oof_train = np.zeros((train_data.shape[0], n_classes))
                oof_test = np.zeros((test_data.shape[0], n_classes))

                for i, (train_index, test_index) in enumerate(kf.split(train_data)):
                    self._trained_and_predict(vec_func, train_data, train_label, test_data, train_index, test_index,
                                              oof_train, oof_test, i, clf_name, mall_id)

                new_train_feature.append(oof_train)
                # every fold added its test probabilities, so average them.
                new_test_feature.append(oof_test / fold)

            new_train_feature = np.hstack(new_train_feature)
            new_test_feature = np.hstack(new_test_feature)

            # ------- second layer.
            clf = self._get_classifiers()['random forest']

            # -------- on test data
            clf.fit(new_train_feature, train_label)
            # The classifier predicts encoded labels; map them back to the original
            # values of `target_column` before they are written to the result.
            predicted = label_encoder.inverse_transform(clf.predict(new_test_feature))
            ans.update(zip(test_data['row_id'], predicted))

            # ---------to report accuracy score
            l2_train, l2_test, l2_train_label, l2_test_label = train_test_split(new_train_feature, train_label,
                                                                                self._test_ratio)
            # NOTE(review): presumably this re-fits `clf` on the hold-out training part; confirm in ModelBase.
            predicted = self.trained_and_predict_location(clf, l2_train, l2_train_label, l2_test, l2_test_label)
            score = accuracy_score(l2_test_label, predicted)
            total_score += score
            print('---second layer: {} {}'.format(mall_id, score))

        # Divide by the number of malls (the original divided by the array of mall ids).
        print('mean score{}'.format(total_score / len(mall_ids)))
        self.result_to_csv(ans)

    def _trained_and_predict(self, vec_func, _train_data, _train_label, R_X_test,
                             train_index, test_index, oof_train, oof_test, cur_fold, clf_name, mall_id):
        """
        Fit (or load) one first-layer classifier for one fold of one mall and store its
        probabilities in place.

        :param vec_func: list of vector function
        :param _train_data: training rows of the mall (DataFrame)
        :param _train_label: encoded labels, positionally aligned with `_train_data`
        :param R_X_test: real test rows of the mall (DataFrame)
        :param train_index: positional indices used for fitting in this fold
        :param test_index: positional indices held out in this fold
        :param oof_train: (n_train, n_classes) array; held-out rows are overwritten
        :param oof_test: (n_test, n_classes) array; probabilities are accumulated
        :param cur_fold: fold number, part of the model file name
        :param clf_name: key of the classifier in `_get_classifiers`
        :param mall_id: id of the current mall
        :return: None
        """
        fold_X_train, fold_y_train = _train_data.iloc[train_index], _train_label[train_index]
        fold_X_test, fold_y_test = _train_data.iloc[test_index], _train_label[test_index]

        assert len(fold_X_train['mall_id'].unique()) == 1

        X_train, y_train, X_test, y_test = DataVector.train_and_test_to_vec(mall_id, vec_func, fold_X_train,
                                                                            fold_y_train, fold_X_test,
                                                                            fold_y_test)

        cur_save_path = '{}/{}/{}_{}'.format(self.SAVE_MODEL_BASE_PATH, clf_name, mall_id, cur_fold)
        if self.renew:
            clf = self._get_classifiers()[clf_name]
            clf.fit(X_train, y_train)
            if self.SAVE_MODEL:
                safe_dump_model(clf, cur_save_path)
        else:
            clf = joblib.load(cur_save_path)

        # A fold may miss some classes, therefore only the columns listed in
        # clf.classes_ are written; the remaining columns keep their zeros.
        res = clf.predict_proba(X_test)
        res[np.isnan(res)] = 0
        oof_train[np.ix_(test_index, clf.classes_)] = res

        predicted = clf.predict(X_test)
        score = accuracy_score(y_test, predicted)

        X_test, _ = DataVector.data_to_vec(mall_id, vec_func, R_X_test, None, is_train=False)

        res = clf.predict_proba(X_test)
        # Set NaN to zero. Presumably the normalisation done by OneVsRestClassifier
        # yields NaN for rows whose probabilities sum to zero.
        res[np.isnan(res)] = 0
        oof_test[:, clf.classes_] += res

        print('mall_id: {} cur_fold: {} classifier name: {} score: {}'.format(mall_id, cur_fold, clf_name, score))
{name:classifier} 27 | """ 28 | return { 29 | # 'light gbm': LGBMClassifier(n_jobs=self.n_jobs, 30 | # n_estimators=500, 31 | # learning_rate=0.05, 32 | # num_leaves=127, 33 | # max_depth=8, 34 | # ), 35 | 'random forest': RandomForestClassifier(n_jobs=self.n_jobs, 36 | n_estimators=400, 37 | bootstrap=False, 38 | min_samples_split=4, 39 | min_samples_leaf=1, 40 | random_state=self._random_state, 41 | class_weight='balanced'), 42 | # 'binary random forest': OneVsRestClassifier(RandomForestClassifier(n_estimators=400, 43 | # bootstrap=False, 44 | # random_state=self._random_state, 45 | # class_weight='balanced'), 46 | # n_jobs=self.n_jobs), 47 | # 'xgb': XGBClassifier(colsample_bytree=0.7, 48 | # learning_rate=0.025, 49 | # max_depth=6, 50 | # min_child_weight=1, 51 | # missing=-999, 52 | # n_jobs=self.n_jobs, 53 | # n_estimators=500, 54 | # objective='binary:logistic', 55 | # random_state=1024, 56 | # _silent=1, 57 | # subsample=0.6), 58 | # 59 | # 'binary xgb': OneVsRestClassifier(XGBClassifier(colsample_bytree=0.7, 60 | # learning_rate=0.025, 61 | # max_depth=6, 62 | # min_child_weight=1, 63 | # missing=-999, 64 | # # n_jobs=os.cpu_count() // 3 * 2, 65 | # n_estimators=500, 66 | # objective='binary:logistic', 67 | # random_state=1024, 68 | # _silent=1, 69 | # subsample=0.6 70 | # ) 71 | # , n_jobs=self.n_jobs) 72 | } 73 | 74 | 75 | def train_test(): 76 | task = ModelTest(save_model=False, use_multiprocess=False, save_result_proba=True) 77 | vecs = [LocationToVec2(), WifiToVec(), WifiStrongToVec(), WifiKStrongToVec(), PriceToVec(), 78 | CategoryToVec2(), 79 | # UserToVec() 80 | ] 81 | print(vecs) 82 | task.train_test(vecs) 83 | task.train_and_on_test_data(vecs) 84 | 85 | 86 | if __name__ == '__main__': 87 | train_test() 88 | -------------------------------------------------------------------------------- /hrwhisper/model_voting.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Date : 2017/11/10 
class ModelVoting(ModelBase):
    """Weighted soft voting over probability files written by other models."""

    def __init__(self, estimators, weights=None):
        """
        :param estimators: names of the result directories whose probability csv files are combined
        :param weights: optional weights, one per estimator, handed to ``np.average``
        """
        super().__init__(save_model=False, use_multiprocess=False)
        self.estimators = estimators
        self.weights = weights

    def _single_trained_by_mall_and_predict_location(self, vec_func, train_data, train_label, test_data,
                                                     test_label=None):
        """
        Average the saved probabilities of all estimators mall by mall and pick the
        column with the highest averaged probability for every row.

        ``vec_func`` and ``train_label`` are not used here; only the mall ids of
        ``train_data`` are read.

        :param test_label: labels of ``test_data``; when given, the 'train' probability
                           files are read and the accuracy is printed, otherwise the
                           'test' files are read.
        :return: dict. {row_id: predicted label}
        """
        has_label = test_label is not None
        split_name = 'train' if has_label else 'test'
        mall_ids = train_data['mall_id'].unique()

        result = {}
        score_sum = 0
        for order, mall_id in enumerate(mall_ids):
            in_mall = test_data['mall_id'] == mall_id

            # one probability table per estimator, indexed by row_id
            tables = []
            for estimator_name in self.estimators:
                csv_path = '{}/{}/{}_{}.csv'.format(self.RESULT_SAVE_BASE_PATH, estimator_name,
                                                    split_name, mall_id)
                tables.append(pd.read_csv(csv_path).set_index('row_id'))

            # the column names of the first table serve as the class labels
            labels = tables[0].columns.values
            blended = np.average([table.values for table in tables], weights=self.weights, axis=0)
            predicted = labels[np.argmax(blended, axis=1)]

            result.update(zip(test_data[in_mall]['row_id'], predicted))

            if not has_label:
                print(order, mall_id)
                continue

            score = accuracy_score(test_label[in_mall], predicted)
            score_sum += score
            print(order, mall_id, score)

        if has_label:
            print("Mean: {}".format(score_sum / mall_ids.shape[0]))
        return result
def read_mall_data(path='../data/mall.csv'):
    """
    Load the shop table.

    :param path: csv file to read; defaults to the location used by the project.
    :return: DataFrame
    """
    return pd.read_csv(path)


def read_train_data(path='../data/train_row_id.csv'):
    """
    Load the training records written by ``add_row_id_for_train_data``.

    :param path: csv file to read.
    :return: DataFrame; ``row_id`` is kept as str.
    """
    return pd.read_csv(path, dtype={'row_id': str})


def read_test_data(path='../data/test.csv'):
    """
    Load the test records.

    :param path: csv file to read.
    :return: DataFrame; ``row_id`` is kept as str.
    """
    return pd.read_csv(path, dtype={'row_id': str})


def read_train_join_mall(train_data=None, mall_data=None):
    """
    Attach the shop table to the training records through ``shop_id``.

    Columns of the shop table whose name also exists in the training records get the
    suffix ``_mall``.

    :param train_data: already loaded training records; read from disk when None.
    :param mall_data: already loaded shop table; read from disk when None.
    :return: DataFrame
    """
    if mall_data is None:
        mall_data = read_mall_data()
    if train_data is None:
        train_data = read_train_data()
    return train_data.join(mall_data.set_index('shop_id'), on='shop_id', rsuffix='_mall')


def add_row_id_for_train_data(src_path='../data/train.csv', dst_path='../data/train_row_id.csv'):
    """
    Give every training record the id ``_<position>`` and save the result.

    The leading underscore makes the id a string, in line with the ``dtype`` the
    readers above request for ``row_id``.

    :param src_path: csv file with the raw training records.
    :param dst_path: csv file to write; ``row_id`` becomes its first column.
    :return: None
    """
    train_data = pd.read_csv(src_path)
    row_ids = pd.Series(['_{}'.format(i) for i in range(train_data.shape[0])])
    train_data.assign(row_id=row_ids).set_index('row_id').to_csv(dst_path)


if __name__ == '__main__':
    add_row_id_for_train_data()
class CategoryPredicted(ModelBase):
    """Out-of-fold prediction of the category probabilities, mall by mall."""

    def __init__(self):
        super().__init__(save_model_base_path='./feature_save/category/')

    def _get_classifiers(self):
        """
        :return: a single unfitted RandomForestClassifier (not a dict).
        """
        forest_params = dict(n_jobs=self.n_jobs,
                             n_estimators=400,
                             bootstrap=False,
                             min_samples_split=4,
                             min_samples_leaf=1,
                             random_state=self._random_state,
                             class_weight='balanced')
        return RandomForestClassifier(**forest_params)

    def train_test(self, vec_func, target_column='category_id', fold=5):
        """
        For every mall compute out-of-fold class probabilities for the training rows and
        fold-averaged probabilities for the test rows, then save both as pkl and csv.

        :param vec_func: list of vector function
        :param target_column: the target column you want to predict.
        :param fold: the fold of cross-validation.
        :return: None
        """
        # ------input data -----------
        all_train = read_train_join_mall().sort_values(by='time_stamp')
        all_test = read_test_data()

        for mall_id in all_train['mall_id'].unique():
            mall_train = all_train.loc[all_train['mall_id'] == mall_id]
            mall_test = all_test.loc[all_test['mall_id'] == mall_id]

            encoder = preprocessing.LabelEncoder()
            encoded_label = encoder.fit_transform(mall_train[target_column].values)
            n_classes = len(encoder.classes_)

            splitter = KFold(n_splits=fold, random_state=self._random_state)

            oof_train = np.zeros((mall_train.shape[0], n_classes))
            oof_test = np.zeros((mall_test.shape[0], n_classes))

            for fold_no, (fit_rows, held_rows) in enumerate(splitter.split(mall_train)):
                self._trained_and_predict(vec_func, mall_train, encoded_label, mall_test, fit_rows, held_rows,
                                          oof_train, oof_test, fold_no, mall_id)
            # every fold added its test probabilities, so average them.
            oof_test /= fold

            save_prefix = '{}/{}'.format(self.SAVE_MODEL_BASE_PATH, mall_id)
            outputs = (('_train', mall_train, oof_train), ('_test', mall_test, oof_test))

            # raw arrays first ...
            for suffix, _, proba in outputs:
                safe_dump_model(proba, save_prefix + suffix + '.pkl')

            # ... then the same numbers as csv: indexed by row_id, one column per original label.
            for suffix, frame, proba in outputs:
                id_column = pd.DataFrame(frame['row_id'].values, columns=['row_id'])
                proba_table = pd.DataFrame(proba, columns=encoder.classes_)
                safe_save_csv_result(pd.concat([id_column, proba_table], axis=1).set_index('row_id'),
                                     save_prefix + suffix + '.csv')

    def _trained_and_predict(self, vec_func, _train_data, _train_label, R_X_test,
                             train_index, test_index, oof_train, oof_test, cur_fold, mall_id):
        """
        Fit a fresh classifier on one fold of one mall and store its probabilities in place.

        :param _train_data: training rows of the mall (DataFrame)
        :param _train_label: encoded labels, positionally aligned with `_train_data`
        :param R_X_test: real test rows of the mall (DataFrame)
        :param train_index: positional indices used for fitting
        :param test_index: positional indices held out in this fold
        :param oof_train: (n_train, n_classes) array; held-out rows are overwritten
        :param oof_test: (n_test, n_classes) array; probabilities are accumulated
        :param cur_fold: fold number, only printed
        :param mall_id: id of the current mall
        :return: None
        """
        fit_frame = _train_data.iloc[train_index]
        held_frame = _train_data.iloc[test_index]
        fit_label = _train_label[train_index]
        held_label = _train_label[test_index]

        assert len(fit_frame['mall_id'].unique()) == 1

        X_train, y_train, X_test, y_test = DataVector.train_and_test_to_vec(mall_id, vec_func, fit_frame,
                                                                            fit_label, held_frame,
                                                                            held_label)

        model = self._get_classifiers()
        model.fit(X_train, y_train)

        # Only the classes seen in this fold get a column; the others keep their zeros.
        held_proba = model.predict_proba(X_test)
        held_proba[np.isnan(held_proba)] = 0
        oof_train[np.ix_(test_index, model.classes_)] = held_proba

        score = accuracy_score(y_test, model.predict(X_test))

        real_X, _ = DataVector.data_to_vec(mall_id, vec_func, R_X_test, None, is_train=False)

        # NaN probabilities are replaced by zero before accumulating.
        real_proba = model.predict_proba(real_X)
        real_proba[np.isnan(real_proba)] = 0
        oof_test[:, model.classes_] += real_proba

        print('mall_id: {} cur_fold: {} score: {}'.format(mall_id, cur_fold, score))
class CategoryPredicted(ModelBase):
    """Out-of-fold prediction of the price (the class name is shared with the category script)."""

    def __init__(self):
        super().__init__()
        self.feature_save_path = './feature_save/predicted_price3.csv'

    def _get_classifiers(self):
        """
        :return: dict. {name:regressor}
        """
        regressor = RandomForestRegressor(n_estimators=100, n_jobs=self.n_jobs, random_state=42)
        return {'RandomForestRegressor ': regressor}

    def train_test(self, vec_func, target_column='price', fold=10):
        """
        Predict `target_column` out-of-fold for all training rows and, averaged over the
        folds, for all test rows; save the arrays as pkl and both together as one csv.

        :param vec_func: list of vector function
        :param target_column: the target column you want to predict.
        :param fold: the fold of cross-validation.
        :return: None
        """
        # ------input data -----------
        all_train = read_train_join_mall().sort_values(by='time_stamp')
        all_label = all_train[target_column].values
        all_test = read_test_data()

        splitter = KFold(n_splits=fold, random_state=self._random_state)
        oof_train = np.zeros((all_train.shape[0],))
        # one row per fold, averaged once all folds are done
        per_fold_test = np.zeros((fold, all_test.shape[0]))

        error_sum = 0
        for fold_no, (fit_rows, held_rows) in enumerate(splitter.split(all_train)):
            print(fold_no)
            error_sum += self._trained_and_predict(vec_func, all_train, all_label, all_test, oof_train,
                                                   per_fold_test,
                                                   fit_rows, held_rows, fold_no)

        oof_test = per_fold_test.mean(axis=0)

        joblib.dump(oof_train, self.feature_save_path + '_oof_train.pkl', compress=3)
        joblib.dump(oof_test, self.feature_save_path + '_oof_test.pkl', compress=3)

        print(error_sum / fold)

        with open(self.feature_save_path, 'w') as f:
            f.write('row_id,p_price\n')
            # training rows first, test rows afterwards
            for ids, prices in ((all_train['row_id'], oof_train), (all_test['row_id'], oof_test)):
                for row_id, price in zip(ids, prices):
                    f.write('{},{}\n'.format(row_id, price))
        print('done')

    def _trained_and_predict(self, vec_func, _train_data, _train_label, R_X_test,
                             oof_train, oof_test, _train_index, _test_index, cur_fold):
        """
        Handle one fold: for every mall present in the fitting rows, fit the regressor on
        the mall's fitting rows and predict its held-out rows and its real test rows.

        :param _train_data: all training rows (DataFrame)
        :param _train_label: target values, positionally aligned with `_train_data`
        :param R_X_test: all real test rows (DataFrame)
        :param oof_train: 1-d array; the held-out positions are overwritten
        :param oof_test: (fold, n_test) array; row `cur_fold` is filled
        :param _train_index: positional indices used for fitting in this fold
        :param _test_index: positional indices held out in this fold
        :param cur_fold: fold number
        :return: mean absolute error of the fold, averaged over the malls
        """
        malls = sorted(_train_data.iloc[_train_index]['mall_id'].unique())
        fit_rows = set(_train_index)
        held_rows = set(_test_index)
        # the same estimator object is fitted again for every mall
        regressor = next(iter(self._get_classifiers().values()))

        error_sum = 0
        for order, mall_id in enumerate(malls):
            # First collect every positional index of the current mall, then intersect
            # it with the indices of the fold.
            mall_rows = set(np.where(_train_data['mall_id'] == mall_id)[0])
            train_index = np.array(list(fit_rows & mall_rows))
            test_index = np.array(list(held_rows & mall_rows))

            fit_frame, fit_label = _train_data.iloc[train_index], _train_label[train_index]
            held_frame, held_label = _train_data.iloc[test_index], _train_label[test_index]

            assert len(fit_frame['mall_id'].unique()) == 1

            X_train, y_train, X_test, y_test = DataVector.train_and_test_to_vec(mall_id, vec_func, fit_frame,
                                                                                fit_label, held_frame, held_label)
            print('fit...')
            regressor.fit(X_train, y_train)

            print('predict.....')
            predicted = regressor.predict(X_test)
            oof_train[test_index] = predicted

            # mean absolute error on the held-out rows of this mall
            error = np.average(np.abs(predicted - y_test))
            error_sum += error
            print(order, mall_id, error)

            real_X, _ = DataVector.data_to_vec(mall_id, vec_func, R_X_test, None, is_train=False)
            real_rows = np.where(R_X_test['mall_id'] == mall_id)[0]
            oof_test[cur_fold, real_rows] += regressor.predict(real_X)

        return error_sum / len(malls)
_train_data = read_train_join_mall() 138 | _train_data = _train_data.sort_values(by='time_stamp') 139 | _test_data = read_test_data() 140 | 141 | oof_train = joblib.load('./feature_save/predicted_price.csv_oof_train.pkl') 142 | oof_test = joblib.load('./feature_save/predicted_price.csv_oof_test.pkl') 143 | print(oof_train.shape, _train_data.shape) 144 | print(oof_test.shape, _test_data.shape) 145 | 146 | with open('./feature_save/predicted_price.csv', 'w') as f: 147 | f.write('row_id,p_price\n') 148 | for row_id, p in zip(_train_data['row_id'], oof_train): 149 | f.write('{},{}\n'.format(row_id, p)) 150 | for row_id, p in zip(_test_data['row_id'], oof_test): 151 | f.write('{},{}\n'.format(row_id, p)) 152 | 153 | 154 | if __name__ == '__main__': 155 | train_test() 156 | # recovery_price_from_pkl() 157 | -------------------------------------------------------------------------------- /hrwhisper/season2/README.md: -------------------------------------------------------------------------------- 1 | # 天池 商场中精确定位用户所在店铺 -第二赛季 2 | - 第二赛季没怎么打,第一次用数加,感觉平台体验太差,经常排队都没啥心思打。 3 | - 就各个mall取了wifi top 1000,加上经纬度,直接RF多分类。 4 | - B榜 0.7863 5 | 6 | 7 | -------------------------------------------------------------------------------- /hrwhisper/season2/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Date : 2017/12/1 3 | # @Author : hrwhisper -------------------------------------------------------------------------------- /hrwhisper/season2/create_table/1 create_train_table.sql: -------------------------------------------------------------------------------- 1 | -- DROP TABLE hr_train; 2 | CREATE TABLE hr_train 3 | AS 4 | SELECT ROW_NUMBER() OVER (ORDER BY time_stamp) AS row_id, user_id, User.longitude, User.latitude, User.time_stamp 5 | , Mall.longitude AS longitude_mall, Mall.latitude AS latitude_mall, Mall.shop_id, Mall.mall_id 6 | , split_part(split_part(wifi_infos, ';', 1), '|', 1) AS wifi1 7 | , 
split_part(split_part(wifi_infos, ';', 1), '|', 2) AS wifi1_strong 8 | , split_part(split_part(wifi_infos, ';', 2), '|', 1) AS wifi2 9 | , split_part(split_part(wifi_infos, ';', 2), '|', 2) AS wifi2_strong 10 | , split_part(split_part(wifi_infos, ';', 3), '|', 1) AS wifi3 11 | , split_part(split_part(wifi_infos, ';', 3), '|', 2) AS wifi3_strong 12 | , split_part(split_part(wifi_infos, ';', 4), '|', 1) AS wifi4 13 | , split_part(split_part(wifi_infos, ';', 4), '|', 2) AS wifi4_strong 14 | , split_part(split_part(wifi_infos, ';', 5), '|', 1) AS wifi5 15 | , split_part(split_part(wifi_infos, ';', 5), '|', 2) AS wifi5_strong 16 | , split_part(split_part(wifi_infos, ';', 6), '|', 1) AS wifi6 17 | , split_part(split_part(wifi_infos, ';', 6), '|', 2) AS wifi6_strong 18 | , split_part(split_part(wifi_infos, ';', 7), '|', 1) AS wifi7 19 | , split_part(split_part(wifi_infos, ';', 7), '|', 2) AS wifi7_strong 20 | , split_part(split_part(wifi_infos, ';', 8), '|', 1) AS wifi8 21 | , split_part(split_part(wifi_infos, ';', 8), '|', 2) AS wifi8_strong 22 | , split_part(split_part(wifi_infos, ';', 9), '|', 1) AS wifi9 23 | , split_part(split_part(wifi_infos, ';', 9), '|', 2) AS wifi9_strong 24 | , split_part(split_part(wifi_infos, ';', 10), '|', 1) AS wifi10 25 | , split_part(split_part(wifi_infos, ';', 10), '|', 2) AS wifi10_strong 26 | , split_part(split_part(wifi_infos, ';', 11), '|', 1) AS wifi11 27 | , split_part(split_part(wifi_infos, ';', 11), '|', 2) AS wifi11_strong 28 | , split_part(split_part(wifi_infos, ';', 12), '|', 1) AS wifi12 29 | , split_part(split_part(wifi_infos, ';', 12), '|', 2) AS wifi12_strong 30 | , split_part(split_part(wifi_infos, ';', 13), '|', 1) AS wifi13 31 | , split_part(split_part(wifi_infos, ';', 13), '|', 2) AS wifi13_strong 32 | , split_part(split_part(wifi_infos, ';', 14), '|', 1) AS wifi14 33 | , split_part(split_part(wifi_infos, ';', 14), '|', 2) AS wifi14_strong 34 | , split_part(split_part(wifi_infos, ';', 15), '|', 1) AS wifi15 35 | , 
split_part(split_part(wifi_infos, ';', 15), '|', 2) AS wifi15_strong 36 | , split_part(split_part(wifi_infos, ';', 16), '|', 1) AS wifi16 37 | , split_part(split_part(wifi_infos, ';', 16), '|', 2) AS wifi16_strong 38 | , split_part(split_part(wifi_infos, ';', 17), '|', 1) AS wifi17 39 | , split_part(split_part(wifi_infos, ';', 17), '|', 2) AS wifi17_strong 40 | , split_part(split_part(wifi_infos, ';', 18), '|', 1) AS wifi18 41 | , split_part(split_part(wifi_infos, ';', 18), '|', 2) AS wifi18_strong 42 | , split_part(split_part(wifi_infos, ';', 19), '|', 1) AS wifi19 43 | , split_part(split_part(wifi_infos, ';', 19), '|', 2) AS wifi19_strong 44 | , split_part(split_part(wifi_infos, ';', 20), '|', 1) AS wifi20 45 | , split_part(split_part(wifi_infos, ';', 20), '|', 2) AS wifi20_strong 46 | FROM odps_tc_257100_f673506e024.ant_tianchi_ccf_sl_user_shop_behavior User 47 | INNER JOIN odps_tc_257100_f673506e024.ant_tianchi_ccf_sl_shop_info Mall 48 | ON user.shop_id = mall.shop_id; 49 | 50 | 51 | drop table if exists hr_train_wifi; 52 | 53 | CREATE TABLE hr_train_wifi ( 54 | row_id STRING, 55 | mall_id STRING, 56 | wifi_bssid STRING, 57 | strong STRING 58 | ); 59 | 60 | insert into hr_train_wifi(row_id,mall_id,wifi_bssid,strong) 61 | select row_id, mall_id, wifi1, wifi1_strong 62 | FROM hr_train WHERE wifi1 != ''; 63 | 64 | 65 | insert into hr_train_wifi(row_id,mall_id,wifi_bssid,strong) 66 | select row_id, mall_id, wifi2, wifi2_strong 67 | FROM hr_train WHERE wifi2 != ''; 68 | 69 | 70 | insert into hr_train_wifi(row_id,mall_id,wifi_bssid,strong) 71 | select row_id, mall_id, wifi3, wifi3_strong 72 | FROM hr_train WHERE wifi3 != ''; 73 | 74 | 75 | insert into hr_train_wifi(row_id,mall_id,wifi_bssid,strong) 76 | select row_id, mall_id, wifi4, wifi4_strong 77 | FROM hr_train WHERE wifi4 != ''; 78 | 79 | 80 | insert into hr_train_wifi(row_id,mall_id,wifi_bssid,strong) 81 | select row_id, mall_id, wifi5, wifi5_strong 82 | FROM hr_train WHERE wifi5 != ''; 83 | 84 | 85 | insert 
into hr_train_wifi(row_id,mall_id,wifi_bssid,strong) 86 | select row_id, mall_id, wifi6, wifi6_strong 87 | FROM hr_train WHERE wifi6 != ''; 88 | 89 | 90 | insert into hr_train_wifi(row_id,mall_id,wifi_bssid,strong) 91 | select row_id, mall_id, wifi7, wifi7_strong 92 | FROM hr_train WHERE wifi7 != ''; 93 | 94 | 95 | insert into hr_train_wifi(row_id,mall_id,wifi_bssid,strong) 96 | select row_id, mall_id, wifi8, wifi8_strong 97 | FROM hr_train WHERE wifi8 != ''; 98 | 99 | 100 | insert into hr_train_wifi(row_id,mall_id,wifi_bssid,strong) 101 | select row_id, mall_id, wifi9, wifi9_strong 102 | FROM hr_train WHERE wifi9 != ''; 103 | 104 | 105 | insert into hr_train_wifi(row_id,mall_id,wifi_bssid,strong) 106 | select row_id, mall_id, wifi10, wifi10_strong 107 | FROM hr_train WHERE wifi10 != ''; 108 | 109 | 110 | insert into hr_train_wifi(row_id,mall_id,wifi_bssid,strong) 111 | select row_id, mall_id, wifi11, wifi11_strong 112 | FROM hr_train WHERE wifi11 != ''; 113 | 114 | 115 | insert into hr_train_wifi(row_id,mall_id,wifi_bssid,strong) 116 | select row_id, mall_id, wifi12, wifi12_strong 117 | FROM hr_train WHERE wifi12 != ''; 118 | 119 | 120 | insert into hr_train_wifi(row_id,mall_id,wifi_bssid,strong) 121 | select row_id, mall_id, wifi13, wifi13_strong 122 | FROM hr_train WHERE wifi13 != ''; 123 | 124 | 125 | insert into hr_train_wifi(row_id,mall_id,wifi_bssid,strong) 126 | select row_id, mall_id, wifi14, wifi14_strong 127 | FROM hr_train WHERE wifi14 != ''; 128 | 129 | 130 | insert into hr_train_wifi(row_id,mall_id,wifi_bssid,strong) 131 | select row_id, mall_id, wifi15, wifi15_strong 132 | FROM hr_train WHERE wifi15 != ''; 133 | 134 | 135 | insert into hr_train_wifi(row_id,mall_id,wifi_bssid,strong) 136 | select row_id, mall_id, wifi16, wifi16_strong 137 | FROM hr_train WHERE wifi16 != ''; 138 | 139 | 140 | insert into hr_train_wifi(row_id,mall_id,wifi_bssid,strong) 141 | select row_id, mall_id, wifi17, wifi17_strong 142 | FROM hr_train WHERE wifi17 != ''; 143 | 144 
-- Build hr_test_wifi: one (row_id, mall_id, wifi_bssid, strong) row for every
-- non-empty wifiN column of hr_test (N = 1..20).
-- The table is dropped and recreated first so the script can be re-run.
-- NOTE: the script used to contain this whole section twice; the second copy
-- dropped and rebuilt the table the first copy had just filled, so a single
-- copy produces the same final table with half the scans of hr_test.
drop table if exists hr_test_wifi;

CREATE TABLE hr_test_wifi (
    row_id STRING,
    mall_id STRING,
    wifi_bssid STRING,
    strong STRING
);

insert into hr_test_wifi(row_id,mall_id,wifi_bssid,strong)
select row_id, mall_id, wifi1, wifi1_strong
FROM hr_test WHERE wifi1 != '';

insert into hr_test_wifi(row_id,mall_id,wifi_bssid,strong)
select row_id, mall_id, wifi2, wifi2_strong
FROM hr_test WHERE wifi2 != '';

insert into hr_test_wifi(row_id,mall_id,wifi_bssid,strong)
select row_id, mall_id, wifi3, wifi3_strong
FROM hr_test WHERE wifi3 != '';

insert into hr_test_wifi(row_id,mall_id,wifi_bssid,strong)
select row_id, mall_id, wifi4, wifi4_strong
FROM hr_test WHERE wifi4 != '';

insert into hr_test_wifi(row_id,mall_id,wifi_bssid,strong)
select row_id, mall_id, wifi5, wifi5_strong
FROM hr_test WHERE wifi5 != '';

insert into hr_test_wifi(row_id,mall_id,wifi_bssid,strong)
select row_id, mall_id, wifi6, wifi6_strong
FROM hr_test WHERE wifi6 != '';

insert into hr_test_wifi(row_id,mall_id,wifi_bssid,strong)
select row_id, mall_id, wifi7, wifi7_strong
FROM hr_test WHERE wifi7 != '';

insert into hr_test_wifi(row_id,mall_id,wifi_bssid,strong)
select row_id, mall_id, wifi8, wifi8_strong
FROM hr_test WHERE wifi8 != '';

insert into hr_test_wifi(row_id,mall_id,wifi_bssid,strong)
select row_id, mall_id, wifi9, wifi9_strong
FROM hr_test WHERE wifi9 != '';

insert into hr_test_wifi(row_id,mall_id,wifi_bssid,strong)
select row_id, mall_id, wifi10, wifi10_strong
FROM hr_test WHERE wifi10 != '';

insert into hr_test_wifi(row_id,mall_id,wifi_bssid,strong)
select row_id, mall_id, wifi11, wifi11_strong
FROM hr_test WHERE wifi11 != '';

insert into hr_test_wifi(row_id,mall_id,wifi_bssid,strong)
select row_id, mall_id, wifi12, wifi12_strong
FROM hr_test WHERE wifi12 != '';

insert into hr_test_wifi(row_id,mall_id,wifi_bssid,strong)
select row_id, mall_id, wifi13, wifi13_strong
FROM hr_test WHERE wifi13 != '';

insert into hr_test_wifi(row_id,mall_id,wifi_bssid,strong)
select row_id, mall_id, wifi14, wifi14_strong
FROM hr_test WHERE wifi14 != '';

insert into hr_test_wifi(row_id,mall_id,wifi_bssid,strong)
select row_id, mall_id, wifi15, wifi15_strong
FROM hr_test WHERE wifi15 != '';

insert into hr_test_wifi(row_id,mall_id,wifi_bssid,strong)
select row_id, mall_id, wifi16, wifi16_strong
FROM hr_test WHERE wifi16 != '';

insert into hr_test_wifi(row_id,mall_id,wifi_bssid,strong)
select row_id, mall_id, wifi17, wifi17_strong
FROM hr_test WHERE wifi17 != '';

insert into hr_test_wifi(row_id,mall_id,wifi_bssid,strong)
select row_id, mall_id, wifi18, wifi18_strong
FROM hr_test WHERE wifi18 != '';

insert into hr_test_wifi(row_id,mall_id,wifi_bssid,strong)
select row_id, mall_id, wifi19, wifi19_strong
FROM hr_test WHERE wifi19 != '';

insert into hr_test_wifi(row_id,mall_id,wifi_bssid,strong)
select row_id, mall_id, wifi20, wifi20_strong
FROM hr_test WHERE wifi20 != '';
TABLE hr_test_wifi_temp 4 | AS 5 | SELECT row_id, mall_id, wifi_bssid 6 | , CASE 7 | WHEN strong = 'null' THEN 20 8 | ELSE strong + 120 9 | END as strong 10 | FROM hr_test_wifi; -------------------------------------------------------------------------------- /hrwhisper/season2/create_table/4 test_fix_mssing.sql: -------------------------------------------------------------------------------- 1 | --- 和hr_train_wifi_count_top join就是取wifi的top 1000,hr_test_wifi3_temp 2 | DROP TABLE if exists hr_test_wifi3_temp; 3 | CREATE TABLE hr_test_wifi3_temp 4 | AS 5 | SELECT a.row_id, a.wifi_bssid, a.strong 6 | FROM hr_test_wifi_temp a 7 | JOIN hr_train_wifi_count_top b 8 | ON a.mall_id = b.mall_id 9 | AND a.wifi_bssid = b.wifi_bssid; 10 | 11 | -- 创建所有的每个row_id对应的wifi 12 | DROP TABLE IF EXISTS hr_test_wifi_row_join; 13 | CREATE TABLE hr_test_wifi_row_join 14 | AS 15 | SELECT a.row_id, b.wifi_bssid, -999 AS strong 16 | FROM hr_test a 17 | JOIN hr_train_wifi_count_top b 18 | ON a.mall_id = b.mall_id; 19 | 20 | --- 加入row_id 方便等下判断哪一行不存在 21 | DROP TABLE IF EXISTS hr_test_wifi_row_join_with_append_id; 22 | PAI -name AppendId 23 | -project algo_public 24 | -DIDColName="append_id" 25 | -DoutputTableName="hr_test_wifi_row_join_with_append_id" 26 | -DinputTableName="hr_test_wifi_row_join" 27 | -DselectedColNames="row_id,wifi_bssid,strong"; 28 | 29 | ---筛选出存在的append_id 30 | DROP TABLE IF EXISTS hr_test_missing_wifi_temp; 31 | CREATE table hr_test_missing_wifi_temp 32 | AS 33 | SELECT a.append_id 34 | FROM hr_test_wifi_row_join_with_append_id a 35 | JOIN hr_test_wifi3_temp b 36 | ON a.row_id = b.row_id 37 | AND a.wifi_bssid = b.wifi_bssid; 38 | 39 | -- join 原来的列表 append_id2为空说明不存在 40 | DROP TABLE IF EXISTS hr_test_missing_wifi_temp2; 41 | CREATE TABLE hr_test_missing_wifi_temp2 42 | AS 43 | SELECT a.row_id, a.wifi_bssid, a.strong,a.append_id,b.append_id as append_id2 44 | FROM hr_test_wifi_row_join_with_append_id a 45 | LEFT JOIN hr_test_missing_wifi_temp b 46 | ON a.append_id = 
-- Fill in missing wifi readings for the training set and build hr_train3.
-- Every statement drops its output table first so the script can be re-run.

-- Keep only readings whose (mall_id, wifi_bssid) appears in
-- hr_train_wifi_count_top (the original note describes this as taking the
-- top 1000 wifi); the result is hr_train_wifi3_temp.
DROP TABLE IF EXISTS hr_train_wifi3_temp;
CREATE TABLE hr_train_wifi3_temp
AS
SELECT a.row_id, a.wifi_bssid, a.strong
FROM hr_train_wifi_temp a
JOIN hr_train_wifi_count_top b
ON a.mall_id = b.mall_id
    AND a.wifi_bssid = b.wifi_bssid;

-- For every training row, list every top wifi of its mall with the
-- placeholder strength -999.
DROP TABLE IF EXISTS hr_train_wifi_row_join;
CREATE TABLE hr_train_wifi_row_join
AS
SELECT a.row_id, b.wifi_bssid, -999 AS strong
FROM hr_train a
JOIN hr_train_wifi_count_top b
ON a.mall_id = b.mall_id;

-- Attach an append_id to each (row_id, wifi_bssid) pair so that pairs without
-- a real reading can be identified below.
-- The output table is dropped first, as the test-set script does; without
-- this the step was the only one in the script with no preceding DROP.
DROP TABLE IF EXISTS hr_train_wifi_row_join_with_append_id;
PAI -name AppendId
    -project algo_public
    -DIDColName="append_id"
    -DoutputTableName="hr_train_wifi_row_join_with_append_id"
    -DinputTableName="hr_train_wifi_row_join"
    -DselectedColNames="row_id,wifi_bssid,strong";

-- append_ids of the pairs that do have a real reading.
DROP TABLE IF EXISTS hr_train_missing_wifi_temp;
CREATE table hr_train_missing_wifi_temp
AS
SELECT a.append_id
FROM hr_train_wifi_row_join_with_append_id a
JOIN hr_train_wifi3_temp b
ON a.row_id = b.row_id
    AND a.wifi_bssid = b.wifi_bssid;

-- Left-join back onto the full list: append_id2 IS NULL means the pair has
-- no real reading.
DROP TABLE IF EXISTS hr_train_missing_wifi_temp2;
CREATE TABLE hr_train_missing_wifi_temp2
AS
SELECT a.row_id, a.wifi_bssid, a.strong,a.append_id,b.append_id as append_id2
FROM hr_train_wifi_row_join_with_append_id a
LEFT JOIN hr_train_missing_wifi_temp b
ON a.append_id = b.append_id;

-- Keep the pairs whose append_id2 is NULL: these are the missing values.
DROP TABLE IF EXISTS hr_train_missing_wifi;
CREATE table hr_train_missing_wifi
AS
SELECT row_id,wifi_bssid,strong
FROM hr_train_missing_wifi_temp2
where append_id2 is null;

-- Combine the missing-value placeholders with the real readings.
DROP TABLE IF EXISTS hr_train_wifi_feature;
CREATE TABLE hr_train_wifi_feature
AS
SELECT CAST(row_id AS STRING) as row_id, wifi_bssid, CAST(strong AS STRING) as strong
FROM hr_train_missing_wifi
UNION
SELECT row_id, wifi_bssid, CAST(strong AS STRING) as strong
FROM hr_train_wifi3_temp;

-- Concatenate "bssid:strong" items per row into one ';'-separated string,
-- the key/value form later consumed by KVToTable.
DROP TABLE IF EXISTS hr_train_wifi2;
CREATE TABLE hr_train_wifi2
AS
SELECT row_id
    , concat_ws(';', collect_set(concat(wifi_bssid, ':', strong))) AS wifi
FROM hr_train_wifi_feature
GROUP BY row_id;

-- Build the training table.
DROP TABLE IF EXISTS hr_train3;
CREATE TABLE hr_train3
AS
SELECT a.row_id, a.user_id, a.mall_id, a.shop_id, a.time_stamp
    , a.longitude, a.latitude, b.wifi
FROM hr_train a
JOIN hr_train_wifi2 b
ON a.row_id = b.row_id;
def main(mall_id_path='./mall_id', out_dir='./sql', split_num=5):
    """Write SQL scripts that merge the per-mall result tables into one table.

    The mall ids are split into ``split_num`` consecutive chunks and one file
    ``generate_result_<i>.sql`` is written per chunk.  The first file also
    drops and recreates the target table ``hr_result_rf400_sqrt``.

    :param mall_id_path: text file with one mall id per line.
    :param out_dir: directory the SQL files are written to (created if absent).
    :param split_num: number of SQL files to generate.
    """
    import os

    with open(mall_id_path) as f:
        # Skip blank lines: an empty id would produce an INSERT that reads
        # from a table literally named "_result_400_sqrt".
        mall_ids = [line.strip() for line in f if line.strip()]

    os.makedirs(out_dir, exist_ok=True)

    each_num = ceil(len(mall_ids) / split_num)
    for i in range(split_num):
        start, stop = i * each_num, (i + 1) * each_num
        print(start, stop)
        if i == 0:
            # Only the first script (re)creates the target table.
            res = ["DROP TABLE IF EXISTS hr_result_rf400_sqrt;",
                   "create table hr_result_rf400_sqrt(row_id string, shop_id string);"]
        else:
            res = []
        res.extend(
            "INSERT INTO hr_result_rf400_sqrt select row_id, shop_id from {}_result_400_sqrt;".format(mall_id)
            for mall_id in mall_ids[start:stop]
        )

        out_path = os.path.join(out_dir, 'generate_result_{}.sql'.format(i))
        with open(out_path, 'w') as f:
            # One write of the joined text; writelines() on a str would emit
            # it character by character.
            f.write('\n'.join(res))
\n""".format(**locals()) 25 | 26 | def _wifi_table_to_vec(self, is_train): 27 | input_table = 'temp_{}_hr_{}'.format(self.mall_id, self.train_bool_to_str[is_train]) 28 | mall_id = self.mall_id 29 | out_table = 'temp_{}_wifi_table_{}_vec'.format(mall_id, self.train_bool_to_str[is_train]) 30 | out_map = 'temp_{}_wifi_table_{}_map_vec'.format(mall_id, self.train_bool_to_str[is_train]) 31 | append_name = 'row_id,longitude,latitude' + (',shop_id' if is_train else '') 32 | 33 | res = """ 34 | DROP TABLE IF EXISTS {out_table}; 35 | DROP TABLE IF EXISTS {out_map}; 36 | PAI -name KVToTable 37 | -project algo_public 38 | -Dlifecycle="28" 39 | -DappendColName="{append_name}" 40 | -DoutputTableName="{out_table}" 41 | -DkvColName="wifi" 42 | -DoutputKeyMapTableName="{out_map}" 43 | -DkvDelimiter=":" 44 | -Dtop1200="true" 45 | -DitemDelimiter=";" 46 | -DinputTableName="{input_table}" """.format(**locals()) 47 | if is_train: 48 | res += ";" 49 | else: 50 | res += '\n -DinputKeyMapTableName="temp_{}_wifi_table_train_map_vec";'.format(mall_id) 51 | return res + '\n' 52 | 53 | def train(self): 54 | mall_id = self.mall_id 55 | table_name = 'temp_{}_wifi_table_train_vec'.format(mall_id) 56 | return """ 57 | -- DROP OFFLINEMODEL IF EXISTS hr_random_forests_{mall_id}_400_sqrt; 58 | PAI -name randomforests 59 | -project algo_public 60 | -DlabelColName="shop_id" 61 | -DminNumPer="0" 62 | -DtreeNum="400" 63 | -DrandomColNum="30" 64 | -DmodelName="hr_random_forests_{mall_id}_400_sqrt" 65 | -DexcludedColNames="row_id" 66 | -DminNumObj="2" 67 | -DmaxRecordSize="100000" 68 | -DinputTableName="{table_name}"; 69 | """.format(**locals()) 70 | 71 | def predict(self): 72 | mall_id = self.mall_id 73 | table_name = 'temp_{}_wifi_table_test_vec'.format(mall_id) 74 | return """ 75 | -- DROP TABLE IF EXISTS {mall_id}_result_400_sqrt; 76 | PAI -name prediction 77 | -project algo_public 78 | -Dlifecycle="28" 79 | -DmodelName="hr_random_forests_{mall_id}_400_sqrt" 80 | -DscoreColName="prediction_score" 81 
def main(mall_id_path='./mall_id', out_dir='./sql', split_num=5):
    """Write the per-mall train-and-predict SQL scripts.

    The mall ids are split into ``split_num`` consecutive chunks; for each
    chunk the statements returned by ``SqlGenerator(mall_id).run()`` are
    concatenated into ``train_and_predict_<i>.sql``.

    :param mall_id_path: text file with one mall id per line.
    :param out_dir: directory the SQL files are written to (created if absent).
    :param split_num: number of SQL files to generate.
    """
    import os

    with open(mall_id_path) as f:
        # Skip blank lines so no statements are generated for an empty mall id.
        mall_ids = [line.strip() for line in f if line.strip()]

    os.makedirs(out_dir, exist_ok=True)

    each_num = ceil(len(mall_ids) / split_num)
    for i in range(split_num):
        start, stop = i * each_num, (i + 1) * each_num
        chunk = mall_ids[start:stop]
        print(start, stop)
        print(chunk)

        res = []
        for mall_id in chunk:
            res.extend(SqlGenerator(mall_id).run())

        out_path = os.path.join(out_dir, 'train_and_predict_{}.sql'.format(i))
        with open(out_path, 'w') as f:
            f.writelines(res)
| m_2093 39 | m_2097 40 | m_2108 41 | m_2267 42 | m_2270 43 | m_2324 44 | m_2333 45 | m_2334 46 | m_2419 47 | m_2571 48 | m_2715 49 | m_3034 50 | m_3054 51 | m_3092 52 | m_3112 53 | m_3120 54 | m_3197 55 | m_3219 56 | m_3232 57 | m_3268 58 | m_3319 59 | m_3414 60 | m_3434 61 | m_3520 62 | m_3596 63 | m_3601 64 | m_3610 65 | m_3620 66 | m_3695 67 | m_3702 68 | m_3709 69 | m_3732 70 | m_3795 71 | m_3869 72 | m_3916 73 | m_3936 74 | m_4005 75 | m_4064 76 | m_4066 77 | m_4068 78 | m_4098 79 | m_4173 80 | m_4178 81 | m_4181 82 | m_4199 83 | m_4211 84 | m_4216 85 | m_4227 86 | m_4244 87 | m_4312 88 | m_4357 89 | m_4372 90 | m_4384 91 | m_4406 92 | m_4434 93 | m_4548 94 | m_4634 95 | m_4664 96 | m_4680 97 | m_4711 98 | m_4889 99 | m_4983 100 | m_4998 101 | m_5024 102 | m_5076 103 | m_5192 104 | m_5291 105 | m_5337 106 | m_5343 107 | m_5364 108 | m_5369 109 | m_5452 110 | m_5609 111 | m_5641 112 | m_5654 113 | m_5661 114 | m_5772 115 | m_5783 116 | m_5833 117 | m_5845 118 | m_5847 119 | m_6065 120 | m_613 121 | m_614 122 | m_6141 123 | m_623 124 | m_627 125 | m_628 126 | m_629 127 | m_640 128 | m_6538 129 | m_6596 130 | m_6638 131 | m_6703 132 | m_6714 133 | m_6803 134 | m_699 135 | m_7225 136 | m_7255 137 | m_7256 138 | m_7283 139 | m_7323 140 | m_7329 141 | m_7346 142 | m_7374 143 | m_7375 144 | m_7501 145 | m_7601 146 | m_7671 147 | m_768 148 | m_7697 149 | m_7724 150 | m_7778 151 | m_7781 152 | m_7791 153 | m_7792 154 | m_7800 155 | m_7832 156 | m_7833 157 | m_7870 158 | m_7942 159 | m_7953 160 | m_7973 161 | m_7976 162 | m_7994 163 | m_8015 164 | m_822 165 | m_8222 166 | m_826 167 | m_828 168 | m_8327 169 | m_8344 170 | m_8494 171 | m_8550 172 | m_8563 173 | m_8671 174 | m_8960 175 | m_8991 176 | m_9047 177 | m_9054 178 | m_919 179 | m_1006 180 | m_1052 181 | m_1057 182 | m_1089 183 | m_1106 184 | m_1175 185 | m_1375 186 | m_1377 187 | m_1381 188 | m_1485 189 | m_1790 190 | m_1791 191 | m_1905 192 | m_1906 193 | m_1968 194 | m_1993 195 | m_1997 196 | m_2005 197 | 
m_2021 198 | m_2087 199 | m_2123 200 | m_2156 201 | m_2177 202 | m_2182 203 | m_2218 204 | m_2224 205 | m_2257 206 | m_2299 207 | m_2361 208 | m_2369 209 | m_2395 210 | m_2404 211 | m_2413 212 | m_2414 213 | m_2415 214 | m_2431 215 | m_2467 216 | m_2476 217 | m_2578 218 | m_2864 219 | m_2892 220 | m_2907 221 | m_3001 222 | m_3117 223 | m_3143 224 | m_3231 225 | m_3281 226 | m_3425 227 | m_3442 228 | m_3445 229 | m_3467 230 | m_3501 231 | m_3511 232 | m_3591 233 | m_3605 234 | m_3627 235 | m_3730 236 | m_3753 237 | m_3822 238 | m_3832 239 | m_3847 240 | m_3871 241 | m_3882 242 | m_3938 243 | m_4011 244 | m_4033 245 | m_4036 246 | m_4058 247 | m_4079 248 | m_4094 249 | m_4099 250 | m_4121 251 | m_4132 252 | m_4139 253 | m_4157 254 | m_4162 255 | m_4187 256 | m_4224 257 | m_4253 258 | m_4341 259 | m_4358 260 | m_4380 261 | m_4505 262 | m_4515 263 | m_4524 264 | m_4525 265 | m_4585 266 | m_4695 267 | m_4818 268 | m_5014 269 | m_5182 270 | m_5200 271 | m_5214 272 | m_5296 273 | m_5311 274 | m_5319 275 | m_5323 276 | m_5325 277 | m_5326 278 | m_5352 279 | m_5363 280 | m_5382 281 | m_5413 282 | m_5443 283 | m_5446 284 | m_5471 285 | m_5487 286 | m_5516 287 | m_5527 288 | m_5570 289 | m_5600 290 | m_5677 291 | m_5810 292 | m_5825 293 | m_5892 294 | m_5958 295 | m_615 296 | m_616 297 | m_6167 298 | m_617 299 | m_618 300 | m_619 301 | m_625 302 | m_626 303 | m_6337 304 | m_648 305 | m_6480 306 | m_6511 307 | m_6516 308 | m_6580 309 | m_6587 310 | m_6590 311 | m_6630 312 | m_6720 313 | m_689 314 | m_6923 315 | m_7039 316 | m_7168 317 | m_7304 318 | m_7383 319 | m_7410 320 | m_7523 321 | m_755 322 | m_760 323 | m_7811 324 | m_7821 325 | m_7867 326 | m_7868 327 | m_7899 328 | m_7997 329 | m_7998 330 | m_800 331 | m_802 332 | m_8041 333 | m_8188 334 | m_8200 335 | m_8215 336 | m_8251 337 | m_8282 338 | m_8285 339 | m_8452 340 | m_8974 341 | m_8980 342 | m_9051 343 | m_9068 344 | m_911 345 | m_912 346 | m_957 347 | m_976 348 | m_979 349 | m_988 350 | m_1021 351 | m_1085 352 | 
m_1128 353 | m_1129 354 | m_1161 355 | m_1320 356 | m_1389 357 | m_1409 358 | m_1435 359 | m_1553 360 | m_1585 361 | m_1701 362 | m_1943 363 | m_1950 364 | m_2009 365 | m_2058 366 | m_2060 367 | m_2230 368 | m_2307 369 | m_2450 370 | m_2514 371 | m_2539 372 | m_2878 373 | m_3005 374 | m_3010 375 | m_3019 376 | m_3027 377 | m_3031 378 | m_3313 379 | m_3329 380 | m_3449 381 | m_3517 382 | m_3528 383 | m_3532 384 | m_3534 385 | m_3540 386 | m_3679 387 | m_3690 388 | m_3739 389 | m_3804 390 | m_3839 391 | m_3897 392 | m_3899 393 | m_4049 394 | m_4112 395 | m_4160 396 | m_4168 397 | m_4205 398 | m_4206 399 | m_4221 400 | m_4269 401 | m_4347 402 | m_4422 403 | m_4423 404 | m_4459 405 | m_4466 406 | m_4509 407 | m_4543 408 | m_4572 409 | m_4599 410 | m_4637 411 | m_4759 412 | m_4801 413 | m_4834 414 | m_4853 415 | m_4923 416 | m_5019 417 | m_5081 418 | m_5085 419 | m_5154 420 | m_5258 421 | m_5331 422 | m_5349 423 | m_5374 424 | m_5424 425 | m_5435 426 | m_5450 427 | m_5473 428 | m_5481 429 | m_5503 430 | m_5519 431 | m_5529 432 | m_5542 433 | m_5583 434 | m_5586 435 | m_5751 436 | m_5752 437 | m_5767 438 | m_5778 439 | m_5785 440 | m_5812 441 | m_5813 442 | m_5946 443 | m_621 444 | m_622 445 | m_6390 446 | m_6428 447 | m_6429 448 | m_651 449 | m_6526 450 | m_6527 451 | m_672 452 | m_690 453 | m_7199 454 | m_7516 455 | m_7520 456 | m_7544 457 | m_7616 458 | m_7701 459 | m_7746 460 | m_7796 461 | m_786 462 | m_7939 463 | m_7954 464 | m_796 465 | m_8052 466 | m_8063 467 | m_8093 468 | m_8157 469 | m_8275 470 | m_8414 471 | m_8429 472 | m_8430 473 | m_867 474 | m_8688 475 | m_8835 476 | m_8853 477 | m_8893 478 | m_8907 479 | m_8908 480 | m_8910 481 | m_9007 482 | m_909 483 | m_927 484 | m_954 485 | m_966 486 | m_968 487 | m_989 -------------------------------------------------------------------------------- /hrwhisper/use_category2.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Date : 2017/11/15 3 | # 
class CategoryToVec2(XXToVec):
    """
    Feature extractor that re-uses the category features written per mall by
    'predict_category_pro.py'.

    Recorded score: RF, k=all -> 0.9195226506115334
    """

    def __init__(self):
        super().__init__('./feature_save/category_pro_features_{}_{}.pkl')
        # Only referenced by a disabled top-k experiment; kept so the
        # attribute remains available to callers.
        self.k = 2
        # Filled with (mall_id, 'train' | 'test') in _do_transform.
        self.feature_load_path = './feature_save/category/{}_{}.csv'

    def _do_transform(self, data, mall_id):
        """
        Return the saved category features for the rows of `data`.

        The train and test CSV files of `mall_id` are stacked, indexed by
        'row_id' and left-joined onto `data['row_id']`, so the result has one
        row per row of `data`, in the same order.

        :param data: DataFrame with a 'row_id' column.
            NOTE(review): the CSV ids are read as str, so the join presumably
            expects `data['row_id']` to be str as well - confirm with caller.
        :param mall_id: str, selects the per-mall feature files
        :return: scipy.sparse.csr_matrix of the joined feature columns
        """
        frames = [
            pd.read_csv(self.feature_load_path.format(mall_id, split), dtype={'row_id': str})
            for split in ('train', 'test')
        ]
        # `axis` has to be a keyword: pandas deprecated positional arguments
        # other than `objs` in 1.3 and rejects them from 2.0 on.
        categories = pd.concat(frames, axis=0).set_index('row_id')

        features = data[['row_id']].join(categories, on='row_id', rsuffix='_train').set_index('row_id')
        return csr_matrix(features)

    def _fit_transform(self, train_data, mall_id):
        """Nothing is fitted; the features are looked up from disk."""
        return self._do_transform(train_data, mall_id)

    def _transform(self, test_data, mall_id):
        """Same lookup as `_fit_transform`, applied to `test_data`."""
        return self._do_transform(test_data, mall_id)
class LocationToVec2(XXToVec):
    """
    Latitude/longitude features in which far-away positions are replaced.

    A row whose position is further from the mall centre than
    ``max_area * MAX_EXCEED_AREA`` takes the coordinates of the in-range row
    whose wifi scan is most similar to its own (cosine similarity). When no
    in-range row exists the mall centre is used instead.
    """
    # Per-mall table; the columns read below are mall_id, center_latitude,
    # center_longitude and max_area. Loaded once, when the class is defined.
    _mall_center_and_area = pd.read_csv('./feature_save/mall_center_and_area.csv')
    # Factor applied to a mall's max_area to obtain the allowed distance.
    MAX_EXCEED_AREA = 1
    # Scores noted by the author (presumably local accuracy per tried value;
    # the two score columns were labelled "scale10" and "scale 1"):
    #   0.52  0.9136276668284586  0.91339786
    #   0.6   0.9135071499148205
    #   0.7   0.9134939307403611
    #   0.8   0.913459231292
    #   1     0.9137623822655955
    #   1.1   0.913664140543424
    #   1.2   0.91370422291071
    #   1.3   0.9137034974928522
    #         0.914946325351

    def __init__(self):
        super().__init__('./feature_save/location_features_{}_{}.pkl')
        # Multiplier applied to every emitted coordinate.
        self.scale = 1

    def __get_wifi_number(self, train_data):
        """
        Build a sparse (row x bssid) matrix of shifted signal strengths.

        Every 'wifi_infos' value is a ';'-separated list of
        'bssid|strength|connected' records. Strengths are shifted by +120.
        A bssid that occurs more than once in the same scan is stored under
        'bssid_1', 'bssid_2', ... so that no reading is lost.

        :param train_data: DataFrame with a 'wifi_infos' column
        :return: scipy.sparse.csr_matrix, one row per row of `train_data`,
            columns ordered by sorted bssid key
        """
        wifi_rows = []
        wifi_bssid = set()
        for wifi_infos in train_data['wifi_infos']:
            row = {}
            for wifi in wifi_infos.split(';'):
                _id, _strong, _connect = wifi.split('|')
                key = _id
                suffix = 1
                # The original tested `_t_id in row` here, which never holds
                # for a fresh suffix, so repeated bssids were dropped.
                while key in row:
                    key = _id + '_' + str(suffix)
                    suffix += 1
                row[key] = int(_strong) + 120
                wifi_bssid.add(key)
            wifi_rows.append(row)

        column_of = {_id: i for i, _id in enumerate(sorted(wifi_bssid))}
        indptr = [0]
        indices = []
        data = []
        for row in wifi_rows:
            for key, strength in row.items():
                indices.append(column_of[key])
                data.append(strength)
            indptr.append(len(indices))
        # Explicit shape so the matrix can be built even without any entry.
        return csr_matrix((data, indices, indptr), shape=(len(wifi_rows), len(column_of)))

    def _fit_transform(self, train_data, mall_id):
        """Nothing is fitted; training rows are handled like test rows."""
        return self._transform(train_data, mall_id)

    def _transform(self, test_data, mall_id):
        """
        Return a two-column sparse matrix (latitude, longitude), both
        multiplied by `self.scale`, with out-of-range positions replaced as
        described on the class.

        :param test_data: DataFrame with 'wifi_infos', 'latitude' and
            'longitude' columns
        :param mall_id: str, selects the row of the mall centre/area table
        :return: scipy.sparse.csr_matrix of shape (len(test_data), 2)
        """
        similarity = cosine_similarity(self.__get_wifi_number(test_data))

        mall = self._mall_center_and_area[self._mall_center_and_area['mall_id'] == mall_id]
        center_lat, center_log = mall['center_latitude'].iat[0], mall['center_longitude'].iat[0]
        limit = mall['max_area'].iat[0] * self.MAX_EXCEED_AREA

        lats, logs = test_data['latitude'], test_data['longitude']
        # One distance per row, computed once and shared by both checks.
        outside = [
            limit < get_distance_by_latitude_and_longitude(lat, log, center_lat, center_log)
            for lat, log in zip(lats, logs)
        ]
        in_range = [j for j, is_out in enumerate(outside) if not is_out]

        indptr = [0]
        indices = []
        data = []
        for i, (lat, log) in enumerate(zip(lats, logs)):
            if outside[i]:
                # Borrow from the most similar row that is itself in range.
                # The original repeated the "outside" comparison here and so
                # could only pick another out-of-range row. Ties on the
                # similarity go to the larger row number, as the original
                # descending sort of (similarity, j) did.
                best = max(((similarity[i][j], j) for j in in_range), default=None)
                if best is None:
                    lat, log = center_lat, center_log
                else:
                    lat, log = lats.iat[best[1]], logs.iat[best[1]]
            indices.extend([0, 1])
            data.extend([lat * self.scale, log * self.scale])
            indptr.append(len(indices))

        return csr_matrix((data, indices, indptr), shape=(len(outside), 2))
class WifiStrongToVec(XXToVec):
    """
    Features: numeric id of the strongest bssid of each scan, followed by the
    row's longitude and latitude.
    """

    def __init__(self):
        super().__init__('./feature_save/wifi_strong_features_{}_{}.pkl')
        # Not read anywhere in this class; kept for compatibility.
        self.min_strong = -300

    def _fit_transform(self, data, mall_id):
        """Nothing is fitted; training rows are handled like test rows."""
        return self._transform(data, mall_id)

    def _transform(self, data, mall_id):
        """
        Return a three-column sparse matrix (wifi_strong, longitude, latitude).

        'wifi_strong' is the bssid with the highest strength in the row's
        'wifi_infos' (';'-separated 'b_<number>|strength|connected' records),
        with the 'b_' prefix removed and parsed as int. On equal strengths
        the first record wins.

        A row without wifi information (empty string or a non-string value
        such as NaN) receives the strongest-bssid id seen most often in the
        rows before it, or -1 when no earlier row carried wifi information.
        The original indexed the first key of a Counter here, which raised
        IndexError on an empty history and ignored the counts.

        :param data: DataFrame with 'wifi_infos', 'longitude', 'latitude'
        :param mall_id: unused
        :return: scipy.sparse.csr_matrix with one row per row of `data`
        """
        strongest_ids = []
        seen = Counter()  # strongest ids observed so far, fallbacks excluded
        for wifi_infos in data['wifi_infos']:
            if isinstance(wifi_infos, str) and wifi_infos:
                best_id = None
                best_strong = None
                for wifi in wifi_infos.split(';'):
                    _id, _strong, _connect = wifi.split('|')
                    _strong = int(_strong)
                    # Strict comparison keeps the first of equal maxima.
                    if best_strong is None or _strong > best_strong:
                        best_id, best_strong = _id, _strong
                bssid = int(best_id.replace('b_', ''))
                seen[bssid] += 1
                strongest_ids.append(bssid)
            elif seen:
                strongest_ids.append(seen.most_common(1)[0][0])
            else:
                strongest_ids.append(-1)

        features = data[['longitude', 'latitude']].copy()
        features.insert(0, 'wifi_strong', strongest_ids)
        return csr_matrix(features.values)  # TODO normalize
class UserToVec(XXToVec):
    """
    Per-user visit history: for each row, how often the row's user appeared
    with every shop in the training data, scaled by a MaxAbsScaler.
    """

    def __init__(self):
        super().__init__('./feature_save/user_features_{}_{}.pkl')
        self.shop_to_index = None   # shop_id -> column, set in _fit
        self.user_counter = None    # user_id -> Counter(shop_id), set in _fit
        self.total_counter = None   # Counter(shop_id) over all rows, set in _fit
        self.norm = 1               # number of training rows, set in _fit
        self._scaler = preprocessing.MaxAbsScaler()

    def _fit(self, train_data):
        """
        Count the (user, shop) pairs of `train_data`.

        :param train_data: DataFrame with 'user_id' and 'shop_id' columns
        """
        # Passing the class itself (instead of a lambda) as the factory is
        # equivalent and, unlike a lambda, keeps the mapping picklable.
        self.user_counter = collections.defaultdict(collections.Counter)
        self.total_counter = collections.Counter()

        shops = np.sort(train_data['shop_id'].unique())
        self.shop_to_index = {shop_id: i for i, shop_id in enumerate(shops)}
        for user_id, shop_id in zip(train_data['user_id'], train_data['shop_id']):
            self.user_counter[user_id][shop_id] += 1
            self.total_counter[shop_id] += 1

        self.norm = train_data.shape[0]

    def _fit_transform(self, train_data, mall_id):
        """Fit the counters and the scaler on `train_data`, return its features."""
        self._fit(train_data)
        return self._scaler.fit_transform(self._do_transform(train_data, mall_id))

    def _do_transform(self, data, mall_id):
        """
        Return the unscaled (row x shop) count matrix for `data`.

        The matrix is assembled directly in CSR form, so no dense vector of
        length n_shops is allocated per row. Users that were not seen by
        `_fit` get an all-zero row.

        :param data: DataFrame with a 'user_id' column
        :param mall_id: unused
        :return: float scipy.sparse.csr_matrix, shape (len(data), n_shops)
        """
        n_shops = len(self.shop_to_index)
        indptr = [0]
        indices = []
        counts = []
        for user_id in data['user_id']:
            # .get() never inserts into the defaultdict.
            visits = self.user_counter.get(user_id)
            if visits:
                # Sorted by column so each CSR row has ordered indices.
                for column, cnt in sorted((self.shop_to_index[shop_id], cnt)
                                          for shop_id, cnt in visits.items()):
                    indices.append(column)
                    counts.append(cnt)
            # else: TODO cold-start problem - unseen users stay all-zero;
            # total_counter and norm are recorded for a popularity fallback
            # that is not enabled.
            indptr.append(len(indices))
        return csr_matrix((counts, indices, indptr), shape=(len(indptr) - 1, n_shops), dtype=float)

    def _transform(self, data, mall_id):
        """Return the features of `data`, scaled with the fitted scaler."""
        return self._scaler.transform(self._do_transform(data, mall_id))
def _fit_transform(self, train_data, mall_id):
    """
    Learn the bssid vocabulary and return the training wifi features.

    A bssid observed on fewer than two distinct calendar days is skipped
    (the module notes treat such an access point as a mobile hotspot).
    Signal strengths are shifted by `self.min_strong`. When one scan
    lists the same bssid several times, the entry keeps the maximum
    strength seen until a reading flagged as connected is stored; later
    readings of that bssid in the same scan are then ignored.

    :param train_data: DataFrame with 'wifi_infos' (';'-separated
        'bssid|strength|connected' records) and 'time_stamp'
        ('%Y-%m-%d %H:%M' strings)
    :param mall_id: unused
    :return: int scipy.sparse.csr_matrix, shape (len(train_data), n_bssid);
        also stores the bssid -> column mapping in `self._WIFI_BSSID`
    """
    # Pass 1: on which days was every bssid observed?
    wifi_and_date = collections.defaultdict(set)
    for wifi_infos, _time in zip(train_data['wifi_infos'], train_data['time_stamp']):
        day = str(datetime.strptime(_time, "%Y-%m-%d %H:%M").date())
        for wifi in wifi_infos.split(';'):
            _id, _strong, _connect = wifi.split('|')
            wifi_and_date[_id].add(day)

    # Pass 2: one {bssid: [strength, connected]} dict per row.
    wifi_rows = []
    wifi_bssid = set()
    for wifi_infos in train_data['wifi_infos']:
        row = {}
        for wifi in wifi_infos.split(';'):
            _id, _strong, _connect = wifi.split('|')
            if len(wifi_and_date[_id]) < 2:
                continue
            _strong = int(_strong) - self.min_strong
            if _id not in row:
                row[_id] = [_strong, _connect == 'true']
                wifi_bssid.add(_id)
            else:
                _s, _c = row[_id]
                if not _c:
                    row[_id] = [max(_s, _strong), _connect == 'true']
        wifi_rows.append(row)

    self._WIFI_BSSID = wifi_bssid = {_id: i for i, _id in enumerate(sorted(wifi_bssid))}
    indptr = [0]
    indices = []
    data = []
    for row in wifi_rows:
        for _id, (_strong, _connect) in row.items():
            indices.append(wifi_bssid[_id])
            data.append(_strong)
        indptr.append(len(indices))

    # The shape is passed explicitly, as _transform does: without it scipy
    # has to infer the column count from the indices and raises
    # "unable to infer matrix dimensions" when no bssid survived the filter.
    wifi_features = csr_matrix((data, indices, indptr),
                               shape=(len(wifi_rows), len(wifi_bssid)), dtype=int)  # TODO normalize
    return wifi_features
class WifiKStrongToVec(XXToVec):
    """
    Features: the `kstrong` largest (shifted) wifi signal strengths of each
    scan, strongest first. Column i holds the (i+1)-th strongest reading.
    """

    def __init__(self):
        # NOTE(review): this is the same path template that WifiToVec in
        # use_wifi.py passes to XXToVec - confirm the two extractors cannot
        # overwrite each other's saved features.
        super().__init__('./feature_save/wifi_features_{}_{}.pkl')
        self.min_strong = -120
        self._WIFI_BSSID = None  # bssid key -> index, set by _fit_transform
        self.kstrong = 1

    @staticmethod
    def _duplicate_key(_id, row, n_records, known=None):
        """
        Return the first of '<_id>_1' .. '<_id>_<n_records - 1>' that is not
        yet a key of `row` (and is contained in `known` when given), or None.
        """
        for i in range(1, n_records):
            candidate = _id + '_' + str(i)
            if candidate not in row and (known is None or candidate in known):
                return candidate
        return None

    def _top_k_matrix(self, wifi_rows):
        """Turn per-row {key: strength} dicts into the (n_rows, kstrong) matrix."""
        indptr = [0]
        indices = []
        data = []
        for row in wifi_rows:
            strongest = sorted(row.values(), reverse=True)[:self.kstrong]
            indices.extend(range(len(strongest)))
            data.extend(strongest)
            indptr.append(len(indices))
        return csr_matrix((data, indices, indptr), shape=(len(wifi_rows), self.kstrong),
                          dtype=int)  # TODO normalize

    def _fit_transform(self, train_data, mall_id):
        """
        Learn the bssid vocabulary and return the training features.

        A bssid observed on fewer than two distinct calendar days is skipped.
        A bssid repeated inside one scan is stored under a numbered key
        ('<bssid>_1', ...), so each of its readings can enter the top-k. The
        original tested `_t_id in row` at this point, which never holds for
        a fresh key, so repeated readings were dropped.

        :param train_data: DataFrame with 'wifi_infos' (';'-separated
            'bssid|strength|connected' records) and 'time_stamp'
            ('%Y-%m-%d %H:%M' strings)
        :param mall_id: unused
        :return: int scipy.sparse.csr_matrix, shape (len(train_data), kstrong)
        """
        # Pass 1: on which days was every bssid observed?
        wifi_and_date = collections.defaultdict(set)
        for wifi_infos, _time in zip(train_data['wifi_infos'], train_data['time_stamp']):
            day = str(datetime.strptime(_time, "%Y-%m-%d %H:%M").date())
            for wifi in wifi_infos.split(';'):
                _id, _strong, _connect = wifi.split('|')
                wifi_and_date[_id].add(day)

        # Pass 2: one {key: strength} dict per row.
        wifi_rows = []
        wifi_bssid = set()
        for wifi_infos in train_data['wifi_infos']:
            records = wifi_infos.split(';')
            row = {}
            for wifi in records:
                _id, _strong, _connect = wifi.split('|')
                if len(wifi_and_date[_id]) < 2:
                    continue
                _strong = int(_strong) - self.min_strong
                key = _id if _id not in row else self._duplicate_key(_id, row, len(records))
                if key is None:
                    continue
                row[key] = _strong
                wifi_bssid.add(key)
            wifi_rows.append(row)

        self._WIFI_BSSID = {_id: i for i, _id in enumerate(sorted(wifi_bssid))}
        return self._top_k_matrix(wifi_rows)

    def _transform(self, test_data, mall_id):
        """
        Return the top-k strength features for `test_data`.

        Readings whose bssid is not in the fitted vocabulary are ignored. A
        row left without any reading copies the readings of the nearest row
        (by latitude/longitude) that has some; rows are repaired in order,
        so an already repaired row can serve as a donor.

        :param test_data: DataFrame with 'wifi_infos', 'latitude', 'longitude'
        :param mall_id: unused
        :return: int scipy.sparse.csr_matrix, shape (len(test_data), kstrong)
        """
        wifi_bssid = self._WIFI_BSSID
        wifi_rows = []
        to_add = []
        # `row_no` is kept distinct from every inner loop variable; the
        # original reused `i` for both the row and the duplicate suffix.
        for row_no, wifi_infos in enumerate(test_data['wifi_infos']):
            records = wifi_infos.split(';')
            row = {}
            for wifi in records:
                _id, _strong, _connect = wifi.split('|')
                if _id not in wifi_bssid:
                    continue
                _strong = int(_strong) - self.min_strong
                key = _id if _id not in row else self._duplicate_key(_id, row, len(records), wifi_bssid)
                if key is not None:
                    row[key] = _strong
            if not row:
                to_add.append(row_no)
            wifi_rows.append(row)

        # Find the nearest row whose wifi readings are not empty.
        lats, logs = test_data['latitude'], test_data['longitude']
        for row_no in to_add:
            lat, log = lats.iat[row_no], logs.iat[row_no]
            nearest = min(
                ((get_distance_by_latitude_and_longitude(lat, log, lats.iat[j], logs.iat[j]), j)
                 for j in range(len(wifi_rows)) if j != row_no and wifi_rows[j]),
                default=None)
            if nearest is not None:
                wifi_rows[row_no] = wifi_rows[nearest[1]]

        return self._top_k_matrix(wifi_rows)
def only_mall_visualization(mall_id=None):
    """
    Scatter-plot the records returned by read_mall_data() by latitude
    (x axis) and longitude (y axis).

    With a `mall_id` only that mall is drawn and points are coloured per
    shop; without one everything is drawn and points are coloured per mall.
    Positions shared by more than one record are labelled with their count.

    :param mall_id: str
    :return:
    """
    records = read_mall_data()  # read_train_join_mall()
    if mall_id:
        records = records[records['mall_id'] == mall_id]
        colour_column = 'shop_id'
    else:
        colour_column = 'mall_id'

    # One integer colour per distinct id, in order of first appearance.
    palette = {}
    for number, value in enumerate(records[colour_column].unique()):
        palette[value] = number
    colors = [palette[value] for value in records[colour_column]]

    latitudes = records['latitude']
    longitudes = records['longitude']

    position_counts = collections.Counter(zip(latitudes, longitudes))
    for position, cnt in position_counts.items():
        if cnt <= 1:
            continue
        plt.text(position[0], position[1], cnt)

    plt.scatter(latitudes, longitudes, s=100, c=colors, alpha=0.5)
    show_plt()
def draw_wifi(wifi_counter, mall_id):
    """
    Show a bar chart with one bar per entry of `wifi_counter`; the height of
    a bar is the length of the entry's value.

    :param wifi_counter: mapping whose values support len()
    :param mall_id: inserted into the chart title
    """
    bar_width = 0.35  # the width of the bars
    bar_heights = list(map(len, wifi_counter.values()))
    positions = np.arange(len(wifi_counter))  # the x locations for the groups

    _, axes = plt.subplots()
    axes.bar(positions, bar_heights, bar_width)

    axes.set_ylabel('Counts')
    axes.set_title('mall={} WIFI co-occurrence statics'.format(mall_id))
    axes.set_xticks(positions + bar_width / 2)
    plt.show()