├── README.md ├── __pycache__ └── config.cpython-35.pyc ├── config.py ├── dataset ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-35.pyc │ ├── data.cpython-35.pyc │ └── sample.cpython-35.pyc ├── data.py └── sample.py ├── feature ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-35.pyc │ ├── filter.cpython-35.pyc │ ├── latlon.cpython-35.pyc │ ├── leak.cpython-35.pyc │ ├── location.cpython-35.pyc │ ├── main.cpython-35.pyc │ ├── other.cpython-35.pyc │ ├── rule.cpython-35.pyc │ └── user.cpython-35.pyc ├── filter.py ├── latlon.py ├── leak.py ├── location.py ├── main.py ├── other.py ├── rule.py └── user.py ├── main.py ├── submit.py └── utils ├── __init__.py ├── __pycache__ ├── __init__.cpython-35.pyc ├── eval.cpython-35.pyc ├── helper.cpython-35.pyc └── label.cpython-35.pyc ├── eval.py └── helper.py /README.md: -------------------------------------------------------------------------------- 1 | # Mobike 2 | 摩拜杯 算法挑战赛 第三名 解决方案 3 | 4 | 代码使用fire设置入口,如:python3 main.py train 5 | -------------------------------------------------------------------------------- /__pycache__/config.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Magic-Bubble/Mobike/7492d9ac7e05a22a61c2435a24d14a15387ccaf2/__pycache__/config.cpython-35.pyc -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | class DefaultConfig(dict): 2 | def __init__(self): 3 | # ------------ 数据路径 ------------ 4 | 5 | self['data_dir'] = '../../MOBIKE_CUP_2017' 6 | self['train_csv'] = self['data_dir'] + '/train.csv' 7 | self['test_csv'] = self['data_dir'] + '/test.csv' 8 | self['cache_dir'] = '../cache' 9 | self['model_dir'] = '../snapshot' 10 | self['result_dir'] = '../result' 11 | 12 | # ------------ 训练参数 -------- 13 | 14 | self['startday'] = 23 15 | self['endday'] = 25 16 | self['lgb_leaves'] = 96 17 | self['lgb_lr'] = 0.05 18 | 19 | # ------------ 测试参数 -------- 20 | 21 | self['test_startday'] = 25 22 | self['test_endday'] = 26 23 | self['model_name'] = None 24 | 25 | # -------- 是否有用户 -------- 26 | 27 | self['user'] = True 28 | 29 | def update(self, **kwargs): 30 | for key in kwargs: 31 | self[key] = kwargs[key] 32 | self['time_prefix'] = '2017-05-' 33 | self['time_suffix'] = ' 00:00:00' 34 | self['starttime'] = '2017-05-' + str(self['startday']) + ' 00:00:00' 35 | self['endtime'] = '2017-05-' + str(self['endday']) + ' 00:00:00' 36 | self['test_starttime'] = '2017-05-' + str(self['test_startday']) + ' 00:00:00' 37 | self['test_endtime'] = '2017-05-' + str(self['test_endday']) + ' 00:00:00' 38 | 39 | def printf(self): 40 | print('Current Config:') 41 | for key in self: 42 | print('{}: {}'.format(key, self[key])) 43 | -------------------------------------------------------------------------------- /dataset/__init__.py: -------------------------------------------------------------------------------- 1 | from .data import get_train_data, get_test_data 2 | from .sample import get_sample 3 | -------------------------------------------------------------------------------- /dataset/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Magic-Bubble/Mobike/7492d9ac7e05a22a61c2435a24d14a15387ccaf2/dataset/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /dataset/__pycache__/data.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Magic-Bubble/Mobike/7492d9ac7e05a22a61c2435a24d14a15387ccaf2/dataset/__pycache__/data.cpython-35.pyc -------------------------------------------------------------------------------- /dataset/__pycache__/sample.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Magic-Bubble/Mobike/7492d9ac7e05a22a61c2435a24d14a15387ccaf2/dataset/__pycache__/sample.cpython-35.pyc -------------------------------------------------------------------------------- /dataset/data.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | import pandas as pd 3 | import numpy as np 4 | import os 5 | os.path.join('..') 6 | from feature.other import get_distance, get_hour, get_latlon 7 | 8 | # 获取训练数据 9 | def get_train_data(opt): 10 | train = pd.read_csv(opt['train_csv']) 11 | train1 = train[(train['starttime'] < opt['starttime']) | (train['starttime'] >= opt['endtime'])] 12 | train2 = train[(train['starttime'] >= opt['starttime']) & (train['starttime'] < opt['endtime'])] 13 | # train2_old = train2[train2.userid.isin(train1.userid)] 14 | # train2_new = train2[~train2.userid.isin(train1.userid)] 15 | # train2_old_sample = train2_old.sample(frac=0.8) 16 | # train2 = pd.concat([train2_old_sample, train2_new]) 17 | # train2_newuser = train2[~train2.userid.isin(train1['userid'])] 18 | # train2 = pd.concat([train2, train2_newuser, train2_newuser]) 19 | # train1 = train[(train['starttime'] < '2017-05-18 00:00:00') | ((train['starttime'] >= '2017-05-20 00:00:00') & (train['starttime'] < '2017-05-22 00:00:00'))] 20 | # train2 = train[((train['starttime'] >= '2017-05-18 00:00:00') & (train['starttime'] < '2017-05-20 00:00:00')) | (train['starttime'] >= '2017-05-22 00:00:00')] 21 | del train2['geohashed_end_loc'] 22 | train1 = add_info(train1) # 添加小时信息、距离信息和经纬度信息 23 | #test = pd.read_csv(opt['test_csv']) # add 24 | #train2 = get_hour(train2) # add 25 | #train2 = get_latlon(train2, end=False) # add 26 | #test = get_hour(test) # add 27 | #test = get_latlon(test, end=False) # add 28 | #train_all = pd.concat([train1, train2, test]) # add 29 | print('训练数据加载完成:', train1.shape, train2.shape)#, train_all.shape) 30 | return train1, train2#, train_all 31 | 32 | # 获取测试数据 33 | def get_test_data(opt): 34 | train = pd.read_csv(opt['train_csv']) 35 | test = pd.read_csv(opt['test_csv']) 36 | # test_all = test.copy() 37 | if opt['test_endtime'] < opt['test_starttime']: test = test[(test['starttime'] >= opt['test_starttime'])] 38 | else: test = test[(test['starttime'] >= opt['test_starttime']) & (test['starttime'] < opt['test_endtime'])] 39 | train = add_info(train) # 添加小时信息、距离信息和经纬度信息 40 | # test_all = get_hour(test_all) 41 | # test_all = get_latlon(test_all, end=False) 42 | # train_all = pd.concat([train, test_all]) 43 | print('测试数据加载完成:', train.shape, test.shape)#, train_all.shape) 44 | return train, test#, train_all 45 | 46 | def add_info(res): 47 | res = get_distance(res) 48 | res = get_hour(res) 49 | res = get_latlon(res) 50 | return res -------------------------------------------------------------------------------- /dataset/sample.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | import pandas as pd 3 | 4 | # 将用户的历史目的地加入为样本 5 | def get_user_end_loc(train, test): 6 | user_eloc = train[['userid', 'geohashed_end_loc']] 7 | result = pd.merge(test[['orderid', 'userid']], user_eloc, on='userid', how='left') 8 | result = result[['orderid', 'geohashed_end_loc']].drop_duplicates() 9 | result['user_end_loc_sample'] = 1 10 | return result 11 | 12 | # 将用户的历史出发地加入为样本 13 | def get_user_start_loc(train, test): 14 | # user_sloc_train = train[['userid', 'geohashed_start_loc']] 15 | # user_sloc_test = test[['userid', 'geohashed_start_loc']] 16 | user_sloc_train = pd.read_csv('../../MOBIKE_CUP_2017/train.csv')[['userid', 'geohashed_start_loc']].drop_duplicates() 17 | user_sloc_test = pd.read_csv('../../MOBIKE_CUP_2017/test.csv')[['userid', 'geohashed_start_loc']].drop_duplicates() 18 | user_sloc = pd.concat([user_sloc_train, user_sloc_test]) 19 | # user_sloc = train[['userid', 'geohashed_start_loc']].drop_duplicates() 20 | result = pd.merge(test[['orderid', 'userid']], user_sloc, on='userid', how='left') 21 | result.rename(columns={'geohashed_start_loc':'geohashed_end_loc'}, inplace=True) 22 | result = result[['orderid', 'geohashed_end_loc']].drop_duplicates() 23 | result['user_start_loc_sample'] = 1 24 | return result 25 | 26 | # 将起始地点出发到的Top10地方加入为样本 27 | def get_loc_to_loc(train, test): 28 | # sloc_eloc_count = train.groupby(['geohashed_start_loc', 'geohashed_end_loc'], as_index=False)['geohashed_end_loc'].agg({'sloc_eloc_count':'count'}) 29 | # sloc_eloc_count.sort_values('sloc_eloc_count', inplace=True) 30 | # if train['userid'].values[0] != -1: sloc_eloc_count = sloc_eloc_count.groupby('geohashed_start_loc').tail(15) 31 | sloc_eloc = train[['geohashed_start_loc', 'geohashed_end_loc']].drop_duplicates() 32 | result = pd.merge(test[['orderid', 'geohashed_start_loc']], sloc_eloc, on='geohashed_start_loc', how='left') 33 | # result = pd.merge(test[['orderid', 'geohashed_start_loc']], sloc_eloc_count, on='geohashed_start_loc', how='left') 34 | result = result[['orderid', 'geohashed_end_loc']].drop_duplicates() 35 | result['loc_to_loc_sample'] = 1 36 | return result 37 | 38 | # 将热度大于2的地址对加入为样本 39 | # def get_loc_to_loc2(train, test): 40 | # sloc_eloc_count = train.groupby(['geohashed_start_loc', 'geohashed_end_loc'], as_index=False)['geohashed_end_loc'].agg({'sloc_eloc_count':'count'}) 41 | # sloc_eloc_count = sloc_eloc_count[sloc_eloc_count['sloc_eloc_count']>=2] 42 | # result = pd.merge(test[['orderid', 'geohashed_start_loc']], sloc_eloc_count, on='geohashed_start_loc', how='left') 43 | # result = result[['orderid', 'geohashed_end_loc']].drop_duplicates() 44 | # result['loc_to_loc2_sample'] = 1 45 | # return result 46 | 47 | # 将以起始地点作为结束地点出发的Top10地方加入为样本 48 | # def get_loc_to_loc2(train, test): 49 | # sloc_eloc_count = train.groupby(['geohashed_start_loc', 'geohashed_end_loc'], as_index=False)['geohashed_end_loc'].agg({'sloc_eloc_count':'count'}) 50 | # sloc_eloc_count.sort_values('sloc_eloc_count', inplace=True) 51 | # sloc_eloc_count.rename(columns={'geohashed_start_loc': 'geohashed_end_loc', 'geohashed_end_loc': 'geohashed_start_loc'}, inplace=True) 52 | # if train['userid'].values[0] != -1: sloc_eloc_count = sloc_eloc_count.groupby('geohashed_start_loc').tail(10) 53 | # result = pd.merge(test[['orderid', 'geohashed_start_loc']], sloc_eloc_count, on='geohashed_start_loc', how='left') 54 | # result = result[['orderid', 'geohashed_end_loc']] 55 | # return result 56 | 57 | # 将热度最高的Top10目的地点加入为样本 58 | # def get_hot_eloc(train, test): 59 | # hot_eloc = train.groupby('geohashed_end_loc', as_index=False)['orderid'].agg({'cnt': 'count'}) 60 | # hot_eloc.sort_values(by='cnt', inplace=True) 61 | # hot_eloc = hot_eloc['geohashed_end_loc'].tail(10) 62 | # hot_eloc = pd.concat([hot_eloc] * test.shape[0]).reset_index() 63 | # result = pd.DataFrame({'orderid': pd.concat([test['orderid']] * 10)}).sort_values(by='orderid').reset_index() 64 | # result.loc[:, 'geohashed_end_loc'] = hot_eloc 65 | # return result 66 | 67 | # 将自行车下一个出发地点(在全部测试集中寻找)加入为样本 68 | def get_bike_next_sloc(train, test): 69 | train_set = pd.read_csv('../../MOBIKE_CUP_2017/train.csv') 70 | test_set = pd.read_csv('../../MOBIKE_CUP_2017/test.csv') 71 | all_set = pd.concat([train_set, test_set]) 72 | bike_sloc = all_set[['orderid', 'bikeid', 'geohashed_start_loc', 'starttime']] 73 | bike_sloc.sort_values(by=['bikeid', 'starttime'], inplace=True, ascending=True) 74 | bike_sloc['next_bikeid'] = bike_sloc['bikeid'].shift(-1) 75 | bike_sloc['geohashed_end_loc'] = bike_sloc['geohashed_start_loc'].shift(-1) 76 | result = bike_sloc[(bike_sloc['bikeid'] == bike_sloc['next_bikeid']) & (bike_sloc['starttime'] >= test['starttime'].min()) & (bike_sloc['starttime'] <= test['starttime'].max())] 77 | result = result[['orderid', 'geohashed_end_loc']].drop_duplicates() 78 | result['bike_next_sloc_sample'] = 1 79 | # bike_sloc = test[['orderid', 'bikeid', 'geohashed_start_loc', 'starttime']].drop_duplicates() 80 | # bike_sloc.sort_values(by=['bikeid', 'starttime'], inplace=True, ascending=True) 81 | # bike_sloc['next_bikeid'] = bike_sloc['bikeid'].shift(-1) 82 | # bike_sloc['geohashed_end_loc'] = bike_sloc['geohashed_start_loc'].shift(-1) 83 | # result = bike_sloc[bike_sloc['bikeid'] == bike_sloc['next_bikeid']] 84 | # result = result[['orderid', 'geohashed_end_loc']] 85 | return result 86 | 87 | # 寻找用户的下一个出发地点(在全部测试集中寻找)加入为样本 88 | def get_user_next_sloc(train, test): 89 | train_set = pd.read_csv('../../MOBIKE_CUP_2017/train.csv') 90 | test_set = pd.read_csv('../../MOBIKE_CUP_2017/test.csv') 91 | all_set = pd.concat([train_set, test_set]) 92 | user_sloc = all_set[['orderid', 'userid', 'geohashed_start_loc', 'starttime']] 93 | user_sloc.sort_values(by=['userid', 'starttime'], inplace=True, ascending=True) 94 | user_sloc['next_userid'] = user_sloc['userid'].shift(-1) 95 | user_sloc['geohashed_end_loc'] = user_sloc['geohashed_start_loc'].shift(-1) 96 | result = user_sloc[(user_sloc['userid'] == user_sloc['next_userid']) & (user_sloc['starttime'] >= test['starttime'].min()) & (user_sloc['starttime'] <= test['starttime'].max())] 97 | result = result[['orderid', 'geohashed_end_loc']].drop_duplicates() 98 | return result 99 | 100 | # 构造样本 101 | def get_sample(train, test, load=False): 102 | user_start_loc = get_user_start_loc(train, test) 103 | user_end_loc = get_user_end_loc(train, test) 104 | loc_to_loc = get_loc_to_loc(train, test) 105 | bike_next_sloc = get_bike_next_sloc(train, test) 106 | # 汇总 107 | result = pd.concat([user_end_loc[['orderid', 'geohashed_end_loc']], 108 | user_start_loc[['orderid', 'geohashed_end_loc']], 109 | loc_to_loc[['orderid', 'geohashed_end_loc']], 110 | bike_next_sloc[['orderid', 'geohashed_end_loc']] 111 | ]).drop_duplicates() 112 | 113 | restmp = pd.concat([user_end_loc, 114 | user_start_loc, 115 | loc_to_loc, 116 | bike_next_sloc 117 | ]) 118 | restmp.fillna(0, inplace=True) 119 | restmp = restmp.groupby(['orderid', 'geohashed_end_loc'], as_index=False).sum() 120 | 121 | result = pd.merge(result, restmp, on=['orderid', 'geohashed_end_loc'], how='left') 122 | # 添加负样本 123 | result = pd.merge(result, test, on='orderid', how='left') 124 | # 删除出发点和目的点相同的样本 以及 异常值 125 | result = result[result['geohashed_end_loc'] != result['geohashed_start_loc']] 126 | result = result[(~result['geohashed_start_loc'].isnull()) & (~result['geohashed_end_loc'].isnull())] 127 | # print("原始样本数目:", result.shape) 128 | # sample_leak = bike_next_sloc[['orderid', 'geohashed_end_loc']].drop_duplicates() 129 | # print("Leak样本数目:", sample_leak.shape) 130 | # sample_leak['leak'] = 1 131 | # result = pd.merge(result, sample_leak, on=['orderid', 'geohashed_end_loc'], how='left') 132 | # result = result[result.leak.isnull()] 133 | # print("过滤Leak之后的样本数目:", result.shape) 134 | # del result['leak'] 135 | if load: 136 | sample_27 = pd.read_pickle('/home/xuwenchao/dyj-storage/wc-sample/sample_filter_{}.pickle'.format(test.shape[0]))[['orderid', 'geohashed_end_loc', 'userid', 'bikeid', 'biketype', 'starttime', 'geohashed_start_loc']] 137 | # sample_27 = pd.concat([pd.read_pickle('/home/xuwenchao/dyj-storage/wc-sample/sample_filter_23_1_27_.pickle'), pd.read_pickle('/home/xuwenchao/dyj-storage/wc-sample/sample_filter_24_1_27_.pickle')])[['orderid', 'geohashed_end_loc', 'userid', 'bikeid', 'biketype', 'starttime', 'geohashed_start_loc']] 138 | result = pd.merge(sample_27, result[['orderid', 'geohashed_end_loc', 'user_end_loc_sample', 'user_start_loc_sample', 'loc_to_loc_sample', 'bike_next_sloc_sample']], on=['orderid', 'geohashed_end_loc'], how='left') 139 | print('构造样本完成:', result.shape, result[result.geohashed_start_loc.isnull()].shape) 140 | return result -------------------------------------------------------------------------------- /feature/__init__.py: -------------------------------------------------------------------------------- 1 | from .main import get_feat 2 | from .other import get_distance -------------------------------------------------------------------------------- /feature/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Magic-Bubble/Mobike/7492d9ac7e05a22a61c2435a24d14a15387ccaf2/feature/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /feature/__pycache__/filter.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Magic-Bubble/Mobike/7492d9ac7e05a22a61c2435a24d14a15387ccaf2/feature/__pycache__/filter.cpython-35.pyc -------------------------------------------------------------------------------- /feature/__pycache__/latlon.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Magic-Bubble/Mobike/7492d9ac7e05a22a61c2435a24d14a15387ccaf2/feature/__pycache__/latlon.cpython-35.pyc -------------------------------------------------------------------------------- /feature/__pycache__/leak.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Magic-Bubble/Mobike/7492d9ac7e05a22a61c2435a24d14a15387ccaf2/feature/__pycache__/leak.cpython-35.pyc -------------------------------------------------------------------------------- /feature/__pycache__/location.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Magic-Bubble/Mobike/7492d9ac7e05a22a61c2435a24d14a15387ccaf2/feature/__pycache__/location.cpython-35.pyc -------------------------------------------------------------------------------- /feature/__pycache__/main.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Magic-Bubble/Mobike/7492d9ac7e05a22a61c2435a24d14a15387ccaf2/feature/__pycache__/main.cpython-35.pyc -------------------------------------------------------------------------------- /feature/__pycache__/other.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Magic-Bubble/Mobike/7492d9ac7e05a22a61c2435a24d14a15387ccaf2/feature/__pycache__/other.cpython-35.pyc -------------------------------------------------------------------------------- /feature/__pycache__/rule.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Magic-Bubble/Mobike/7492d9ac7e05a22a61c2435a24d14a15387ccaf2/feature/__pycache__/rule.cpython-35.pyc -------------------------------------------------------------------------------- /feature/__pycache__/user.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Magic-Bubble/Mobike/7492d9ac7e05a22a61c2435a24d14a15387ccaf2/feature/__pycache__/user.cpython-35.pyc -------------------------------------------------------------------------------- /feature/filter.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | import pandas as pd 3 | import Geohash as geohash 4 | import numpy as np 5 | import os 6 | import tqdm 7 | os.path.join('..') 8 | from utils import rank 9 | 10 | ''' 11 | 获取协同过滤特征 12 | ''' 13 | 14 | # ----------------- 计数 ------------------- 15 | 16 | # 获取地址对的协同过滤信息 17 | def get_loc_filter(train, result): 18 | sloc_elocs, eloc_slocs = {}, {} 19 | for i in tqdm.tqdm(train[['geohashed_start_loc', 'geohashed_end_loc']].values): 20 | if i[0] not in sloc_elocs: 21 | sloc_elocs[i[0]] = {} 22 | if i[1] not in sloc_elocs[i[0]]: 23 | sloc_elocs[i[0]][i[1]] = 0 24 | sloc_elocs[i[0]][i[1]] += 1 25 | if i[1] not in eloc_slocs: 26 | eloc_slocs[i[1]] = {} 27 | if i[0] not in eloc_slocs[i[1]]: 28 | eloc_slocs[i[1]][i[0]] = 0; 29 | eloc_slocs[i[1]][i[0]] += 1 30 | sloc_list, eloc_list, sloc_eloc_common_eloc_count, sloc_eloc_common_sloc_count, sloc_eloc_common_conn1_count, sloc_eloc_common_conn2_count = [], [], [], [], [], [] 31 | for i in tqdm.tqdm(result[['geohashed_start_loc', 'geohashed_end_loc']].drop_duplicates().values): 32 | sloc_list.append(i[0]) 33 | eloc_list.append(i[1]) 34 | # 获取地址对在历史记录中共有的目的地数目 35 | common_eloc_count = 0 36 | if (i[0] in sloc_elocs) and (i[1] in sloc_elocs): 37 | sloc_eloc_common_eloc_set = sloc_elocs[i[0]].keys() & sloc_elocs[i[1]].keys() 38 | for common_eloc in sloc_eloc_common_eloc_set: 39 | common_eloc_count = common_eloc_count + sloc_elocs[i[0]][common_eloc] + sloc_elocs[i[1]][common_eloc] 40 | sloc_eloc_common_eloc_count.append(common_eloc_count) 41 | # 获取地址对在历史记录中共有的出发地数目 42 | common_sloc_count = 0 43 | if (i[0] in eloc_slocs) and (i[1] in eloc_slocs): 44 | sloc_eloc_common_sloc_set = eloc_slocs[i[0]].keys() & eloc_slocs[i[1]].keys() 45 | for common_sloc in sloc_eloc_common_sloc_set: 46 | common_sloc_count = common_sloc_count + eloc_slocs[i[0]][common_sloc] + eloc_slocs[i[1]][common_sloc] 47 | sloc_eloc_common_sloc_count.append(common_sloc_count) 48 | # 获取地址对在历史记录中共有的连接点数目(出发点->xx->目的地) 49 | common_conn1_count = 0 50 | if (i[0] in sloc_elocs) and (i[1] in eloc_slocs): 51 | sloc_eloc_common_conn1_set = sloc_elocs[i[0]].keys() & eloc_slocs[i[1]].keys() 52 | for common_conn1 in sloc_eloc_common_conn1_set: 53 | common_conn1_count = common_conn1_count + sloc_elocs[i[0]][common_conn1] + eloc_slocs[i[1]][common_conn1] 54 | sloc_eloc_common_conn1_count.append(common_conn1_count) 55 | # 获取地址对在历史记录中共有的连接点数目(出发点<-xx<-目的地) 56 | common_conn2_count = 0 57 | if (i[0] in eloc_slocs) and (i[1] in sloc_elocs): 58 | sloc_eloc_common_conn2_set = eloc_slocs[i[0]].keys() & sloc_elocs[i[1]].keys() 59 | for common_conn2 in sloc_eloc_common_conn2_set: 60 | common_conn2_count = common_conn2_count + eloc_slocs[i[0]][common_conn2] + sloc_elocs[i[1]][common_conn2] 61 | sloc_eloc_common_conn2_count.append(common_conn2_count) 62 | loc_filter = pd.DataFrame({"geohashed_start_loc": sloc_list, "geohashed_end_loc": eloc_list, "sloc_eloc_common_eloc_count": sloc_eloc_common_eloc_count, "sloc_eloc_common_sloc_count": sloc_eloc_common_sloc_count, "sloc_eloc_common_conn1_count": sloc_eloc_common_conn1_count, "sloc_eloc_common_conn2_count": sloc_eloc_common_conn2_count}) 63 | result = pd.merge(result, loc_filter, on=['geohashed_start_loc', 'geohashed_end_loc'], how='left') 64 | result['sloc_eloc_common_eloc_rate'] = result['sloc_eloc_common_eloc_count']/(result['sloc_count']+result['eloc_as_sloc_count']) 65 | result['sloc_eloc_common_sloc_rate'] = result['sloc_eloc_common_sloc_count']/(result['sloc_as_eloc_count']+result['eloc_count']) 66 | result['sloc_eloc_common_conn1_rate'] = result['sloc_eloc_common_conn1_count']/(result['sloc_count']+result['eloc_count']) 67 | result['sloc_eloc_common_conn2_rate'] = result['sloc_eloc_common_conn2_count']/(result['sloc_as_eloc_count']+result['eloc_as_sloc_count']) 68 | return result 69 | 70 | # 获取用户地址对的协同过滤信息 71 | def get_user_loc_filter(train, result): 72 | user_sloc_elocs, user_eloc_slocs = {}, {} 73 | for i in tqdm.tqdm(train[['userid', 'geohashed_start_loc', 'geohashed_end_loc']].values): 74 | if i[0] not in user_sloc_elocs: 75 | user_sloc_elocs[i[0]] = {} 76 | if i[1] not in user_sloc_elocs[i[0]]: 77 | user_sloc_elocs[i[0]][i[1]] = {} 78 | if i[2] not in user_sloc_elocs[i[0]][i[1]]: 79 | user_sloc_elocs[i[0]][i[1]][i[2]] = 0 80 | user_sloc_elocs[i[0]][i[1]][i[2]] += 1 81 | if i[0] not in user_eloc_slocs: 82 | user_eloc_slocs[i[0]] = {} 83 | if i[2] not in user_eloc_slocs[i[0]]: 84 | user_eloc_slocs[i[0]][i[2]] = {}; 85 | if i[1] not in user_eloc_slocs[i[0]][i[2]]: 86 | user_eloc_slocs[i[0]][i[2]][i[1]] = 0 87 | user_eloc_slocs[i[0]][i[2]][i[1]] += 1 88 | user_list, user_sloc_list, user_eloc_list, user_sloc_eloc_common_eloc_count, user_sloc_eloc_common_sloc_count, user_sloc_eloc_common_conn1_count, user_sloc_eloc_common_conn2_count = [], [], [], [], [], [], [] 89 | for i in tqdm.tqdm(result[['userid', 'geohashed_start_loc', 'geohashed_end_loc']].drop_duplicates().values): 90 | user_list.append(i[0]) 91 | user_sloc_list.append(i[1]) 92 | user_eloc_list.append(i[2]) 93 | # 获取地址对在用户历史记录中共有的目的地数目 94 | user_common_eloc_count = 0 95 | if (i[0] in user_sloc_elocs) and (i[1] in user_sloc_elocs[i[0]]) and (i[2] in user_sloc_elocs[i[0]]): 96 | user_sloc_eloc_common_eloc_set = user_sloc_elocs[i[0]][i[1]].keys() & user_sloc_elocs[i[0]][i[2]].keys() 97 | for user_common_eloc in user_sloc_eloc_common_eloc_set: 98 | user_common_eloc_count = user_common_eloc_count + user_sloc_elocs[i[0]][i[1]][user_common_eloc] + user_sloc_elocs[i[0]][i[2]][user_common_eloc] 99 | user_sloc_eloc_common_eloc_count.append(user_common_eloc_count) 100 | # 获取地址对在用户历史记录中共有的出发地数目 101 | user_common_sloc_count = 0 102 | if (i[0] in user_eloc_slocs) and (i[1] in user_eloc_slocs[i[0]]) and (i[2] in user_eloc_slocs[i[0]]): 103 | user_sloc_eloc_common_sloc_set = user_eloc_slocs[i[0]][i[1]].keys() & user_eloc_slocs[i[0]][i[2]].keys() 104 | for user_common_sloc in user_sloc_eloc_common_sloc_set: 105 | user_common_sloc_count = user_common_sloc_count + user_eloc_slocs[i[0]][i[1]][user_common_sloc] + user_eloc_slocs[i[0]][i[2]][user_common_sloc] 106 | user_sloc_eloc_common_sloc_count.append(user_common_sloc_count) 107 | # 获取地址对在用户历史记录中共有的连接点数目(出发点->xx->目的地) 108 | user_common_conn1_count = 0 109 | if (i[0] in user_sloc_elocs) and (i[1] in user_sloc_elocs[i[0]]) and (i[0] in user_eloc_slocs) and (i[2] in user_eloc_slocs[i[0]]): 110 | user_sloc_eloc_common_conn1_set = user_sloc_elocs[i[0]][i[1]].keys() & user_eloc_slocs[i[0]][i[2]].keys() 111 | for user_common_conn1 in user_sloc_eloc_common_conn1_set: 112 | user_common_conn1_count = user_common_conn1_count + user_sloc_elocs[i[0]][i[1]][user_common_conn1] + user_eloc_slocs[i[0]][i[2]][user_common_conn1] 113 | user_sloc_eloc_common_conn1_count.append(user_common_conn1_count) 114 | # 获取地址对在用户历史记录中共有的连接点数目(出发点<-xx<-目的地) 115 | user_common_conn2_count = 0 116 | if (i[0] in user_eloc_slocs) and (i[1] in user_eloc_slocs[i[0]]) and (i[0] in user_sloc_elocs) and (i[2] in user_sloc_elocs[i[0]]): 117 | user_sloc_eloc_common_conn2_set = user_eloc_slocs[i[0]][i[1]].keys() & user_sloc_elocs[i[0]][i[2]].keys() 118 | for user_common_conn2 in user_sloc_eloc_common_conn2_set: 119 | user_common_conn2_count = user_common_conn2_count + user_eloc_slocs[i[0]][i[1]][user_common_conn2] + user_sloc_elocs[i[0]][i[2]][user_common_conn2] 120 | user_sloc_eloc_common_conn2_count.append(user_common_conn2_count) 121 | user_loc_filter = pd.DataFrame({"userid": user_list, "geohashed_start_loc": user_sloc_list, "geohashed_end_loc": user_eloc_list, "user_sloc_eloc_common_eloc_count": user_sloc_eloc_common_eloc_count, "user_sloc_eloc_common_sloc_count": user_sloc_eloc_common_sloc_count, "user_sloc_eloc_common_conn1_count": user_sloc_eloc_common_conn1_count, "user_sloc_eloc_common_conn2_count": user_sloc_eloc_common_conn2_count}) 122 | result = pd.merge(result, user_loc_filter, on=['userid', 'geohashed_start_loc', 'geohashed_end_loc'], how='left') 123 | result['user_sloc_eloc_common_eloc_rate'] = result['user_sloc_eloc_common_eloc_count']/(result['user_sloc_count']+result['user_eloc_as_sloc_count']) 124 | result['user_sloc_eloc_common_sloc_rate'] = result['user_sloc_eloc_common_sloc_count']/(result['user_sloc_as_eloc_count']+result['user_eloc_count']) 125 | result['user_sloc_eloc_common_conn1_rate'] = result['user_sloc_eloc_common_conn1_count']/(result['user_sloc_count']+result['user_eloc_count']) 126 | result['user_sloc_eloc_common_conn2_rate'] = result['user_sloc_eloc_common_conn2_count']/(result['user_sloc_as_eloc_count']+result['user_eloc_as_sloc_count']) 127 | return result -------------------------------------------------------------------------------- /feature/latlon.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | import pandas as pd 3 | import Geohash as geohash 4 | import numpy as np 5 | import os 6 | os.path.join('..') 7 | from utils import rank 8 | 9 | ''' 10 | 获取经纬度特征 11 | ''' 12 | 13 | # ----------------- 经纬度 ------------------- 14 | 15 | # 获取目的地的经纬度 16 | def get_eloc_latlon(result): 17 | eloc_latlon = result['geohashed_end_loc'].apply(lambda x: geohash.decode_exactly(x)[:2]) 18 | result['eloc_lat'] = eloc_latlon.apply(lambda x: float(x[0])) 19 | result['eloc_lon'] = eloc_latlon.apply(lambda x: float(x[1])) 20 | return result 21 | 22 | # 获取出发地的经纬度 23 | def get_sloc_latlon(result): 24 | sloc_latlon = result['geohashed_start_loc'].apply(lambda x: geohash.decode_exactly(x)[:2]) 25 | result['sloc_lat'] = sloc_latlon.apply(lambda x: float(x[0])) 26 | result['sloc_lon'] = sloc_latlon.apply(lambda x: float(x[1])) 27 | return result 28 | 29 | # ----------------- 方向 ------------------- 30 | 31 | # 获取出发地与目的地的经纬度差 32 | def get_eloc_sloc_latlon_sub(result): 33 | # sloc_latlon = result['geohashed_start_loc'].apply(lambda x: geohash.decode_exactly(x)[:2]) 34 | # sloc_lat = sloc_latlon.apply(lambda x: float(x[0])) 35 | # sloc_lon = sloc_latlon.apply(lambda x: float(x[1])) 36 | # eloc_latlon = result['geohashed_end_loc'].apply(lambda x: geohash.decode_exactly(x)[:2]) 37 | # eloc_lat = eloc_latlon.apply(lambda x: float(x[0])) 38 | # eloc_lon = eloc_latlon.apply(lambda x: float(x[1])) 39 | result['eloc_sloc_lat_sub'] = result['eloc_lat'] - result['sloc_lat'] 40 | result['eloc_sloc_lon_sub'] = result['eloc_lon'] - result['sloc_lon'] 41 | return result 42 | 43 | # 获取出发地与目的地的斜率 44 | def get_eloc_sloc_slope(result): 45 | result['eloc_sloc_latlon_slope'] = result['eloc_sloc_lat_sub'] / result['eloc_sloc_lon_sub'] 46 | return result 47 | 48 | # 获取经纬度差与距离的商 49 | def get_eloc_sloc_latlon_sub_divide_distance(result): 50 | result['eloc_sloc_lat_sub_divide_distance'] = result['eloc_sloc_lat_sub'] / result['distance'] 51 | result['eloc_sloc_lon_sub_divide_distance'] = result['eloc_sloc_lon_sub'] / result['distance'] 52 | result['eloc_sloc_lat_sub_divide_manhattan'] = result['eloc_sloc_lat_sub'] / result['manhattan'] 53 | result['eloc_sloc_lon_sub_divide_manhattan'] = result['eloc_sloc_lon_sub'] / result['manhattan'] 54 | return result 55 | 56 | # 获取方向角 57 | def get_bearing_array(result): 58 | result['degree'] = np.arctan2(result['eloc_sloc_lat_sub'], result['eloc_sloc_lon_sub']) 59 | return result 60 | 61 | # ----------------- 统计 ------------------- 62 | 63 | # 获取用户出发的距离统计 64 | def get_user_latlon_sub_stat(train, result): 65 | train = train[~train.geohashed_end_loc.isnull()] 66 | user_lat_sub_stat = train.groupby(['userid', 'geohashed_start_loc'], as_index=False)['eloc_sloc_lat_sub'].agg({'user_lat_sub_max': 'max', 'user_lat_sub_min': 'min', 'user_lat_sub_mean': 'mean'}) 67 | result = pd.merge(result, user_lat_sub_stat, on=['userid', 'geohashed_start_loc'], how='left') 68 | user_lon_sub_stat = train.groupby(['userid', 'geohashed_start_loc'], as_index=False)['eloc_sloc_lon_sub'].agg({'user_lon_sub_max': 'max', 'user_lon_sub_min': 'min', 'user_lon_sub_mean': 'mean'}) 69 | result = pd.merge(result, user_lon_sub_stat, on=['userid', 'geohashed_start_loc'], how='left') 70 | return result 71 | 72 | # 获取用户从某个地点出发的距离统计 73 | def get_user_sloc_latlon_sub_stat(train, result): 74 | train = train[~train.geohashed_end_loc.isnull()] 75 | user_sloc_lat_sub_stat = train.groupby(['userid', 'geohashed_start_loc'], as_index=False)['eloc_sloc_lat_sub'].agg({'user_sloc_lat_sub_max': 'max', 'user_sloc_lat_sub_min': 'min', 'user_sloc_lat_sub_mean': 'mean'}) # 8 7 kong 76 | # user_sloc_lat_sub_stat = train.groupby(['userid', 'geohashed_start_loc'], as_index=False)['eloc_sloc_lat_sub'].agg({'user_sloc_lat_sub_mean': 'mean'}) 77 | result = pd.merge(result, user_sloc_lat_sub_stat, on=['userid', 'geohashed_start_loc'], how='left') 78 | user_sloc_lon_sub_stat = train.groupby(['userid', 'geohashed_start_loc'], as_index=False)['eloc_sloc_lon_sub'].agg({'user_sloc_lon_sub_max': 'max', 'user_sloc_lon_sub_min': 'min', 'user_sloc_lon_sub_mean': 'mean'}) 79 | result = pd.merge(result, user_sloc_lon_sub_stat, on=['userid', 'geohashed_start_loc'], how='left') 80 | return result 81 | 82 | # 获取用户到某个地点结束的距离统计 83 | def get_user_eloc_latlon_sub_stat(train, result): 84 | train = train[~train.geohashed_end_loc.isnull()] 85 | user_eloc_lat_sub_stat = train.groupby(['userid', 'geohashed_end_loc'], as_index=False)['eloc_sloc_lat_sub'].agg({'user_eloc_lat_sub_max': 'max', 'user_eloc_lat_sub_min': 'min', 'user_eloc_lat_sub_mean': 'mean'}) 86 | result = pd.merge(result, user_eloc_lat_sub_stat, on=['userid', 'geohashed_end_loc'], how='left') 87 | user_eloc_lon_sub_stat = train.groupby(['userid', 'geohashed_end_loc'], as_index=False)['eloc_sloc_lon_sub'].agg({'user_eloc_lon_sub_max': 'max', 'user_eloc_lon_sub_min': 'min', 'user_eloc_lon_sub_mean': 'mean'}) 88 | result = pd.merge(result, user_eloc_lon_sub_stat, on=['userid', 'geohashed_end_loc'], how='left') 89 | return result 90 | 91 | # 获取用户从某个地点出发的小时段距离统计 92 | def get_user_sloc_hour_latlon_sub_stat(train, result): 93 | train = train[~train.geohashed_end_loc.isnull()] 94 | user_sloc_hour_lat_sub_stat = train.groupby(['userid', 'geohashed_start_loc', 'hour'], as_index=False)['eloc_sloc_lat_sub'].agg({'user_sloc_hour_lat_sub_max': 'max', 'user_sloc_hour_lat_sub_min': 'min', 'user_sloc_hour_lat_sub_mean': 'mean'}) 95 | result = pd.merge(result, user_sloc_hour_lat_sub_stat, on=['userid', 'geohashed_start_loc', 'hour'], how='left') 96 | user_sloc_hour_lon_sub_stat = train.groupby(['userid', 'geohashed_start_loc', 'hour'], as_index=False)['eloc_sloc_lon_sub'].agg({'user_sloc_hour_lon_sub_max': 'max', 'user_sloc_hour_lon_sub_min': 'min', 'user_sloc_hour_lon_sub_mean': 'mean'}) 97 | result = pd.merge(result, user_sloc_hour_lon_sub_stat, on=['userid', 'geohashed_start_loc', 'hour'], how='left') 98 | return result 99 | 100 | # 获取用户到某个地点结束的小时段距离统计 101 | def get_user_eloc_hour_latlon_sub_stat(train, result): 102 | train = train[~train.geohashed_end_loc.isnull()] 103 | user_eloc_hour_lat_sub_stat = train.groupby(['userid', 'geohashed_end_loc', 'hour'], as_index=False)['eloc_sloc_lat_sub'].agg({'user_eloc_hour_lat_sub_max': 'max', 'user_eloc_hour_lat_sub_min': 'min', 'user_eloc_hour_lat_sub_mean': 'mean'}) # 4 4 6 104 | result = pd.merge(result, user_eloc_hour_lat_sub_stat, on=['userid', 'geohashed_end_loc', 'hour'], how='left') 105 | user_eloc_hour_lon_sub_stat = train.groupby(['userid', 'geohashed_end_loc', 'hour'], as_index=False)['eloc_sloc_lon_sub'].agg({'user_eloc_hour_lon_sub_max': 'max', 'user_eloc_hour_lon_sub_min': 'min', 'user_eloc_hour_lon_sub_mean': 'mean'}) # 2 7 6 106 | result = pd.merge(result, user_eloc_hour_lon_sub_stat, on=['userid', 'geohashed_end_loc', 'hour'], how='left') 107 | return result 108 | 109 | # 获取从某个地点出发的距离统计 110 | def get_sloc_latlon_sub_stat(train, result): 111 | train = train[~train.geohashed_end_loc.isnull()] 112 | sloc_lat_sub_stat = train.groupby(['geohashed_start_loc'], as_index=False)['eloc_sloc_lat_sub'].agg({'sloc_lat_sub_max': 'max', 'sloc_lat_sub_min': 'min', 'sloc_lat_sub_mean': 'mean'}) 113 | result = pd.merge(result, sloc_lat_sub_stat, on=['geohashed_start_loc'], how='left') 114 | sloc_lon_sub_stat = train.groupby(['geohashed_start_loc'], as_index=False)['eloc_sloc_lon_sub'].agg({'sloc_lon_sub_max': 'max', 'sloc_lon_sub_min': 'min', 'sloc_lon_sub_mean': 'mean'}) 115 | result = pd.merge(result, sloc_lon_sub_stat, on=['geohashed_start_loc'], how='left') 116 | return result 117 | 118 | # 获取到某个地点结束的距离统计 119 | def get_eloc_latlon_sub_stat(train, result): 120 | train = train[~train.geohashed_end_loc.isnull()] 121 | eloc_lat_sub_stat = train.groupby(['geohashed_end_loc'], as_index=False)['eloc_sloc_lat_sub'].agg({'eloc_lat_sub_max': 'max', 'eloc_lat_sub_min': 'min', 'eloc_lat_sub_mean': 'mean'}) 122 | result = pd.merge(result, eloc_lat_sub_stat, on=['geohashed_end_loc'], how='left') 123 | eloc_lon_sub_stat = train.groupby(['geohashed_end_loc'], as_index=False)['eloc_sloc_lon_sub'].agg({'eloc_lon_sub_max': 'max', 'eloc_lon_sub_min': 'min', 'eloc_lon_sub_mean': 'mean'}) 124 | result = pd.merge(result, eloc_lon_sub_stat, on=['geohashed_end_loc'], how='left') 125 | return result 126 | 127 | # 获取从某个地点出发的小时段距离统计 128 | def get_sloc_hour_latlon_sub_stat(train, result): 129 | train = train[~train.geohashed_end_loc.isnull()] 130 | sloc_hour_lat_sub_stat = train.groupby(['geohashed_start_loc', 'hour'], as_index=False)['eloc_sloc_lat_sub'].agg({'sloc_hour_lat_sub_max': 'max', 'sloc_hour_lat_sub_min': 'min', 'sloc_hour_lat_sub_mean': 'mean'}) 131 | result = pd.merge(result, sloc_hour_lat_sub_stat, on=['geohashed_start_loc', 'hour'], how='left') 132 | sloc_hour_lon_sub_stat = train.groupby(['geohashed_start_loc', 'hour'], as_index=False)['eloc_sloc_lon_sub'].agg({'sloc_hour_lon_sub_max': 'max', 'sloc_hour_lon_sub_min': 'min', 'sloc_hour_lon_sub_mean': 'mean'}) 133 | result = pd.merge(result, sloc_hour_lon_sub_stat, on=['geohashed_start_loc', 'hour'], how='left') 134 | return result 135 | 136 | # 获取到某个地点结束的小时段距离统计 137 | def get_eloc_hour_latlon_sub_stat(train, result): 138 | train = train[~train.geohashed_end_loc.isnull()] 139 | eloc_hour_lat_sub_stat = train.groupby(['geohashed_end_loc', 'hour'], as_index=False)['eloc_sloc_lat_sub'].agg({'eloc_hour_lat_sub_max': 'max', 'eloc_hour_lat_sub_min': 'min', 'eloc_hour_lat_sub_mean': 'mean'}) 140 | result = pd.merge(result, eloc_hour_lat_sub_stat, on=['geohashed_end_loc', 'hour'], how='left') 141 | eloc_hour_lon_sub_stat = train.groupby(['geohashed_end_loc', 'hour'], as_index=False)['eloc_sloc_lon_sub'].agg({'eloc_hour_lon_sub_max': 'max', 'eloc_hour_lon_sub_min': 'min', 'eloc_hour_lon_sub_mean': 'mean'}) 142 | result = pd.merge(result, eloc_hour_lon_sub_stat, on=['geohashed_end_loc', 'hour'], how='left') 143 | return result 144 | 145 | # ----------------- 排序 ------------------- 146 | 147 | # 获取用户出行距离的排序 148 | def get_user_latlon_sub_rank(result): 149 | result = rank(result, 'userid', 'eloc_sloc_lat_sub', rank_name='user_lat_sub_rank', ascending=False) 150 | result = rank(result, 'userid', 'eloc_sloc_lon_sub', rank_name='user_lon_sub_rank', ascending=False) 151 | return result 152 | 153 | # 获取用户到某个目的地的距离排序 154 | def get_user_eloc_latlon_sub_rank(result): 155 | result = rank(result, ['userid', 'geohashed_end_loc'], 'eloc_sloc_lat_sub', rank_name='user_eloc_lat_sub_rank', ascending=False) 156 | result = rank(result, ['userid', 'geohashed_end_loc'], 'eloc_sloc_lon_sub', rank_name='user_eloc_lon_sub_rank', ascending=False) 157 | return result 158 | 159 | # 获取用户从某个地点出发的距离排序 160 | def get_user_sloc_latlon_sub_rank(result): 161 | result = rank(result, ['userid', 'geohashed_start_loc'], 'eloc_sloc_lat_sub', rank_name='user_sloc_lat_sub_rank', ascending=False) 162 | result = rank(result, ['userid', 'geohashed_start_loc'], 'eloc_sloc_lon_sub', rank_name='user_sloc_lon_sub_rank', ascending=False) 163 | return result 164 | 165 | # 获取用户到某个目的地的小时段距离排序 166 | def get_user_eloc_hour_latlon_sub_rank(result): 167 | result = rank(result, ['userid', 'geohashed_end_loc', 'hour'], 'eloc_sloc_lat_sub', rank_name='user_eloc_hour_lat_sub_rank', ascending=False) 168 | result = rank(result, ['userid', 'geohashed_end_loc', 'hour'], 'eloc_sloc_lon_sub', rank_name='user_eloc_hour_lon_sub_rank', ascending=False) 169 | return result 170 | 171 | # 获取从某个目的地出发的小时段距离排序 172 | def get_user_sloc_hour_latlon_sub_rank(result): 173 | result = rank(result, ['userid', 'geohashed_start_loc', 'hour'], 'eloc_sloc_lat_sub', rank_name='user_sloc_hour_lat_sub_rank', ascending=False) 174 | result = rank(result, ['userid', 'geohashed_start_loc', 'hour'], 'eloc_sloc_lon_sub', rank_name='user_sloc_hour_lon_sub_rank', ascending=False) 175 | return result 176 | 177 | # 获取到某个目的地的距离排序 178 | def get_eloc_latlon_sub_rank(result): 179 | result = rank(result, 'geohashed_end_loc', 'eloc_sloc_lat_sub', rank_name='eloc_lat_sub_rank', ascending=False) 180 | result = rank(result, 'geohashed_end_loc', 'eloc_sloc_lon_sub', rank_name='eloc_lon_sub_rank', ascending=False) 181 | return result 182 | 183 | # 获取从某个地点出发的距离排序 184 | def get_sloc_latlon_sub_rank(result): 185 | result = rank(result, 'geohashed_start_loc', 'eloc_sloc_lat_sub', rank_name='sloc_lat_sub_rank', ascending=False) 186 | result = rank(result, 'geohashed_start_loc', 'eloc_sloc_lon_sub', rank_name='sloc_lon_sub_rank', ascending=False) 187 | return result 188 | 189 | # 获取到某个目的地的小时段距离排序 190 | def get_eloc_hour_latlon_sub_rank(result): 191 | result = rank(result, ['geohashed_end_loc', 'hour'], 'eloc_sloc_lat_sub', rank_name='eloc_hour_lat_sub_rank', ascending=False) 192 | result = rank(result, ['geohashed_end_loc', 'hour'], 'eloc_sloc_lon_sub', rank_name='eloc_hour_lon_sub_rank', ascending=False) 193 | return result 194 | 195 | # 获取从某个目的地出发的小时段距离排序 196 | def get_sloc_hour_latlon_sub_rank(result): 197 | result = rank(result, ['geohashed_start_loc', 'hour'], 'eloc_sloc_lat_sub', rank_name='sloc_hour_lat_sub_rank', ascending=False) 198 | result = rank(result, ['geohashed_start_loc', 'hour'], 'eloc_sloc_lon_sub', rank_name='sloc_hour_lon_sub_rank', ascending=False) 199 | return result 200 | 201 | # ----------------- 交叉 ------------------- 202 | 203 | # 获取距离与用户出行距离统计值的(绝对)差值 204 | def get_user_latlon_sub_stat_sub(result): 205 | result['user_lat_sub_mean_sub'] = (result['distance'] - result['user_lat_sub_mean']) 206 | result['user_lon_sub_mean_sub'] = (result['distance'] - result['user_lon_sub_mean']) 207 | result['user_lat_sub_mean_sub_abs'] = (result['distance'] - result['user_lat_sub_mean']).abs() # 6 208 | result['user_lon_sub_mean_sub_abs'] = (result['distance'] - result['user_lon_sub_mean']).abs() # 1 209 | return result 210 | 211 | # 获取距离与用户从某个点出发距离统计值的(绝对)差值 212 | def get_user_sloc_latlon_sub_stat_sub(result): 213 | # result['user_sloc_lat_sub_mean_sub'] = (result['distance'] - result['user_sloc_lat_sub_mean']) # 0 214 | result['user_sloc_lon_sub_mean_sub'] = (result['distance'] - result['user_sloc_lon_sub_mean']) # 2 215 | # result['user_sloc_lat_sub_mean_sub_abs'] = (result['distance'] - result['user_sloc_lat_sub_mean']).abs() # 0 216 | # result['user_sloc_lon_sub_mean_sub_abs'] = (result['distance'] - result['user_sloc_lon_sub_mean']).abs() # 0 217 | return result 218 | 219 | # 获取距离与用户到某个点结束距离统计值的(绝对)差值 220 | def get_user_eloc_latlon_sub_stat_sub(result): 221 | result['user_eloc_lat_sub_mean_sub'] = (result['distance'] - result['user_eloc_lat_sub_mean']) 222 | result['user_eloc_lon_sub_mean_sub'] = (result['distance'] - result['user_eloc_lon_sub_mean']) 223 | result['user_eloc_lat_sub_mean_sub_abs'] = (result['distance'] - result['user_eloc_lat_sub_mean']).abs() 224 | result['user_eloc_lon_sub_mean_sub_abs'] = (result['distance'] - result['user_eloc_lon_sub_mean']).abs() 225 | return result 226 | 227 | # 获取距离与用户从某个点出发距离统计值的各小时段(绝对)差值 228 | def get_user_sloc_hour_latlon_sub_stat_sub(result): 229 | result['user_sloc_hour_lat_sub_mean_sub'] = (result['distance'] - result['user_sloc_hour_lat_sub_mean']) 230 | result['user_sloc_hour_lon_sub_mean_sub'] = (result['distance'] - result['user_sloc_hour_lon_sub_mean']) 231 | result['user_sloc_hour_lat_sub_mean_sub_abs'] = (result['distance'] - result['user_sloc_hour_lat_sub_mean']).abs() # 5 232 | result['user_sloc_hour_lon_sub_mean_sub_abs'] = (result['distance'] - result['user_sloc_hour_lon_sub_mean']).abs() # 8 233 | return result 234 | 235 | # 获取距离与用户到某个点结束距离统计值的各小时段(绝对)差值 236 | def get_user_eloc_hour_latlon_sub_stat_sub(result): 237 | result['user_eloc_hour_lat_sub_mean_sub'] = (result['distance'] - result['user_eloc_hour_lat_sub_mean']) # 43 238 | result['user_eloc_hour_lon_sub_mean_sub'] = (result['distance'] - result['user_eloc_hour_lon_sub_mean']) # 18 239 | # result['user_eloc_hour_lat_sub_mean_sub_abs'] = (result['distance'] - result['user_eloc_hour_lat_sub_mean']).abs() # 0 240 | result['user_eloc_hour_lon_sub_mean_sub_abs'] = (result['distance'] - result['user_eloc_hour_lon_sub_mean']).abs() # 3 241 | return result 242 | 243 | # 获取距离与从某个点出发距离统计值的(绝对)差值 244 | def get_sloc_latlon_sub_stat_sub(result): 245 | result['sloc_lat_sub_mean_sub'] = (result['distance'] - result['sloc_lat_sub_mean']) 246 | result['sloc_lon_sub_mean_sub'] = (result['distance'] - result['sloc_lon_sub_mean']) 247 | result['sloc_lat_sub_mean_sub_abs'] = (result['distance'] - result['sloc_lat_sub_mean']).abs() # 4 248 | result['sloc_lon_sub_mean_sub_abs'] = (result['distance'] - result['sloc_lon_sub_mean']).abs() # 4 249 | return result 250 | 251 | # 获取距离与到某个点结束距离统计值的(绝对)差值 252 | def get_eloc_latlon_sub_stat_sub(result): 253 | result['eloc_lat_sub_mean_sub'] = (result['distance'] - result['eloc_lat_sub_mean']) 254 | result['eloc_lon_sub_mean_sub'] = (result['distance'] - result['eloc_lon_sub_mean']) 255 | result['eloc_lat_sub_mean_sub_abs'] = (result['distance'] - result['eloc_lat_sub_mean']).abs() # 7 256 | result['eloc_lon_sub_mean_sub_abs'] = (result['distance'] - result['eloc_lon_sub_mean']).abs() 257 | return result 258 | 259 | # 获取距离与从某个点出发距离统计值的各小时段(绝对)差值 260 | def get_sloc_hour_latlon_sub_stat_sub(result): 261 | result['sloc_hour_lat_sub_mean_sub'] = (result['distance'] - result['sloc_hour_lat_sub_mean']) 262 | result['sloc_hour_lon_sub_mean_sub'] = (result['distance'] - result['sloc_hour_lon_sub_mean']) 263 | result['sloc_hour_lat_sub_mean_sub_abs'] = (result['distance'] - result['sloc_hour_lat_sub_mean']).abs() # 7 264 | result['sloc_hour_lon_sub_mean_sub_abs'] = (result['distance'] - result['sloc_hour_lon_sub_mean']).abs() # 6 265 | return result 266 | 267 | # 获取距离与到某个点结束距离统计值的各小时段(绝对)差值 268 | def get_eloc_hour_latlon_sub_stat_sub(result): 269 | result['eloc_hour_lat_sub_mean_sub'] = (result['distance'] - result['eloc_hour_lat_sub_mean']) 270 | result['eloc_hour_lon_sub_mean_sub'] = (result['distance'] - result['eloc_hour_lon_sub_mean']) 271 | result['eloc_hour_lat_sub_mean_sub_abs'] = (result['distance'] - result['eloc_hour_lat_sub_mean']).abs() # 9 272 | result['eloc_hour_lon_sub_mean_sub_abs'] = (result['distance'] - result['eloc_hour_lon_sub_mean']).abs() 273 | return result -------------------------------------------------------------------------------- /feature/leak.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | import pandas as pd 3 | import numpy as np 4 | from .other import get_distance 5 | 6 | ''' 7 | 获取Leak特征 8 | ''' 9 | 10 | # 获取目标地点与用户上一次、下一次出发地点的距离、时间差和速度 11 | def get_eloc_user_sloc_leak(result): 12 | user_order = result[['orderid', 'userid', 'geohashed_start_loc', 'starttime', 'sloc_lat', 'sloc_lon']].drop_duplicates() 13 | user_order.sort_values(by=['userid', 'starttime'], inplace=True, ascending=True) 14 | user_order['last_userid'] = user_order.userid.shift(1) 15 | user_order['next_userid'] = user_order.userid.shift(-1) 16 | user_order['last_starttime'] = user_order.starttime.shift(1) 17 | user_order['user_last_order_time_diff'] = (pd.DatetimeIndex(user_order['starttime'])-pd.DatetimeIndex(user_order['last_starttime'])).total_seconds() 18 | user_order['next_starttime'] = user_order.starttime.shift(-1) 19 | user_order['user_next_order_time_diff'] = (pd.DatetimeIndex(user_order['starttime'])-pd.DatetimeIndex(user_order['next_starttime'])).total_seconds() 20 | user_order['last_sloc'] = user_order.geohashed_start_loc.shift(1) 21 | user_order['next_sloc'] = user_order.geohashed_start_loc.shift(-1) 22 | user_order['last_sloc_lat'] = user_order.sloc_lat.shift(1) 23 | user_order['next_sloc_lat'] = user_order.sloc_lat.shift(-1) 24 | user_order['last_sloc_lon'] = user_order.sloc_lon.shift(1) 25 | user_order['next_sloc_lon'] = user_order.sloc_lon.shift(-1) 26 | user_order.drop(['geohashed_start_loc', 'starttime', 'last_starttime', 'next_starttime', 'sloc_lat', 'sloc_lon'], axis=1, inplace=True) 27 | 28 | restmp = pd.merge(result[['orderid', 'geohashed_end_loc', 'eloc_lat', 'eloc_lon']], user_order, on='orderid', how='left') 29 | restmp.rename(columns={'last_sloc': 'geohashed_start_loc'}, inplace=True) 30 | distance = get_distance(restmp) 31 | restmp['eloc_user_last_sloc_distance'] = distance['distance'] 32 | restmp['eloc_user_last_sloc_manhattan'] = distance['manhattan'] 33 | 34 | restmp['eloc_user_last_sloc_lat_sub'] = restmp.eloc_lat - restmp.last_sloc_lat 35 | restmp['eloc_user_last_sloc_lon_sub'] = restmp.eloc_lon - restmp.last_sloc_lon 36 | restmp.drop(['geohashed_start_loc', 'distance', 'last_sloc_lat', 'last_sloc_lon', 'manhattan'], axis=1, inplace=True) 37 | restmp['eloc_user_last_sloc_speed'] = restmp.eloc_user_last_sloc_distance / restmp.user_last_order_time_diff 38 | restmp['eloc_user_last_sloc_manhattan_speed'] = restmp.eloc_user_last_sloc_manhattan / restmp.user_last_order_time_diff 39 | restmp['eloc_user_last_sloc_latlon_slope'] = restmp.eloc_user_last_sloc_lat_sub / restmp.eloc_user_last_sloc_lon_sub 40 | restmp['eloc_user_last_sloc_lat_sub_divide_distance'] = restmp.eloc_user_last_sloc_lat_sub / restmp.eloc_user_last_sloc_distance 41 | restmp['eloc_user_last_sloc_lon_sub_divide_distance'] = restmp.eloc_user_last_sloc_lon_sub / restmp.eloc_user_last_sloc_distance 42 | 43 | restmp.rename(columns={'next_sloc': 'geohashed_start_loc'}, inplace=True) 44 | distance = get_distance(restmp) 45 | restmp['eloc_user_next_sloc_distance'] = distance['distance'] 46 | restmp['eloc_user_next_sloc_manhattan'] = distance['manhattan'] 47 | restmp['eloc_user_next_sloc_lat_sub'] = restmp.eloc_lat - restmp.next_sloc_lat 48 | restmp['eloc_user_next_sloc_lon_sub'] = restmp.eloc_lon - restmp.next_sloc_lon 49 | restmp.drop(['geohashed_start_loc', 'distance', 'next_sloc_lat', 'next_sloc_lon', 'manhattan'], axis=1, inplace=True) 50 | restmp['eloc_user_next_sloc_speed'] = restmp.eloc_user_next_sloc_distance / restmp.user_next_order_time_diff 51 | restmp['eloc_user_next_sloc_manhattan_speed'] = restmp.eloc_user_next_sloc_manhattan / restmp.user_next_order_time_diff 52 | restmp['eloc_user_next_sloc_latlon_slope'] = restmp.eloc_user_next_sloc_lat_sub / restmp.eloc_user_next_sloc_lon_sub 53 | restmp['eloc_user_next_sloc_lat_sub_divide_distance'] = restmp.eloc_user_next_sloc_lat_sub / restmp.eloc_user_next_sloc_distance 54 | restmp['eloc_user_next_sloc_lon_sub_divide_distance'] = restmp.eloc_user_next_sloc_lon_sub / restmp.eloc_user_next_sloc_distance 55 | 56 | restmp.loc[restmp.userid != restmp.last_userid, 'user_last_order_time_diff'] = -1000000 57 | restmp.loc[restmp.userid != restmp.last_userid, 'eloc_user_last_sloc_distance'] = -1000000 58 | restmp.loc[restmp.userid != restmp.last_userid, 'eloc_user_last_sloc_manhattan'] = -1000000 59 | restmp.loc[restmp.userid != restmp.last_userid, 'eloc_user_last_sloc_speed'] = -1000000 60 | restmp.loc[restmp.userid != restmp.last_userid, 'eloc_user_last_sloc_manhattan_speed'] = -1000000 61 | restmp.loc[restmp.userid != restmp.last_userid, 'eloc_user_last_sloc_lat_sub'] = -1000000 62 | restmp.loc[restmp.userid != restmp.last_userid, 'eloc_user_last_sloc_lon_sub'] = -1000000 63 | restmp.loc[restmp.userid != restmp.last_userid, 'eloc_user_last_sloc_latlon_slope'] = -1000000 64 | restmp.loc[restmp.userid != restmp.last_userid, 'eloc_user_last_sloc_lat_sub_divide_distance'] = -1000000 65 | restmp.loc[restmp.userid != restmp.last_userid, 'eloc_user_last_sloc_lon_sub_divide_distance'] = -1000000 66 | 67 | restmp.loc[restmp.userid != restmp.last_userid, 'user_next_order_time_diff'] = -1000000 68 | restmp.loc[restmp.userid != restmp.next_userid, 'eloc_user_next_sloc_distance'] = -1000000 69 | restmp.loc[restmp.userid != restmp.next_userid, 'eloc_user_next_sloc_manhattan'] = -1000000 70 | restmp.loc[restmp.userid != restmp.next_userid, 'eloc_user_next_sloc_speed'] = -1000000 71 | restmp.loc[restmp.userid != restmp.next_userid, 'eloc_user_next_sloc_manhattan_speed'] = -1000000 72 | restmp.loc[restmp.userid != restmp.next_userid, 'eloc_user_next_sloc_lat_sub'] = -1000000 73 | restmp.loc[restmp.userid != restmp.next_userid, 'eloc_user_next_sloc_lon_sub'] = -1000000 74 | restmp.loc[restmp.userid != restmp.next_userid, 'eloc_user_next_sloc_latlon_slope'] = -1000000 75 | restmp.loc[restmp.userid != restmp.next_userid, 'eloc_user_next_sloc_lat_sub_divide_distance'] = -1000000 76 | restmp.loc[restmp.userid != restmp.next_userid, 'eloc_user_next_sloc_lon_sub_divide_distance'] = -1000000 77 | 78 | result['user_last_order_time_diff'] = restmp['user_last_order_time_diff'] # wc 79 | result['eloc_user_last_sloc_distance'] = restmp['eloc_user_last_sloc_distance'] # dui 90 wc 80 | result['eloc_user_last_sloc_manhattan'] = restmp['eloc_user_last_sloc_manhattan'] 81 | result['eloc_user_last_sloc_speed'] = restmp['eloc_user_last_sloc_speed'] # dui 90 wc 82 | result['eloc_user_last_sloc_manhattan_speed'] = restmp['eloc_user_last_sloc_manhattan_speed'] 83 | result['eloc_user_last_sloc_lat_sub'] = restmp['eloc_user_last_sloc_lat_sub'] 84 | result['eloc_user_last_sloc_lon_sub'] = restmp['eloc_user_last_sloc_lon_sub'] 85 | result['eloc_user_last_sloc_latlon_slope'] = restmp['eloc_user_last_sloc_latlon_slope'] 86 | result['eloc_user_last_sloc_lat_sub_divide_distance'] = restmp['eloc_user_last_sloc_lat_sub_divide_distance'] 87 | result['eloc_user_last_sloc_lon_sub_divide_distance'] = restmp['eloc_user_last_sloc_lon_sub_divide_distance'] 88 | 89 | result['user_next_order_time_diff'] = restmp['user_next_order_time_diff'] 90 | result['eloc_user_next_sloc_distance'] = restmp['eloc_user_next_sloc_distance'] # dui 90 wc 91 | result['eloc_user_next_sloc_manhattan'] = restmp['eloc_user_next_sloc_manhattan'] # wc 92 | result['eloc_user_next_sloc_speed'] = restmp['eloc_user_next_sloc_speed'] # dui 90 wc 93 | result['eloc_user_next_sloc_manhattan_speed'] = restmp['eloc_user_next_sloc_manhattan_speed'] # wc 94 | result['eloc_user_next_sloc_lat_sub'] = restmp['eloc_user_next_sloc_lat_sub'] # dui 90 wc 95 | result['eloc_user_next_sloc_lon_sub'] = restmp['eloc_user_next_sloc_lon_sub'] # dui 90 wc 96 | result['eloc_user_next_sloc_latlon_slope'] = restmp['eloc_user_next_sloc_latlon_slope'] 97 | result['eloc_user_next_sloc_lat_sub_divide_distance'] = restmp['eloc_user_next_sloc_lat_sub_divide_distance'] 98 | result['eloc_user_next_sloc_lon_sub_divide_distance'] = restmp['eloc_user_next_sloc_lon_sub_divide_distance'] 99 | 100 | return result 101 | 102 | # 获取目标地点与车辆上一次、下一次出发地点的距离、时间差、速度及经纬度信息等 103 | def get_eloc_bike_sloc_leak(result): 104 | bike_order = result[['orderid', 'bikeid', 'geohashed_start_loc', 'starttime', 'sloc_lat', 'sloc_lon']].drop_duplicates() 105 | bike_order.sort_values(by=['bikeid', 'starttime'], inplace=True, ascending=True) 106 | bike_order['last_bikeid'] = bike_order.bikeid.shift(1) 107 | bike_order['next_bikeid'] = bike_order.bikeid.shift(-1) 108 | bike_order['last_starttime'] = bike_order.starttime.shift(1) 109 | bike_order['bike_last_order_time_diff'] = (pd.DatetimeIndex(bike_order['starttime'])-pd.DatetimeIndex(bike_order['last_starttime'])).total_seconds() 110 | bike_order['next_starttime'] = bike_order.starttime.shift(-1) 111 | bike_order['bike_next_order_time_diff'] = (pd.DatetimeIndex(bike_order['starttime'])-pd.DatetimeIndex(bike_order['next_starttime'])).total_seconds() 112 | bike_order['last_sloc'] = bike_order.geohashed_start_loc.shift(1) 113 | bike_order['next_sloc'] = bike_order.geohashed_start_loc.shift(-1) 114 | bike_order['last_sloc_lat'] = bike_order.sloc_lat.shift(1) 115 | bike_order['next_sloc_lat'] = bike_order.sloc_lat.shift(-1) 116 | bike_order['last_sloc_lon'] = bike_order.sloc_lon.shift(1) 117 | bike_order['next_sloc_lon'] = bike_order.sloc_lon.shift(-1) 118 | bike_order.drop(['geohashed_start_loc', 'starttime', 'last_starttime', 'next_starttime', 'sloc_lat', 'sloc_lon'], axis=1, inplace=True) 119 | 120 | restmp = pd.merge(result[['orderid', 'geohashed_end_loc', 'eloc_lat', 'eloc_lon']], bike_order, on='orderid', how='left') 121 | restmp.rename(columns={'last_sloc': 'geohashed_start_loc'}, inplace=True) 122 | distance = get_distance(restmp) 123 | restmp['eloc_bike_last_sloc_distance'] = distance['distance'] 124 | restmp['eloc_bike_last_sloc_manhattan'] = distance['manhattan'] 125 | restmp['eloc_bike_last_sloc_lat_sub'] = restmp.eloc_lat - restmp.last_sloc_lat 126 | restmp['eloc_bike_last_sloc_lon_sub'] = restmp.eloc_lon - restmp.last_sloc_lon 127 | restmp.drop(['geohashed_start_loc', 'distance', 'last_sloc_lat', 'last_sloc_lon', 'manhattan'], axis=1, inplace=True) 128 | restmp['eloc_bike_last_sloc_speed'] = restmp.eloc_bike_last_sloc_distance / restmp.bike_last_order_time_diff 129 | restmp['eloc_bike_last_sloc_manhattan_speed'] = restmp.eloc_bike_last_sloc_manhattan / restmp.bike_last_order_time_diff 130 | restmp['eloc_bike_last_sloc_latlon_slope'] = restmp.eloc_bike_last_sloc_lat_sub / restmp.eloc_bike_last_sloc_lon_sub 131 | restmp['eloc_bike_last_sloc_lat_sub_divide_distance'] = restmp.eloc_bike_last_sloc_lat_sub / restmp.eloc_bike_last_sloc_distance 132 | restmp['eloc_bike_last_sloc_lon_sub_divide_distance'] = restmp.eloc_bike_last_sloc_lon_sub / restmp.eloc_bike_last_sloc_distance 133 | 134 | restmp.rename(columns={'next_sloc': 'geohashed_start_loc'}, inplace=True) 135 | distance = get_distance(restmp) 136 | restmp['eloc_bike_next_sloc_distance'] = distance['distance'] 137 | restmp['eloc_bike_next_sloc_manhattan'] = distance['manhattan'] 138 | restmp['eloc_bike_next_sloc_lat_sub'] = restmp.eloc_lat - restmp.next_sloc_lat 139 | restmp['eloc_bike_next_sloc_lon_sub'] = restmp.eloc_lon - restmp.next_sloc_lon 140 | restmp.drop(['geohashed_start_loc', 'distance', 'next_sloc_lat', 'next_sloc_lon', 'manhattan'], axis=1, inplace=True) 141 | restmp['eloc_bike_next_sloc_speed'] = restmp.eloc_bike_next_sloc_distance / restmp.bike_next_order_time_diff 142 | restmp['eloc_bike_next_sloc_manhattan_speed'] = restmp.eloc_bike_next_sloc_manhattan / restmp.bike_next_order_time_diff 143 | restmp['eloc_bike_next_sloc_latlon_slope'] = restmp.eloc_bike_next_sloc_lat_sub / restmp.eloc_bike_next_sloc_lon_sub 144 | restmp['eloc_bike_next_sloc_lat_sub_divide_distance'] = restmp.eloc_bike_next_sloc_lat_sub / restmp.eloc_bike_next_sloc_distance 145 | restmp['eloc_bike_next_sloc_lon_sub_divide_distance'] = restmp.eloc_bike_next_sloc_lon_sub / restmp.eloc_bike_next_sloc_distance 146 | 147 | restmp.loc[restmp.bikeid != restmp.last_bikeid, 'eloc_bike_last_sloc_distance'] = -1000000 148 | restmp.loc[restmp.bikeid != restmp.last_bikeid, 'eloc_bike_last_sloc_manhattan'] = -1000000 149 | restmp.loc[restmp.bikeid != restmp.last_bikeid, 'eloc_bike_last_sloc_speed'] = -1000000 150 | restmp.loc[restmp.bikeid != restmp.last_bikeid, 'eloc_bike_last_sloc_manhattan_speed'] = -1000000 151 | restmp.loc[restmp.bikeid != restmp.last_bikeid, 'eloc_bike_last_sloc_lat_sub'] = -1000000 152 | restmp.loc[restmp.bikeid != restmp.last_bikeid, 'eloc_bike_last_sloc_lon_sub'] = -1000000 153 | restmp.loc[restmp.bikeid != restmp.last_bikeid, 'eloc_bike_last_sloc_latlon_slope'] = -1000000 154 | restmp.loc[restmp.bikeid != restmp.last_bikeid, 'eloc_bike_last_sloc_lat_sub_divide_distance'] = -1000000 155 | restmp.loc[restmp.bikeid != restmp.last_bikeid, 'eloc_bike_last_sloc_lon_sub_divide_distance'] = -1000000 156 | 157 | restmp.loc[restmp.bikeid != restmp.next_bikeid, 'eloc_bike_next_sloc_distance'] = -1000000 158 | restmp.loc[restmp.bikeid != restmp.next_bikeid, 'eloc_bike_next_sloc_manhattan'] = -1000000 159 | restmp.loc[restmp.bikeid != restmp.next_bikeid, 'eloc_bike_next_sloc_speed'] = -1000000 160 | restmp.loc[restmp.bikeid != restmp.next_bikeid, 'eloc_bike_next_sloc_manhattan_speed'] = -1000000 161 | restmp.loc[restmp.bikeid != restmp.next_bikeid, 'eloc_bike_next_sloc_lat_sub'] = -1000000 162 | restmp.loc[restmp.bikeid != restmp.next_bikeid, 'eloc_bike_next_sloc_lon_sub'] = -1000000 163 | restmp.loc[restmp.bikeid != restmp.next_bikeid, 'eloc_bike_next_sloc_latlon_slope'] = -1000000 164 | restmp.loc[restmp.bikeid != restmp.next_bikeid, 'eloc_bike_next_sloc_lat_sub_divide_distance'] = -1000000 165 | restmp.loc[restmp.bikeid != restmp.next_bikeid, 'eloc_bike_next_sloc_lon_sub_divide_distance'] = -1000000 166 | 167 | result['eloc_bike_last_sloc_distance'] = restmp['eloc_bike_last_sloc_distance'] # dui 90 wc 168 | result['eloc_bike_last_sloc_manhattan'] = restmp['eloc_bike_last_sloc_manhattan'] 169 | result['eloc_bike_last_sloc_speed'] = restmp['eloc_bike_last_sloc_speed'] # dui 90 wc 170 | result['eloc_bike_last_sloc_manhattan_speed'] = restmp['eloc_bike_last_sloc_manhattan_speed'] 171 | result['eloc_bike_last_sloc_lat_sub'] = restmp['eloc_bike_last_sloc_lat_sub'] # 90 172 | result['eloc_bike_last_sloc_lon_sub'] = restmp['eloc_bike_last_sloc_lon_sub'] # 90 173 | result['eloc_bike_last_sloc_latlon_slope'] = restmp['eloc_bike_last_sloc_latlon_slope'] # 90 174 | result['eloc_bike_last_sloc_lat_sub_divide_distance'] = restmp['eloc_bike_last_sloc_lat_sub_divide_distance'] # 90 175 | result['eloc_bike_last_sloc_lon_sub_divide_distance'] = restmp['eloc_bike_last_sloc_lon_sub_divide_distance'] # 90 176 | 177 | result['eloc_bike_next_sloc_distance'] = restmp['eloc_bike_next_sloc_distance'] # dui 90 wc 178 | result['eloc_bike_next_sloc_manhattan'] = restmp['eloc_bike_next_sloc_manhattan'] 179 | result['eloc_bike_next_sloc_speed'] = restmp['eloc_bike_next_sloc_speed'] # dui 90 wc 180 | result['eloc_bike_next_sloc_manhattan_speed'] = restmp['eloc_bike_next_sloc_manhattan_speed'] 181 | result['eloc_bike_next_sloc_lat_sub'] = restmp['eloc_bike_next_sloc_lat_sub'] # 90 182 | result['eloc_bike_next_sloc_lon_sub'] = restmp['eloc_bike_next_sloc_lon_sub'] # 90 183 | result['eloc_bike_next_sloc_latlon_slope'] = restmp['eloc_bike_next_sloc_latlon_slope'] # 90 184 | result['eloc_bike_next_sloc_lat_sub_divide_distance'] = restmp['eloc_bike_next_sloc_lat_sub_divide_distance'] # 90 185 | result['eloc_bike_next_sloc_lon_sub_divide_distance'] = restmp['eloc_bike_next_sloc_lon_sub_divide_distance'] # 90 186 | 187 | return result -------------------------------------------------------------------------------- /feature/location.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | import pandas as pd 3 | import Geohash as geohash 4 | import numpy as np 5 | import os 6 | os.path.join('..') 7 | from utils import rank 8 | 9 | ''' 10 | 获取地理位置特征 11 | ''' 12 | 13 | # ----------------- 计数 ------------------- 14 | 15 | # 获取目的地的热度 16 | def get_eloc_count(train, result): 17 | train = train[~train.geohashed_end_loc.isnull()] 18 | eloc_count = train.groupby('geohashed_end_loc', as_index=False)['userid'].agg({'eloc_count': 'count'}) 19 | result = pd.merge(result, eloc_count, on='geohashed_end_loc', how='left') 20 | return result 21 | 22 | # 获取出发地热度 23 | def get_sloc_count(train, result): 24 | sloc_count = train.groupby('geohashed_start_loc', as_index=False)['userid'].agg({'sloc_count': 'count'}) 25 | result = pd.merge(result, sloc_count, on='geohashed_start_loc', how='left') 26 | return result 27 | 28 | # 获取出发地作为目的地的热度 29 | def get_sloc_as_eloc_count(train, result): 30 | train = train[~train.geohashed_end_loc.isnull()] 31 | sloc_as_eloc_count = train.groupby('geohashed_end_loc', as_index=False)['userid'].agg({'sloc_as_eloc_count': 'count'}) 32 | sloc_as_eloc_count.rename(columns={'geohashed_end_loc': 'geohashed_start_loc'}, inplace=True) 33 | result = pd.merge(result, sloc_as_eloc_count, on='geohashed_start_loc', how='left') 34 | return result 35 | 36 | # 获取目的地作为出发地的热度 37 | def get_eloc_as_sloc_count(train, result): 38 | eloc_as_sloc_count = train.groupby('geohashed_start_loc', as_index=False)['userid'].agg({'eloc_as_sloc_count': 'count'}) 39 | eloc_as_sloc_count.rename(columns={'geohashed_start_loc': 'geohashed_end_loc'}, inplace=True) 40 | result = pd.merge(result, eloc_as_sloc_count, on='geohashed_end_loc', how='left') 41 | return result 42 | 43 | # 获取出发地->目的地地址对的热度 44 | def get_sloc_eloc_count(train, result): 45 | train = train[~train.geohashed_end_loc.isnull()] 46 | sloc_eloc_count = train.groupby(['geohashed_start_loc', 'geohashed_end_loc'], as_index=False)['userid'].agg({'sloc_eloc_count': 'count'}) 47 | result = pd.merge(result, sloc_eloc_count, on=['geohashed_start_loc', 'geohashed_end_loc'], how='left') 48 | return result 49 | 50 | # 获取目的地->出发地地址对的热度(返程次数) 51 | def get_eloc_sloc_count(train, result): 52 | train = train[~train.geohashed_end_loc.isnull()] 53 | eloc_sloc_count = train.groupby(['geohashed_start_loc', 'geohashed_end_loc'], as_index=False)['userid'].agg({'eloc_sloc_count': 'count'}) 54 | eloc_sloc_count.rename(columns={'geohashed_start_loc': 'geohashed_end_loc', 'geohashed_end_loc': 'geohashed_start_loc'}, inplace=True) 55 | result = pd.merge(result, eloc_sloc_count, on=['geohashed_start_loc', 'geohashed_end_loc'], how='left') 56 | return result 57 | 58 | # 获取目的地的用户热度 59 | def get_eloc_user_count(train, result): 60 | train = train[~train.geohashed_end_loc.isnull()] 61 | eloc_user_count = train.groupby(['geohashed_end_loc'], as_index=False)['userid'].agg({'eloc_user_count': lambda x: np.unique(x).size}) 62 | result = pd.merge(result, eloc_user_count, on='geohashed_end_loc', how='left') 63 | return result 64 | 65 | # 获取出发地的用户热度 66 | def get_sloc_user_count(train, result): 67 | sloc_user_count = train.groupby(['geohashed_start_loc'], as_index=False)['userid'].agg({'sloc_user_count': lambda x: np.unique(x).size}) 68 | result = pd.merge(result, sloc_user_count, on='geohashed_start_loc', how='left') 69 | return result 70 | 71 | # 获取出发地作为目的地的用户热度 72 | def get_sloc_as_eloc_user_count(train, result): 73 | train = train[~train.geohashed_end_loc.isnull()] 74 | sloc_as_eloc_user_count = train.groupby('geohashed_end_loc', as_index=False)['userid'].agg({'sloc_as_eloc_user_count': lambda x: np.unique(x).size}) 75 | sloc_as_eloc_user_count.rename(columns={'geohashed_end_loc': 'geohashed_start_loc'}, inplace=True) 76 | result = pd.merge(result, sloc_as_eloc_user_count, on='geohashed_start_loc', how='left') 77 | return result 78 | 79 | # 获取目的地作为出发地的用户热度 80 | def get_eloc_as_sloc_user_count(train, result): 81 | eloc_as_sloc_user_count = train.groupby('geohashed_start_loc', as_index=False)['userid'].agg({'eloc_as_sloc_user_count': lambda x: np.unique(x).size}) 82 | eloc_as_sloc_user_count.rename(columns={'geohashed_start_loc': 'geohashed_end_loc'}, inplace=True) 83 | result = pd.merge(result, eloc_as_sloc_user_count, on='geohashed_end_loc', how='left') 84 | return result 85 | 86 | # 获取出发地->目的地地址对的用户热度 87 | def get_sloc_eloc_user_count(train, result): 88 | train = train[~train.geohashed_end_loc.isnull()] 89 | sloc_eloc_user_count = train.groupby(['geohashed_start_loc', 'geohashed_end_loc'], as_index=False)['userid'].agg({'sloc_eloc_user_count': lambda x: np.unique(x).size}) 90 | result = pd.merge(result, sloc_eloc_user_count, on=['geohashed_start_loc', 'geohashed_end_loc'], how='left') 91 | return result 92 | 93 | # 获取目的地->出发地地址对的用户热度(返程用户数) 94 | def get_eloc_sloc_user_count(train, result): 95 | train = train[~train.geohashed_end_loc.isnull()] 96 | eloc_sloc_user_count = train.groupby(['geohashed_start_loc', 'geohashed_end_loc'], as_index=False)['userid'].agg({'eloc_sloc_user_count': lambda x: np.unique(x).size}) 97 | eloc_sloc_user_count.rename(columns={'geohashed_start_loc': 'geohashed_end_loc', 'geohashed_end_loc': 'geohashed_start_loc'}, inplace=True) 98 | result = pd.merge(result, eloc_sloc_user_count, on=['geohashed_start_loc', 'geohashed_end_loc'], how='left') 99 | return result 100 | 101 | # 获取从某个地方出发的目的地个数 102 | def get_sloc_eloccount(train, result): 103 | train = train[~train.geohashed_end_loc.isnull()] 104 | sloc_eloccount = train.groupby('geohashed_start_loc', as_index=False)['geohashed_end_loc'].agg({'sloc_eloccount': lambda x: np.unique(x).size}) 105 | result = pd.merge(result, sloc_eloccount, on='geohashed_start_loc', how='left') 106 | return result 107 | 108 | # 获取到某个地方结束的出发地个数 109 | def get_eloc_sloccount(train, result): 110 | train = train[~train.geohashed_end_loc.isnull()] 111 | eloc_sloccount = train.groupby('geohashed_end_loc', as_index=False)['geohashed_start_loc'].agg({'eloc_sloccount': lambda x: np.unique(x).size}) 112 | result = pd.merge(result, eloc_sloccount, on='geohashed_end_loc', how='left') 113 | return result 114 | 115 | # 获取目的地在各小时段的订单数 116 | def get_eloc_hour_count(train, result): 117 | train = train[~train.geohashed_end_loc.isnull()] 118 | eloc_hour_count = train.groupby(['geohashed_end_loc', 'hour'], as_index=False)['orderid'].agg({'eloc_hour_count': 'count'}) 119 | result = pd.merge(result, eloc_hour_count, on=['geohashed_end_loc', 'hour'], how='left') 120 | return result 121 | 122 | # 获取出发地在各小时段的订单数 123 | def get_sloc_hour_count(train, result): 124 | sloc_hour_count = train.groupby(['geohashed_start_loc', 'hour'], as_index=False)['orderid'].agg({'sloc_hour_count': 'count'}) 125 | result = pd.merge(result, sloc_hour_count, on=['geohashed_start_loc', 'hour'], how='left') 126 | return result 127 | 128 | # 获取出发地->目的地地址对的小时热度 129 | def get_sloc_eloc_hour_count(train, result): 130 | train = train[~train.geohashed_end_loc.isnull()] 131 | sloc_eloc_hour_count = train.groupby(['geohashed_start_loc', 'geohashed_end_loc', 'hour'], as_index=False)['userid'].agg({'sloc_eloc_hour_count': 'count'}) 132 | result = pd.merge(result, sloc_eloc_hour_count, on=['geohashed_start_loc', 'geohashed_end_loc', 'hour'], how='left') 133 | return result 134 | 135 | # 获取目的地->出发地地址对的小时热度(返程次数) 136 | def get_eloc_sloc_hour_count(train, result): 137 | train = train[~train.geohashed_end_loc.isnull()] 138 | eloc_sloc_hour_count = train.groupby(['geohashed_start_loc', 'geohashed_end_loc', 'hour'], as_index=False)['userid'].agg({'eloc_sloc_hour_count': 'count'}) 139 | eloc_sloc_hour_count.rename(columns={'geohashed_start_loc': 'geohashed_end_loc', 'geohashed_end_loc': 'geohashed_start_loc'}, inplace=True) 140 | result = pd.merge(result, eloc_sloc_hour_count, on=['geohashed_start_loc', 'geohashed_end_loc', 'hour'], how='left') 141 | return result 142 | 143 | # 获取目的地在各小时段的用户数 144 | def get_eloc_hour_user_count(train, result): 145 | train = train[~train.geohashed_end_loc.isnull()] 146 | eloc_hour_user_count = train.groupby(['geohashed_end_loc', 'hour'], as_index=False)['userid'].agg({'eloc_hour_user_count': 147 | lambda x: np.unique(x).size}) 148 | result = pd.merge(result, eloc_hour_user_count, on=['geohashed_end_loc', 'hour'], how='left') 149 | return result 150 | 151 | # 获取出发地在各小时段的用户数 152 | def get_sloc_hour_user_count(train, result): 153 | sloc_hour_user_count = train.groupby(['geohashed_start_loc', 'hour'], as_index=False)['userid'].agg({'sloc_hour_user_count': 154 | lambda x: np.unique(x).size}) 155 | result = pd.merge(result, sloc_hour_user_count, on=['geohashed_start_loc', 'hour'], how='left') 156 | return result 157 | 158 | # 获取出发地->目的地地址对的用户小时热度 159 | def get_sloc_eloc_hour_user_count(train, result): 160 | train = train[~train.geohashed_end_loc.isnull()] 161 | sloc_eloc_hour_user_count = train.groupby(['geohashed_start_loc', 'geohashed_end_loc', 'hour'], as_index=False)['userid'].agg({'sloc_eloc_hour_user_count': lambda x: np.unique(x).size}) 162 | result = pd.merge(result, sloc_eloc_hour_user_count, on=['geohashed_start_loc', 'geohashed_end_loc', 'hour'], how='left') 163 | return result 164 | 165 | # 获取目的地->出发地地址对的用户小时热度(返程用户数) 166 | def get_eloc_sloc_hour_user_count(train, result): 167 | train = train[~train.geohashed_end_loc.isnull()] 168 | eloc_sloc_hour_user_count = train.groupby(['geohashed_start_loc', 'geohashed_end_loc', 'hour'], as_index=False)['userid'].agg({'eloc_sloc_hour_user_count': lambda x: np.unique(x).size}) 169 | eloc_sloc_hour_user_count.rename(columns={'geohashed_start_loc': 'geohashed_end_loc', 'geohashed_end_loc': 'geohashed_start_loc'}, inplace=True) 170 | result = pd.merge(result, eloc_sloc_hour_user_count, on=['geohashed_start_loc', 'geohashed_end_loc', 'hour'], how='left') # 9 171 | return result 172 | 173 | # 获取从某个地方出发每个小时的目的地个数 174 | def get_sloc_hour_eloccount(train, result): 175 | train = train[~train.geohashed_end_loc.isnull()] 176 | sloc_hour_eloccount = train.groupby(['geohashed_start_loc', 'hour'], as_index=False)['geohashed_end_loc'].agg({'sloc_hour_eloccount': lambda x: np.unique(x).size}) 177 | result = pd.merge(result, sloc_hour_eloccount, on=['geohashed_start_loc', 'hour'], how='left') 178 | return result 179 | 180 | # 获取到某个地方结束每个小时的出发地个数 181 | def get_eloc_hour_sloccount(train, result): 182 | train = train[~train.geohashed_end_loc.isnull()] 183 | eloc_hour_sloccount = train.groupby(['geohashed_end_loc', 'hour'], as_index=False)['geohashed_start_loc'].agg({'eloc_hour_sloccount': lambda x: np.unique(x).size}) 184 | result = pd.merge(result, eloc_hour_sloccount, on=['geohashed_end_loc', 'hour'], how='left') 185 | return result 186 | 187 | # ----------------- 统计 ------------------- 188 | 189 | # 获取从某个地点出发的距离统计 190 | def get_sloc_distance_stat(train, result): 191 | train = train[~train.geohashed_end_loc.isnull()] 192 | sloc_distance_stat = train.groupby(['geohashed_start_loc'], as_index=False)['distance'].agg({'sloc_distance_max': 'max', 'sloc_distance_min': 'min', 'sloc_distance_mean': 'mean'}) 193 | result = pd.merge(result, sloc_distance_stat, on=['geohashed_start_loc'], how='left') 194 | return result 195 | 196 | # 获取到某个地点结束的距离统计 197 | def get_eloc_distance_stat(train, result): 198 | train = train[~train.geohashed_end_loc.isnull()] 199 | eloc_distance_stat = train.groupby(['geohashed_end_loc'], as_index=False)['distance'].agg({'eloc_distance_max': 'max', 'eloc_distance_min': 'min', 'eloc_distance_mean': 'mean'}) 200 | result = pd.merge(result, eloc_distance_stat, on=['geohashed_end_loc'], how='left') 201 | return result 202 | 203 | # 获取从某个地点出发的小时段距离统计 204 | def get_sloc_hour_distance_stat(train, result): 205 | train = train[~train.geohashed_end_loc.isnull()] 206 | sloc_hour_distance_stat = train.groupby(['geohashed_start_loc', 'hour'], as_index=False)['distance'].agg({'sloc_hour_distance_max': 'max', 'sloc_hour_distance_min': 'min', 'sloc_hour_distance_mean': 'mean'}) 207 | result = pd.merge(result, sloc_hour_distance_stat, on=['geohashed_start_loc', 'hour'], how='left') 208 | return result 209 | 210 | # 获取到某个地点结束的小时段距离统计 211 | def get_eloc_hour_distance_stat(train, result): 212 | train = train[~train.geohashed_end_loc.isnull()] 213 | eloc_hour_distance_stat = train.groupby(['geohashed_end_loc', 'hour'], as_index=False)['distance'].agg({'eloc_hour_distance_max': 'max', 'eloc_hour_distance_min': 'min', 'eloc_hour_distance_mean': 'mean'}) 214 | result = pd.merge(result, eloc_hour_distance_stat, on=['geohashed_end_loc', 'hour'], how='left') 215 | return result 216 | 217 | # 获取从某个地点出发的小时均值 218 | def get_sloc_hour_mean(train, result): 219 | sloc_hour_mean = train.groupby(['geohashed_start_loc'], as_index=False)['hour'].agg({'sloc_hour_mean': 'mean'}) 220 | result = pd.merge(result, sloc_hour_mean, on=['geohashed_start_loc'], how='left') 221 | return result 222 | 223 | # 获取到某个地点结束的小时均值 224 | def get_eloc_hour_mean(train, result): 225 | train = train[~train.geohashed_end_loc.isnull()] 226 | eloc_hour_mean = train.groupby(['geohashed_end_loc'], as_index=False)['hour'].agg({'eloc_hour_mean': 'mean'}) 227 | result = pd.merge(result, eloc_hour_mean, on=['geohashed_end_loc'], how='left') 228 | return result 229 | 230 | # 获取从某个点出发到某个地点结束的小时均值 231 | def get_sloc_eloc_hour_mean(train, result): 232 | train = train[~train.geohashed_end_loc.isnull()] 233 | sloc_eloc_hour_mean = train.groupby(['geohashed_start_loc', 'geohashed_end_loc'], as_index=False)['hour'].agg({'sloc_eloc_hour_mean': 'mean'}) 234 | result = pd.merge(result, sloc_eloc_hour_mean, on=['geohashed_start_loc', 'geohashed_end_loc'], how='left') 235 | return result 236 | 237 | # ----------------- 排序 ------------------- 238 | 239 | # 获取到某个目的地的距离排序 240 | def get_eloc_distance_rank(result): 241 | result = rank(result, 'geohashed_end_loc', 'distance', rank_name='eloc_distance_rank', ascending=False) 242 | return result 243 | 244 | # 获取从某个地点出发的距离排序 245 | def get_sloc_distance_rank(result): 246 | result = rank(result, 'geohashed_start_loc', 'distance', rank_name='sloc_distance_rank', ascending=False) 247 | return result 248 | 249 | # 获取到某个目的地的小时段距离排序 250 | def get_eloc_hour_distance_rank(result): 251 | result = rank(result, ['geohashed_end_loc', 'hour'], 'distance', rank_name='eloc_hour_distance_rank', ascending=False) 252 | return result 253 | 254 | # 获取从某个目的地出发的小时段距离排序 255 | def get_sloc_hour_distance_rank(result): 256 | result = rank(result, ['geohashed_start_loc', 'hour'], 'distance', rank_name='sloc_hour_distance_rank', ascending=False) 257 | return result 258 | 259 | # 获取从某个地点出发的小时段排序 260 | def get_sloc_hour_rank(result): 261 | result = rank(result, 'geohashed_start_loc', 'hour', rank_name='sloc_hour_rank', ascending=False) 262 | return result 263 | 264 | # 获取到某个目的地结束的小时段排序 265 | def get_eloc_hour_rank(result): 266 | result = rank(result, 'geohashed_end_loc', 'hour', rank_name='eloc_hour_rank', ascending=False) 267 | return result 268 | 269 | # 获取从某个地点出发到某个地点结束的小时段排序 270 | def get_sloc_eloc_hour_rank(result): 271 | result = rank(result, ['geohashed_start_loc', 'geohashed_end_loc'], 'hour', rank_name='sloc_eloc_hour_rank', ascending=False) 272 | return result 273 | 274 | # ----------------- 差值 ------------------- 275 | 276 | # 获取距离与从某个点出发距离统计值的绝对差值 277 | def get_sloc_distance_stat_sub(result): 278 | result['sloc_distance_mean_sub'] = (result['distance'] - result['sloc_distance_mean']) 279 | result['sloc_distance_mean_sub_abs'] = (result['distance'] - result['sloc_distance_mean']).abs() 280 | return result 281 | 282 | # 获取距离与到某个点结束距离统计值的绝对差值 283 | def get_eloc_distance_stat_sub(result): 284 | result['eloc_distance_mean_sub'] = (result['distance'] - result['eloc_distance_mean']) 285 | result['eloc_distance_mean_sub_abs'] = (result['distance'] - result['eloc_distance_mean']).abs() 286 | return result 287 | 288 | # 获取距离与从某个点出发距离统计值的各小时段绝对差值 289 | def get_sloc_hour_distance_stat_sub(result): 290 | result['sloc_hour_distance_mean_sub'] = (result['distance'] - result['sloc_hour_distance_mean']) 291 | result['sloc_hour_distance_mean_sub_abs'] = (result['distance'] - result['sloc_hour_distance_mean']).abs() 292 | return result 293 | 294 | # 获取距离与到某个点结束距离统计值的各小时段绝对差值 295 | def get_eloc_hour_distance_stat_sub(result): 296 | result['eloc_hour_distance_mean_sub'] = (result['distance'] - result['eloc_hour_distance_mean']) 297 | result['eloc_hour_distance_mean_sub_abs'] = (result['distance'] - result['eloc_hour_distance_mean']).abs() 298 | return result 299 | 300 | # 获取小时段与从某个点出发的小时均值的绝对差值 301 | def get_hour_sloc_hour_mean_sub(result): 302 | result['hour_sloc_hour_mean_sub'] = (result['hour'] - result['sloc_hour_mean']) 303 | result['hour_sloc_hour_mean_sub_abs'] = (result['hour'] - result['sloc_hour_mean']).abs() 304 | return result 305 | 306 | # 获取小时段与到某个点结束的小时均值的绝对差值 307 | def get_hour_eloc_hour_mean_sub(result): 308 | result['hour_eloc_hour_mean_sub'] = (result['hour'] - result['eloc_hour_mean']) 309 | result['hour_eloc_hour_mean_sub_abs'] = (result['hour'] - result['eloc_hour_mean']).abs() 310 | return result 311 | 312 | # 获取小时段与从某个点出发到某个点结束的小时均值的绝对差值 313 | def get_hour_sloc_eloc_hour_mean_sub(result): 314 | result['hour_sloc_eloc_hour_mean_sub'] = (result['hour'] - result['sloc_eloc_hour_mean']) 315 | result['hour_sloc_eloc_hour_mean_sub_abs'] = (result['hour'] - result['sloc_eloc_hour_mean']).abs() 316 | return result 317 | 318 | # ----------------- 比例 ------------------- 319 | 320 | # 获取从某个地点出发到某个地点结束的个数与从这个点出发的个数的比例 321 | def get_sloc_eloc_count_ratio(result): 322 | result['sloc_eloc_count_ratio'] = result['sloc_eloc_count'] / result['sloc_count'] 323 | return result; 324 | 325 | # 获取从某个地点出发的小时段个数与从这个地方出发的个数的比例 326 | def get_sloc_hour_count_ratio(result): 327 | result['sloc_hour_count_ratio'] = result['sloc_hour_count'] / result['sloc_count'] 328 | return result 329 | 330 | # 获取到某个目的地的小时段个数与到某个目的地的个数的比例 331 | def get_eloc_hour_count_ratio(result): 332 | result['eloc_hour_count_ratio'] = result['eloc_hour_count'] / result['eloc_count'] 333 | return result -------------------------------------------------------------------------------- /feature/main.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | import gc 3 | 4 | from .user import * 5 | from .location import * 6 | from .latlon import * 7 | from .other import * 8 | from .leak import * 9 | from .rule import * 10 | from .filter import * 11 | 12 | # 获取全部特征 13 | def get_feat(train, sample): 14 | result = sample 15 | # 获取协同过滤特征: 16 | result = get_loc_filter(train, result) 17 | result = get_user_loc_filter(train, result) 18 | print('协同过滤特征构造完成!') 19 | gc.collect() 20 | result.fillna(-1000000, inplace=True) 21 | gc.collect() 22 | print('所有特征构造完成:\ncolumns:\n{}'.format(result.columns)) 23 | return result 24 | 25 | # 获取距离特征: 26 | result = get_distance(result) # distance # dui 90 wc 27 | # result = result[(result['distance'] < 7200) & (result['distance'] > 100)] 28 | # print('过滤大小距离之后:', result.shape) 29 | # result = result[result['distance'] < 10000] 30 | # print('过滤大距离之后:', result.shape) 31 | print('距离特征构造完成!') 32 | 33 | # 获取小时特征: 34 | result = get_hour(result) # hour # dui 90 wc 35 | result = get_hour_count(train, result) # hour_fre # dui 90 wc 36 | print('小时特征构造完成!') 37 | 38 | # 获取用户特征 39 | result = get_user_count(train, result) # user_count # dui 90 wc 40 | result = get_user_eloc_count(train, result) # user_eloc_count # dui 90 wc 41 | result = get_user_sloc_count(train, result) 42 | result = get_user_sloc_eloc_count(train, result) # user_sloc_eloc_count 43 | result = get_user_eloc_sloc_count(train, result) # user_eloc_sloc_count 44 | result = get_user_eloc_sloc_rate(train, result) # dui 90 wc 45 | result = get_user_eloc_as_sloc_count(train, result) # user_sloc_count # dui 90 wc 46 | result = get_user_sloc_as_eloc_count(train, result) # 1 47 | result = get_user_eloc_in_sloc_count(result) # eloc_in_sloc_count 48 | result = get_user_loccount(train, result) 49 | result = get_user_sloccount(train, result) 50 | result = get_user_eloccount(train, result) # dui 90 wc 51 | result = get_user_sloc_eloccount(train, result) 52 | result = get_user_eloc_sloccount(train, result) 53 | gc.collect() 54 | print('1 done!') 55 | result = get_user_hour_count(train, result) # user_hour_fre # dui 90 wc 56 | result = get_user_eloc_hour_count(train, result) # user_eloc_hour_fre # dui 90 wc 57 | result = get_user_sloc_hour_count(train, result) # user_sloc_hour_fre 58 | result = get_user_sloc_eloc_hour_count(train, result) # user_sloc_eloc_hour_fre 59 | result = get_user_eloc_sloc_hour_count(train, result) 60 | result = get_user_hour_loccount(train, result) 61 | result = get_user_hour_sloccount(train, result) 62 | result = get_user_hour_eloccount(train, result) 63 | result = get_user_sloc_hour_eloccount(train, result) 64 | result = get_user_eloc_hour_sloccount(train, result) 65 | gc.collect() 66 | print('2 done!') 67 | result = get_user_distance_stat(train, result) # user_dis_min, user_dis_max, user_dis_med # dui 90 wc 68 | result = get_user_distance_quantile(train, result) # wc 69 | result = get_user_eloc_distance_stat(train, result) # user_eloc_dis_max, user_eloc_dis_min, user_eloc_dis_med # dui 90 wc 70 | result = get_user_sloc_distance_stat(train, result) # user_sloc_dis_min, user_sloc_dis_max, user_sloc_dis_med # dui 90 wc 71 | result = get_user_hour_distance_stat(train, result) # dui 90 wc 72 | result = get_user_sloc_hour_distance_stat(train, result) # dui 90 wc 73 | result = get_user_eloc_hour_distance_stat(train, result) 74 | result = get_user_hour_stat(train, result) # 1 # dui 90 wc 75 | result = get_user_sloc_hour_stat(train, result) # 1 76 | result = get_user_eloc_hour_stat(train, result) # 1 # dui 90 wc 77 | result = get_user_sloc_eloc_hour_stat(train, result) 78 | result = get_user_most_freq_eloc(train, result) # wc 79 | result = get_user_eloc_lasttime(train, result) # wc 80 | gc.collect() 81 | print('3 done!') 82 | result = get_user_eloc_distance_rank(result) # user_sloc_dis_rank 83 | result = get_user_sloc_distance_rank(result) # user_eloc_dis_rank 84 | result = get_user_eloc_hour_distance_rank(result) # dui 90 wc 85 | result = get_user_sloc_hour_distance_rank(result) 86 | result = get_user_hour_rank(result) # 1 # dui 90 wc 87 | result = get_user_sloc_hour_rank(result) # 1 88 | result = get_user_eloc_hour_rank(result) 89 | result = get_user_sloc_eloc_hour_rank(result) 90 | gc.collect() 91 | print('4 done!') 92 | result = get_user_distance_stat_sub(result) # 1 # dui 90 wc 93 | result = get_user_sloc_distance_stat_sub(result) # 1 # dui 90 wc 94 | result = get_user_eloc_distance_stat_sub(result) # 1 95 | result = get_user_hour_distance_stat_sub(result) # dui 90 wc 96 | result = get_user_sloc_hour_distance_stat_sub(result) # dui 90 wc 97 | result = get_user_eloc_hour_distance_stat_sub(result) 98 | result = get_hour_user_hour_stat_sub(result) # 1 # dui 90 wc 99 | result = get_hour_user_sloc_hour_stat_sub(result) # 1 100 | result = get_hour_user_eloc_hour_stat_sub(result) # 1 # dui 90 wc 101 | result = get_hour_user_sloc_eloc_hour_stat_sub(result) 102 | gc.collect() 103 | print('5 done!') 104 | result = get_global_user_sloc_count_ratio(result) # wc 105 | result = get_user_eloc_count_ratio(result) # wc 106 | gc.collect() 107 | print('用户特征构造完成!') 108 | 109 | # # 获取地理位置特征 110 | result = get_eloc_count(train, result) # eloc_count # dui 90 wc 111 | result = get_sloc_count(train, result) # eloc_as_sloc_count # dui 90 wc 112 | result = get_sloc_as_eloc_count(train, result) 113 | result = get_eloc_as_sloc_count(train, result) # dui 90 wc 114 | result = get_sloc_eloc_count(train, result) # dui 90 wc 115 | result = get_eloc_sloc_count(train, result) 116 | result = get_eloc_user_count(train, result) # eloc_usercount # dui 90 wc 117 | result = get_sloc_user_count(train, result) # eloc_as_sloc_usercount # dui 90 wc 118 | result = get_sloc_as_eloc_user_count(train, result) 119 | result = get_eloc_as_sloc_user_count(train, result) # dui 90 wc 120 | result = get_sloc_eloc_user_count(train, result) # user_sloc_eloc_usercount # dui 90 wc 121 | result = get_eloc_sloc_user_count(train, result) # user_eloc_sloc_usercount 122 | result = get_sloc_eloccount(train, result) # sloc_eloccount # dui 90 wc 123 | result = get_eloc_sloccount(train, result) # eloc_sloccount 124 | gc.collect() 125 | print('1 done!') 126 | result = get_eloc_hour_count(train, result) # eloc_hour_fre # dui 90 wc 127 | result = get_sloc_hour_count(train, result) # sloc_hour_fre # 90 wc 最后删掉 128 | result = get_sloc_eloc_hour_count(train, result) # sloc_eloc_hour_fre 129 | result = get_eloc_sloc_hour_count(train, result) 130 | result = get_eloc_hour_user_count(train, result) 131 | result = get_sloc_hour_user_count(train, result) 132 | result = get_sloc_eloc_hour_user_count(train, result) 133 | result = get_eloc_sloc_hour_user_count(train, result) 134 | result = get_sloc_hour_eloccount(train, result) 135 | result = get_eloc_hour_sloccount(train, result) 136 | gc.collect() 137 | print('2 done!') 138 | result = get_sloc_distance_stat(train, result) # sloc_dis_med, sloc_dis_min, sloc_dis_max # dui 90 wc 139 | result = get_eloc_distance_stat(train, result) # eloc_dis_max, eloc_dis_min, eloc_dis_med # dui 90 wc 140 | result = get_sloc_hour_distance_stat(train, result) 141 | result = get_eloc_hour_distance_stat(train, result) 142 | result = get_sloc_hour_mean(train, result) # 1 # dui 90 wc 143 | result = get_eloc_hour_mean(train, result) # 1 # dui 90 wc 144 | result = get_sloc_eloc_hour_mean(train, result) # 1 # dui 90 wc 145 | gc.collect() 146 | print('3 done!') 147 | result = get_eloc_distance_rank(result) # sloc_dis_rank 148 | result = get_sloc_distance_rank(result) # eloc_dis_rank 149 | result = get_eloc_hour_distance_rank(result) 150 | result = get_sloc_hour_distance_rank(result) 151 | result = get_sloc_hour_rank(result) # 1 152 | result = get_eloc_hour_rank(result) # 1 153 | result = get_sloc_eloc_hour_rank(result) # 1 154 | gc.collect() 155 | print('4 done!') 156 | result = get_sloc_distance_stat_sub(result) # 1 # dui 90 wc 157 | result = get_eloc_distance_stat_sub(result) # 1 # dui 90 wc 158 | result = get_sloc_hour_distance_stat_sub(result) 159 | result = get_eloc_hour_distance_stat_sub(result) 160 | result = get_hour_sloc_hour_mean_sub(result) # 1 # dui 90 wc 161 | result = get_hour_eloc_hour_mean_sub(result) # 1 # dui 90 wc 162 | result = get_hour_sloc_eloc_hour_mean_sub(result) # 1 # dui 90 wc 163 | gc.collect() 164 | print('5 done!') 165 | result = get_sloc_eloc_count_ratio(result) # wc 166 | result = get_sloc_hour_count_ratio(result) # wc 167 | result = get_eloc_hour_count_ratio(result) # wc 168 | gc.collect() 169 | print('地理位置特征构造完成!') 170 | 171 | # 获取协同过滤特征: 172 | result = get_loc_filter(train, result) 173 | result = get_user_loc_filter(train, result) 174 | print('协同过滤特征构造完成!') 175 | 176 | # # 获取经纬度特征 177 | result = get_eloc_latlon(result) # dui 90 wc 178 | result = get_sloc_latlon(result) # dui 90 wc 179 | gc.collect() 180 | print('1 done!') 181 | result = get_eloc_sloc_latlon_sub(result) # sloc_eloc_lon_sub, sloc_eloc_lat_sub # dui 90 wc 182 | result = get_eloc_sloc_slope(result) # 1 # dui 90 wc 183 | result = get_eloc_sloc_latlon_sub_divide_distance(result) # 1 # dui 90 wc 184 | result = get_bearing_array(result) # wc 185 | gc.collect() 186 | print('2 done!') 187 | result = get_user_latlon_sub_stat(train, result) # dui 90 wc 188 | result = get_user_sloc_latlon_sub_stat(train, result) 189 | result = get_user_eloc_latlon_sub_stat(train, result) # dui 90 wc 190 | result = get_user_sloc_hour_latlon_sub_stat(train, result) 191 | result = get_user_eloc_hour_latlon_sub_stat(train, result) 192 | gc.collect() 193 | print('3 done!') 194 | result = get_sloc_latlon_sub_stat(train, result) # dui 90 wc 195 | result = get_eloc_latlon_sub_stat(train, result) # dui 90 wc 196 | result = get_sloc_hour_latlon_sub_stat(train, result) 197 | result = get_eloc_hour_latlon_sub_stat(train, result) # dui 90 wc 198 | gc.collect() 199 | print('4 done!') 200 | result = get_user_latlon_sub_rank(result) # dui 90 wc 201 | result = get_user_eloc_latlon_sub_rank(result) 202 | result = get_user_sloc_latlon_sub_rank(result) # dui 90 wc 203 | result = get_user_eloc_hour_latlon_sub_rank(result) 204 | result = get_user_sloc_hour_latlon_sub_rank(result) # dui 90 wc 205 | gc.collect() 206 | print('5 done!') 207 | result = get_eloc_latlon_sub_rank(result) # dui 90 wc 208 | result = get_sloc_latlon_sub_rank(result) 209 | result = get_eloc_hour_latlon_sub_rank(result) 210 | result = get_sloc_hour_latlon_sub_rank(result) 211 | gc.collect() 212 | print('6 done!') 213 | result = get_user_latlon_sub_stat_sub(result) 214 | result = get_user_sloc_latlon_sub_stat_sub(result) 215 | result = get_user_eloc_latlon_sub_stat_sub(result) 216 | result = get_user_sloc_hour_latlon_sub_stat_sub(result) 217 | result = get_user_eloc_hour_latlon_sub_stat_sub(result) 218 | gc.collect() 219 | print('7 done!') 220 | result = get_sloc_latlon_sub_stat_sub(result) 221 | result = get_eloc_latlon_sub_stat_sub(result) 222 | result = get_sloc_hour_latlon_sub_stat_sub(result) 223 | result = get_eloc_hour_latlon_sub_stat_sub(result) 224 | gc.collect() 225 | print('经纬度特征构造完成!') 226 | 227 | # 获取Leak特征: 228 | result = get_eloc_user_sloc_leak(result) # 1 # dui 90 wc 229 | result = get_eloc_bike_sloc_leak(result) # 1 # dui 90 wc 230 | print('Leak特征构造完成!') 231 | 232 | # 获取规则特征 233 | result = get_user_rule(result) # dui 90 wc 234 | result = get_user_didi(train, result) # dui 90 wc 235 | gc.collect() 236 | print('1 done!') 237 | result = get_loc_rule(result) # dui 90 wc 238 | result = get_loc_didi(train, result) # dui 90 wc 239 | gc.collect() 240 | print('规则特征构造完成!') 241 | 242 | # 删除无用特征 243 | # result.drop(['sloc_lon_sub_max', 'user_hour_distance_mean', 'user_sloc_distance_mean', 'user_eloc_hour_max', 'sloc_lat_sub_max', 'eloc_bike_last_sloc_distance', 'hour', 'eloc_bike_last_sloc_speed', 'user_eloc_hour_min', 'eloc_hour_count', 'user_lon_sub_max', 'eloc_hour_lon_sub_max', 'user_hour_max', 'user_lat_sub_max', 'eloc_hour_lat_sub_min', 'hour_count', 'eloc_hour_lat_sub_max', 'eloc_hour_lon_sub_min', 'user_eloc_distance_min', 'user_hour_count_rate', 'user_eloc_distance_mean', 'user_hour_min', 'user_hour_eloc_rate', 'eloc_lon_sub_min', 'user_eloc_lon_sub_mean', 'user_eloc_lat_sub_mean', 'user_eloc_hour_mean', 'user_eloc_lon_sub_min', 'user_eloc_lat_sub_min', 'user_eloc_lon_sub_max', 'sloc_lat_sub_min', 'user_eloc_hour_count', 'eloc_lat_sub_min', 'user_eloc_count_rate', 'eloc_count_rate', 'sloc_lon_sub_min', 'user_eloc_lat_sub_max', 'hour_count_rate', 'user_end_loc_sample', 'sloc_lat', 'user_lat_sub_mean', 'sloc_lon', 'user_hour_distance_mean_sub', 'sloc_distance_max', 'user_lon_sub_mean', 'user_distance_mean', 'sloc_distance_mean_sub_abs', 'user_sloc_distance_min', 'sloc_lat_sub_mean', 'eloc_lon_sub_max', 'user_sloc_distance_max', 'eloc_distance_max', 'hour_user_hour_mean_sub', 'eloc_as_sloc_count', 'sloc_distance_min', 'eloc_lat_sub_max', 'bike_next_sloc_sample', 'user_rule', 'eloc_distance_mean_sub_abs', 'user_lat_sub_min', 'user_lon_sub_min', 'loc_to_loc_sample', 'user_sloc_hour_distance_min', 'user_sloc_hour_distance_max', 'user_sloc_hour_distance_mean', 'user_sloc_hour_distance_mean_sub', 'sloc_hour_count'], axis=1, inplace=True) 244 | 245 | # 删除无用特征 246 | # result.drop(['user_eloc_hour_lon_sub_mean_sub_abs', 'user_sloc_lon_sub_mean_sub'], axis=1, inplace=True) 247 | 248 | result.fillna(-1000000, inplace=True) 249 | print('所有特征构造完成:\ncolumns:\n{}'.format(result.columns)) 250 | return result -------------------------------------------------------------------------------- /feature/other.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | import Geohash as geohash 3 | import pandas as pd 4 | import numpy as np 5 | import os 6 | os.path.join('..') 7 | from utils import cal_distance, rank, manhattan 8 | 9 | ''' 10 | 获取小时特征 11 | ''' 12 | 13 | # 获取小时段 14 | def get_hour(result): 15 | result['hour'] = pd.to_datetime(result['starttime']).dt.hour 16 | return result 17 | 18 | # 获取每个小时段的出行订单数 19 | def get_hour_count(train, result): 20 | hour_count = train.groupby(['hour'], as_index=False)['userid'].agg({'hour_count': 'count'}) 21 | result = pd.merge(result, hour_count, on='hour', how='left') 22 | return result 23 | 24 | ''' 25 | 获取距离特征 26 | ''' 27 | 28 | # 获取出发地到目的地的欧氏距离和曼哈顿距离 29 | def get_distance(result): 30 | locs = list(set(result['geohashed_start_loc']) | set(result['geohashed_end_loc'])) 31 | if np.nan in locs: 32 | locs.remove(np.nan) 33 | deloc = [] 34 | for loc in locs: 35 | deloc.append(geohash.decode_exactly(loc)) 36 | loc_dict = dict(zip(locs, deloc)) 37 | geohashed_loc = result[['geohashed_start_loc', 'geohashed_end_loc']].values 38 | distance = [] 39 | manhattan_distance = [] 40 | for i in geohashed_loc: 41 | if i[0] is not np.nan and i[1] is not np.nan: 42 | lat1, lon1, _, _ = loc_dict[i[0]] 43 | lat2, lon2, _, _ = loc_dict[i[1]] 44 | distance.append(cal_distance(float(lat1), float(lon1), float(lat2), float(lon2))) 45 | manhattan_distance.append(manhattan(float(lat1), float(lon1), float(lat2), float(lon2))) 46 | else: 47 | distance.append(np.nan) 48 | manhattan_distance.append(np.nan) 49 | result.loc[:, 'distance'] = distance 50 | result.loc[:, 'manhattan'] = manhattan_distance 51 | return result 52 | 53 | ''' 54 | 获取经纬度特征 55 | ''' 56 | def get_latlon(result, end=True): 57 | if end: 58 | eloc_latlon = result['geohashed_end_loc'].apply(lambda x: geohash.decode_exactly(x)) 59 | result['eloc_lat'] = eloc_latlon.apply(lambda x: float(x[0])) 60 | result['eloc_lon'] = eloc_latlon.apply(lambda x: float(x[1])) 61 | sloc_latlon = result['geohashed_start_loc'].apply(lambda x: geohash.decode_exactly(x)) 62 | result['sloc_lat'] = sloc_latlon.apply(lambda x: float(x[0])) 63 | result['sloc_lon'] = sloc_latlon.apply(lambda x: float(x[1])) 64 | if end: 65 | result['eloc_sloc_lat_sub'] = result['eloc_lat'] - result['sloc_lat'] 66 | result['eloc_sloc_lon_sub'] = result['eloc_lon'] - result['sloc_lon'] 67 | return result -------------------------------------------------------------------------------- /feature/rule.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | import pandas as pd 3 | import numpy as np 4 | 5 | ''' 6 | 获取自定义规则 7 | ''' 8 | 9 | def get_user_rule(result): 10 | result['user_rule'] = (1 + result['user_eloc_count']) * result['eloc_user_count'] / (0.01 * result['distance']) 11 | return result 12 | 13 | def get_user_didi(train, result): 14 | result['user_hour_count_rate'] = result['user_hour_count'] / train.shape[0] 15 | train = train[~train.geohashed_end_loc.isnull()] 16 | result['user_eloc_count_rate'] = result['user_eloc_count'] / train.shape[0] 17 | result['user_hour_eloc_rate'] = result['user_eloc_hour_count'] / result['user_eloc_count'] 18 | result['user_hour_eloc_distribute'] = result['user_eloc_count_rate'] * result['user_hour_eloc_rate'] / result['user_hour_count_rate'] 19 | return result 20 | 21 | def get_loc_rule(result): 22 | result['loc_rule'] = result['eloc_count'] / (0.01 * result['distance']) 23 | result['loc_rule2'] = np.sqrt(result['distance'] / (result['eloc_count'] ** 1.1)) 24 | return result 25 | 26 | def get_loc_didi(train, result): 27 | result['hour_count_rate'] = result['hour_count'] / train.shape[0] 28 | train = train[~train.geohashed_end_loc.isnull()] 29 | result['eloc_count_rate'] = result['eloc_count'] / train.shape[0] 30 | result['hour_eloc_rate'] = result['eloc_hour_count'] / result['eloc_count'] 31 | result['hour_eloc_distribute'] = result['eloc_count_rate'] * result['hour_eloc_rate'] / result['hour_count_rate'] 32 | return result -------------------------------------------------------------------------------- /feature/user.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | import pandas as pd 3 | import os 4 | import numpy as np 5 | from .other import get_distance 6 | from .latlon import get_sloc_latlon, get_eloc_latlon, get_eloc_sloc_latlon_sub, get_eloc_sloc_slope, get_eloc_sloc_latlon_sub_divide_distance, get_bearing_array 7 | os.path.join('..') 8 | from utils import rank 9 | 10 | ''' 11 | 获取用户特征 12 | ''' 13 | 14 | # ----------------- 计数 ------------------- 15 | 16 | # 获取用户历史出行次数 17 | def get_user_count(train, result): 18 | user_count = train.groupby('userid', as_index=False)['orderid'].agg({'user_count': 'count'}) 19 | result = pd.merge(result, user_count, on=['userid'], how='left') 20 | return result 21 | 22 | # 获取用户去过某个地点的历史出行次数 23 | def get_user_eloc_count(train, result): 24 | train = train[~train.geohashed_end_loc.isnull()] 25 | user_eloc_count = train.groupby(['userid', 'geohashed_end_loc'], as_index=False)['userid'].agg({'user_eloc_count': 'count'}) 26 | result = pd.merge(result, user_eloc_count, on=['userid', 'geohashed_end_loc'], how='left') 27 | return result 28 | 29 | # 获取用户从某个地方出发的历史出行次数 30 | def get_user_sloc_count(train, result): 31 | user_sloc_count = train.groupby(['userid', 'geohashed_start_loc'], as_index=False)['userid'].agg({'user_sloc_count': 'count'}) 32 | result = pd.merge(result, user_sloc_count, on=['userid', 'geohashed_start_loc'], how='left') 33 | return result 34 | 35 | # 获取用户从某个地方出发到某个地方结束的历史出行次数 36 | def get_user_sloc_eloc_count(train, result): 37 | train = train[~train.geohashed_end_loc.isnull()] 38 | user_sloc_eloc_count = train.groupby(['userid', 'geohashed_start_loc', 'geohashed_end_loc'], as_index=False)['userid'].agg({'user_sloc_eloc_count': 'count'}) 39 | result = pd.merge(result, user_sloc_eloc_count, on=['userid', 'geohashed_start_loc', 'geohashed_end_loc'], how='left') 40 | return result 41 | 42 | # 获取用户从某个目的地出发到某个出发地结束的历史返程次数 43 | def get_user_eloc_sloc_count(train, result): 44 | train = train[~train.geohashed_end_loc.isnull()] 45 | user_eloc_sloc_count = train.groupby(['userid', 'geohashed_start_loc', 'geohashed_end_loc'], as_index=False)['userid'].agg({'user_eloc_sloc_count': 'count'}) 46 | user_eloc_sloc_count.rename(columns={'geohashed_start_loc':'geohashed_end_loc', 'geohashed_end_loc':'geohashed_start_loc'}, inplace=True) 47 | result = pd.merge(result, user_eloc_sloc_count, on=['userid', 'geohashed_start_loc', 'geohashed_end_loc'], how='left') 48 | return result 49 | 50 | # 获取用户的返程比例 51 | def get_user_eloc_sloc_rate(train, result): 52 | train = train[~train.geohashed_end_loc.isnull()] 53 | user_eloc_sloc_count = train.groupby(['userid', 'geohashed_start_loc', 'geohashed_end_loc'], as_index=False)['userid'].agg({'user_eloc_sloc_count': 'count'}) 54 | user_eloc_sloc_count.rename(columns={'geohashed_start_loc':'geohashed_end_loc', 'geohashed_end_loc':'geohashed_start_loc'}, inplace=True) 55 | restmp = pd.merge(train, user_eloc_sloc_count, on=['userid', 'geohashed_start_loc', 'geohashed_end_loc'], how='left') 56 | restmp = restmp.groupby('userid', as_index=False)['user_eloc_sloc_count'].agg({'user_eloc_sloc_rate': lambda x: np.sum(x>0)/np.size(x)}) 57 | result = pd.merge(result, restmp, on='userid', how='left') 58 | return result 59 | 60 | # 获取用户目的地点作为出发地的次数 61 | def get_user_eloc_as_sloc_count(train, result): 62 | user_eloc_as_sloc_count = train.groupby(['userid', 'geohashed_start_loc'], as_index=False)['userid'].agg({'user_eloc_as_sloc_count': 'count'}) 63 | user_eloc_as_sloc_count.rename(columns={'geohashed_start_loc': 'geohashed_end_loc'}, inplace=True) 64 | result = pd.merge(result, user_eloc_as_sloc_count, on=['userid', 'geohashed_end_loc'], how='left') 65 | return result 66 | 67 | # 获取用户出发地点作为目的地的次数 68 | def get_user_sloc_as_eloc_count(train, result): 69 | train = train[~train.geohashed_end_loc.isnull()] 70 | user_sloc_as_eloc_count = train.groupby(['userid', 'geohashed_end_loc'], as_index=False)['userid'].agg({'user_sloc_as_eloc_count': 'count'}) 71 | user_sloc_as_eloc_count.rename(columns={'geohashed_end_loc': 'geohashed_start_loc'}, inplace=True) 72 | result = pd.merge(result, user_sloc_as_eloc_count, on=['userid', 'geohashed_start_loc'], how='left') 73 | return result 74 | 75 | # 获取用户目的地出现在出发地中的个数 76 | def get_user_eloc_in_sloc_count(result): 77 | user_eloc_in_sloc_count = result.groupby(['userid', 'geohashed_start_loc'], as_index=False)['orderid'].agg({'user_eloc_in_sloc_count': lambda x: np.unique(x).size}) 78 | user_eloc_in_sloc_count.rename(columns={'geohashed_start_loc': 'geohashed_end_loc'}, inplace=True) 79 | result = pd.merge(result, user_eloc_in_sloc_count, on=['userid', 'geohashed_end_loc'], how='left') 80 | return result 81 | 82 | # 获取用户涉及到的地点个数 83 | def get_user_loccount(train, result): 84 | user_sloc = train[['userid', 'geohashed_start_loc']] 85 | train = train[~train.geohashed_end_loc.isnull()] 86 | user_eloc = train[['userid', 'geohashed_end_loc']].rename(columns={'geohashed_end_loc': 'geohashed_start_loc'}, inplace=True) 87 | user_loc = pd.concat([user_sloc, user_eloc]) 88 | user_loccount = user_loc.groupby('userid', as_index=False)['geohashed_start_loc'].agg({'user_loccount': lambda x: np.unique(x).size}) 89 | result = pd.merge(result, user_loccount, on=['userid'], how='left') 90 | return result 91 | 92 | # 获取用户出发的出发地个数 93 | def get_user_sloccount(train, result): 94 | user_sloccount = train.groupby('userid', as_index=False)['geohashed_start_loc'].agg({'user_sloccount': lambda x: np.unique(x).size}) 95 | result = pd.merge(result, user_sloccount, on=['userid'], how='left') 96 | return result 97 | 98 | # 获取用户到达的目的地个数 99 | def get_user_eloccount(train, result): 100 | train = train[~train.geohashed_end_loc.isnull()] 101 | user_eloccount = train.groupby('userid', as_index=False)['geohashed_end_loc'].agg({'user_eloccount': lambda x: np.unique(x).size}) 102 | result = pd.merge(result, user_eloccount, on=['userid'], how='left') 103 | return result 104 | 105 | # 获取用户从某个地方出发到的目的地数目 106 | def get_user_sloc_eloccount(train, result): 107 | train = train[~train.geohashed_end_loc.isnull()] 108 | user_sloc_eloccount = train.groupby(['userid', 'geohashed_start_loc'], as_index=False)['geohashed_end_loc'].agg({'user_sloc_eloccount': lambda x: np.unique(x).size}) 109 | result = pd.merge(result, user_sloc_eloccount, on=['userid', 'geohashed_start_loc'], how='left') 110 | return result 111 | 112 | # 获取用户到某个地方结束的出发地数目 113 | def get_user_eloc_sloccount(train, result): 114 | train = train[~train.geohashed_end_loc.isnull()] 115 | user_eloc_sloccount = train.groupby(['userid', 'geohashed_end_loc'], as_index=False)['geohashed_start_loc'].agg({'user_eloc_sloccount': lambda x: np.unique(x).size}) 116 | result = pd.merge(result, user_eloc_sloccount, on=['userid', 'geohashed_end_loc'], how='left') 117 | return result 118 | 119 | # 获取用户在每个小时段的出行订单数 120 | def get_user_hour_count(train, result): 121 | user_hour_count = train.groupby(['userid', 'hour'], as_index=False)['orderid'].agg({'user_hour_count': 'count'}) 122 | result = pd.merge(result, user_hour_count, on=['userid', 'hour'], how='left') 123 | return result 124 | 125 | # 获取用户在每个小时段从某个地方出发的订单数 126 | def get_user_sloc_hour_count(train, result): 127 | user_sloc_hour_count = train.groupby(['userid', 'geohashed_start_loc', 'hour'], as_index=False)['orderid'].agg({'user_sloc_hour_count': 'count'}) 128 | result = pd.merge(result, user_sloc_hour_count, on=['userid', 'geohashed_start_loc', 'hour'], how='left') 129 | return result 130 | 131 | # 获取用户在每个小时段到某个地方结束的订单数 132 | def get_user_eloc_hour_count(train, result): 133 | train = train[~train.geohashed_end_loc.isnull()] 134 | user_eloc_hour_count = train.groupby(['userid', 'geohashed_end_loc', 'hour'], as_index=False)['orderid'].agg({'user_eloc_hour_count': 'count'}) 135 | result = pd.merge(result, user_eloc_hour_count, on=['userid', 'geohashed_end_loc', 'hour'], how='left') 136 | return result 137 | 138 | # 获取用户在每个小时段从某个地方出发到某个地方结束的订单数 139 | def get_user_sloc_eloc_hour_count(train, result): 140 | train = train[~train.geohashed_end_loc.isnull()] 141 | user_sloc_eloc_hour_count = train.groupby(['userid', 'geohashed_start_loc', 'geohashed_end_loc', 'hour'], as_index=False)['orderid'].agg({'user_sloc_eloc_hour_count': 'count'}) 142 | result = pd.merge(result, user_sloc_eloc_hour_count, on=['userid', 'geohashed_start_loc', 'geohashed_end_loc', 'hour'], how='left') 143 | return result 144 | 145 | # 获取用户在每个小时段从某个地方出发到某个地方结束的返程订单数 146 | def get_user_eloc_sloc_hour_count(train, result): 147 | train = train[~train.geohashed_end_loc.isnull()] 148 | user_eloc_sloc_hour_count = train.groupby(['userid', 'geohashed_start_loc', 'geohashed_end_loc', 'hour'], as_index=False)['orderid'].agg({'user_eloc_sloc_hour_count': 'count'}) 149 | user_eloc_sloc_hour_count.rename(columns={'geohashed_start_loc': 'geohashed_end_loc', 'geohashed_end_loc': 'geohashed_start_loc'}, inplace=True) 150 | result = pd.merge(result, user_eloc_sloc_hour_count, on=['userid', 'geohashed_start_loc', 'geohashed_end_loc', 'hour'], how='left') # 1 151 | return result 152 | 153 | # 获取用户每个小时段涉及到的地点数 154 | def get_user_hour_loccount(train, result): 155 | user_hour_sloc = train[['userid', 'hour', 'geohashed_start_loc']] 156 | train = train[~train.geohashed_end_loc.isnull()] 157 | user_hour_eloc = train[['userid', 'hour', 'geohashed_end_loc']].rename(columns={'geohashed_end_loc': 'geohashed_start_loc'}, inplace=True) 158 | user_hour_loc = pd.concat([user_hour_sloc, user_hour_eloc]) 159 | user_hour_loccount = user_hour_loc.groupby(['userid', 'hour'], as_index=False)['geohashed_start_loc'].agg({'user_hour_loccount': lambda x: np.unique(x).size}) 160 | result = pd.merge(result, user_hour_loccount, on=['userid', 'hour'], how='left') 161 | return result 162 | 163 | # 获取用户每个小时段出发的出发地个数 164 | def get_user_hour_sloccount(train, result): 165 | user_hour_sloccount = train.groupby(['userid', 'hour'], as_index=False)['geohashed_start_loc'].agg({'user_hour_sloccount': lambda x: np.unique(x).size}) 166 | result = pd.merge(result, user_hour_sloccount, on=['userid', 'hour'], how='left') # 4 167 | return result 168 | 169 | # 获取用户每个小时段到达的目的地个数 170 | def get_user_hour_eloccount(train, result): 171 | train = train[~train.geohashed_end_loc.isnull()] 172 | user_hour_eloccount = train.groupby(['userid', 'hour'], as_index=False)['geohashed_end_loc'].agg({'user_hour_eloccount': lambda x: np.unique(x).size}) 173 | result = pd.merge(result, user_hour_eloccount, on=['userid', 'hour'], how='left') 174 | return result 175 | 176 | # 获取用户每个小时段从某个地方出发到的目的地数目 177 | def get_user_sloc_hour_eloccount(train, result): 178 | train = train[~train.geohashed_end_loc.isnull()] 179 | user_sloc_hour_eloccount = train.groupby(['userid', 'geohashed_start_loc', 'hour'], as_index=False)['geohashed_end_loc'].agg({'user_sloc_hour_eloccount': lambda x: np.unique(x).size}) 180 | result = pd.merge(result, user_sloc_hour_eloccount, on=['userid', 'geohashed_start_loc', 'hour'], how='left') 181 | return result 182 | 183 | # 获取用户每个小时段到某个地方结束的出发地数目 184 | def get_user_eloc_hour_sloccount(train, result): 185 | train = train[~train.geohashed_end_loc.isnull()] 186 | user_eloc_hour_sloccount = train.groupby(['userid', 'geohashed_end_loc', 'hour'], as_index=False)['geohashed_start_loc'].agg({'user_eloc_hour_sloccount': lambda x: np.unique(x).size}) 187 | result = pd.merge(result, user_eloc_hour_sloccount, on=['userid', 'geohashed_end_loc', 'hour'], how='left') # 9 188 | return result 189 | 190 | # ----------------- 统计 ------------------- 191 | 192 | # 获取用户出行距离的统计值 193 | def get_user_distance_stat(train, result): 194 | train = train[~train.geohashed_end_loc.isnull()] 195 | user_distance_stat = train.groupby('userid', as_index=False)['distance'].agg({'user_distance_max': 'max', 'user_distance_min': 'min', 'user_distance_mean': 'mean'}) 196 | result = pd.merge(result, user_distance_stat, on=['userid'], how='left') 197 | user_manhattan_stat = train.groupby('userid', as_index=False)['manhattan'].agg({'user_manhattan_max': 'max', 'user_manhattan_min': 'min', 'user_manhattan_mean': 'mean'}) 198 | result = pd.merge(result, user_manhattan_stat, on=['userid'], how='left') 199 | return result 200 | 201 | # 获取用户出行距离的分位点 202 | def get_user_distance_quantile(train, result): 203 | train = train[~train.geohashed_end_loc.isnull()] 204 | user_distance_quantile = train.groupby('userid')['distance'].quantile(0.2).reset_index() 205 | user_distance_quantile.rename(columns={'distance': 'user_distance_quantile_2'}, inplace=True) 206 | result = pd.merge(result, user_distance_quantile, on='userid', how='left') 207 | user_manhattan_quantile = train.groupby('userid')['manhattan'].quantile(0.2).reset_index() 208 | user_manhattan_quantile.rename(columns={'manhattan': 'user_manhattan_quantile_2'}, inplace=True) 209 | result = pd.merge(result, user_manhattan_quantile, on='userid', how='left') 210 | user_distance_quantile = train.groupby('userid')['distance'].quantile(0.8).reset_index() 211 | user_distance_quantile.rename(columns={'distance': 'user_distance_quantile_8'}, inplace=True) 212 | result = pd.merge(result, user_distance_quantile, on='userid', how='left') 213 | return result 214 | 215 | # 获取用户从某个地点出发的出行距离统计值 216 | def get_user_sloc_distance_stat(train, result): 217 | train = train[~train.geohashed_end_loc.isnull()] 218 | user_sloc_distance_stat = train.groupby(['userid', 'geohashed_start_loc'], as_index=False)['distance'].agg({'user_sloc_distance_max': 'max', 'user_sloc_distance_min': 'min', 'user_sloc_distance_mean': 'mean'}) 219 | result = pd.merge(result, user_sloc_distance_stat, on=['userid', 'geohashed_start_loc'], how='left') 220 | user_sloc_manhattan_stat = train.groupby(['userid', 'geohashed_start_loc'], as_index=False)['manhattan'].agg({'user_sloc_manhattan_max': 'max', 'user_sloc_manhattan_min': 'min', 'user_sloc_manhattan_mean': 'mean'}) 221 | result = pd.merge(result, user_sloc_manhattan_stat, on=['userid', 'geohashed_start_loc'], how='left') 222 | return result 223 | 224 | # 获取用户到某个地点结束的出行距离统计值 225 | def get_user_eloc_distance_stat(train, result): 226 | train = train[~train.geohashed_end_loc.isnull()] 227 | user_eloc_distance_stat = train.groupby(['userid', 'geohashed_end_loc'], as_index=False)['distance'].agg({'user_eloc_distance_max': 'max', 'user_eloc_distance_min': 'min', 'user_eloc_distance_mean': 'mean'}) 228 | result = pd.merge(result, user_eloc_distance_stat, on=['userid', 'geohashed_end_loc'], how='left') 229 | return result 230 | 231 | # 获取用户各时间段出行距离的统计值 232 | def get_user_hour_distance_stat(train, result): 233 | train = train[~train.geohashed_end_loc.isnull()] 234 | user_hour_distance_stat = train.groupby(['userid', 'hour'], as_index=False)['distance'].agg({'user_hour_distance_max': 'max', 'user_hour_distance_min': 'min', 'user_hour_distance_mean': 'mean'}) 235 | result = pd.merge(result, user_hour_distance_stat, on=['userid', 'hour'], how='left') 236 | user_hour_manhattan_stat = train.groupby(['userid', 'hour'], as_index=False)['manhattan'].agg({'user_hour_manhattan_max': 'max', 'user_hour_manhattan_min': 'min', 'user_hour_manhattan_mean': 'mean'}) 237 | result = pd.merge(result, user_hour_manhattan_stat, on=['userid', 'hour'], how='left') 238 | return result 239 | 240 | # 获取用户各时间段从某个地点出发的出行距离统计值 241 | def get_user_sloc_hour_distance_stat(train, result): 242 | train = train[~train.geohashed_end_loc.isnull()] 243 | user_sloc_hour_distance_stat = train.groupby(['userid', 'geohashed_start_loc', 'hour'], as_index=False)['distance'].agg({'user_sloc_hour_distance_max': 'max', 'user_sloc_hour_distance_min': 'min', 'user_sloc_hour_distance_mean': 'mean'}) 244 | result = pd.merge(result, user_sloc_hour_distance_stat, on=['userid', 'geohashed_start_loc', 'hour'], how='left') 245 | user_sloc_hour_manhattan_stat = train.groupby(['userid', 'geohashed_start_loc', 'hour'], as_index=False)['manhattan'].agg({'user_sloc_hour_manhattan_max': 'max', 'user_sloc_hour_manhattan_min': 'min', 'user_sloc_hour_manhattan_mean': 'mean'}) 246 | result = pd.merge(result, user_sloc_hour_manhattan_stat, on=['userid', 'geohashed_start_loc', 'hour'], how='left') 247 | return result 248 | 249 | # 获取用户各时间段到某个地点结束的出行距离统计值 250 | def get_user_eloc_hour_distance_stat(train, result): 251 | train = train[~train.geohashed_end_loc.isnull()] 252 | user_eloc_hour_distance_stat = train.groupby(['userid', 'geohashed_end_loc', 'hour'], as_index=False)['distance'].agg({'user_eloc_hour_distance_max': 'max', 'user_eloc_hour_distance_min': 'min', 'user_eloc_hour_distance_mean': 'mean'}) 253 | result = pd.merge(result, user_eloc_hour_distance_stat, on=['userid', 'geohashed_end_loc', 'hour'], how='left') 254 | return result 255 | 256 | # 获取用户出行的小时段统计值 257 | def get_user_hour_stat(train, result): 258 | user_hour_stat = train.groupby(['userid'], as_index=False)['hour'].agg({'user_hour_max': 'max', 'user_hour_min': 'min', 'user_hour_mean': 'mean'}) 259 | result = pd.merge(result, user_hour_stat, on=['userid'], how='left') 260 | return result 261 | 262 | # 获取用户从某个地点出行的小时段统计值 263 | def get_user_sloc_hour_stat(train, result): 264 | user_sloc_hour_stat = train.groupby(['userid', 'geohashed_start_loc'], as_index=False)['hour'].agg({'user_sloc_hour_max': 'max', 'user_sloc_hour_min': 'min', 'user_sloc_hour_mean': 'mean'}) 265 | result = pd.merge(result, user_sloc_hour_stat, on=['userid', 'geohashed_start_loc'], how='left') 266 | return result 267 | 268 | # 获取用户到某个地点结束的小时段统计值 269 | def get_user_eloc_hour_stat(train, result): 270 | train = train[~train.geohashed_end_loc.isnull()] 271 | user_eloc_hour_stat = train.groupby(['userid', 'geohashed_end_loc'], as_index=False)['hour'].agg({'user_eloc_hour_max': 'max', 'user_eloc_hour_min': 'min', 'user_eloc_hour_mean': 'mean'}) 272 | result = pd.merge(result, user_eloc_hour_stat, on=['userid', 'geohashed_end_loc'], how='left') 273 | return result 274 | 275 | # 获取用户从某个地点出发到某个地点结束的小时段统计值 276 | def get_user_sloc_eloc_hour_stat(train, result): 277 | train = train[~train.geohashed_end_loc.isnull()] 278 | user_sloc_eloc_hour_stat = train.groupby(['userid', 'geohashed_start_loc', 'geohashed_end_loc'], as_index=False)['hour'].agg({'user_sloc_eloc_hour_max': 'max', 'user_sloc_eloc_hour_min': 'min', 'user_sloc_eloc_hour_mean': 'mean'}) # 6 279 | # user_sloc_eloc_hour_stat = train.groupby(['userid', 'geohashed_start_loc', 'geohashed_end_loc'], as_index=False)['hour'].agg({'user_sloc_eloc_hour_min': 'min', 'user_sloc_eloc_hour_mean': 'mean'}) 280 | result = pd.merge(result, user_sloc_eloc_hour_stat, on=['userid', 'geohashed_start_loc', 'geohashed_end_loc'], how='left') 281 | return result 282 | 283 | # 获取用户到过最多的地点的各信息 284 | def get_user_most_freq_eloc(train, result): 285 | train = train[~train.geohashed_end_loc.isnull()] 286 | user_eloc_count = train.groupby(['userid', 'geohashed_end_loc'], as_index=False)['userid'].agg({'user_eloc_count': 'count'}) 287 | user_most_freq_eloc = user_eloc_count.sort_values(by=['userid', 'user_eloc_count']).groupby('userid', as_index=False).last()[['userid', 'geohashed_end_loc']] 288 | user_most_freq_eloc.rename(columns={'geohashed_end_loc': 'user_most_freq_eloc'}, inplace=True) 289 | result = pd.merge(result, user_most_freq_eloc, on='userid', how='left') 290 | restmp = result[['orderid', 'geohashed_start_loc', 'user_most_freq_eloc']]; 291 | restmp.rename(columns={'user_most_freq_eloc': 'geohashed_end_loc'}, inplace=True) 292 | restmp = get_distance(restmp) 293 | restmp = get_sloc_latlon(restmp) 294 | restmp = pd.merge(restmp, get_eloc_latlon(restmp[~restmp.geohashed_end_loc.isnull()][['orderid', 'geohashed_end_loc']]), on=['orderid', 'geohashed_end_loc'], how='left') 295 | restmp = get_eloc_sloc_latlon_sub(restmp) 296 | restmp = get_eloc_sloc_slope(restmp) 297 | restmp = get_eloc_sloc_latlon_sub_divide_distance(restmp) 298 | restmp = get_bearing_array(restmp) 299 | result['user_most_freq_eloc_distance'] = restmp['distance'] 300 | result['user_most_freq_eloc_distance_sub'] = result['distance'] - result['user_most_freq_eloc_distance'] 301 | result['user_most_freq_eloc_distance_sub_abs'] = (result['distance'] - result['user_most_freq_eloc_distance']).abs() 302 | result['user_most_freq_eloc_manhattan_distance'] = restmp['manhattan'] 303 | result['user_most_freq_eloc_manhattan_sub'] = result['manhattan'] - result['user_most_freq_eloc_manhattan_distance'] 304 | result['user_most_freq_eloc_manhattan_sub_abs'] = (result['manhattan'] - result['user_most_freq_eloc_manhattan_distance']).abs() 305 | result['user_most_freq_eloc_lon_sub'] = restmp['eloc_sloc_lon_sub'] 306 | result['user_most_freq_eloc_lat_sub'] = restmp['eloc_sloc_lat_sub'] 307 | result['user_most_freq_eloc_slope'] = restmp['eloc_sloc_latlon_slope'] 308 | result['user_most_freq_eloc_lat_sub_divide_distance'] = restmp['eloc_sloc_lat_sub_divide_distance'] 309 | result['user_most_freq_eloc_lon_sub_divide_distance'] = restmp['eloc_sloc_lon_sub_divide_distance'] 310 | result['user_most_freq_eloc_degree'] = restmp['degree'] 311 | result.drop(['user_most_freq_eloc'], axis=1, inplace=True) 312 | return result 313 | 314 | # 获取用户到某个地点的最后一次时间与当前的时间差 315 | def get_user_eloc_lasttime(train, result): 316 | train = train[~train.geohashed_end_loc.isnull()] 317 | train = train.sort_values(by='starttime') 318 | user_eloc_last = train.groupby(['userid','geohashed_end_loc'], as_index=False).last()[['userid','geohashed_end_loc', 'starttime']] 319 | user_eloc_last.rename(columns={'starttime': 'user_eloc_lasttime'}, inplace=True) 320 | result = pd.merge(result, user_eloc_last, on=['userid', 'geohashed_end_loc'], how='left') 321 | result['user_eloc_lasttime'] = (pd.DatetimeIndex(result.starttime) - pd.DatetimeIndex(result.user_eloc_lasttime)).total_seconds().values 322 | return result 323 | 324 | # ----------------- 排序 ------------------- 325 | 326 | # 获取用户到某个地点结束的距离排序 327 | def get_user_eloc_distance_rank(result): 328 | result = rank(result, ['userid', 'geohashed_end_loc'], 'distance', rank_name='user_eloc_distance_rank', ascending=False) 329 | return result 330 | 331 | # 获取用户从某个地点出发的距离排序 332 | def get_user_sloc_distance_rank(result): 333 | result = rank(result, ['userid', 'geohashed_start_loc'], 'distance', rank_name='user_sloc_distance_rank', ascending=False) 334 | return result 335 | 336 | # 获取用户各小时段到某个地点结束的距离排序 337 | def get_user_eloc_hour_distance_rank(result): 338 | result = rank(result, ['userid', 'geohashed_start_loc', 'hour'], 'distance', rank_name='user_eloc_hour_distance_rank', ascending=False) 339 | return result 340 | 341 | # 获取用户各小时段从某个地点出发的距离排序 342 | def get_user_sloc_hour_distance_rank(result): 343 | result = rank(result, ['userid', 'geohashed_end_loc', 'hour'], 'distance', rank_name='user_sloc_hour_distance_rank', ascending=False) 344 | return result 345 | 346 | # 获取用户出行时间的小时段排序 347 | def get_user_hour_rank(result): 348 | result = rank(result, 'userid', 'hour', rank_name='user_hour_rank', ascending=False) 349 | return result 350 | 351 | # 获取用户从某个地点出发的出行时间的小时段排序 352 | def get_user_sloc_hour_rank(result): 353 | result = rank(result, ['userid', 'geohashed_start_loc'], 'hour', rank_name='user_sloc_hour_rank', ascending=False) 354 | return result 355 | 356 | # 获取用户到某个地点结束的出行时间的小时段排序 357 | def get_user_eloc_hour_rank(result): 358 | result = rank(result, ['userid', 'geohashed_end_loc'], 'hour', rank_name='user_eloc_hour_rank', ascending=False) 359 | return result 360 | 361 | # 获取用户从某个地点出发到某个地点结束的出行时间的小时段排序 362 | def get_user_sloc_eloc_hour_rank(result): 363 | result = rank(result, ['userid', 'geohashed_start_loc', 'geohashed_end_loc'], 'hour', rank_name='user_sloc_eloc_hour_rank', ascending=False) # 5 364 | return result 365 | 366 | # ----------------- 差值 ------------------- 367 | 368 | # 获取实际距离与用户出行距离统计值的(绝对)差值 369 | def get_user_distance_stat_sub(result): 370 | result['user_distance_mean_sub'] = (result['distance'] - result['user_distance_mean']) 371 | result['user_distance_mean_sub_abs'] = (result['distance'] - result['user_distance_mean']).abs() 372 | result['user_manhattan_mean_sub'] = (result['manhattan'] - result['user_manhattan_mean']) 373 | result['user_manhattan_mean_sub_abs'] = (result['manhattan'] - result['user_manhattan_mean']).abs() 374 | return result 375 | 376 | # 获取实际距离与用户从某个点出发距离统计值的(绝对)差值 377 | def get_user_sloc_distance_stat_sub(result): 378 | result['user_sloc_distance_mean_sub'] = (result['distance'] - result['user_sloc_distance_mean']) 379 | result['user_sloc_distance_mean_sub_abs'] = (result['distance'] - result['user_sloc_distance_mean']).abs() 380 | result['user_sloc_manhattan_mean_sub'] = (result['manhattan'] - result['user_sloc_manhattan_mean']) 381 | result['user_sloc_manhattan_mean_sub_abs'] = (result['manhattan'] - result['user_sloc_manhattan_mean']).abs() 382 | return result 383 | 384 | # 获取实际距离与用户到某个点结束距离统计值的(绝对)差值 385 | def get_user_eloc_distance_stat_sub(result): 386 | result['user_eloc_distance_mean_sub'] = (result['distance'] - result['user_eloc_distance_mean']) 387 | result['user_eloc_distance_mean_sub_abs'] = (result['distance'] - result['user_eloc_distance_mean']).abs() 388 | return result 389 | 390 | # 获取实际距离与用户出行距离统计值的各小时段(绝对)差值 391 | def get_user_hour_distance_stat_sub(result): 392 | result['user_hour_distance_mean_sub'] = (result['distance'] - result['user_hour_distance_mean']) 393 | result['user_hour_distance_mean_sub_abs'] = (result['distance'] - result['user_hour_distance_mean']).abs() 394 | result['user_hour_manhattan_mean_sub'] = (result['manhattan'] - result['user_hour_manhattan_mean']) 395 | result['user_hour_manhattan_mean_sub_abs'] = (result['manhattan'] - result['user_hour_manhattan_mean']).abs() 396 | return result 397 | 398 | # 获取实际距离与用户从某个点出发距离统计值的各小时段(绝对)差值 399 | def get_user_sloc_hour_distance_stat_sub(result): 400 | result['user_sloc_hour_distance_mean_sub'] = (result['distance'] - result['user_sloc_hour_distance_mean']) 401 | result['user_sloc_hour_distance_mean_sub_abs'] = (result['distance'] - result['user_sloc_hour_distance_mean']).abs() 402 | result['user_sloc_hour_manhattan_mean_sub'] = (result['manhattan'] - result['user_sloc_hour_manhattan_mean']) 403 | result['user_sloc_hour_manhattan_mean_sub_abs'] = (result['manhattan'] - result['user_sloc_hour_manhattan_mean']).abs() 404 | return result 405 | 406 | # 获取实际距离与用户到某个点结束距离统计值的各小时段(绝对)差值 407 | def get_user_eloc_hour_distance_stat_sub(result): 408 | result['user_eloc_hour_distance_mean_sub'] = (result['distance'] - result['user_eloc_hour_distance_mean']) 409 | result['user_eloc_hour_distance_mean_sub_abs'] = (result['distance'] - result['user_eloc_hour_distance_mean']).abs() 410 | return result 411 | 412 | # 获取小时段与用户出行的小时段统计值的(绝对)差值 413 | def get_hour_user_hour_stat_sub(result): 414 | result['hour_user_hour_mean_sub'] = (result['hour'] - result['user_hour_mean']) 415 | result['hour_user_hour_mean_sub_abs'] = (result['hour'] - result['user_hour_mean']).abs() 416 | return result 417 | 418 | # 获取小时段与用户从某个地方出发的小时段统计值的(绝对)差值 419 | def get_hour_user_sloc_hour_stat_sub(result): 420 | result['hour_user_sloc_hour_mean_sub'] = (result['hour'] - result['user_sloc_hour_mean']) 421 | result['hour_user_sloc_hour_mean_sub_abs'] = (result['hour'] - result['user_sloc_hour_mean']).abs() 422 | return result 423 | 424 | # 获取小时段与用户到某个地方结束的小时段统计值的(绝对)差值 425 | def get_hour_user_eloc_hour_stat_sub(result): 426 | result['hour_user_eloc_hour_mean_sub'] = (result['hour'] - result['user_eloc_hour_mean']) 427 | result['hour_user_eloc_hour_mean_sub_abs'] = (result['hour'] - result['user_eloc_hour_mean']).abs() 428 | return result 429 | 430 | # 获取小时段与用户从某个地点出发到某个地方结束的小时段统计值的(绝对)差值 431 | def get_hour_user_sloc_eloc_hour_stat_sub(result): 432 | result['hour_user_sloc_eloc_hour_mean_sub'] = (result['hour'] - result['user_sloc_eloc_hour_mean']) 433 | result['hour_user_sloc_eloc_hour_mean_sub_abs'] = (result['hour'] - result['user_sloc_eloc_hour_mean']).abs() 434 | return result 435 | 436 | # ----------------- 比例 ------------------- 437 | 438 | # 获取全局中用户目的地出现在出发地中的个数占用户出行次数的比例 439 | def get_global_user_sloc_count_ratio(result): 440 | train = pd.read_csv('../../MOBIKE_CUP_2017/train.csv') 441 | test = pd.read_csv('../../MOBIKE_CUP_2017/test.csv') 442 | train = pd.concat([train, test]) 443 | user_sloc_count = train.groupby(['userid','geohashed_start_loc'])['userid'].agg({'global_user_sloc_count_ratio': 'count'}) 444 | user_count = train.groupby(['userid'])['userid'].agg({'global_user_sloc_count_ratio': 'count'}) 445 | user_sloc_count = user_sloc_count.div(user_count).reset_index() 446 | user_sloc_count.rename(columns={'geohashed_start_loc':'geohashed_end_loc'},inplace=True) 447 | result = pd.merge(result, user_sloc_count, on=['userid', 'geohashed_end_loc'], how='left') 448 | return result 449 | 450 | # 获取用户到某个目的地的个数占用户出行总数的比例 451 | def get_user_eloc_count_ratio(result): 452 | result['user_eloc_count_ratio'] = result['user_eloc_count'] / result['user_count'] 453 | return result -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | import pandas as pd 3 | import numpy as np 4 | from sklearn.model_selection import train_test_split 5 | import lightgbm as lgb 6 | import pickle 7 | import datetime 8 | import fire 9 | import gc 10 | import warnings 11 | warnings.filterwarnings('ignore') 12 | 13 | from config import DefaultConfig 14 | from dataset import get_train_data, get_test_data, get_sample 15 | from feature import get_feat 16 | from utils import get_label, get_score, load_model, predict, rank 17 | 18 | def train(**kwargs): 19 | 20 | # ---------------------- 更新参数 ---------------------- 21 | opt = DefaultConfig() 22 | opt.update(**kwargs) 23 | opt.printf() 24 | 25 | # ---------------------- 数据处理 ---------------------- 26 | 27 | # 获取数据 28 | # train1, train2 = get_train_data(opt) 29 | # 获取样本 30 | # train_sample = get_sample(train1, train2, load=True) 31 | # 获取特征 32 | # train_feat = get_feat(train1, train_sample) 33 | # 获取标签 34 | # train_all = get_label(train_feat, opt) 35 | # gc.collect() 36 | 37 | # train_all.to_hdf('/home/xuwenchao/dyj-storage/all-feat/feat_23_24_label.hdf', 'w', complib='blosc', complevel=5) 38 | train_all = pd.read_hdf('/home/xuwenchao/dyj-storage/all-feat/feat_23_24_label.hdf') 39 | print(train_all.shape) 40 | 41 | # 取出需要用的特征 42 | # opt['model_name'] = 'lgb_1_2017-09-15#19:50:48_0.58820.pkl' 43 | # gbm, use_feat = load_model(opt) 44 | # predictors_100 = pd.DataFrame(data={'feature_name': gbm.feature_name(), 'feature_importance': gbm.feature_importance()}) 45 | # predictors_100 = predictors_100.sort_values(by=['feature_importance'], ascending=False)['feature_name'].values[:100] 46 | # use_feat = list(predictors_100) + ['orderid', 'geohashed_end_loc', 'label'] + ['sloc_eloc_common_eloc_count', 'sloc_eloc_common_sloc_count', 'sloc_eloc_common_conn1_count', 'sloc_eloc_common_conn2_count', 'sloc_eloc_common_eloc_rate', 'sloc_eloc_common_sloc_rate', 'sloc_eloc_common_conn1_rate', 'sloc_eloc_common_conn2_rate', 'user_sloc_eloc_common_eloc_count', 'user_sloc_eloc_common_sloc_count', 'user_sloc_eloc_common_conn1_count', 'user_sloc_eloc_common_conn2_count', 'user_sloc_eloc_common_eloc_rate', 'user_sloc_eloc_common_sloc_rate', 'user_sloc_eloc_common_conn1_rate', 'user_sloc_eloc_common_conn2_rate'] 47 | # train_all = train_all[use_feat] 48 | # gc.collect() 49 | 50 | # -------------------- 训练第一层 ------------------------ 51 | 52 | # ********* 准备数据 ********** 53 | # 划分验证集 54 | train, val = train_test_split(train_all, test_size=0.1) 55 | # 定义使用哪些特征 56 | # opt['model_name'] = 'lgb_1_2017-09-15#19:50:48_0.58820.pkl' 57 | # gbm, use_feat = load_model(opt) 58 | filters = set(['orderid', 'userid', 'biketype', 'geohashed_start_loc', 'bikeid', 'starttime', 'geohashed_end_loc', 'label']) 59 | predictors = list(filter(lambda x: x not in filters, train_all.columns.tolist())) 60 | # predictors = pd.DataFrame(data={'feature_name': gbm.feature_name(), 'feature_importance': gbm.feature_importance()}) 61 | # predictors = predictors.sort_values(by=['feature_importance'], ascending=False)['feature_name'].values[:100] 62 | # use_feat = list(predictors) + ['orderid', 'geohashed_end_loc'] + ['sloc_eloc_common_eloc_count', 'sloc_eloc_common_sloc_count', 'sloc_eloc_common_conn1_count', 'sloc_eloc_common_conn2_count', 'sloc_eloc_common_eloc_rate', 'sloc_eloc_common_sloc_rate', 'sloc_eloc_common_conn1_rate', 'sloc_eloc_common_conn2_rate', 'user_sloc_eloc_common_eloc_count', 'user_sloc_eloc_common_sloc_count', 'user_sloc_eloc_common_conn1_count', 'user_sloc_eloc_common_conn2_count', 'user_sloc_eloc_common_eloc_rate', 'user_sloc_eloc_common_sloc_rate', 'user_sloc_eloc_common_conn1_rate', 'user_sloc_eloc_common_conn2_rate'] 63 | # predictors = list(predictors_100) + ['sloc_eloc_common_eloc_count', 'sloc_eloc_common_sloc_count', 'sloc_eloc_common_conn1_count', 'sloc_eloc_common_conn2_count', 'sloc_eloc_common_eloc_rate', 'sloc_eloc_common_sloc_rate', 'sloc_eloc_common_conn1_rate', 'sloc_eloc_common_conn2_rate', 'user_sloc_eloc_common_eloc_count', 'user_sloc_eloc_common_sloc_count', 'user_sloc_eloc_common_conn1_count', 'user_sloc_eloc_common_conn2_count', 'user_sloc_eloc_common_eloc_rate', 'user_sloc_eloc_common_sloc_rate', 'user_sloc_eloc_common_conn1_rate', 'user_sloc_eloc_common_conn2_rate'] 64 | print('使用的特征:{}维\n'.format(len(predictors)), predictors) 65 | # 定义数据集 66 | X_train = train[predictors] 67 | y_train = train['label'] 68 | X_val = val[predictors] 69 | y_val = val['label'] 70 | del train, val 71 | gc.collect() 72 | 73 | # ********* LightGBM ********* 74 | # 数据集 75 | lgb_train = lgb.Dataset(X_train, y_train) 76 | lgb_val = lgb.Dataset(X_val, y_val, reference=lgb_train) 77 | # 配置 78 | params = { 79 | 'objective': 'binary', 80 | 'metric': {'auc', 'binary_logloss'}, 81 | 'is_unbalance': True, 82 | 'num_leaves': opt['lgb_leaves'], 83 | 'learning_rate': opt['lgb_lr'], 84 | 'feature_fraction': 0.886, 85 | 'bagging_fraction': 0.886, 86 | 'bagging_freq': 5 87 | } 88 | gc.collect() 89 | # ********** 开始训练 ********* 90 | gbm1 = lgb.train( 91 | params, 92 | lgb_train, 93 | num_boost_round=1200, 94 | valid_sets=[lgb_train, lgb_val], 95 | early_stopping_rounds=5 96 | ) 97 | gc.collect() 98 | 99 | # # ********* 保存模型 ********* 100 | 101 | cur_time = datetime.datetime.now().strftime('%Y-%m-%d#%H:%M:%S') 102 | # save_path = '{}/{}_{}_{:.5f}.pkl'.format(opt['model_dir'], 'lgb', cur_time, score[0]) 103 | save_path = '{}/{}_{}.pkl'.format(opt['model_dir'], 'lgb', cur_time) 104 | with open(save_path, 'wb') as fout: 105 | pickle.dump(gbm1, fout) 106 | print('保存模型:', save_path) 107 | gc.collect() 108 | 109 | # # ********* 评估 ********* 110 | 111 | # # 在训练集上看效果 112 | del X_train, y_train, X_val, y_val 113 | gc.collect() 114 | score = get_score(train_all, predictors, gbm1, opt) 115 | print('训练集分数:{}'.format(score)) 116 | 117 | import sys 118 | sys.exit(0) 119 | 120 | # save_path = '{}/{}.pkl'.format(opt['model_dir'], 'lgb_1_300_top25') 121 | # with open(save_path, 'wb') as fout: 122 | # pickle.dump(gbm1, fout) 123 | # print('保存模型(第一层):', save_path) 124 | 125 | # ********* save predict ***** 126 | 127 | # train_all[['orderid', 'geohashed_end_loc', 'pred']].to_hdf('/home/xuwenchao/dyj-storage/train2324_80_pred_res.hdf', 'w', complib='blosc', complevel=5) 128 | # print('Save train_pred_res.hdf successful!!!') 129 | 130 | # import sys 131 | # sys.exit(0) 132 | 133 | # -------------------- 训练第二层 ------------------------ 134 | 135 | # opt['model_name'] = 'lgb_1_300_top25.pkl' 136 | # gbm1, use_feat1 = load_model(opt) 137 | # train_all.loc[:, 'pred'] = gbm1.predict(train_all[use_feat1]) 138 | 139 | # 去掉重要性较低的特征,筛选出排名前十的候选样本,重新训练模型(后期可以载入模型finetune,尤其是对于样本量较少的情况,甚至可以选前5,但15可以覆盖99.5%的原始label,10可以覆盖98%的原始label,这两者可能会好一些,备选方案:5(+finetune),10(+finetune),15(+finetune)) 140 | predictors = pd.DataFrame(data={'feature_name': gbm1.feature_name(), 'feature_importance': gbm1.feature_importance()}) 141 | predictors = predictors[predictors['feature_importance']>0]['feature_name'].values 142 | print('第二层使用的特征:{}维\n'.format(len(predictors)), predictors) 143 | train_all = train_all.sort_values(by=['orderid', 'pred'], ascending=False).groupby('orderid').head(15) 144 | # train_all = rank(train_all, 'orderid', 'pred', ascending=False) 145 | del train_all['pred'] 146 | print('第二层数据:', train_all.shape) 147 | 148 | # ********* 准备数据 ********** 149 | # 划分验证集 150 | train, val = train_test_split(train_all, test_size=0.1) 151 | 152 | # 定义数据集 153 | X_train = train[predictors] 154 | y_train = train['label'] 155 | X_val = val[predictors] 156 | y_val = val['label'] 157 | del train, val 158 | gc.collect() 159 | 160 | # 数据集 161 | lgb_train = lgb.Dataset(X_train, y_train) 162 | lgb_val = lgb.Dataset(X_val, y_val, reference=lgb_train) 163 | 164 | # ********** 开始训练 ********* 165 | gbm2 = lgb.train( 166 | params, 167 | lgb_train, 168 | num_boost_round=1200, 169 | valid_sets=[lgb_train, lgb_val], 170 | early_stopping_rounds=5 171 | # init_model=gbm1 # finetune 172 | ) 173 | 174 | # ********* 评估 ********* 175 | 176 | # 在训练集上看效果 177 | score = get_score(train_all, predictors, gbm2, opt) 178 | print('训练集分数(第二层):{}'.format(score)) 179 | 180 | # ********* 保存模型 ********* 181 | 182 | cur_time = datetime.datetime.now().strftime('%Y-%m-%d#%H:%M:%S') 183 | save_path = '{}/{}_{}_{:.5f}.pkl'.format(opt['model_dir'], 'lgb_2', cur_time, score[0]) 184 | with open(save_path, 'wb') as fout: 185 | pickle.dump(gbm2, fout) 186 | print('保存模型(第二层):', save_path) 187 | # save_path = '{}/{}.pkl'.format(opt['model_dir'], 'lgb_2_300_top15') 188 | # with open(save_path, 'wb') as fout: 189 | # pickle.dump(gbm2, fout) 190 | # print('保存模型(第二层):', save_path) 191 | 192 | import sys 193 | sys.exit(0) 194 | 195 | # -------------------- 训练第三层 ------------------------ 196 | 197 | # 筛选出排名前五的候选样本 198 | predictors = pd.DataFrame(data={'feature_name': gbm2.feature_name(), 'feature_importance': gbm2.feature_importance()}) 199 | predictors = predictors[predictors['feature_importance']>0]['feature_name'].values 200 | print('第三层使用的特征:{}维\n'.format(len(predictors)), predictors) 201 | train_all = train_all.sort_values(by=['orderid', 'pred'], ascending=False).groupby('orderid').head(10) 202 | # train_all = rank(train_all, 'orderid', 'pred', ascending=False) 203 | del train_all['pred'] 204 | print('第三层数据:', train_all.shape) 205 | 206 | # ********* 准备数据 ********** 207 | # 划分验证集 208 | train, val = train_test_split(train_all, test_size=0.1) 209 | 210 | # 定义数据集 211 | X_train = train[predictors] 212 | y_train = train['label'] 213 | X_val = val[predictors] 214 | y_val = val['label'] 215 | del train, val 216 | gc.collect() 217 | 218 | # 数据集 219 | lgb_train = lgb.Dataset(X_train, y_train) 220 | lgb_val = lgb.Dataset(X_val, y_val, reference=lgb_train) 221 | 222 | # ********** 开始训练 ********* 223 | gbm3 = lgb.train( 224 | params, 225 | lgb_train, 226 | num_boost_round=1200, 227 | valid_sets=[lgb_train, lgb_val], 228 | early_stopping_rounds=5 229 | # init_model=gbm2 # finetune 230 | ) 231 | 232 | # ********* 评估 ********* 233 | 234 | # 在训练集上看效果 235 | score = get_score(train_all, predictors, gbm3, opt) 236 | print('训练集分数(第三层):{}'.format(score)) 237 | 238 | # ********* 保存模型 ********* 239 | 240 | cur_time = datetime.datetime.now().strftime('%Y-%m-%d#%H:%M:%S') 241 | save_path = '{}/{}_{}_{:.5f}.pkl'.format(opt['model_dir'], 'lgb_3', cur_time, score[0]) 242 | with open(save_path, 'wb') as fout: 243 | pickle.dump(gbm3, fout) 244 | print('保存模型(第三层):', save_path) 245 | save_path = '{}/{}.pkl'.format(opt['model_dir'], 'lgb_3_300_top10') 246 | with open(save_path, 'wb') as fout: 247 | pickle.dump(gbm3, fout) 248 | print('保存模型(第三层):', save_path) 249 | 250 | 251 | # -------------------- 训练第四层 ------------------------ 252 | 253 | # 筛选出排名前三的候选样本 254 | predictors = pd.DataFrame(data={'feature_name': gbm3.feature_name(), 'feature_importance': gbm3.feature_importance()}) 255 | predictors = predictors[predictors['feature_importance']>0]['feature_name'].values 256 | print('第四层使用的特征:{}维\n'.format(len(predictors)), predictors) 257 | train_all = train_all.sort_values(by=['orderid', 'pred'], ascending=False).groupby('orderid').head(5) 258 | # train_all = rank(train_all, 'orderid', 'pred', ascending=False) 259 | del train_all['pred'] 260 | print('第四层数据:', train_all.shape) 261 | 262 | # ********* 准备数据 ********** 263 | # 划分验证集 264 | train, val = train_test_split(train_all, test_size=0.1) 265 | 266 | # 定义数据集 267 | X_train = train[predictors] 268 | y_train = train['label'] 269 | X_val = val[predictors] 270 | y_val = val['label'] 271 | del train, val 272 | gc.collect() 273 | 274 | # 数据集 275 | lgb_train = lgb.Dataset(X_train, y_train) 276 | lgb_val = lgb.Dataset(X_val, y_val, reference=lgb_train) 277 | 278 | # ********** 开始训练 ********* 279 | gbm4 = lgb.train( 280 | params, 281 | lgb_train, 282 | num_boost_round=1200, 283 | valid_sets=[lgb_train, lgb_val], 284 | early_stopping_rounds=5 285 | # init_model=gbm3 # finetune 286 | ) 287 | 288 | # ********* 评估 ********* 289 | 290 | # 在训练集上看效果 291 | score = get_score(train_all, predictors, gbm4, opt) 292 | print('训练集分数(第四层):{}'.format(score)) 293 | 294 | # ********* 保存模型 ********* 295 | 296 | cur_time = datetime.datetime.now().strftime('%Y-%m-%d#%H:%M:%S') 297 | save_path = '{}/{}_{}_{:.5f}.pkl'.format(opt['model_dir'], 'lgb_4', cur_time, score[0]) 298 | with open(save_path, 'wb') as fout: 299 | pickle.dump(gbm4, fout) 300 | print('保存模型(第四层):', save_path) 301 | save_path = '{}/{}.pkl'.format(opt['model_dir'], 'lgb_4_300_top5') 302 | with open(save_path, 'wb') as fout: 303 | pickle.dump(gbm4, fout) 304 | print('保存模型(第四层):', save_path) 305 | 306 | def val(**kwargs): 307 | 308 | # ---------------------- 更新参数 ---------------------- 309 | opt = DefaultConfig() 310 | opt.update(**kwargs) 311 | opt.printf() 312 | 313 | # ---------------------- 数据处理 ---------------------- 314 | 315 | # 获取数据 316 | # train1, train2, train_test = get_train_data(opt) 317 | # 获取样本 318 | # train_sample = get_sample(train1, train2, load=True) 319 | # 获取特征 320 | # train_feat = get_feat(train_test, train_sample) 321 | # gc.collect() 322 | 323 | # train_feat.to_hdf('/home/xuwenchao/dyj-storage/all-feat/feat_{}.hdf'.format(opt['startday']), 'w', complib='blosc', complevel=5) 324 | train_feat = pd.read_hdf('/home/xuwenchao/dyj-storage/all-feat/feat_23_24_label.hdf') 325 | 326 | # ---------------------- 载入模型 ---------------------- 327 | 328 | # opt['model_name'] = 'lgb_1_90_all.pkl' 329 | # gbm0, use_feat0 = load_model(opt) 330 | opt['model_name'] = 'lgb_1_2017-09-15#19:50:48_0.58820.pkl' 331 | gbm, use_feat = load_model(opt) 332 | opt['model_name'] = 'lgb_2017-09-23#20:14:52_0.58893.pkl' 333 | gbm1, use_feat1 = load_model(opt) 334 | # gbm2, use_feat2 = load_model(opt) 335 | # opt['model_name'] = 'lgb_2017-09-03#23:24:26_0.57836.pkl' 336 | # gbm3, use_feat3 = load_model(opt) 337 | # opt['model_name'] = '' 338 | # gbm4, use_feat4 = load_model(opt) 339 | 340 | # ---------------------- 评估 ------------------------- 341 | 342 | train_feat.loc[:, 'pred'] = gbm.predict(train_feat[use_feat]) 343 | gc.collect() 344 | train_feat[['orderid', 'geohashed_end_loc', 'pred']].to_csv('/home/xuwenchao/dyj-storage/pred/pred_23_24_0.58820.csv', index=None) 345 | train_feat.loc[:, 'pred'] = gbm1.predict(train_feat[use_feat1]) 346 | gc.collect() 347 | train_feat[['orderid', 'geohashed_end_loc', 'pred']].to_csv('/home/xuwenchao/dyj-storage/pred/pred_23_24_0.58893.csv', index=None) 348 | # train_feat = train_feat.sort_values(by=['orderid', 'pred'], ascending=False).groupby('orderid').head(25) 349 | # train_feat[['orderid', 'geohashed_end_loc']].to_hdf('/home/xuwenchao/dyj-storage/sample_25_{}.hdf'.format(train.shape[0]), 'w', complib='blosc', complevel=5) 350 | # gc.collect() 351 | 352 | # score = get_score(train_feat, use_feat, gbm, opt) 353 | # print('day{}分数:{}'.format(opt['startday'], score)) 354 | # score = get_score(train_feat, use_feat1, gbm1, opt) 355 | # print('day{}分数:{}'.format(opt['startday'], score)) 356 | # train_feat = train_feat.sort_values(by=['orderid', 'pred'], ascending=False).groupby('orderid').head(15) 357 | # score = get_score(train_feat, use_feat1, gbm1, opt) 358 | # print('day{}分数:{}'.format(opt['startday'], score)) 359 | # score1 = get_score(train_feat, use_feat1, gbm1, opt) 360 | # print('day{}分数: {}'.format(opt['startday'], score1)) 361 | # score2 = get_score(train_feat, use_feat2, gbm2, opt) 362 | # print('day{}分数: {}'.format(opt['startday'], score2)) 363 | # score3 = get_score(train_feat, use_feat3, gbm3, opt) 364 | # print('day{}分数: {}'.format(opt['startday'], score3)) 365 | # score4 = get_score(train_feat, use_feat4, gbm4, opt) 366 | # print('day{}分数: {}'.format(opt['startday'], score4)) 367 | 368 | def test(**kwargs): 369 | 370 | # ---------------------- 更新参数 ---------------------- 371 | opt = DefaultConfig() 372 | opt.update(**kwargs) 373 | opt.printf() 374 | 375 | # ---------------------- 数据处理 ---------------------- 376 | 377 | # 获取数据 378 | train, test = get_test_data(opt) 379 | gc.collect() 380 | # # 获取样本 381 | # test_sample = get_sample(train, test, load=True) 382 | # gc.collect() 383 | # # 获取特征 384 | # test_feat = get_feat(train, test_sample) 385 | # gc.collect() 386 | 387 | # 保存特征至文件 388 | # test_feat.to_hdf('/home/xuwenchao/dyj-storage/all-feat/feat_{}.hdf'.format(test.shape[0]), 'w', complib='blosc', complevel=5) 389 | test_feat = pd.read_hdf('/home/xuwenchao/dyj-storage/all-feat/feat_{}.hdf'.format(test.shape[0])) 390 | test_feat = get_feat(train, test_feat) 391 | gc.collect() 392 | test_feat.to_hdf('/home/xuwenchao/dyj-storage/all-feat/feat_{}_filter.hdf'.format(test.shape[0]), 'w', complib='blosc', complevel=5) 393 | 394 | # ---------------------- 载入模型 ---------------------- 395 | 396 | # opt['model_name'] = 'lgb_1_90_all.pkl' 397 | # gbm0, use_feat0 = load_model(opt) 398 | opt['model_name'] = 'lgb_2017-09-23#20:14:52_0.58893.pkl' 399 | gbm1, use_feat1 = load_model(opt) 400 | # opt['model_name'] = 'lgb_2_300_top15.pkl' 401 | # gbm2, use_feat2 = load_model(opt) 402 | # opt['model_name'] = 'lgb_3_300_top10.pkl' 403 | # gbm3, use_feat3 = load_model(opt) 404 | # opt['model_name'] = 'lgb_4_300_top5.pkl' 405 | # gbm4, use_feat4 = load_model(opt) 406 | 407 | # ---------------------- 保存预测结果 ------------------- 408 | 409 | # test_feat.loc[:, 'pred'] = gbm0.predict(test_feat[use_feat0]) 410 | # gc.collect() 411 | # res = test_feat[['orderid', 'geohashed_end_loc', 'pred']].sort_values(by=['orderid', 'pred'], ascending=False).groupby('orderid').head(25) 412 | # res[['orderid', 'geohashed_end_loc']].to_hdf('/home/xuwenchao/dyj-storage/sample_25_{}_filter_leak_sample.hdf'.format(test.shape[0]), 'w', complib='blosc', complevel=5) 413 | # gc.collect() 414 | 415 | # test_feat.loc[:, 'pred'] = gbm1.predict(test_feat[use_feat1]) 416 | # test_feat[['orderid', 'geohashed_end_loc', 'pred']].to_hdf('/home/xuwenchao/dyj-storage/pred/pred_{}_0.58820.hdf'.format(test.shape[0]), 'w', complib='blosc', complevel=5) 417 | 418 | res = predict(test_feat, use_feat1, gbm1) 419 | test_feat[['orderid', 'geohashed_end_loc', 'pred']].to_hdf('/home/xuwenchao/dyj-storage/pred/pred_{}_0.58893.hdf'.format(test.shape[0]), 'w', complib='blosc', complevel=5) 420 | gc.collect() 421 | cur_time = datetime.datetime.now().strftime('%Y-%m-%d#%H:%M:%S') 422 | res_path = '{}/day{}_{}_wc_sample_0.58893.csv'.format(opt['result_dir'], opt['test_startday'], cur_time) 423 | res.to_csv(res_path, index=False) 424 | print('保存测试结果至:', res_path) 425 | # test_feat = test_feat.sort_values(by=['orderid', 'pred'], ascending=False).groupby('orderid').head(15) 426 | # del test_feat['pred'] 427 | # gc.collect() 428 | 429 | # res = predict(test_feat, use_feat2, gbm2) 430 | # gc.collect() 431 | # test_feat = test_feat.sort_values(by=['orderid', 'pred'], ascending=False).groupby('orderid').head(10) 432 | # del test_feat['pred'] 433 | # gc.collect() 434 | 435 | # res = predict(test_feat, use_feat3, gbm3) 436 | # gc.collect() 437 | # test_feat = test_feat.sort_values(by=['orderid', 'pred'], ascending=False).groupby('orderid').head(5) 438 | # del test_feat['pred'] 439 | # gc.collect() 440 | 441 | # res = predict(test_feat, use_feat4, gbm4) 442 | # gc.collect() 443 | # cur_time = datetime.datetime.now().strftime('%Y-%m-%d#%H:%M:%S') 444 | # res_path = '{}/day{}_{}_5.csv'.format(opt['result_dir'], opt['test_startday'], cur_time) 445 | # res.to_csv(res_path, index=False) 446 | # print('保存测试结果至:', res_path) 447 | 448 | if __name__ == '__main__': 449 | fire.Fire() -------------------------------------------------------------------------------- /submit.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | import pandas as pd 3 | import datetime 4 | 5 | result_dir = '../result' 6 | 7 | test25 = pd.read_csv(result_dir + '/day25_2017-09-24#05:02:10_wc_sample_0.58893.csv') 8 | test26 = pd.read_csv(result_dir + '/day26_2017-09-24#09:27:13_wc_sample_0.58893.csv') 9 | test27 = pd.read_csv(result_dir + '/day27_2017-09-24#16:49:10_wc_sample_0.58893.csv') 10 | test28 = pd.read_csv(result_dir + '/day28_2017-09-24#20:18:04_wc_sample_0.58893.csv') 11 | test29 = pd.read_csv(result_dir + '/day29_2017-09-24#23:26:19_wc_sample_0.58893.csv') 12 | test30 = pd.read_csv(result_dir + '/day30_2017-09-25#02:02:54_wc_sample_0.58893.csv') 13 | test31 = pd.read_csv(result_dir + '/day31_2017-09-25#08:46:58_wc_sample_0.58893.csv') 14 | 15 | # 生成全部测试结果 16 | test = pd.read_csv('../../MOBIKE_CUP_2017/test.csv') 17 | res = pd.concat([test25, test26, test27, test28, test29, test30, test31]) 18 | print(res.shape) 19 | res = pd.merge(test[['orderid']], res, on='orderid', how='left') 20 | res.fillna('0', inplace=True) 21 | 22 | # 生成提交文件 23 | cur_time = datetime.datetime.now().strftime('%Y-%m-%d#%H:%M:%S') 24 | res_path = '{}/result_{}_{}.csv'.format(result_dir, '0.58893_wc_sample', cur_time) 25 | res.to_csv(res_path, header=False, index=False) 26 | print('保存提交结果至:', res_path) -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .helper import * 2 | from .eval import get_score, predict, get_label 3 | -------------------------------------------------------------------------------- /utils/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Magic-Bubble/Mobike/7492d9ac7e05a22a61c2435a24d14a15387ccaf2/utils/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /utils/__pycache__/eval.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Magic-Bubble/Mobike/7492d9ac7e05a22a61c2435a24d14a15387ccaf2/utils/__pycache__/eval.cpython-35.pyc -------------------------------------------------------------------------------- /utils/__pycache__/helper.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Magic-Bubble/Mobike/7492d9ac7e05a22a61c2435a24d14a15387ccaf2/utils/__pycache__/helper.cpython-35.pyc -------------------------------------------------------------------------------- /utils/__pycache__/label.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Magic-Bubble/Mobike/7492d9ac7e05a22a61c2435a24d14a15387ccaf2/utils/__pycache__/label.cpython-35.pyc -------------------------------------------------------------------------------- /utils/eval.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | import os 3 | import pandas as pd 4 | import numpy as np 5 | import pickle 6 | from .helper import rank 7 | 8 | # 获取真实标签 9 | def get_label(data, opt): 10 | result_path = opt['cache_dir'] + '/true.pkl' 11 | if os.path.exists(result_path): 12 | true = pickle.load(open(result_path, 'rb+')) 13 | else: 14 | train = pd.read_csv(opt['train_csv']) 15 | test = pd.read_csv(opt['test_csv']) 16 | test['geohashed_end_loc'] = np.nan 17 | data_all = pd.concat([train, test]) 18 | true = dict(zip(data_all['orderid'].values, data_all['geohashed_end_loc'])) 19 | pickle.dump(true, open(result_path, 'wb+')) 20 | data['label'] = data['orderid'].map(true) 21 | if data.get('geohashed_end_loc', None) is not None: 22 | data['label'] = (data['label'] == data['geohashed_end_loc']).astype('int') 23 | return data 24 | 25 | # 整合预测结果 26 | def reshape(pred): 27 | result = pred[["orderid", "pred", "geohashed_end_loc"]].copy() 28 | result = rank(result, 'orderid', 'pred', ascending=False) 29 | result = result[result['rank']<3][['orderid', 'geohashed_end_loc', 'rank']] 30 | result = result.set_index(['orderid', 'rank']).unstack() 31 | result.reset_index(inplace=True) 32 | result.columns = ['orderid', 0, 1, 2] 33 | return result 34 | 35 | # 评估函数 36 | def map_score(result): 37 | ''' 38 | result: orderid, 0, 1, 2, label 39 | ''' 40 | data = result.copy() 41 | acc1 = sum(data['label'] == data[0]) # 第一个位置上正确的个数 42 | acc2 = sum(data['label'] == data[1]) # 第二个位置上正确的个数 43 | acc3 = sum(data['label'] == data[2]) # 第三个位置上正确的个数 44 | score = (acc1+acc2/2+acc3/3)/data.shape[0] 45 | return score, acc1, acc2, acc3, data.shape[0] 46 | 47 | # 预测结果 48 | def predict(data, feat, model): 49 | data.loc[:, 'pred'] = model.predict(data[feat]) 50 | res = reshape(data) 51 | res.fillna('0', inplace=True) 52 | return res 53 | 54 | # 获取分数 55 | def get_score(data, feat, model, opt): 56 | res = predict(data, feat, model) 57 | res = get_label(res, opt) 58 | score = map_score(res) 59 | return score 60 | -------------------------------------------------------------------------------- /utils/helper.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | import numpy as np 3 | import pandas as pd 4 | import pickle 5 | 6 | # 计算相差的分钟数 7 | def diff_of_minutes(time1, time2): 8 | d = {'5': 0, '6': 31, } 9 | try: 10 | days = (d[time1[6]] + int(time1[8:10])) - (d[time2[6]] + int(time2[8:10])) 11 | try: 12 | minutes1 = int(time1[11:13]) * 60 + int(time1[14:16]) 13 | except: 14 | minutes1 = 0 15 | try: 16 | minutes2 = int(time2[11:13]) * 60 + int(time2[14:16]) 17 | except: 18 | minutes2 = 0 19 | return (days * 1440 - minutes2 + minutes1) 20 | except: 21 | return np.nan 22 | 23 | # 计算两点之间距离 24 | def haversine(lat1, lng1, lat2, lng2): 25 | """function to calculate haversine distance between two co-ordinates""" 26 | lat1, lng1, lat2, lng2 = map(np.radians, (lat1, lng1, lat2, lng2)) 27 | AVG_EARTH_RADIUS = 6371 # in km 28 | lat = lat2 - lat1 29 | lng = lng2 - lng1 30 | d = np.sin(lat * 0.5) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(lng * 0.5) ** 2 31 | h = 2 * AVG_EARTH_RADIUS * np.arcsin(np.sqrt(d)) 32 | return(h) 33 | 34 | def manhattan(lat1, lng1, lat2, lng2): 35 | """function to calculate manhatten distance between pick_drop""" 36 | a = haversine(lat1, lng1, lat1, lng2) 37 | b = haversine(lat1, lng1, lat2, lng1) 38 | return a + b 39 | 40 | # 计算两个经纬度之间的距离 41 | def cal_distance(lat1,lon1,lat2,lon2): 42 | dx = np.abs(lon1 - lon2) 43 | dy = np.abs(lat1 - lat2) 44 | b = (lat1 + lat2) / 2.0 45 | Lx = 6371004.0 * (dx / 57.2958) * np.cos(b / 57.2958) 46 | Ly = 6371004.0 * (dy / 57.2958) 47 | L = (Lx**2 + Ly**2) ** 0.5 48 | return L 49 | 50 | # 计算两个经纬度之间的方向角 51 | def bearing_array(lat1, lng1, lat2, lng2): 52 | """ function was taken from beluga's notebook as this function works on array 53 | while my function used to work on individual elements and was noticably slow""" 54 | AVG_EARTH_RADIUS = 6371 # in km 55 | lng_delta_rad = np.radians(lng2 - lng1) 56 | lat1, lng1, lat2, lng2 = map(np.radians, (lat1, lng1, lat2, lng2)) 57 | y = np.sin(lng_delta_rad) * np.cos(lat2) 58 | x = np.cos(lat1) * np.sin(lat2) - np.sin(lat1) * np.cos(lat2) * np.cos(lng_delta_rad) 59 | return np.degrees(np.arctan2(y, x)) 60 | 61 | # 分组排序 62 | def rank(data, feat1, feat2, rank_name='rank', ascending=True): 63 | if type(feat1) == list: feat = feat1 + [feat2] 64 | else: feat = [feat1, feat2] 65 | use_feat = list(set(feat + ['orderid', 'geohashed_end_loc'])) 66 | datatmp = data[use_feat] 67 | datatmp.sort_values(feat, inplace=True, ascending=ascending) 68 | datatmp[rank_name] = range(datatmp.shape[0]) 69 | min_rank = datatmp.groupby(feat1, as_index=False)[rank_name].agg({'min_rank': 'min'}) 70 | datatmp = pd.merge(datatmp, min_rank, on=feat1, how='left') 71 | datatmp[rank_name] = datatmp[rank_name] - datatmp['min_rank'] 72 | data = pd.merge(data, datatmp[['orderid', 'geohashed_end_loc', rank_name]], on=['orderid', 'geohashed_end_loc'], how='left') 73 | # del data['min_rank'] 74 | return data 75 | 76 | # 载入模型 77 | def load_model(opt): 78 | with open('{}/{}'.format(opt['model_dir'], opt['model_name']), 'rb') as fin: 79 | gbm = pickle.load(fin) 80 | use_feat = gbm.feature_name() 81 | print('载入模型成功:', len(use_feat), use_feat) 82 | return gbm, use_feat 83 | --------------------------------------------------------------------------------