├── README.md
├── __pycache__
    └── config.cpython-35.pyc
├── config.py
├── dataset
    ├── __init__.py
    ├── __pycache__
    │   ├── __init__.cpython-35.pyc
    │   ├── data.cpython-35.pyc
    │   └── sample.cpython-35.pyc
    ├── data.py
    └── sample.py
├── feature
    ├── __init__.py
    ├── __pycache__
    │   ├── __init__.cpython-35.pyc
    │   ├── filter.cpython-35.pyc
    │   ├── latlon.cpython-35.pyc
    │   ├── leak.cpython-35.pyc
    │   ├── location.cpython-35.pyc
    │   ├── main.cpython-35.pyc
    │   ├── other.cpython-35.pyc
    │   ├── rule.cpython-35.pyc
    │   └── user.cpython-35.pyc
    ├── filter.py
    ├── latlon.py
    ├── leak.py
    ├── location.py
    ├── main.py
    ├── other.py
    ├── rule.py
    └── user.py
├── main.py
├── submit.py
└── utils
    ├── __init__.py
    ├── __pycache__
        ├── __init__.cpython-35.pyc
        ├── eval.cpython-35.pyc
        ├── helper.cpython-35.pyc
        └── label.cpython-35.pyc
    ├── eval.py
    └── helper.py


/README.md:
--------------------------------------------------------------------------------
1 | # Mobike
2 | 摩拜杯 算法挑战赛 第三名 解决方案
3 | 
4 | 代码使用fire设置入口，如：python3 main.py train
5 | 


--------------------------------------------------------------------------------
/__pycache__/config.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Magic-Bubble/Mobike/7492d9ac7e05a22a61c2435a24d14a15387ccaf2/__pycache__/config.cpython-35.pyc


--------------------------------------------------------------------------------
/config.py:
--------------------------------------------------------------------------------
 1 | class DefaultConfig(dict):
 2 | 	def __init__(self):
 3 | 		# ------------ 数据路径 ------------
 4 | 		
 5 | 		self['data_dir'] = '../../MOBIKE_CUP_2017'
 6 | 		self['train_csv'] = self['data_dir'] + '/train.csv'
 7 | 		self['test_csv'] = self['data_dir'] + '/test.csv'
 8 | 		self['cache_dir'] = '../cache'
 9 | 		self['model_dir'] = '../snapshot'
10 | 		self['result_dir'] = '../result'
11 | 
12 | 		# ------------ 训练参数 --------
13 | 
14 | 		self['startday'] = 23
15 | 		self['endday'] = 25
16 | 		self['lgb_leaves'] = 96
17 | 		self['lgb_lr'] = 0.05
18 | 
19 | 		# ------------ 测试参数 --------
20 | 
21 | 		self['test_startday'] = 25
22 | 		self['test_endday'] = 26
23 | 		self['model_name'] = None
24 |         
25 | 		# -------- 是否有用户 --------
26 |         
27 | 		self['user'] = True
28 | 
29 | 	def update(self, **kwargs):
30 | 		for key in kwargs:
31 | 		    self[key] = kwargs[key]
32 | 		self['time_prefix'] = '2017-05-'
33 | 		self['time_suffix'] = ' 00:00:00'
34 | 		self['starttime'] = '2017-05-' + str(self['startday']) + ' 00:00:00'
35 | 		self['endtime'] = '2017-05-' + str(self['endday']) + ' 00:00:00'
36 | 		self['test_starttime'] = '2017-05-' + str(self['test_startday']) + ' 00:00:00'
37 | 		self['test_endtime'] = '2017-05-' + str(self['test_endday']) + ' 00:00:00'
38 | 
39 | 	def printf(self):
40 | 		print('Current Config:')
41 | 		for key in self:
42 | 			print('{}: {}'.format(key, self[key]))
43 | 


--------------------------------------------------------------------------------
/dataset/__init__.py:
--------------------------------------------------------------------------------
1 | from .data import get_train_data, get_test_data
2 | from .sample import get_sample
3 | 


--------------------------------------------------------------------------------
/dataset/__pycache__/__init__.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Magic-Bubble/Mobike/7492d9ac7e05a22a61c2435a24d14a15387ccaf2/dataset/__pycache__/__init__.cpython-35.pyc


--------------------------------------------------------------------------------
/dataset/__pycache__/data.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Magic-Bubble/Mobike/7492d9ac7e05a22a61c2435a24d14a15387ccaf2/dataset/__pycache__/data.cpython-35.pyc


--------------------------------------------------------------------------------
/dataset/__pycache__/sample.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Magic-Bubble/Mobike/7492d9ac7e05a22a61c2435a24d14a15387ccaf2/dataset/__pycache__/sample.cpython-35.pyc


--------------------------------------------------------------------------------
/dataset/data.py:
--------------------------------------------------------------------------------
 1 | # -*- coding:utf-8 -*-
 2 | import pandas as pd
 3 | import numpy as np
 4 | import os
 5 | os.path.join('..')
 6 | from feature.other import get_distance, get_hour, get_latlon
 7 | 
 8 | # 获取训练数据
 9 | def get_train_data(opt):
10 | 	train = pd.read_csv(opt['train_csv'])
11 | 	train1 = train[(train['starttime'] < opt['starttime']) | (train['starttime'] >= opt['endtime'])]
12 | 	train2 = train[(train['starttime'] >= opt['starttime']) & (train['starttime'] < opt['endtime'])]
13 | 	# train2_old = train2[train2.userid.isin(train1.userid)]
14 | 	# train2_new = train2[~train2.userid.isin(train1.userid)]
15 | 	# train2_old_sample = train2_old.sample(frac=0.8)
16 | 	# train2 = pd.concat([train2_old_sample, train2_new])
17 | 	# train2_newuser = train2[~train2.userid.isin(train1['userid'])]
18 | 	# train2 = pd.concat([train2, train2_newuser, train2_newuser])
19 | 	# train1 = train[(train['starttime'] < '2017-05-18 00:00:00') | ((train['starttime'] >= '2017-05-20 00:00:00') & (train['starttime'] < '2017-05-22 00:00:00'))]
20 | 	# train2 = train[((train['starttime'] >= '2017-05-18 00:00:00') & (train['starttime'] < '2017-05-20 00:00:00')) | (train['starttime'] >= '2017-05-22 00:00:00')]
21 | 	del train2['geohashed_end_loc']
22 | 	train1 = add_info(train1) # 添加小时信息、距离信息和经纬度信息
23 | 	#test = pd.read_csv(opt['test_csv']) # add
24 | 	#train2 = get_hour(train2) # add
25 | 	#train2 = get_latlon(train2, end=False) # add
26 | 	#test = get_hour(test) # add
27 | 	#test = get_latlon(test, end=False) # add
28 | 	#train_all = pd.concat([train1, train2, test]) # add
29 | 	print('训练数据加载完成：', train1.shape, train2.shape)#, train_all.shape)
30 | 	return train1, train2#, train_all
31 | 
32 | # 获取测试数据
33 | def get_test_data(opt):
34 | 	train = pd.read_csv(opt['train_csv'])
35 | 	test = pd.read_csv(opt['test_csv'])
36 | 	# test_all = test.copy()
37 | 	if opt['test_endtime'] < opt['test_starttime']: test = test[(test['starttime'] >= opt['test_starttime'])]
38 | 	else: test = test[(test['starttime'] >= opt['test_starttime']) & (test['starttime'] < opt['test_endtime'])]
39 | 	train = add_info(train) # 添加小时信息、距离信息和经纬度信息
40 | 	# test_all = get_hour(test_all)
41 | 	# test_all = get_latlon(test_all, end=False)
42 | 	# train_all = pd.concat([train, test_all])
43 | 	print('测试数据加载完成：', train.shape, test.shape)#, train_all.shape)
44 | 	return train, test#, train_all
45 | 
46 | def add_info(res):
47 |     res = get_distance(res)
48 |     res = get_hour(res)
49 |     res = get_latlon(res)
50 |     return res


--------------------------------------------------------------------------------
/dataset/sample.py:
--------------------------------------------------------------------------------
  1 | # -*- coding:utf-8 -*-
  2 | import pandas as pd
  3 | 
  4 | # 将用户的历史目的地加入为样本
  5 | def get_user_end_loc(train, test):
  6 |     user_eloc = train[['userid', 'geohashed_end_loc']]
  7 |     result = pd.merge(test[['orderid', 'userid']], user_eloc, on='userid', how='left')
  8 |     result = result[['orderid', 'geohashed_end_loc']].drop_duplicates()
  9 |     result['user_end_loc_sample'] = 1
 10 |     return result
 11 | 
 12 | # 将用户的历史出发地加入为样本
 13 | def get_user_start_loc(train, test):
 14 |     # user_sloc_train = train[['userid', 'geohashed_start_loc']]
 15 |     # user_sloc_test = test[['userid', 'geohashed_start_loc']]
 16 |     user_sloc_train = pd.read_csv('../../MOBIKE_CUP_2017/train.csv')[['userid', 'geohashed_start_loc']].drop_duplicates()
 17 |     user_sloc_test = pd.read_csv('../../MOBIKE_CUP_2017/test.csv')[['userid', 'geohashed_start_loc']].drop_duplicates()
 18 |     user_sloc = pd.concat([user_sloc_train, user_sloc_test])
 19 |     # user_sloc = train[['userid', 'geohashed_start_loc']].drop_duplicates()
 20 |     result = pd.merge(test[['orderid', 'userid']], user_sloc, on='userid', how='left')
 21 |     result.rename(columns={'geohashed_start_loc':'geohashed_end_loc'}, inplace=True)
 22 |     result = result[['orderid', 'geohashed_end_loc']].drop_duplicates()
 23 |     result['user_start_loc_sample'] = 1
 24 |     return result
 25 | 
 26 | # 将起始地点出发到的Top10地方加入为样本
 27 | def get_loc_to_loc(train, test):
 28 |     # sloc_eloc_count = train.groupby(['geohashed_start_loc', 'geohashed_end_loc'], as_index=False)['geohashed_end_loc'].agg({'sloc_eloc_count':'count'})
 29 |     # sloc_eloc_count.sort_values('sloc_eloc_count', inplace=True)
 30 |     # if train['userid'].values[0] != -1: sloc_eloc_count = sloc_eloc_count.groupby('geohashed_start_loc').tail(15)
 31 |     sloc_eloc = train[['geohashed_start_loc', 'geohashed_end_loc']].drop_duplicates()
 32 |     result = pd.merge(test[['orderid', 'geohashed_start_loc']], sloc_eloc, on='geohashed_start_loc', how='left')
 33 |     # result = pd.merge(test[['orderid', 'geohashed_start_loc']], sloc_eloc_count, on='geohashed_start_loc', how='left')
 34 |     result = result[['orderid', 'geohashed_end_loc']].drop_duplicates()
 35 |     result['loc_to_loc_sample'] = 1
 36 |     return result
 37 | 
 38 | # 将热度大于2的地址对加入为样本
 39 | # def get_loc_to_loc2(train, test):
 40 | #     sloc_eloc_count = train.groupby(['geohashed_start_loc', 'geohashed_end_loc'], as_index=False)['geohashed_end_loc'].agg({'sloc_eloc_count':'count'})
 41 | #     sloc_eloc_count = sloc_eloc_count[sloc_eloc_count['sloc_eloc_count']>=2]
 42 | #     result = pd.merge(test[['orderid', 'geohashed_start_loc']], sloc_eloc_count, on='geohashed_start_loc', how='left')
 43 | #     result = result[['orderid', 'geohashed_end_loc']].drop_duplicates()
 44 | #     result['loc_to_loc2_sample'] = 1
 45 | #     return result
 46 | 
 47 | # 将以起始地点作为结束地点出发的Top10地方加入为样本
 48 | # def get_loc_to_loc2(train, test):
 49 | #     sloc_eloc_count = train.groupby(['geohashed_start_loc', 'geohashed_end_loc'], as_index=False)['geohashed_end_loc'].agg({'sloc_eloc_count':'count'})
 50 | #     sloc_eloc_count.sort_values('sloc_eloc_count', inplace=True)
 51 | #     sloc_eloc_count.rename(columns={'geohashed_start_loc': 'geohashed_end_loc', 'geohashed_end_loc': 'geohashed_start_loc'}, inplace=True)
 52 | #     if train['userid'].values[0] != -1: sloc_eloc_count = sloc_eloc_count.groupby('geohashed_start_loc').tail(10)
 53 | #     result = pd.merge(test[['orderid', 'geohashed_start_loc']], sloc_eloc_count, on='geohashed_start_loc', how='left')
 54 | #     result = result[['orderid', 'geohashed_end_loc']]
 55 | #     return result
 56 | 
 57 | # 将热度最高的Top10目的地点加入为样本
 58 | # def get_hot_eloc(train, test):
 59 | #     hot_eloc = train.groupby('geohashed_end_loc', as_index=False)['orderid'].agg({'cnt': 'count'})
 60 | #     hot_eloc.sort_values(by='cnt', inplace=True)
 61 | #     hot_eloc = hot_eloc['geohashed_end_loc'].tail(10)
 62 | #     hot_eloc = pd.concat([hot_eloc] * test.shape[0]).reset_index()
 63 | #     result = pd.DataFrame({'orderid': pd.concat([test['orderid']] * 10)}).sort_values(by='orderid').reset_index()
 64 | #     result.loc[:, 'geohashed_end_loc'] = hot_eloc
 65 | #     return result
 66 | 
 67 | # 将自行车下一个出发地点(在全部测试集中寻找)加入为样本
 68 | def get_bike_next_sloc(train, test):
 69 |     train_set = pd.read_csv('../../MOBIKE_CUP_2017/train.csv')
 70 |     test_set = pd.read_csv('../../MOBIKE_CUP_2017/test.csv')
 71 |     all_set = pd.concat([train_set, test_set])
 72 |     bike_sloc = all_set[['orderid', 'bikeid', 'geohashed_start_loc', 'starttime']]
 73 |     bike_sloc.sort_values(by=['bikeid', 'starttime'], inplace=True, ascending=True)
 74 |     bike_sloc['next_bikeid'] = bike_sloc['bikeid'].shift(-1)
 75 |     bike_sloc['geohashed_end_loc'] = bike_sloc['geohashed_start_loc'].shift(-1)
 76 |     result = bike_sloc[(bike_sloc['bikeid'] == bike_sloc['next_bikeid']) & (bike_sloc['starttime'] >= test['starttime'].min()) & (bike_sloc['starttime'] <= test['starttime'].max())]
 77 |     result = result[['orderid', 'geohashed_end_loc']].drop_duplicates()
 78 |     result['bike_next_sloc_sample'] = 1
 79 |     # bike_sloc = test[['orderid', 'bikeid', 'geohashed_start_loc', 'starttime']].drop_duplicates()
 80 |     # bike_sloc.sort_values(by=['bikeid', 'starttime'], inplace=True, ascending=True)
 81 |     # bike_sloc['next_bikeid'] = bike_sloc['bikeid'].shift(-1)
 82 |     # bike_sloc['geohashed_end_loc'] = bike_sloc['geohashed_start_loc'].shift(-1)
 83 |     # result = bike_sloc[bike_sloc['bikeid'] == bike_sloc['next_bikeid']]
 84 |     # result = result[['orderid', 'geohashed_end_loc']]
 85 |     return result
 86 | 
 87 | # 寻找用户的下一个出发地点(在全部测试集中寻找)加入为样本
 88 | def get_user_next_sloc(train, test):
 89 |     train_set = pd.read_csv('../../MOBIKE_CUP_2017/train.csv')
 90 |     test_set = pd.read_csv('../../MOBIKE_CUP_2017/test.csv')
 91 |     all_set = pd.concat([train_set, test_set])
 92 |     user_sloc = all_set[['orderid', 'userid', 'geohashed_start_loc', 'starttime']]
 93 |     user_sloc.sort_values(by=['userid', 'starttime'], inplace=True, ascending=True)
 94 |     user_sloc['next_userid'] = user_sloc['userid'].shift(-1)
 95 |     user_sloc['geohashed_end_loc'] = user_sloc['geohashed_start_loc'].shift(-1)
 96 |     result = user_sloc[(user_sloc['userid'] == user_sloc['next_userid']) & (user_sloc['starttime'] >= test['starttime'].min()) & (user_sloc['starttime'] <= test['starttime'].max())]
 97 |     result = result[['orderid', 'geohashed_end_loc']].drop_duplicates()
 98 |     return result
 99 | 
100 | # 构造样本
101 | def get_sample(train, test, load=False):
102 |     user_start_loc = get_user_start_loc(train, test)
103 |     user_end_loc = get_user_end_loc(train, test)
104 |     loc_to_loc = get_loc_to_loc(train, test)
105 |     bike_next_sloc = get_bike_next_sloc(train, test)
106 |     # 汇总
107 |     result = pd.concat([user_end_loc[['orderid', 'geohashed_end_loc']],
108 |                         user_start_loc[['orderid', 'geohashed_end_loc']],
109 |                         loc_to_loc[['orderid', 'geohashed_end_loc']],
110 |                         bike_next_sloc[['orderid', 'geohashed_end_loc']]
111 |                        ]).drop_duplicates()
112 | 
113 |     restmp = pd.concat([user_end_loc,
114 |                         user_start_loc,
115 |                         loc_to_loc,
116 |                         bike_next_sloc
117 |                        ])
118 |     restmp.fillna(0, inplace=True)
119 |     restmp = restmp.groupby(['orderid', 'geohashed_end_loc'], as_index=False).sum()
120 | 
121 |     result = pd.merge(result, restmp, on=['orderid', 'geohashed_end_loc'], how='left')
122 |     # 添加负样本
123 |     result = pd.merge(result, test, on='orderid', how='left')
124 |     # 删除出发点和目的点相同的样本 以及 异常值
125 |     result = result[result['geohashed_end_loc'] != result['geohashed_start_loc']]
126 |     result = result[(~result['geohashed_start_loc'].isnull()) & (~result['geohashed_end_loc'].isnull())]
127 |     # print("原始样本数目：", result.shape)
128 |     # sample_leak = bike_next_sloc[['orderid', 'geohashed_end_loc']].drop_duplicates()
129 |     # print("Leak样本数目：", sample_leak.shape)
130 |     # sample_leak['leak'] = 1
131 |     # result = pd.merge(result, sample_leak, on=['orderid', 'geohashed_end_loc'], how='left')
132 |     # result = result[result.leak.isnull()]
133 |     # print("过滤Leak之后的样本数目：", result.shape)
134 |     # del result['leak']
135 |     if load:
136 |         sample_27 = pd.read_pickle('/home/xuwenchao/dyj-storage/wc-sample/sample_filter_{}.pickle'.format(test.shape[0]))[['orderid', 'geohashed_end_loc', 'userid', 'bikeid', 'biketype', 'starttime', 'geohashed_start_loc']]
137 |         # sample_27 = pd.concat([pd.read_pickle('/home/xuwenchao/dyj-storage/wc-sample/sample_filter_23_1_27_.pickle'), pd.read_pickle('/home/xuwenchao/dyj-storage/wc-sample/sample_filter_24_1_27_.pickle')])[['orderid', 'geohashed_end_loc', 'userid', 'bikeid', 'biketype', 'starttime', 'geohashed_start_loc']]
138 |         result = pd.merge(sample_27, result[['orderid', 'geohashed_end_loc', 'user_end_loc_sample', 'user_start_loc_sample', 'loc_to_loc_sample', 'bike_next_sloc_sample']], on=['orderid', 'geohashed_end_loc'], how='left')
139 |     print('构造样本完成：', result.shape, result[result.geohashed_start_loc.isnull()].shape)
140 |     return result


--------------------------------------------------------------------------------
/feature/__init__.py:
--------------------------------------------------------------------------------
1 | from .main import get_feat
2 | from .other import get_distance


--------------------------------------------------------------------------------
/feature/__pycache__/__init__.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Magic-Bubble/Mobike/7492d9ac7e05a22a61c2435a24d14a15387ccaf2/feature/__pycache__/__init__.cpython-35.pyc


--------------------------------------------------------------------------------
/feature/__pycache__/filter.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Magic-Bubble/Mobike/7492d9ac7e05a22a61c2435a24d14a15387ccaf2/feature/__pycache__/filter.cpython-35.pyc


--------------------------------------------------------------------------------
/feature/__pycache__/latlon.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Magic-Bubble/Mobike/7492d9ac7e05a22a61c2435a24d14a15387ccaf2/feature/__pycache__/latlon.cpython-35.pyc


--------------------------------------------------------------------------------
/feature/__pycache__/leak.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Magic-Bubble/Mobike/7492d9ac7e05a22a61c2435a24d14a15387ccaf2/feature/__pycache__/leak.cpython-35.pyc


--------------------------------------------------------------------------------
/feature/__pycache__/location.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Magic-Bubble/Mobike/7492d9ac7e05a22a61c2435a24d14a15387ccaf2/feature/__pycache__/location.cpython-35.pyc


--------------------------------------------------------------------------------
/feature/__pycache__/main.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Magic-Bubble/Mobike/7492d9ac7e05a22a61c2435a24d14a15387ccaf2/feature/__pycache__/main.cpython-35.pyc


--------------------------------------------------------------------------------
/feature/__pycache__/other.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Magic-Bubble/Mobike/7492d9ac7e05a22a61c2435a24d14a15387ccaf2/feature/__pycache__/other.cpython-35.pyc


--------------------------------------------------------------------------------
/feature/__pycache__/rule.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Magic-Bubble/Mobike/7492d9ac7e05a22a61c2435a24d14a15387ccaf2/feature/__pycache__/rule.cpython-35.pyc


--------------------------------------------------------------------------------
/feature/__pycache__/user.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Magic-Bubble/Mobike/7492d9ac7e05a22a61c2435a24d14a15387ccaf2/feature/__pycache__/user.cpython-35.pyc


--------------------------------------------------------------------------------
/feature/filter.py:
--------------------------------------------------------------------------------
  1 | # -*- coding:utf-8 -*-
  2 | import pandas as pd
  3 | import Geohash as geohash
  4 | import numpy as np
  5 | import os
  6 | import tqdm
  7 | os.path.join('..')
  8 | from utils import rank
  9 | 
 10 | '''
 11 | 	获取协同过滤特征
 12 | '''
 13 | 
 14 | # ----------------- 计数 -------------------
 15 | 
 16 | # 获取地址对的协同过滤信息
 17 | def get_loc_filter(train, result):
 18 |     sloc_elocs, eloc_slocs = {}, {}
 19 |     for i in tqdm.tqdm(train[['geohashed_start_loc', 'geohashed_end_loc']].values):
 20 |         if i[0] not in sloc_elocs:
 21 |             sloc_elocs[i[0]] = {}
 22 |         if i[1] not in sloc_elocs[i[0]]:
 23 |             sloc_elocs[i[0]][i[1]] = 0
 24 |         sloc_elocs[i[0]][i[1]] += 1
 25 |         if i[1] not in eloc_slocs:
 26 |             eloc_slocs[i[1]] = {}
 27 |         if i[0] not in eloc_slocs[i[1]]:
 28 |             eloc_slocs[i[1]][i[0]] = 0;
 29 |         eloc_slocs[i[1]][i[0]] += 1
 30 |     sloc_list, eloc_list, sloc_eloc_common_eloc_count, sloc_eloc_common_sloc_count, sloc_eloc_common_conn1_count, sloc_eloc_common_conn2_count = [], [], [], [], [], []
 31 |     for i in tqdm.tqdm(result[['geohashed_start_loc', 'geohashed_end_loc']].drop_duplicates().values):
 32 |         sloc_list.append(i[0])
 33 |         eloc_list.append(i[1])
 34 |         # 获取地址对在历史记录中共有的目的地数目
 35 |         common_eloc_count = 0
 36 |         if (i[0] in sloc_elocs) and (i[1] in sloc_elocs):
 37 |             sloc_eloc_common_eloc_set = sloc_elocs[i[0]].keys() & sloc_elocs[i[1]].keys()
 38 |             for common_eloc in sloc_eloc_common_eloc_set:
 39 |                 common_eloc_count = common_eloc_count + sloc_elocs[i[0]][common_eloc] + sloc_elocs[i[1]][common_eloc]
 40 |         sloc_eloc_common_eloc_count.append(common_eloc_count)
 41 |         # 获取地址对在历史记录中共有的出发地数目
 42 |         common_sloc_count = 0
 43 |         if (i[0] in eloc_slocs) and (i[1] in eloc_slocs):
 44 |             sloc_eloc_common_sloc_set = eloc_slocs[i[0]].keys() & eloc_slocs[i[1]].keys()
 45 |             for common_sloc in sloc_eloc_common_sloc_set:
 46 |                 common_sloc_count = common_sloc_count + eloc_slocs[i[0]][common_sloc] + eloc_slocs[i[1]][common_sloc]
 47 |         sloc_eloc_common_sloc_count.append(common_sloc_count)
 48 |         # 获取地址对在历史记录中共有的连接点数目(出发点->xx->目的地)
 49 |         common_conn1_count = 0
 50 |         if (i[0] in sloc_elocs) and (i[1] in eloc_slocs):
 51 |             sloc_eloc_common_conn1_set = sloc_elocs[i[0]].keys() & eloc_slocs[i[1]].keys()
 52 |             for common_conn1 in sloc_eloc_common_conn1_set:
 53 |                 common_conn1_count = common_conn1_count + sloc_elocs[i[0]][common_conn1] + eloc_slocs[i[1]][common_conn1]
 54 |         sloc_eloc_common_conn1_count.append(common_conn1_count)
 55 |         # 获取地址对在历史记录中共有的连接点数目(出发点<-xx<-目的地)
 56 |         common_conn2_count = 0
 57 |         if (i[0] in eloc_slocs) and (i[1] in sloc_elocs):
 58 |             sloc_eloc_common_conn2_set = eloc_slocs[i[0]].keys() & sloc_elocs[i[1]].keys()
 59 |             for common_conn2 in sloc_eloc_common_conn2_set:
 60 |                 common_conn2_count = common_conn2_count + eloc_slocs[i[0]][common_conn2] + sloc_elocs[i[1]][common_conn2]
 61 |         sloc_eloc_common_conn2_count.append(common_conn2_count)
 62 |     loc_filter = pd.DataFrame({"geohashed_start_loc": sloc_list, "geohashed_end_loc": eloc_list, "sloc_eloc_common_eloc_count": sloc_eloc_common_eloc_count, "sloc_eloc_common_sloc_count": sloc_eloc_common_sloc_count, "sloc_eloc_common_conn1_count": sloc_eloc_common_conn1_count, "sloc_eloc_common_conn2_count": sloc_eloc_common_conn2_count})
 63 |     result = pd.merge(result, loc_filter, on=['geohashed_start_loc', 'geohashed_end_loc'], how='left')
 64 |     result['sloc_eloc_common_eloc_rate'] = result['sloc_eloc_common_eloc_count']/(result['sloc_count']+result['eloc_as_sloc_count'])
 65 |     result['sloc_eloc_common_sloc_rate'] = result['sloc_eloc_common_sloc_count']/(result['sloc_as_eloc_count']+result['eloc_count'])
 66 |     result['sloc_eloc_common_conn1_rate'] = result['sloc_eloc_common_conn1_count']/(result['sloc_count']+result['eloc_count'])
 67 |     result['sloc_eloc_common_conn2_rate'] = result['sloc_eloc_common_conn2_count']/(result['sloc_as_eloc_count']+result['eloc_as_sloc_count'])
 68 |     return result
 69 | 
 70 | # 获取用户地址对的协同过滤信息
 71 | def get_user_loc_filter(train, result):
 72 |     user_sloc_elocs, user_eloc_slocs = {}, {}
 73 |     for i in tqdm.tqdm(train[['userid', 'geohashed_start_loc', 'geohashed_end_loc']].values):
 74 |         if i[0] not in user_sloc_elocs:
 75 |             user_sloc_elocs[i[0]] = {}
 76 |         if i[1] not in user_sloc_elocs[i[0]]:
 77 |             user_sloc_elocs[i[0]][i[1]] = {}
 78 |         if i[2] not in user_sloc_elocs[i[0]][i[1]]:
 79 |             user_sloc_elocs[i[0]][i[1]][i[2]] = 0
 80 |         user_sloc_elocs[i[0]][i[1]][i[2]] += 1
 81 |         if i[0] not in user_eloc_slocs:
 82 |             user_eloc_slocs[i[0]] = {}
 83 |         if i[2] not in user_eloc_slocs[i[0]]:
 84 |             user_eloc_slocs[i[0]][i[2]] = {};
 85 |         if i[1] not in user_eloc_slocs[i[0]][i[2]]:
 86 |             user_eloc_slocs[i[0]][i[2]][i[1]] = 0
 87 |         user_eloc_slocs[i[0]][i[2]][i[1]] += 1
 88 |     user_list, user_sloc_list, user_eloc_list, user_sloc_eloc_common_eloc_count, user_sloc_eloc_common_sloc_count, user_sloc_eloc_common_conn1_count, user_sloc_eloc_common_conn2_count = [], [], [], [], [], [], []
 89 |     for i in tqdm.tqdm(result[['userid', 'geohashed_start_loc', 'geohashed_end_loc']].drop_duplicates().values):
 90 |         user_list.append(i[0])
 91 |         user_sloc_list.append(i[1])
 92 |         user_eloc_list.append(i[2])
 93 |         # 获取地址对在用户历史记录中共有的目的地数目
 94 |         user_common_eloc_count = 0
 95 |         if (i[0] in user_sloc_elocs) and (i[1] in user_sloc_elocs[i[0]]) and (i[2] in user_sloc_elocs[i[0]]):
 96 |             user_sloc_eloc_common_eloc_set = user_sloc_elocs[i[0]][i[1]].keys() & user_sloc_elocs[i[0]][i[2]].keys()
 97 |             for user_common_eloc in user_sloc_eloc_common_eloc_set:
 98 |                 user_common_eloc_count = user_common_eloc_count + user_sloc_elocs[i[0]][i[1]][user_common_eloc] + user_sloc_elocs[i[0]][i[2]][user_common_eloc]
 99 |         user_sloc_eloc_common_eloc_count.append(user_common_eloc_count)
100 |         # 获取地址对在用户历史记录中共有的出发地数目
101 |         user_common_sloc_count = 0
102 |         if (i[0] in user_eloc_slocs) and (i[1] in user_eloc_slocs[i[0]]) and (i[2] in user_eloc_slocs[i[0]]):
103 |             user_sloc_eloc_common_sloc_set = user_eloc_slocs[i[0]][i[1]].keys() & user_eloc_slocs[i[0]][i[2]].keys()
104 |             for user_common_sloc in user_sloc_eloc_common_sloc_set:
105 |                 user_common_sloc_count = user_common_sloc_count + user_eloc_slocs[i[0]][i[1]][user_common_sloc] + user_eloc_slocs[i[0]][i[2]][user_common_sloc]
106 |         user_sloc_eloc_common_sloc_count.append(user_common_sloc_count)
107 |         # 获取地址对在用户历史记录中共有的连接点数目(出发点->xx->目的地)
108 |         user_common_conn1_count = 0
109 |         if (i[0] in user_sloc_elocs) and (i[1] in user_sloc_elocs[i[0]]) and (i[0] in user_eloc_slocs) and (i[2] in user_eloc_slocs[i[0]]):
110 |             user_sloc_eloc_common_conn1_set = user_sloc_elocs[i[0]][i[1]].keys() & user_eloc_slocs[i[0]][i[2]].keys()
111 |             for user_common_conn1 in user_sloc_eloc_common_conn1_set:
112 |                 user_common_conn1_count = user_common_conn1_count + user_sloc_elocs[i[0]][i[1]][user_common_conn1] + user_eloc_slocs[i[0]][i[2]][user_common_conn1]
113 |         user_sloc_eloc_common_conn1_count.append(user_common_conn1_count)
114 |         # 获取地址对在用户历史记录中共有的连接点数目(出发点<-xx<-目的地)
115 |         user_common_conn2_count = 0
116 |         if (i[0] in user_eloc_slocs) and (i[1] in user_eloc_slocs[i[0]]) and (i[0] in user_sloc_elocs) and (i[2] in user_sloc_elocs[i[0]]):
117 |             user_sloc_eloc_common_conn2_set = user_eloc_slocs[i[0]][i[1]].keys() & user_sloc_elocs[i[0]][i[2]].keys()
118 |             for user_common_conn2 in user_sloc_eloc_common_conn2_set:
119 |                 user_common_conn2_count = user_common_conn2_count + user_eloc_slocs[i[0]][i[1]][user_common_conn2] + user_sloc_elocs[i[0]][i[2]][user_common_conn2]
120 |         user_sloc_eloc_common_conn2_count.append(user_common_conn2_count)
121 |     user_loc_filter = pd.DataFrame({"userid": user_list, "geohashed_start_loc": user_sloc_list, "geohashed_end_loc": user_eloc_list, "user_sloc_eloc_common_eloc_count": user_sloc_eloc_common_eloc_count, "user_sloc_eloc_common_sloc_count": user_sloc_eloc_common_sloc_count, "user_sloc_eloc_common_conn1_count": user_sloc_eloc_common_conn1_count, "user_sloc_eloc_common_conn2_count": user_sloc_eloc_common_conn2_count})
122 |     result = pd.merge(result, user_loc_filter, on=['userid', 'geohashed_start_loc', 'geohashed_end_loc'], how='left')
123 |     result['user_sloc_eloc_common_eloc_rate'] = result['user_sloc_eloc_common_eloc_count']/(result['user_sloc_count']+result['user_eloc_as_sloc_count'])
124 |     result['user_sloc_eloc_common_sloc_rate'] = result['user_sloc_eloc_common_sloc_count']/(result['user_sloc_as_eloc_count']+result['user_eloc_count'])
125 |     result['user_sloc_eloc_common_conn1_rate'] = result['user_sloc_eloc_common_conn1_count']/(result['user_sloc_count']+result['user_eloc_count'])
126 |     result['user_sloc_eloc_common_conn2_rate'] = result['user_sloc_eloc_common_conn2_count']/(result['user_sloc_as_eloc_count']+result['user_eloc_as_sloc_count'])
127 |     return result


--------------------------------------------------------------------------------
/feature/latlon.py:
--------------------------------------------------------------------------------
  1 | # -*- coding:utf-8 -*-
  2 | import pandas as pd
  3 | import Geohash as geohash
  4 | import numpy as np
  5 | import os
  6 | os.path.join('..')
  7 | from utils import rank
  8 | 
  9 | '''
 10 |     获取经纬度特征
 11 | '''
 12 | 
 13 | # ----------------- 经纬度 -------------------
 14 | 
 15 | # 获取目的地的经纬度
 16 | def get_eloc_latlon(result):
 17 |     eloc_latlon = result['geohashed_end_loc'].apply(lambda x: geohash.decode_exactly(x)[:2])
 18 |     result['eloc_lat'] = eloc_latlon.apply(lambda x: float(x[0]))
 19 |     result['eloc_lon'] = eloc_latlon.apply(lambda x: float(x[1]))
 20 |     return result
 21 | 
 22 | # 获取出发地的经纬度
 23 | def get_sloc_latlon(result):
 24 |     sloc_latlon = result['geohashed_start_loc'].apply(lambda x: geohash.decode_exactly(x)[:2])
 25 |     result['sloc_lat'] = sloc_latlon.apply(lambda x: float(x[0]))
 26 |     result['sloc_lon'] = sloc_latlon.apply(lambda x: float(x[1]))
 27 |     return result
 28 | 
 29 | # ----------------- 方向 -------------------
 30 | 
 31 | # 获取出发地与目的地的经纬度差
 32 | def get_eloc_sloc_latlon_sub(result):
 33 |     # sloc_latlon = result['geohashed_start_loc'].apply(lambda x: geohash.decode_exactly(x)[:2])
 34 |     # sloc_lat = sloc_latlon.apply(lambda x: float(x[0]))
 35 |     # sloc_lon = sloc_latlon.apply(lambda x: float(x[1]))
 36 |     # eloc_latlon = result['geohashed_end_loc'].apply(lambda x: geohash.decode_exactly(x)[:2])
 37 |     # eloc_lat = eloc_latlon.apply(lambda x: float(x[0]))
 38 |     # eloc_lon = eloc_latlon.apply(lambda x: float(x[1]))
 39 |     result['eloc_sloc_lat_sub'] = result['eloc_lat'] - result['sloc_lat']
 40 |     result['eloc_sloc_lon_sub'] = result['eloc_lon'] - result['sloc_lon']
 41 |     return result
 42 | 
 43 | # 获取出发地与目的地的斜率
 44 | def get_eloc_sloc_slope(result):
 45 |     result['eloc_sloc_latlon_slope'] = result['eloc_sloc_lat_sub'] / result['eloc_sloc_lon_sub']
 46 |     return result
 47 | 
 48 | # 获取经纬度差与距离的商
 49 | def get_eloc_sloc_latlon_sub_divide_distance(result):
 50 |     result['eloc_sloc_lat_sub_divide_distance'] = result['eloc_sloc_lat_sub'] / result['distance']
 51 |     result['eloc_sloc_lon_sub_divide_distance'] = result['eloc_sloc_lon_sub'] / result['distance']
 52 |     result['eloc_sloc_lat_sub_divide_manhattan'] = result['eloc_sloc_lat_sub'] / result['manhattan']
 53 |     result['eloc_sloc_lon_sub_divide_manhattan'] = result['eloc_sloc_lon_sub'] / result['manhattan']
 54 |     return result
 55 | 
 56 | # 获取方向角
 57 | def get_bearing_array(result):
 58 |     result['degree'] = np.arctan2(result['eloc_sloc_lat_sub'], result['eloc_sloc_lon_sub'])
 59 |     return result
 60 | 
 61 | # ----------------- 统计 -------------------
 62 | 
 63 | # 获取用户出发的距离统计
 64 | def get_user_latlon_sub_stat(train, result):
 65 |     train = train[~train.geohashed_end_loc.isnull()]
 66 |     user_lat_sub_stat = train.groupby(['userid', 'geohashed_start_loc'], as_index=False)['eloc_sloc_lat_sub'].agg({'user_lat_sub_max': 'max', 'user_lat_sub_min': 'min', 'user_lat_sub_mean': 'mean'})
 67 |     result = pd.merge(result, user_lat_sub_stat, on=['userid', 'geohashed_start_loc'], how='left')
 68 |     user_lon_sub_stat = train.groupby(['userid', 'geohashed_start_loc'], as_index=False)['eloc_sloc_lon_sub'].agg({'user_lon_sub_max': 'max', 'user_lon_sub_min': 'min', 'user_lon_sub_mean': 'mean'})
 69 |     result = pd.merge(result, user_lon_sub_stat, on=['userid', 'geohashed_start_loc'], how='left')
 70 |     return result
 71 | 
 72 | # 获取用户从某个地点出发的距离统计
 73 | def get_user_sloc_latlon_sub_stat(train, result):
 74 |     train = train[~train.geohashed_end_loc.isnull()]
 75 |     user_sloc_lat_sub_stat = train.groupby(['userid', 'geohashed_start_loc'], as_index=False)['eloc_sloc_lat_sub'].agg({'user_sloc_lat_sub_max': 'max', 'user_sloc_lat_sub_min': 'min', 'user_sloc_lat_sub_mean': 'mean'}) # 8 7 kong
 76 |     # user_sloc_lat_sub_stat = train.groupby(['userid', 'geohashed_start_loc'], as_index=False)['eloc_sloc_lat_sub'].agg({'user_sloc_lat_sub_mean': 'mean'})
 77 |     result = pd.merge(result, user_sloc_lat_sub_stat, on=['userid', 'geohashed_start_loc'], how='left')
 78 |     user_sloc_lon_sub_stat = train.groupby(['userid', 'geohashed_start_loc'], as_index=False)['eloc_sloc_lon_sub'].agg({'user_sloc_lon_sub_max': 'max', 'user_sloc_lon_sub_min': 'min', 'user_sloc_lon_sub_mean': 'mean'})
 79 |     result = pd.merge(result, user_sloc_lon_sub_stat, on=['userid', 'geohashed_start_loc'], how='left')
 80 |     return result
 81 | 
 82 | # 获取用户到某个地点结束的距离统计
 83 | def get_user_eloc_latlon_sub_stat(train, result):
 84 |     train = train[~train.geohashed_end_loc.isnull()]
 85 |     user_eloc_lat_sub_stat = train.groupby(['userid', 'geohashed_end_loc'], as_index=False)['eloc_sloc_lat_sub'].agg({'user_eloc_lat_sub_max': 'max', 'user_eloc_lat_sub_min': 'min', 'user_eloc_lat_sub_mean': 'mean'})
 86 |     result = pd.merge(result, user_eloc_lat_sub_stat, on=['userid', 'geohashed_end_loc'], how='left')
 87 |     user_eloc_lon_sub_stat = train.groupby(['userid', 'geohashed_end_loc'], as_index=False)['eloc_sloc_lon_sub'].agg({'user_eloc_lon_sub_max': 'max', 'user_eloc_lon_sub_min': 'min', 'user_eloc_lon_sub_mean': 'mean'})
 88 |     result = pd.merge(result, user_eloc_lon_sub_stat, on=['userid', 'geohashed_end_loc'], how='left')
 89 |     return result
 90 | 
 91 | # 获取用户从某个地点出发的小时段距离统计
 92 | def get_user_sloc_hour_latlon_sub_stat(train, result):
 93 |     train = train[~train.geohashed_end_loc.isnull()]
 94 |     user_sloc_hour_lat_sub_stat = train.groupby(['userid', 'geohashed_start_loc', 'hour'], as_index=False)['eloc_sloc_lat_sub'].agg({'user_sloc_hour_lat_sub_max': 'max', 'user_sloc_hour_lat_sub_min': 'min', 'user_sloc_hour_lat_sub_mean': 'mean'})
 95 |     result = pd.merge(result, user_sloc_hour_lat_sub_stat, on=['userid', 'geohashed_start_loc', 'hour'], how='left')
 96 |     user_sloc_hour_lon_sub_stat = train.groupby(['userid', 'geohashed_start_loc', 'hour'], as_index=False)['eloc_sloc_lon_sub'].agg({'user_sloc_hour_lon_sub_max': 'max', 'user_sloc_hour_lon_sub_min': 'min', 'user_sloc_hour_lon_sub_mean': 'mean'})
 97 |     result = pd.merge(result, user_sloc_hour_lon_sub_stat, on=['userid', 'geohashed_start_loc', 'hour'], how='left')
 98 |     return result
 99 | 
100 | # 获取用户到某个地点结束的小时段距离统计
101 | def get_user_eloc_hour_latlon_sub_stat(train, result):
102 |     train = train[~train.geohashed_end_loc.isnull()]
103 |     user_eloc_hour_lat_sub_stat = train.groupby(['userid', 'geohashed_end_loc', 'hour'], as_index=False)['eloc_sloc_lat_sub'].agg({'user_eloc_hour_lat_sub_max': 'max', 'user_eloc_hour_lat_sub_min': 'min', 'user_eloc_hour_lat_sub_mean': 'mean'}) # 4 4 6
104 |     result = pd.merge(result, user_eloc_hour_lat_sub_stat, on=['userid', 'geohashed_end_loc', 'hour'], how='left')
105 |     user_eloc_hour_lon_sub_stat = train.groupby(['userid', 'geohashed_end_loc', 'hour'], as_index=False)['eloc_sloc_lon_sub'].agg({'user_eloc_hour_lon_sub_max': 'max', 'user_eloc_hour_lon_sub_min': 'min', 'user_eloc_hour_lon_sub_mean': 'mean'}) # 2 7 6
106 |     result = pd.merge(result, user_eloc_hour_lon_sub_stat, on=['userid', 'geohashed_end_loc', 'hour'], how='left')
107 |     return result
108 | 
109 | # 获取从某个地点出发的距离统计
110 | def get_sloc_latlon_sub_stat(train, result):
111 |     train = train[~train.geohashed_end_loc.isnull()]
112 |     sloc_lat_sub_stat = train.groupby(['geohashed_start_loc'], as_index=False)['eloc_sloc_lat_sub'].agg({'sloc_lat_sub_max': 'max', 'sloc_lat_sub_min': 'min', 'sloc_lat_sub_mean': 'mean'})
113 |     result = pd.merge(result, sloc_lat_sub_stat, on=['geohashed_start_loc'], how='left')
114 |     sloc_lon_sub_stat = train.groupby(['geohashed_start_loc'], as_index=False)['eloc_sloc_lon_sub'].agg({'sloc_lon_sub_max': 'max', 'sloc_lon_sub_min': 'min', 'sloc_lon_sub_mean': 'mean'})
115 |     result = pd.merge(result, sloc_lon_sub_stat, on=['geohashed_start_loc'], how='left')
116 |     return result
117 | 
118 | # 获取到某个地点结束的距离统计
119 | def get_eloc_latlon_sub_stat(train, result):
120 |     train = train[~train.geohashed_end_loc.isnull()]
121 |     eloc_lat_sub_stat = train.groupby(['geohashed_end_loc'], as_index=False)['eloc_sloc_lat_sub'].agg({'eloc_lat_sub_max': 'max', 'eloc_lat_sub_min': 'min', 'eloc_lat_sub_mean': 'mean'})
122 |     result = pd.merge(result, eloc_lat_sub_stat, on=['geohashed_end_loc'], how='left')
123 |     eloc_lon_sub_stat = train.groupby(['geohashed_end_loc'], as_index=False)['eloc_sloc_lon_sub'].agg({'eloc_lon_sub_max': 'max', 'eloc_lon_sub_min': 'min', 'eloc_lon_sub_mean': 'mean'})
124 |     result = pd.merge(result, eloc_lon_sub_stat, on=['geohashed_end_loc'], how='left')
125 |     return result
126 | 
127 | # 获取从某个地点出发的小时段距离统计
128 | def get_sloc_hour_latlon_sub_stat(train, result):
129 |     train = train[~train.geohashed_end_loc.isnull()]
130 |     sloc_hour_lat_sub_stat = train.groupby(['geohashed_start_loc', 'hour'], as_index=False)['eloc_sloc_lat_sub'].agg({'sloc_hour_lat_sub_max': 'max', 'sloc_hour_lat_sub_min': 'min', 'sloc_hour_lat_sub_mean': 'mean'})
131 |     result = pd.merge(result, sloc_hour_lat_sub_stat, on=['geohashed_start_loc', 'hour'], how='left')
132 |     sloc_hour_lon_sub_stat = train.groupby(['geohashed_start_loc', 'hour'], as_index=False)['eloc_sloc_lon_sub'].agg({'sloc_hour_lon_sub_max': 'max', 'sloc_hour_lon_sub_min': 'min', 'sloc_hour_lon_sub_mean': 'mean'})
133 |     result = pd.merge(result, sloc_hour_lon_sub_stat, on=['geohashed_start_loc', 'hour'], how='left')
134 |     return result
135 | 
136 | # 获取到某个地点结束的小时段距离统计
137 | def get_eloc_hour_latlon_sub_stat(train, result):
138 |     train = train[~train.geohashed_end_loc.isnull()]
139 |     eloc_hour_lat_sub_stat = train.groupby(['geohashed_end_loc', 'hour'], as_index=False)['eloc_sloc_lat_sub'].agg({'eloc_hour_lat_sub_max': 'max', 'eloc_hour_lat_sub_min': 'min', 'eloc_hour_lat_sub_mean': 'mean'})
140 |     result = pd.merge(result, eloc_hour_lat_sub_stat, on=['geohashed_end_loc', 'hour'], how='left')
141 |     eloc_hour_lon_sub_stat = train.groupby(['geohashed_end_loc', 'hour'], as_index=False)['eloc_sloc_lon_sub'].agg({'eloc_hour_lon_sub_max': 'max', 'eloc_hour_lon_sub_min': 'min', 'eloc_hour_lon_sub_mean': 'mean'})
142 |     result = pd.merge(result, eloc_hour_lon_sub_stat, on=['geohashed_end_loc', 'hour'], how='left')
143 |     return result
144 | 
145 | # ----------------- 排序 -------------------
146 | 
147 | # 获取用户出行距离的排序
148 | def get_user_latlon_sub_rank(result):
149 |     result = rank(result, 'userid', 'eloc_sloc_lat_sub', rank_name='user_lat_sub_rank', ascending=False)
150 |     result = rank(result, 'userid', 'eloc_sloc_lon_sub', rank_name='user_lon_sub_rank', ascending=False)
151 |     return result
152 | 
153 | # 获取用户到某个目的地的距离排序
154 | def get_user_eloc_latlon_sub_rank(result):
155 |     result = rank(result, ['userid', 'geohashed_end_loc'], 'eloc_sloc_lat_sub', rank_name='user_eloc_lat_sub_rank', ascending=False)
156 |     result = rank(result, ['userid', 'geohashed_end_loc'], 'eloc_sloc_lon_sub', rank_name='user_eloc_lon_sub_rank', ascending=False)
157 |     return result
158 | 
159 | # 获取用户从某个地点出发的距离排序
160 | def get_user_sloc_latlon_sub_rank(result):
161 |     result = rank(result, ['userid', 'geohashed_start_loc'], 'eloc_sloc_lat_sub', rank_name='user_sloc_lat_sub_rank', ascending=False)
162 |     result = rank(result, ['userid', 'geohashed_start_loc'], 'eloc_sloc_lon_sub', rank_name='user_sloc_lon_sub_rank', ascending=False)
163 |     return result
164 | 
165 | # 获取用户到某个目的地的小时段距离排序
166 | def get_user_eloc_hour_latlon_sub_rank(result):
167 |     result = rank(result, ['userid', 'geohashed_end_loc', 'hour'], 'eloc_sloc_lat_sub', rank_name='user_eloc_hour_lat_sub_rank', ascending=False)
168 |     result = rank(result, ['userid', 'geohashed_end_loc', 'hour'], 'eloc_sloc_lon_sub', rank_name='user_eloc_hour_lon_sub_rank', ascending=False)
169 |     return result
170 | 
171 | # 获取从某个目的地出发的小时段距离排序
172 | def get_user_sloc_hour_latlon_sub_rank(result):
173 |     result = rank(result, ['userid', 'geohashed_start_loc', 'hour'], 'eloc_sloc_lat_sub', rank_name='user_sloc_hour_lat_sub_rank', ascending=False)
174 |     result = rank(result, ['userid', 'geohashed_start_loc', 'hour'], 'eloc_sloc_lon_sub', rank_name='user_sloc_hour_lon_sub_rank', ascending=False)
175 |     return result
176 | 
177 | # 获取到某个目的地的距离排序
178 | def get_eloc_latlon_sub_rank(result):
179 |     result = rank(result, 'geohashed_end_loc', 'eloc_sloc_lat_sub', rank_name='eloc_lat_sub_rank', ascending=False)
180 |     result = rank(result, 'geohashed_end_loc', 'eloc_sloc_lon_sub', rank_name='eloc_lon_sub_rank', ascending=False)
181 |     return result
182 | 
183 | # 获取从某个地点出发的距离排序
184 | def get_sloc_latlon_sub_rank(result):
185 |     result = rank(result, 'geohashed_start_loc', 'eloc_sloc_lat_sub', rank_name='sloc_lat_sub_rank', ascending=False)
186 |     result = rank(result, 'geohashed_start_loc', 'eloc_sloc_lon_sub', rank_name='sloc_lon_sub_rank', ascending=False)
187 |     return result
188 | 
189 | # 获取到某个目的地的小时段距离排序
190 | def get_eloc_hour_latlon_sub_rank(result):
191 |     result = rank(result, ['geohashed_end_loc', 'hour'], 'eloc_sloc_lat_sub', rank_name='eloc_hour_lat_sub_rank', ascending=False)
192 |     result = rank(result, ['geohashed_end_loc', 'hour'], 'eloc_sloc_lon_sub', rank_name='eloc_hour_lon_sub_rank', ascending=False)
193 |     return result
194 | 
195 | # 获取从某个目的地出发的小时段距离排序
196 | def get_sloc_hour_latlon_sub_rank(result):
197 |     result = rank(result, ['geohashed_start_loc', 'hour'], 'eloc_sloc_lat_sub', rank_name='sloc_hour_lat_sub_rank', ascending=False)
198 |     result = rank(result, ['geohashed_start_loc', 'hour'], 'eloc_sloc_lon_sub', rank_name='sloc_hour_lon_sub_rank', ascending=False)
199 |     return result
200 | 
201 | # ----------------- 交叉 -------------------
202 | 
203 | # 获取距离与用户出行距离统计值的(绝对)差值
204 | def get_user_latlon_sub_stat_sub(result):
205 |     result['user_lat_sub_mean_sub'] = (result['distance'] - result['user_lat_sub_mean'])
206 |     result['user_lon_sub_mean_sub'] = (result['distance'] - result['user_lon_sub_mean'])
207 |     result['user_lat_sub_mean_sub_abs'] = (result['distance'] - result['user_lat_sub_mean']).abs() # 6
208 |     result['user_lon_sub_mean_sub_abs'] = (result['distance'] - result['user_lon_sub_mean']).abs() # 1
209 |     return result
210 | 
211 | # 获取距离与用户从某个点出发距离统计值的(绝对)差值
212 | def get_user_sloc_latlon_sub_stat_sub(result):
213 |     # result['user_sloc_lat_sub_mean_sub'] = (result['distance'] - result['user_sloc_lat_sub_mean']) # 0
214 |     result['user_sloc_lon_sub_mean_sub'] = (result['distance'] - result['user_sloc_lon_sub_mean']) # 2
215 |     # result['user_sloc_lat_sub_mean_sub_abs'] = (result['distance'] - result['user_sloc_lat_sub_mean']).abs() # 0
216 |     # result['user_sloc_lon_sub_mean_sub_abs'] = (result['distance'] - result['user_sloc_lon_sub_mean']).abs() # 0
217 |     return result
218 | 
219 | # 获取距离与用户到某个点结束距离统计值的(绝对)差值
220 | def get_user_eloc_latlon_sub_stat_sub(result):
221 |     result['user_eloc_lat_sub_mean_sub'] = (result['distance'] - result['user_eloc_lat_sub_mean'])
222 |     result['user_eloc_lon_sub_mean_sub'] = (result['distance'] - result['user_eloc_lon_sub_mean'])
223 |     result['user_eloc_lat_sub_mean_sub_abs'] = (result['distance'] - result['user_eloc_lat_sub_mean']).abs()
224 |     result['user_eloc_lon_sub_mean_sub_abs'] = (result['distance'] - result['user_eloc_lon_sub_mean']).abs()
225 |     return result
226 | 
227 | # 获取距离与用户从某个点出发距离统计值的各小时段(绝对)差值
228 | def get_user_sloc_hour_latlon_sub_stat_sub(result):
229 |     result['user_sloc_hour_lat_sub_mean_sub'] = (result['distance'] - result['user_sloc_hour_lat_sub_mean'])
230 |     result['user_sloc_hour_lon_sub_mean_sub'] = (result['distance'] - result['user_sloc_hour_lon_sub_mean'])
231 |     result['user_sloc_hour_lat_sub_mean_sub_abs'] = (result['distance'] - result['user_sloc_hour_lat_sub_mean']).abs() # 5
232 |     result['user_sloc_hour_lon_sub_mean_sub_abs'] = (result['distance'] - result['user_sloc_hour_lon_sub_mean']).abs() # 8
233 |     return result
234 | 
235 | # 获取距离与用户到某个点结束距离统计值的各小时段(绝对)差值
236 | def get_user_eloc_hour_latlon_sub_stat_sub(result):
237 |     result['user_eloc_hour_lat_sub_mean_sub'] = (result['distance'] - result['user_eloc_hour_lat_sub_mean']) # 43
238 |     result['user_eloc_hour_lon_sub_mean_sub'] = (result['distance'] - result['user_eloc_hour_lon_sub_mean']) # 18
239 |     # result['user_eloc_hour_lat_sub_mean_sub_abs'] = (result['distance'] - result['user_eloc_hour_lat_sub_mean']).abs() # 0
240 |     result['user_eloc_hour_lon_sub_mean_sub_abs'] = (result['distance'] - result['user_eloc_hour_lon_sub_mean']).abs() # 3
241 |     return result
242 | 
243 | # 获取距离与从某个点出发距离统计值的(绝对)差值
244 | def get_sloc_latlon_sub_stat_sub(result):
245 |     result['sloc_lat_sub_mean_sub'] = (result['distance'] - result['sloc_lat_sub_mean'])
246 |     result['sloc_lon_sub_mean_sub'] = (result['distance'] - result['sloc_lon_sub_mean'])
247 |     result['sloc_lat_sub_mean_sub_abs'] = (result['distance'] - result['sloc_lat_sub_mean']).abs() # 4
248 |     result['sloc_lon_sub_mean_sub_abs'] = (result['distance'] - result['sloc_lon_sub_mean']).abs() # 4
249 |     return result
250 | 
251 | # 获取距离与到某个点结束距离统计值的(绝对)差值
252 | def get_eloc_latlon_sub_stat_sub(result):
253 |     result['eloc_lat_sub_mean_sub'] = (result['distance'] - result['eloc_lat_sub_mean'])
254 |     result['eloc_lon_sub_mean_sub'] = (result['distance'] - result['eloc_lon_sub_mean'])
255 |     result['eloc_lat_sub_mean_sub_abs'] = (result['distance'] - result['eloc_lat_sub_mean']).abs() # 7
256 |     result['eloc_lon_sub_mean_sub_abs'] = (result['distance'] - result['eloc_lon_sub_mean']).abs()
257 |     return result
258 | 
259 | # 获取距离与从某个点出发距离统计值的各小时段(绝对)差值
260 | def get_sloc_hour_latlon_sub_stat_sub(result):
261 |     result['sloc_hour_lat_sub_mean_sub'] = (result['distance'] - result['sloc_hour_lat_sub_mean'])
262 |     result['sloc_hour_lon_sub_mean_sub'] = (result['distance'] - result['sloc_hour_lon_sub_mean'])
263 |     result['sloc_hour_lat_sub_mean_sub_abs'] = (result['distance'] - result['sloc_hour_lat_sub_mean']).abs() # 7
264 |     result['sloc_hour_lon_sub_mean_sub_abs'] = (result['distance'] - result['sloc_hour_lon_sub_mean']).abs() # 6
265 |     return result
266 | 
267 | # 获取距离与到某个点结束距离统计值的各小时段(绝对)差值
268 | def get_eloc_hour_latlon_sub_stat_sub(result):
269 |     result['eloc_hour_lat_sub_mean_sub'] = (result['distance'] - result['eloc_hour_lat_sub_mean'])
270 |     result['eloc_hour_lon_sub_mean_sub'] = (result['distance'] - result['eloc_hour_lon_sub_mean'])
271 |     result['eloc_hour_lat_sub_mean_sub_abs'] = (result['distance'] - result['eloc_hour_lat_sub_mean']).abs() # 9
272 |     result['eloc_hour_lon_sub_mean_sub_abs'] = (result['distance'] - result['eloc_hour_lon_sub_mean']).abs()
273 |     return result


--------------------------------------------------------------------------------
/feature/leak.py:
--------------------------------------------------------------------------------
  1 | # -*- coding:utf-8 -*-
  2 | import pandas as pd
  3 | import numpy as np
  4 | from .other import get_distance
  5 | 
  6 | '''
  7 |     获取Leak特征
  8 | '''
  9 | 
 10 | # 获取目标地点与用户上一次、下一次出发地点的距离、时间差和速度
 11 | def get_eloc_user_sloc_leak(result):
 12 |     user_order = result[['orderid', 'userid', 'geohashed_start_loc', 'starttime', 'sloc_lat', 'sloc_lon']].drop_duplicates()
 13 |     user_order.sort_values(by=['userid', 'starttime'], inplace=True, ascending=True)
 14 |     user_order['last_userid'] = user_order.userid.shift(1)
 15 |     user_order['next_userid'] = user_order.userid.shift(-1)
 16 |     user_order['last_starttime'] = user_order.starttime.shift(1)
 17 |     user_order['user_last_order_time_diff'] = (pd.DatetimeIndex(user_order['starttime'])-pd.DatetimeIndex(user_order['last_starttime'])).total_seconds()
 18 |     user_order['next_starttime'] = user_order.starttime.shift(-1)
 19 |     user_order['user_next_order_time_diff'] = (pd.DatetimeIndex(user_order['starttime'])-pd.DatetimeIndex(user_order['next_starttime'])).total_seconds()
 20 |     user_order['last_sloc'] = user_order.geohashed_start_loc.shift(1)
 21 |     user_order['next_sloc'] = user_order.geohashed_start_loc.shift(-1)
 22 |     user_order['last_sloc_lat'] = user_order.sloc_lat.shift(1)
 23 |     user_order['next_sloc_lat'] = user_order.sloc_lat.shift(-1)
 24 |     user_order['last_sloc_lon'] = user_order.sloc_lon.shift(1)
 25 |     user_order['next_sloc_lon'] = user_order.sloc_lon.shift(-1)
 26 |     user_order.drop(['geohashed_start_loc', 'starttime', 'last_starttime', 'next_starttime', 'sloc_lat', 'sloc_lon'], axis=1, inplace=True)
 27 | 
 28 |     restmp = pd.merge(result[['orderid', 'geohashed_end_loc', 'eloc_lat', 'eloc_lon']], user_order, on='orderid', how='left')
 29 |     restmp.rename(columns={'last_sloc': 'geohashed_start_loc'}, inplace=True)
 30 |     distance = get_distance(restmp)
 31 |     restmp['eloc_user_last_sloc_distance'] = distance['distance']
 32 |     restmp['eloc_user_last_sloc_manhattan'] = distance['manhattan']
 33 | 
 34 |     restmp['eloc_user_last_sloc_lat_sub'] = restmp.eloc_lat - restmp.last_sloc_lat
 35 |     restmp['eloc_user_last_sloc_lon_sub'] = restmp.eloc_lon - restmp.last_sloc_lon
 36 |     restmp.drop(['geohashed_start_loc', 'distance', 'last_sloc_lat', 'last_sloc_lon', 'manhattan'], axis=1, inplace=True)
 37 |     restmp['eloc_user_last_sloc_speed'] = restmp.eloc_user_last_sloc_distance / restmp.user_last_order_time_diff
 38 |     restmp['eloc_user_last_sloc_manhattan_speed'] = restmp.eloc_user_last_sloc_manhattan / restmp.user_last_order_time_diff
 39 |     restmp['eloc_user_last_sloc_latlon_slope'] = restmp.eloc_user_last_sloc_lat_sub / restmp.eloc_user_last_sloc_lon_sub
 40 |     restmp['eloc_user_last_sloc_lat_sub_divide_distance'] = restmp.eloc_user_last_sloc_lat_sub / restmp.eloc_user_last_sloc_distance
 41 |     restmp['eloc_user_last_sloc_lon_sub_divide_distance'] = restmp.eloc_user_last_sloc_lon_sub / restmp.eloc_user_last_sloc_distance
 42 |     
 43 |     restmp.rename(columns={'next_sloc': 'geohashed_start_loc'}, inplace=True)
 44 |     distance = get_distance(restmp)
 45 |     restmp['eloc_user_next_sloc_distance'] = distance['distance']
 46 |     restmp['eloc_user_next_sloc_manhattan'] = distance['manhattan']
 47 |     restmp['eloc_user_next_sloc_lat_sub'] = restmp.eloc_lat - restmp.next_sloc_lat
 48 |     restmp['eloc_user_next_sloc_lon_sub'] = restmp.eloc_lon - restmp.next_sloc_lon
 49 |     restmp.drop(['geohashed_start_loc', 'distance', 'next_sloc_lat', 'next_sloc_lon', 'manhattan'], axis=1, inplace=True)
 50 |     restmp['eloc_user_next_sloc_speed'] = restmp.eloc_user_next_sloc_distance / restmp.user_next_order_time_diff
 51 |     restmp['eloc_user_next_sloc_manhattan_speed'] = restmp.eloc_user_next_sloc_manhattan / restmp.user_next_order_time_diff
 52 |     restmp['eloc_user_next_sloc_latlon_slope'] = restmp.eloc_user_next_sloc_lat_sub / restmp.eloc_user_next_sloc_lon_sub
 53 |     restmp['eloc_user_next_sloc_lat_sub_divide_distance'] = restmp.eloc_user_next_sloc_lat_sub / restmp.eloc_user_next_sloc_distance
 54 |     restmp['eloc_user_next_sloc_lon_sub_divide_distance'] = restmp.eloc_user_next_sloc_lon_sub / restmp.eloc_user_next_sloc_distance
 55 |     
 56 |     restmp.loc[restmp.userid != restmp.last_userid, 'user_last_order_time_diff'] = -1000000
 57 |     restmp.loc[restmp.userid != restmp.last_userid, 'eloc_user_last_sloc_distance'] = -1000000
 58 |     restmp.loc[restmp.userid != restmp.last_userid, 'eloc_user_last_sloc_manhattan'] = -1000000
 59 |     restmp.loc[restmp.userid != restmp.last_userid, 'eloc_user_last_sloc_speed'] = -1000000
 60 |     restmp.loc[restmp.userid != restmp.last_userid, 'eloc_user_last_sloc_manhattan_speed'] = -1000000
 61 |     restmp.loc[restmp.userid != restmp.last_userid, 'eloc_user_last_sloc_lat_sub'] = -1000000
 62 |     restmp.loc[restmp.userid != restmp.last_userid, 'eloc_user_last_sloc_lon_sub'] = -1000000
 63 |     restmp.loc[restmp.userid != restmp.last_userid, 'eloc_user_last_sloc_latlon_slope'] = -1000000
 64 |     restmp.loc[restmp.userid != restmp.last_userid, 'eloc_user_last_sloc_lat_sub_divide_distance'] = -1000000
 65 |     restmp.loc[restmp.userid != restmp.last_userid, 'eloc_user_last_sloc_lon_sub_divide_distance'] = -1000000
 66 |     
 67 |     restmp.loc[restmp.userid != restmp.last_userid, 'user_next_order_time_diff'] = -1000000
 68 |     restmp.loc[restmp.userid != restmp.next_userid, 'eloc_user_next_sloc_distance'] = -1000000
 69 |     restmp.loc[restmp.userid != restmp.next_userid, 'eloc_user_next_sloc_manhattan'] = -1000000
 70 |     restmp.loc[restmp.userid != restmp.next_userid, 'eloc_user_next_sloc_speed'] = -1000000
 71 |     restmp.loc[restmp.userid != restmp.next_userid, 'eloc_user_next_sloc_manhattan_speed'] = -1000000
 72 |     restmp.loc[restmp.userid != restmp.next_userid, 'eloc_user_next_sloc_lat_sub'] = -1000000
 73 |     restmp.loc[restmp.userid != restmp.next_userid, 'eloc_user_next_sloc_lon_sub'] = -1000000
 74 |     restmp.loc[restmp.userid != restmp.next_userid, 'eloc_user_next_sloc_latlon_slope'] = -1000000
 75 |     restmp.loc[restmp.userid != restmp.next_userid, 'eloc_user_next_sloc_lat_sub_divide_distance'] = -1000000
 76 |     restmp.loc[restmp.userid != restmp.next_userid, 'eloc_user_next_sloc_lon_sub_divide_distance'] = -1000000
 77 | 
 78 |     result['user_last_order_time_diff'] = restmp['user_last_order_time_diff'] # wc
 79 |     result['eloc_user_last_sloc_distance'] = restmp['eloc_user_last_sloc_distance'] # dui 90 wc
 80 |     result['eloc_user_last_sloc_manhattan'] = restmp['eloc_user_last_sloc_manhattan']
 81 |     result['eloc_user_last_sloc_speed'] = restmp['eloc_user_last_sloc_speed'] # dui 90 wc
 82 |     result['eloc_user_last_sloc_manhattan_speed'] = restmp['eloc_user_last_sloc_manhattan_speed']
 83 |     result['eloc_user_last_sloc_lat_sub'] = restmp['eloc_user_last_sloc_lat_sub']
 84 |     result['eloc_user_last_sloc_lon_sub'] = restmp['eloc_user_last_sloc_lon_sub']
 85 |     result['eloc_user_last_sloc_latlon_slope'] = restmp['eloc_user_last_sloc_latlon_slope']
 86 |     result['eloc_user_last_sloc_lat_sub_divide_distance'] = restmp['eloc_user_last_sloc_lat_sub_divide_distance']
 87 |     result['eloc_user_last_sloc_lon_sub_divide_distance'] = restmp['eloc_user_last_sloc_lon_sub_divide_distance']
 88 |     
 89 |     result['user_next_order_time_diff'] = restmp['user_next_order_time_diff']
 90 |     result['eloc_user_next_sloc_distance'] = restmp['eloc_user_next_sloc_distance'] # dui 90 wc
 91 |     result['eloc_user_next_sloc_manhattan'] = restmp['eloc_user_next_sloc_manhattan'] # wc
 92 |     result['eloc_user_next_sloc_speed'] = restmp['eloc_user_next_sloc_speed'] # dui 90 wc
 93 |     result['eloc_user_next_sloc_manhattan_speed'] = restmp['eloc_user_next_sloc_manhattan_speed'] # wc
 94 |     result['eloc_user_next_sloc_lat_sub'] = restmp['eloc_user_next_sloc_lat_sub'] # dui 90 wc
 95 |     result['eloc_user_next_sloc_lon_sub'] = restmp['eloc_user_next_sloc_lon_sub'] # dui 90 wc
 96 |     result['eloc_user_next_sloc_latlon_slope'] = restmp['eloc_user_next_sloc_latlon_slope']
 97 |     result['eloc_user_next_sloc_lat_sub_divide_distance'] = restmp['eloc_user_next_sloc_lat_sub_divide_distance']
 98 |     result['eloc_user_next_sloc_lon_sub_divide_distance'] = restmp['eloc_user_next_sloc_lon_sub_divide_distance']
 99 | 
100 |     return result
101 | 
102 | # 获取目标地点与车辆上一次、下一次出发地点的距离、时间差、速度及经纬度信息等
103 | def get_eloc_bike_sloc_leak(result):
104 |     bike_order = result[['orderid', 'bikeid', 'geohashed_start_loc', 'starttime', 'sloc_lat', 'sloc_lon']].drop_duplicates()
105 |     bike_order.sort_values(by=['bikeid', 'starttime'], inplace=True, ascending=True)
106 |     bike_order['last_bikeid'] = bike_order.bikeid.shift(1)
107 |     bike_order['next_bikeid'] = bike_order.bikeid.shift(-1)
108 |     bike_order['last_starttime'] = bike_order.starttime.shift(1)
109 |     bike_order['bike_last_order_time_diff'] = (pd.DatetimeIndex(bike_order['starttime'])-pd.DatetimeIndex(bike_order['last_starttime'])).total_seconds()
110 |     bike_order['next_starttime'] = bike_order.starttime.shift(-1)
111 |     bike_order['bike_next_order_time_diff'] = (pd.DatetimeIndex(bike_order['starttime'])-pd.DatetimeIndex(bike_order['next_starttime'])).total_seconds()
112 |     bike_order['last_sloc'] = bike_order.geohashed_start_loc.shift(1)
113 |     bike_order['next_sloc'] = bike_order.geohashed_start_loc.shift(-1)
114 |     bike_order['last_sloc_lat'] = bike_order.sloc_lat.shift(1)
115 |     bike_order['next_sloc_lat'] = bike_order.sloc_lat.shift(-1)
116 |     bike_order['last_sloc_lon'] = bike_order.sloc_lon.shift(1)
117 |     bike_order['next_sloc_lon'] = bike_order.sloc_lon.shift(-1)
118 |     bike_order.drop(['geohashed_start_loc', 'starttime', 'last_starttime', 'next_starttime', 'sloc_lat', 'sloc_lon'], axis=1, inplace=True)
119 | 
120 |     restmp = pd.merge(result[['orderid', 'geohashed_end_loc', 'eloc_lat', 'eloc_lon']], bike_order, on='orderid', how='left')
121 |     restmp.rename(columns={'last_sloc': 'geohashed_start_loc'}, inplace=True)
122 |     distance = get_distance(restmp)
123 |     restmp['eloc_bike_last_sloc_distance'] = distance['distance']
124 |     restmp['eloc_bike_last_sloc_manhattan'] = distance['manhattan']
125 |     restmp['eloc_bike_last_sloc_lat_sub'] = restmp.eloc_lat - restmp.last_sloc_lat
126 |     restmp['eloc_bike_last_sloc_lon_sub'] = restmp.eloc_lon - restmp.last_sloc_lon
127 |     restmp.drop(['geohashed_start_loc', 'distance', 'last_sloc_lat', 'last_sloc_lon', 'manhattan'], axis=1, inplace=True)
128 |     restmp['eloc_bike_last_sloc_speed'] = restmp.eloc_bike_last_sloc_distance / restmp.bike_last_order_time_diff
129 |     restmp['eloc_bike_last_sloc_manhattan_speed'] = restmp.eloc_bike_last_sloc_manhattan / restmp.bike_last_order_time_diff
130 |     restmp['eloc_bike_last_sloc_latlon_slope'] = restmp.eloc_bike_last_sloc_lat_sub / restmp.eloc_bike_last_sloc_lon_sub
131 |     restmp['eloc_bike_last_sloc_lat_sub_divide_distance'] = restmp.eloc_bike_last_sloc_lat_sub / restmp.eloc_bike_last_sloc_distance
132 |     restmp['eloc_bike_last_sloc_lon_sub_divide_distance'] = restmp.eloc_bike_last_sloc_lon_sub / restmp.eloc_bike_last_sloc_distance
133 |     
134 |     restmp.rename(columns={'next_sloc': 'geohashed_start_loc'}, inplace=True)
135 |     distance = get_distance(restmp)
136 |     restmp['eloc_bike_next_sloc_distance'] = distance['distance']
137 |     restmp['eloc_bike_next_sloc_manhattan'] = distance['manhattan']
138 |     restmp['eloc_bike_next_sloc_lat_sub'] = restmp.eloc_lat - restmp.next_sloc_lat
139 |     restmp['eloc_bike_next_sloc_lon_sub'] = restmp.eloc_lon - restmp.next_sloc_lon
140 |     restmp.drop(['geohashed_start_loc', 'distance', 'next_sloc_lat', 'next_sloc_lon', 'manhattan'], axis=1, inplace=True)
141 |     restmp['eloc_bike_next_sloc_speed'] = restmp.eloc_bike_next_sloc_distance / restmp.bike_next_order_time_diff
142 |     restmp['eloc_bike_next_sloc_manhattan_speed'] = restmp.eloc_bike_next_sloc_manhattan / restmp.bike_next_order_time_diff
143 |     restmp['eloc_bike_next_sloc_latlon_slope'] = restmp.eloc_bike_next_sloc_lat_sub / restmp.eloc_bike_next_sloc_lon_sub
144 |     restmp['eloc_bike_next_sloc_lat_sub_divide_distance'] = restmp.eloc_bike_next_sloc_lat_sub / restmp.eloc_bike_next_sloc_distance
145 |     restmp['eloc_bike_next_sloc_lon_sub_divide_distance'] = restmp.eloc_bike_next_sloc_lon_sub / restmp.eloc_bike_next_sloc_distance
146 |     
147 |     restmp.loc[restmp.bikeid != restmp.last_bikeid, 'eloc_bike_last_sloc_distance'] = -1000000
148 |     restmp.loc[restmp.bikeid != restmp.last_bikeid, 'eloc_bike_last_sloc_manhattan'] = -1000000
149 |     restmp.loc[restmp.bikeid != restmp.last_bikeid, 'eloc_bike_last_sloc_speed'] = -1000000
150 |     restmp.loc[restmp.bikeid != restmp.last_bikeid, 'eloc_bike_last_sloc_manhattan_speed'] = -1000000
151 |     restmp.loc[restmp.bikeid != restmp.last_bikeid, 'eloc_bike_last_sloc_lat_sub'] = -1000000
152 |     restmp.loc[restmp.bikeid != restmp.last_bikeid, 'eloc_bike_last_sloc_lon_sub'] = -1000000
153 |     restmp.loc[restmp.bikeid != restmp.last_bikeid, 'eloc_bike_last_sloc_latlon_slope'] = -1000000
154 |     restmp.loc[restmp.bikeid != restmp.last_bikeid, 'eloc_bike_last_sloc_lat_sub_divide_distance'] = -1000000
155 |     restmp.loc[restmp.bikeid != restmp.last_bikeid, 'eloc_bike_last_sloc_lon_sub_divide_distance'] = -1000000
156 |     
157 |     restmp.loc[restmp.bikeid != restmp.next_bikeid, 'eloc_bike_next_sloc_distance'] = -1000000
158 |     restmp.loc[restmp.bikeid != restmp.next_bikeid, 'eloc_bike_next_sloc_manhattan'] = -1000000
159 |     restmp.loc[restmp.bikeid != restmp.next_bikeid, 'eloc_bike_next_sloc_speed'] = -1000000
160 |     restmp.loc[restmp.bikeid != restmp.next_bikeid, 'eloc_bike_next_sloc_manhattan_speed'] = -1000000
161 |     restmp.loc[restmp.bikeid != restmp.next_bikeid, 'eloc_bike_next_sloc_lat_sub'] = -1000000
162 |     restmp.loc[restmp.bikeid != restmp.next_bikeid, 'eloc_bike_next_sloc_lon_sub'] = -1000000
163 |     restmp.loc[restmp.bikeid != restmp.next_bikeid, 'eloc_bike_next_sloc_latlon_slope'] = -1000000
164 |     restmp.loc[restmp.bikeid != restmp.next_bikeid, 'eloc_bike_next_sloc_lat_sub_divide_distance'] = -1000000
165 |     restmp.loc[restmp.bikeid != restmp.next_bikeid, 'eloc_bike_next_sloc_lon_sub_divide_distance'] = -1000000
166 | 
167 |     result['eloc_bike_last_sloc_distance'] = restmp['eloc_bike_last_sloc_distance'] # dui 90 wc
168 |     result['eloc_bike_last_sloc_manhattan'] = restmp['eloc_bike_last_sloc_manhattan']
169 |     result['eloc_bike_last_sloc_speed'] = restmp['eloc_bike_last_sloc_speed'] # dui 90 wc
170 |     result['eloc_bike_last_sloc_manhattan_speed'] = restmp['eloc_bike_last_sloc_manhattan_speed']
171 |     result['eloc_bike_last_sloc_lat_sub'] = restmp['eloc_bike_last_sloc_lat_sub'] # 90
172 |     result['eloc_bike_last_sloc_lon_sub'] = restmp['eloc_bike_last_sloc_lon_sub'] # 90
173 |     result['eloc_bike_last_sloc_latlon_slope'] = restmp['eloc_bike_last_sloc_latlon_slope'] # 90
174 |     result['eloc_bike_last_sloc_lat_sub_divide_distance'] = restmp['eloc_bike_last_sloc_lat_sub_divide_distance'] # 90
175 |     result['eloc_bike_last_sloc_lon_sub_divide_distance'] = restmp['eloc_bike_last_sloc_lon_sub_divide_distance'] # 90
176 |     
177 |     result['eloc_bike_next_sloc_distance'] = restmp['eloc_bike_next_sloc_distance'] # dui 90 wc
178 |     result['eloc_bike_next_sloc_manhattan'] = restmp['eloc_bike_next_sloc_manhattan']
179 |     result['eloc_bike_next_sloc_speed'] = restmp['eloc_bike_next_sloc_speed'] # dui 90 wc
180 |     result['eloc_bike_next_sloc_manhattan_speed'] = restmp['eloc_bike_next_sloc_manhattan_speed']
181 |     result['eloc_bike_next_sloc_lat_sub'] = restmp['eloc_bike_next_sloc_lat_sub'] # 90
182 |     result['eloc_bike_next_sloc_lon_sub'] = restmp['eloc_bike_next_sloc_lon_sub'] # 90
183 |     result['eloc_bike_next_sloc_latlon_slope'] = restmp['eloc_bike_next_sloc_latlon_slope'] # 90
184 |     result['eloc_bike_next_sloc_lat_sub_divide_distance'] = restmp['eloc_bike_next_sloc_lat_sub_divide_distance'] # 90
185 |     result['eloc_bike_next_sloc_lon_sub_divide_distance'] = restmp['eloc_bike_next_sloc_lon_sub_divide_distance'] # 90
186 |     
187 |     return result


--------------------------------------------------------------------------------
/feature/location.py:
--------------------------------------------------------------------------------
  1 | # -*- coding:utf-8 -*-
  2 | import pandas as pd
  3 | import Geohash as geohash
  4 | import numpy as np
  5 | import os
  6 | os.path.join('..')
  7 | from utils import rank
  8 | 
  9 | '''
 10 | 	获取地理位置特征
 11 | '''
 12 | 
 13 | # ----------------- 计数 -------------------
 14 | 
 15 | # 获取目的地的热度
 16 | def get_eloc_count(train, result):
 17 |     train = train[~train.geohashed_end_loc.isnull()]
 18 |     eloc_count = train.groupby('geohashed_end_loc', as_index=False)['userid'].agg({'eloc_count': 'count'})
 19 |     result = pd.merge(result, eloc_count, on='geohashed_end_loc', how='left')
 20 |     return result
 21 | 
 22 | # 获取出发地热度
 23 | def get_sloc_count(train, result):
 24 |     sloc_count = train.groupby('geohashed_start_loc', as_index=False)['userid'].agg({'sloc_count': 'count'})
 25 |     result = pd.merge(result, sloc_count, on='geohashed_start_loc', how='left')
 26 |     return result
 27 | 
 28 | # 获取出发地作为目的地的热度
 29 | def get_sloc_as_eloc_count(train, result):
 30 |     train = train[~train.geohashed_end_loc.isnull()]
 31 |     sloc_as_eloc_count = train.groupby('geohashed_end_loc', as_index=False)['userid'].agg({'sloc_as_eloc_count': 'count'})
 32 |     sloc_as_eloc_count.rename(columns={'geohashed_end_loc': 'geohashed_start_loc'}, inplace=True)
 33 |     result = pd.merge(result, sloc_as_eloc_count, on='geohashed_start_loc', how='left')
 34 |     return result
 35 | 
 36 | # 获取目的地作为出发地的热度
 37 | def get_eloc_as_sloc_count(train, result):
 38 |     eloc_as_sloc_count = train.groupby('geohashed_start_loc', as_index=False)['userid'].agg({'eloc_as_sloc_count': 'count'})
 39 |     eloc_as_sloc_count.rename(columns={'geohashed_start_loc': 'geohashed_end_loc'}, inplace=True)
 40 |     result = pd.merge(result, eloc_as_sloc_count, on='geohashed_end_loc', how='left')
 41 |     return result
 42 | 
 43 | # 获取出发地->目的地地址对的热度
 44 | def get_sloc_eloc_count(train, result):
 45 |     train = train[~train.geohashed_end_loc.isnull()]
 46 |     sloc_eloc_count = train.groupby(['geohashed_start_loc', 'geohashed_end_loc'], as_index=False)['userid'].agg({'sloc_eloc_count': 'count'})
 47 |     result = pd.merge(result, sloc_eloc_count, on=['geohashed_start_loc', 'geohashed_end_loc'], how='left')
 48 |     return result
 49 | 
 50 | # 获取目的地->出发地地址对的热度（返程次数）
 51 | def get_eloc_sloc_count(train, result):
 52 |     train = train[~train.geohashed_end_loc.isnull()]
 53 |     eloc_sloc_count = train.groupby(['geohashed_start_loc', 'geohashed_end_loc'], as_index=False)['userid'].agg({'eloc_sloc_count': 'count'})
 54 |     eloc_sloc_count.rename(columns={'geohashed_start_loc': 'geohashed_end_loc', 'geohashed_end_loc': 'geohashed_start_loc'}, inplace=True)
 55 |     result = pd.merge(result, eloc_sloc_count, on=['geohashed_start_loc', 'geohashed_end_loc'], how='left')
 56 |     return result
 57 | 
 58 | # 获取目的地的用户热度
 59 | def get_eloc_user_count(train, result):
 60 |     train = train[~train.geohashed_end_loc.isnull()]
 61 |     eloc_user_count = train.groupby(['geohashed_end_loc'], as_index=False)['userid'].agg({'eloc_user_count': lambda x: np.unique(x).size})
 62 |     result = pd.merge(result, eloc_user_count, on='geohashed_end_loc', how='left')
 63 |     return result
 64 | 
 65 | # 获取出发地的用户热度
 66 | def get_sloc_user_count(train, result):
 67 |     sloc_user_count = train.groupby(['geohashed_start_loc'], as_index=False)['userid'].agg({'sloc_user_count': lambda x: np.unique(x).size})
 68 |     result = pd.merge(result, sloc_user_count, on='geohashed_start_loc', how='left')
 69 |     return result
 70 | 
 71 | # 获取出发地作为目的地的用户热度
 72 | def get_sloc_as_eloc_user_count(train, result):
 73 |     train = train[~train.geohashed_end_loc.isnull()]
 74 |     sloc_as_eloc_user_count = train.groupby('geohashed_end_loc', as_index=False)['userid'].agg({'sloc_as_eloc_user_count': lambda x: np.unique(x).size})
 75 |     sloc_as_eloc_user_count.rename(columns={'geohashed_end_loc': 'geohashed_start_loc'}, inplace=True)
 76 |     result = pd.merge(result, sloc_as_eloc_user_count, on='geohashed_start_loc', how='left')
 77 |     return result
 78 | 
 79 | # 获取目的地作为出发地的用户热度
 80 | def get_eloc_as_sloc_user_count(train, result):
 81 |     eloc_as_sloc_user_count = train.groupby('geohashed_start_loc', as_index=False)['userid'].agg({'eloc_as_sloc_user_count': lambda x: np.unique(x).size})
 82 |     eloc_as_sloc_user_count.rename(columns={'geohashed_start_loc': 'geohashed_end_loc'}, inplace=True)
 83 |     result = pd.merge(result, eloc_as_sloc_user_count, on='geohashed_end_loc', how='left')
 84 |     return result
 85 | 
 86 | # 获取出发地->目的地地址对的用户热度
 87 | def get_sloc_eloc_user_count(train, result):
 88 |     train = train[~train.geohashed_end_loc.isnull()]
 89 |     sloc_eloc_user_count = train.groupby(['geohashed_start_loc', 'geohashed_end_loc'], as_index=False)['userid'].agg({'sloc_eloc_user_count': lambda x: np.unique(x).size})
 90 |     result = pd.merge(result, sloc_eloc_user_count, on=['geohashed_start_loc', 'geohashed_end_loc'], how='left')
 91 |     return result
 92 | 
 93 | # 获取目的地->出发地地址对的用户热度（返程用户数）
 94 | def get_eloc_sloc_user_count(train, result):
 95 |     train = train[~train.geohashed_end_loc.isnull()]
 96 |     eloc_sloc_user_count = train.groupby(['geohashed_start_loc', 'geohashed_end_loc'], as_index=False)['userid'].agg({'eloc_sloc_user_count': lambda x: np.unique(x).size})
 97 |     eloc_sloc_user_count.rename(columns={'geohashed_start_loc': 'geohashed_end_loc', 'geohashed_end_loc': 'geohashed_start_loc'}, inplace=True)
 98 |     result = pd.merge(result, eloc_sloc_user_count, on=['geohashed_start_loc', 'geohashed_end_loc'], how='left')
 99 |     return result
100 | 
101 | # 获取从某个地方出发的目的地个数
102 | def get_sloc_eloccount(train, result):
103 |     train = train[~train.geohashed_end_loc.isnull()]
104 |     sloc_eloccount = train.groupby('geohashed_start_loc', as_index=False)['geohashed_end_loc'].agg({'sloc_eloccount': lambda x: np.unique(x).size})
105 |     result = pd.merge(result, sloc_eloccount, on='geohashed_start_loc', how='left')
106 |     return result
107 | 
108 | # 获取到某个地方结束的出发地个数
109 | def get_eloc_sloccount(train, result):
110 |     train = train[~train.geohashed_end_loc.isnull()]
111 |     eloc_sloccount = train.groupby('geohashed_end_loc', as_index=False)['geohashed_start_loc'].agg({'eloc_sloccount': lambda x: np.unique(x).size})
112 |     result = pd.merge(result, eloc_sloccount, on='geohashed_end_loc', how='left')
113 |     return result
114 | 
115 | # 获取目的地在各小时段的订单数
116 | def get_eloc_hour_count(train, result):
117 |     train = train[~train.geohashed_end_loc.isnull()]
118 |     eloc_hour_count = train.groupby(['geohashed_end_loc', 'hour'], as_index=False)['orderid'].agg({'eloc_hour_count': 'count'})
119 |     result = pd.merge(result, eloc_hour_count, on=['geohashed_end_loc', 'hour'], how='left')
120 |     return result
121 | 
122 | # 获取出发地在各小时段的订单数
123 | def get_sloc_hour_count(train, result):
124 |     sloc_hour_count = train.groupby(['geohashed_start_loc', 'hour'], as_index=False)['orderid'].agg({'sloc_hour_count': 'count'})
125 |     result = pd.merge(result, sloc_hour_count, on=['geohashed_start_loc', 'hour'], how='left')
126 |     return result
127 | 
128 | # 获取出发地->目的地地址对的小时热度
129 | def get_sloc_eloc_hour_count(train, result):
130 |     train = train[~train.geohashed_end_loc.isnull()]
131 |     sloc_eloc_hour_count = train.groupby(['geohashed_start_loc', 'geohashed_end_loc', 'hour'], as_index=False)['userid'].agg({'sloc_eloc_hour_count': 'count'})
132 |     result = pd.merge(result, sloc_eloc_hour_count, on=['geohashed_start_loc', 'geohashed_end_loc', 'hour'], how='left')
133 |     return result
134 | 
135 | # 获取目的地->出发地地址对的小时热度（返程次数）
136 | def get_eloc_sloc_hour_count(train, result):
137 |     train = train[~train.geohashed_end_loc.isnull()]
138 |     eloc_sloc_hour_count = train.groupby(['geohashed_start_loc', 'geohashed_end_loc', 'hour'], as_index=False)['userid'].agg({'eloc_sloc_hour_count': 'count'})
139 |     eloc_sloc_hour_count.rename(columns={'geohashed_start_loc': 'geohashed_end_loc', 'geohashed_end_loc': 'geohashed_start_loc'}, inplace=True)
140 |     result = pd.merge(result, eloc_sloc_hour_count, on=['geohashed_start_loc', 'geohashed_end_loc', 'hour'], how='left')
141 |     return result
142 | 
143 | # 获取目的地在各小时段的用户数
144 | def get_eloc_hour_user_count(train, result):
145 |     train = train[~train.geohashed_end_loc.isnull()]
146 |     eloc_hour_user_count = train.groupby(['geohashed_end_loc', 'hour'], as_index=False)['userid'].agg({'eloc_hour_user_count': 
147 |                                                                                     lambda x: np.unique(x).size})
148 |     result = pd.merge(result, eloc_hour_user_count, on=['geohashed_end_loc', 'hour'], how='left')
149 |     return result
150 | 
151 | # 获取出发地在各小时段的用户数
152 | def get_sloc_hour_user_count(train, result):
153 |     sloc_hour_user_count = train.groupby(['geohashed_start_loc', 'hour'], as_index=False)['userid'].agg({'sloc_hour_user_count': 
154 |                                                                                     lambda x: np.unique(x).size})
155 |     result = pd.merge(result, sloc_hour_user_count, on=['geohashed_start_loc', 'hour'], how='left')
156 |     return result
157 | 
158 | # 获取出发地->目的地地址对的用户小时热度
159 | def get_sloc_eloc_hour_user_count(train, result):
160 |     train = train[~train.geohashed_end_loc.isnull()]
161 |     sloc_eloc_hour_user_count = train.groupby(['geohashed_start_loc', 'geohashed_end_loc', 'hour'], as_index=False)['userid'].agg({'sloc_eloc_hour_user_count': lambda x: np.unique(x).size})
162 |     result = pd.merge(result, sloc_eloc_hour_user_count, on=['geohashed_start_loc', 'geohashed_end_loc', 'hour'], how='left')
163 |     return result
164 | 
165 | # 获取目的地->出发地地址对的用户小时热度（返程用户数）
166 | def get_eloc_sloc_hour_user_count(train, result):
167 |     train = train[~train.geohashed_end_loc.isnull()]
168 |     eloc_sloc_hour_user_count = train.groupby(['geohashed_start_loc', 'geohashed_end_loc', 'hour'], as_index=False)['userid'].agg({'eloc_sloc_hour_user_count': lambda x: np.unique(x).size})
169 |     eloc_sloc_hour_user_count.rename(columns={'geohashed_start_loc': 'geohashed_end_loc', 'geohashed_end_loc': 'geohashed_start_loc'}, inplace=True)
170 |     result = pd.merge(result, eloc_sloc_hour_user_count, on=['geohashed_start_loc', 'geohashed_end_loc', 'hour'], how='left') # 9
171 |     return result
172 | 
173 | # 获取从某个地方出发每个小时的目的地个数
174 | def get_sloc_hour_eloccount(train, result):
175 |     train = train[~train.geohashed_end_loc.isnull()]
176 |     sloc_hour_eloccount = train.groupby(['geohashed_start_loc', 'hour'], as_index=False)['geohashed_end_loc'].agg({'sloc_hour_eloccount': lambda x: np.unique(x).size})
177 |     result = pd.merge(result, sloc_hour_eloccount, on=['geohashed_start_loc', 'hour'], how='left')
178 |     return result
179 | 
180 | # 获取到某个地方结束每个小时的出发地个数
181 | def get_eloc_hour_sloccount(train, result):
182 |     train = train[~train.geohashed_end_loc.isnull()]
183 |     eloc_hour_sloccount = train.groupby(['geohashed_end_loc', 'hour'], as_index=False)['geohashed_start_loc'].agg({'eloc_hour_sloccount': lambda x: np.unique(x).size})
184 |     result = pd.merge(result, eloc_hour_sloccount, on=['geohashed_end_loc', 'hour'], how='left')
185 |     return result
186 | 
187 | # ----------------- 统计 -------------------
188 | 
189 | # 获取从某个地点出发的距离统计
190 | def get_sloc_distance_stat(train, result):
191 |     train = train[~train.geohashed_end_loc.isnull()]
192 |     sloc_distance_stat = train.groupby(['geohashed_start_loc'], as_index=False)['distance'].agg({'sloc_distance_max': 'max', 'sloc_distance_min': 'min', 'sloc_distance_mean': 'mean'})
193 |     result = pd.merge(result, sloc_distance_stat, on=['geohashed_start_loc'], how='left')
194 |     return result
195 | 
196 | # 获取到某个地点结束的距离统计
197 | def get_eloc_distance_stat(train, result):
198 |     train = train[~train.geohashed_end_loc.isnull()]
199 |     eloc_distance_stat = train.groupby(['geohashed_end_loc'], as_index=False)['distance'].agg({'eloc_distance_max': 'max', 'eloc_distance_min': 'min', 'eloc_distance_mean': 'mean'})
200 |     result = pd.merge(result, eloc_distance_stat, on=['geohashed_end_loc'], how='left')
201 |     return result
202 | 
203 | # 获取从某个地点出发的小时段距离统计
204 | def get_sloc_hour_distance_stat(train, result):
205 |     train = train[~train.geohashed_end_loc.isnull()]
206 |     sloc_hour_distance_stat = train.groupby(['geohashed_start_loc', 'hour'], as_index=False)['distance'].agg({'sloc_hour_distance_max': 'max', 'sloc_hour_distance_min': 'min', 'sloc_hour_distance_mean': 'mean'})
207 |     result = pd.merge(result, sloc_hour_distance_stat, on=['geohashed_start_loc', 'hour'], how='left')
208 |     return result
209 | 
210 | # 获取到某个地点结束的小时段距离统计
211 | def get_eloc_hour_distance_stat(train, result):
212 |     train = train[~train.geohashed_end_loc.isnull()]
213 |     eloc_hour_distance_stat = train.groupby(['geohashed_end_loc', 'hour'], as_index=False)['distance'].agg({'eloc_hour_distance_max': 'max', 'eloc_hour_distance_min': 'min', 'eloc_hour_distance_mean': 'mean'})
214 |     result = pd.merge(result, eloc_hour_distance_stat, on=['geohashed_end_loc', 'hour'], how='left')
215 |     return result
216 | 
217 | # 获取从某个地点出发的小时均值
218 | def get_sloc_hour_mean(train, result):
219 |     sloc_hour_mean = train.groupby(['geohashed_start_loc'], as_index=False)['hour'].agg({'sloc_hour_mean': 'mean'})
220 |     result = pd.merge(result, sloc_hour_mean, on=['geohashed_start_loc'], how='left')
221 |     return result
222 | 
223 | # 获取到某个地点结束的小时均值
224 | def get_eloc_hour_mean(train, result):
225 |     train = train[~train.geohashed_end_loc.isnull()]
226 |     eloc_hour_mean = train.groupby(['geohashed_end_loc'], as_index=False)['hour'].agg({'eloc_hour_mean': 'mean'})
227 |     result = pd.merge(result, eloc_hour_mean, on=['geohashed_end_loc'], how='left')
228 |     return result
229 | 
230 | # 获取从某个点出发到某个地点结束的小时均值
231 | def get_sloc_eloc_hour_mean(train, result):
232 |     train = train[~train.geohashed_end_loc.isnull()]
233 |     sloc_eloc_hour_mean = train.groupby(['geohashed_start_loc', 'geohashed_end_loc'], as_index=False)['hour'].agg({'sloc_eloc_hour_mean': 'mean'})
234 |     result = pd.merge(result, sloc_eloc_hour_mean, on=['geohashed_start_loc', 'geohashed_end_loc'], how='left')
235 |     return result
236 | 
237 | # ----------------- 排序 -------------------
238 | 
239 | # 获取到某个目的地的距离排序
240 | def get_eloc_distance_rank(result):
241 |     result = rank(result, 'geohashed_end_loc', 'distance', rank_name='eloc_distance_rank', ascending=False)
242 |     return result
243 | 
244 | # 获取从某个地点出发的距离排序
245 | def get_sloc_distance_rank(result):
246 |     result = rank(result, 'geohashed_start_loc', 'distance', rank_name='sloc_distance_rank', ascending=False)
247 |     return result
248 | 
249 | # 获取到某个目的地的小时段距离排序
250 | def get_eloc_hour_distance_rank(result):
251 |     result = rank(result, ['geohashed_end_loc', 'hour'], 'distance', rank_name='eloc_hour_distance_rank', ascending=False)
252 |     return result
253 | 
254 | # 获取从某个目的地出发的小时段距离排序
255 | def get_sloc_hour_distance_rank(result):
256 |     result = rank(result, ['geohashed_start_loc', 'hour'], 'distance', rank_name='sloc_hour_distance_rank', ascending=False)
257 |     return result
258 | 
259 | # 获取从某个地点出发的小时段排序
260 | def get_sloc_hour_rank(result):
261 |     result = rank(result, 'geohashed_start_loc', 'hour', rank_name='sloc_hour_rank', ascending=False)
262 |     return result
263 | 
264 | # 获取到某个目的地结束的小时段排序
265 | def get_eloc_hour_rank(result):
266 |     result = rank(result, 'geohashed_end_loc', 'hour', rank_name='eloc_hour_rank', ascending=False)
267 |     return result
268 | 
269 | # 获取从某个地点出发到某个地点结束的小时段排序
270 | def get_sloc_eloc_hour_rank(result):
271 |     result = rank(result, ['geohashed_start_loc', 'geohashed_end_loc'], 'hour', rank_name='sloc_eloc_hour_rank', ascending=False)
272 |     return result
273 | 
274 | # ----------------- 差值 -------------------
275 | 
276 | # 获取距离与从某个点出发距离统计值的绝对差值
277 | def get_sloc_distance_stat_sub(result):
278 |     result['sloc_distance_mean_sub'] = (result['distance'] - result['sloc_distance_mean'])
279 |     result['sloc_distance_mean_sub_abs'] = (result['distance'] - result['sloc_distance_mean']).abs()
280 |     return result
281 | 
282 | # 获取距离与到某个点结束距离统计值的绝对差值
283 | def get_eloc_distance_stat_sub(result):
284 |     result['eloc_distance_mean_sub'] = (result['distance'] - result['eloc_distance_mean'])
285 |     result['eloc_distance_mean_sub_abs'] = (result['distance'] - result['eloc_distance_mean']).abs()
286 |     return result
287 | 
288 | # 获取距离与从某个点出发距离统计值的各小时段绝对差值
289 | def get_sloc_hour_distance_stat_sub(result):
290 |     result['sloc_hour_distance_mean_sub'] = (result['distance'] - result['sloc_hour_distance_mean'])
291 |     result['sloc_hour_distance_mean_sub_abs'] = (result['distance'] - result['sloc_hour_distance_mean']).abs()
292 |     return result
293 | 
294 | # 获取距离与到某个点结束距离统计值的各小时段绝对差值
295 | def get_eloc_hour_distance_stat_sub(result):
296 |     result['eloc_hour_distance_mean_sub'] = (result['distance'] - result['eloc_hour_distance_mean'])
297 |     result['eloc_hour_distance_mean_sub_abs'] = (result['distance'] - result['eloc_hour_distance_mean']).abs()
298 |     return result
299 | 
300 | # 获取小时段与从某个点出发的小时均值的绝对差值
301 | def get_hour_sloc_hour_mean_sub(result):
302 |     result['hour_sloc_hour_mean_sub'] = (result['hour'] - result['sloc_hour_mean'])
303 |     result['hour_sloc_hour_mean_sub_abs'] = (result['hour'] - result['sloc_hour_mean']).abs()
304 |     return result
305 | 
306 | # 获取小时段与到某个点结束的小时均值的绝对差值
307 | def get_hour_eloc_hour_mean_sub(result):
308 |     result['hour_eloc_hour_mean_sub'] = (result['hour'] - result['eloc_hour_mean'])
309 |     result['hour_eloc_hour_mean_sub_abs'] = (result['hour'] - result['eloc_hour_mean']).abs()
310 |     return result
311 | 
312 | # 获取小时段与从某个点出发到某个点结束的小时均值的绝对差值
313 | def get_hour_sloc_eloc_hour_mean_sub(result):
314 |     result['hour_sloc_eloc_hour_mean_sub'] = (result['hour'] - result['sloc_eloc_hour_mean'])
315 |     result['hour_sloc_eloc_hour_mean_sub_abs'] = (result['hour'] - result['sloc_eloc_hour_mean']).abs()
316 |     return result
317 | 
318 | # ----------------- 比例 ------------------- 
319 | 
320 | # 获取从某个地点出发到某个地点结束的个数与从这个点出发的个数的比例
321 | def get_sloc_eloc_count_ratio(result):
322 |     result['sloc_eloc_count_ratio'] = result['sloc_eloc_count'] / result['sloc_count']
323 |     return result;
324 | 
325 | # 获取从某个地点出发的小时段个数与从这个地方出发的个数的比例
326 | def get_sloc_hour_count_ratio(result):
327 |     result['sloc_hour_count_ratio'] = result['sloc_hour_count'] / result['sloc_count']
328 |     return result
329 | 
330 | # 获取到某个目的地的小时段个数与到某个目的地的个数的比例
331 | def get_eloc_hour_count_ratio(result):
332 |     result['eloc_hour_count_ratio'] = result['eloc_hour_count'] / result['eloc_count']
333 |     return result


--------------------------------------------------------------------------------
/feature/main.py:
--------------------------------------------------------------------------------
  1 | # -*- coding:utf-8 -*-
  2 | import gc
  3 | 
  4 | from .user import *
  5 | from .location import *
  6 | from .latlon import *
  7 | from .other import *
  8 | from .leak import *
  9 | from .rule import *
 10 | from .filter import *
 11 | 
 12 | # 获取全部特征
 13 | def get_feat(train, sample):
 14 |     result = sample
 15 |     # 获取协同过滤特征：
 16 |     result = get_loc_filter(train, result)
 17 |     result = get_user_loc_filter(train, result)
 18 |     print('协同过滤特征构造完成！')
 19 |     gc.collect()
 20 |     result.fillna(-1000000, inplace=True)
 21 |     gc.collect()
 22 |     print('所有特征构造完成：\ncolumns:\n{}'.format(result.columns))
 23 |     return result
 24 |     
 25 |     # 获取距离特征：
 26 |     result = get_distance(result) # distance # dui 90 wc
 27 |     # result = result[(result['distance'] < 7200) & (result['distance'] > 100)]
 28 |     # print('过滤大小距离之后：', result.shape)
 29 |     # result = result[result['distance'] < 10000]
 30 |     # print('过滤大距离之后：', result.shape)
 31 |     print('距离特征构造完成！')
 32 |     
 33 |     # 获取小时特征：
 34 |     result = get_hour(result) # hour # dui 90 wc
 35 |     result = get_hour_count(train, result) # hour_fre # dui 90 wc
 36 |     print('小时特征构造完成！')
 37 | 
 38 |     # 获取用户特征
 39 |     result = get_user_count(train, result) # user_count # dui 90 wc 
 40 |     result = get_user_eloc_count(train, result) # user_eloc_count # dui 90 wc
 41 |     result = get_user_sloc_count(train, result) 
 42 |     result = get_user_sloc_eloc_count(train, result) # user_sloc_eloc_count
 43 |     result = get_user_eloc_sloc_count(train, result) # user_eloc_sloc_count
 44 |     result = get_user_eloc_sloc_rate(train, result) # dui 90 wc
 45 |     result = get_user_eloc_as_sloc_count(train, result) # user_sloc_count # dui 90 wc 
 46 |     result = get_user_sloc_as_eloc_count(train, result) # 1
 47 |     result = get_user_eloc_in_sloc_count(result) # eloc_in_sloc_count
 48 |     result = get_user_loccount(train, result)
 49 |     result = get_user_sloccount(train, result)
 50 |     result = get_user_eloccount(train, result) # dui 90 wc
 51 |     result = get_user_sloc_eloccount(train, result)
 52 |     result = get_user_eloc_sloccount(train, result)
 53 |     gc.collect()
 54 |     print('1 done!')
 55 |     result = get_user_hour_count(train, result) # user_hour_fre # dui 90 wc
 56 |     result = get_user_eloc_hour_count(train, result) # user_eloc_hour_fre # dui 90 wc
 57 |     result = get_user_sloc_hour_count(train, result) # user_sloc_hour_fre
 58 |     result = get_user_sloc_eloc_hour_count(train, result) # user_sloc_eloc_hour_fre
 59 |     result = get_user_eloc_sloc_hour_count(train, result)
 60 |     result = get_user_hour_loccount(train, result)
 61 |     result = get_user_hour_sloccount(train, result)
 62 |     result = get_user_hour_eloccount(train, result)
 63 |     result = get_user_sloc_hour_eloccount(train, result)
 64 |     result = get_user_eloc_hour_sloccount(train, result)
 65 |     gc.collect()
 66 |     print('2 done!')
 67 |     result = get_user_distance_stat(train, result) # user_dis_min, user_dis_max, user_dis_med # dui 90 wc
 68 |     result = get_user_distance_quantile(train, result) # wc
 69 |     result = get_user_eloc_distance_stat(train, result) # user_eloc_dis_max, user_eloc_dis_min, user_eloc_dis_med # dui 90 wc
 70 |     result = get_user_sloc_distance_stat(train, result) # user_sloc_dis_min, user_sloc_dis_max, user_sloc_dis_med # dui 90 wc
 71 |     result = get_user_hour_distance_stat(train, result) # dui 90 wc
 72 |     result = get_user_sloc_hour_distance_stat(train, result) # dui 90 wc
 73 |     result = get_user_eloc_hour_distance_stat(train, result)
 74 |     result = get_user_hour_stat(train, result) # 1 # dui 90 wc
 75 |     result = get_user_sloc_hour_stat(train, result) # 1
 76 |     result = get_user_eloc_hour_stat(train, result) # 1 # dui 90 wc
 77 |     result = get_user_sloc_eloc_hour_stat(train, result)
 78 |     result = get_user_most_freq_eloc(train, result) # wc
 79 |     result = get_user_eloc_lasttime(train, result) # wc
 80 |     gc.collect()
 81 |     print('3 done!')
 82 |     result = get_user_eloc_distance_rank(result) # user_sloc_dis_rank
 83 |     result = get_user_sloc_distance_rank(result) # user_eloc_dis_rank
 84 |     result = get_user_eloc_hour_distance_rank(result) # dui 90 wc
 85 |     result = get_user_sloc_hour_distance_rank(result)
 86 |     result = get_user_hour_rank(result) # 1 # dui 90 wc
 87 |     result = get_user_sloc_hour_rank(result) # 1
 88 |     result = get_user_eloc_hour_rank(result)
 89 |     result = get_user_sloc_eloc_hour_rank(result)
 90 |     gc.collect()
 91 |     print('4 done!')
 92 |     result = get_user_distance_stat_sub(result) # 1 # dui 90 wc
 93 |     result = get_user_sloc_distance_stat_sub(result) # 1 # dui 90 wc
 94 |     result = get_user_eloc_distance_stat_sub(result) # 1
 95 |     result = get_user_hour_distance_stat_sub(result) # dui 90 wc
 96 |     result = get_user_sloc_hour_distance_stat_sub(result) # dui 90 wc
 97 |     result = get_user_eloc_hour_distance_stat_sub(result)
 98 |     result = get_hour_user_hour_stat_sub(result) # 1 # dui 90 wc
 99 |     result = get_hour_user_sloc_hour_stat_sub(result) # 1
100 |     result = get_hour_user_eloc_hour_stat_sub(result) # 1 # dui 90 wc
101 |     result = get_hour_user_sloc_eloc_hour_stat_sub(result)
102 |     gc.collect()
103 |     print('5 done!')
104 |     result = get_global_user_sloc_count_ratio(result) # wc
105 |     result = get_user_eloc_count_ratio(result) # wc
106 |     gc.collect()
107 |     print('用户特征构造完成！')
108 |     
109 |     # # 获取地理位置特征
110 |     result = get_eloc_count(train, result) # eloc_count # dui 90 wc
111 |     result = get_sloc_count(train, result) # eloc_as_sloc_count # dui 90 wc
112 |     result = get_sloc_as_eloc_count(train, result)
113 |     result = get_eloc_as_sloc_count(train, result) # dui 90 wc
114 |     result = get_sloc_eloc_count(train, result) # dui 90 wc
115 |     result = get_eloc_sloc_count(train, result)
116 |     result = get_eloc_user_count(train, result) # eloc_usercount # dui 90 wc
117 |     result = get_sloc_user_count(train, result) # eloc_as_sloc_usercount # dui 90 wc
118 |     result = get_sloc_as_eloc_user_count(train, result)
119 |     result = get_eloc_as_sloc_user_count(train, result) # dui 90 wc
120 |     result = get_sloc_eloc_user_count(train, result) # user_sloc_eloc_usercount # dui 90 wc
121 |     result = get_eloc_sloc_user_count(train, result) # user_eloc_sloc_usercount
122 |     result = get_sloc_eloccount(train, result) # sloc_eloccount # dui 90 wc
123 |     result = get_eloc_sloccount(train, result) # eloc_sloccount
124 |     gc.collect()
125 |     print('1 done!')
126 |     result = get_eloc_hour_count(train, result) # eloc_hour_fre # dui 90 wc
127 |     result = get_sloc_hour_count(train, result) # sloc_hour_fre # 90 wc 最后删掉
128 |     result = get_sloc_eloc_hour_count(train, result) # sloc_eloc_hour_fre
129 |     result = get_eloc_sloc_hour_count(train, result)
130 |     result = get_eloc_hour_user_count(train, result)
131 |     result = get_sloc_hour_user_count(train, result)
132 |     result = get_sloc_eloc_hour_user_count(train, result)
133 |     result = get_eloc_sloc_hour_user_count(train, result)
134 |     result = get_sloc_hour_eloccount(train, result)
135 |     result = get_eloc_hour_sloccount(train, result)
136 |     gc.collect()
137 |     print('2 done!')
138 |     result = get_sloc_distance_stat(train, result) # sloc_dis_med, sloc_dis_min, sloc_dis_max # dui 90 wc
139 |     result = get_eloc_distance_stat(train, result) # eloc_dis_max, eloc_dis_min, eloc_dis_med # dui 90 wc
140 |     result = get_sloc_hour_distance_stat(train, result)
141 |     result = get_eloc_hour_distance_stat(train, result)
142 |     result = get_sloc_hour_mean(train, result) # 1 # dui 90 wc
143 |     result = get_eloc_hour_mean(train, result) # 1 # dui 90 wc
144 |     result = get_sloc_eloc_hour_mean(train, result) # 1 # dui 90 wc
145 |     gc.collect()
146 |     print('3 done!')
147 |     result = get_eloc_distance_rank(result) # sloc_dis_rank
148 |     result = get_sloc_distance_rank(result) # eloc_dis_rank
149 |     result = get_eloc_hour_distance_rank(result)
150 |     result = get_sloc_hour_distance_rank(result)
151 |     result = get_sloc_hour_rank(result) # 1
152 |     result = get_eloc_hour_rank(result) # 1
153 |     result = get_sloc_eloc_hour_rank(result) # 1
154 |     gc.collect()
155 |     print('4 done!')
156 |     result = get_sloc_distance_stat_sub(result) # 1 # dui 90 wc
157 |     result = get_eloc_distance_stat_sub(result) # 1 # dui 90 wc
158 |     result = get_sloc_hour_distance_stat_sub(result)
159 |     result = get_eloc_hour_distance_stat_sub(result)
160 |     result = get_hour_sloc_hour_mean_sub(result) # 1 # dui 90 wc
161 |     result = get_hour_eloc_hour_mean_sub(result) # 1 # dui 90 wc
162 |     result = get_hour_sloc_eloc_hour_mean_sub(result) # 1 # dui 90 wc
163 |     gc.collect()
164 |     print('5 done!')
165 |     result = get_sloc_eloc_count_ratio(result) # wc
166 |     result = get_sloc_hour_count_ratio(result) # wc
167 |     result = get_eloc_hour_count_ratio(result) # wc
168 |     gc.collect()
169 |     print('地理位置特征构造完成！')
170 |     
171 |     # 获取协同过滤特征：
172 |     result = get_loc_filter(train, result)
173 |     result = get_user_loc_filter(train, result)
174 |     print('协同过滤特征构造完成！')
175 | 
176 |     # # 获取经纬度特征
177 |     result = get_eloc_latlon(result) # dui 90 wc
178 |     result = get_sloc_latlon(result) # dui 90 wc
179 |     gc.collect()
180 |     print('1 done!')
181 |     result = get_eloc_sloc_latlon_sub(result) # sloc_eloc_lon_sub, sloc_eloc_lat_sub # dui 90 wc
182 |     result = get_eloc_sloc_slope(result) # 1 # dui 90 wc
183 |     result = get_eloc_sloc_latlon_sub_divide_distance(result) # 1 # dui 90 wc
184 |     result = get_bearing_array(result) # wc
185 |     gc.collect()
186 |     print('2 done!')
187 |     result = get_user_latlon_sub_stat(train, result) # dui 90 wc
188 |     result = get_user_sloc_latlon_sub_stat(train, result)
189 |     result = get_user_eloc_latlon_sub_stat(train, result) # dui 90 wc
190 |     result = get_user_sloc_hour_latlon_sub_stat(train, result)
191 |     result = get_user_eloc_hour_latlon_sub_stat(train, result)
192 |     gc.collect()
193 |     print('3 done!')
194 |     result = get_sloc_latlon_sub_stat(train, result) # dui 90 wc
195 |     result = get_eloc_latlon_sub_stat(train, result) # dui 90 wc
196 |     result = get_sloc_hour_latlon_sub_stat(train, result)
197 |     result = get_eloc_hour_latlon_sub_stat(train, result) # dui 90 wc
198 |     gc.collect()
199 |     print('4 done!')
200 |     result = get_user_latlon_sub_rank(result) # dui 90 wc
201 |     result = get_user_eloc_latlon_sub_rank(result)
202 |     result = get_user_sloc_latlon_sub_rank(result) # dui 90 wc
203 |     result = get_user_eloc_hour_latlon_sub_rank(result)
204 |     result = get_user_sloc_hour_latlon_sub_rank(result) # dui 90 wc
205 |     gc.collect()
206 |     print('5 done!')
207 |     result = get_eloc_latlon_sub_rank(result) # dui 90 wc
208 |     result = get_sloc_latlon_sub_rank(result)
209 |     result = get_eloc_hour_latlon_sub_rank(result)
210 |     result = get_sloc_hour_latlon_sub_rank(result)
211 |     gc.collect()
212 |     print('6 done!')
213 |     result = get_user_latlon_sub_stat_sub(result)
214 |     result = get_user_sloc_latlon_sub_stat_sub(result)
215 |     result = get_user_eloc_latlon_sub_stat_sub(result)
216 |     result = get_user_sloc_hour_latlon_sub_stat_sub(result)
217 |     result = get_user_eloc_hour_latlon_sub_stat_sub(result)
218 |     gc.collect()
219 |     print('7 done!')
220 |     result = get_sloc_latlon_sub_stat_sub(result)
221 |     result = get_eloc_latlon_sub_stat_sub(result)
222 |     result = get_sloc_hour_latlon_sub_stat_sub(result)
223 |     result = get_eloc_hour_latlon_sub_stat_sub(result)
224 |     gc.collect()
225 |     print('经纬度特征构造完成！')
226 |     
227 |     # 获取Leak特征：
228 |     result = get_eloc_user_sloc_leak(result) # 1 # dui 90 wc
229 |     result = get_eloc_bike_sloc_leak(result) # 1 # dui 90 wc
230 |     print('Leak特征构造完成！')
231 |     
232 |     # 获取规则特征
233 |     result = get_user_rule(result) # dui 90 wc
234 |     result = get_user_didi(train, result) # dui 90 wc
235 |     gc.collect()
236 |     print('1 done!')
237 |     result = get_loc_rule(result) # dui 90 wc
238 |     result = get_loc_didi(train, result) # dui 90 wc
239 |     gc.collect()
240 |     print('规则特征构造完成！')
241 |     
242 |     # 删除无用特征
243 |     # result.drop(['sloc_lon_sub_max', 'user_hour_distance_mean', 'user_sloc_distance_mean', 'user_eloc_hour_max', 'sloc_lat_sub_max', 'eloc_bike_last_sloc_distance', 'hour', 'eloc_bike_last_sloc_speed', 'user_eloc_hour_min', 'eloc_hour_count', 'user_lon_sub_max', 'eloc_hour_lon_sub_max', 'user_hour_max', 'user_lat_sub_max', 'eloc_hour_lat_sub_min', 'hour_count', 'eloc_hour_lat_sub_max', 'eloc_hour_lon_sub_min', 'user_eloc_distance_min', 'user_hour_count_rate', 'user_eloc_distance_mean', 'user_hour_min', 'user_hour_eloc_rate', 'eloc_lon_sub_min', 'user_eloc_lon_sub_mean', 'user_eloc_lat_sub_mean', 'user_eloc_hour_mean', 'user_eloc_lon_sub_min', 'user_eloc_lat_sub_min', 'user_eloc_lon_sub_max', 'sloc_lat_sub_min', 'user_eloc_hour_count', 'eloc_lat_sub_min', 'user_eloc_count_rate', 'eloc_count_rate', 'sloc_lon_sub_min', 'user_eloc_lat_sub_max', 'hour_count_rate', 'user_end_loc_sample', 'sloc_lat', 'user_lat_sub_mean', 'sloc_lon', 'user_hour_distance_mean_sub', 'sloc_distance_max', 'user_lon_sub_mean', 'user_distance_mean', 'sloc_distance_mean_sub_abs', 'user_sloc_distance_min', 'sloc_lat_sub_mean', 'eloc_lon_sub_max', 'user_sloc_distance_max', 'eloc_distance_max', 'hour_user_hour_mean_sub', 'eloc_as_sloc_count', 'sloc_distance_min', 'eloc_lat_sub_max', 'bike_next_sloc_sample', 'user_rule', 'eloc_distance_mean_sub_abs', 'user_lat_sub_min', 'user_lon_sub_min', 'loc_to_loc_sample', 'user_sloc_hour_distance_min', 'user_sloc_hour_distance_max', 'user_sloc_hour_distance_mean', 'user_sloc_hour_distance_mean_sub', 'sloc_hour_count'], axis=1, inplace=True)
244 |     
245 |     # 删除无用特征
246 |     # result.drop(['user_eloc_hour_lon_sub_mean_sub_abs', 'user_sloc_lon_sub_mean_sub'], axis=1, inplace=True)
247 |     
248 |     result.fillna(-1000000, inplace=True)
249 |     print('所有特征构造完成：\ncolumns:\n{}'.format(result.columns))
250 |     return result


--------------------------------------------------------------------------------
/feature/other.py:
--------------------------------------------------------------------------------
 1 | # -*- coding:utf-8 -*-
 2 | import Geohash as geohash
 3 | import pandas as pd
 4 | import numpy as np
 5 | import os
 6 | os.path.join('..')
 7 | from utils import cal_distance, rank, manhattan
 8 | 
 9 | '''
10 | 	获取小时特征
11 | '''
12 | 
13 | # 获取小时段
14 | def get_hour(result):
15 |     result['hour'] = pd.to_datetime(result['starttime']).dt.hour
16 |     return result
17 | 
18 | # 获取每个小时段的出行订单数
19 | def get_hour_count(train, result):
20 |     hour_count = train.groupby(['hour'], as_index=False)['userid'].agg({'hour_count': 'count'})
21 |     result = pd.merge(result, hour_count, on='hour', how='left')
22 |     return result
23 | 
24 | '''
25 | 	获取距离特征
26 | '''
27 | 
28 | # 获取出发地到目的地的欧氏距离和曼哈顿距离
29 | def get_distance(result):
30 |     locs = list(set(result['geohashed_start_loc']) | set(result['geohashed_end_loc']))
31 |     if np.nan in locs: 
32 |         locs.remove(np.nan)
33 |     deloc = []
34 |     for loc in locs:
35 |         deloc.append(geohash.decode_exactly(loc))
36 |     loc_dict = dict(zip(locs, deloc))
37 |     geohashed_loc = result[['geohashed_start_loc', 'geohashed_end_loc']].values
38 |     distance = []
39 |     manhattan_distance = []
40 |     for i in geohashed_loc:
41 |         if i[0] is not np.nan and i[1] is not np.nan:
42 |             lat1, lon1, _, _ = loc_dict[i[0]]
43 |             lat2, lon2, _, _ = loc_dict[i[1]]
44 |             distance.append(cal_distance(float(lat1), float(lon1), float(lat2), float(lon2)))
45 |             manhattan_distance.append(manhattan(float(lat1), float(lon1), float(lat2), float(lon2)))
46 |         else:
47 |             distance.append(np.nan)
48 |             manhattan_distance.append(np.nan)
49 |     result.loc[:, 'distance'] = distance
50 |     result.loc[:, 'manhattan'] = manhattan_distance
51 |     return result
52 | 
53 | '''
54 | 	获取经纬度特征
55 | '''
56 | def get_latlon(result, end=True):
57 |     if end:
58 |         eloc_latlon = result['geohashed_end_loc'].apply(lambda x: geohash.decode_exactly(x))
59 |         result['eloc_lat'] = eloc_latlon.apply(lambda x: float(x[0]))
60 |         result['eloc_lon'] = eloc_latlon.apply(lambda x: float(x[1]))
61 |     sloc_latlon = result['geohashed_start_loc'].apply(lambda x: geohash.decode_exactly(x))
62 |     result['sloc_lat'] = sloc_latlon.apply(lambda x: float(x[0]))
63 |     result['sloc_lon'] = sloc_latlon.apply(lambda x: float(x[1]))
64 |     if end:
65 |         result['eloc_sloc_lat_sub'] = result['eloc_lat'] - result['sloc_lat']
66 |         result['eloc_sloc_lon_sub'] = result['eloc_lon'] - result['sloc_lon']
67 |     return result


--------------------------------------------------------------------------------
/feature/rule.py:
--------------------------------------------------------------------------------
 1 | # -*- coding:utf-8 -*-
 2 | import pandas as pd
 3 | import numpy as np
 4 | 
 5 | '''
 6 |     获取自定义规则
 7 | '''
 8 | 
 9 | def get_user_rule(result):
10 |     result['user_rule'] = (1 + result['user_eloc_count']) * result['eloc_user_count'] / (0.01 * result['distance'])
11 |     return result
12 | 
13 | def get_user_didi(train, result):
14 |     result['user_hour_count_rate'] = result['user_hour_count'] / train.shape[0]
15 |     train = train[~train.geohashed_end_loc.isnull()]
16 |     result['user_eloc_count_rate'] = result['user_eloc_count'] / train.shape[0]
17 |     result['user_hour_eloc_rate'] = result['user_eloc_hour_count'] / result['user_eloc_count']
18 |     result['user_hour_eloc_distribute'] = result['user_eloc_count_rate'] * result['user_hour_eloc_rate'] / result['user_hour_count_rate']
19 |     return result
20 | 
21 | def get_loc_rule(result):
22 |     result['loc_rule'] = result['eloc_count'] / (0.01 * result['distance'])
23 |     result['loc_rule2'] = np.sqrt(result['distance'] / (result['eloc_count'] ** 1.1))
24 |     return result
25 | 
26 | def get_loc_didi(train, result):
27 |     result['hour_count_rate'] = result['hour_count'] / train.shape[0]
28 |     train = train[~train.geohashed_end_loc.isnull()]
29 |     result['eloc_count_rate'] = result['eloc_count'] / train.shape[0]
30 |     result['hour_eloc_rate'] = result['eloc_hour_count'] / result['eloc_count']
31 |     result['hour_eloc_distribute'] = result['eloc_count_rate'] * result['hour_eloc_rate'] / result['hour_count_rate']
32 |     return result


--------------------------------------------------------------------------------
/feature/user.py:
--------------------------------------------------------------------------------
  1 | # -*- coding:utf-8 -*-
  2 | import pandas as pd
  3 | import os
  4 | import numpy as np
  5 | from .other import get_distance
  6 | from .latlon import get_sloc_latlon, get_eloc_latlon, get_eloc_sloc_latlon_sub, get_eloc_sloc_slope, get_eloc_sloc_latlon_sub_divide_distance, get_bearing_array
  7 | os.path.join('..')
  8 | from utils import rank
  9 | 
 10 | '''
 11 | 	获取用户特征
 12 | '''
 13 | 
 14 | # ----------------- 计数 -------------------
 15 | 
 16 | # 获取用户历史出行次数
 17 | def get_user_count(train, result):
 18 |     user_count = train.groupby('userid', as_index=False)['orderid'].agg({'user_count': 'count'})
 19 |     result = pd.merge(result, user_count, on=['userid'], how='left')
 20 |     return result
 21 | 
 22 | # 获取用户去过某个地点的历史出行次数
 23 | def get_user_eloc_count(train, result):
 24 |     train = train[~train.geohashed_end_loc.isnull()]
 25 |     user_eloc_count = train.groupby(['userid', 'geohashed_end_loc'], as_index=False)['userid'].agg({'user_eloc_count': 'count'})
 26 |     result = pd.merge(result, user_eloc_count, on=['userid', 'geohashed_end_loc'], how='left')
 27 |     return result
 28 | 
 29 | # 获取用户从某个地方出发的历史出行次数
 30 | def get_user_sloc_count(train, result):
 31 |     user_sloc_count = train.groupby(['userid', 'geohashed_start_loc'], as_index=False)['userid'].agg({'user_sloc_count': 'count'})
 32 |     result = pd.merge(result, user_sloc_count, on=['userid', 'geohashed_start_loc'], how='left')
 33 |     return result
 34 | 
 35 | # 获取用户从某个地方出发到某个地方结束的历史出行次数
 36 | def get_user_sloc_eloc_count(train, result):
 37 |     train = train[~train.geohashed_end_loc.isnull()]
 38 |     user_sloc_eloc_count = train.groupby(['userid', 'geohashed_start_loc', 'geohashed_end_loc'], as_index=False)['userid'].agg({'user_sloc_eloc_count': 'count'})
 39 |     result = pd.merge(result, user_sloc_eloc_count, on=['userid', 'geohashed_start_loc', 'geohashed_end_loc'], how='left')
 40 |     return result
 41 | 
 42 | # 获取用户从某个目的地出发到某个出发地结束的历史返程次数
 43 | def get_user_eloc_sloc_count(train, result):
 44 |     train = train[~train.geohashed_end_loc.isnull()]
 45 |     user_eloc_sloc_count = train.groupby(['userid', 'geohashed_start_loc', 'geohashed_end_loc'], as_index=False)['userid'].agg({'user_eloc_sloc_count': 'count'})
 46 |     user_eloc_sloc_count.rename(columns={'geohashed_start_loc':'geohashed_end_loc', 'geohashed_end_loc':'geohashed_start_loc'}, inplace=True)
 47 |     result = pd.merge(result, user_eloc_sloc_count, on=['userid', 'geohashed_start_loc', 'geohashed_end_loc'], how='left')
 48 |     return result
 49 | 
 50 | # 获取用户的返程比例
 51 | def get_user_eloc_sloc_rate(train, result):
 52 |     train = train[~train.geohashed_end_loc.isnull()]
 53 |     user_eloc_sloc_count = train.groupby(['userid', 'geohashed_start_loc', 'geohashed_end_loc'], as_index=False)['userid'].agg({'user_eloc_sloc_count': 'count'})
 54 |     user_eloc_sloc_count.rename(columns={'geohashed_start_loc':'geohashed_end_loc', 'geohashed_end_loc':'geohashed_start_loc'}, inplace=True)
 55 |     restmp = pd.merge(train, user_eloc_sloc_count, on=['userid', 'geohashed_start_loc', 'geohashed_end_loc'], how='left')
 56 |     restmp = restmp.groupby('userid', as_index=False)['user_eloc_sloc_count'].agg({'user_eloc_sloc_rate': lambda x: np.sum(x>0)/np.size(x)})
 57 |     result = pd.merge(result, restmp, on='userid', how='left')
 58 |     return result
 59 | 
 60 | # 获取用户目的地点作为出发地的次数
 61 | def get_user_eloc_as_sloc_count(train, result):
 62 |     user_eloc_as_sloc_count = train.groupby(['userid', 'geohashed_start_loc'], as_index=False)['userid'].agg({'user_eloc_as_sloc_count': 'count'})
 63 |     user_eloc_as_sloc_count.rename(columns={'geohashed_start_loc': 'geohashed_end_loc'}, inplace=True)
 64 |     result = pd.merge(result, user_eloc_as_sloc_count, on=['userid', 'geohashed_end_loc'], how='left')
 65 |     return result
 66 | 
 67 | # 获取用户出发地点作为目的地的次数
 68 | def get_user_sloc_as_eloc_count(train, result):
 69 |     train = train[~train.geohashed_end_loc.isnull()]
 70 |     user_sloc_as_eloc_count = train.groupby(['userid', 'geohashed_end_loc'], as_index=False)['userid'].agg({'user_sloc_as_eloc_count': 'count'})
 71 |     user_sloc_as_eloc_count.rename(columns={'geohashed_end_loc': 'geohashed_start_loc'}, inplace=True)
 72 |     result = pd.merge(result, user_sloc_as_eloc_count, on=['userid', 'geohashed_start_loc'], how='left')
 73 |     return result
 74 | 
 75 | # 获取用户目的地出现在出发地中的个数
 76 | def get_user_eloc_in_sloc_count(result):
 77 |     user_eloc_in_sloc_count = result.groupby(['userid', 'geohashed_start_loc'], as_index=False)['orderid'].agg({'user_eloc_in_sloc_count': lambda x: np.unique(x).size})
 78 |     user_eloc_in_sloc_count.rename(columns={'geohashed_start_loc': 'geohashed_end_loc'}, inplace=True)
 79 |     result = pd.merge(result, user_eloc_in_sloc_count, on=['userid', 'geohashed_end_loc'], how='left')
 80 |     return result
 81 | 
 82 | # 获取用户涉及到的地点个数
 83 | def get_user_loccount(train, result):
 84 |     user_sloc = train[['userid', 'geohashed_start_loc']]
 85 |     train = train[~train.geohashed_end_loc.isnull()]
 86 |     user_eloc = train[['userid', 'geohashed_end_loc']].rename(columns={'geohashed_end_loc': 'geohashed_start_loc'}, inplace=True)
 87 |     user_loc = pd.concat([user_sloc, user_eloc])
 88 |     user_loccount = user_loc.groupby('userid', as_index=False)['geohashed_start_loc'].agg({'user_loccount': lambda x: np.unique(x).size})
 89 |     result = pd.merge(result, user_loccount, on=['userid'], how='left')
 90 |     return result
 91 | 
 92 | # 获取用户出发的出发地个数
 93 | def get_user_sloccount(train, result):
 94 |     user_sloccount = train.groupby('userid', as_index=False)['geohashed_start_loc'].agg({'user_sloccount': lambda x: np.unique(x).size})
 95 |     result = pd.merge(result, user_sloccount, on=['userid'], how='left')
 96 |     return result
 97 | 
 98 | # 获取用户到达的目的地个数
 99 | def get_user_eloccount(train, result):
100 |     train = train[~train.geohashed_end_loc.isnull()]
101 |     user_eloccount = train.groupby('userid', as_index=False)['geohashed_end_loc'].agg({'user_eloccount': lambda x: np.unique(x).size})
102 |     result = pd.merge(result, user_eloccount, on=['userid'], how='left')
103 |     return result
104 | 
105 | # 获取用户从某个地方出发到的目的地数目
106 | def get_user_sloc_eloccount(train, result):
107 |     train = train[~train.geohashed_end_loc.isnull()]
108 |     user_sloc_eloccount = train.groupby(['userid', 'geohashed_start_loc'], as_index=False)['geohashed_end_loc'].agg({'user_sloc_eloccount': lambda x: np.unique(x).size})
109 |     result = pd.merge(result, user_sloc_eloccount, on=['userid', 'geohashed_start_loc'], how='left')
110 |     return result
111 | 
112 | # 获取用户到某个地方结束的出发地数目
113 | def get_user_eloc_sloccount(train, result):
114 |     train = train[~train.geohashed_end_loc.isnull()]
115 |     user_eloc_sloccount = train.groupby(['userid', 'geohashed_end_loc'], as_index=False)['geohashed_start_loc'].agg({'user_eloc_sloccount': lambda x: np.unique(x).size})
116 |     result = pd.merge(result, user_eloc_sloccount, on=['userid', 'geohashed_end_loc'], how='left')
117 |     return result
118 | 
119 | # 获取用户在每个小时段的出行订单数
120 | def get_user_hour_count(train, result):
121 |     user_hour_count = train.groupby(['userid', 'hour'], as_index=False)['orderid'].agg({'user_hour_count': 'count'})
122 |     result = pd.merge(result, user_hour_count, on=['userid', 'hour'], how='left')
123 |     return result
124 | 
125 | # 获取用户在每个小时段从某个地方出发的订单数
126 | def get_user_sloc_hour_count(train, result):
127 |     user_sloc_hour_count = train.groupby(['userid', 'geohashed_start_loc', 'hour'], as_index=False)['orderid'].agg({'user_sloc_hour_count': 'count'})
128 |     result = pd.merge(result, user_sloc_hour_count, on=['userid', 'geohashed_start_loc', 'hour'], how='left')
129 |     return result
130 | 
131 | # 获取用户在每个小时段到某个地方结束的订单数
132 | def get_user_eloc_hour_count(train, result):
133 |     train = train[~train.geohashed_end_loc.isnull()]
134 |     user_eloc_hour_count = train.groupby(['userid', 'geohashed_end_loc', 'hour'], as_index=False)['orderid'].agg({'user_eloc_hour_count': 'count'})
135 |     result = pd.merge(result, user_eloc_hour_count, on=['userid', 'geohashed_end_loc', 'hour'], how='left')
136 |     return result
137 | 
138 | # 获取用户在每个小时段从某个地方出发到某个地方结束的订单数
139 | def get_user_sloc_eloc_hour_count(train, result):
140 |     train = train[~train.geohashed_end_loc.isnull()]
141 |     user_sloc_eloc_hour_count = train.groupby(['userid', 'geohashed_start_loc', 'geohashed_end_loc', 'hour'], as_index=False)['orderid'].agg({'user_sloc_eloc_hour_count': 'count'})
142 |     result = pd.merge(result, user_sloc_eloc_hour_count, on=['userid', 'geohashed_start_loc', 'geohashed_end_loc', 'hour'], how='left')
143 |     return result
144 | 
145 | # 获取用户在每个小时段从某个地方出发到某个地方结束的返程订单数
146 | def get_user_eloc_sloc_hour_count(train, result):
147 |     train = train[~train.geohashed_end_loc.isnull()]
148 |     user_eloc_sloc_hour_count = train.groupby(['userid', 'geohashed_start_loc', 'geohashed_end_loc', 'hour'], as_index=False)['orderid'].agg({'user_eloc_sloc_hour_count': 'count'})
149 |     user_eloc_sloc_hour_count.rename(columns={'geohashed_start_loc': 'geohashed_end_loc', 'geohashed_end_loc': 'geohashed_start_loc'}, inplace=True)
150 |     result = pd.merge(result, user_eloc_sloc_hour_count, on=['userid', 'geohashed_start_loc', 'geohashed_end_loc', 'hour'], how='left') # 1
151 |     return result
152 | 
153 | # 获取用户每个小时段涉及到的地点数
154 | def get_user_hour_loccount(train, result):
155 |     user_hour_sloc = train[['userid', 'hour', 'geohashed_start_loc']]
156 |     train = train[~train.geohashed_end_loc.isnull()]
157 |     user_hour_eloc = train[['userid', 'hour', 'geohashed_end_loc']].rename(columns={'geohashed_end_loc': 'geohashed_start_loc'}, inplace=True)
158 |     user_hour_loc = pd.concat([user_hour_sloc, user_hour_eloc])
159 |     user_hour_loccount = user_hour_loc.groupby(['userid', 'hour'], as_index=False)['geohashed_start_loc'].agg({'user_hour_loccount': lambda x: np.unique(x).size})
160 |     result = pd.merge(result, user_hour_loccount, on=['userid', 'hour'], how='left')
161 |     return result
162 | 
163 | # 获取用户每个小时段出发的出发地个数
164 | def get_user_hour_sloccount(train, result):
165 |     user_hour_sloccount = train.groupby(['userid', 'hour'], as_index=False)['geohashed_start_loc'].agg({'user_hour_sloccount': lambda x: np.unique(x).size})
166 |     result = pd.merge(result, user_hour_sloccount, on=['userid', 'hour'], how='left') # 4
167 |     return result
168 | 
169 | # 获取用户每个小时段到达的目的地个数
170 | def get_user_hour_eloccount(train, result):
171 |     train = train[~train.geohashed_end_loc.isnull()]
172 |     user_hour_eloccount = train.groupby(['userid', 'hour'], as_index=False)['geohashed_end_loc'].agg({'user_hour_eloccount': lambda x: np.unique(x).size})
173 |     result = pd.merge(result, user_hour_eloccount, on=['userid', 'hour'], how='left')
174 |     return result
175 | 
176 | # 获取用户每个小时段从某个地方出发到的目的地数目
177 | def get_user_sloc_hour_eloccount(train, result):
178 |     train = train[~train.geohashed_end_loc.isnull()]
179 |     user_sloc_hour_eloccount = train.groupby(['userid', 'geohashed_start_loc', 'hour'], as_index=False)['geohashed_end_loc'].agg({'user_sloc_hour_eloccount': lambda x: np.unique(x).size})
180 |     result = pd.merge(result, user_sloc_hour_eloccount, on=['userid', 'geohashed_start_loc', 'hour'], how='left')
181 |     return result
182 | 
183 | # 获取用户每个小时段到某个地方结束的出发地数目
184 | def get_user_eloc_hour_sloccount(train, result):
185 |     train = train[~train.geohashed_end_loc.isnull()]
186 |     user_eloc_hour_sloccount = train.groupby(['userid', 'geohashed_end_loc', 'hour'], as_index=False)['geohashed_start_loc'].agg({'user_eloc_hour_sloccount': lambda x: np.unique(x).size})
187 |     result = pd.merge(result, user_eloc_hour_sloccount, on=['userid', 'geohashed_end_loc', 'hour'], how='left') # 9
188 |     return result
189 | 
190 | # ----------------- 统计 -------------------
191 | 
192 | # 获取用户出行距离的统计值
193 | def get_user_distance_stat(train, result):
194 |     train = train[~train.geohashed_end_loc.isnull()]
195 |     user_distance_stat = train.groupby('userid', as_index=False)['distance'].agg({'user_distance_max': 'max', 'user_distance_min': 'min', 'user_distance_mean': 'mean'})
196 |     result = pd.merge(result, user_distance_stat, on=['userid'], how='left')
197 |     user_manhattan_stat = train.groupby('userid', as_index=False)['manhattan'].agg({'user_manhattan_max': 'max', 'user_manhattan_min': 'min', 'user_manhattan_mean': 'mean'})
198 |     result = pd.merge(result, user_manhattan_stat, on=['userid'], how='left')
199 |     return result
200 | 
201 | # 获取用户出行距离的分位点
202 | def get_user_distance_quantile(train, result):
203 |     train = train[~train.geohashed_end_loc.isnull()]
204 |     user_distance_quantile = train.groupby('userid')['distance'].quantile(0.2).reset_index()
205 |     user_distance_quantile.rename(columns={'distance': 'user_distance_quantile_2'}, inplace=True)
206 |     result = pd.merge(result, user_distance_quantile, on='userid', how='left')
207 |     user_manhattan_quantile = train.groupby('userid')['manhattan'].quantile(0.2).reset_index()
208 |     user_manhattan_quantile.rename(columns={'manhattan': 'user_manhattan_quantile_2'}, inplace=True)
209 |     result = pd.merge(result, user_manhattan_quantile, on='userid', how='left')
210 |     user_distance_quantile = train.groupby('userid')['distance'].quantile(0.8).reset_index()
211 |     user_distance_quantile.rename(columns={'distance': 'user_distance_quantile_8'}, inplace=True)
212 |     result = pd.merge(result, user_distance_quantile, on='userid', how='left')
213 |     return result
214 | 
215 | # 获取用户从某个地点出发的出行距离统计值
216 | def get_user_sloc_distance_stat(train, result):
217 |     train = train[~train.geohashed_end_loc.isnull()]
218 |     user_sloc_distance_stat = train.groupby(['userid', 'geohashed_start_loc'], as_index=False)['distance'].agg({'user_sloc_distance_max': 'max', 'user_sloc_distance_min': 'min', 'user_sloc_distance_mean': 'mean'})
219 |     result = pd.merge(result, user_sloc_distance_stat, on=['userid', 'geohashed_start_loc'], how='left')
220 |     user_sloc_manhattan_stat = train.groupby(['userid', 'geohashed_start_loc'], as_index=False)['manhattan'].agg({'user_sloc_manhattan_max': 'max', 'user_sloc_manhattan_min': 'min', 'user_sloc_manhattan_mean': 'mean'})
221 |     result = pd.merge(result, user_sloc_manhattan_stat, on=['userid', 'geohashed_start_loc'], how='left')
222 |     return result
223 | 
224 | # 获取用户到某个地点结束的出行距离统计值
225 | def get_user_eloc_distance_stat(train, result):
226 |     train = train[~train.geohashed_end_loc.isnull()]
227 |     user_eloc_distance_stat = train.groupby(['userid', 'geohashed_end_loc'], as_index=False)['distance'].agg({'user_eloc_distance_max': 'max', 'user_eloc_distance_min': 'min', 'user_eloc_distance_mean': 'mean'})
228 |     result = pd.merge(result, user_eloc_distance_stat, on=['userid', 'geohashed_end_loc'], how='left')
229 |     return result
230 | 
231 | # 获取用户各时间段出行距离的统计值
232 | def get_user_hour_distance_stat(train, result):
233 |     train = train[~train.geohashed_end_loc.isnull()]
234 |     user_hour_distance_stat = train.groupby(['userid', 'hour'], as_index=False)['distance'].agg({'user_hour_distance_max': 'max', 'user_hour_distance_min': 'min', 'user_hour_distance_mean': 'mean'})
235 |     result = pd.merge(result, user_hour_distance_stat, on=['userid', 'hour'], how='left')
236 |     user_hour_manhattan_stat = train.groupby(['userid', 'hour'], as_index=False)['manhattan'].agg({'user_hour_manhattan_max': 'max', 'user_hour_manhattan_min': 'min', 'user_hour_manhattan_mean': 'mean'})
237 |     result = pd.merge(result, user_hour_manhattan_stat, on=['userid', 'hour'], how='left')
238 |     return result
239 | 
240 | # 获取用户各时间段从某个地点出发的出行距离统计值
241 | def get_user_sloc_hour_distance_stat(train, result):
242 |     train = train[~train.geohashed_end_loc.isnull()]
243 |     user_sloc_hour_distance_stat = train.groupby(['userid', 'geohashed_start_loc', 'hour'], as_index=False)['distance'].agg({'user_sloc_hour_distance_max': 'max', 'user_sloc_hour_distance_min': 'min', 'user_sloc_hour_distance_mean': 'mean'})
244 |     result = pd.merge(result, user_sloc_hour_distance_stat, on=['userid', 'geohashed_start_loc', 'hour'], how='left')
245 |     user_sloc_hour_manhattan_stat = train.groupby(['userid', 'geohashed_start_loc', 'hour'], as_index=False)['manhattan'].agg({'user_sloc_hour_manhattan_max': 'max', 'user_sloc_hour_manhattan_min': 'min', 'user_sloc_hour_manhattan_mean': 'mean'})
246 |     result = pd.merge(result, user_sloc_hour_manhattan_stat, on=['userid', 'geohashed_start_loc', 'hour'], how='left')
247 |     return result
248 | 
249 | # 获取用户各时间段到某个地点结束的出行距离统计值
250 | def get_user_eloc_hour_distance_stat(train, result):
251 |     train = train[~train.geohashed_end_loc.isnull()]
252 |     user_eloc_hour_distance_stat = train.groupby(['userid', 'geohashed_end_loc', 'hour'], as_index=False)['distance'].agg({'user_eloc_hour_distance_max': 'max', 'user_eloc_hour_distance_min': 'min', 'user_eloc_hour_distance_mean': 'mean'})
253 |     result = pd.merge(result, user_eloc_hour_distance_stat, on=['userid', 'geohashed_end_loc', 'hour'], how='left')
254 |     return result
255 | 
256 | # 获取用户出行的小时段统计值
257 | def get_user_hour_stat(train, result):
258 |     user_hour_stat = train.groupby(['userid'], as_index=False)['hour'].agg({'user_hour_max': 'max', 'user_hour_min': 'min', 'user_hour_mean': 'mean'})
259 |     result = pd.merge(result, user_hour_stat, on=['userid'], how='left')
260 |     return result
261 | 
262 | # 获取用户从某个地点出行的小时段统计值
263 | def get_user_sloc_hour_stat(train, result):
264 |     user_sloc_hour_stat = train.groupby(['userid', 'geohashed_start_loc'], as_index=False)['hour'].agg({'user_sloc_hour_max': 'max', 'user_sloc_hour_min': 'min', 'user_sloc_hour_mean': 'mean'})
265 |     result = pd.merge(result, user_sloc_hour_stat, on=['userid', 'geohashed_start_loc'], how='left')
266 |     return result
267 | 
268 | # 获取用户到某个地点结束的小时段统计值
269 | def get_user_eloc_hour_stat(train, result):
270 |     train = train[~train.geohashed_end_loc.isnull()]
271 |     user_eloc_hour_stat = train.groupby(['userid', 'geohashed_end_loc'], as_index=False)['hour'].agg({'user_eloc_hour_max': 'max', 'user_eloc_hour_min': 'min', 'user_eloc_hour_mean': 'mean'})
272 |     result = pd.merge(result, user_eloc_hour_stat, on=['userid', 'geohashed_end_loc'], how='left')
273 |     return result
274 | 
275 | # 获取用户从某个地点出发到某个地点结束的小时段统计值
276 | def get_user_sloc_eloc_hour_stat(train, result):
277 |     train = train[~train.geohashed_end_loc.isnull()]
278 |     user_sloc_eloc_hour_stat = train.groupby(['userid', 'geohashed_start_loc', 'geohashed_end_loc'], as_index=False)['hour'].agg({'user_sloc_eloc_hour_max': 'max', 'user_sloc_eloc_hour_min': 'min', 'user_sloc_eloc_hour_mean': 'mean'}) # 6
279 |     # user_sloc_eloc_hour_stat = train.groupby(['userid', 'geohashed_start_loc', 'geohashed_end_loc'], as_index=False)['hour'].agg({'user_sloc_eloc_hour_min': 'min', 'user_sloc_eloc_hour_mean': 'mean'})
280 |     result = pd.merge(result, user_sloc_eloc_hour_stat, on=['userid', 'geohashed_start_loc', 'geohashed_end_loc'], how='left')
281 |     return result
282 | 
283 | # 获取用户到过最多的地点的各信息
284 | def get_user_most_freq_eloc(train, result):
285 |     train = train[~train.geohashed_end_loc.isnull()]
286 |     user_eloc_count = train.groupby(['userid', 'geohashed_end_loc'], as_index=False)['userid'].agg({'user_eloc_count': 'count'})
287 |     user_most_freq_eloc = user_eloc_count.sort_values(by=['userid', 'user_eloc_count']).groupby('userid', as_index=False).last()[['userid', 'geohashed_end_loc']]
288 |     user_most_freq_eloc.rename(columns={'geohashed_end_loc': 'user_most_freq_eloc'}, inplace=True)
289 |     result = pd.merge(result, user_most_freq_eloc, on='userid', how='left')
290 |     restmp = result[['orderid', 'geohashed_start_loc', 'user_most_freq_eloc']];
291 |     restmp.rename(columns={'user_most_freq_eloc': 'geohashed_end_loc'}, inplace=True)
292 |     restmp = get_distance(restmp)
293 |     restmp = get_sloc_latlon(restmp)
294 |     restmp = pd.merge(restmp, get_eloc_latlon(restmp[~restmp.geohashed_end_loc.isnull()][['orderid', 'geohashed_end_loc']]), on=['orderid', 'geohashed_end_loc'], how='left')
295 |     restmp = get_eloc_sloc_latlon_sub(restmp)
296 |     restmp = get_eloc_sloc_slope(restmp)
297 |     restmp = get_eloc_sloc_latlon_sub_divide_distance(restmp)
298 |     restmp = get_bearing_array(restmp)
299 |     result['user_most_freq_eloc_distance'] = restmp['distance']
300 |     result['user_most_freq_eloc_distance_sub'] = result['distance'] - result['user_most_freq_eloc_distance']
301 |     result['user_most_freq_eloc_distance_sub_abs'] = (result['distance'] - result['user_most_freq_eloc_distance']).abs()
302 |     result['user_most_freq_eloc_manhattan_distance'] = restmp['manhattan']
303 |     result['user_most_freq_eloc_manhattan_sub'] = result['manhattan'] - result['user_most_freq_eloc_manhattan_distance']
304 |     result['user_most_freq_eloc_manhattan_sub_abs'] = (result['manhattan'] - result['user_most_freq_eloc_manhattan_distance']).abs()
305 |     result['user_most_freq_eloc_lon_sub'] = restmp['eloc_sloc_lon_sub']
306 |     result['user_most_freq_eloc_lat_sub'] = restmp['eloc_sloc_lat_sub']
307 |     result['user_most_freq_eloc_slope'] = restmp['eloc_sloc_latlon_slope']
308 |     result['user_most_freq_eloc_lat_sub_divide_distance'] = restmp['eloc_sloc_lat_sub_divide_distance']
309 |     result['user_most_freq_eloc_lon_sub_divide_distance'] = restmp['eloc_sloc_lon_sub_divide_distance']
310 |     result['user_most_freq_eloc_degree'] = restmp['degree']
311 |     result.drop(['user_most_freq_eloc'], axis=1, inplace=True)
312 |     return result
313 | 
314 | # 获取用户到某个地点的最后一次时间与当前的时间差
315 | def get_user_eloc_lasttime(train, result):
316 |     train = train[~train.geohashed_end_loc.isnull()]
317 |     train = train.sort_values(by='starttime')
318 |     user_eloc_last = train.groupby(['userid','geohashed_end_loc'], as_index=False).last()[['userid','geohashed_end_loc', 'starttime']]
319 |     user_eloc_last.rename(columns={'starttime': 'user_eloc_lasttime'}, inplace=True)
320 |     result = pd.merge(result, user_eloc_last, on=['userid', 'geohashed_end_loc'], how='left')
321 |     result['user_eloc_lasttime'] = (pd.DatetimeIndex(result.starttime) - pd.DatetimeIndex(result.user_eloc_lasttime)).total_seconds().values
322 |     return result
323 | 
324 | # ----------------- 排序 -------------------
325 | 
326 | # 获取用户到某个地点结束的距离排序
327 | def get_user_eloc_distance_rank(result):
328 |     result = rank(result, ['userid', 'geohashed_end_loc'], 'distance', rank_name='user_eloc_distance_rank', ascending=False)
329 |     return result
330 | 
331 | # 获取用户从某个地点出发的距离排序
332 | def get_user_sloc_distance_rank(result):
333 |     result = rank(result, ['userid', 'geohashed_start_loc'], 'distance', rank_name='user_sloc_distance_rank', ascending=False)
334 |     return result
335 | 
336 | # 获取用户各小时段到某个地点结束的距离排序
337 | def get_user_eloc_hour_distance_rank(result):
338 |     result = rank(result, ['userid', 'geohashed_start_loc', 'hour'], 'distance', rank_name='user_eloc_hour_distance_rank', ascending=False)
339 |     return result
340 | 
341 | # 获取用户各小时段从某个地点出发的距离排序
342 | def get_user_sloc_hour_distance_rank(result):
343 |     result = rank(result, ['userid', 'geohashed_end_loc', 'hour'], 'distance', rank_name='user_sloc_hour_distance_rank', ascending=False)
344 |     return result
345 | 
346 | # 获取用户出行时间的小时段排序
347 | def get_user_hour_rank(result):
348 |     result = rank(result, 'userid', 'hour', rank_name='user_hour_rank', ascending=False)
349 |     return result
350 | 
351 | # 获取用户从某个地点出发的出行时间的小时段排序
352 | def get_user_sloc_hour_rank(result):
353 |     result = rank(result, ['userid', 'geohashed_start_loc'], 'hour', rank_name='user_sloc_hour_rank', ascending=False)
354 |     return result
355 | 
356 | # 获取用户到某个地点结束的出行时间的小时段排序
357 | def get_user_eloc_hour_rank(result):
358 |     result = rank(result, ['userid', 'geohashed_end_loc'], 'hour', rank_name='user_eloc_hour_rank', ascending=False)
359 |     return result
360 | 
361 | # 获取用户从某个地点出发到某个地点结束的出行时间的小时段排序
362 | def get_user_sloc_eloc_hour_rank(result):
363 |     result = rank(result, ['userid', 'geohashed_start_loc', 'geohashed_end_loc'], 'hour', rank_name='user_sloc_eloc_hour_rank', ascending=False) # 5
364 |     return result
365 | 
366 | # ----------------- 差值 -------------------
367 | 
368 | # 获取实际距离与用户出行距离统计值的(绝对)差值
369 | def get_user_distance_stat_sub(result):
370 |     result['user_distance_mean_sub'] = (result['distance'] - result['user_distance_mean'])
371 |     result['user_distance_mean_sub_abs'] = (result['distance'] - result['user_distance_mean']).abs()
372 |     result['user_manhattan_mean_sub'] = (result['manhattan'] - result['user_manhattan_mean'])
373 |     result['user_manhattan_mean_sub_abs'] = (result['manhattan'] - result['user_manhattan_mean']).abs()
374 |     return result
375 | 
376 | # 获取实际距离与用户从某个点出发距离统计值的(绝对)差值
377 | def get_user_sloc_distance_stat_sub(result):
378 |     result['user_sloc_distance_mean_sub'] = (result['distance'] - result['user_sloc_distance_mean'])
379 |     result['user_sloc_distance_mean_sub_abs'] = (result['distance'] - result['user_sloc_distance_mean']).abs()
380 |     result['user_sloc_manhattan_mean_sub'] = (result['manhattan'] - result['user_sloc_manhattan_mean'])
381 |     result['user_sloc_manhattan_mean_sub_abs'] = (result['manhattan'] - result['user_sloc_manhattan_mean']).abs()
382 |     return result
383 | 
384 | # 获取实际距离与用户到某个点结束距离统计值的(绝对)差值
385 | def get_user_eloc_distance_stat_sub(result):
386 |     result['user_eloc_distance_mean_sub'] = (result['distance'] - result['user_eloc_distance_mean'])
387 |     result['user_eloc_distance_mean_sub_abs'] = (result['distance'] - result['user_eloc_distance_mean']).abs()
388 |     return result
389 | 
390 | # 获取实际距离与用户出行距离统计值的各小时段(绝对)差值
391 | def get_user_hour_distance_stat_sub(result):
392 |     result['user_hour_distance_mean_sub'] = (result['distance'] - result['user_hour_distance_mean'])
393 |     result['user_hour_distance_mean_sub_abs'] = (result['distance'] - result['user_hour_distance_mean']).abs()
394 |     result['user_hour_manhattan_mean_sub'] = (result['manhattan'] - result['user_hour_manhattan_mean'])
395 |     result['user_hour_manhattan_mean_sub_abs'] = (result['manhattan'] - result['user_hour_manhattan_mean']).abs()
396 |     return result
397 | 
398 | # 获取实际距离与用户从某个点出发距离统计值的各小时段(绝对)差值
399 | def get_user_sloc_hour_distance_stat_sub(result):
400 |     result['user_sloc_hour_distance_mean_sub'] = (result['distance'] - result['user_sloc_hour_distance_mean'])
401 |     result['user_sloc_hour_distance_mean_sub_abs'] = (result['distance'] - result['user_sloc_hour_distance_mean']).abs()
402 |     result['user_sloc_hour_manhattan_mean_sub'] = (result['manhattan'] - result['user_sloc_hour_manhattan_mean'])
403 |     result['user_sloc_hour_manhattan_mean_sub_abs'] = (result['manhattan'] - result['user_sloc_hour_manhattan_mean']).abs()
404 |     return result
405 | 
406 | # 获取实际距离与用户到某个点结束距离统计值的各小时段(绝对)差值
407 | def get_user_eloc_hour_distance_stat_sub(result):
408 |     result['user_eloc_hour_distance_mean_sub'] = (result['distance'] - result['user_eloc_hour_distance_mean'])
409 |     result['user_eloc_hour_distance_mean_sub_abs'] = (result['distance'] - result['user_eloc_hour_distance_mean']).abs()
410 |     return result
411 | 
412 | # 获取小时段与用户出行的小时段统计值的(绝对)差值
413 | def get_hour_user_hour_stat_sub(result):
414 |     result['hour_user_hour_mean_sub'] = (result['hour'] - result['user_hour_mean'])
415 |     result['hour_user_hour_mean_sub_abs'] = (result['hour'] - result['user_hour_mean']).abs()
416 |     return result
417 | 
418 | # 获取小时段与用户从某个地方出发的小时段统计值的(绝对)差值
419 | def get_hour_user_sloc_hour_stat_sub(result):
420 |     result['hour_user_sloc_hour_mean_sub'] = (result['hour'] - result['user_sloc_hour_mean'])
421 |     result['hour_user_sloc_hour_mean_sub_abs'] = (result['hour'] - result['user_sloc_hour_mean']).abs()
422 |     return result
423 | 
424 | # 获取小时段与用户到某个地方结束的小时段统计值的(绝对)差值
425 | def get_hour_user_eloc_hour_stat_sub(result):
426 |     result['hour_user_eloc_hour_mean_sub'] = (result['hour'] - result['user_eloc_hour_mean'])
427 |     result['hour_user_eloc_hour_mean_sub_abs'] = (result['hour'] - result['user_eloc_hour_mean']).abs()
428 |     return result
429 | 
430 | # 获取小时段与用户从某个地点出发到某个地方结束的小时段统计值的(绝对)差值
431 | def get_hour_user_sloc_eloc_hour_stat_sub(result):
432 |     result['hour_user_sloc_eloc_hour_mean_sub'] = (result['hour'] - result['user_sloc_eloc_hour_mean'])
433 |     result['hour_user_sloc_eloc_hour_mean_sub_abs'] = (result['hour'] - result['user_sloc_eloc_hour_mean']).abs()
434 |     return result
435 | 
436 | # ----------------- 比例 -------------------
437 | 
438 | # 获取全局中用户目的地出现在出发地中的个数占用户出行次数的比例
439 | def get_global_user_sloc_count_ratio(result):
440 |     train = pd.read_csv('../../MOBIKE_CUP_2017/train.csv')
441 |     test = pd.read_csv('../../MOBIKE_CUP_2017/test.csv')
442 |     train = pd.concat([train, test])
443 |     user_sloc_count = train.groupby(['userid','geohashed_start_loc'])['userid'].agg({'global_user_sloc_count_ratio': 'count'})
444 |     user_count = train.groupby(['userid'])['userid'].agg({'global_user_sloc_count_ratio': 'count'})  
445 |     user_sloc_count = user_sloc_count.div(user_count).reset_index()
446 |     user_sloc_count.rename(columns={'geohashed_start_loc':'geohashed_end_loc'},inplace=True)
447 |     result = pd.merge(result, user_sloc_count, on=['userid', 'geohashed_end_loc'], how='left')
448 |     return result
449 | 
450 | # 获取用户到某个目的地的个数占用户出行总数的比例
451 | def get_user_eloc_count_ratio(result):
452 |     result['user_eloc_count_ratio'] = result['user_eloc_count'] / result['user_count']
453 |     return result


--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
  1 | # -*- coding:utf-8 -*-
  2 | import pandas as pd
  3 | import numpy as np
  4 | from sklearn.model_selection import train_test_split
  5 | import lightgbm as lgb
  6 | import pickle
  7 | import datetime
  8 | import fire
  9 | import gc
 10 | import warnings
 11 | warnings.filterwarnings('ignore')
 12 | 
 13 | from config import DefaultConfig
 14 | from dataset import get_train_data, get_test_data, get_sample
 15 | from feature import get_feat
 16 | from utils import get_label, get_score, load_model, predict, rank
 17 | 
 18 | def train(**kwargs):
 19 | 
 20 | 	# ---------------------- 更新参数 ----------------------
 21 | 	opt = DefaultConfig()
 22 | 	opt.update(**kwargs)
 23 | 	opt.printf()
 24 | 
 25 | 	# ---------------------- 数据处理 ----------------------
 26 | 
 27 | 	# 获取数据
 28 | 	# train1, train2 = get_train_data(opt)
 29 | 	# 获取样本
 30 | 	# train_sample = get_sample(train1, train2, load=True)
 31 | 	# 获取特征
 32 | 	# train_feat = get_feat(train1, train_sample)
 33 | 	# 获取标签
 34 | 	# train_all = get_label(train_feat, opt)
 35 | 	# gc.collect()
 36 | 
 37 | 	# train_all.to_hdf('/home/xuwenchao/dyj-storage/all-feat/feat_23_24_label.hdf', 'w', complib='blosc', complevel=5)
 38 | 	train_all = pd.read_hdf('/home/xuwenchao/dyj-storage/all-feat/feat_23_24_label.hdf')
 39 | 	print(train_all.shape)
 40 |     
 41 | 	# 取出需要用的特征
 42 | 	# opt['model_name'] = 'lgb_1_2017-09-15#19:50:48_0.58820.pkl'
 43 | 	# gbm, use_feat = load_model(opt)
 44 | 	# predictors_100 = pd.DataFrame(data={'feature_name': gbm.feature_name(), 'feature_importance': gbm.feature_importance()})
 45 | 	# predictors_100 = predictors_100.sort_values(by=['feature_importance'], ascending=False)['feature_name'].values[:100]
 46 | 	# use_feat = list(predictors_100) + ['orderid', 'geohashed_end_loc', 'label'] + ['sloc_eloc_common_eloc_count', 'sloc_eloc_common_sloc_count', 'sloc_eloc_common_conn1_count', 'sloc_eloc_common_conn2_count', 'sloc_eloc_common_eloc_rate', 'sloc_eloc_common_sloc_rate', 'sloc_eloc_common_conn1_rate', 'sloc_eloc_common_conn2_rate', 'user_sloc_eloc_common_eloc_count', 'user_sloc_eloc_common_sloc_count', 'user_sloc_eloc_common_conn1_count', 'user_sloc_eloc_common_conn2_count', 'user_sloc_eloc_common_eloc_rate', 'user_sloc_eloc_common_sloc_rate', 'user_sloc_eloc_common_conn1_rate', 'user_sloc_eloc_common_conn2_rate']
 47 | 	# train_all = train_all[use_feat]
 48 | 	# gc.collect()
 49 |     
 50 | 	# -------------------- 训练第一层 ------------------------
 51 | 
 52 | 	# ********* 准备数据 **********
 53 | 	# 划分验证集
 54 | 	train, val = train_test_split(train_all, test_size=0.1)
 55 | 	# 定义使用哪些特征
 56 | 	# opt['model_name'] = 'lgb_1_2017-09-15#19:50:48_0.58820.pkl'
 57 | 	# gbm, use_feat = load_model(opt)
 58 | 	filters = set(['orderid', 'userid', 'biketype', 'geohashed_start_loc', 'bikeid', 'starttime', 'geohashed_end_loc', 'label'])
 59 | 	predictors = list(filter(lambda x: x not in filters, train_all.columns.tolist()))
 60 | 	# predictors = pd.DataFrame(data={'feature_name': gbm.feature_name(), 'feature_importance': gbm.feature_importance()})
 61 | 	# predictors = predictors.sort_values(by=['feature_importance'], ascending=False)['feature_name'].values[:100]
 62 | 	# use_feat = list(predictors) + ['orderid', 'geohashed_end_loc'] + ['sloc_eloc_common_eloc_count', 'sloc_eloc_common_sloc_count', 'sloc_eloc_common_conn1_count', 'sloc_eloc_common_conn2_count', 'sloc_eloc_common_eloc_rate', 'sloc_eloc_common_sloc_rate', 'sloc_eloc_common_conn1_rate', 'sloc_eloc_common_conn2_rate', 'user_sloc_eloc_common_eloc_count', 'user_sloc_eloc_common_sloc_count', 'user_sloc_eloc_common_conn1_count', 'user_sloc_eloc_common_conn2_count', 'user_sloc_eloc_common_eloc_rate', 'user_sloc_eloc_common_sloc_rate', 'user_sloc_eloc_common_conn1_rate', 'user_sloc_eloc_common_conn2_rate']
 63 | 	# predictors = list(predictors_100) + ['sloc_eloc_common_eloc_count', 'sloc_eloc_common_sloc_count', 'sloc_eloc_common_conn1_count', 'sloc_eloc_common_conn2_count', 'sloc_eloc_common_eloc_rate', 'sloc_eloc_common_sloc_rate', 'sloc_eloc_common_conn1_rate', 'sloc_eloc_common_conn2_rate', 'user_sloc_eloc_common_eloc_count', 'user_sloc_eloc_common_sloc_count', 'user_sloc_eloc_common_conn1_count', 'user_sloc_eloc_common_conn2_count', 'user_sloc_eloc_common_eloc_rate', 'user_sloc_eloc_common_sloc_rate', 'user_sloc_eloc_common_conn1_rate', 'user_sloc_eloc_common_conn2_rate']
 64 | 	print('使用的特征：{}维\n'.format(len(predictors)), predictors)
 65 | 	# 定义数据集
 66 | 	X_train = train[predictors]
 67 | 	y_train = train['label']
 68 | 	X_val = val[predictors]
 69 | 	y_val = val['label']
 70 | 	del train, val
 71 | 	gc.collect()
 72 | 
 73 | 	# ********* LightGBM *********
 74 | 	# 数据集
 75 | 	lgb_train = lgb.Dataset(X_train, y_train)
 76 | 	lgb_val = lgb.Dataset(X_val, y_val, reference=lgb_train)
 77 | 	# 配置
 78 | 	params = {
 79 | 	    'objective': 'binary',
 80 | 	    'metric': {'auc', 'binary_logloss'},
 81 | 	    'is_unbalance': True,
 82 | 	    'num_leaves': opt['lgb_leaves'],
 83 | 	    'learning_rate': opt['lgb_lr'],
 84 | 	    'feature_fraction': 0.886,
 85 | 	    'bagging_fraction': 0.886,
 86 | 	    'bagging_freq': 5
 87 | 	}
 88 | 	gc.collect()    
 89 | 	# ********** 开始训练 *********
 90 | 	gbm1 = lgb.train(
 91 | 				params,
 92 |                 lgb_train,
 93 |                 num_boost_round=1200,
 94 |                 valid_sets=[lgb_train, lgb_val],
 95 |                 early_stopping_rounds=5
 96 | 	)
 97 | 	gc.collect()
 98 |     
 99 | 	# #  ********* 保存模型 *********
100 | 
101 | 	cur_time = datetime.datetime.now().strftime('%Y-%m-%d#%H:%M:%S')
102 | 	# save_path = '{}/{}_{}_{:.5f}.pkl'.format(opt['model_dir'], 'lgb', cur_time, score[0])
103 | 	save_path = '{}/{}_{}.pkl'.format(opt['model_dir'], 'lgb', cur_time)
104 | 	with open(save_path, 'wb') as fout:
105 | 	    pickle.dump(gbm1, fout)
106 | 	print('保存模型：', save_path)
107 | 	gc.collect()    
108 | 
109 | 	# #  ********* 评估  *********
110 | 
111 | 	# # 在训练集上看效果
112 | 	del X_train, y_train, X_val, y_val
113 | 	gc.collect()    
114 | 	score = get_score(train_all, predictors, gbm1, opt)
115 | 	print('训练集分数：{}'.format(score))
116 |     
117 | 	import sys
118 | 	sys.exit(0)
119 |     
120 | 	# save_path = '{}/{}.pkl'.format(opt['model_dir'], 'lgb_1_300_top25')
121 | 	# with open(save_path, 'wb') as fout:
122 | 	#     pickle.dump(gbm1, fout)
123 | 	# print('保存模型(第一层)：', save_path)
124 | 
125 | 	# ********* save predict *****
126 | 
127 | 	# train_all[['orderid', 'geohashed_end_loc', 'pred']].to_hdf('/home/xuwenchao/dyj-storage/train2324_80_pred_res.hdf', 'w', complib='blosc', complevel=5)
128 | 	# print('Save train_pred_res.hdf successful!!!')
129 | 
130 | 	# import sys
131 | 	# sys.exit(0)
132 |     
133 | 	# -------------------- 训练第二层 ------------------------
134 |     
135 | 	# opt['model_name'] = 'lgb_1_300_top25.pkl'
136 | 	# gbm1, use_feat1 = load_model(opt)
137 | 	# train_all.loc[:, 'pred'] = gbm1.predict(train_all[use_feat1])
138 | 
139 | 	# 去掉重要性较低的特征，筛选出排名前十的候选样本，重新训练模型（后期可以载入模型finetune，尤其是对于样本量较少的情况，甚至可以选前5，但15可以覆盖99.5%的原始label，10可以覆盖98%的原始label，这两者可能会好一些，备选方案：5(+finetune)，10(+finetune)，15(+finetune)）
140 | 	predictors = pd.DataFrame(data={'feature_name': gbm1.feature_name(), 'feature_importance': gbm1.feature_importance()})
141 | 	predictors = predictors[predictors['feature_importance']>0]['feature_name'].values
142 | 	print('第二层使用的特征：{}维\n'.format(len(predictors)), predictors)
143 | 	train_all = train_all.sort_values(by=['orderid', 'pred'], ascending=False).groupby('orderid').head(15)
144 | 	# train_all = rank(train_all, 'orderid', 'pred', ascending=False)
145 | 	del train_all['pred']
146 | 	print('第二层数据：', train_all.shape)
147 |     
148 | 	# ********* 准备数据 **********
149 | 	# 划分验证集
150 | 	train, val = train_test_split(train_all, test_size=0.1)
151 |     
152 | 	# 定义数据集
153 | 	X_train = train[predictors]
154 | 	y_train = train['label']
155 | 	X_val = val[predictors]
156 | 	y_val = val['label']
157 | 	del train, val
158 | 	gc.collect()
159 | 
160 | 	# 数据集
161 | 	lgb_train = lgb.Dataset(X_train, y_train)
162 | 	lgb_val = lgb.Dataset(X_val, y_val, reference=lgb_train)
163 | 
164 | 	# ********** 开始训练 *********
165 | 	gbm2 = lgb.train(
166 | 				params,
167 |                 lgb_train,
168 |                 num_boost_round=1200,
169 |                 valid_sets=[lgb_train, lgb_val],
170 |                 early_stopping_rounds=5
171 |                 # init_model=gbm1 # finetune
172 | 	        )
173 | 
174 | 	#  ********* 评估  *********
175 | 
176 | 	# 在训练集上看效果    
177 | 	score = get_score(train_all, predictors, gbm2, opt)
178 | 	print('训练集分数(第二层)：{}'.format(score))
179 | 
180 | 	#  ********* 保存模型 *********
181 | 
182 | 	cur_time = datetime.datetime.now().strftime('%Y-%m-%d#%H:%M:%S')
183 | 	save_path = '{}/{}_{}_{:.5f}.pkl'.format(opt['model_dir'], 'lgb_2', cur_time, score[0])
184 | 	with open(save_path, 'wb') as fout:
185 | 	    pickle.dump(gbm2, fout)
186 | 	print('保存模型(第二层)：', save_path)
187 | 	# save_path = '{}/{}.pkl'.format(opt['model_dir'], 'lgb_2_300_top15')
188 | 	# with open(save_path, 'wb') as fout:
189 | 	#     pickle.dump(gbm2, fout)
190 | 	# print('保存模型(第二层)：', save_path)
191 |     
192 | 	import sys
193 | 	sys.exit(0)
194 |     
195 | 	# -------------------- 训练第三层 ------------------------
196 |     
197 | 	# 筛选出排名前五的候选样本
198 | 	predictors = pd.DataFrame(data={'feature_name': gbm2.feature_name(), 'feature_importance': gbm2.feature_importance()})
199 | 	predictors = predictors[predictors['feature_importance']>0]['feature_name'].values
200 | 	print('第三层使用的特征：{}维\n'.format(len(predictors)), predictors)
201 | 	train_all = train_all.sort_values(by=['orderid', 'pred'], ascending=False).groupby('orderid').head(10)
202 | 	# train_all = rank(train_all, 'orderid', 'pred', ascending=False)
203 | 	del train_all['pred']
204 | 	print('第三层数据：', train_all.shape)
205 |     
206 | 	# ********* 准备数据 **********
207 | 	# 划分验证集
208 | 	train, val = train_test_split(train_all, test_size=0.1)
209 |     
210 | 	# 定义数据集
211 | 	X_train = train[predictors]
212 | 	y_train = train['label']
213 | 	X_val = val[predictors]
214 | 	y_val = val['label']
215 | 	del train, val
216 | 	gc.collect()
217 | 
218 | 	# 数据集
219 | 	lgb_train = lgb.Dataset(X_train, y_train)
220 | 	lgb_val = lgb.Dataset(X_val, y_val, reference=lgb_train)
221 | 
222 | 	# ********** 开始训练 *********
223 | 	gbm3 = lgb.train(
224 | 				params,
225 |                 lgb_train,
226 |                 num_boost_round=1200,
227 |                 valid_sets=[lgb_train, lgb_val],
228 |                 early_stopping_rounds=5
229 |                 # init_model=gbm2 # finetune
230 | 	        )
231 | 
232 | 	#  ********* 评估  *********
233 | 
234 | 	# 在训练集上看效果    
235 | 	score = get_score(train_all, predictors, gbm3, opt)
236 | 	print('训练集分数(第三层)：{}'.format(score))
237 | 
238 | 	#  ********* 保存模型 *********
239 | 
240 | 	cur_time = datetime.datetime.now().strftime('%Y-%m-%d#%H:%M:%S')
241 | 	save_path = '{}/{}_{}_{:.5f}.pkl'.format(opt['model_dir'], 'lgb_3', cur_time, score[0])
242 | 	with open(save_path, 'wb') as fout:
243 | 	    pickle.dump(gbm3, fout)
244 | 	print('保存模型(第三层)：', save_path)
245 | 	save_path = '{}/{}.pkl'.format(opt['model_dir'], 'lgb_3_300_top10')
246 | 	with open(save_path, 'wb') as fout:
247 | 	    pickle.dump(gbm3, fout) 
248 | 	print('保存模型(第三层)：', save_path)
249 | 
250 |     
251 | 	# -------------------- 训练第四层 ------------------------
252 |     
253 | 	# 筛选出排名前三的候选样本
254 | 	predictors = pd.DataFrame(data={'feature_name': gbm3.feature_name(), 'feature_importance': gbm3.feature_importance()})
255 | 	predictors = predictors[predictors['feature_importance']>0]['feature_name'].values
256 | 	print('第四层使用的特征：{}维\n'.format(len(predictors)), predictors)
257 | 	train_all = train_all.sort_values(by=['orderid', 'pred'], ascending=False).groupby('orderid').head(5)
258 | 	# train_all = rank(train_all, 'orderid', 'pred', ascending=False)
259 | 	del train_all['pred']
260 | 	print('第四层数据：', train_all.shape)
261 |     
262 | 	# ********* 准备数据 **********
263 | 	# 划分验证集
264 | 	train, val = train_test_split(train_all, test_size=0.1)
265 |     
266 | 	# 定义数据集
267 | 	X_train = train[predictors]
268 | 	y_train = train['label']
269 | 	X_val = val[predictors]
270 | 	y_val = val['label']
271 | 	del train, val
272 | 	gc.collect()
273 | 
274 | 	# 数据集
275 | 	lgb_train = lgb.Dataset(X_train, y_train)
276 | 	lgb_val = lgb.Dataset(X_val, y_val, reference=lgb_train)
277 | 
278 | 	# ********** 开始训练 *********
279 | 	gbm4 = lgb.train(
280 | 				params,
281 |                 lgb_train,
282 |                 num_boost_round=1200,
283 |                 valid_sets=[lgb_train, lgb_val],
284 |                 early_stopping_rounds=5
285 |                 # init_model=gbm3 # finetune
286 | 	        )
287 | 
288 | 	#  ********* 评估  *********
289 | 
290 | 	# 在训练集上看效果    
291 | 	score = get_score(train_all, predictors, gbm4, opt)
292 | 	print('训练集分数(第四层)：{}'.format(score))
293 | 
294 | 	#  ********* 保存模型 *********
295 | 
296 | 	cur_time = datetime.datetime.now().strftime('%Y-%m-%d#%H:%M:%S')
297 | 	save_path = '{}/{}_{}_{:.5f}.pkl'.format(opt['model_dir'], 'lgb_4', cur_time, score[0])
298 | 	with open(save_path, 'wb') as fout:
299 | 	    pickle.dump(gbm4, fout)
300 | 	print('保存模型(第四层)：', save_path)
301 | 	save_path = '{}/{}.pkl'.format(opt['model_dir'], 'lgb_4_300_top5')
302 | 	with open(save_path, 'wb') as fout:
303 | 	    pickle.dump(gbm4, fout) 
304 | 	print('保存模型(第四层)：', save_path)
305 | 
306 | def val(**kwargs):
307 | 
308 | 	# ---------------------- 更新参数 ----------------------
309 | 	opt = DefaultConfig()
310 | 	opt.update(**kwargs)
311 | 	opt.printf()
312 | 
313 | 	# ---------------------- 数据处理 ----------------------
314 | 
315 | 	# 获取数据
316 | 	# train1, train2, train_test = get_train_data(opt)
317 | 	# 获取样本
318 | 	# train_sample = get_sample(train1, train2, load=True)
319 | 	# 获取特征
320 | 	# train_feat = get_feat(train_test, train_sample)
321 | 	# gc.collect()
322 |     
323 | 	# train_feat.to_hdf('/home/xuwenchao/dyj-storage/all-feat/feat_{}.hdf'.format(opt['startday']), 'w', complib='blosc', complevel=5)
324 | 	train_feat = pd.read_hdf('/home/xuwenchao/dyj-storage/all-feat/feat_23_24_label.hdf')
325 | 
326 |     # ---------------------- 载入模型 ----------------------
327 | 	
328 | 	# opt['model_name'] = 'lgb_1_90_all.pkl'
329 | 	# gbm0, use_feat0 = load_model(opt)
330 | 	opt['model_name'] = 'lgb_1_2017-09-15#19:50:48_0.58820.pkl'
331 | 	gbm, use_feat = load_model(opt)
332 | 	opt['model_name'] = 'lgb_2017-09-23#20:14:52_0.58893.pkl'
333 | 	gbm1, use_feat1 = load_model(opt)
334 | 	# gbm2, use_feat2 = load_model(opt)
335 | 	# opt['model_name'] = 'lgb_2017-09-03#23:24:26_0.57836.pkl'
336 | 	# gbm3, use_feat3 = load_model(opt)
337 | 	# opt['model_name'] = ''
338 | 	# gbm4, use_feat4 = load_model(opt)
339 | 
340 | 	# ---------------------- 评估 -------------------------
341 | 
342 | 	train_feat.loc[:, 'pred'] = gbm.predict(train_feat[use_feat])
343 | 	gc.collect()
344 | 	train_feat[['orderid', 'geohashed_end_loc', 'pred']].to_csv('/home/xuwenchao/dyj-storage/pred/pred_23_24_0.58820.csv', index=None)
345 | 	train_feat.loc[:, 'pred'] = gbm1.predict(train_feat[use_feat1])
346 | 	gc.collect()
347 | 	train_feat[['orderid', 'geohashed_end_loc', 'pred']].to_csv('/home/xuwenchao/dyj-storage/pred/pred_23_24_0.58893.csv', index=None)
348 | 	# train_feat = train_feat.sort_values(by=['orderid', 'pred'], ascending=False).groupby('orderid').head(25)
349 | 	# train_feat[['orderid', 'geohashed_end_loc']].to_hdf('/home/xuwenchao/dyj-storage/sample_25_{}.hdf'.format(train.shape[0]), 'w', complib='blosc', complevel=5)
350 | 	# gc.collect()
351 | 
352 | 	# score = get_score(train_feat, use_feat, gbm, opt)
353 | 	# print('day{}分数：{}'.format(opt['startday'], score))
354 | 	# score = get_score(train_feat, use_feat1, gbm1, opt)
355 | 	# print('day{}分数：{}'.format(opt['startday'], score)) 
356 | 	# train_feat = train_feat.sort_values(by=['orderid', 'pred'], ascending=False).groupby('orderid').head(15)
357 | 	# score = get_score(train_feat, use_feat1, gbm1, opt)
358 | 	# print('day{}分数：{}'.format(opt['startday'], score)) 
359 | 	# score1 = get_score(train_feat, use_feat1, gbm1, opt)
360 | 	# print('day{}分数: {}'.format(opt['startday'], score1))
361 | 	# score2 = get_score(train_feat, use_feat2, gbm2, opt)
362 | 	# print('day{}分数: {}'.format(opt['startday'], score2))
363 | 	# score3 = get_score(train_feat, use_feat3, gbm3, opt)
364 | 	# print('day{}分数: {}'.format(opt['startday'], score3))
365 | 	# score4 = get_score(train_feat, use_feat4, gbm4, opt)
366 | 	# print('day{}分数: {}'.format(opt['startday'], score4))
367 | 
368 | def test(**kwargs):
369 | 	
370 | 	# ---------------------- 更新参数 ----------------------
371 | 	opt = DefaultConfig()
372 | 	opt.update(**kwargs)
373 | 	opt.printf()
374 | 
375 |     # ---------------------- 数据处理 ----------------------
376 | 
377 |     # 获取数据
378 | 	train, test = get_test_data(opt)
379 | 	gc.collect()
380 |  #    # 获取样本
381 | 	# test_sample = get_sample(train, test, load=True)
382 | 	# gc.collect()
383 |  #    # 获取特征
384 | 	# test_feat = get_feat(train, test_sample)
385 | 	# gc.collect()
386 | 
387 | 	# 保存特征至文件
388 | 	# test_feat.to_hdf('/home/xuwenchao/dyj-storage/all-feat/feat_{}.hdf'.format(test.shape[0]), 'w', complib='blosc', complevel=5)
389 | 	test_feat = pd.read_hdf('/home/xuwenchao/dyj-storage/all-feat/feat_{}.hdf'.format(test.shape[0]))
390 | 	test_feat = get_feat(train, test_feat)
391 | 	gc.collect()
392 | 	test_feat.to_hdf('/home/xuwenchao/dyj-storage/all-feat/feat_{}_filter.hdf'.format(test.shape[0]), 'w', complib='blosc', complevel=5)
393 | 
394 |     # ---------------------- 载入模型 ----------------------
395 | 
396 | 	# opt['model_name'] = 'lgb_1_90_all.pkl'
397 | 	# gbm0, use_feat0 = load_model(opt)
398 | 	opt['model_name'] = 'lgb_2017-09-23#20:14:52_0.58893.pkl'
399 | 	gbm1, use_feat1 = load_model(opt)
400 | 	# opt['model_name'] = 'lgb_2_300_top15.pkl'
401 | 	# gbm2, use_feat2 = load_model(opt)
402 | 	# opt['model_name'] = 'lgb_3_300_top10.pkl'
403 | 	# gbm3, use_feat3 = load_model(opt)
404 | 	# opt['model_name'] = 'lgb_4_300_top5.pkl'
405 | 	# gbm4, use_feat4 = load_model(opt)
406 |     
407 | 	# ---------------------- 保存预测结果 -------------------
408 | 
409 | 	# test_feat.loc[:, 'pred'] = gbm0.predict(test_feat[use_feat0])
410 | 	# gc.collect()
411 | 	# res = test_feat[['orderid', 'geohashed_end_loc', 'pred']].sort_values(by=['orderid', 'pred'], ascending=False).groupby('orderid').head(25)
412 | 	# res[['orderid', 'geohashed_end_loc']].to_hdf('/home/xuwenchao/dyj-storage/sample_25_{}_filter_leak_sample.hdf'.format(test.shape[0]), 'w', complib='blosc', complevel=5)
413 | 	# gc.collect()
414 | 
415 | 	# test_feat.loc[:, 'pred'] = gbm1.predict(test_feat[use_feat1])
416 | 	# test_feat[['orderid', 'geohashed_end_loc', 'pred']].to_hdf('/home/xuwenchao/dyj-storage/pred/pred_{}_0.58820.hdf'.format(test.shape[0]), 'w', complib='blosc', complevel=5)
417 | 
418 | 	res = predict(test_feat, use_feat1, gbm1)
419 | 	test_feat[['orderid', 'geohashed_end_loc', 'pred']].to_hdf('/home/xuwenchao/dyj-storage/pred/pred_{}_0.58893.hdf'.format(test.shape[0]), 'w', complib='blosc', complevel=5)
420 | 	gc.collect()
421 | 	cur_time = datetime.datetime.now().strftime('%Y-%m-%d#%H:%M:%S')
422 | 	res_path = '{}/day{}_{}_wc_sample_0.58893.csv'.format(opt['result_dir'], opt['test_startday'], cur_time)
423 | 	res.to_csv(res_path, index=False)
424 | 	print('保存测试结果至：', res_path)
425 | # 	test_feat = test_feat.sort_values(by=['orderid', 'pred'], ascending=False).groupby('orderid').head(15)
426 | # 	del test_feat['pred']
427 | # 	gc.collect()
428 | 
429 | # 	res = predict(test_feat, use_feat2, gbm2)
430 | # 	gc.collect()
431 | # 	test_feat = test_feat.sort_values(by=['orderid', 'pred'], ascending=False).groupby('orderid').head(10)
432 | # 	del test_feat['pred']
433 | # 	gc.collect()
434 | 
435 | # 	res = predict(test_feat, use_feat3, gbm3)
436 | # 	gc.collect()
437 | # 	test_feat = test_feat.sort_values(by=['orderid', 'pred'], ascending=False).groupby('orderid').head(5)
438 | # 	del test_feat['pred']
439 | # 	gc.collect()
440 | 
441 | # 	res = predict(test_feat, use_feat4, gbm4)
442 | # 	gc.collect()
443 | # 	cur_time = datetime.datetime.now().strftime('%Y-%m-%d#%H:%M:%S')
444 | # 	res_path = '{}/day{}_{}_5.csv'.format(opt['result_dir'], opt['test_startday'], cur_time)
445 | # 	res.to_csv(res_path, index=False)
446 | # 	print('保存测试结果至：', res_path)
447 | 
448 | if __name__ == '__main__':
449 |     fire.Fire()


--------------------------------------------------------------------------------
/submit.py:
--------------------------------------------------------------------------------
 1 | # -*- coding:utf-8 -*-
 2 | import pandas as pd
 3 | import datetime
 4 | 
 5 | result_dir = '../result'
 6 | 
 7 | test25 = pd.read_csv(result_dir + '/day25_2017-09-24#05:02:10_wc_sample_0.58893.csv')
 8 | test26 = pd.read_csv(result_dir + '/day26_2017-09-24#09:27:13_wc_sample_0.58893.csv')
 9 | test27 = pd.read_csv(result_dir + '/day27_2017-09-24#16:49:10_wc_sample_0.58893.csv')
10 | test28 = pd.read_csv(result_dir + '/day28_2017-09-24#20:18:04_wc_sample_0.58893.csv')
11 | test29 = pd.read_csv(result_dir + '/day29_2017-09-24#23:26:19_wc_sample_0.58893.csv')
12 | test30 = pd.read_csv(result_dir + '/day30_2017-09-25#02:02:54_wc_sample_0.58893.csv')
13 | test31 = pd.read_csv(result_dir + '/day31_2017-09-25#08:46:58_wc_sample_0.58893.csv')
14 | 
15 | # 生成全部测试结果
16 | test = pd.read_csv('../../MOBIKE_CUP_2017/test.csv')
17 | res = pd.concat([test25, test26, test27, test28, test29, test30, test31])
18 | print(res.shape)
19 | res = pd.merge(test[['orderid']], res, on='orderid', how='left')
20 | res.fillna('0', inplace=True)
21 | 
22 | # 生成提交文件
23 | cur_time = datetime.datetime.now().strftime('%Y-%m-%d#%H:%M:%S')
24 | res_path = '{}/result_{}_{}.csv'.format(result_dir, '0.58893_wc_sample', cur_time)
25 | res.to_csv(res_path, header=False, index=False)
26 | print('保存提交结果至：', res_path)


--------------------------------------------------------------------------------
/utils/__init__.py:
--------------------------------------------------------------------------------
1 | from .helper import *
2 | from .eval import get_score, predict, get_label
3 | 


--------------------------------------------------------------------------------
/utils/__pycache__/__init__.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Magic-Bubble/Mobike/7492d9ac7e05a22a61c2435a24d14a15387ccaf2/utils/__pycache__/__init__.cpython-35.pyc


--------------------------------------------------------------------------------
/utils/__pycache__/eval.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Magic-Bubble/Mobike/7492d9ac7e05a22a61c2435a24d14a15387ccaf2/utils/__pycache__/eval.cpython-35.pyc


--------------------------------------------------------------------------------
/utils/__pycache__/helper.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Magic-Bubble/Mobike/7492d9ac7e05a22a61c2435a24d14a15387ccaf2/utils/__pycache__/helper.cpython-35.pyc


--------------------------------------------------------------------------------
/utils/__pycache__/label.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Magic-Bubble/Mobike/7492d9ac7e05a22a61c2435a24d14a15387ccaf2/utils/__pycache__/label.cpython-35.pyc


--------------------------------------------------------------------------------
/utils/eval.py:
--------------------------------------------------------------------------------
 1 | # -*- coding:utf-8 -*-
 2 | import os
 3 | import pandas as pd
 4 | import numpy as np
 5 | import pickle
 6 | from .helper import rank
 7 | 
 8 | # 获取真实标签
 9 | def get_label(data, opt):
10 |     result_path = opt['cache_dir'] + '/true.pkl'
11 |     if os.path.exists(result_path):
12 |         true = pickle.load(open(result_path, 'rb+'))
13 |     else:
14 |         train = pd.read_csv(opt['train_csv'])
15 |         test = pd.read_csv(opt['test_csv'])
16 |         test['geohashed_end_loc'] = np.nan
17 |         data_all = pd.concat([train, test])
18 |         true = dict(zip(data_all['orderid'].values, data_all['geohashed_end_loc']))
19 |         pickle.dump(true, open(result_path, 'wb+'))
20 |     data['label'] = data['orderid'].map(true)
21 |     if data.get('geohashed_end_loc', None) is not None:
22 |         data['label'] = (data['label'] == data['geohashed_end_loc']).astype('int')
23 |     return data
24 | 
25 | # 整合预测结果
26 | def reshape(pred):
27 |     result = pred[["orderid", "pred", "geohashed_end_loc"]].copy()
28 |     result = rank(result, 'orderid', 'pred', ascending=False)
29 |     result = result[result['rank']<3][['orderid', 'geohashed_end_loc', 'rank']]
30 |     result = result.set_index(['orderid', 'rank']).unstack()
31 |     result.reset_index(inplace=True)
32 |     result.columns = ['orderid', 0, 1, 2]
33 |     return result
34 | 
35 | # 评估函数
36 | def map_score(result):
37 |     '''
38 |         result: orderid, 0, 1, 2, label
39 |     '''
40 |     data = result.copy()
41 |     acc1 = sum(data['label'] == data[0]) # 第一个位置上正确的个数
42 |     acc2 = sum(data['label'] == data[1]) # 第二个位置上正确的个数
43 |     acc3 = sum(data['label'] == data[2]) # 第三个位置上正确的个数
44 |     score = (acc1+acc2/2+acc3/3)/data.shape[0]
45 |     return score, acc1, acc2, acc3, data.shape[0]
46 | 
47 | # 预测结果
48 | def predict(data, feat, model):
49 |     data.loc[:, 'pred'] = model.predict(data[feat])
50 |     res = reshape(data)
51 |     res.fillna('0', inplace=True)
52 |     return res
53 | 
54 | # 获取分数
55 | def get_score(data, feat, model, opt):
56 |     res = predict(data, feat, model)
57 |     res = get_label(res, opt)
58 |     score = map_score(res)
59 |     return score
60 | 


--------------------------------------------------------------------------------
/utils/helper.py:
--------------------------------------------------------------------------------
 1 | # -*- coding:utf-8 -*-
 2 | import numpy as np
 3 | import pandas as pd
 4 | import pickle
 5 | 
 6 | # 计算相差的分钟数
 7 | def diff_of_minutes(time1, time2):
 8 |     d = {'5': 0, '6': 31, }
 9 |     try:
10 |         days = (d[time1[6]] + int(time1[8:10])) - (d[time2[6]] + int(time2[8:10]))
11 |         try:
12 |             minutes1 = int(time1[11:13]) * 60 + int(time1[14:16])
13 |         except:
14 |             minutes1 = 0
15 |         try:
16 |             minutes2 = int(time2[11:13]) * 60 + int(time2[14:16])
17 |         except:
18 |             minutes2 = 0
19 |         return (days * 1440 - minutes2 + minutes1)
20 |     except:
21 |         return np.nan
22 | 
23 | # 计算两点之间距离
24 | def haversine(lat1, lng1, lat2, lng2):
25 |     """function to calculate haversine distance between two co-ordinates"""
26 |     lat1, lng1, lat2, lng2 = map(np.radians, (lat1, lng1, lat2, lng2))
27 |     AVG_EARTH_RADIUS = 6371  # in km
28 |     lat = lat2 - lat1
29 |     lng = lng2 - lng1
30 |     d = np.sin(lat * 0.5) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(lng * 0.5) ** 2
31 |     h = 2 * AVG_EARTH_RADIUS * np.arcsin(np.sqrt(d))
32 |     return(h)
33 | 
34 | def manhattan(lat1, lng1, lat2, lng2):
35 |     """function to calculate manhatten distance between pick_drop"""
36 |     a = haversine(lat1, lng1, lat1, lng2)
37 |     b = haversine(lat1, lng1, lat2, lng1)
38 |     return a + b
39 | 
40 | # 计算两个经纬度之间的距离
41 | def cal_distance(lat1,lon1,lat2,lon2):
42 |     dx = np.abs(lon1 - lon2)
43 |     dy = np.abs(lat1 - lat2)
44 |     b = (lat1 + lat2) / 2.0
45 |     Lx = 6371004.0 * (dx / 57.2958) * np.cos(b / 57.2958)
46 |     Ly = 6371004.0 * (dy / 57.2958)
47 |     L = (Lx**2 + Ly**2) ** 0.5
48 |     return L
49 | 
50 | # 计算两个经纬度之间的方向角
51 | def bearing_array(lat1, lng1, lat2, lng2):
52 |     """ function was taken from beluga's notebook as this function works on array
53 |     while my function used to work on individual elements and was noticably slow"""
54 |     AVG_EARTH_RADIUS = 6371  # in km
55 |     lng_delta_rad = np.radians(lng2 - lng1)
56 |     lat1, lng1, lat2, lng2 = map(np.radians, (lat1, lng1, lat2, lng2))
57 |     y = np.sin(lng_delta_rad) * np.cos(lat2)
58 |     x = np.cos(lat1) * np.sin(lat2) - np.sin(lat1) * np.cos(lat2) * np.cos(lng_delta_rad)
59 |     return np.degrees(np.arctan2(y, x))
60 | 
61 | # 分组排序
62 | def rank(data, feat1, feat2, rank_name='rank', ascending=True):
63 |     if type(feat1) == list: feat = feat1 + [feat2]
64 |     else: feat = [feat1, feat2]
65 |     use_feat = list(set(feat + ['orderid', 'geohashed_end_loc']))
66 |     datatmp = data[use_feat]
67 |     datatmp.sort_values(feat, inplace=True, ascending=ascending)
68 |     datatmp[rank_name] = range(datatmp.shape[0])
69 |     min_rank = datatmp.groupby(feat1, as_index=False)[rank_name].agg({'min_rank': 'min'})
70 |     datatmp = pd.merge(datatmp, min_rank, on=feat1, how='left')
71 |     datatmp[rank_name] = datatmp[rank_name] - datatmp['min_rank']
72 |     data = pd.merge(data, datatmp[['orderid', 'geohashed_end_loc', rank_name]], on=['orderid', 'geohashed_end_loc'], how='left')
73 |     # del data['min_rank']
74 |     return data
75 | 
76 | # 载入模型
77 | def load_model(opt):
78 |     with open('{}/{}'.format(opt['model_dir'], opt['model_name']), 'rb') as fin:
79 |         gbm = pickle.load(fin)
80 |     use_feat = gbm.feature_name()
81 |     print('载入模型成功：', len(use_feat), use_feat)
82 |     return gbm, use_feat
83 | 


--------------------------------------------------------------------------------