├── LICENSE ├── Modelling_approach.pdf ├── Readme.md ├── calculate_features.py ├── environment.yml ├── predict.py ├── requirements.txt ├── scale.py └── split_test.py /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Belinda Trotta 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Modelling_approach.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/btrotta/kaggle-plasticc/910a6798da337854fd9aeeebc3ee713dfe6c22a7/Modelling_approach.pdf -------------------------------------------------------------------------------- /Readme.md: -------------------------------------------------------------------------------- 1 | # kaggle-plasticc 2 | 3 | Code for the 14th place solution in the Kaggle PLAsTiCC competition. 4 | 5 | See `Modelling_approach.pdf` for a detailed discussion of the modelling approach. 6 | 7 | #### Quick-start guide to running the code 8 | 9 | Total runtime is around 5.5 hours on a 24 Gb laptop. 10 | 11 | - Download the code. Create a subfolder called `data` and save the csv files there. 12 | 13 | - To reproduce the results exactly, create an environment with the specific 14 | package versions I used. (If you already have numpy, pandas, scikit-learn 15 | and lightgbm you can skip this 16 | step, but the results may differ slightly if you have different versions.) If you have conda, the 17 | easiest option is to 18 | build a conda environment using this command: 19 | ``` 20 | conda env create environment.yml 21 | ``` 22 | This will create an environment called `plasticc-bt`. 23 | The `requirements.txt` file is provided as well if you want to build an environment with pip. 24 | 25 | - Run `split_test.py` to split the test data into 100 hdf5 files. They will 26 | be saved in an automatically created subfolder `split_100` of the `data` folder. Takes around 15 minutes. 27 | 28 | - Run `calculate_features.py` to calculate the features. This will generate 3 files in a folder called 29 | `features` (the folder is created automatically). Takes around 3.5 hours. 30 | 31 | - Run `predict.py` to train the model and make predictions on the test set. Takes around 1.5 hours. 32 | 33 | - Run `scale.py` to apply regularisation to the class 99 predictions and generate the final submission file. 34 | Takes a couple of minutes. 
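All intermediate outputs are plain pandas HDF5 files, so they are easy to inspect. For example, a minimal sketch (assuming `calculate_features.py` has already been run from the repository root):
```
import pandas as pd

# feature table for the combined training and test metadata, written by calculate_features.py
features = pd.read_hdf('data/features/all_data.hdf5', key='file0')
print(features.shape)
print(features.columns.tolist()[:20])
```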
-------------------------------------------------------------------------------- /calculate_features.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import gc 4 | import os 5 | 6 | 7 | # read data 8 | col_dict = {'mjd': np.float64, 'flux': np.float32, 'flux_err': np.float32, 'object_id': np.int32, 'passband': np.int8, 9 | 'detected': np.int8} 10 | train_meta = pd.read_csv(os.path.join('data', 'training_set_metadata.csv')) 11 | train = pd.read_csv(os.path.join('data', 'training_set.csv'), dtype=col_dict) 12 | 13 | 14 | def calc_aggs(all_data, exact): 15 | 16 | # Normalise the flux, following the Bayesian approach here: 17 | # https://www.statlect.com/fundamentals-of-statistics/normal-distribution-Bayesian-estimation 18 | # Similar idea (but not the same) as the normalisation done in the Starter Kit 19 | # https://www.kaggle.com/michaelapers/the-plasticc-astronomy-starter-kit?scriptVersionId=6040398 20 | prior_mean = all_data.groupby(['object_id', 'passband'])['flux'].transform('mean') 21 | prior_std = all_data.groupby(['object_id', 'passband'])['flux'].transform('std') 22 | prior_std.loc[prior_std.isnull()] = all_data.loc[prior_std.isnull(), 'flux_err'] 23 | obs_std = all_data['flux_err'] # since the above kernel tells us that the flux error is the 68% confidence interval 24 | all_data['bayes_flux'] = (all_data['flux'] / obs_std**2 + prior_mean / prior_std**2) \ 25 | / (1 / obs_std**2 + 1 / prior_std**2) 26 | all_data.loc[all_data['bayes_flux'].notnull(), 'flux'] \ 27 | = all_data.loc[all_data['bayes_flux'].notnull(), 'bayes_flux'] 28 | 29 | # Estimate the flux at source, using the fact that light is proportional 30 | # to inverse square of distance from source. 31 | # This is hinted at here: https://www.kaggle.com/c/PLAsTiCC-2018/discussion/70725#417195 32 | redshift = all_meta.set_index('object_id')[['hostgal_specz', 'hostgal_photoz']] 33 | if exact: 34 | redshift['redshift'] = redshift['hostgal_specz'] 35 | redshift.loc[redshift['redshift'].isnull(), 'redshift'] \ 36 | = redshift.loc[redshift['redshift'].isnull(), 'hostgal_photoz'] 37 | else: 38 | redshift['redshift'] = redshift['hostgal_photoz'] 39 | all_data = pd.merge(all_data, redshift, 'left', 'object_id') 40 | nonzero_redshift = all_data['redshift'] > 0 41 | all_data.loc[nonzero_redshift, 'flux'] = all_data.loc[nonzero_redshift, 'flux'] \ 42 | * all_data.loc[nonzero_redshift, 'redshift']**2 43 | 44 | # aggregate features 45 | band_aggs = all_data.groupby(['object_id', 'passband'])['flux'].agg(['mean', 'std', 'max', 'min']).unstack(-1) 46 | band_aggs.columns = [x + '_' + str(y) for x in band_aggs.columns.levels[0] 47 | for y in band_aggs.columns.levels[1]] 48 | all_data.sort_values(['object_id', 'passband', 'flux'], inplace=True) 49 | # this way of calculating quantiles is faster than using the pandas quantile builtin on the groupby object 50 | all_data['group_count'] = all_data.groupby(['object_id', 'passband']).cumcount() 51 | all_data['group_size'] = all_data.groupby(['object_id', 'passband'])['flux'].transform('size') 52 | q_list = [0.25, 0.75] 53 | for q in q_list: 54 | all_data['q_' + str(q)] = all_data.loc[ 55 | (all_data['group_size'] * q).astype(int) == all_data['group_count'], 'flux'] 56 | quantiles = all_data.groupby(['object_id', 'passband'])[['q_' + str(q) for q in q_list]].max().unstack(-1) 57 | quantiles.columns = [str(x) + '_' + str(y) + '_quantile' for x in quantiles.columns.levels[0] 58 | for y in quantiles.columns.levels[1]] 59 
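    # A note on two of the computations above:
    # - 'bayes_flux' is the precision-weighted average of each observation and its per-object,
    #   per-passband mean, i.e. the posterior mean of a normal model whose observation variance is
    #   flux_err**2 and whose prior is the band's empirical mean and std. For example, flux=100,
    #   flux_err=10, prior mean=20, prior std=30 gives (100/100 + 20/900) / (1/100 + 1/900) = 92:
    #   the observation is pulled towards the band mean, more strongly the larger flux_err is
    #   relative to the band's spread.
    # - The quantile columns are built by sorting each (object_id, passband) group by flux and
    #   keeping the row whose rank equals int(group_size * q); for a group of 20 points and q=0.25
    #   that is the 6th-smallest flux. Exactly one row per group satisfies the condition, so the
    #   groupby max simply extracts that single non-null value; the result is close to, though not
    #   interpolated like, the pandas quantile builtin.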
| 60 | # max detected flux 61 | max_detected = all_data.loc[all_data['detected'] == 1].groupby('object_id')['flux'].max().to_frame('max_detected') 62 | 63 | def most_extreme(df_in, k, positive=True, suffix='', include_max=True, include_dur=True, include_interval=False): 64 | # find the "most extreme" time for each object, and for each band, retrieve the k data points on either side 65 | # k points before 66 | df = df_in.copy() 67 | df['object_passband_mean'] = df.groupby(['object_id', 'passband'])['flux'].transform('median') 68 | if positive: 69 | df['dist_from_mean'] = (df['flux'] - df['object_passband_mean']) 70 | else: 71 | df['dist_from_mean'] = -(df['flux'] - df['object_passband_mean']) 72 | 73 | max_time = df.loc[df['detected'] == 1].groupby('object_id')['dist_from_mean'].idxmax().to_frame( 74 | 'max_ind') 75 | max_time['mjd_max' + suffix] = df.loc[max_time['max_ind'].values, 'mjd'].values 76 | df = pd.merge(df, max_time[['mjd_max' + suffix]], 'left', left_on=['object_id'], right_index=True) 77 | df['time_after_mjd_max'] = df['mjd'] - df['mjd_max' + suffix] 78 | df['time_before_mjd_max'] = -df['time_after_mjd_max'] 79 | 80 | # first k after event 81 | df.sort_values(['object_id', 'passband', 'time_after_mjd_max'], inplace=True) 82 | df['row_num_after'] = df.loc[df['time_after_mjd_max'] >= 0].groupby( 83 | ['object_id', 'passband']).cumcount() 84 | first_k_after = df.loc[(df['row_num_after'] < k) & (df['time_after_mjd_max'] <= 50), 85 | ['object_id', 'passband', 'flux', 'row_num_after']] 86 | first_k_after.set_index(['object_id', 'passband', 'row_num_after'], inplace=True) 87 | first_k_after = first_k_after.unstack(level=-1).unstack(level=-1) 88 | first_k_after.columns = [str(x) + '_' + str(y) + '_after' for x in first_k_after.columns.levels[1] 89 | for y in first_k_after.columns.levels[2]] 90 | extreme_data = first_k_after 91 | time_bands = [[-50, -20], [-20, -10], [-10, 0], [0, 10], [10, 20], [20, 50], [50, 100], [100, 200], [200, 500]] 92 | if include_interval: 93 | interval_arr = [] 94 | for start, end in time_bands: 95 | band_data = df.loc[(start <= df['time_after_mjd_max']) & (df['time_after_mjd_max'] <= end)] 96 | interval_agg = band_data.groupby(['object_id', 'passband'])['flux'].mean().unstack(-1) 97 | interval_agg.columns = ['{}_start_{}_end_{}'.format(c, start, end) for c in interval_agg.columns] 98 | interval_arr.append(interval_agg) 99 | interval_data = pd.concat(interval_arr, axis=1) 100 | extreme_data = pd.concat([extreme_data, interval_data], axis=1) 101 | if include_dur: 102 | # detection duration in each passband after event 103 | duration_after = df.loc[(df['time_after_mjd_max'] >= 0) & (df['detected'] == 0)] \ 104 | .groupby(['object_id', 'passband'])['time_after_mjd_max'].first().unstack(-1) 105 | duration_after.columns = ['dur_after_' + str(c) for c in range(6)] 106 | extreme_data = pd.concat([extreme_data, duration_after], axis=1) 107 | 108 | # last k before event 109 | df.sort_values(['object_id', 'passband', 'time_before_mjd_max'], inplace=True) 110 | df['row_num_before'] = df.loc[df['time_before_mjd_max'] >= 0].groupby( 111 | ['object_id', 'passband']).cumcount() 112 | first_k_before = df.loc[(df['row_num_before'] < k) & (df['time_after_mjd_max'] <= 50), 113 | ['object_id', 'passband', 'flux', 'row_num_before']] 114 | first_k_before.set_index(['object_id', 'passband', 'row_num_before'], inplace=True) 115 | first_k_before = first_k_before.unstack(level=-1).unstack(level=-1) 116 | first_k_before.columns = [str(x) + '_' + str(y) + '_before' for x in 
first_k_before.columns.levels[1] 117 | for y in first_k_before.columns.levels[2]] 118 | extreme_data = pd.concat([extreme_data, first_k_before], axis=1) 119 | if include_dur: 120 | # detection duration in each passband before event 121 | duration_before = df.loc[(df['time_before_mjd_max'] >= 0) & (df['detected'] == 0)] \ 122 | .groupby(['object_id', 'passband'])['time_before_mjd_max'].first().unstack(-1) 123 | duration_before.columns = ['dur_before_' + str(c) for c in range(6)] 124 | extreme_data = pd.concat([extreme_data, duration_before], axis=1) 125 | 126 | if include_max: 127 | # passband with maximum detected flux for each object 128 | max_pb = df.loc[max_time['max_ind'].values].groupby('object_id')['passband'].max().to_frame( 129 | 'max_passband') 130 | # time of max in each passband, relative to extreme max 131 | band_max_ind = df.groupby(['object_id', 'passband'])['flux'].idxmax() 132 | band_mjd_max = df.loc[band_max_ind.values].groupby(['object_id', 'passband'])['mjd'].max().unstack(-1) 133 | cols = ['max_time_' + str(i) for i in range(6)] 134 | band_mjd_max.columns = cols 135 | band_mjd_max = pd.merge(band_mjd_max, max_time, 'left', 'object_id') 136 | for c in cols: 137 | band_mjd_max[c] -= band_mjd_max['mjd_max' + suffix] 138 | band_mjd_max.drop(['mjd_max' + suffix, 'max_ind'], axis=1, inplace=True) 139 | extreme_data = pd.concat([extreme_data, max_pb, band_mjd_max], axis=1) 140 | 141 | extreme_data.columns = [c + suffix for c in extreme_data.columns] 142 | return extreme_data 143 | 144 | extreme_max = most_extreme(all_data, 1, positive=True, suffix='', include_max=True, include_dur=True, 145 | include_interval=True) 146 | extreme_min = most_extreme(all_data, 1, positive=False, suffix='_min', include_max=False, include_dur=True) 147 | 148 | # add the feature mentioned here, attempts to identify periodicity: 149 | # https://www.kaggle.com/c/PLAsTiCC-2018/discussion/69696#410538 150 | time_between_detections = all_data.loc[all_data['detected'] == 1].groupby('object_id')['mjd'].agg(['max', 'min']) 151 | time_between_detections['det_period'] = time_between_detections['max'] - time_between_detections['min'] 152 | # same feature but grouped by passband 153 | time_between_detections_pb \ 154 | = all_data.loc[all_data['detected'] == 1].groupby(['object_id', 'passband'])['mjd'].agg(['max', 'min']) 155 | time_between_detections_pb['det_period'] = time_between_detections_pb['max'] - time_between_detections_pb['min'] 156 | time_between_detections_pb = time_between_detections_pb['det_period'].unstack(-1) 157 | time_between_detections_pb.columns = ['det_period_pb_' + str(i) for i in range(6)] 158 | # similar feature based on high values 159 | all_data['threshold'] = all_data.groupby(['object_id'])['flux'].transform('max') * 0.75 160 | all_data['high'] = ((all_data['flux'] >= all_data['threshold']) & (all_data['detected'] == 1)).astype(int) 161 | time_between_highs = all_data.loc[all_data['high'] == 1].groupby('object_id')['mjd'].agg(['max', 'min']) 162 | time_between_highs['det_period_high'] = time_between_highs['max'] - time_between_highs['min'] 163 | 164 | # aggregate values of the features during the detection period 165 | all_data = pd.merge(all_data, time_between_detections, 'left', 'object_id') 166 | det_data = all_data.loc[(all_data['mjd'] >= all_data['min']) & (all_data['mjd'] <= all_data['max'])] 167 | det_aggs = det_data.groupby(['object_id', 'passband'])['flux'].agg(['min', 'max', 'std', 'median']) 168 | det_aggs['prop_detected'] = det_data.groupby(['object_id', 
'passband'])['detected'].mean() 169 | det_aggs = det_aggs.unstack(-1) 170 | det_aggs.columns = [x + '_' + str(y) + '_det_period' for x in det_aggs.columns.levels[0] 171 | for y in det_aggs.columns.levels[1]] 172 | 173 | # time distribution of detections in each band 174 | detection_time_dist \ 175 | = all_data.loc[all_data['detected'] == 1].groupby(['object_id', 'passband'])['mjd'].std().unstack(-1) 176 | detection_time_dist.columns = ['time_dist_' + str(i) for i in range(6)] 177 | detection_time_dist_all \ 178 | = all_data.loc[all_data['detected'] == 1].groupby(['object_id'])['mjd'].std().to_frame('time_dist') 179 | 180 | # scale data and recalculate band aggs 181 | all_data['abs_flux'] = all_data['flux'].abs() 182 | all_data['flux'] = (all_data['flux']) / all_data.groupby('object_id')['abs_flux'].transform('max') 183 | band_aggs_s = all_data.groupby(['object_id', 'passband'])['flux'].agg(['mean', 'std', 'max', 'min']).unstack(-1) 184 | band_aggs_s.columns = [x + '_' + str(y) + '_scaled' for x in band_aggs_s.columns.levels[0] 185 | for y in band_aggs_s.columns.levels[1]] 186 | all_data.sort_values(['object_id', 'passband', 'flux'], inplace=True) 187 | for q in q_list: 188 | all_data['q_' + str(q)] = all_data.loc[ 189 | (all_data['group_size'] * q).astype(int) == all_data['group_count'], 'flux'] 190 | quantiles_s = all_data.groupby(['object_id', 'passband'])[['q_' + str(q) for q in q_list]].max().unstack(-1) 191 | quantiles_s.columns = [str(x) + '_' + str(y) + '_quantile_s' for x in quantiles_s.columns.levels[0] 192 | for y in quantiles_s.columns.levels[1]] 193 | 194 | extreme_max_s = most_extreme(all_data, 1, positive=True, suffix='_s', include_max=False, include_dur=False, 195 | include_interval=True) 196 | extreme_min_s = most_extreme(all_data, 1, positive=False, suffix='_min_s', include_max=False, include_dur=False) 197 | 198 | new_data = pd.concat([band_aggs, quantiles, band_aggs_s, max_detected, time_between_detections[['det_period']], 199 | time_between_detections_pb, extreme_max, extreme_min, extreme_max_s, extreme_min_s, 200 | time_between_highs[['det_period_high']], quantiles_s, detection_time_dist, 201 | detection_time_dist_all, det_aggs], axis=1) 202 | return new_data 203 | 204 | 205 | # get the metadata 206 | test_meta = pd.read_csv(os.path.join('data', 'test_set_metadata.csv')) 207 | all_meta = pd.concat([train_meta, test_meta], axis=0, ignore_index=True, sort=True).reset_index() 208 | all_meta.drop('index', axis=1, inplace=True) 209 | n_chunks = 100 210 | 211 | # calculate features 212 | new_data_exact = calc_aggs(train.copy(), True) 213 | new_data_approx = calc_aggs(train.copy(), False) 214 | train_meta_exact = pd.merge(train_meta, new_data_exact, 'left', left_on='object_id', right_index=True) 215 | train_meta_approx = pd.merge(train_meta, new_data_approx, 'left', left_on='object_id', right_index=True) 216 | 217 | # process training set (not actually used, just to get right shape of dataframe) 218 | new_data_arr = [] 219 | new_data_arr.append(calc_aggs(train.copy(), True)) 220 | # process test set 221 | for i in range(n_chunks): 222 | df = pd.read_hdf(os.path.join('data', 'split_{}'.format(n_chunks), 'chunk_{}.hdf5'.format(i)), key='file0') 223 | df.drop('index', axis=1, inplace=True) 224 | print('Read chunk {}'.format(i)) 225 | new_data_arr.append(calc_aggs(df.copy(), True)) 226 | print('Calculated features for chunk {}'.format(i)) 227 | del df 228 | gc.collect() 229 | new_data = pd.concat(new_data_arr, axis=0, sort=True) 230 | 231 | # merge 232 | all_meta = 
pd.merge(all_meta, new_data, 'left', left_on='object_id', right_index=True) 233 | 234 | # write output 235 | dir_name = 'features' 236 | if not os.path.exists(os.path.join('data', dir_name)): 237 | os.mkdir(os.path.join('data', dir_name)) 238 | all_meta.to_hdf(os.path.join('data', dir_name, 'all_data.hdf5'), key='file0') 239 | train_meta_exact.to_hdf(os.path.join('data', dir_name, 'train_meta_exact.hdf5'), key='file0') 240 | train_meta_approx.to_hdf(os.path.join('data', dir_name, 'train_meta_approx.hdf5'), key='file0') 241 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: plasticc-bt 2 | dependencies: 3 | - python=3.6.5 4 | - numpy=1.14.3 5 | - matplotlib=2.2.2 6 | - pytables=3.4.3 7 | - pandas=0.23.0 8 | - scikit-learn=0.20.0 9 | - pip: 10 | - lightgbm==2.1.1 11 | -------------------------------------------------------------------------------- /predict.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from sklearn import metrics, model_selection 4 | import lightgbm as lgb 5 | import os 6 | 7 | 8 | # if test_mode is True, just run training and cross-validation on training data; 9 | # if False, also make predictions on test set 10 | test_mode = False 11 | 12 | # read data 13 | all_meta = pd.read_hdf(os.path.join('data', 'features', 'all_data.hdf5'), key='file0') 14 | train_meta_approx = pd.read_hdf(os.path.join('data', 'features', 'train_meta_approx.hdf5'), key='file0') 15 | train_meta_exact = pd.read_hdf(os.path.join('data', 'features', 'train_meta_exact.hdf5'), key='file0') 16 | 17 | # map classes to range [0, 14] 18 | classes = np.sort(all_meta.loc[all_meta['target'].notnull(), 'target'].unique().astype(int)) 19 | # Train separate models for galatic and extra-galactic, since these classes contain disjoint sets of objects, 20 | # and can be distinguished by whether hostgl_photoz == 0, as observed here: 21 | # https://www.kaggle.com/kyleboone/naive-benchmark-galactic-vs-extragalactic?scriptVersionId=6104036# 22 | galactic_bool = all_meta['hostgal_photoz'] == 0 23 | exact_bool = all_meta['hostgal_photoz'].notnull().astype(int) 24 | galactic_classes = np.sort(all_meta.loc[all_meta['target'].notnull() & galactic_bool, 'target'].unique().astype(int)) 25 | non_galactic_classes = np.sort( 26 | all_meta.loc[all_meta['target'].notnull() & ~galactic_bool, 'target'].unique().astype(int)) 27 | # transform the target so the classes are the integers range(num_classes) 28 | for df in [all_meta, train_meta_approx, train_meta_exact]: 29 | df['target_trans'] = np.nan 30 | df['target_trans_galactic'] = np.nan 31 | df['target_trans_non_galactic'] = np.nan 32 | for k, class_list in enumerate([classes, galactic_classes, non_galactic_classes]): 33 | if k == 0: 34 | suffix = '' 35 | elif k == 1: 36 | suffix = '_galactic' 37 | else: 38 | suffix = '_non_galactic' 39 | for i in range(len(class_list)): 40 | df.loc[df['target'] == class_list[i], 'target_trans' + suffix] = i 41 | 42 | # train 2 models for each class, one for when we have exact redshift, and another for when we don't 43 | train_cols_exact_redshift \ 44 | = [c for c in train_meta_exact.columns if 45 | c not in ['object_id', 'ra', 'decl', 'gal_l', 'gal_b', 'target', 'target_trans', 'target_trans_galactic', 46 | 'target_trans_non_galactic', 'ddf', 47 | 'distmod', 'mwebv', 'hostgal_photoz', 'hostgal_photoz_err', 'index']] 48 | 
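# The 'exact' feature set above keeps hostgal_specz and drops the photometric-redshift columns, while
# the 'approx' set below keeps hostgal_photoz (and its error) and drops hostgal_specz, so each model
# only sees the redshift information that will be available for its objects at prediction time.
# Identifiers, positions, the DDF flag and other metadata (object_id, ra/decl, gal_l/gal_b, ddf,
# distmod, mwebv, index) are excluded from both, along with the target columns.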
train_cols_approx_redshift \ 49 | = [c for c in train_meta_approx.columns if 50 | c not in ['object_id', 'ra', 'decl', 'gal_l', 'gal_b', 'target', 'target_trans', 'target_trans_galactic', 51 | 'target_trans_non_galactic', 'ddf', 52 | 'distmod', 'mwebv', 'hostgal_specz', 'index']] 53 | 54 | # separate parameters for galactic and non-galactic 55 | params_galactic = {'boosting_type': 'gbdt', 'application': 'binary', 'num_leaves': 32, 'seed': 0, 'verbose': -1, 56 | 'min_data_in_leaf': 1, 'bagging_fraction': 0.8, 'bagging_freq': 1, 'lambda_l1': 0, 'lambda_l2': 1, 57 | 'learning_rate': 0.02} 58 | params_non_galactic = {'boosting_type': 'gbdt', 'application': 'binary', 'num_leaves': 16, 'seed': 0, 'verbose': -1, 59 | 'min_data_in_leaf': 1, 'bagging_fraction': 0.8, 'bagging_freq': 1, 'lambda_l1': 0, 60 | 'lambda_l2': 1, 'learning_rate': 0.02} 61 | num_rounds = 3000 62 | 63 | # cross-validate on train set, and measure distribution of out-of-sample predicted values 64 | train_err_exact = [] 65 | test_err_exact = [] 66 | train_err_approx = [] 67 | test_err_approx = [] 68 | cv = model_selection.KFold(5, shuffle=True, random_state=4) 69 | galactic_bool_train = train_meta_exact['hostgal_photoz'] == 0 70 | train_meta_exact['predict_max_exact'] = 0 71 | train_meta_exact['predict_max_approx'] = 0 72 | train_meta_approx['predict_max_exact'] = 0 73 | train_meta_approx['predict_max_approx'] = 0 74 | predict_cols = ['class_' + str(c) for c in classes] 75 | train_prediction_exact \ 76 | = pd.DataFrame(np.zeros((len(train_meta_exact), 14)), index=train_meta_exact.index, columns=predict_cols) 77 | train_prediction_approx \ 78 | = pd.DataFrame(np.zeros((len(train_meta_exact), 14)), index=train_meta_exact.index, columns=predict_cols) 79 | eval_prediction_exact \ 80 | = pd.DataFrame(np.zeros((len(train_meta_exact), 14)), index=train_meta_exact.index, columns=predict_cols) 81 | eval_prediction_approx \ 82 | = pd.DataFrame(np.zeros((len(train_meta_exact), 14)), index=train_meta_exact.index, columns=predict_cols) 83 | importance = {} 84 | best_iter_exact = {c: [] for c in classes} 85 | best_iter_approx = {c: [] for c in classes} 86 | # Evaluate accuracy on resampled training set having similar distribution to test. The data note says 87 | # "The training data are mostly composed of nearby, low-redshift, brighter objects while the test data contain 88 | # more distant (higher redshift) and fainter objects." So we resample to achieve a similar distribution of 89 | # hostgal_photoz. 
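# The block below builds those weights: in the 'galactic' column, galactic training objects get
# weight 1 and everything else 0; in the 'non_galactic' column, each non-galactic, non-DDF training
# object is weighted by (share of non-galactic test objects in its 0.1-wide hostgal_photoz band)
# divided by (share of non-galactic, non-DDF training objects in that band), so the weighted
# training distribution of photo-z matches the test distribution. DDF objects get weight 0 in both
# columns. The weights are used for the early-stopping validation sets in the CV loop below and,
# later, to resample the out-of-fold predictions when calibrating the class 99 estimate.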
90 | train_bool = all_meta['target'].notnull() 91 | ddf = all_meta['ddf'] == 1 92 | w = pd.DataFrame(index=train_meta_exact.index, columns=['galactic', 'non_galactic']) 93 | w['galactic'] = galactic_bool_train.astype(int) 94 | w['non_galactic'] = np.nan 95 | bands = np.arange(all_meta.loc[~train_bool, 'hostgal_photoz'].min(), 96 | all_meta.loc[~train_bool, 'hostgal_photoz'].max() + 0.00001, 0.1) 97 | for i in range(len(bands[:-1])): 98 | band_bool = ~galactic_bool_train & ~ddf & (train_meta_exact['hostgal_photoz'] >= bands[i]) \ 99 | & (train_meta_exact['hostgal_photoz'] <= bands[i + 1]) 100 | train_prop = band_bool.sum() / (~galactic_bool_train & ~ddf).sum() 101 | test_prop = ((all_meta.loc[~train_bool & ~galactic_bool, 'hostgal_photoz'] >= bands[i]) 102 | & (all_meta.loc[~train_bool & ~galactic_bool, 'hostgal_photoz'] <= bands[i + 1])).sum() \ 103 | / (~train_bool & ~galactic_bool).sum() 104 | w.loc[band_bool, 'non_galactic'] = test_prop / train_prop 105 | w.loc[ddf] = 0 106 | for train_ind, test_ind in list(cv.split(train_meta_exact.index, train_meta_exact['target_trans'])): 107 | train_bool = train_meta_exact.index.isin(train_ind) 108 | ddf = train_meta_exact['ddf'] == 1 109 | 110 | for i, c in enumerate(classes): 111 | g = c in galactic_classes 112 | gal_bool_train_curr = galactic_bool_train == g 113 | params = params_galactic if g else params_non_galactic 114 | col = 'class_' + str(c) 115 | weight_col = 'galactic' if g else 'non_galactic' 116 | 117 | # exact redshift model 118 | lgb_train = lgb.Dataset(train_meta_exact.loc[train_bool & gal_bool_train_curr, train_cols_exact_redshift], 119 | label=(train_meta_exact.loc[train_bool & gal_bool_train_curr, 'target'] == c).astype(int)) 120 | lgb_valid = lgb.Dataset(train_meta_exact.loc[(~train_bool) & gal_bool_train_curr & ~ddf, train_cols_exact_redshift], 121 | label=(train_meta_exact.loc[(~train_bool) & gal_bool_train_curr & ~ddf, 'target'] == c).astype(int), 122 | weight=w.loc[(~train_bool) & gal_bool_train_curr & ~ddf, weight_col]) 123 | est = lgb.train(train_set=lgb_train, valid_sets=[lgb_train, lgb_valid], valid_names=['train', 'valid'], 124 | params=params, num_boost_round=num_rounds, early_stopping_rounds=100) 125 | best_iter_exact[c].append(est.best_iteration) 126 | train_prediction_exact.loc[~train_bool & gal_bool_train_curr, col] = est.predict( 127 | train_meta_exact.loc[(~train_bool) & gal_bool_train_curr, train_cols_exact_redshift], 128 | num_iteration=est.best_iteration) 129 | # measure errors on train and test 130 | eval_prediction_exact.loc[gal_bool_train_curr, col] \ 131 | = est.predict(train_meta_exact.loc[gal_bool_train_curr, train_cols_exact_redshift], 132 | num_iteration=est.best_iteration) 133 | 134 | # approx redshift models 135 | lgb_train = lgb.Dataset(train_meta_approx.loc[train_bool & gal_bool_train_curr, train_cols_approx_redshift], 136 | label=(train_meta_approx.loc[train_bool & gal_bool_train_curr, 'target'] == c).astype(int)) 137 | lgb_valid = lgb.Dataset(train_meta_approx.loc[(~train_bool) & gal_bool_train_curr & ~ddf, train_cols_approx_redshift], 138 | label=(train_meta_approx.loc[(~train_bool) & gal_bool_train_curr & ~ddf, 'target'] == c).astype(int), 139 | weight=w.loc[(~train_bool) & gal_bool_train_curr & ~ddf, weight_col]) 140 | est = lgb.train(train_set=lgb_train, valid_sets=[lgb_train, lgb_valid], valid_names=['train', 'valid'], 141 | params=params, num_boost_round=num_rounds, early_stopping_rounds=100) 142 | best_iter_approx[c].append(est.best_iteration) 143 | train_prediction_approx.loc[(~train_bool) 
& gal_bool_train_curr, col] = est.predict( 144 | train_meta_approx.loc[(~train_bool) & gal_bool_train_curr, train_cols_approx_redshift], 145 | num_iteration=est.best_iteration) 146 | # measure errors on train and test 147 | eval_prediction_approx.loc[gal_bool_train_curr, col] \ 148 | = est.predict(train_meta_approx.loc[gal_bool_train_curr, train_cols_approx_redshift], 149 | num_iteration=est.best_iteration) 150 | 151 | imp_arr = est.feature_importance() 152 | importance[g] = {c: imp_arr[i] for i, c in enumerate(train_cols_approx_redshift)} 153 | 154 | # fill nulls 155 | for df in [train_prediction_exact, train_prediction_approx, eval_prediction_exact, eval_prediction_approx]: 156 | df[predict_cols] = df[predict_cols].fillna(0) 157 | 158 | # scale so columns add to 1 159 | for df in [eval_prediction_exact, eval_prediction_approx]: 160 | col_sum = df.sum(axis=1) 161 | for c in df.columns: 162 | df[c] /= col_sum 163 | 164 | train_err_exact.append(metrics.log_loss(train_meta_exact.loc[train_bool & ~ddf, 'target_trans'], 165 | eval_prediction_exact.loc[train_bool & ~ddf, predict_cols])) 166 | test_err_exact.append(metrics.log_loss(train_meta_exact.loc[~train_bool & ~ddf, 'target_trans'], 167 | eval_prediction_exact.loc[~train_bool & ~ddf, predict_cols])) 168 | print('Train exact error: ', train_err_exact) 169 | print('Test exact error: ', test_err_exact) 170 | train_err_approx.append(metrics.log_loss(train_meta_approx.loc[train_bool & ~ddf, 'target_trans'], 171 | eval_prediction_approx.loc[train_bool & ~ddf, predict_cols])) 172 | test_err_approx.append(metrics.log_loss(train_meta_approx.loc[~train_bool & ~ddf, 'target_trans'], 173 | eval_prediction_approx.loc[~train_bool & ~ddf, predict_cols])) 174 | print('Train approx error: ', train_err_approx) 175 | print('Test approx error: ', test_err_approx) 176 | 177 | with open('log.txt', 'w') as f: 178 | f.write('Train exact error: '.format(train_err_exact)) 179 | f.write('\nTest exact error: '.format(test_err_exact)) 180 | f.write('\nTrain approx error: {}'.format(train_err_approx)) 181 | f.write('\nTest approx error: {}'.format(test_err_approx)) 182 | f.write('\nMean exact train error: {}'.format(np.mean(train_err_exact))) 183 | f.write('\nMean exact test error: {}'.format(np.mean(test_err_exact))) 184 | f.write('\nMean approx train error: {}'.format(np.mean(train_err_approx))) 185 | f.write('\nMean approx test error: {}'.format(np.mean(test_err_approx))) 186 | 187 | if not test_mode: 188 | prediction = pd.DataFrame(np.zeros((len(all_meta), 14)), columns=predict_cols, index=all_meta.index) 189 | exact_bool = all_meta['hostgal_specz'].notnull() 190 | train_bool = all_meta['target'].notnull() 191 | 192 | for i, c in enumerate(classes): 193 | g = c in galactic_classes 194 | gal_bool_train_curr = galactic_bool_train == g 195 | galactic_bool_curr = galactic_bool == g 196 | params = params_galactic if g else params_non_galactic 197 | col = 'class_' + str(c) 198 | 199 | if not g: 200 | # model for exact redshift, only needed for non-galactic objects, since all galactic objects in the 201 | # test set have approx data 202 | lgb_train = lgb.Dataset(train_meta_exact.loc[gal_bool_train_curr, train_cols_exact_redshift], 203 | label=(train_meta_exact.loc[gal_bool_train_curr, 'target'] == c).astype(int)) 204 | est_exact_redshift = lgb.train(train_set=lgb_train, valid_sets=[lgb_train], valid_names=['train'], 205 | params=params, num_boost_round=int(np.max(best_iter_exact[c]))) 206 | prediction.loc[exact_bool & galactic_bool_curr, col] \ 207 | = 
est_exact_redshift.predict(all_meta.loc[exact_bool & galactic_bool_curr, train_cols_exact_redshift]) 208 | 209 | # model for approx redshift 210 | lgb_train = lgb.Dataset(train_meta_approx.loc[gal_bool_train_curr, train_cols_approx_redshift], 211 | label=(train_meta_approx.loc[gal_bool_train_curr, 'target'] == c).astype(int)) 212 | est_approx_redshift = lgb.train(train_set=lgb_train, valid_sets=[lgb_train], valid_names=['train'], 213 | params=params, num_boost_round=int(np.max(best_iter_approx[c]))) 214 | prediction.loc[~exact_bool & galactic_bool_curr, col] \ 215 | = est_approx_redshift.predict(all_meta.loc[~exact_bool & galactic_bool_curr, train_cols_approx_redshift]) 216 | 217 | # fill nulls 218 | prediction[predict_cols] = prediction[predict_cols].fillna(0) 219 | 220 | # We will calculate the probability that the object is class 99 using 1 - max(other columns). But this is an 221 | # overestimate, since the max is always less than 1, even in the training set. So adjust for this by comparing to 222 | # the max in the training set, resampled as before to account for the different distribution. Different 223 | # distributions for each combination of galactic/non-galactic and approx/exact. All galactic objects have approx 224 | # redshift, so only need three combinations. First add the target column. 225 | train_prediction_exact['target'] = train_meta_exact['target'] 226 | train_prediction_approx['target'] = train_meta_approx['target'] 227 | train_approx_galactic_resample \ 228 | = train_prediction_approx.sample(n=10000, weights=w['galactic'], replace=True, random_state=0) 229 | train_exact_non_galactic_resample \ 230 | = train_prediction_exact.sample(n=10000, weights=w['non_galactic'], replace=True, random_state=0) 231 | train_approx_non_galactic_resample \ 232 | = train_prediction_approx.sample(n=10000, weights=w['non_galactic'], replace=True, random_state=0) 233 | 234 | # predict class 99 235 | prediction['class_99'] = 1 - prediction[predict_cols].max(axis=1) 236 | # adjust as described above 237 | for exact in [True, False]: 238 | for g in [True, False]: 239 | if exact and g: 240 | continue 241 | prediction_ind = ~train_bool & (exact_bool == exact) & (galactic_bool == g) 242 | if exact: 243 | train_avg_max = train_exact_non_galactic_resample[predict_cols].max(axis=1).mean() 244 | else: 245 | if g: 246 | train_avg_max = train_approx_galactic_resample[predict_cols].max(axis=1).mean() 247 | else: 248 | train_avg_max = train_approx_non_galactic_resample[predict_cols].max(axis=1).mean() 249 | if (1 - train_avg_max) < prediction.loc[prediction_ind, 'class_99'].mean(): 250 | old_avg = prediction.loc[prediction_ind, 'class_99'].mean() 251 | new_avg = old_avg - (1 - train_avg_max) 252 | prediction.loc[prediction_ind, 'class_99'] *= new_avg / old_avg 253 | 254 | # write output 255 | prediction[predict_cols + ['class_99']] = prediction[predict_cols + ['class_99']] 256 | prediction['object_id'] = all_meta['object_id'] 257 | filename = 'Prediction_raw.csv' 258 | prediction.loc[~train_bool, ['object_id'] + predict_cols + ['class_99']].to_csv(filename, index=False, header=True) 259 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | certifi==2018.11.29 2 | cycler==0.10.0 3 | kiwisolver==1.0.1 4 | lightgbm==2.1.1 5 | matplotlib==2.2.2 6 | mkl-fft==1.0.6 7 | mkl-random==1.0.1 8 | numexpr==2.6.8 9 | numpy==1.14.3 10 | pandas==0.23.0 11 | pyparsing==2.3.0 12 | 
python-dateutil==2.7.5 13 | pytz==2018.7 14 | scikit-learn==0.20.0 15 | scipy==1.1.0 16 | six==1.12.0 17 | tables==3.4.3 18 | tornado==5.1.1 19 | wincertstore==0.2 20 | -------------------------------------------------------------------------------- /scale.py: -------------------------------------------------------------------------------- 1 | """Scale raw predictions to add to 1 and apply regularisation.""" 2 | import pandas as pd 3 | import numpy as np 4 | import datetime as dt 5 | import os 6 | 7 | 8 | # read raw predictions 9 | prediction = pd.read_csv('Prediction_raw.csv', dtype={'object_id': np.int32}) 10 | col_dict = {'mjd': np.float64, 'flux': np.float32, 'flux_err': np.float32, 'object_id': np.int32, 'passband': np.int8, 11 | 'detected': np.int8} 12 | test_meta = pd.read_csv(os.path.join('data', 'test_set_metadata.csv'), dtype=col_dict) 13 | test_meta['galactic'] = test_meta['hostgal_photoz'] == 0 14 | test_meta['exact'] = test_meta['hostgal_specz'].notnull() 15 | prediction = pd.merge(prediction, test_meta[['object_id', 'galactic', 'exact']], 'left', 'object_id') 16 | 17 | # Regularise class 99 prediction 18 | # Use separate mean for galactic/non-galactic and approx/exact, since the predicted averages are different 19 | # (which also accords with the claims in the following thread that class 99 occurs much less frequently for galactic 20 | # objects compared to extra-galactic): 21 | # https://www.kaggle.com/c/PLAsTiCC-2018/discussion/68943 22 | alpha = 0.5 # regularisation parameter, between 0 and 1; small alpha = more regularisation 23 | mean_99 = prediction.groupby(['exact', 'galactic'])['class_99'].transform('mean') 24 | prediction['class_99'] \ 25 | = mean_99 + alpha * (prediction['class_99'] - mean_99) 26 | prediction.drop(['exact', 'galactic'], axis=1, inplace=True) 27 | 28 | # scale so remaining columns sum to 1 - Pr(class_99) 29 | predict_cols = [c for c in prediction.columns if (c not in ['object_id', 'class_99'])] 30 | predict_sum = prediction[predict_cols].sum(axis=1) 31 | for c in predict_cols: 32 | prediction[c] *= (1 - prediction['class_99']) / predict_sum 33 | 34 | # calculate the weights 35 | # losses from https://www.kaggle.com/c/PLAsTiCC-2018/discussion/67194 36 | losses = [32.62, 30.702, 32.62, 32.62, 32.62, 32.622, 32.62, 30.692, 32.62, 32.62, 32.62, 32.62, 32.62, 32.62, 30.701] 37 | # Assume WLOG that the weight of class 0 is 1, and use the above losses to calculate the other weights 38 | p_min = np.log(10 ** -15) 39 | # It can be shown that if classes c, d have losses L_c, L_d, then the ratio of their weights is 40 | # w_c / w_d = (L_c + p_min) / (L_d + p_max) 41 | w = [(loss + p_min) / (32.62 + p_min) for loss in losses] 42 | weights = pd.DataFrame(w, columns=['W']) 43 | # (As described in the following post, a close approximation is given by setting all weights to 1, except for 44 | # classes 15, 64, 99 which have weight 2; this is consistent with the weights found above) 45 | # https://www.kaggle.com/c/PLAsTiCC-2018/discussion/67194#397153 46 | 47 | # estimate proportions of class 99 48 | weights['N'] = prediction[predict_cols + ['class_99']].mean().values 49 | weights['N'] /= weights['N'].sum() 50 | 51 | # scale using the class weights 52 | for i, c in enumerate(predict_cols + ['class_99']): 53 | prediction[c] *= weights.loc[i, 'W'] / weights.loc[i, 'N'] 54 | 55 | # scale so columns sum to 1 56 | predict_sum = prediction[predict_cols + ['class_99']].sum(axis=1) 57 | for c in predict_cols + ['class_99']: 58 | prediction[c] /= predict_sum 59 | 60 | # write 
output 61 | prediction[predict_cols + ['class_99']] = prediction[predict_cols + ['class_99']].astype(np.float16) 62 | filename = 'Submission_alpha_{}_{}.csv'.format(alpha, dt.datetime.now().strftime('%y%m%d_%H%M')) 63 | prediction[['object_id'] + predict_cols + ['class_99']].to_csv(filename, index=False, header=True) 64 | -------------------------------------------------------------------------------- /split_test.py: -------------------------------------------------------------------------------- 1 | """Split the test data into chunks, at object_id boundaries so that no object is split across files.""" 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import os 6 | 7 | n_chunks = 100 8 | if not os.path.exists(os.path.join('data', 'split_{}'.format(n_chunks))): 9 | os.mkdir(os.path.join('data', 'split_{}'.format(n_chunks))) 10 | 11 | col_dict = {'mjd': np.float64, 'flux': np.float32, 'flux_err': np.float32, 'object_id': np.int32, 'passband': np.int8, 12 | 'detected': np.int8} 13 | test = pd.read_csv(os.path.join('data', 'test_set.csv'), dtype=col_dict) 14 | test.sort_values('object_id', inplace=True) 15 | test = test.reset_index() 16 | test_len = len(test) 17 | 18 | id_diff = test.loc[test['object_id'].diff() != 0].index 19 | chunk_starts = [id_diff[int(len(id_diff) * i / n_chunks)] for i in range(n_chunks)] 20 | for i in range(n_chunks): 21 | if i == n_chunks - 1: 22 | end = len(test) 23 | else: 24 | end = chunk_starts[i + 1] 25 | test.iloc[chunk_starts[i]: end].to_hdf(os.path.join('data', 'split_{}'.format(n_chunks), 26 | 'chunk_{}.hdf5'.format(i)), key='file0') 27 | 28 | --------------------------------------------------------------------------------