├── LICENSE ├── Modelling_approach.pdf ├── Readme.md ├── calculate_features.py ├── environment.yml ├── predict.py ├── requirements.txt ├── scale.py └── split_test.py /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Belinda Trotta 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Modelling_approach.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/btrotta/kaggle-plasticc/910a6798da337854fd9aeeebc3ee713dfe6c22a7/Modelling_approach.pdf -------------------------------------------------------------------------------- /Readme.md: -------------------------------------------------------------------------------- 1 | # kaggle-plasticc 2 | 3 | Code for the 14th place solution in the Kaggle PLAsTiCC competition. 4 | 5 | See `Modelling_approach.pdf` for a detailed discussion of the modelling approach. 6 | 7 | #### Quick-start guide to running the code 8 | 9 | Total runtime is around 5.5 hours on a 24 Gb laptop. 10 | 11 | - Download the code. Create a subfolder called `data` and save the csv files there. 12 | 13 | - To reproduce the results exactly, create an environment with the specific 14 | package versions I used. (If you already have numpy, pandas, scikit-learn 15 | and lightgbm you can skip this 16 | step, but the results may differ slightly if you have different versions.) If you have conda, the 17 | easiest option is to 18 | build a conda environment using this command: 19 | ``` 20 | conda env create environment.yml 21 | ``` 22 | This will create an environment called `plasticc-bt`. 23 | The `requirements.txt` file is provided as well if you want to build an environment with pip. 24 | 25 | - Run `split_test.py` to split the test data into 100 hdf5 files. They will 26 | be saved in an automatically created subfolder `split_100` of the `data` folder. Takes around 15 minutes. 27 | 28 | - Run `calculate_features.py` to calculate the features. This will generate 3 files in a folder called 29 | `features` (the folder is created automatically). Takes around 3.5 hours. 30 | 31 | - Run `predict.py` to train the model and make predictions on the test set. Takes around 1.5 hours. 32 | 33 | - Run `scale.py` to apply regularisation to the class 99 predictions and generate the final submission file. 34 | Takes a couple of minutes. 
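All intermediate outputs are plain pandas HDF5 files, so they are easy to inspect. For example, a minimal sketch (assuming `calculate_features.py` has already been run from the repository root):
```
import pandas as pd

# feature table for the combined training and test metadata, written by calculate_features.py
features = pd.read_hdf('data/features/all_data.hdf5', key='file0')
print(features.shape)
print(features.columns.tolist()[:20])
```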
-------------------------------------------------------------------------------- /calculate_features.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import gc 4 | import os 5 | 6 | 7 | # read data 8 | col_dict = {'mjd': np.float64, 'flux': np.float32, 'flux_err': np.float32, 'object_id': np.int32, 'passband': np.int8, 9 | 'detected': np.int8} 10 | train_meta = pd.read_csv(os.path.join('data', 'training_set_metadata.csv')) 11 | train = pd.read_csv(os.path.join('data', 'training_set.csv'), dtype=col_dict) 12 | 13 | 14 | def calc_aggs(all_data, exact): 15 | 16 | # Normalise the flux, following the Bayesian approach here: 17 | # https://www.statlect.com/fundamentals-of-statistics/normal-distribution-Bayesian-estimation 18 | # Similar idea (but not the same) as the normalisation done in the Starter Kit 19 | # https://www.kaggle.com/michaelapers/the-plasticc-astronomy-starter-kit?scriptVersionId=6040398 20 | prior_mean = all_data.groupby(['object_id', 'passband'])['flux'].transform('mean') 21 | prior_std = all_data.groupby(['object_id', 'passband'])['flux'].transform('std') 22 | prior_std.loc[prior_std.isnull()] = all_data.loc[prior_std.isnull(), 'flux_err'] 23 | obs_std = all_data['flux_err'] # since the above kernel tells us that the flux error is the 68% confidence interval 24 | all_data['bayes_flux'] = (all_data['flux'] / obs_std**2 + prior_mean / prior_std**2) \ 25 | / (1 / obs_std**2 + 1 / prior_std**2) 26 | all_data.loc[all_data['bayes_flux'].notnull(), 'flux'] \ 27 | = all_data.loc[all_data['bayes_flux'].notnull(), 'bayes_flux'] 28 | 29 | # Estimate the flux at source, using the fact that light is proportional 30 | # to inverse square of distance from source. 31 | # This is hinted at here: https://www.kaggle.com/c/PLAsTiCC-2018/discussion/70725#417195 32 | redshift = all_meta.set_index('object_id')[['hostgal_specz', 'hostgal_photoz']] 33 | if exact: 34 | redshift['redshift'] = redshift['hostgal_specz'] 35 | redshift.loc[redshift['redshift'].isnull(), 'redshift'] \ 36 | = redshift.loc[redshift['redshift'].isnull(), 'hostgal_photoz'] 37 | else: 38 | redshift['redshift'] = redshift['hostgal_photoz'] 39 | all_data = pd.merge(all_data, redshift, 'left', 'object_id') 40 | nonzero_redshift = all_data['redshift'] > 0 41 | all_data.loc[nonzero_redshift, 'flux'] = all_data.loc[nonzero_redshift, 'flux'] \ 42 | * all_data.loc[nonzero_redshift, 'redshift']**2 43 | 44 | # aggregate features 45 | band_aggs = all_data.groupby(['object_id', 'passband'])['flux'].agg(['mean', 'std', 'max', 'min']).unstack(-1) 46 | band_aggs.columns = [x + '_' + str(y) for x in band_aggs.columns.levels[0] 47 | for y in band_aggs.columns.levels[1]] 48 | all_data.sort_values(['object_id', 'passband', 'flux'], inplace=True) 49 | # this way of calculating quantiles is faster than using the pandas quantile builtin on the groupby object 50 | all_data['group_count'] = all_data.groupby(['object_id', 'passband']).cumcount() 51 | all_data['group_size'] = all_data.groupby(['object_id', 'passband'])['flux'].transform('size') 52 | q_list = [0.25, 0.75] 53 | for q in q_list: 54 | all_data['q_' + str(q)] = all_data.loc[ 55 | (all_data['group_size'] * q).astype(int) == all_data['group_count'], 'flux'] 56 | quantiles = all_data.groupby(['object_id', 'passband'])[['q_' + str(q) for q in q_list]].max().unstack(-1) 57 | quantiles.columns = [str(x) + '_' + str(y) + '_quantile' for x in quantiles.columns.levels[0] 58 | for y in quantiles.columns.levels[1]] 59 
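    # A note on two of the computations above:
    # - 'bayes_flux' is the precision-weighted average of each observation and its per-object,
    #   per-passband mean, i.e. the posterior mean of a normal model whose observation variance is
    #   flux_err**2 and whose prior is the band's empirical mean and std. For example, flux=100,
    #   flux_err=10, prior mean=20, prior std=30 gives (100/100 + 20/900) / (1/100 + 1/900) = 92:
    #   the observation is pulled towards the band mean, more strongly the larger flux_err is
    #   relative to the band's spread.
    # - The quantile columns are built by sorting each (object_id, passband) group by flux and
    #   keeping the row whose rank equals int(group_size * q); for a group of 20 points and q=0.25
    #   that is the 6th-smallest flux. Exactly one row per group satisfies the condition, so the
    #   groupby max simply extracts that single non-null value; the result is close to, though not
    #   interpolated like, the pandas quantile builtin.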
| 60 | # max detected flux 61 | max_detected = all_data.loc[all_data['detected'] == 1].groupby('object_id')['flux'].max().to_frame('max_detected') 62 | 63 | def most_extreme(df_in, k, positive=True, suffix='', include_max=True, include_dur=True, include_interval=False): 64 | # find the "most extreme" time for each object, and for each band, retrieve the k data points on either side 65 | # k points before 66 | df = df_in.copy() 67 | df['object_passband_mean'] = df.groupby(['object_id', 'passband'])['flux'].transform('median') 68 | if positive: 69 | df['dist_from_mean'] = (df['flux'] - df['object_passband_mean']) 70 | else: 71 | df['dist_from_mean'] = -(df['flux'] - df['object_passband_mean']) 72 | 73 | max_time = df.loc[df['detected'] == 1].groupby('object_id')['dist_from_mean'].idxmax().to_frame( 74 | 'max_ind') 75 | max_time['mjd_max' + suffix] = df.loc[max_time['max_ind'].values, 'mjd'].values 76 | df = pd.merge(df, max_time[['mjd_max' + suffix]], 'left', left_on=['object_id'], right_index=True) 77 | df['time_after_mjd_max'] = df['mjd'] - df['mjd_max' + suffix] 78 | df['time_before_mjd_max'] = -df['time_after_mjd_max'] 79 | 80 | # first k after event 81 | df.sort_values(['object_id', 'passband', 'time_after_mjd_max'], inplace=True) 82 | df['row_num_after'] = df.loc[df['time_after_mjd_max'] >= 0].groupby( 83 | ['object_id', 'passband']).cumcount() 84 | first_k_after = df.loc[(df['row_num_after'] < k) & (df['time_after_mjd_max'] <= 50), 85 | ['object_id', 'passband', 'flux', 'row_num_after']] 86 | first_k_after.set_index(['object_id', 'passband', 'row_num_after'], inplace=True) 87 | first_k_after = first_k_after.unstack(level=-1).unstack(level=-1) 88 | first_k_after.columns = [str(x) + '_' + str(y) + '_after' for x in first_k_after.columns.levels[1] 89 | for y in first_k_after.columns.levels[2]] 90 | extreme_data = first_k_after 91 | time_bands = [[-50, -20], [-20, -10], [-10, 0], [0, 10], [10, 20], [20, 50], [50, 100], [100, 200], [200, 500]] 92 | if include_interval: 93 | interval_arr = [] 94 | for start, end in time_bands: 95 | band_data = df.loc[(start <= df['time_after_mjd_max']) & (df['time_after_mjd_max'] <= end)] 96 | interval_agg = band_data.groupby(['object_id', 'passband'])['flux'].mean().unstack(-1) 97 | interval_agg.columns = ['{}_start_{}_end_{}'.format(c, start, end) for c in interval_agg.columns] 98 | interval_arr.append(interval_agg) 99 | interval_data = pd.concat(interval_arr, axis=1) 100 | extreme_data = pd.concat([extreme_data, interval_data], axis=1) 101 | if include_dur: 102 | # detection duration in each passband after event 103 | duration_after = df.loc[(df['time_after_mjd_max'] >= 0) & (df['detected'] == 0)] \ 104 | .groupby(['object_id', 'passband'])['time_after_mjd_max'].first().unstack(-1) 105 | duration_after.columns = ['dur_after_' + str(c) for c in range(6)] 106 | extreme_data = pd.concat([extreme_data, duration_after], axis=1) 107 | 108 | # last k before event 109 | df.sort_values(['object_id', 'passband', 'time_before_mjd_max'], inplace=True) 110 | df['row_num_before'] = df.loc[df['time_before_mjd_max'] >= 0].groupby( 111 | ['object_id', 'passband']).cumcount() 112 | first_k_before = df.loc[(df['row_num_before'] < k) & (df['time_after_mjd_max'] <= 50), 113 | ['object_id', 'passband', 'flux', 'row_num_before']] 114 | first_k_before.set_index(['object_id', 'passband', 'row_num_before'], inplace=True) 115 | first_k_before = first_k_before.unstack(level=-1).unstack(level=-1) 116 | first_k_before.columns = [str(x) + '_' + str(y) + '_before' for x in 
first_k_before.columns.levels[1] 117 | for y in first_k_before.columns.levels[2]] 118 | extreme_data = pd.concat([extreme_data, first_k_before], axis=1) 119 | if include_dur: 120 | # detection duration in each passband before event 121 | duration_before = df.loc[(df['time_before_mjd_max'] >= 0) & (df['detected'] == 0)] \ 122 | .groupby(['object_id', 'passband'])['time_before_mjd_max'].first().unstack(-1) 123 | duration_before.columns = ['dur_before_' + str(c) for c in range(6)] 124 | extreme_data = pd.concat([extreme_data, duration_before], axis=1) 125 | 126 | if include_max: 127 | # passband with maximum detected flux for each object 128 | max_pb = df.loc[max_time['max_ind'].values].groupby('object_id')['passband'].max().to_frame( 129 | 'max_passband') 130 | # time of max in each passband, relative to extreme max 131 | band_max_ind = df.groupby(['object_id', 'passband'])['flux'].idxmax() 132 | band_mjd_max = df.loc[band_max_ind.values].groupby(['object_id', 'passband'])['mjd'].max().unstack(-1) 133 | cols = ['max_time_' + str(i) for i in range(6)] 134 | band_mjd_max.columns = cols 135 | band_mjd_max = pd.merge(band_mjd_max, max_time, 'left', 'object_id') 136 | for c in cols: 137 | band_mjd_max[c] -= band_mjd_max['mjd_max' + suffix] 138 | band_mjd_max.drop(['mjd_max' + suffix, 'max_ind'], axis=1, inplace=True) 139 | extreme_data = pd.concat([extreme_data, max_pb, band_mjd_max], axis=1) 140 | 141 | extreme_data.columns = [c + suffix for c in extreme_data.columns] 142 | return extreme_data 143 | 144 | extreme_max = most_extreme(all_data, 1, positive=True, suffix='', include_max=True, include_dur=True, 145 | include_interval=True) 146 | extreme_min = most_extreme(all_data, 1, positive=False, suffix='_min', include_max=False, include_dur=True) 147 | 148 | # add the feature mentioned here, attempts to identify periodicity: 149 | # https://www.kaggle.com/c/PLAsTiCC-2018/discussion/69696#410538 150 | time_between_detections = all_data.loc[all_data['detected'] == 1].groupby('object_id')['mjd'].agg(['max', 'min']) 151 | time_between_detections['det_period'] = time_between_detections['max'] - time_between_detections['min'] 152 | # same feature but grouped by passband 153 | time_between_detections_pb \ 154 | = all_data.loc[all_data['detected'] == 1].groupby(['object_id', 'passband'])['mjd'].agg(['max', 'min']) 155 | time_between_detections_pb['det_period'] = time_between_detections_pb['max'] - time_between_detections_pb['min'] 156 | time_between_detections_pb = time_between_detections_pb['det_period'].unstack(-1) 157 | time_between_detections_pb.columns = ['det_period_pb_' + str(i) for i in range(6)] 158 | # similar feature based on high values 159 | all_data['threshold'] = all_data.groupby(['object_id'])['flux'].transform('max') * 0.75 160 | all_data['high'] = ((all_data['flux'] >= all_data['threshold']) & (all_data['detected'] == 1)).astype(int) 161 | time_between_highs = all_data.loc[all_data['high'] == 1].groupby('object_id')['mjd'].agg(['max', 'min']) 162 | time_between_highs['det_period_high'] = time_between_highs['max'] - time_between_highs['min'] 163 | 164 | # aggregate values of the features during the detection period 165 | all_data = pd.merge(all_data, time_between_detections, 'left', 'object_id') 166 | det_data = all_data.loc[(all_data['mjd'] >= all_data['min']) & (all_data['mjd'] <= all_data['max'])] 167 | det_aggs = det_data.groupby(['object_id', 'passband'])['flux'].agg(['min', 'max', 'std', 'median']) 168 | det_aggs['prop_detected'] = det_data.groupby(['object_id', 
'passband'])['detected'].mean() 169 | det_aggs = det_aggs.unstack(-1) 170 | det_aggs.columns = [x + '_' + str(y) + '_det_period' for x in det_aggs.columns.levels[0] 171 | for y in det_aggs.columns.levels[1]] 172 | 173 | # time distribution of detections in each band 174 | detection_time_dist \ 175 | = all_data.loc[all_data['detected'] == 1].groupby(['object_id', 'passband'])['mjd'].std().unstack(-1) 176 | detection_time_dist.columns = ['time_dist_' + str(i) for i in range(6)] 177 | detection_time_dist_all \ 178 | = all_data.loc[all_data['detected'] == 1].groupby(['object_id'])['mjd'].std().to_frame('time_dist') 179 | 180 | # scale data and recalculate band aggs 181 | all_data['abs_flux'] = all_data['flux'].abs() 182 | all_data['flux'] = (all_data['flux']) / all_data.groupby('object_id')['abs_flux'].transform('max') 183 | band_aggs_s = all_data.groupby(['object_id', 'passband'])['flux'].agg(['mean', 'std', 'max', 'min']).unstack(-1) 184 | band_aggs_s.columns = [x + '_' + str(y) + '_scaled' for x in band_aggs_s.columns.levels[0] 185 | for y in band_aggs_s.columns.levels[1]] 186 | all_data.sort_values(['object_id', 'passband', 'flux'], inplace=True) 187 | for q in q_list: 188 | all_data['q_' + str(q)] = all_data.loc[ 189 | (all_data['group_size'] * q).astype(int) == all_data['group_count'], 'flux'] 190 | quantiles_s = all_data.groupby(['object_id', 'passband'])[['q_' + str(q) for q in q_list]].max().unstack(-1) 191 | quantiles_s.columns = [str(x) + '_' + str(y) + '_quantile_s' for x in quantiles_s.columns.levels[0] 192 | for y in quantiles_s.columns.levels[1]] 193 | 194 | extreme_max_s = most_extreme(all_data, 1, positive=True, suffix='_s', include_max=False, include_dur=False, 195 | include_interval=True) 196 | extreme_min_s = most_extreme(all_data, 1, positive=False, suffix='_min_s', include_max=False, include_dur=False) 197 | 198 | new_data = pd.concat([band_aggs, quantiles, band_aggs_s, max_detected, time_between_detections[['det_period']], 199 | time_between_detections_pb, extreme_max, extreme_min, extreme_max_s, extreme_min_s, 200 | time_between_highs[['det_period_high']], quantiles_s, detection_time_dist, 201 | detection_time_dist_all, det_aggs], axis=1) 202 | return new_data 203 | 204 | 205 | # get the metadata 206 | test_meta = pd.read_csv(os.path.join('data', 'test_set_metadata.csv')) 207 | all_meta = pd.concat([train_meta, test_meta], axis=0, ignore_index=True, sort=True).reset_index() 208 | all_meta.drop('index', axis=1, inplace=True) 209 | n_chunks = 100 210 | 211 | # calculate features 212 | new_data_exact = calc_aggs(train.copy(), True) 213 | new_data_approx = calc_aggs(train.copy(), False) 214 | train_meta_exact = pd.merge(train_meta, new_data_exact, 'left', left_on='object_id', right_index=True) 215 | train_meta_approx = pd.merge(train_meta, new_data_approx, 'left', left_on='object_id', right_index=True) 216 | 217 | # process training set (not actually used, just to get right shape of dataframe) 218 | new_data_arr = [] 219 | new_data_arr.append(calc_aggs(train.copy(), True)) 220 | # process test set 221 | for i in range(n_chunks): 222 | df = pd.read_hdf(os.path.join('data', 'split_{}'.format(n_chunks), 'chunk_{}.hdf5'.format(i)), key='file0') 223 | df.drop('index', axis=1, inplace=True) 224 | print('Read chunk {}'.format(i)) 225 | new_data_arr.append(calc_aggs(df.copy(), True)) 226 | print('Calculated features for chunk {}'.format(i)) 227 | del df 228 | gc.collect() 229 | new_data = pd.concat(new_data_arr, axis=0, sort=True) 230 | 231 | # merge 232 | all_meta = 
pd.merge(all_meta, new_data, 'left', left_on='object_id', right_index=True) 233 | 234 | # write output 235 | dir_name = 'features' 236 | if not os.path.exists(os.path.join('data', dir_name)): 237 | os.mkdir(os.path.join('data', dir_name)) 238 | all_meta.to_hdf(os.path.join('data', dir_name, 'all_data.hdf5'), key='file0') 239 | train_meta_exact.to_hdf(os.path.join('data', dir_name, 'train_meta_exact.hdf5'), key='file0') 240 | train_meta_approx.to_hdf(os.path.join('data', dir_name, 'train_meta_approx.hdf5'), key='file0') 241 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: plasticc-bt 2 | dependencies: 3 | - python=3.6.5 4 | - numpy=1.14.3 5 | - matplotlib=2.2.2 6 | - pytables=3.4.3 7 | - pandas=0.23.0 8 | - scikit-learn=0.20.0 9 | - pip: 10 | - lightgbm==2.1.1 11 | -------------------------------------------------------------------------------- /predict.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from sklearn import metrics, model_selection 4 | import lightgbm as lgb 5 | import os 6 | 7 | 8 | # if test_mode is True, just run training and cross-validation on training data; 9 | # if False, also make predictions on test set 10 | test_mode = False 11 | 12 | # read data 13 | all_meta = pd.read_hdf(os.path.join('data', 'features', 'all_data.hdf5'), key='file0') 14 | train_meta_approx = pd.read_hdf(os.path.join('data', 'features', 'train_meta_approx.hdf5'), key='file0') 15 | train_meta_exact = pd.read_hdf(os.path.join('data', 'features', 'train_meta_exact.hdf5'), key='file0') 16 | 17 | # map classes to range [0, 14] 18 | classes = np.sort(all_meta.loc[all_meta['target'].notnull(), 'target'].unique().astype(int)) 19 | # Train separate models for galatic and extra-galactic, since these classes contain disjoint sets of objects, 20 | # and can be distinguished by whether hostgl_photoz == 0, as observed here: 21 | # https://www.kaggle.com/kyleboone/naive-benchmark-galactic-vs-extragalactic?scriptVersionId=6104036# 22 | galactic_bool = all_meta['hostgal_photoz'] == 0 23 | exact_bool = all_meta['hostgal_photoz'].notnull().astype(int) 24 | galactic_classes = np.sort(all_meta.loc[all_meta['target'].notnull() & galactic_bool, 'target'].unique().astype(int)) 25 | non_galactic_classes = np.sort( 26 | all_meta.loc[all_meta['target'].notnull() & ~galactic_bool, 'target'].unique().astype(int)) 27 | # transform the target so the classes are the integers range(num_classes) 28 | for df in [all_meta, train_meta_approx, train_meta_exact]: 29 | df['target_trans'] = np.nan 30 | df['target_trans_galactic'] = np.nan 31 | df['target_trans_non_galactic'] = np.nan 32 | for k, class_list in enumerate([classes, galactic_classes, non_galactic_classes]): 33 | if k == 0: 34 | suffix = '' 35 | elif k == 1: 36 | suffix = '_galactic' 37 | else: 38 | suffix = '_non_galactic' 39 | for i in range(len(class_list)): 40 | df.loc[df['target'] == class_list[i], 'target_trans' + suffix] = i 41 | 42 | # train 2 models for each class, one for when we have exact redshift, and another for when we don't 43 | train_cols_exact_redshift \ 44 | = [c for c in train_meta_exact.columns if 45 | c not in ['object_id', 'ra', 'decl', 'gal_l', 'gal_b', 'target', 'target_trans', 'target_trans_galactic', 46 | 'target_trans_non_galactic', 'ddf', 47 | 'distmod', 'mwebv', 'hostgal_photoz', 'hostgal_photoz_err', 'index']] 48 | 
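# The 'exact' feature set above keeps hostgal_specz and drops the photometric-redshift columns, while
# the 'approx' set below keeps hostgal_photoz (and its error) and drops hostgal_specz, so each model
# only sees the redshift information that will be available for its objects at prediction time.
# Identifiers, positions, the DDF flag and other metadata (object_id, ra/decl, gal_l/gal_b, ddf,
# distmod, mwebv, index) are excluded from both, along with the target columns.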
train_cols_approx_redshift \ 49 | = [c for c in train_meta_approx.columns if 50 | c not in ['object_id', 'ra', 'decl', 'gal_l', 'gal_b', 'target', 'target_trans', 'target_trans_galactic', 51 | 'target_trans_non_galactic', 'ddf', 52 | 'distmod', 'mwebv', 'hostgal_specz', 'index']] 53 | 54 | # separate parameters for galactic and non-galactic 55 | params_galactic = {'boosting_type': 'gbdt', 'application': 'binary', 'num_leaves': 32, 'seed': 0, 'verbose': -1, 56 | 'min_data_in_leaf': 1, 'bagging_fraction': 0.8, 'bagging_freq': 1, 'lambda_l1': 0, 'lambda_l2': 1, 57 | 'learning_rate': 0.02} 58 | params_non_galactic = {'boosting_type': 'gbdt', 'application': 'binary', 'num_leaves': 16, 'seed': 0, 'verbose': -1, 59 | 'min_data_in_leaf': 1, 'bagging_fraction': 0.8, 'bagging_freq': 1, 'lambda_l1': 0, 60 | 'lambda_l2': 1, 'learning_rate': 0.02} 61 | num_rounds = 3000 62 | 63 | # cross-validate on train set, and measure distribution of out-of-sample predicted values 64 | train_err_exact = [] 65 | test_err_exact = [] 66 | train_err_approx = [] 67 | test_err_approx = [] 68 | cv = model_selection.KFold(5, shuffle=True, random_state=4) 69 | galactic_bool_train = train_meta_exact['hostgal_photoz'] == 0 70 | train_meta_exact['predict_max_exact'] = 0 71 | train_meta_exact['predict_max_approx'] = 0 72 | train_meta_approx['predict_max_exact'] = 0 73 | train_meta_approx['predict_max_approx'] = 0 74 | predict_cols = ['class_' + str(c) for c in classes] 75 | train_prediction_exact \ 76 | = pd.DataFrame(np.zeros((len(train_meta_exact), 14)), index=train_meta_exact.index, columns=predict_cols) 77 | train_prediction_approx \ 78 | = pd.DataFrame(np.zeros((len(train_meta_exact), 14)), index=train_meta_exact.index, columns=predict_cols) 79 | eval_prediction_exact \ 80 | = pd.DataFrame(np.zeros((len(train_meta_exact), 14)), index=train_meta_exact.index, columns=predict_cols) 81 | eval_prediction_approx \ 82 | = pd.DataFrame(np.zeros((len(train_meta_exact), 14)), index=train_meta_exact.index, columns=predict_cols) 83 | importance = {} 84 | best_iter_exact = {c: [] for c in classes} 85 | best_iter_approx = {c: [] for c in classes} 86 | # Evaluate accuracy on resampled training set having similar distribution to test. The data note says 87 | # "The training data are mostly composed of nearby, low-redshift, brighter objects while the test data contain 88 | # more distant (higher redshift) and fainter objects." So we resample to achieve a similar distribution of 89 | # hostgal_photoz. 
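# The block below builds those weights: in the 'galactic' column, galactic training objects get
# weight 1 and everything else 0; in the 'non_galactic' column, each non-galactic, non-DDF training
# object is weighted by (share of non-galactic test objects in its 0.1-wide hostgal_photoz band)
# divided by (share of non-galactic, non-DDF training objects in that band), so the weighted
# training distribution of photo-z matches the test distribution. DDF objects get weight 0 in both
# columns. The weights are used for the early-stopping validation sets in the CV loop below and,
# later, to resample the out-of-fold predictions when calibrating the class 99 estimate.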
90 | train_bool = all_meta['target'].notnull() 91 | ddf = all_meta['ddf'] == 1 92 | w = pd.DataFrame(index=train_meta_exact.index, columns=['galactic', 'non_galactic']) 93 | w['galactic'] = galactic_bool_train.astype(int) 94 | w['non_galactic'] = np.nan 95 | bands = np.arange(all_meta.loc[~train_bool, 'hostgal_photoz'].min(), 96 | all_meta.loc[~train_bool, 'hostgal_photoz'].max() + 0.00001, 0.1) 97 | for i in range(len(bands[:-1])): 98 | band_bool = ~galactic_bool_train & ~ddf & (train_meta_exact['hostgal_photoz'] >= bands[i]) \ 99 | & (train_meta_exact['hostgal_photoz'] <= bands[i + 1]) 100 | train_prop = band_bool.sum() / (~galactic_bool_train & ~ddf).sum() 101 | test_prop = ((all_meta.loc[~train_bool & ~galactic_bool, 'hostgal_photoz'] >= bands[i]) 102 | & (all_meta.loc[~train_bool & ~galactic_bool, 'hostgal_photoz'] <= bands[i + 1])).sum() \ 103 | / (~train_bool & ~galactic_bool).sum() 104 | w.loc[band_bool, 'non_galactic'] = test_prop / train_prop 105 | w.loc[ddf] = 0 106 | for train_ind, test_ind in list(cv.split(train_meta_exact.index, train_meta_exact['target_trans'])): 107 | train_bool = train_meta_exact.index.isin(train_ind) 108 | ddf = train_meta_exact['ddf'] == 1 109 | 110 | for i, c in enumerate(classes): 111 | g = c in galactic_classes 112 | gal_bool_train_curr = galactic_bool_train == g 113 | params = params_galactic if g else params_non_galactic 114 | col = 'class_' + str(c) 115 | weight_col = 'galactic' if g else 'non_galactic' 116 | 117 | # exact redshift model 118 | lgb_train = lgb.Dataset(train_meta_exact.loc[train_bool & gal_bool_train_curr, train_cols_exact_redshift], 119 | label=(train_meta_exact.loc[train_bool & gal_bool_train_curr, 'target'] == c).astype(int)) 120 | lgb_valid = lgb.Dataset(train_meta_exact.loc[(~train_bool) & gal_bool_train_curr & ~ddf, train_cols_exact_redshift], 121 | label=(train_meta_exact.loc[(~train_bool) & gal_bool_train_curr & ~ddf, 'target'] == c).astype(int), 122 | weight=w.loc[(~train_bool) & gal_bool_train_curr & ~ddf, weight_col]) 123 | est = lgb.train(train_set=lgb_train, valid_sets=[lgb_train, lgb_valid], valid_names=['train', 'valid'], 124 | params=params, num_boost_round=num_rounds, early_stopping_rounds=100) 125 | best_iter_exact[c].append(est.best_iteration) 126 | train_prediction_exact.loc[~train_bool & gal_bool_train_curr, col] = est.predict( 127 | train_meta_exact.loc[(~train_bool) & gal_bool_train_curr, train_cols_exact_redshift], 128 | num_iteration=est.best_iteration) 129 | # measure errors on train and test 130 | eval_prediction_exact.loc[gal_bool_train_curr, col] \ 131 | = est.predict(train_meta_exact.loc[gal_bool_train_curr, train_cols_exact_redshift], 132 | num_iteration=est.best_iteration) 133 | 134 | # approx redshift models 135 | lgb_train = lgb.Dataset(train_meta_approx.loc[train_bool & gal_bool_train_curr, train_cols_approx_redshift], 136 | label=(train_meta_approx.loc[train_bool & gal_bool_train_curr, 'target'] == c).astype(int)) 137 | lgb_valid = lgb.Dataset(train_meta_approx.loc[(~train_bool) & gal_bool_train_curr & ~ddf, train_cols_approx_redshift], 138 | label=(train_meta_approx.loc[(~train_bool) & gal_bool_train_curr & ~ddf, 'target'] == c).astype(int), 139 | weight=w.loc[(~train_bool) & gal_bool_train_curr & ~ddf, weight_col]) 140 | est = lgb.train(train_set=lgb_train, valid_sets=[lgb_train, lgb_valid], valid_names=['train', 'valid'], 141 | params=params, num_boost_round=num_rounds, early_stopping_rounds=100) 142 | best_iter_approx[c].append(est.best_iteration) 143 | train_prediction_approx.loc[(~train_bool) 
& gal_bool_train_curr, col] = est.predict( 144 | train_meta_approx.loc[(~train_bool) & gal_bool_train_curr, train_cols_approx_redshift], 145 | num_iteration=est.best_iteration) 146 | # measure errors on train and test 147 | eval_prediction_approx.loc[gal_bool_train_curr, col] \ 148 | = est.predict(train_meta_approx.loc[gal_bool_train_curr, train_cols_approx_redshift], 149 | num_iteration=est.best_iteration) 150 | 151 | imp_arr = est.feature_importance() 152 | importance[g] = {c: imp_arr[i] for i, c in enumerate(train_cols_approx_redshift)} 153 | 154 | # fill nulls 155 | for df in [train_prediction_exact, train_prediction_approx, eval_prediction_exact, eval_prediction_approx]: 156 | df[predict_cols] = df[predict_cols].fillna(0) 157 | 158 | # scale so columns add to 1 159 | for df in [eval_prediction_exact, eval_prediction_approx]: 160 | col_sum = df.sum(axis=1) 161 | for c in df.columns: 162 | df[c] /= col_sum 163 | 164 | train_err_exact.append(metrics.log_loss(train_meta_exact.loc[train_bool & ~ddf, 'target_trans'], 165 | eval_prediction_exact.loc[train_bool & ~ddf, predict_cols])) 166 | test_err_exact.append(metrics.log_loss(train_meta_exact.loc[~train_bool & ~ddf, 'target_trans'], 167 | eval_prediction_exact.loc[~train_bool & ~ddf, predict_cols])) 168 | print('Train exact error: ', train_err_exact) 169 | print('Test exact error: ', test_err_exact) 170 | train_err_approx.append(metrics.log_loss(train_meta_approx.loc[train_bool & ~ddf, 'target_trans'], 171 | eval_prediction_approx.loc[train_bool & ~ddf, predict_cols])) 172 | test_err_approx.append(metrics.log_loss(train_meta_approx.loc[~train_bool & ~ddf, 'target_trans'], 173 | eval_prediction_approx.loc[~train_bool & ~ddf, predict_cols])) 174 | print('Train approx error: ', train_err_approx) 175 | print('Test approx error: ', test_err_approx) 176 | 177 | with open('log.txt', 'w') as f: 178 | f.write('Train exact error: '.format(train_err_exact)) 179 | f.write('\nTest exact error: '.format(test_err_exact)) 180 | f.write('\nTrain approx error: {}'.format(train_err_approx)) 181 | f.write('\nTest approx error: {}'.format(test_err_approx)) 182 | f.write('\nMean exact train error: {}'.format(np.mean(train_err_exact))) 183 | f.write('\nMean exact test error: {}'.format(np.mean(test_err_exact))) 184 | f.write('\nMean approx train error: {}'.format(np.mean(train_err_approx))) 185 | f.write('\nMean approx test error: {}'.format(np.mean(test_err_approx))) 186 | 187 | if not test_mode: 188 | prediction = pd.DataFrame(np.zeros((len(all_meta), 14)), columns=predict_cols, index=all_meta.index) 189 | exact_bool = all_meta['hostgal_specz'].notnull() 190 | train_bool = all_meta['target'].notnull() 191 | 192 | for i, c in enumerate(classes): 193 | g = c in galactic_classes 194 | gal_bool_train_curr = galactic_bool_train == g 195 | galactic_bool_curr = galactic_bool == g 196 | params = params_galactic if g else params_non_galactic 197 | col = 'class_' + str(c) 198 | 199 | if not g: 200 | # model for exact redshift, only needed for non-galactic objects, since all galactic objects in the 201 | # test set have approx data 202 | lgb_train = lgb.Dataset(train_meta_exact.loc[gal_bool_train_curr, train_cols_exact_redshift], 203 | label=(train_meta_exact.loc[gal_bool_train_curr, 'target'] == c).astype(int)) 204 | est_exact_redshift = lgb.train(train_set=lgb_train, valid_sets=[lgb_train], valid_names=['train'], 205 | params=params, num_boost_round=int(np.max(best_iter_exact[c]))) 206 | prediction.loc[exact_bool & galactic_bool_curr, col] \ 207 | = 
est_exact_redshift.predict(all_meta.loc[exact_bool & galactic_bool_curr, train_cols_exact_redshift]) 208 | 209 | # model for approx redshift 210 | lgb_train = lgb.Dataset(train_meta_approx.loc[gal_bool_train_curr, train_cols_approx_redshift], 211 | label=(train_meta_approx.loc[gal_bool_train_curr, 'target'] == c).astype(int)) 212 | est_approx_redshift = lgb.train(train_set=lgb_train, valid_sets=[lgb_train], valid_names=['train'], 213 | params=params, num_boost_round=int(np.max(best_iter_approx[c]))) 214 | prediction.loc[~exact_bool & galactic_bool_curr, col] \ 215 | = est_approx_redshift.predict(all_meta.loc[~exact_bool & galactic_bool_curr, train_cols_approx_redshift]) 216 | 217 | # fill nulls 218 | prediction[predict_cols] = prediction[predict_cols].fillna(0) 219 | 220 | # We will calculate the probability that the object is class 99 using 1 - max(other columns). But this is an 221 | # overestimate, since the max is always less than 1, even in the training set. So adjust for this by comparing to 222 | # the max in the training set, resampled as before to account for the different distribution. Different 223 | # distributions for each combination of galactic/non-galactic and approx/exact. All galactic objects have approx 224 | # redshift, so only need three combinations. First add the target column. 225 | train_prediction_exact['target'] = train_meta_exact['target'] 226 | train_prediction_approx['target'] = train_meta_approx['target'] 227 | train_approx_galactic_resample \ 228 | = train_prediction_approx.sample(n=10000, weights=w['galactic'], replace=True, random_state=0) 229 | train_exact_non_galactic_resample \ 230 | = train_prediction_exact.sample(n=10000, weights=w['non_galactic'], replace=True, random_state=0) 231 | train_approx_non_galactic_resample \ 232 | = train_prediction_approx.sample(n=10000, weights=w['non_galactic'], replace=True, random_state=0) 233 | 234 | # predict class 99 235 | prediction['class_99'] = 1 - prediction[predict_cols].max(axis=1) 236 | # adjust as described above 237 | for exact in [True, False]: 238 | for g in [True, False]: 239 | if exact and g: 240 | continue 241 | prediction_ind = ~train_bool & (exact_bool == exact) & (galactic_bool == g) 242 | if exact: 243 | train_avg_max = train_exact_non_galactic_resample[predict_cols].max(axis=1).mean() 244 | else: 245 | if g: 246 | train_avg_max = train_approx_galactic_resample[predict_cols].max(axis=1).mean() 247 | else: 248 | train_avg_max = train_approx_non_galactic_resample[predict_cols].max(axis=1).mean() 249 | if (1 - train_avg_max) < prediction.loc[prediction_ind, 'class_99'].mean(): 250 | old_avg = prediction.loc[prediction_ind, 'class_99'].mean() 251 | new_avg = old_avg - (1 - train_avg_max) 252 | prediction.loc[prediction_ind, 'class_99'] *= new_avg / old_avg 253 | 254 | # write output 255 | prediction[predict_cols + ['class_99']] = prediction[predict_cols + ['class_99']] 256 | prediction['object_id'] = all_meta['object_id'] 257 | filename = 'Prediction_raw.csv' 258 | prediction.loc[~train_bool, ['object_id'] + predict_cols + ['class_99']].to_csv(filename, index=False, header=True) 259 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | certifi==2018.11.29 2 | cycler==0.10.0 3 | kiwisolver==1.0.1 4 | lightgbm==2.1.1 5 | matplotlib==2.2.2 6 | mkl-fft==1.0.6 7 | mkl-random==1.0.1 8 | numexpr==2.6.8 9 | numpy==1.14.3 10 | pandas==0.23.0 11 | pyparsing==2.3.0 12 | 
python-dateutil==2.7.5 13 | pytz==2018.7 14 | scikit-learn==0.20.0 15 | scipy==1.1.0 16 | six==1.12.0 17 | tables==3.4.3 18 | tornado==5.1.1 19 | wincertstore==0.2 20 | -------------------------------------------------------------------------------- /scale.py: -------------------------------------------------------------------------------- 1 | """Scale raw predictions to add to 1 and apply regularisation.""" 2 | import pandas as pd 3 | import numpy as np 4 | import datetime as dt 5 | import os 6 | 7 | 8 | # read raw predictions 9 | prediction = pd.read_csv('Prediction_raw.csv', dtype={'object_id': np.int32}) 10 | col_dict = {'mjd': np.float64, 'flux': np.float32, 'flux_err': np.float32, 'object_id': np.int32, 'passband': np.int8, 11 | 'detected': np.int8} 12 | test_meta = pd.read_csv(os.path.join('data', 'test_set_metadata.csv'), dtype=col_dict) 13 | test_meta['galactic'] = test_meta['hostgal_photoz'] == 0 14 | test_meta['exact'] = test_meta['hostgal_specz'].notnull() 15 | prediction = pd.merge(prediction, test_meta[['object_id', 'galactic', 'exact']], 'left', 'object_id') 16 | 17 | # Regularise class 99 prediction 18 | # Use separate mean for galactic/non-galactic and approx/exact, since the predicted averages are different 19 | # (which also accords with the claims in the following thread that class 99 occurs much less frequently for galactic 20 | # objects compared to extra-galactic): 21 | # https://www.kaggle.com/c/PLAsTiCC-2018/discussion/68943 22 | alpha = 0.5 # regularisation parameter, between 0 and 1; small alpha = more regularisation 23 | mean_99 = prediction.groupby(['exact', 'galactic'])['class_99'].transform('mean') 24 | prediction['class_99'] \ 25 | = mean_99 + alpha * (prediction['class_99'] - mean_99) 26 | prediction.drop(['exact', 'galactic'], axis=1, inplace=True) 27 | 28 | # scale so remaining columns sum to 1 - Pr(class_99) 29 | predict_cols = [c for c in prediction.columns if (c not in ['object_id', 'class_99'])] 30 | predict_sum = prediction[predict_cols].sum(axis=1) 31 | for c in predict_cols: 32 | prediction[c] *= (1 - prediction['class_99']) / predict_sum 33 | 34 | # calculate the weights 35 | # losses from https://www.kaggle.com/c/PLAsTiCC-2018/discussion/67194 36 | losses = [32.62, 30.702, 32.62, 32.62, 32.62, 32.622, 32.62, 30.692, 32.62, 32.62, 32.62, 32.62, 32.62, 32.62, 30.701] 37 | # Assume WLOG that the weight of class 0 is 1, and use the above losses to calculate the other weights 38 | p_min = np.log(10 ** -15) 39 | # It can be shown that if classes c, d have losses L_c, L_d, then the ratio of their weights is 40 | # w_c / w_d = (L_c + p_min) / (L_d + p_max) 41 | w = [(loss + p_min) / (32.62 + p_min) for loss in losses] 42 | weights = pd.DataFrame(w, columns=['W']) 43 | # (As described in the following post, a close approximation is given by setting all weights to 1, except for 44 | # classes 15, 64, 99 which have weight 2; this is consistent with the weights found above) 45 | # https://www.kaggle.com/c/PLAsTiCC-2018/discussion/67194#397153 46 | 47 | # estimate proportions of class 99 48 | weights['N'] = prediction[predict_cols + ['class_99']].mean().values 49 | weights['N'] /= weights['N'].sum() 50 | 51 | # scale using the class weights 52 | for i, c in enumerate(predict_cols + ['class_99']): 53 | prediction[c] *= weights.loc[i, 'W'] / weights.loc[i, 'N'] 54 | 55 | # scale so columns sum to 1 56 | predict_sum = prediction[predict_cols + ['class_99']].sum(axis=1) 57 | for c in predict_cols + ['class_99']: 58 | prediction[c] /= predict_sum 59 | 60 | # write 
output 61 | prediction[predict_cols + ['class_99']] = prediction[predict_cols + ['class_99']].astype(np.float16) 62 | filename = 'Submission_alpha_{}_{}.csv'.format(alpha, dt.datetime.now().strftime('%y%m%d_%H%M')) 63 | prediction[['object_id'] + predict_cols + ['class_99']].to_csv(filename, index=False, header=True) 64 | -------------------------------------------------------------------------------- /split_test.py: -------------------------------------------------------------------------------- 1 | """Split the test data into chunks, at object_id boundaries so that no object is split across files.""" 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import os 6 | 7 | n_chunks = 100 8 | if not os.path.exists(os.path.join('data', 'split_{}'.format(n_chunks))): 9 | os.mkdir(os.path.join('data', 'split_{}'.format(n_chunks))) 10 | 11 | col_dict = {'mjd': np.float64, 'flux': np.float32, 'flux_err': np.float32, 'object_id': np.int32, 'passband': np.int8, 12 | 'detected': np.int8} 13 | test = pd.read_csv(os.path.join('data', 'test_set.csv'), dtype=col_dict) 14 | test.sort_values('object_id', inplace=True) 15 | test = test.reset_index() 16 | test_len = len(test) 17 | 18 | id_diff = test.loc[test['object_id'].diff() != 0].index 19 | chunk_starts = [id_diff[int(len(id_diff) * i / n_chunks)] for i in range(n_chunks)] 20 | for i in range(n_chunks): 21 | if i == n_chunks - 1: 22 | end = len(test) 23 | else: 24 | end = chunk_starts[i + 1] 25 | test.iloc[chunk_starts[i]: end].to_hdf(os.path.join('data', 'split_{}'.format(n_chunks), 26 | 'chunk_{}.hdf5'.format(i)), key='file0') 27 | 28 | --------------------------------------------------------------------------------