├── common ├── __init__.py ├── data.py └── time.py ├── seizure_prediction ├── __init__.py ├── cross_validation │ ├── __init__.py │ ├── sequences.py │ ├── legacy_strategy.py │ └── kfold_strategy.py ├── fft_bins.py ├── scores.py ├── settings.py ├── classifiers.py ├── hdf5.py ├── feature_selection.py ├── pipeline.py ├── data.py ├── tasks.py └── transforms.py ├── SETTINGS.json ├── submissions └── combine.py ├── .gitignore ├── LICENSE ├── examine_cv_strategies.py ├── mat_to_hdf5.py ├── ensemble.py ├── main.py ├── README.md └── genetic.py /common/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /seizure_prediction/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /seizure_prediction/cross_validation/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /SETTINGS.json: -------------------------------------------------------------------------------- 1 | { 2 | "competition-data-dir": "data", 3 | "data-cache-dir": "data-cache", 4 | "submission-dir": "submissions", 5 | "num-jobs": "auto" 6 | } 7 | -------------------------------------------------------------------------------- /seizure_prediction/fft_bins.py: -------------------------------------------------------------------------------- 1 | # NOTE(mike): FFT bin ranges I used a lot 2 | super_duper_bins = [0.5, 2, 3.5, 5, 6.5, 8, 10, 17, 24, 31, 39] 3 | super_bins = [0.5, 2, 3.5, 5, 6.5, 8, 10, 17, 24, 31, 39, 48] 4 | winning_bins = [0.5, 2.25, 4, 5.5, 7, 9.5, 12, 21, 30, 39, 48] 5 | original_bins = [0.5, 4, 7, 12, 30, 48] 6 | -------------------------------------------------------------------------------- /common/data.py: -------------------------------------------------------------------------------- 1 | import os 2 | import os.path 3 | 4 | 5 | def makedirs(dir): 6 | try: 7 | os.makedirs(dir) 8 | except: 9 | pass 10 | 11 | class jsdict(dict): 12 | def __init__(self, *args, **kwargs): 13 | super(jsdict, self).__init__(*args, **kwargs) 14 | self.__dict__ = self 15 | -------------------------------------------------------------------------------- /seizure_prediction/scores.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | # helper methods for printing scores 4 | 5 | 6 | def get_score_summary(name, scores): 7 | summary = 'mean=%.3f std=%.3f' % (np.mean(scores), np.std(scores)) 8 | score_list = ['%.3f' % score for score in scores] 9 | return '%s [%s] %s' % (summary, ','.join(score_list), name) 10 | 11 | 12 | def print_results(summaries): 13 | summaries.sort(cmp=lambda x,y: cmp(x[1], y[1])) 14 | if len(summaries) > 1: 15 | print 'summaries' 16 | for s, mean in summaries: 17 | print s 18 | -------------------------------------------------------------------------------- /submissions/combine.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2.7 2 | 3 | import numpy as np 4 | import sys 5 | import gzip 6 | 7 | if len(sys.argv) != 3: 8 | print >>sys.err, 'Usage: ./combine.py submission0.csv.gz submission1.csv.gz | gzip >combined-0-1.csv.gz' 9 | 10 | filenames = sys.argv[1:] 11 | files = [gzip.open(filename, 'rb') for filename in filenames] 12 | 13 | print 
[f.readline() for f in files][0], 14 | 15 | def split(line): 16 | t, p = line.split(',') 17 | return t, float(p) 18 | 19 | while True: 20 | lines = [f.readline() for f in files] 21 | if lines[0] == "": 22 | break; 23 | 24 | t, p = zip(*[split(line) for line in lines]) 25 | 26 | for tt in t: 27 | assert(tt == t[0]) 28 | 29 | p = np.mean(p) 30 | print '%s,%.10f' % (t[0], p) 31 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | bin/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | eggs/ 16 | lib/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | *.egg-info/ 22 | .installed.cfg 23 | *.egg 24 | 25 | # Installer logs 26 | pip-log.txt 27 | pip-delete-this-directory.txt 28 | 29 | # Unit test / coverage reports 30 | htmlcov/ 31 | .tox/ 32 | .coverage 33 | .cache 34 | nosetests.xml 35 | coverage.xml 36 | 37 | # Translations 38 | *.mo 39 | 40 | # Mr Developer 41 | .mr.developer.cfg 42 | .project 43 | .pydevproject 44 | 45 | # Rope 46 | .ropeproject 47 | 48 | # Django stuff: 49 | *.log 50 | *.pot 51 | 52 | # Sphinx documentation 53 | docs/_build/ 54 | 55 | -------------------------------------------------------------------------------- /common/time.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | 3 | def unix_time(dt): 4 | epoch = datetime.utcfromtimestamp(0) 5 | delta = dt - epoch 6 | return delta.total_seconds() 7 | 8 | 9 | def unix_time_millis(dt): 10 | return int(unix_time(dt) * 1000.0) 11 | 12 | 13 | def get_millis(): 14 | return unix_time_millis(datetime.now()) 15 | 16 | 17 | def get_seconds(): 18 | return get_millis() / 1000.0 19 | 20 | 21 | class Timer: 22 | def __init__(self): 23 | self.start = get_millis() 24 | 25 | def elapsed_millis(self): 26 | return get_millis() - self.start 27 | 28 | def elapsed_seconds(self): 29 | return long(self.elapsed_millis() / 1000.0) 30 | 31 | def pretty_str(self): 32 | ms = self.elapsed_millis() 33 | if ms > 5000: 34 | return '%ds' % long(ms / 1000.0) 35 | return '%dms' % ms 36 | -------------------------------------------------------------------------------- /seizure_prediction/settings.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | import json 3 | import multiprocessing 4 | import os 5 | 6 | Settings = namedtuple('Settings', ['data_dir', 'cache_dir', 'submission_dir', 'N_jobs']) 7 | 8 | 9 | def load_settings(): 10 | with open('SETTINGS.json') as f: 11 | settings = json.load(f) 12 | 13 | data_dir = str(settings['competition-data-dir']) 14 | cache_dir = str(settings['data-cache-dir']) 15 | submission_dir = str(settings['submission-dir']) 16 | N_jobs = str(settings['num-jobs']) 17 | N_jobs = multiprocessing.cpu_count() if N_jobs == 'auto' else int(N_jobs) 18 | 19 | for d in (cache_dir, submission_dir): 20 | try: 21 | os.makedirs(d) 22 | except: 23 | pass 24 | 25 | return Settings(data_dir=data_dir, cache_dir=cache_dir, submission_dir=submission_dir, N_jobs=N_jobs) 26 | -------------------------------------------------------------------------------- /seizure_prediction/cross_validation/sequences.py: -------------------------------------------------------------------------------- 1 | import sklearn.utils 2 | 3 | def 
collect_sequence_ranges(sequences): 4 | assert len(sequences) > 0 5 | seq_starts = [0] 6 | prev = sequences[0] 7 | for i, seq in enumerate(sequences[1:]): 8 | if seq != prev + 1: 9 | seq_starts.append(i + 1) 10 | prev = seq 11 | 12 | seq_ranges = [] 13 | prev_start = seq_starts[0] 14 | for start in seq_starts[1:]: 15 | seq_ranges.append((prev_start, start)) 16 | prev_start = start 17 | 18 | seq_ranges.append((prev_start, len(sequences))) 19 | 20 | return seq_ranges 21 | 22 | def collect_sequence_ranges_from_meta(meta, shuffle=True): 23 | sequences = meta.sequence 24 | seq_ranges = collect_sequence_ranges(sequences) 25 | if shuffle: 26 | seq_ranges = sklearn.utils.shuffle(seq_ranges, random_state=2) 27 | return seq_ranges 28 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014 Michael Hills 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /seizure_prediction/classifiers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import sklearn 3 | import sklearn.pipeline 4 | from sklearn.linear_model import LogisticRegression, LinearRegression 5 | from sklearn.preprocessing import StandardScaler 6 | from sklearn.svm import SVC 7 | 8 | 9 | # NOTE(mike): doesn't handle multi-class 10 | class SimpleLogisticRegression(LinearRegression): 11 | def predict_proba(self, X): 12 | predictions = self.predict(X) 13 | predictions = sklearn.preprocessing.scale(predictions) 14 | predictions = 1.0 / (1.0 + np.exp(-0.5 * predictions)) 15 | return np.vstack((1.0 - predictions, predictions)).T 16 | 17 | 18 | def make_svm(gamma, C): 19 | cls = sklearn.pipeline.make_pipeline(StandardScaler(), 20 | SVC(gamma=gamma, C=C, probability=True, cache_size=500, random_state=0)) 21 | name = 'ss-svc-g%.4f-C%.1f' % (gamma, C) 22 | return (cls, name) 23 | 24 | 25 | def make_lr(C): 26 | cls = sklearn.pipeline.make_pipeline(StandardScaler(), LogisticRegression(C=C)) 27 | name = 'ss-lr-C%.4f' % C 28 | return (cls, name) 29 | 30 | 31 | def make_simple_lr(): 32 | return (sklearn.pipeline.make_pipeline(StandardScaler(), SimpleLogisticRegression()), 'ss-slr') 33 | -------------------------------------------------------------------------------- /seizure_prediction/cross_validation/legacy_strategy.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import sklearn.cross_validation 3 | from seizure_prediction.cross_validation.sequences import collect_sequence_ranges_from_meta 4 | 5 | 6 | class LegacyStrategy: 7 | """ 8 | Hand-picked random folds maintaining sequence integrity with 80% train/cv split. 9 | See k_fold_strategy for docs on each method. 10 | """ 11 | 12 | def get_name(self): 13 | return 'legacy' 14 | 15 | def get_folds(self, preictal_meta): 16 | # hand-picked on my system to give a nice spread when num_sequences = 3, 17 | # i.e. (0, 1), (0, 2), (1, 2) when using 3 folds 18 | # The new way is to use k_fold.py instead of this 19 | return [8, 11, 14] 20 | 21 | def get_sequence_ranges(self, meta, fold_number, interictal=None, shuffle=None): 22 | train_size = 0.8 23 | seq_ranges = collect_sequence_ranges_from_meta(meta, shuffle=False) 24 | return sklearn.cross_validation.train_test_split(seq_ranges, train_size=train_size, random_state=fold_number) 25 | 26 | def split_train_cv(self, data, meta, fold_number, interictal=False): 27 | 28 | train_ranges, cv_ranges = self.get_sequence_ranges(meta, fold_number, interictal=interictal) 29 | 30 | train_data = [] 31 | for start, end in train_ranges: 32 | train_data.append(data[start:end]) 33 | train_data = np.concatenate(train_data, axis=0) 34 | 35 | cv_data = [] 36 | for start, end in cv_ranges: 37 | cv_data.append(data[start:end]) 38 | cv_data = np.concatenate(cv_data, axis=0) 39 | 40 | return train_data, cv_data 41 | -------------------------------------------------------------------------------- /seizure_prediction/hdf5.py: -------------------------------------------------------------------------------- 1 | import h5py 2 | import numpy as np 3 | import re 4 | from common.data import jsdict 5 | 6 | # Helper method to dump a dictionary of ndarrays or primitives to hdf5, and then read them back. 7 | # It looks like I also added list support, cool. 
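# A minimal usage sketch (hypothetical filename and keys; 'folds' is illustrative only),
# assuming the write()/read() helpers below:
#
#     write('Dog_1_features.hdf5', {
#         'X': np.zeros((10, 16, 400)),            # ndarray -> plain dataset
#         'folds': [np.arange(3), np.arange(5)],   # list of ndarrays -> __list_* datasets
#         'target': 'Dog_1',                       # primitive -> attribute on __metadata
#     })
#     obj = read('Dog_1_features.hdf5')            # jsdict: obj.X, obj.folds, obj.target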
8 | 9 | METADATA_TAG = '__metadata' 10 | 11 | list_regex = re.compile(r"""__list_(.*)_(\d+)""") 12 | 13 | 14 | def write(filename, obj): 15 | data = h5py.File(filename, 'w-', libver='latest') 16 | meta_dataset = data.create_dataset(METADATA_TAG, shape=(1,)) 17 | 18 | for key in obj.keys(): 19 | value = obj[key] 20 | if isinstance(value, np.ndarray): 21 | data.create_dataset(key, data=value) 22 | elif isinstance(value, list): 23 | for i, v in enumerate(value): 24 | assert isinstance(v, np.ndarray) 25 | data.create_dataset('__list_%s_%d' % (key, i), data=v) 26 | else: 27 | meta_dataset.attrs[key] = value 28 | 29 | data.close() 30 | 31 | 32 | def read(filename): 33 | data = h5py.File(filename, 'r') 34 | obj = {} 35 | for key in data.keys(): 36 | value = data[key] 37 | if key == METADATA_TAG: 38 | for metakey in value.attrs.keys(): 39 | obj[metakey] = value.attrs[metakey] 40 | elif not key.startswith('__list'): 41 | obj[key] = value[:] 42 | 43 | list_keys = [key for key in data.keys() if key.startswith('__list')] 44 | if len(list_keys) > 0: 45 | list_keys.sort() 46 | for key in list_keys: 47 | match = list_regex.match(key) 48 | assert match is not None 49 | list_key = match.group(1) 50 | list_index = int(match.group(2)) 51 | out_list = obj.setdefault(list_key, []) 52 | assert len(out_list) == list_index 53 | out_list.append(data[key][:]) 54 | 55 | data.close() 56 | 57 | return jsdict(obj) 58 | 59 | -------------------------------------------------------------------------------- /examine_cv_strategies.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2.7 2 | 3 | import numpy as np 4 | 5 | from seizure_prediction.cross_validation.kfold_strategy import KFoldStrategy 6 | from seizure_prediction.cross_validation.legacy_strategy import LegacyStrategy 7 | from seizure_prediction.cross_validation.sequences import collect_sequence_ranges 8 | from seizure_prediction.pipeline import Pipeline, InputSource 9 | from seizure_prediction.settings import load_settings 10 | from seizure_prediction.tasks import load_pipeline_data 11 | 12 | 13 | targets = [ 14 | 'Dog_1', 15 | 'Dog_2', 16 | 'Dog_3', 17 | 'Dog_4', 18 | 'Dog_5', 19 | 'Patient_1', 20 | 'Patient_2' 21 | ] 22 | 23 | class Zero: 24 | def get_name(self): 25 | return 'zero' 26 | 27 | def apply(self, X, meta): 28 | return np.zeros(list(X.shape[:-1]) + [1]) 29 | 30 | settings = load_settings() 31 | pipeline = Pipeline(InputSource(), Zero()) 32 | 33 | strategies = [ 34 | LegacyStrategy(), 35 | KFoldStrategy(), 36 | ] 37 | 38 | for strategy in strategies: 39 | print 'Strategy', strategy.get_name() 40 | for target in targets: 41 | _, preictal_meta = load_pipeline_data(settings, target, 'preictal', pipeline, check_only=False, quiet=True, meta_only=True) 42 | # _, interictal_meta = load_pipeline_data(settings, target, 'interictal', pipeline, check_only=False, quiet=True, meta_only=True) 43 | fold_numbers = strategy.get_folds(preictal_meta) 44 | data = np.arange(0, preictal_meta.X_shape[0]).astype(np.int) 45 | sequence_ranges = collect_sequence_ranges(preictal_meta.sequence) 46 | print '%s\n%d folds from %d sequences %s' % (target, len(fold_numbers), len(sequence_ranges), sequence_ranges) 47 | for fold_number in fold_numbers: 48 | train_folds, cv_folds = strategy.get_sequence_ranges(preictal_meta, fold_number, interictal=False, shuffle=False) 49 | print [list(f) for f in train_folds] 50 | print 51 | 52 | -------------------------------------------------------------------------------- 
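For reference, a small worked example of collect_sequence_ranges (defined in sequences.py above and printed per target by examine_cv_strategies.py). The sequence numbers below are made up for illustration; the point is that contiguous runs of consecutive sequence numbers collapse into (start, end) index ranges over the segment axis, which the strategies then shuffle and split:

from seizure_prediction.cross_validation.sequences import collect_sequence_ranges

print(collect_sequence_ranges([1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6]))
# -> [(0, 6), (6, 12)], i.e. two sequences of six segments each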
/seizure_prediction/feature_selection.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from seizure_prediction.tasks import load_pipeline_data 4 | 5 | 6 | # Generate random feature masks using split_ratio as the rough guide to how many features are ON and how many are OFF. 7 | def generate_feature_masks(settings, target, pipeline, num_masks, split_ratio, random_state, threshold=500, quiet=False): 8 | if not quiet: print target 9 | def get_pipeline_data(pipeline): 10 | _, preictal_meta = \ 11 | load_pipeline_data(settings, target, 'preictal', pipeline, check_only=False, meta_only=True, quiet=quiet) 12 | _, interictal_meta = \ 13 | load_pipeline_data(settings, target, 'interictal', pipeline, check_only=False, meta_only=True, quiet=quiet) 14 | num_features = preictal_meta.X_shape[-1] 15 | num_train_segments = preictal_meta.num_segments + interictal_meta.num_segments 16 | return num_features, num_train_segments 17 | 18 | if len(pipeline.get_pipelines()) == 0: 19 | return [] 20 | 21 | num_features, num_training_examples = get_pipeline_data(pipeline) 22 | 23 | # NOTE(mike): Seemingly some patients benefit from these feature masks and some don't. 24 | # Currently the only pattern is number of training examples but this may or may not hold 25 | # true without doing further testing. Some manual testing against public leaderboard showed 26 | # a negative effect on Patient 1 and 2 but positive effects on Dogs 3 and 4. Dog 1 seemed to 27 | # have little to no effect and maybe a very slight positive effect on Dog 2. 28 | if num_training_examples < threshold: 29 | ratio = 1.0 30 | else: 31 | ratio = split_ratio 32 | 33 | if not quiet: print 'num features', num_features 34 | if not quiet: print 'ratio', ratio 35 | if not quiet: print 'num wanted features', int(num_features * ratio) 36 | 37 | if ratio == 1.0: 38 | masks = np.ones((num_masks, num_features)) 39 | else: 40 | prng = np.random.RandomState(random_state) 41 | masks = (prng.random_sample((num_masks, num_features)) <= ratio) 42 | 43 | masks = list(masks.astype(np.int)) 44 | 45 | if not quiet: print np.shape(masks) 46 | if not quiet: print 'generated', [np.sum(mask) for mask in masks] 47 | return list(masks) 48 | 49 | -------------------------------------------------------------------------------- /seizure_prediction/pipeline.py: -------------------------------------------------------------------------------- 1 | 2 | class Pipeline(object): 3 | """ 4 | A Pipeline is an object representing the data transformations to make 5 | on the input data, finally outputting extracted features. 6 | 7 | input_source: Where to source the data from, InputSource() for original data or 8 | InputSource(some_pipeline) to load the output of a pipeline. 9 | transforms: List of transforms to apply one by one to the input data. 
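    Example (as used in main.py):
        Pipeline(InputSource(), Preprocess(), Windower(75), Correlation('none'))
    applies Preprocess, Windower and Correlation in order to the raw input data, while
        Pipeline(InputSource(Preprocess(), Windower(75), FFT(), Magnitude()), FreqBinning(winning_bins, 'mean'), Log10(), FlattenChannels())
    applies its transforms to the output of the inner FFT pipeline (which the data layer caches on disk).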
10 | """ 11 | def __init__(self, input_source, *transforms): 12 | self.input_source = input_source 13 | input_source_pipeline = input_source.get_pipeline() 14 | self.transforms = transforms 15 | full_pipeline = [input_source_pipeline] + list(transforms) if input_source_pipeline is not None else transforms 16 | names = [t.get_name() for t in full_pipeline] 17 | self.name = 'empty' if len(names) == 0 else '_'.join(names) 18 | 19 | def get_name(self): 20 | return self.name 21 | 22 | def get_names(self): 23 | return [self.name] 24 | 25 | def apply(self, data, meta): 26 | for transform in self.transforms: 27 | data = transform.apply(data, meta) 28 | return data 29 | 30 | def get_input_source(self): 31 | return self.input_source 32 | 33 | def get_pipelines(self): 34 | return [self] 35 | 36 | 37 | class FeatureConcatPipeline(object): 38 | """ 39 | Represents a list of pipelines with their features concatenated together. 40 | Useful for combining separate feature sets to see if they combine well. 41 | """ 42 | def __init__(self, *pipelines, **options): 43 | pipelines = list(pipelines) 44 | if 'sort' not in options or options['sort'] == True: 45 | pipelines.sort(lambda x, y: cmp(x.get_name(), y.get_name())) 46 | self.pipelines = pipelines 47 | self.names = [p.get_name() for p in pipelines] 48 | self.name = 'FCP_' + '_cc_'.join(self.names) 49 | for p in pipelines: 50 | assert isinstance(p, Pipeline) 51 | 52 | def get_name(self): 53 | return self.name 54 | 55 | def get_names(self): 56 | return self.names 57 | 58 | def apply(self, data, meta): 59 | raise NotImplementedError() 60 | 61 | def get_pipelines(self): 62 | return self.pipelines 63 | 64 | def get_input_source(self): 65 | raise NotImplementedError() 66 | 67 | 68 | class InputSource: 69 | """ 70 | Wraps a pipeline to represent it as a data-source. 71 | """ 72 | def __init__(self, *transforms): 73 | self.pipeline = Pipeline(InputSource(), *transforms) if len(transforms) > 0 else None 74 | 75 | def get_pipeline(self): 76 | return self.pipeline 77 | 78 | -------------------------------------------------------------------------------- /seizure_prediction/cross_validation/kfold_strategy.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import sklearn 3 | from seizure_prediction.cross_validation.sequences import collect_sequence_ranges_from_meta 4 | 5 | class KFoldStrategy: 6 | """ 7 | Create a k-fold strategy focused on preictal segments. The idea is to create a small number of folds 8 | that maximise coverage of the training set. Small number of folds as to keep performance in check. 9 | If there are 3 preictal sequences, then do 3 folds of (0,1), (0,2), (1,2). If there are 6 sequences, 10 | do 3 folds (0,1), (2,3), (4,5). The sequences are shuffled before being allocated to folds. 11 | 12 | However, interictal sequences are partitioned randomly as there are a lot more of them that random 13 | should more or less be fine. 
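    With 3 preictal sequences, for example, fold 0 trains on the first two (shuffled) sequence
    ranges and cross-validates on the third; get_folds returns 2 folds when there are only
    2 sequences, 3 folds for up to 6 sequences, and num_seqs / 2 folds beyond that.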
14 | """ 15 | 16 | def get_name(self): 17 | return 'kfold' 18 | 19 | def get_folds(self, preictal_meta): 20 | """ 21 | :param preictal_meta: metadata from preictal segments 22 | :return: iterable of fold numbers to pass to split_train_cv 23 | """ 24 | num_seqs = len(collect_sequence_ranges_from_meta(preictal_meta)) 25 | assert num_seqs >= 2 26 | if num_seqs <= 2: 27 | num_folds = 2 28 | elif num_seqs <= 6: 29 | num_folds = 3 30 | else: 31 | num_folds = num_seqs / 2 32 | 33 | return xrange(num_folds) 34 | 35 | def get_sequence_ranges(self, meta, fold_number, interictal=False, shuffle=True): 36 | seq_ranges = collect_sequence_ranges_from_meta(meta, shuffle=shuffle) 37 | num_seqs = len(seq_ranges) 38 | 39 | # calculate the split numbers for a fold 40 | def get_num_train_seqs(num_seqs): 41 | if num_seqs <= 3: 42 | return 2 43 | else: 44 | return 3 45 | 46 | if interictal: 47 | interictal_ratio = 0.8 if num_seqs <= 20 else 0.4 48 | train_ranges, cv_ranges = sklearn.cross_validation.train_test_split(seq_ranges, train_size=interictal_ratio, random_state=fold_number) 49 | else: 50 | train_size = get_num_train_seqs(num_seqs) 51 | if num_seqs == 3: 52 | combinations = [[0, 1], [0, 2], [1, 2]] 53 | else: 54 | first_pass = [range(i, i + train_size) for i in range(0, num_seqs, train_size) if (i + train_size) <= num_seqs] 55 | remainder = num_seqs % train_size 56 | if remainder == 0: 57 | gap = [] 58 | else: 59 | seq = range(num_seqs - remainder, num_seqs) 60 | needed = train_size - remainder 61 | gap_fillers = [i * train_size for i in range(needed)] 62 | gap_fillers = [x for x in gap_fillers if x < num_seqs] 63 | # print 'gf', gap_fillers 64 | if len(gap_fillers) < train_size: 65 | gap_fillers = [i * (train_size-1) for i in range(needed)] 66 | gap_fillers = [x for x in gap_fillers if x < num_seqs] 67 | gap = [gap_fillers + seq] 68 | second_pass = [range(i, i + train_size**2, train_size) for i in range(num_seqs)] 69 | second_pass = [x for x in second_pass if len(x) == train_size and x < num_seqs] 70 | third_pass = [range(i, i + train_size) for i in range(1, num_seqs, train_size) if (i + train_size) <= num_seqs] 71 | # third_pass = [range(i, i + train_size) for i in range(2, num_seqs, train_size) if (i + train_size) < num_seqs] 72 | combinations = first_pass + gap + second_pass + third_pass 73 | indices = combinations[fold_number] 74 | # print 'indices', indices 75 | train_ranges = [seq_ranges[i] for i in indices] 76 | cv_ranges = np.delete(seq_ranges, indices, axis=0) 77 | 78 | return train_ranges, cv_ranges 79 | 80 | def split_train_cv(self, data, meta, fold_number, interictal=False): 81 | train_ranges, cv_ranges = self.get_sequence_ranges(meta, fold_number, interictal) 82 | 83 | train_data = [] 84 | for start, end in train_ranges: 85 | train_data.append(data[start:end]) 86 | train_data = np.concatenate(train_data, axis=0) 87 | 88 | cv_data = [] 89 | for start, end in cv_ranges: 90 | cv_data.append(data[start:end]) 91 | cv_data = np.concatenate(cv_data, axis=0) 92 | 93 | return train_data, cv_data 94 | -------------------------------------------------------------------------------- /mat_to_hdf5.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2.7 2 | 3 | from collections import namedtuple 4 | from multiprocessing import Pool 5 | from common.data import jsdict 6 | from common.time import Timer 7 | from seizure_prediction import hdf5 8 | from seizure_prediction.data import accumulate_data 9 | from seizure_prediction.settings import 
load_settings 10 | import numpy as np 11 | import scipy.io 12 | import scipy.signal 13 | import os.path 14 | import sys 15 | 16 | 17 | Reader = namedtuple('Reader', ['read', 'exists', 'filename']) 18 | 19 | 20 | class Metadata(object): 21 | 22 | def __init__(self): 23 | self.shape = None 24 | self.data_length_sec = None 25 | self.sampling_frequency = None 26 | self.channels = None 27 | self.sequences = [] 28 | 29 | def add_shape(self, shape): 30 | if self.shape is None: 31 | self.shape = shape 32 | else: 33 | assert shape == self.shape 34 | 35 | def add_data_length_sec(self, data_length_sec): 36 | if self.data_length_sec is None: 37 | self.data_length_sec = data_length_sec 38 | else: 39 | assert data_length_sec == self.data_length_sec 40 | 41 | def add_sampling_frequency(self, sampling_frequency): 42 | if self.sampling_frequency is None: 43 | self.sampling_frequency = sampling_frequency 44 | else: 45 | assert sampling_frequency == self.sampling_frequency 46 | 47 | def add_channels(self, channels): 48 | if self.channels is None: 49 | self.channels = channels 50 | else: 51 | assert np.alltrue(channels == self.channels) 52 | 53 | def add_sequence(self, sequence): 54 | if sequence is not None: 55 | self.sequences.append(sequence) 56 | 57 | def __str__(self): 58 | seq_groups = [] 59 | prev = None 60 | prev_start = None 61 | for seq in self.sequences: 62 | if prev_start is None: 63 | prev_start = seq 64 | else: 65 | if seq != prev + 1: 66 | if prev_start == prev: 67 | seq_groups.append('%d' % prev) 68 | else: 69 | seq_groups.append('%d-%d' % (prev_start, prev)) 70 | prev_start = seq 71 | prev = seq 72 | if prev_start is not None: 73 | seq_groups.append('%d-%d' % (prev_start, prev)) 74 | 75 | seq_mega_groups = [] 76 | prev = None 77 | count = 1 78 | for group in seq_groups: 79 | if prev is not None: 80 | if prev != group: 81 | seq_mega_groups.append(('%d of %s' % (count, prev)) if count > 1 else prev) 82 | count = 1 83 | else: 84 | count += 1 85 | prev = group 86 | if prev is not None: 87 | seq_mega_groups.append('%d of %s' % (count, prev) if count > 1 else prev) 88 | 89 | return str({ 90 | 'shape': self.shape, 91 | 'data_length_sec': self.data_length_sec, 92 | 'sampling_frequency': self.sampling_frequency, 93 | 'channels': len(self.channels) if self.channels is not None else None, 94 | 'sequences': seq_mega_groups 95 | }) 96 | 97 | 98 | def process_data_sub_job(settings, filename_in_fmt, filename_out_fmt, id, num_jobs): 99 | 100 | pid = os.getpid() 101 | reader = mat_reader(target, settings.data_dir) 102 | 103 | num_processed = 0 104 | for i in xrange(id + 1, sys.maxint, num_jobs): 105 | out_index = i - 1 106 | filename_in = filename_in_fmt % i 107 | filename_out = filename_out_fmt % out_index if filename_out_fmt is not None else None 108 | filename_out_temp = '%s.pid.%d.tmp' % (filename_out, pid) if filename_out is not None else None 109 | 110 | if filename_out is not None and os.path.exists(filename_out): 111 | num_processed += 1 112 | continue 113 | 114 | if not reader.exists(filename_in): 115 | if i == id + 1: 116 | print 'Could not find file', reader.filename(filename_in) 117 | return 0 118 | break 119 | 120 | print 'Runner %d processing %s' % (id, reader.filename(filename_in)) 121 | 122 | segment = reader.read(filename_in) 123 | data = process_data(segment) 124 | hdf5.write(filename_out_temp, data) 125 | 126 | os.rename(filename_out_temp, filename_out) 127 | 128 | num_processed += 1 129 | 130 | return num_processed 131 | 132 | 133 | def process_data(segment): 134 | data_key = [key for key 
in segment.keys() if not key.startswith('_')][0] 135 | data = segment[data_key][0][0] 136 | 137 | X = data[0] 138 | data_length_sec = int(data[1][0][0]) 139 | sampling_frequency = float(data[2][0][0]) 140 | channels = [ch[0] for ch in data[3][0]] 141 | sequence = int(data[4][0][0]) if len(data) >= 5 else None 142 | 143 | min_freq = 195.0 144 | def find_q(): 145 | q = 2 146 | while True: 147 | f = sampling_frequency / q 148 | if f < min_freq: 149 | return q - 1 150 | q += 1 151 | 152 | if sampling_frequency > min_freq: 153 | q = find_q() 154 | if q > 1: 155 | # if X.dtype != np.float64: 156 | # X = X.astype(np.float64) 157 | # X -= X.mean(axis=0) 158 | X = scipy.signal.decimate(X, q, ftype='fir', axis=X.ndim-1) 159 | X = np.round(X).astype(np.int16) 160 | # if X.dtype != np.float32: 161 | # X = X.astype(np.float32) 162 | sampling_frequency /= q 163 | 164 | channels = np.array(channels, dtype=str(channels[0].dtype).replace('U', 'S')) 165 | out = { 166 | 'X': X, 167 | 'data_length_sec': data_length_sec, 168 | 'sampling_frequency': sampling_frequency, 169 | 'num_channels': X.shape[0], 170 | 'channels': channels, 171 | 'target': target, 172 | 'data_type': data_type, 173 | } 174 | if sequence is not None: 175 | out['sequence'] = sequence 176 | 177 | return jsdict(out) 178 | 179 | 180 | #used for verifying and printing 181 | def collect_metadata(data, metadata_accum): 182 | metadata_accum.add_shape(data.X.shape) 183 | metadata_accum.add_data_length_sec(data.data_length_sec) 184 | metadata_accum.add_sampling_frequency(data.sampling_frequency) 185 | metadata_accum.add_channels(data.channels) 186 | if 'sequence' in data: 187 | metadata_accum.add_sequence(data.sequence) 188 | 189 | 190 | def process_and_merge_segments(target, data_type, out_dir, metadata, N_jobs): 191 | filename_out = os.path.join(out_dir, '%s_%s.hdf5' % (target, data_type)) 192 | 193 | if os.path.exists(filename_out): 194 | return 0 195 | 196 | print 'Processing %s ...' % filename_out 197 | 198 | filename_in_fmt = '%s_%s_segment_%%.4d' % (target, data_type) 199 | filename_out_fmt = '%s/%s_%s_segment_%%d.hdf5' % (out_dir, target, data_type) 200 | 201 | # process_data_sub_job(settings, filename_in_fmt, filename_out_fmt, 0, 1) 202 | pool = Pool(N_jobs) 203 | results = [pool.apply_async(process_data_sub_job, [settings, filename_in_fmt, filename_out_fmt, id, N_jobs]) 204 | for id in range(N_jobs)] 205 | pool.close() 206 | pool.join() 207 | 208 | num_processed = np.sum([r.get() for r in results]) 209 | for i in xrange(num_processed): 210 | data = hdf5.read(filename_out_fmt % i) 211 | collect_metadata(data, metadata) 212 | 213 | _, accum_meta = accumulate_data(settings, target, data_type, tag=None, 214 | output_to_original_data_dir=True, quiet=True) 215 | 216 | return accum_meta.num_segments 217 | 218 | 219 | def mat_reader(target, dir): 220 | ext = '.mat' 221 | expand_filename = lambda filename: os.path.join(dir, target, filename + ext) 222 | read = lambda filename: scipy.io.loadmat(expand_filename(filename)) 223 | exists = lambda filename: os.path.exists(expand_filename(filename)) 224 | return Reader(read=read, exists=exists, filename=expand_filename) 225 | 226 | 227 | def process_mat_into_hdf5(settings, target, data_type, N_jobs): 228 | assert data_type in ('preictal', 'interictal', 'test') 229 | 230 | print 'Loading data ...' 
231 | timer = Timer() 232 | 233 | out_dir = os.path.join(settings.data_dir) 234 | metadata = Metadata() 235 | segments_processed = process_and_merge_segments(target, data_type, out_dir, metadata, N_jobs) 236 | 237 | print 'Processed %d segments in %s' % (segments_processed, timer.pretty_str()) 238 | print data_type, 'Metadata', metadata 239 | 240 | 241 | if __name__ == "__main__": 242 | 243 | settings = load_settings() 244 | N_jobs = 8 245 | 246 | data_types = [ 247 | 'preictal', 248 | 'interictal', 249 | 'test' 250 | ] 251 | 252 | targets = [ 253 | 'Dog_1', 254 | 'Dog_2', 255 | 'Dog_3', 256 | 'Dog_4', 257 | 'Dog_5', 258 | 'Patient_1', 259 | 'Patient_2' 260 | ] 261 | 262 | for target in targets: 263 | for data_type in data_types: 264 | process_mat_into_hdf5(settings, target, data_type, N_jobs) 265 | -------------------------------------------------------------------------------- /ensemble.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2.7 2 | 3 | from multiprocessing import Pool 4 | import sys 5 | 6 | import numpy as np 7 | from sklearn.metrics import roc_auc_score 8 | 9 | from seizure_prediction.classifiers import make_svm, make_simple_lr, make_lr 10 | from seizure_prediction.feature_selection import generate_feature_masks 11 | from seizure_prediction.fft_bins import * 12 | from seizure_prediction.pipeline import Pipeline, FeatureConcatPipeline, InputSource 13 | from seizure_prediction.scores import get_score_summary, print_results 14 | from seizure_prediction.tasks import make_csv_for_target_predictions, write_submission_file, \ 15 | cross_validation_score, check_training_data_loaded, check_test_data_loaded, make_submission_predictions 16 | from seizure_prediction.transforms import Windower, Correlation, FreqCorrelation, FFT, \ 17 | Magnitude, PIBSpectralEntropy, Log10, FreqBinning, FlattenChannels, Preprocess, HFD, PFD, Hurst 18 | from seizure_prediction.settings import load_settings 19 | from main import run_prepare_data_for_cross_validation 20 | 21 | 22 | def run_make_submission(settings, targets_and_pipelines, split_ratio): 23 | pool = Pool(settings.N_jobs) 24 | for i, (target, pipeline, feature_masks, classifier, classifier_name) in enumerate(targets_and_pipelines): 25 | for j, feature_mask in enumerate(feature_masks): 26 | progress_str = 'T=%d/%d M=%d/%d' % (i+1, len(targets_and_pipelines), j+1, len(feature_masks)) 27 | pool.apply_async(make_submission_predictions, [settings, target, pipeline, classifier, classifier_name], 28 | {'feature_mask': feature_mask, 'progress_str': progress_str, 'quiet': True}) 29 | pool.close() 30 | pool.join() 31 | 32 | guesses = ['clip,preictal'] 33 | num_masks = None 34 | classifier_names = [] 35 | for target, pipeline, feature_masks, classifier, classifier_name in targets_and_pipelines: 36 | classifier_names.append(classifier_name) 37 | if num_masks is None: 38 | num_masks = len(feature_masks) 39 | else: 40 | assert num_masks == len(feature_masks) 41 | 42 | test_predictions = [] 43 | 44 | for feature_mask in feature_masks: 45 | data = make_submission_predictions(settings, target, pipeline, classifier, classifier_name, feature_mask=feature_mask) 46 | test_predictions.append(data.mean_predictions) 47 | 48 | predictions = np.mean(test_predictions, axis=0) 49 | guesses += make_csv_for_target_predictions(target, predictions) 50 | 51 | output = '\n'.join(guesses) 52 | write_submission_file(settings, output, 'ensemble n=%d split_ratio=%s' % (num_masks, split_ratio), None, str(classifier_names), 
targets_and_pipelines) 53 | 54 | 55 | def run_prepare_data(settings, targets, pipelines, train=True, test=False, quiet=False): 56 | for pipeline in pipelines: 57 | for target in targets: 58 | print 'Preparing data for', target 59 | if train: 60 | check_training_data_loaded(settings, target, pipeline, quiet=quiet) 61 | if test: 62 | check_test_data_loaded(settings, target, pipeline, quiet=quiet) 63 | 64 | 65 | def run_cross_validation(settings, targets, pipelines, mask_range, split_ratios, classifiers): 66 | pool = Pool(settings.N_jobs) 67 | for i, pipeline in enumerate(pipelines): 68 | for j, (classifier, classifier_name) in enumerate(classifiers): 69 | for k, target in enumerate(targets): 70 | pool.apply_async(cross_validation_score, [settings, target, pipeline, classifier, classifier_name], {'quiet': True}) 71 | for split_num, split_ratio in enumerate(split_ratios): 72 | masks = generate_feature_masks(settings, target, pipeline, np.max(mask_range), split_ratio, random_state=0, quiet=True) 73 | for mask_num, mask in enumerate(masks): 74 | progress_str = 'P=%d/%d C=%d/%d T=%d/%d S=%d/%d M=%d/%d' % (i+1, len(pipelines), j+1, len(classifiers), k+1, len(targets), split_num+1, len(split_ratios), mask_num+1, len(masks)) 75 | cross_validation_score(settings, target, pipeline, classifier, classifier_name, feature_mask=mask, quiet=True, return_data=False, pool=pool, progress_str=progress_str) 76 | pool.close() 77 | pool.join() 78 | print 'Finished cross validation mp' 79 | 80 | summaries = [] 81 | for p_num, pipeline in enumerate(pipelines): 82 | for classifier, classifier_name in classifiers: 83 | scores_full = [] 84 | scores_masked = [[[] for y in mask_range] for x in split_ratios] 85 | for i, target in enumerate(targets): 86 | run_prepare_data_for_cross_validation(settings, [target], [pipeline], quiet=True) 87 | data = cross_validation_score(settings, target, pipeline, classifier, classifier_name, pool=None, quiet=True) 88 | scores_full.append(data.mean_score) 89 | 90 | for split_index, split_ratio in enumerate(split_ratios): 91 | masks = generate_feature_masks(settings, target, pipeline, np.max(mask_range), split_ratio, random_state=0, quiet=True) 92 | for mask_index, num_masks in enumerate(mask_range): 93 | predictions = [] 94 | y_cvs = None 95 | for mask in masks[0:num_masks]: 96 | data = cross_validation_score(settings, target, pipeline, classifier, classifier_name, feature_mask=mask, pool=None, quiet=True) 97 | predictions.append(data.mean_predictions) 98 | if y_cvs is None: 99 | y_cvs = data.y_cvs 100 | else: 101 | for y_cv_1, y_cv_2 in zip(y_cvs, data.y_cvs): 102 | assert np.alltrue(y_cv_1 == y_cv_2) 103 | 104 | predictions = np.mean(predictions, axis=0) 105 | scores = [roc_auc_score(y_cv, p) for p, y_cv in zip(predictions, y_cvs)] 106 | score = np.mean(scores) 107 | scores_masked[split_index][mask_index].append(score) 108 | 109 | summary = get_score_summary('%s p=%d full' % (classifier_name, p_num), scores_full) 110 | summaries.append((summary, np.mean(scores_full))) 111 | for split_index, split_ratio in enumerate(split_ratios): 112 | for mask_index, num_masks in enumerate(mask_range): 113 | scores = scores_masked[split_index][mask_index] 114 | summary = get_score_summary('%s p=%d split_ratio=%s masks=%d' % (classifier_name, p_num, split_ratio, num_masks), scores) 115 | summaries.append((summary, np.mean(scores))) 116 | print summary 117 | 118 | print_results(summaries) 119 | 120 | 121 | def main(): 122 | settings = load_settings() 123 | 124 | pipelines = [ 125 | FeatureConcatPipeline( 
126 | Pipeline(InputSource(), Preprocess(), Windower(75), Correlation('none')), 127 | Pipeline(InputSource(), Preprocess(), Windower(75), FreqCorrelation(1, None, 'none')), 128 | Pipeline(InputSource(Preprocess(), Windower(75), FFT(), Magnitude()), FreqBinning(winning_bins, 'mean'), Log10(), FlattenChannels()), 129 | Pipeline(InputSource(Preprocess(), Windower(75), FFT(), Magnitude()), PIBSpectralEntropy([0.25, 1, 1.75, 2.5, 3.25, 4, 5, 8.5, 12, 15.5, 19.5, 24])), 130 | Pipeline(InputSource(Preprocess(), Windower(75), FFT(), Magnitude()), PIBSpectralEntropy([0.25, 2, 3.5, 6, 15, 24])), 131 | Pipeline(InputSource(Preprocess(), Windower(75), FFT(), Magnitude()), PIBSpectralEntropy([0.25, 2, 3.5, 6, 15])), 132 | Pipeline(InputSource(Preprocess(), Windower(75), FFT(), Magnitude()), PIBSpectralEntropy([0.25, 2, 3.5])), 133 | Pipeline(InputSource(Preprocess(), Windower(75), FFT(), Magnitude()), PIBSpectralEntropy([6, 15, 24])), 134 | Pipeline(InputSource(Preprocess(), Windower(75), FFT(), Magnitude()), PIBSpectralEntropy([2, 3.5, 6])), 135 | Pipeline(InputSource(Preprocess(), Windower(75), FFT(), Magnitude()), PIBSpectralEntropy([3.5, 6, 15])), 136 | Pipeline(InputSource(), Preprocess(), Windower(75), HFD(2)), 137 | Pipeline(InputSource(), Preprocess(), Windower(75), PFD()), 138 | Pipeline(InputSource(), Preprocess(), Windower(75), Hurst()), 139 | ), 140 | ] 141 | 142 | targets = [ 143 | 'Dog_1', 144 | 'Dog_2', 145 | 'Dog_3', 146 | 'Dog_4', 147 | 'Dog_5', 148 | 'Patient_1', 149 | 'Patient_2' 150 | ] 151 | 152 | classifiers = [ 153 | make_svm(gamma=0.0079, C=2.7), 154 | make_svm(gamma=0.0068, C=2.0), 155 | make_svm(gamma=0.003, C=150.0), 156 | make_lr(C=0.04), 157 | make_simple_lr(), 158 | ] 159 | 160 | 161 | make_submission = len(sys.argv) >= 2 and sys.argv[1] == 'submission' 162 | do_cv = not make_submission 163 | 164 | if do_cv: 165 | mask_range = [3] 166 | split_ratios = [0.4, 0.525, 0.6] 167 | run_prepare_data_for_cross_validation(settings, targets, pipelines) 168 | run_cross_validation(settings, targets, pipelines, mask_range, split_ratios, classifiers) 169 | 170 | if make_submission: 171 | num_masks = 10 172 | split_ratio = 0.525 173 | classifiers = [ 174 | # make_svm(gamma=0.0079, C=2.7), 175 | make_svm(gamma=0.0068, C=2.0), 176 | # make_svm(gamma=0.003, C=150.0), 177 | # make_lr(C=0.04), 178 | # make_simple_lr(), 179 | ] 180 | 181 | targets_and_pipelines = [] 182 | pipeline = pipelines[0] 183 | for classifier, classifier_name in classifiers: 184 | for i, target in enumerate(targets): 185 | run_prepare_data(settings, [target], [pipeline], test=True) 186 | feature_masks = generate_feature_masks(settings, target, pipeline, num_masks, split_ratio, random_state=0, quiet=True) 187 | targets_and_pipelines.append((target, pipeline, feature_masks, classifier, classifier_name)) 188 | 189 | run_make_submission(settings, targets_and_pipelines, split_ratio) 190 | 191 | 192 | if __name__ == "__main__": 193 | main() 194 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2.7 2 | 3 | from multiprocessing import Pool 4 | import sys 5 | 6 | import numpy as np 7 | 8 | from seizure_prediction.classifiers import make_svm, make_lr, make_simple_lr 9 | from seizure_prediction.cross_validation.kfold_strategy import KFoldStrategy 10 | from seizure_prediction.cross_validation.legacy_strategy import LegacyStrategy 11 | from seizure_prediction.pipeline import Pipeline, 
FeatureConcatPipeline, InputSource 12 | from seizure_prediction.scores import get_score_summary, print_results 13 | from seizure_prediction.tasks import make_submission_csv, cross_validation_score, \ 14 | write_submission_file, check_training_data_loaded, check_test_data_loaded 15 | from seizure_prediction.transforms import FFT, Magnitude, Log10, Windower, \ 16 | Correlation, FreqCorrelation, FlattenChannels, \ 17 | Hurst, PFD, PIBSpectralEntropy, FreqBinning, HFD, Preprocess 18 | from seizure_prediction.settings import load_settings 19 | from seizure_prediction.fft_bins import * 20 | 21 | 22 | # cross_validation_strategy = KFoldStrategy() 23 | cross_validation_strategy = LegacyStrategy() 24 | 25 | 26 | def run_prepare_data_for_cross_validation(settings, targets, pipelines, quiet=False): 27 | if not quiet: print '\n'.join([p.get_name() for p in pipelines]) 28 | for i, pipeline in enumerate(pipelines): 29 | for j, target in enumerate(targets): 30 | if not quiet: print 'Running prepare data', 'P=%d/%d T=%d/%d' % (i+1, len(pipelines), j+1, len(targets)) 31 | check_training_data_loaded(settings, target, pipeline) 32 | 33 | 34 | def run_prepare_data_for_submission(settings, targets, pipelines): 35 | for pipeline in pipelines: 36 | for target in targets: 37 | print 'Running %s pipeline %s' % (target, pipeline.get_name()) 38 | check_training_data_loaded(settings, target, pipeline) 39 | check_test_data_loaded(settings, target, pipeline) 40 | 41 | 42 | def run_cross_validation(settings, targets, classifiers, pipelines): 43 | print 'Cross-validation task' 44 | print 'Targets', ', '.join(targets) 45 | print 'Pipelines:\n ', '\n '.join([p.get_name() for p in pipelines]) 46 | print 'Classifiers', ', '.join([c[1] for c in classifiers]) 47 | 48 | run_prepare_data_for_cross_validation(settings, targets, pipelines) 49 | 50 | # run on pool first, then show results after 51 | pool = Pool(settings.N_jobs) 52 | for i, pipeline in enumerate(pipelines): 53 | for j, (classifier, classifier_name) in enumerate(classifiers): 54 | for k, target in enumerate(targets): 55 | progress_str = 'P=%d/%d C=%d/%d T=%d/%d' % (i+1, len(pipelines), j+1, len(classifiers), k+1, len(targets)) 56 | cross_validation_score(settings, target, pipeline, classifier, classifier_name, 57 | strategy=cross_validation_strategy, pool=pool, progress_str=progress_str, return_data=False, quiet=True) 58 | pool.close() 59 | pool.join() 60 | 61 | summaries = [] 62 | best = {} 63 | for p_num, pipeline in enumerate(pipelines): 64 | for c_num, (classifier, classifier_name) in enumerate(classifiers): 65 | mean_scores = [] 66 | median_scores = [] 67 | datas = [] 68 | for target in targets: 69 | print 'Running %s pipeline %s classifier %s' % (target, pipeline.get_name(), classifier_name) 70 | data = cross_validation_score(settings, target, pipeline, classifier, classifier_name, 71 | strategy=cross_validation_strategy, quiet=True) 72 | datas.append(data) 73 | if data.mean_score != data.median_score: 74 | print '%.3f (mean)' % data.mean_score, data.mean_scores 75 | print '%.3f (median)' % data.median_score, data.median_scores 76 | else: 77 | print '%.3f' % data.mean_score 78 | mean_scores.append(data.mean_score) 79 | median_scores.append(data.median_score) 80 | 81 | best_score = best.get(target, [0, None, None, None])[0] 82 | cur_score = max(data.mean_score, data.median_score) 83 | if cur_score > best_score: 84 | best[target] = [cur_score, pipeline, classifier, classifier_name] 85 | 86 | name = 'p=%d c=%d %s mean %s' % (p_num, c_num, classifier_name, 
pipeline.get_name()) 87 | summary = get_score_summary(name, mean_scores) 88 | summaries.append((summary, np.mean(mean_scores))) 89 | print summary 90 | name = 'p=%d c=%d %s median %s' % (p_num, c_num, classifier_name, pipeline.get_name()) 91 | summary = get_score_summary(name, median_scores) 92 | summaries.append((summary, np.mean(median_scores))) 93 | print summary 94 | 95 | print_results(summaries) 96 | 97 | print '\nbest' 98 | for target in targets: 99 | pipeline = best[target][1] 100 | classifier_name = best[target][3] 101 | print target, best[target][0], classifier_name, pipeline.get_names() 102 | 103 | 104 | def run_make_submission(settings, targets, classifiers, pipelines): 105 | print 'Submissions task' 106 | print 'Targets', ', '.join(targets) 107 | print 'Pipelines', ', '.join([p.get_name() for p in pipelines]) 108 | print 'Classifiers', ', '.join([c[1] for c in classifiers]) 109 | 110 | run_prepare_data_for_submission(settings, targets, pipelines) 111 | 112 | pool = Pool(settings.N_jobs) 113 | for pipeline in pipelines: 114 | for classifier, classifier_name in classifiers: 115 | for target in targets: 116 | pool.apply_async(make_submission_csv, [settings, target, pipeline, classifier, classifier_name]) 117 | pool.close() 118 | pool.join() 119 | 120 | use_median_submissions = False 121 | 122 | for pipeline in pipelines: 123 | for classifier, classifier_name in classifiers: 124 | guesses_mean = ['clip,preictal'] 125 | guesses_median = ['clip,preictal'] 126 | for target in targets: 127 | print 'Target %s pipeline %s classifier %s' % (target, pipeline.get_name(), classifier_name) 128 | predictions_mean, predictions_median = make_submission_csv(settings, target, pipeline, classifier, classifier_name) 129 | guesses_mean += predictions_mean 130 | guesses_median += predictions_median 131 | 132 | mean_output = '\n'.join(guesses_mean) 133 | median_output = '\n'.join(guesses_median) 134 | 135 | out = [] 136 | if use_median_submissions and mean_output != median_output: 137 | out.append((mean_output, 'mean')) 138 | out.append((median_output, 'median')) 139 | else: 140 | out.append((mean_output, None)) 141 | 142 | for guesses, name in out: 143 | write_submission_file(settings, guesses, name, pipeline, classifier_name) 144 | 145 | 146 | 147 | def main(): 148 | 149 | settings = load_settings() 150 | 151 | targets = [ 152 | 'Dog_1', 153 | 'Dog_2', 154 | 'Dog_3', 155 | 'Dog_4', 156 | 'Dog_5', 157 | 'Patient_1', 158 | 'Patient_2' 159 | ] 160 | 161 | pipelines = [ 162 | FeatureConcatPipeline( 163 | Pipeline(InputSource(), Preprocess(), Windower(75), Correlation('none')), 164 | Pipeline(InputSource(), Preprocess(), Windower(75), FreqCorrelation(1, None, 'none')), 165 | 166 | Pipeline(InputSource(Preprocess(), Windower(75), FFT(), Magnitude()), FreqBinning(winning_bins, 'mean'), Log10(), FlattenChannels()), 167 | Pipeline(InputSource(Preprocess(), Windower(75), FFT(), Magnitude()), PIBSpectralEntropy([0.25, 1, 1.75, 2.5, 3.25, 4, 5, 8.5, 12, 15.5, 19.5, 24])), 168 | Pipeline(InputSource(Preprocess(), Windower(75), FFT(), Magnitude()), PIBSpectralEntropy([0.25, 2, 3.5, 6, 15, 24])), 169 | Pipeline(InputSource(Preprocess(), Windower(75), FFT(), Magnitude()), PIBSpectralEntropy([0.25, 2, 3.5, 6, 15])), 170 | Pipeline(InputSource(Preprocess(), Windower(75), FFT(), Magnitude()), PIBSpectralEntropy([0.25, 2, 3.5])), 171 | Pipeline(InputSource(Preprocess(), Windower(75), FFT(), Magnitude()), PIBSpectralEntropy([6, 15, 24])), 172 | Pipeline(InputSource(Preprocess(), Windower(75), FFT(), Magnitude()), 
PIBSpectralEntropy([2, 3.5, 6])), 173 | Pipeline(InputSource(Preprocess(), Windower(75), FFT(), Magnitude()), PIBSpectralEntropy([3.5, 6, 15])), 174 | 175 | Pipeline(InputSource(), Preprocess(), Windower(75), HFD(2)), 176 | Pipeline(InputSource(), Preprocess(), Windower(75), PFD()), 177 | Pipeline(InputSource(), Preprocess(), Windower(75), Hurst()), 178 | ), 179 | ] 180 | 181 | classifiers = [ 182 | make_svm(gamma=0.0079, C=2.7), 183 | make_svm(gamma=0.0068, C=2.0), 184 | make_svm(gamma=0.003, C=150.0), 185 | make_lr(C=0.04), 186 | make_simple_lr(), 187 | ] 188 | 189 | submission_pipelines = [ 190 | FeatureConcatPipeline( 191 | Pipeline(InputSource(), Preprocess(), Windower(75), Correlation('none')), 192 | Pipeline(InputSource(), Preprocess(), Windower(75), FreqCorrelation(1, None, 'none')), 193 | Pipeline(InputSource(Preprocess(), Windower(75), FFT(), Magnitude()), FreqBinning(winning_bins, 'mean'), Log10(), FlattenChannels()), 194 | Pipeline(InputSource(Preprocess(), Windower(75), FFT(), Magnitude()), PIBSpectralEntropy([0.25, 1, 1.75, 2.5, 3.25, 4, 5, 8.5, 12, 15.5, 19.5, 24])), 195 | Pipeline(InputSource(Preprocess(), Windower(75), FFT(), Magnitude()), PIBSpectralEntropy([0.25, 2, 3.5, 6, 15, 24])), 196 | Pipeline(InputSource(Preprocess(), Windower(75), FFT(), Magnitude()), PIBSpectralEntropy([0.25, 2, 3.5, 6, 15])), 197 | Pipeline(InputSource(Preprocess(), Windower(75), FFT(), Magnitude()), PIBSpectralEntropy([0.25, 2, 3.5])), 198 | Pipeline(InputSource(Preprocess(), Windower(75), FFT(), Magnitude()), PIBSpectralEntropy([6, 15, 24])), 199 | Pipeline(InputSource(Preprocess(), Windower(75), FFT(), Magnitude()), PIBSpectralEntropy([2, 3.5, 6])), 200 | Pipeline(InputSource(Preprocess(), Windower(75), FFT(), Magnitude()), PIBSpectralEntropy([3.5, 6, 15])), 201 | Pipeline(InputSource(), Preprocess(), Windower(75), HFD(2)), 202 | Pipeline(InputSource(), Preprocess(), Windower(75), PFD()), 203 | Pipeline(InputSource(), Preprocess(), Windower(75), Hurst()), 204 | ), 205 | ] 206 | 207 | submission_classifiers = [ 208 | make_simple_lr(), 209 | ] 210 | 211 | if len(sys.argv) >= 2 and sys.argv[1] == 'submission': 212 | run_make_submission(settings, targets, submission_classifiers, submission_pipelines) 213 | else: 214 | run_cross_validation(settings, targets, classifiers, pipelines) 215 | 216 | 217 | if __name__ == "__main__": 218 | main() 219 | 220 | -------------------------------------------------------------------------------- /seizure_prediction/data.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from common.data import jsdict 3 | from common.time import Timer 4 | import os.path 5 | from multiprocessing import Pool 6 | import h5py 7 | import sys 8 | import re 9 | import glob 10 | 11 | 12 | def read_hdf5_segment(file, key, start=None, end=None): 13 | dset = file[key] 14 | meta = {} 15 | for key, value in dset.attrs.iteritems(): 16 | meta[key] = value 17 | 18 | if start is None and end is None: 19 | X = dset[:] 20 | else: 21 | if start >= dset.shape[0]: 22 | return None 23 | if (start + 1 == end): 24 | X = dset[start] 25 | else: 26 | X = dset[start:end] 27 | 28 | return X, meta 29 | 30 | 31 | def write_hdf5_segment(file, key, data, meta=None): 32 | dset = file.create_dataset(key, data=data) 33 | 34 | if meta is not None: 35 | for key, value in meta.iteritems(): 36 | dset.attrs[key] = value 37 | # print key, value 38 | 39 | 40 | # NOTE(mike): just doing np.array(list_of_numpy_arrays) seems really slow, 41 | # This seems to be a bit 
faster. However I really need to do some benchmarking 42 | # to determine what is the fastest method. 43 | def to_np_array(X): 44 | if isinstance(X[0], np.ndarray): 45 | # return np.vstack(X) 46 | out = np.empty([len(X)] + list(X[0].shape), dtype=X[0].dtype) 47 | for i, x in enumerate(X): 48 | out[i] = x 49 | return out 50 | 51 | return np.array(X) 52 | 53 | # The worker method for a process to work on it's subset of the data. It will push 54 | # the data through the pipeline working on 1 segment at a time. Segments are pulled 55 | # in 1 at a time to keep working-set of memory to a minimum. 56 | def process_data_sub_job(filename_in, filename_out_fmt, id, num_jobs, process_data_fn): 57 | if not os.path.exists(filename_in): 58 | return 0 59 | 60 | pid = os.getpid() 61 | 62 | num_processed = 0 63 | for i in xrange(id, sys.maxint, num_jobs): 64 | 65 | filename_out = filename_out_fmt % i if filename_out_fmt is not None else None 66 | # Use temp filename then rename the completed file to the proper name. 67 | # This is more or less an atomic update. Cancelling the program should 68 | # never leave data in a half-written state. Hence only the tempfile 69 | # will be in a half-written state and the pid determines when the process 70 | # is still alive and still processing the data. An inactive pid means the 71 | # tempfile is trash and can be deleted. 72 | filename_out_temp = '%s.pid.%d.tmp' % (filename_out, pid) if filename_out is not None else None 73 | 74 | if filename_out is not None and os.path.exists(filename_out): 75 | num_processed += 1 76 | continue 77 | 78 | with h5py.File(filename_in, 'r') as f: 79 | segment = read_hdf5_segment(f, 'X', start=i, end=i+1) 80 | if segment is None: 81 | break 82 | X, meta = segment 83 | 84 | data_obj = {} 85 | for k, v in meta.iteritems(): 86 | data_obj[k] = v 87 | 88 | # save disk space 89 | if X.dtype != np.float32: 90 | X = X.astype(np.float32) 91 | 92 | X = process_data_fn(X, jsdict(data_obj)) 93 | 94 | if filename_out is not None: 95 | with h5py.File(filename_out_temp, 'w', libver='latest') as f: 96 | if X.dtype != np.float32: 97 | X = X.astype(np.float32) 98 | write_hdf5_segment(f, 'X', X) 99 | 100 | os.rename(filename_out_temp, filename_out) 101 | 102 | num_processed += 1 103 | 104 | return num_processed 105 | 106 | # filenames for single accumulated file 107 | def single_filename_builder(target, data_type, dir, tag=None): 108 | if tag is not None: 109 | filename = '%s_%s_%s.hdf5' % (target, data_type, tag) 110 | else: 111 | filename = '%s_%s.hdf5' % (target, data_type) 112 | 113 | return os.path.join(dir, filename) 114 | 115 | 116 | # filenames for individual segments before they get accumulated into one big file 117 | def segment_filename_builder(target, data_type, dir, tag=None): 118 | if tag is not None: 119 | filename = '%s_%s_%s_segment_%%d.hdf5' % (target, data_type, tag) 120 | else: 121 | filename = '%s_%s_segment_%%d.hdf5' % (target, data_type) 122 | 123 | return os.path.join(dir, filename) 124 | 125 | # glue code around process_data_sub_job to setup input/output destinations and the 126 | # processing method (applying pipeline on input data) 127 | def process_data_job(settings, target, data_type, id, num_jobs, pipeline): 128 | 129 | def process(data, meta): 130 | out = pipeline.apply(data, meta) 131 | return out 132 | 133 | input_source = pipeline.get_input_source() 134 | input_source_pipeline = input_source.get_pipeline() 135 | input_tag = input_source_pipeline.get_name() if input_source_pipeline is not None else None 136 | input_data_dir 
= settings.data_dir if input_tag is None else settings.cache_dir 137 | filename_in = single_filename_builder(target, data_type, input_data_dir, input_tag) 138 | filename_out_fmt = segment_filename_builder(target, data_type, settings.cache_dir, pipeline.get_name()) 139 | return process_data_sub_job(filename_in, filename_out_fmt, id, num_jobs, process_data_fn=process) 140 | 141 | # Accumulates N segments into a single file as it is faster to load data this way. 142 | def accumulate_data(settings, target, data_type, tag, output_to_original_data_dir=False, quiet=False, meta_only=False): 143 | output_dir = settings.data_dir if output_to_original_data_dir else settings.cache_dir 144 | filename_out = single_filename_builder(target, data_type, output_dir, tag) 145 | orig_filename_in = single_filename_builder(target, data_type, settings.data_dir) 146 | 147 | def collect_meta(filename): 148 | meta = {} 149 | with h5py.File(filename, 'r') as f: 150 | meta['num_segments'] = f['X'].shape[0] 151 | if 'sequence' in f.keys(): 152 | meta['sequence'] = f['sequence'][:] 153 | for k, v in f['X'].attrs.iteritems(): 154 | meta[k] = v 155 | return meta 156 | 157 | # load already processed output file 158 | if os.path.exists(filename_out): 159 | # pull meta off original data 160 | meta = collect_meta(orig_filename_in) 161 | 162 | # pull X data off processed data 163 | with h5py.File(filename_out, 'r') as f: 164 | meta['X_shape'] = f['X'].shape 165 | X = f['X'][:] if not meta_only else None 166 | if not quiet: print 'from cache ...', 167 | return X, jsdict(meta) 168 | else: 169 | # get ready to process all segments into 1 file, starting with getting the meta-data ready 170 | if not quiet: print 'processing ...', 171 | pid = os.getpid() 172 | filename_in_fmt = segment_filename_builder(target, data_type, output_dir, tag) 173 | 174 | orig_filename_in = single_filename_builder(target, data_type, settings.data_dir) 175 | 176 | # meta-data is collected differently when doing the first data conversion from mat to hdf5 177 | if output_to_original_data_dir: 178 | print 'Collecting metadata...' 179 | # Creating original files... pull metadata off first one, and also collect sequences 180 | meta = None 181 | sequence = [] 182 | num_segments = 0 183 | for i in xrange(0, sys.maxint, 1): 184 | filename = filename_in_fmt % i 185 | if not os.path.exists(filename): 186 | if num_segments == 0: 187 | print 'Could not find file ', filename 188 | sys.exit(1) 189 | break 190 | 191 | with h5py.File(filename, 'r') as f_in: 192 | meta_attrs = f_in['__metadata'].attrs 193 | if 'sequence' in meta_attrs: 194 | sequence.append(meta_attrs['sequence']) 195 | 196 | if meta is None: 197 | meta = {} 198 | meta['channels'] = f_in['channels'][:] 199 | for key in meta_attrs.keys(): 200 | if key != 'sequence': 201 | meta[key] = meta_attrs[key] 202 | num_segments += 1 203 | 204 | if len(sequence) > 0: 205 | meta['sequence'] = sequence 206 | 207 | meta['num_segments'] = num_segments 208 | 209 | print 'Accumulating segments...' 
210 | else: 211 | # pull metadata off the original data files 212 | meta = collect_meta(orig_filename_in) 213 | 214 | # now accumulate X data to a single file 215 | num_segments = meta['num_segments'] 216 | filename_out_temp = '%s.pid.%d.tmp' % (filename_out, pid) if filename_out is not None else None 217 | with h5py.File(filename_out_temp, 'w-', libver='latest') as f_out: 218 | X_out = None 219 | for i in xrange(num_segments): 220 | with h5py.File(filename_in_fmt % i, 'r') as f_in: 221 | X_in = f_in['X'] 222 | # init X_out 223 | if X_out is None: 224 | X_out = f_out.create_dataset('X', shape=[num_segments] + list(X_in.shape), dtype=X_in.dtype) 225 | meta['X_shape'] = X_out.shape 226 | for k, v in meta.iteritems(): 227 | X_out.attrs[k] = v 228 | 229 | X_out[i] = X_in[:] 230 | X = X_out[:] 231 | 232 | # finalize 233 | os.rename(filename_out_temp, filename_out) 234 | # clean up 235 | for i in xrange(num_segments): 236 | try: 237 | os.remove(filename_in_fmt % i) 238 | except: 239 | pass 240 | 241 | return X, jsdict(meta) 242 | 243 | 244 | # helper to check whether data exists in the data cache 245 | def data_exists(settings, target, data_type, pipeline): 246 | filename_out = single_filename_builder(target, data_type, settings.cache_dir, pipeline.get_name()) 247 | return os.path.exists(filename_out) 248 | 249 | 250 | # Multi-process data loading, data segments are processed through the given pipeline, then are accumulated 251 | # to a single file. 252 | # 253 | # check_only: returns True if data exists else false 254 | # quiet: suppress prints if True 255 | # meta_only: Actual X data is not fetched if meta_only is True, useful for light-weight data-loading 256 | # to check number of training samples or number of features. 257 | def load_data_mp(settings, target, data_type, pipeline, check_only=False, quiet=False, meta_only=False): 258 | filename_out = single_filename_builder(target, data_type, settings.cache_dir, pipeline.get_name()) 259 | filename_out_exists = os.path.exists(filename_out) 260 | if check_only: 261 | return filename_out_exists 262 | 263 | input_source = pipeline.get_input_source() 264 | input_source_pipeline = input_source.get_pipeline() 265 | if input_source_pipeline is not None: 266 | if not load_data_mp(settings, target, data_type, input_source_pipeline, check_only=True, quiet=quiet, meta_only=meta_only): 267 | if not quiet: print 'Preparing input source', input_source_pipeline.get_name() 268 | load_data_mp(settings, target, data_type, input_source_pipeline, check_only=False, quiet=quiet, meta_only=meta_only) 269 | if not quiet: print 'Input source ready' 270 | 271 | 272 | if not quiet: print 'Loading %s data ...' % data_type, 273 | timer = Timer() 274 | 275 | # TODO(mike): re-implement tmpfile cleanup that isn't really slow in the face of the genetic algorithm 276 | # spamming the disk with cross-validation score files. 
277 | 278 | # clear cache of tmp files 279 | # regex = re.compile(r""".*\.pid\.(\d+)""") 280 | # for file in glob.glob(os.path.join(settings.cache_dir, '*.tmp')): 281 | # match = regex.match(file) 282 | # assert match is not None 283 | # pid = int(match.group(1)) 284 | # try: 285 | # os.getpgid(pid) 286 | # except: 287 | # print 'Removing', file 288 | # os.remove(file) 289 | 290 | if not filename_out_exists: 291 | # DEBUG 292 | debug = False 293 | # debug = True 294 | if debug: 295 | print 'DEBUG' 296 | process_data_job(settings, target, data_type, 0, 1, pipeline) 297 | print 'Done' 298 | else: 299 | pool = Pool(settings.N_jobs) 300 | [pool.apply_async(process_data_job, [settings, target, data_type, i, settings.N_jobs, pipeline]) for i in range(settings.N_jobs)] 301 | pool.close() 302 | pool.join() 303 | 304 | accum, accum_meta = accumulate_data(settings, target, data_type, pipeline.get_name(), quiet=quiet, meta_only=meta_only) 305 | 306 | if not quiet: print 'prepared %d segments in %s %s %s' % (accum_meta.num_segments, timer.pretty_str(), accum_meta.X_shape, pipeline.get_name()) 307 | 308 | return accum, accum_meta 309 | 310 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Seizure Prediction 2 | 3 | This repository contains the code I used for the American Epilepsy Society Seizure's 4 | Prediction Challenge on Kaggle. 5 | 6 | http://www.kaggle.com/c/seizure-prediction 7 | 8 | As a side note this won't generate my exact submission as the randomness was affected 9 | after cleaning up the code. It doesn't score as well which demonstrates the fragility 10 | of my approach. I have also included the linear regression approach as used by 11 | Jonathan Tapson. It makes my genetic algorithm and random feature mask ensembling a 12 | little redundant, hence I use his approach in `main.py`, but demonstrate my own approaches 13 | in `genetic.py` and `ensemble.py` 14 | 15 | I discuss further down my genetic algorithm approach and the features I used. Taking a 16 | look at the code might also yield more insights. 17 | 18 | You probably need 100-150GB free disk space to run this code. 
19 | 20 | ###Hardware / OS platform used 21 | 22 | * 15" Retina MacBook Pro (Late 2013) 2.7GHz Core i7, 16GB RAM 23 | * OS X Mavericks 24 | * 512GB SSD 25 | 26 | ###Dependencies 27 | 28 | ####Required 29 | 30 | * Python 2.7 (I used built-in OS X Python 2.7.6) 31 | * scikit\_learn-0.15.2 32 | * numpy-1.9.0 33 | * pandas-0.14.1 34 | * scipy-0.14.0 35 | * h5py-2.3.1 36 | * hdf5 (see http://www.hdfgroup.org/HDF5) 37 | * deap-1.0 38 | 39 | ####Optional (to try out various data transforms) 40 | 41 | * spectrum (for auto-regressive model) 42 | 43 | ### SETTINGS.json 44 | 45 | ``` 46 | { 47 | "competition-data-dir": "data", 48 | "data-cache-dir": "data-cache", 49 | "submission-dir": "submissions", 50 | "num-jobs": "auto" 51 | } 52 | ``` 53 | 54 | * `competition-data-dir`: directory containing the downloaded competition data 55 | * `data-cache-dir`: directory the task framework will store cached data 56 | * `submission-dir`: directory submissions are written to 57 | * `num-jobs`: "auto" or integer specifying number of processes to use in multiprocessing Pool 58 | 59 | ### Getting started 60 | 61 | #### Preprocess data into hdf5 format 62 | 63 | First place the competition data under ./data/ (or as specified in SETTINGS.json) 64 | 65 | ``` 66 | data/Dog_1/Dog_1_preictal_segment_0001.mat 67 | data/Dog_1/Dog_1_preictal_segment_0002.mat 68 | ... 69 | 70 | ``` 71 | 72 | Then run the `mat_to_hdf5.py` script. 73 | 74 | ``` 75 | $ ./mat_to_hdf5.py 76 | Loading data ... 77 | Processing data/Dog_1_preictal.hdf5 ... 78 | Runner 0 processing data/Dog_1/Dog_1_preictal_segment_0001.mat 79 | Runner 1 processing data/Dog_1/Dog_1_preictal_segment_0002.mat 80 | Runner 2 processing data/Dog_1/Dog_1_preictal_segment_0003.mat 81 | Runner 3 processing data/Dog_1/Dog_1_preictal_segment_0004.mat 82 | Runner 4 processing data/Dog_1/Dog_1_preictal_segment_0005.mat 83 | Runner 5 processing data/Dog_1/Dog_1_preictal_segment_0006.mat 84 | Runner 6 processing data/Dog_1/Dog_1_preictal_segment_0007.mat 85 | Runner 7 processing data/Dog_1/Dog_1_preictal_segment_0008.mat 86 | ... 87 | ``` 88 | 89 | This took ~38 minutes to run on my machine to process all the patients. After this is done you 90 | can feel free to delete the original matlab files as my code generates hdf5 files to replace them. 91 | 92 | All patients have their signals decimated down to 200Hz to save disk space and improve processing times. 93 | 94 | #### Run cross-validation with full-features 95 | ``` 96 | ./main.py 97 | ``` 98 | 99 | #### Make a submission 100 | ``` 101 | ./main.py submission 102 | ``` 103 | 104 | This takes ~30 minutes on my machine with an empty data-cache. 105 | 106 | ### Three build variants (main/ensemble/genetic) 107 | 108 | ### main.py 109 | 110 | This file contains the initial standard setup training per-patient models and not doing any 111 | sub-feature selection. The default selected classifier for submission is linear regression. 112 | A list of classifiers are used in cross-validation to compare scores. 113 | 114 | ``` 115 | ./main.py 116 | ./main.py submission 117 | ``` 118 | 119 | ### ensemble.py 120 | 121 | This file contains the ensemble variant, generating N random feature masks, training N models 122 | per-patient, and then averaging those N models predictions. I did not find the cross-validation 123 | to be of much use but I left it in anyway. For submission this lead to better scores than 124 | `main.py` when using SVC with specific parameters `gamma=0.0079` and `C=2.7`. 
For these 125 | parameters `main.py` would achieve around 0.796 on public LB, and this ensembling approach 126 | would achieve around 0.829. I later learned that using different parameters `gamma=0.003` and 127 | `C=150.0` I could achieve similar scores around 0.829 without any ensembling. 128 | 129 | I mostly used N=10 masks. 130 | 131 | ``` 132 | ./ensemble.py 133 | ./ensemble.py submission 134 | ``` 135 | 136 | ### genetic.py 137 | 138 | This file contains my genetic algorithm approach. This is what I used for my 5th place submission. 139 | However the code as it is right now will not generate my exact submission as I renamed some of the 140 | transforms which changed some orderings and randomness which led to different CV results and 141 | ultimately different selected feature masks. It doesn't score too far off though. 142 | 143 | The genetic algorithm starts with population size of 30 and runs for 10 generations. The population 144 | is initialised with random feature masks consisting of roughly 55% features activated and the other 145 | 45% masked away. The fitness function is simply CV ROC AUC score. 146 | 147 | This is quite slow, taking on the order of 1-2 hours to run. I also ran 3 sets of genetic algorithm, 148 | each using a different subset of the features. I believe this to more or less just be myself optimising 149 | random chance against the public LB. 150 | 151 | Other than the 3 feature groups, other features which appeared to not benefit from the genetic algorithm 152 | instead used random feature masks. Two masks were used for each feature group, 2 of the best masks 153 | for each of the GA groups and 2 random masks for the random groups. Again optimising against the 154 | leaderboard, a 52.5% active features ratio was used for the random feature masks. 155 | 156 | To be honest this is all a bit of voodoo, and using the linear regression approach more or less makes 157 | all of this a waste of time. Later testing showed that only Dog\_3 and Dog\_4 really benefited from 158 | the sub-feature mask ensembling. Dog\_1 showed little change, Dog\_2 a very minor improvement, I 159 | didn't test Dog\_5. Patient\_1 and Patient\_2 actually always performed worse when using sub-feature 160 | masks whether genetic or random. There was correlation in training sample size and having a benefit 161 | from feature masks, so I didn't use feature masks when the number of training samples was less than 162 | 500 (excludes Patient 1 and 2). More testing needs to be done to actually verify that's the right 163 | thing to do. 164 | 165 | ``` 166 | ./genetic.py 167 | ./genetic.py submission 168 | ``` 169 | 170 | ### Features used 171 | 172 | * Time correlation matrix upper right triangle and sorted eigenvalues 173 | * Frequency correlation matrix upper right triangle and sorted eigenvalues (omits 0Hz bucket) 174 | * FFT Magnitude Log10 buckets for various ranges (see code below), where the power-in-band is calculated between the specified frequencies. The power-in-band is actually the average and not the sum. I saw minor boosts to perform Log10 after calculating power-in-band. 175 | * Power-in-band spectral entropies 176 | * Higuchi fractal dimension with kmax=2 177 | * Petrosian fractal dimension 178 | * Hurst exponent 179 | 180 | Code doc in `seizure_prediction/transforms.py` contains more information. 
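To make the FFT power-in-band idea concrete, here is a minimal sketch (illustration only — the real `FreqBinning`/`Log10` transforms live in `seizure_prediction/transforms.py`, and the helper name below is made up). It averages, rather than sums, the FFT magnitudes within each band and then takes Log10:

```
import numpy as np

# Sketch only: not the actual FreqBinning/Log10 transforms.
# Assumes a single channel sampled at 200Hz (the rate the segments are decimated to).
def power_in_band_log10(signal, band_edges, sampling_rate=200.0):
    mag = np.abs(np.fft.rfft(signal))
    freqs = np.fft.rfftfreq(len(signal), d=1.0 / sampling_rate)
    # average (not sum) the magnitudes between consecutive band edges
    pib = [np.mean(mag[(freqs >= lo) & (freqs < hi)])
           for lo, hi in zip(band_edges[:-1], band_edges[1:])]
    return np.log10(pib)

winning_bins = [0.5, 2.25, 4, 5.5, 7, 9.5, 12, 21, 30, 39, 48]
print power_in_band_log10(np.random.randn(120000), winning_bins)
```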
181 | 182 | In the code all these features are specified and joined together like so: 183 | ``` 184 | FeatureConcatPipeline( 185 | Pipeline(InputSource(), Preprocess(), Windower(75), Correlation('none')), 186 | Pipeline(InputSource(), Preprocess(), Windower(75), FreqCorrelation(1, None, 'none')), 187 | Pipeline(InputSource(Preprocess(), Windower(75), FFT(), Magnitude()), FreqBinning([0.5, 2.25, 4, 5.5, 7, 9.5, 12, 21, 30, 39, 48], 'mean'), Log10(), FlattenChannels()), 188 | Pipeline(InputSource(Preprocess(), Windower(75), FFT(), Magnitude()), PIBSpectralEntropy([0.25, 1, 1.75, 2.5, 3.25, 4, 5, 8.5, 12, 15.5, 19.5, 24])), 189 | Pipeline(InputSource(Preprocess(), Windower(75), FFT(), Magnitude()), PIBSpectralEntropy([0.25, 2, 3.5, 6, 15, 24])), 190 | Pipeline(InputSource(Preprocess(), Windower(75), FFT(), Magnitude()), PIBSpectralEntropy([0.25, 2, 3.5, 6, 15])), 191 | Pipeline(InputSource(Preprocess(), Windower(75), FFT(), Magnitude()), PIBSpectralEntropy([0.25, 2, 3.5])), 192 | Pipeline(InputSource(Preprocess(), Windower(75), FFT(), Magnitude()), PIBSpectralEntropy([6, 15, 24])), 193 | Pipeline(InputSource(Preprocess(), Windower(75), FFT(), Magnitude()), PIBSpectralEntropy([2, 3.5, 6])), 194 | Pipeline(InputSource(Preprocess(), Windower(75), FFT(), Magnitude()), PIBSpectralEntropy([3.5, 6, 15])), 195 | Pipeline(InputSource(), Preprocess(), Windower(75), HFD(2)), 196 | Pipeline(InputSource(), Preprocess(), Windower(75), PFD()), 197 | Pipeline(InputSource(), Preprocess(), Windower(75), Hurst()), 198 | ) 199 | 200 | ``` 201 | 202 | ### Pipelines and data transformations 203 | 204 | #### Pipeline 205 | 206 | A `Pipeline` is a series of transforms to be applied to the source data. All the transforms I've implemented 207 | can be found under `seizure_prediction/transforms.py` 208 | 209 | Once data has been passed through the pipeline, the output is saved in the `data-cache` directory and 210 | can be reloaded almost instantly next time (a few millseconds on my machine). 211 | 212 | ``` 213 | Pipeline(Windower(75), Correlation()) 214 | ``` 215 | 216 | One particularly useful pipeline is the FFT magnitude. It is generally the first step of many spectral 217 | transforms such as just raw magnitudes or spectral entropy. Recalculating the FFT for all of these 218 | pipelines over and over again is slow and wasteful. Which leads me to... 219 | 220 | #### InputSource 221 | 222 | It's much faster to load up previously processed data and reuse it than to compute it every time. 223 | The `InputSource()` class lets you specify where you want the data to be loaded from. No argument 224 | means the original time-series data. If you specify a pipeline, it will load it from there instead. 225 | If you look up a bit in the features section you can see the InputSource being used to load 226 | previously-computed FFT data. 227 | 228 | I haven't found another use for this yet other than the FFT data, but it was worth it alone for that. 229 | The only time I don't use it for FFT data is for frequency correlation. I store everything in the data 230 | cache as float32, and this seems to cause issues with the `Correlation` transformation having more 231 | issues with NaNs etc. So for now `FreqCorrelation` does duplicate FFT work. 
232 | 233 | Replacing: 234 | 235 | ``` 236 | Pipeline(InputSource(Preprocess(), Windower(75), FFT(), Magnitude()), Slice(1, None), Correlation('none')), 237 | ``` 238 | with 239 | ``` 240 | Pipeline(InputSource(), Preprocess(), Windower(75), FreqCorrelation(1, None, 'none')), 241 | ``` 242 | 243 | is low-hanging fruit. It just needs to be verified that the classification performance is not worse. 244 | I was lazy in replacing it as I had already computed these transforms weeks earlier so it didn't 245 | bother me too much. It does however slow down from-scratch data processing which needs to do the 246 | extra work, such as when you clone this repo or if you clear the data cache to free up some disk space. 247 | 248 | More examples: 249 | ``` 250 | InputSource() 251 | InputSource(Preprocess(), FFT(), Magnitude()) 252 | InputSource(Preprocess(), Windower(75), FFT(), Magnitude()) 253 | ``` 254 | 255 | Also note that this can chew up a lot of disk space for caching these results. 256 | 257 | #### FeatureConcatPipeline 258 | 259 | It's nice and clean to specify individual transforms and pipelines. However it's very practical to combine features. The `FeatureConcatPipeline` does exactly this. It will load each pipeline individually, then concatenate all the features together. 260 | 261 | ``` 262 | FeatureConcatPipeline( 263 | Pipeline(Windower(75), Correlation()), 264 | Pipeline(Windower(75), Hurst()) 265 | ) 266 | ``` 267 | 268 | #### Safe to kill whenever you like 269 | 270 | You can kill the program without fear of losing much progress. A unit of work for the data processing is a single segment (equivalent to one of the original matlab file segments) and a unit of work for the cross-validation is one fold. Results are saved to the data cache and things can pick up where they left off last time automatically. 271 | 272 | There is one caveat however, there's a bug with Python multiprocessing pools and KeyboardInterrupt. I run my code from IntelliJ 14 Ultimate so I don't have a problem, but if you Ctrl-C from the commandline the pool doesn't exit properly so killing from the commandline is a bit of a pain and I have just been using `killall Python` for the time being to get around it. Not ideal, but not generally an issue for me given I use IntelliJ. 273 | 274 | ### Cross-validation strategies 275 | 276 | I have implemented two cross-validation strategies, both based on using folds. 277 | 278 | #### LegacyStrategy 279 | 280 | Found in `seizure_prediction/cross_validation/legacy_strategy.py` 281 | 282 | This strategy uses 3 folds per target, using hand-picked random seeds that seemed to give good 283 | results on my system. I'm not sure this will even work well on other peoples' systems if the 284 | random seeds generate different folds. This is what I used for the whole competition hence the 285 | legacy name so I've left it in there. 286 | 287 | #### KFoldStrategy 288 | 289 | Found in `seizure_prediction/cross_validation/kfold_strategy.py` 290 | 291 | This was a post-competition half-hearted attempt to build a more robust K-fold cross-validation 292 | setup. The selected sequences do not rely on random seeds, and instead I roughly hand-picked 293 | (via an algorithm) a good number of folds and also a good selection across the preictal sequences 294 | that somewhat maximises the coverage of the preictal set. 295 | 296 | For example, given 3 sequences in the preictal set it will use 3 folds `[(0, 1), (0, 2), (1,2)]`. 297 | For 6 sequences and 3 folds it will use `[(0, 1), (2, 3), (4, 5)]`. 
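As a rough illustration of that pairing behaviour (a sketch under my own assumptions, not the actual code in `kfold_strategy.py` — the helper name is hypothetical), something like this reproduces both examples:

```
from itertools import combinations

# Sketch only: pick which preictal sequence indices to hold out in each CV fold.
def pick_holdout_pairs(num_sequences, num_folds):
    all_pairs = list(combinations(range(num_sequences), 2))
    if len(all_pairs) <= num_folds:
        # few sequences: use every pair, e.g. 3 sequences -> [(0, 1), (0, 2), (1, 2)]
        return all_pairs
    # otherwise spread coverage with disjoint consecutive pairs,
    # e.g. 6 sequences, 3 folds -> [(0, 1), (2, 3), (4, 5)]
    return [(2 * i, 2 * i + 1) for i in range(num_folds)]

print pick_holdout_pairs(3, 3)  # [(0, 1), (0, 2), (1, 2)]
print pick_holdout_pairs(6, 3)  # [(0, 1), (2, 3), (4, 5)]
```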
298 | 299 | It seems to roughly work okay now, but I've never had much trust in the cross-validation scores 300 | versus the leaderboard scores given that the test set is generally much bigger than the given 301 | training data. 302 | 303 | ### Misc 304 | 305 | I haven't fully cleaned up the code as much as I could, nor documented it as much as I could. 306 | I cleaned it up enough and tried to describe enough that you could take this code base and try 307 | out new transforms etc without too much difficulty. 308 | 309 | If you clone this repo, you will probably want to start looking at `main.py` and it should 310 | hopefully be straightforward to get things going. 311 | 312 | Feel free to message me with any questions. 313 | -------------------------------------------------------------------------------- /genetic.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2.7 2 | 3 | import random 4 | from multiprocessing import Pool 5 | import sys 6 | 7 | import numpy as np 8 | from deap import creator, base, tools 9 | 10 | from seizure_prediction.classifiers import make_svm 11 | from seizure_prediction.cross_validation.legacy_strategy import LegacyStrategy 12 | from seizure_prediction.feature_selection import generate_feature_masks 13 | from seizure_prediction.pipeline import Pipeline, FeatureConcatPipeline, InputSource 14 | from seizure_prediction.scores import get_score_summary, print_results 15 | from seizure_prediction.tasks import load_training_data, make_csv_for_target_predictions, write_submission_file, \ 16 | cross_validation_score, check_training_data_loaded, check_test_data_loaded, make_submission_predictions, \ 17 | calc_feature_mask_string 18 | from seizure_prediction.transforms import Windower, Correlation, FreqCorrelation, FFT, \ 19 | Magnitude, PIBSpectralEntropy, Log10, FreqBinning, FlattenChannels, PFD, HFD, Hurst, Preprocess 20 | from seizure_prediction.settings import load_settings 21 | from main import run_prepare_data_for_cross_validation 22 | from seizure_prediction.fft_bins import * 23 | 24 | 25 | cross_validation_strategy = LegacyStrategy() 26 | 27 | 28 | def evaluate_fitness_score(settings, target, pipeline, classifier, classifier_name, quiet, arg): 29 | individual, best_score = arg 30 | if np.sum(individual) == 0: 31 | score = 0.0 32 | else: 33 | score = float(cross_validation_score(settings, target, pipeline, classifier, classifier_name, 34 | strategy=cross_validation_strategy, feature_mask=individual, quiet=True).mean_score) 35 | 36 | if score > best_score: 37 | if not quiet: print score, np.sum(individual) 38 | return score, 39 | 40 | 41 | creator.create("RocAucMax", base.Fitness, weights=(1.0,)) 42 | creator.create("Individual", list, fitness=creator.RocAucMax) 43 | 44 | 45 | def random_bool(threshold): 46 | return 1 if random.random() <= threshold else 0 47 | 48 | 49 | def get_pipeline_data(settings, target, pipeline): 50 | data = load_training_data(settings, target, pipeline, check_only=False, quiet=True) 51 | num_features = data.X_train.shape[data.X_train.ndim-1] 52 | return num_features, data.num_train_segments 53 | 54 | 55 | def process_target(settings, target, pipeline, classifier, classifier_name, ratio, ngen, quiet, threshold=400): 56 | # make results repeatable 57 | random.seed(0) 58 | 59 | num_features, num_training_examples = get_pipeline_data(settings, target, pipeline) 60 | 61 | # Using sub-feature selection for the human patients appears to perform worse than 62 | # using full feature set. 
My guess is that perhaps there is not enough training samples 63 | # for this technique to work effectively. So do not run GA if there are too few training 64 | # samples. The threshold parameter can be tweaked with more testing. 65 | if num_training_examples < threshold: 66 | score = float(cross_validation_score(settings, target, pipeline, classifier, classifier_name, 67 | strategy=cross_validation_strategy, quiet=True).mean_score) 68 | return score, [[1] * num_features] 69 | 70 | num_wanted_features = int(num_features * ratio) 71 | if not quiet: print 'ratio', ratio 72 | if not quiet: print 'num features', num_features 73 | if not quiet: print 'num wanted features', num_wanted_features 74 | 75 | if not quiet: print target, classifier_name 76 | 77 | pool = Pool(settings.N_jobs) 78 | 79 | toolbox = base.Toolbox() 80 | toolbox.register("map", pool.map) 81 | toolbox.register("attr_bool", random_bool, ratio) 82 | toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_bool, num_features) 83 | toolbox.register("population", tools.initRepeat, list, toolbox.individual) 84 | 85 | toolbox.register("evaluate", evaluate_fitness_score, settings, target, pipeline, classifier, classifier_name, quiet) 86 | toolbox.register("mate", tools.cxTwoPoint) 87 | toolbox.register("mutate", tools.mutFlipBit, indpb=0.05) 88 | toolbox.register("select", tools.selTournament, tournsize=3) 89 | 90 | pop = toolbox.population(n=30) 91 | CXPB, MUTPB, NGEN = 0.5, 0.2, ngen 92 | 93 | best_score = 0 94 | best_feature_mask = None 95 | all_feature_masks = {} 96 | 97 | # Evaluate the entire population 98 | if not quiet: print 'evaluating pop %d' % len(pop) 99 | fitnesses = toolbox.map(toolbox.evaluate, [(ind, 1.0) for ind in pop]) 100 | if not quiet: print 'done evaluating' 101 | 102 | for ind, fit in zip(pop, fitnesses): 103 | ind.fitness.values = fit 104 | all_feature_masks[calc_feature_mask_string(ind)] = (list(ind), fit[0]) 105 | 106 | # calc first best 107 | fits = [ind.fitness.values[0] for ind in pop] 108 | best_index = np.argmax(fits) 109 | score = fits[best_index] 110 | if score > best_score: 111 | best_score = score 112 | best_feature_mask = pop[best_index] 113 | if not quiet: print 'new best', best_score, np.sum(best_feature_mask) 114 | 115 | # Begin the evolution 116 | for g in range(NGEN): 117 | if not quiet: print("-- %s: Generation %i --" % (target, g)) 118 | 119 | # Select the next generation individuals 120 | offspring = toolbox.select(pop, int(len(pop))) 121 | # Clone the selected individuals 122 | offspring = list(toolbox.map(toolbox.clone, offspring)) 123 | 124 | # Apply crossover and mutation on the offspring 125 | for child1, child2 in zip(offspring[::2], offspring[1::2]): 126 | if random.random() < CXPB: 127 | toolbox.mate(child1, child2) 128 | del child1.fitness.values 129 | del child2.fitness.values 130 | 131 | for mutant in offspring: 132 | if random.random() < MUTPB: 133 | toolbox.mutate(mutant) 134 | del mutant.fitness.values 135 | 136 | # Evaluate the individuals with an invalid fitness 137 | invalid_ind = [ind for ind in offspring if not ind.fitness.valid] 138 | fitnesses = toolbox.map(toolbox.evaluate, [(ind, best_score) for ind in invalid_ind]) 139 | for ind, fit in zip(invalid_ind, fitnesses): 140 | ind.fitness.values = fit 141 | all_feature_masks[calc_feature_mask_string(ind)] = (list(ind), fit[0]) 142 | 143 | if not quiet: print(" Evaluated %i individuals (pop size %d)" % (len(invalid_ind), len(offspring))) 144 | 145 | # The population is entirely replaced by the 
offspring 146 | pop[:] = offspring 147 | 148 | # Gather all the fitnesses in one list and print the stats 149 | fits = [ind.fitness.values[0] for ind in pop] 150 | best_index = np.argmax(fits) 151 | all_f = [np.sum(ind) for ind in pop] 152 | if not quiet: print ' %s, %s, %s (%d-%d)' % (target, fits[best_index], np.sum(pop[best_index]), np.min(all_f), np.max(all_f)) 153 | 154 | length = len(pop) 155 | mean = sum(fits) / length 156 | 157 | if not quiet: print(" Min %s" % min(fits)) 158 | if not quiet: print(" Max %s" % max(fits)) 159 | if not quiet: print(" Avg %s" % mean) 160 | 161 | score = fits[best_index] 162 | if score > best_score: 163 | best_score = score 164 | best_feature_mask = pop[best_index] 165 | if not quiet: print 'new best', best_score, np.sum(best_feature_mask) 166 | 167 | if not quiet: print("-- End of (successful) evolution --") 168 | 169 | best_ind = tools.selBest(pop, 1)[0] 170 | if not quiet: print "-- Finished --\n%s\n%s\n%s" % (target, best_ind.fitness.values[0], best_ind) 171 | 172 | pop = list(all_feature_masks.values()) 173 | pop.sort(cmp=lambda x1, x2: cmp(x2[1], x1[1])) 174 | sorted_pop = [ind for ind, score in pop] 175 | print target, 'best', pop[0][1], 'worst', pop[-1][1] 176 | 177 | return best_score, sorted_pop 178 | 179 | 180 | def run_make_submission(settings, targets_and_pipelines, classifier, classifier_name): 181 | pool = Pool(settings.N_jobs) 182 | for i, (target, pipeline, feature_masks) in enumerate(targets_and_pipelines): 183 | for j, feature_mask in enumerate(feature_masks): 184 | progress_str = 'T=%d/%d M=%d/%d' % (i+1, len(targets_and_pipelines), j+1, len(feature_masks)) 185 | pool.apply_async(make_submission_predictions, [settings, target, pipeline, classifier, classifier_name], {'feature_mask': feature_mask, 'quiet': True, 'progress_str': progress_str}) 186 | pool.close() 187 | pool.join() 188 | 189 | guesses = ['clip,preictal'] 190 | for target, pipeline, feature_masks in targets_and_pipelines: 191 | test_predictions = [] 192 | 193 | for feature_mask in feature_masks: 194 | data = make_submission_predictions(settings, target, pipeline, classifier, classifier_name, feature_mask=feature_mask) 195 | test_predictions.append(data.mean_predictions) 196 | 197 | predictions = np.mean(test_predictions, axis=0) 198 | guesses += make_csv_for_target_predictions(target, predictions) 199 | 200 | output = '\n'.join(guesses) 201 | submission_targets_and_pipelines = [(target, pipeline, feature_masks, classifier, classifier_name) 202 | for target, pipeline, feature_masks in targets_and_pipelines] 203 | write_submission_file(settings, output, None, None, classifier_name, submission_targets_and_pipelines) 204 | 205 | 206 | def run_prepare_data(settings, targets_and_pipelines, train=True, test=False): 207 | for target, pipeline, feature_masks in targets_and_pipelines: 208 | if train: 209 | check_training_data_loaded(settings, target, pipeline) 210 | if test: 211 | check_test_data_loaded(settings, target, pipeline) 212 | 213 | 214 | def extract_masks_for_pipeline_and_masks(settings, target, pipeline, masks): 215 | outs = [{} for mask in masks] 216 | offset = 0 217 | for p in pipeline.get_pipelines(): 218 | num_features, _ = get_pipeline_data(settings, target, p) 219 | for i, mask in enumerate(masks): 220 | p_mask = mask[offset:offset + num_features] 221 | outs[i][p.get_name()] = p_mask 222 | offset += num_features 223 | for mask in masks: 224 | assert offset == len(mask) 225 | return outs 226 | 227 | 228 | def merge_dicts(*dicts): 229 | x = dicts[0].copy() 230 | for 
d in dicts[1:]: 231 | x.update(d) 232 | return x 233 | 234 | 235 | def get_submission_targets_and_masks(settings, targets, classifier, classifier_name, pipeline_groups, random_pipelines, random_ratio=0.525, ngen=10, limit=2, random_limit=2): 236 | assert random_limit % limit == 0 237 | random_multiplier = random_limit / limit 238 | quiet = True 239 | 240 | random_pipeline = FeatureConcatPipeline(*random_pipelines) 241 | 242 | all_pipelines = [] 243 | all_pipelines.extend(random_pipelines) 244 | for pg, ratio in pipeline_groups: 245 | all_pipelines.extend(pg) 246 | full_pipeline = FeatureConcatPipeline(*all_pipelines) 247 | run_prepare_data(settings, [(target, full_pipeline, []) for target in targets], test=True) 248 | 249 | def get_pipeline_and_feature_masks(target, pipelines, classifier, classifier_name, ratio, ngen): 250 | print target, 'fetching GA pipelines', [p.get_name() for p in pipelines] 251 | pipeline = FeatureConcatPipeline(*pipelines) 252 | score, best_N = process_target(settings, target, pipeline, classifier, classifier_name, ratio=ratio, ngen=ngen, quiet=quiet) 253 | return pipeline, best_N 254 | 255 | targets_and_pipelines = [] 256 | for target in targets: 257 | # NOTE(mike): All this stuff is a bit nasty. It gets the random-masks and the genetic-masks 258 | # for different pipelines, and then pulls out the mask for each individual pipeline. A single 259 | # FeatureConcatPipeline is then created to represent all the features, and the masks for each 260 | # member of the FCP are merged together to form the single feature mask across the whole FCP. 261 | 262 | random_masks = generate_feature_masks(settings, target, random_pipeline, random_limit, random_ratio, random_state=0, quiet=quiet) 263 | # contains a list of pairs, (pipeline, mask) 264 | ga_groups = [get_pipeline_and_feature_masks(target, p, classifier, classifier_name, ratio, ngen) for p, ratio in pipeline_groups] 265 | ga_groups = [(p, masks[0:limit]) for p, masks in ga_groups] 266 | 267 | print target, 'extracting GA per-pipeline masks...' 268 | # contains a list of mask dictionaries 269 | ga_dicts = [extract_masks_for_pipeline_and_masks(settings, target, pipeline, masks) for pipeline, masks in ga_groups] 270 | ga_dicts = [mask_dicts * random_multiplier for mask_dicts in ga_dicts] 271 | 272 | r_dicts = extract_masks_for_pipeline_and_masks(settings, target, random_pipeline, random_masks) 273 | # this contains a list of dictionaries which maps pipeline names to masks 274 | # e.g. [r_dicts, ga_dicts0, ga_dicts1, ...] 275 | zip_group = [r_dicts] + ga_dicts 276 | 277 | print target, 'merging all masks...' 
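        # e.g. with the default of two final masks: zip(*zip_group) lines up the i-th dict
        # from r_dicts with the i-th dict from each GA group, and merge_dicts folds them
        # into a single {pipeline_name: sub_mask} dict per final feature mask.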
278 | feature_mask_dicts = [merge_dicts(*x) for x in zip(*zip_group)] 279 | 280 | feature_masks = [] 281 | for feature_mask_dict in feature_mask_dicts: 282 | mask = [] 283 | for p in full_pipeline.get_pipelines(): 284 | mask.extend(feature_mask_dict[p.get_name()]) 285 | feature_masks.append(mask) 286 | 287 | targets_and_pipelines.append((target, full_pipeline, feature_masks)) 288 | return targets_and_pipelines 289 | 290 | 291 | def main(): 292 | settings = load_settings() 293 | 294 | targets = [ 295 | 'Dog_1', 296 | 'Dog_2', 297 | 'Dog_3', 298 | 'Dog_4', 299 | 'Dog_5', 300 | 'Patient_1', 301 | 'Patient_2' 302 | ] 303 | 304 | # The genetic algorithm will be run individually on each pipeline group 305 | pipeline_groups = [ 306 | ([ 307 | Pipeline(InputSource(), Preprocess(), Windower(75), PFD()), 308 | ], 0.55), 309 | ([ 310 | Pipeline(InputSource(), Preprocess(), Windower(75), Hurst()), 311 | ], 0.55), 312 | ([ 313 | Pipeline(InputSource(Preprocess(), Windower(75), FFT(), Magnitude()), PIBSpectralEntropy([0.25, 1, 1.75, 2.5, 3.25, 4, 5, 8.5, 12, 15.5, 19.5, 24])), 314 | Pipeline(InputSource(Preprocess(), Windower(75), FFT(), Magnitude()), PIBSpectralEntropy([0.25, 2, 3.5, 6, 15, 24])), 315 | Pipeline(InputSource(Preprocess(), Windower(75), FFT(), Magnitude()), PIBSpectralEntropy([0.25, 2, 3.5, 6, 15])), 316 | Pipeline(InputSource(Preprocess(), Windower(75), FFT(), Magnitude()), PIBSpectralEntropy([0.25, 2, 3.5])), 317 | Pipeline(InputSource(Preprocess(), Windower(75), FFT(), Magnitude()), PIBSpectralEntropy([6, 15, 24])), 318 | Pipeline(InputSource(Preprocess(), Windower(75), FFT(), Magnitude()), PIBSpectralEntropy([2, 3.5, 6])), 319 | Pipeline(InputSource(Preprocess(), Windower(75), FFT(), Magnitude()), PIBSpectralEntropy([3.5, 6, 15])), 320 | Pipeline(InputSource(), Preprocess(), Windower(75), HFD(2)), 321 | ], 0.55), 322 | ] 323 | 324 | make_submission = len(sys.argv) >= 2 and sys.argv[1] == 'submission' 325 | run_ga = not make_submission 326 | 327 | # This classifier is used in the genetic algorithm 328 | ga_classifier, ga_classifier_name = make_svm(gamma=0.0079, C=2.7) 329 | 330 | if run_ga: 331 | quiet = False 332 | summaries = [] 333 | for ngen in [10]: 334 | for pipelines, ratio in pipeline_groups: 335 | out = [] 336 | for target in targets: 337 | print 'Running target', target 338 | run_prepare_data_for_cross_validation(settings, [target], pipelines, quiet=True) 339 | pipeline = FeatureConcatPipeline(*pipelines) 340 | score, best_N = process_target(settings, target, pipeline, ga_classifier, ga_classifier_name, ratio=ratio, ngen=ngen, quiet=quiet) 341 | print target, score, [np.sum(mask) for mask in best_N[0:10]] 342 | out.append((target, score, pipeline, best_N)) 343 | 344 | scores = np.array([score for _, score, _, _ in out]) 345 | summary = get_score_summary('%s ngen=%d' % (ga_classifier_name, ngen), scores) 346 | summaries.append((summary, np.mean(scores))) 347 | print summary 348 | 349 | print_results(summaries) 350 | 351 | if make_submission: 352 | random_pipelines = [ 353 | Pipeline(InputSource(), Preprocess(), Windower(75), Correlation('none')), 354 | Pipeline(InputSource(), Preprocess(), Windower(75), FreqCorrelation(1, None, 'none')), 355 | Pipeline(InputSource(Preprocess(), Windower(75), FFT(), Magnitude()), FreqBinning(winning_bins, 'mean'), Log10(), FlattenChannels()), 356 | ] 357 | 358 | # These classifiers are used to make the final predictions 359 | final_classifiers = [ 360 | # make_svm(gamma=0.0079, C=2.7), 361 | make_svm(gamma=0.0068, C=2.0), 362 | # 
make_svm(gamma=0.003, C=150.0), 363 | # make_lr(C=0.04), 364 | # make_simple_lr(), 365 | ] 366 | targets_and_pipelines = get_submission_targets_and_masks(settings, targets, ga_classifier, ga_classifier_name, pipeline_groups, random_pipelines) 367 | for classifier, classifier_name in final_classifiers: 368 | run_make_submission(settings, targets_and_pipelines, classifier, classifier_name) 369 | 370 | 371 | if __name__ == "__main__": 372 | main() 373 | -------------------------------------------------------------------------------- /seizure_prediction/tasks.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import sklearn.utils 3 | from sklearn.metrics import roc_auc_score 4 | from common import time 5 | from common.data import jsdict 6 | from seizure_prediction import hdf5 7 | from seizure_prediction.cross_validation.kfold_strategy import KFoldStrategy 8 | from seizure_prediction.data import to_np_array 9 | import gzip 10 | import os.path 11 | import hashlib 12 | from seizure_prediction.data import load_data_mp 13 | from seizure_prediction.pipeline import FeatureConcatPipeline 14 | 15 | 16 | # flatten data down to 2 dimensions for putting through a classifier 17 | # supports input shapes: 18 | # (num_segments, num_features) 19 | # (num_segments, num_windows, num_features) 20 | # (num_segments, num_windows, num_channels, num_features) 21 | def flatten(data): 22 | if data.ndim == 2: 23 | return data 24 | if not data.ndim >= 3: 25 | print 'data shape', data.shape 26 | assert data.ndim >= 3 27 | s = data.shape 28 | out = data.reshape((np.product(s[0:2]), np.product(s[2:]))) 29 | 30 | return out 31 | 32 | 33 | # Load data for a given pipeline. This wraps load_data_mp to also provide FeatureConcatPipeline support. 34 | # See load_data_mp for description of check_only and meta_only parameters. 35 | def load_pipeline_data(settings, target, data_type, pipeline, check_only, quiet=False, meta_only=False): 36 | if check_only: 37 | return np.alltrue([load_data_mp(settings, target, data_type, p, check_only=True, quiet=quiet) 38 | for p in pipeline.get_pipelines()]) 39 | 40 | if isinstance(pipeline, FeatureConcatPipeline): 41 | data = [] 42 | meta = None 43 | num_features = 0 44 | 45 | for p in pipeline.get_pipelines(): 46 | _data, _meta = load_data_mp(settings, target, data_type, p, quiet=quiet, meta_only=meta_only) 47 | data.append(_data) 48 | if meta is None: 49 | meta = _meta 50 | for k in meta.keys(): 51 | if k == 'X_shape': 52 | assert meta[k][:-1] == _meta[k][:-1] 53 | num_features += _meta[k][-1] 54 | elif isinstance(_meta[k], np.ndarray): 55 | assert np.alltrue(meta[k] == _meta[k]) 56 | else: 57 | assert meta[k] == _meta[k] 58 | 59 | d0 = data[0] 60 | if meta_only: 61 | data = None 62 | # combine shapes 63 | meta['X_shape'] = list(meta['X_shape'][:-1]) + [num_features] 64 | else: 65 | for d in data[1:]: 66 | if d0.ndim != d.ndim: 67 | print pipeline.get_name() 68 | print 'd0', d0.shape, 'other', d.shape 69 | assert d0.ndim == d.ndim 70 | assert d0.shape[:-1] == d.shape[:-1] 71 | data = np.concatenate(data, axis=data[0].ndim-1) 72 | else: 73 | data, meta = load_data_mp(settings, target, data_type, pipeline, quiet=quiet, meta_only=meta_only) 74 | 75 | return data, meta 76 | 77 | 78 | # Load training data, this loads the preictal and interictal pipeline data, optionally separates the 79 | # data into training set and cross-validation set, and generates labels. 
80 | # 81 | # strategy: cross-validation strategy, see LegacyStrategy() and KFoldStrategy() 82 | # cv_fold_number: None to specify no cross-validation set for when making a submission, 83 | # otherwise a number generated by the cross-validation strategy. 84 | def load_training_data(settings, target, pipeline, check_only, strategy=None, cv_fold_number=None, quiet=False): 85 | cv = cv_fold_number is not None 86 | if check_only: 87 | return load_pipeline_data(settings, target, 'preictal', pipeline, check_only=True, quiet=quiet) or \ 88 | load_pipeline_data(settings, target, 'interictal', pipeline, check_only=True, quiet=quiet) 89 | 90 | preictal, preictal_meta = load_pipeline_data(settings, target, 'preictal', pipeline, check_only=False, quiet=quiet) 91 | interictal, interictal_meta = load_pipeline_data(settings, target, 'interictal', pipeline, check_only=False, quiet=quiet) 92 | 93 | total_segments = preictal_meta.num_segments + interictal_meta.num_segments 94 | # print 'total_segments', total_segments 95 | 96 | if not quiet: print 'Preparing data ...', 97 | start = time.get_seconds() 98 | 99 | def make_fold(preictal_X_train, preictal_X_cv, interictal_X_train, interictal_X_cv): 100 | num_train_segments = preictal_X_train.shape[0] + interictal_X_train.shape[0] 101 | num_cv_segments = preictal_X_cv.shape[0] + interictal_X_cv.shape[0] 102 | assert (num_train_segments + num_cv_segments) == total_segments 103 | 104 | flattened_preictal_X_train = flatten(preictal_X_train) 105 | flattened_interictal_X_train = flatten(interictal_X_train) 106 | flattened_preictal_X_cv = flatten(preictal_X_cv) if cv else np.empty((0,)) 107 | flattened_interictal_X_cv = flatten(interictal_X_cv) if cv else np.empty((0,)) 108 | 109 | X_train = np.concatenate((flattened_preictal_X_train, flattened_interictal_X_train), axis=0) 110 | X_cv = np.concatenate((flattened_preictal_X_cv, flattened_interictal_X_cv), axis=0) 111 | 112 | preictal_y_train = np.ones((flattened_preictal_X_train.shape[0],)) 113 | preictal_y_cv = np.ones((preictal_X_cv.shape[0],)) 114 | interictal_y_train = np.zeros((flattened_interictal_X_train.shape[0],)) 115 | interictal_y_cv = np.zeros((interictal_X_cv.shape[0],)) 116 | 117 | y_train = np.concatenate((preictal_y_train, interictal_y_train), axis=0) 118 | y_cv = np.concatenate((preictal_y_cv, interictal_y_cv), axis=0) 119 | 120 | X_train, y_train = sklearn.utils.shuffle(X_train, y_train, random_state=0) 121 | 122 | return jsdict({ 123 | 'X_train': X_train, 124 | 'y_train': y_train, 125 | 'X_cv': X_cv, 126 | 'y_cv': y_cv, 127 | 'num_train_segments': num_train_segments, 128 | 'num_cv_segments': num_cv_segments 129 | }) 130 | 131 | if cv: 132 | preictal_X_train, preictal_X_cv = strategy.split_train_cv(preictal, preictal_meta, cv_fold_number) 133 | interictal_X_train, interictal_X_cv = strategy.split_train_cv(interictal, interictal_meta, cv_fold_number, interictal=True) 134 | data = make_fold(preictal_X_train, preictal_X_cv, interictal_X_train, interictal_X_cv) 135 | else: 136 | preictal_X_train = preictal 137 | preictal_X_cv = np.empty((0,)) 138 | interictal_X_train = interictal 139 | interictal_X_cv = np.empty((0,)) 140 | data = make_fold(preictal_X_train, preictal_X_cv, interictal_X_train, interictal_X_cv) 141 | 142 | if not quiet: print '%ds' % (time.get_seconds() - start) 143 | 144 | if not quiet: print 'X_train', data.X_train.shape, 'y_train', data.y_train.shape, 'X_cv', data.X_cv.shape, 'y_cv', data.y_cv.shape 145 | 146 | return data 147 | 148 | 149 | # Load the test data for a given pipeline 150 | 
def load_test_data(settings, target, pipeline, quiet=False): 151 | test, meta = load_pipeline_data(settings, target, 'test', pipeline, check_only=False, quiet=quiet) 152 | X_test = flatten(test) 153 | if not quiet: print 'X_test', test.shape, 'num_segments', meta.num_segments 154 | return jsdict({ 155 | 'X_test': X_test, 156 | 'num_segments': meta.num_segments 157 | }) 158 | 159 | 160 | # Train a classifier 161 | def train(classifier, training_data, quiet=False): 162 | X_train = training_data.X_train 163 | y_train = training_data.y_train 164 | if not quiet: print 'Training ...', 165 | start = time.get_seconds() 166 | classifier.fit(X_train, y_train) 167 | if not quiet: print '%ds' % (time.get_seconds() - start) 168 | 169 | 170 | # Make predictions, and then combine the N predictions if using windows using mean and median. 171 | # Returns (mean_predictions, median_predictions, raw_predictions) 172 | def make_predictions(classifier, X, num_segments): 173 | predictions = classifier.predict_proba(X)[:, 1] 174 | split_data = np.split(predictions, num_segments) 175 | return to_np_array([np.mean(ps) for ps in split_data]), to_np_array([np.median(ps) for ps in split_data]), predictions 176 | 177 | 178 | # Save the output of function fn to os.path.join(*paths) if it doesn't exist on disk, 179 | # otherwise load the data from disk. Note that this changes the current working directory 180 | # in order to deal with too-big filenames generated by a large number of concatenated 181 | # features in FeatureConcatPipeline. 182 | def memoize(fn, paths): 183 | cwd = os.getcwd() 184 | 185 | def change_to_target_dir(): 186 | for dir in paths[:-1]: 187 | try: 188 | os.mkdir(dir) 189 | except OSError, e: 190 | pass 191 | os.chdir(dir) 192 | 193 | change_to_target_dir() 194 | filename = paths[-1] 195 | if os.path.exists(filename): 196 | data = hdf5.read(filename) 197 | os.chdir(cwd) 198 | return data 199 | 200 | os.chdir(cwd) 201 | data = fn() 202 | change_to_target_dir() 203 | tmp = '%s.pid.%d.tmp' % (filename, os.getpid()) 204 | hdf5.write(tmp, data) 205 | os.rename(tmp, filename) 206 | os.chdir(cwd) 207 | 208 | return jsdict(data) 209 | 210 | 211 | # Fast process-if-not-yet-processed method for training data 212 | def check_training_data_loaded(settings, target, pipeline, quiet=False): 213 | if not load_pipeline_data(settings, target, 'preictal', pipeline, check_only=True, quiet=quiet): 214 | load_pipeline_data(settings, target, 'preictal', pipeline, check_only=False, quiet=quiet) 215 | if not load_pipeline_data(settings, target, 'interictal', pipeline, check_only=True, quiet=quiet): 216 | load_pipeline_data(settings, target, 'interictal', pipeline, check_only=False, quiet=quiet) 217 | 218 | 219 | # Fast process-if-not-yet-processed method for test data 220 | def check_test_data_loaded(settings, target, pipeline, quiet=False): 221 | if not load_pipeline_data(settings, target, 'test', pipeline, check_only=True, quiet=quiet): 222 | load_pipeline_data(settings, target, 'test', pipeline, check_only=False, quiet=quiet) 223 | 224 | 225 | # Represent a feature_mask e.g. [1,0,0,1,1,1,0] as binary. 226 | def calc_feature_mask_bigint(mask): 227 | mask = [int(x) for x in mask] 228 | out = 0 229 | for i, x in enumerate(mask): 230 | out += x << i 231 | return out 232 | 233 | 234 | # Represent a feature_mask e.g. [1,0,0,1,1,1,0] as string by concatenating 235 | # md5 and sha1. Should be unique enough. Used to provide a short-name for 236 | # saving data to disk. Otherwise the filename would be way too long. 
237 | def calc_feature_mask_string(mask): 238 | if mask is None: 239 | return None 240 | 241 | out = calc_feature_mask_bigint(mask) 242 | 243 | hex_str = hex(out) 244 | md5 = hashlib.md5(hex_str).hexdigest() 245 | sha1 = hashlib.sha1(hex_str).hexdigest() 246 | return md5 + sha1 247 | 248 | 249 | # Calculate cross-validation score for a single cv fold. 250 | def cross_val_score_for_one_fold(settings, target, pipeline, classifier, classifier_name, fold, strategy, feature_mask=None, progress_str=None, quiet=False): 251 | def process(): 252 | 253 | data = load_training_data(settings, target, pipeline, strategy=strategy, cv_fold_number=fold, check_only=False, quiet=quiet) 254 | 255 | if feature_mask is not None: 256 | s = [slice(None),] * data.X_train.ndim 257 | s[-1] = np.where(np.array(feature_mask) == True)[0] 258 | data['X_train'] = data.X_train[s] 259 | data['X_cv'] = data.X_cv[s] 260 | if not quiet: print ' feature mask', 'X_train', data.X_train.shape, 'y_train', data.y_train.shape, 'X_cv', data.X_cv.shape, 'y_cv', data.y_cv.shape 261 | 262 | train(classifier, data, quiet=quiet) 263 | if not quiet: print "Making predictions...", 264 | timer = time.Timer() 265 | mean_predictions, median_predictions, raw_predictions = make_predictions(classifier, data.X_cv, data.num_cv_segments) 266 | if not quiet: print timer.pretty_str() 267 | 268 | mean_score = roc_auc_score(data.y_cv, mean_predictions) 269 | median_score = roc_auc_score(data.y_cv, median_predictions) 270 | 271 | return jsdict({ 272 | 'mean_score': mean_score, 273 | 'median_score': median_score, 274 | 'mean_predictions': mean_predictions, 275 | 'median_predictions': median_predictions, 276 | 'y_cv': data.y_cv 277 | }) 278 | 279 | feature_mask_string = calc_feature_mask_string(feature_mask) 280 | fm_path = [feature_mask_string] if feature_mask_string is not None else [] 281 | paths = [settings.cache_dir, target, classifier_name] + pipeline.get_names() + fm_path + ['cv_%s_fold%d.hdf5' % (strategy.get_name(), fold)] 282 | 283 | if progress_str is not None: 284 | print 'Running', progress_str, 'fold %d' % fold 285 | return memoize(process, paths) 286 | 287 | 288 | # Calculate the average cross-validation score across N folds. 289 | # 290 | # pool: Optional multi-processing pool to use to schedule the folds, otherwise folds 291 | # will be processed one-by-one 292 | # strategy: cross-validation strategy, see LegacyStrategy() and KFoldStrategy() 293 | # feature_mask: The feature_mask to apply before training. 
294 | # progress_str: helper string for printing progress inside multiprocessing pool 295 | # return_data: returns full result if True, otherwise simply processes the folds without doing an consolidation work 296 | def cross_validation_score(settings, target, pipeline, classifier, classifier_name, strategy=None, pool=None, progress_str=None, feature_mask=None, return_data=True, quiet=False): 297 | if strategy is None: 298 | strategy = KFoldStrategy() 299 | 300 | if feature_mask is not None and np.count_nonzero(feature_mask) == len(feature_mask): 301 | feature_mask = None 302 | 303 | _, preictal_meta = load_pipeline_data(settings, target, 'preictal', pipeline, check_only=False, quiet=quiet, meta_only=True) 304 | cv_folds = strategy.get_folds(preictal_meta) 305 | 306 | if pool is not None: 307 | results = [pool.apply_async(cross_val_score_for_one_fold, [settings, target, pipeline, classifier, classifier_name, fold], 308 | {'strategy': strategy, 'feature_mask': feature_mask, 'progress_str': progress_str, 'quiet': quiet}) 309 | for fold in cv_folds] 310 | if return_data: 311 | out = [r.get() for r in results] 312 | else: 313 | out = [cross_val_score_for_one_fold(settings, target, pipeline, classifier, classifier_name, strategy=strategy, 314 | fold=fold, feature_mask=feature_mask, progress_str=progress_str, quiet=quiet) for fold in cv_folds] 315 | 316 | if return_data: 317 | mean_scores = [d.mean_score for d in out] 318 | median_scores = [d.median_score for d in out] 319 | mean_predictions = [d.mean_predictions for d in out] 320 | median_predictions = [d.median_predictions for d in out] 321 | y_cvs = [d.y_cv for d in out] 322 | 323 | return jsdict({ 324 | 'mean_score': np.mean(mean_scores), 325 | 'median_score': np.mean(median_scores), 326 | 'mean_scores': np.array(mean_scores), 327 | 'median_scores': np.array(median_scores), 328 | 'mean_predictions': mean_predictions, 329 | 'median_predictions': median_predictions, 330 | 'y_cvs': y_cvs 331 | }) 332 | 333 | 334 | # Make submission predictions for a given pipeline and classifier. 335 | # 336 | # feature_mask: The feature_mask to apply before training. 
337 | # progress_str: helper string for printing progress inside multiprocessing pool 338 | def make_submission_predictions(settings, target, pipeline, classifier, classifier_name, feature_mask=None, quiet=False, progress_str=None): 339 | if progress_str is not None: 340 | print 'Running', progress_str 341 | 342 | feature_mask_string = calc_feature_mask_string(feature_mask) 343 | 344 | def process(): 345 | data = load_training_data(settings, target, pipeline, check_only=False, quiet=quiet) 346 | 347 | if feature_mask is not None: 348 | s = [slice(None),] * data.X_train.ndim 349 | s[-1] = np.where(np.array(feature_mask) == True)[0] 350 | data['X_train'] = data.X_train[s] 351 | if not quiet: print 'Feature mask', 'X_train', data.X_train.shape 352 | 353 | train(classifier, data, quiet=quiet) 354 | train_predictions = classifier.predict_proba(data.X_train)[:, 1] 355 | y_train = data.y_train 356 | del data 357 | 358 | data = load_test_data(settings, target, pipeline, quiet=quiet) 359 | 360 | if feature_mask is not None: 361 | s = [slice(None),] * data.X_test.ndim 362 | s[-1] = np.where(np.array(feature_mask) == True)[0] 363 | data['X_test'] = data.X_test[s] 364 | if not quiet: print 'Feature mask', 'X_test', data.X_test.shape 365 | 366 | predictions = make_predictions(classifier, data.X_test, data.num_segments) 367 | predictions_mean, predictions_median, test_predictions = predictions 368 | 369 | return { 370 | 'mean_predictions': predictions_mean, 371 | 'median_predictions': predictions_median, 372 | 'train_predictions': train_predictions, 373 | 'y_train': y_train, 374 | 'test_predictions': test_predictions, 375 | 'num_segments': data.num_segments 376 | } 377 | 378 | fm_path = [feature_mask_string] if feature_mask_string is not None else [] 379 | paths = [settings.cache_dir, target, classifier_name] + pipeline.get_names() + fm_path + ['predictions.hdf5'] 380 | return memoize(process, paths) 381 | 382 | 383 | # Convert predictions into csv submission format 384 | def make_csv_for_target_predictions(target, predictions): 385 | return ['%s_test_segment_%.4d.mat,%.10f' % (target, i+1, p) for i, p in enumerate(predictions)] 386 | 387 | 388 | # Wrapper to return both mean and median-combined predictions in csv submission format 389 | def make_submission_csv(settings, target, pipeline, classifier, classifier_name): 390 | data = make_submission_predictions(settings, target, pipeline, classifier, classifier_name, quiet=True) 391 | 392 | csv_mean = make_csv_for_target_predictions(target, data.mean_predictions) 393 | csv_median = make_csv_for_target_predictions(target, data.median_predictions) 394 | 395 | return csv_mean, csv_median 396 | 397 | 398 | # Write a submission file given the guesses either as a list of strings or already as the final string. 399 | # Filename is generated as submission%d.csv.gz with companion submission%d.txt where the number is 400 | # auto-increment given existing files in the submission directory. The companion txt file provides info 401 | # about what was used to generate that submission. 
402 | def write_submission_file(settings, guesses, name, pipeline, classifier_name, targets_and_pipelines=None, target_pipelines=None): 403 | guesses = '\n'.join(guesses) if isinstance(guesses, list) else guesses 404 | id = 0 405 | done = False 406 | while not done: 407 | try: 408 | filename = os.path.join(settings.submission_dir, 'submission%d.csv.gz' % id) 409 | # make the file to 'take it' 410 | fd = os.open(filename, os.O_CREAT | os.O_EXCL | os.O_WRONLY, 0644) 411 | os.close(fd) 412 | 413 | f = gzip.open(filename, 'wb') 414 | f.write(guesses) 415 | f.close() 416 | 417 | print 'wrote', filename 418 | 419 | filename = os.path.join(settings.submission_dir, 'submission%d.txt' % id) 420 | with open(filename, 'w') as f: 421 | print >>f, classifier_name 422 | print >>f, name 423 | if target_pipelines is not None: 424 | for target in sorted(target_pipelines.keys()): 425 | pipeline = target_pipelines[target] 426 | print >>f, target 427 | print >>f, pipeline.get_name() 428 | if targets_and_pipelines is not None: 429 | for target, pipeline, feature_masks, _, _ in targets_and_pipelines: 430 | print >>f, target 431 | print >>f, 'FEATURE MASKS' 432 | print >>f, '\n'.join(pipeline.get_names()) 433 | for i, mask in enumerate(feature_masks): 434 | print >>f, 'Mask %d' % i 435 | print >>f, mask 436 | else: 437 | for p_name in pipeline.get_names(): 438 | print >>f, p_name 439 | print 'wrote', filename 440 | 441 | done = True 442 | 443 | except OSError, e: 444 | id += 1 445 | -------------------------------------------------------------------------------- /seizure_prediction/transforms.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy.signal import hilbert 3 | from sklearn import preprocessing 4 | import scipy.stats 5 | import pandas as pd 6 | 7 | from data import to_np_array 8 | 9 | 10 | # optional modules for trying out different transforms 11 | try: 12 | import pywt 13 | except ImportError, e: 14 | pass 15 | 16 | try: 17 | from scikits.talkbox.features import mfcc 18 | except ImportError, e: 19 | pass 20 | 21 | # for auto regressive model 22 | try: 23 | import spectrum 24 | except ImportError, e: 25 | pass 26 | 27 | 28 | 29 | # NOTE(mike): Some transforms operate on the raw data in the shape (NUM_CHANNELS, NUM_FEATURES). 30 | # Others operate on windowed data in the shape (NUM_WINDOWS, NUM_CHANNELS, NUM_FEATURES). 31 | # I've been a bit lazy and just made the ApplyManyTransform base class helper... so if you intend 32 | # a transform to work on pre-windowed data, just write a plain transform with apply method, if 33 | # you intend to work on windowed-data, derive from ApplyManyTransform and implement apply_one method. 34 | # Really this is just a problem of number of axes, and np.apply_along_axis could probably be used to 35 | # clean up this mess. :) I haven't bothered updating it as things are working as they are. 36 | 37 | class ApplyManyTransform(object): 38 | def apply(self, datas, meta): 39 | if datas.ndim >= 3: 40 | out = [] 41 | for d in datas: 42 | out.append(self.apply_one(d, meta)) 43 | 44 | return to_np_array(out) 45 | else: 46 | return self.apply_one(datas, meta) 47 | 48 | 49 | class FFT: 50 | """ 51 | Apply Fast Fourier Transform to the last axis. 52 | """ 53 | def get_name(self): 54 | return "fft" 55 | 56 | def apply(self, data, meta=None): 57 | axis = data.ndim - 1 58 | return np.fft.rfft(data, axis=axis) 59 | 60 | 61 | class Slice: 62 | """ 63 | Take a slice of the data on the last axis. 64 | e.g. 
Slice(1, 48) works like a normal python slice, that is 1-47 will be taken 65 | """ 66 | def __init__(self, start, end=None): 67 | self.start = start 68 | self.end = end 69 | 70 | def get_name(self): 71 | return "slice%d%s" % (self.start, '-%d' % self.end if self.end is not None else '') 72 | 73 | def apply(self, data, meta=None): 74 | s = [slice(None),] * data.ndim 75 | s[-1] = slice(self.start, self.end) 76 | return data[s] 77 | 78 | 79 | class MFCC: 80 | """ 81 | Mel-frequency cepstrum coefficients 82 | """ 83 | def get_name(self): 84 | return "mfcc" 85 | 86 | def apply(self, data, meta=None): 87 | all_ceps = [] 88 | for ch in data: 89 | ceps, mspec, spec = mfcc(ch) 90 | all_ceps.append(ceps.ravel()) 91 | 92 | return to_np_array(all_ceps) 93 | 94 | 95 | class Magnitude: 96 | """ 97 | Take magnitudes of Complex data 98 | """ 99 | def get_name(self): 100 | return "mag" 101 | 102 | def apply(self, data, meta=None): 103 | return np.abs(data) 104 | 105 | 106 | class Log: 107 | """ 108 | Apply LogE 109 | """ 110 | def get_name(self): 111 | return "log" 112 | 113 | def apply(self, data, meta=None): 114 | indices = np.where(data <= 0) 115 | data[indices] = np.max(data) 116 | data[indices] = (np.min(data) * 0.1) 117 | return np.log(data) 118 | 119 | 120 | class Log2: 121 | """ 122 | Apply Log2 123 | """ 124 | def get_name(self): 125 | return "log2" 126 | 127 | def apply(self, data, meta=None): 128 | indices = np.where(data <= 0) 129 | data[indices] = np.max(data) 130 | data[indices] = (np.min(data) * 0.1) 131 | return np.log2(data) 132 | 133 | 134 | class Log10: 135 | """ 136 | Apply Log10 137 | """ 138 | def get_name(self): 139 | return "log10" 140 | 141 | def apply(self, data, meta=None): 142 | indices = np.where(data <= 0) 143 | data[indices] = np.max(data) 144 | data[indices] = (np.min(data) * 0.1) 145 | return np.log10(data) 146 | 147 | 148 | class Stats(ApplyManyTransform): 149 | """ 150 | Subtract the mean, then take (min, max, standard_deviation) for each channel. 151 | """ 152 | def get_name(self): 153 | return "stats" 154 | 155 | def apply_one(self, data, meta=None): 156 | # data[ch][dim] 157 | shape = data.shape 158 | out = np.empty((shape[0], 3)) 159 | for i in range(len(data)): 160 | ch_data = data[i] 161 | ch_data -= np.mean(ch_data) 162 | outi = out[i] 163 | outi[0] = np.std(ch_data) 164 | outi[1] = np.min(ch_data) 165 | outi[2] = np.max(ch_data) 166 | 167 | return out 168 | 169 | 170 | class MomentPerChannel(ApplyManyTransform): 171 | """ 172 | Calculate the Nth moment per channel. 173 | """ 174 | def __init__(self, n): 175 | self.n = n 176 | 177 | def get_name(self): 178 | return "moment%d" % self.n 179 | 180 | def apply_one(self, data, meta=None): 181 | return scipy.stats.moment(data, moment=self.n, axis=data.ndim-1) 182 | 183 | 184 | class UnitScale: 185 | """ 186 | Scale across the last axis. 187 | """ 188 | def get_name(self): 189 | return 'unit-scale' 190 | 191 | def apply(self, data, meta=None): 192 | return preprocessing.scale(data, axis=data.ndim-1) 193 | 194 | 195 | class UnitScaleFeat: 196 | """ 197 | Scale across the first axis, i.e. scale each feature. 198 | """ 199 | def get_name(self): 200 | return 'unit-scale-feat' 201 | 202 | def apply(self, data, meta=None): 203 | return preprocessing.scale(data.astype(np.float64), axis=0) 204 | 205 | 206 | class CorrelationMatrix(ApplyManyTransform): 207 | """ 208 | Calculate correlation coefficients matrix across all EEG channels. 
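    The output is a symmetric (num_channels, num_channels) matrix of Pearson correlation
    coefficients computed with np.corrcoef.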
209 | """ 210 | def get_name(self): 211 | return 'corr-mat' 212 | 213 | def apply_one(self, data, meta=None): 214 | return np.corrcoef(data) 215 | 216 | 217 | class Eigenvalues(ApplyManyTransform): 218 | """ 219 | Take eigenvalues of a matrix, and sort them by magnitude in order to 220 | make them useful as features (as they have no inherent order). 221 | """ 222 | def get_name(self): 223 | return 'eigen' 224 | 225 | def apply_one(self, data, meta=None): 226 | w, v = np.linalg.eig(data) 227 | w = np.absolute(w) 228 | w.sort() 229 | return w 230 | 231 | 232 | # Take the upper right triangle of a matrix 233 | def upper_right_triangle(matrix): 234 | accum = [] 235 | for i in range(matrix.shape[0]): 236 | for j in range(i+1, matrix.shape[1]): 237 | accum.append(matrix[i, j]) 238 | 239 | return to_np_array(accum) 240 | 241 | 242 | class UpperRightTriangle(ApplyManyTransform): 243 | """ 244 | Take the upper right triangle of a matrix. 245 | """ 246 | def get_name(self): 247 | return 'urt' 248 | 249 | def apply_one(self, data, meta=None): 250 | assert data.ndim == 2 and data.shape[0] == data.shape[1] 251 | return upper_right_triangle(data) 252 | 253 | 254 | class FreqCorrelation(ApplyManyTransform): 255 | """ 256 | Correlation in the frequency domain. First take FFT with (start, end) slice options, 257 | then calculate correlation co-efficients on the FFT output, followed by calculating 258 | eigenvalues on the correlation co-efficients matrix. 259 | 260 | The output features are (fft, upper_right_diagonal(correlation_coefficients), eigenvalues) 261 | 262 | Features can be selected/omitted using the constructor arguments. 263 | """ 264 | def __init__(self, start_hz, end_hz, option, use_phase=False, with_fft=False, with_corr=True, with_eigen=True): 265 | self.start_hz = start_hz 266 | self.end_hz = end_hz 267 | self.option = option 268 | self.with_fft = with_fft 269 | self.with_corr = with_corr 270 | self.with_eigen = with_eigen 271 | self.use_phase = use_phase 272 | assert option in ('us', 'usf', 'none', 'fft_in') 273 | assert with_corr or with_eigen 274 | 275 | def get_name(self): 276 | selections = [] 277 | if self.option in ('us', 'usf', 'fft_in'): 278 | selections.append(self.option) 279 | if self.with_fft: 280 | selections.append('fft') 281 | if not self.with_corr: 282 | selections.append('nocorr') 283 | if not self.with_eigen: 284 | selections.append('noeig') 285 | if len(selections) > 0: 286 | selection_str = '-' + '-'.join(selections) 287 | else: 288 | selection_str = '' 289 | return 'freq-corr%s-%s-%s%s' % ('-phase' if self.use_phase else '', self.start_hz, self.end_hz, selection_str) 290 | 291 | def apply_one(self, data, meta=None): 292 | num_time_samples = data.shape[-1] if self.option != 'fft_in' else (data.shape[-1] - 1) * 2 # revert FFT shape change 293 | if self.start_hz == 1 and self.end_hz is None: 294 | freq_slice = Slice(self.start_hz, self.end_hz) 295 | else: 296 | # FFT range is from 0Hz to 101Hz 297 | def calc_index(f): 298 | return int((f / (meta.sampling_frequency/2.0)) * num_time_samples) if f is not None else num_time_samples 299 | freq_slice = Slice(calc_index(self.start_hz), calc_index(self.end_hz)) 300 | # print data.shape, freq_slice.start, freq_slice.end 301 | # import sys 302 | # sys.exit(0) 303 | 304 | data1 = data 305 | if self.option != 'fft_in': 306 | data1 = FFT().apply(data1) 307 | data1 = freq_slice.apply(data1) 308 | if self.use_phase: 309 | data1 = np.angle(data1) 310 | else: 311 | data1 = Magnitude().apply(data1) 312 | data1 = Log10().apply(data1) 313 | 
314 | data2 = data1 315 | if self.option == 'usf': 316 | data2 = UnitScaleFeat().apply(data2) 317 | elif self.option == 'us': 318 | data2 = UnitScale().apply(data2) 319 | 320 | data2 = CorrelationMatrix().apply_one(data2) 321 | 322 | if self.with_eigen: 323 | w = Eigenvalues().apply_one(data2) 324 | 325 | out = [] 326 | if self.with_corr: 327 | data2 = upper_right_triangle(data2) 328 | out.append(data2) 329 | if self.with_eigen: 330 | out.append(w) 331 | if self.with_fft: 332 | data1 = data1.ravel() 333 | out.append(data1) 334 | for d in out: 335 | assert d.ndim == 1 336 | 337 | return np.concatenate(out, axis=0) 338 | 339 | 340 | class Correlation(ApplyManyTransform): 341 | """ 342 | Correlation in the time domain. Calculate correlation co-efficients 343 | followed by calculating eigenvalues on the correlation co-efficients matrix. 344 | 345 | The output features are (upper_right_diagonal(correlation_coefficients), eigenvalues) 346 | 347 | Features can be selected/omitted using the constructor arguments. 348 | """ 349 | def __init__(self, scale_option, with_corr=True, with_eigen=True): 350 | self.scale_option = scale_option 351 | self.with_corr = with_corr 352 | self.with_eigen = with_eigen 353 | assert scale_option in ('us', 'usf', 'none') 354 | assert with_corr or with_eigen 355 | 356 | def get_name(self): 357 | selections = [] 358 | if self.scale_option != 'none': 359 | selections.append(self.scale_option) 360 | if not self.with_corr: 361 | selections.append('nocorr') 362 | if not self.with_eigen: 363 | selections.append('noeig') 364 | if len(selections) > 0: 365 | selection_str = '-' + '-'.join(selections) 366 | else: 367 | selection_str = '' 368 | return 'corr%s' % (selection_str) 369 | 370 | def apply_one(self, data, meta=None): 371 | data1 = data 372 | if self.scale_option == 'usf': 373 | data1 = UnitScaleFeat().apply(data1) 374 | elif self.scale_option == 'us': 375 | data1 = UnitScale().apply(data1) 376 | 377 | data1 = CorrelationMatrix().apply_one(data1) 378 | 379 | # patch nans 380 | data1[np.where(np.isnan(data1))] = -2 381 | 382 | if self.with_eigen: 383 | w = Eigenvalues().apply_one(data1) 384 | 385 | out = [] 386 | if self.with_corr: 387 | data1 = upper_right_triangle(data1) 388 | out.append(data1) 389 | if self.with_eigen: 390 | out.append(w) 391 | 392 | for d in out: 393 | assert d.ndim == 1 394 | 395 | return np.concatenate(out, axis=0) 396 | 397 | 398 | class FlattenChannels(object): 399 | """ 400 | Reshapes the data from (..., N_CHANNELS, N_FEATURES) to (..., N_CHANNELS * N_FEATURES) 401 | """ 402 | def get_name(self): 403 | return 'fch' 404 | 405 | def apply(self, data, meta=None): 406 | if data.ndim == 2: 407 | return data.ravel() 408 | elif data.ndim == 3: 409 | s = data.shape 410 | return data.reshape((s[0], np.product(s[1:]))) 411 | else: 412 | raise NotImplementedError() 413 | 414 | 415 | class Windower: 416 | """ 417 | Breaks the time-series data into N second segments, for example 60s windows 418 | will create 10 windows given a 600s segment. The output is the reshaped data 419 | e.g. 
(600, 120000) -> (600, 10, 12000) 420 | """ 421 | def __init__(self, window_secs=None): 422 | self.window_secs = window_secs 423 | self.name = 'w-%ds' % window_secs if window_secs is not None else 'w-whole' 424 | 425 | def get_name(self): 426 | return self.name 427 | 428 | def apply(self, X, meta=None): 429 | if self.window_secs is None: 430 | return X.reshape([1] + list(X.shape)) 431 | 432 | num_windows = meta.data_length_sec / self.window_secs 433 | samples_per_window = self.window_secs * int(meta.sampling_frequency) 434 | samples_used = num_windows * samples_per_window 435 | samples_dropped = X.shape[-1] - samples_used 436 | X = Slice(samples_dropped).apply(X) 437 | out = np.split(X, num_windows, axis=X.ndim-1) 438 | out = to_np_array(out) 439 | return out 440 | 441 | class PreictalWindowGenerator: 442 | """ 443 | Experimental windower that generates overlapping windows for preictal segments only. 444 | The window_secs parameter describes how long each window is, and gen_factor describes 445 | how many extra windows you want as a multiplier. 446 | 447 | For example given a 600s segment, a window size of 60s will give you 10 windows, 448 | this number is then multiplied by gen_factor, e.g. 20 windows if gen_factor is 2. 449 | The window size is fixed and the starting point for each window will be evenly-spaced. 450 | 451 | It's been a while since I've used this, not even sure if it works properly... 452 | """ 453 | def __init__(self, window_secs, gen_factor): 454 | self.window_secs = window_secs 455 | self.gen_factor = gen_factor 456 | self.name = 'wg-%ds-%d' % (window_secs, gen_factor) 457 | self.windower = Windower(window_secs) 458 | 459 | def get_name(self): 460 | return self.name 461 | 462 | def apply(self, X, meta): 463 | if meta.data_type == 'preictal': 464 | num_windows = (meta.data_length_sec / self.window_secs) * self.gen_factor 465 | samples_per_window = self.window_secs * int(meta.sampling_frequency) / self.gen_factor 466 | samples_used = num_windows * samples_per_window 467 | samples_dropped = X.shape[-1] - samples_used 468 | X = Slice(samples_dropped).apply(X) 469 | pieces = np.split(X, num_windows, axis=X.ndim-1) 470 | pieces_per_window = self.gen_factor 471 | gen = [np.concatenate(pieces[i:i+pieces_per_window], axis=pieces[0].ndim - 1) for i in range(0, num_windows - self.gen_factor + 1)] 472 | gen = to_np_array(gen) 473 | return gen 474 | else: 475 | return self.windower.apply(X, meta) 476 | 477 | 478 | class Hurst: 479 | """ 480 | Hurst exponent per-channel, see http://en.wikipedia.org/wiki/Hurst_exponent 481 | 482 | Another description can be found here: http://www.ijetch.org/papers/698-W10024.pdf 483 | Kavya Devarajan, S. Bagyaraj, Vinitha Balasampath, Jyostna. E. and Jayasri. K., 484 | "EEG-Based Epilepsy Detection and Prediction," International Journal of Engineering 485 | and Technology vol. 6, no. 3, pp. 212-216, 2014. 
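    The exponent is estimated as the least-squares slope of log(R/S) against log(n), where R is
    the running range of the cumulative mean-adjusted series and S is its expanding standard
    deviation.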
486 | 487 | """ 488 | def get_name(self): 489 | return 'hurst' 490 | 491 | def apply(self, X, meta): 492 | def apply_one(x): 493 | x -= x.mean() 494 | z = np.cumsum(x) 495 | r = (np.maximum.accumulate(z) - np.minimum.accumulate(z))[1:] 496 | s = pd.expanding_std(x)[1:] 497 | 498 | # prevent division by 0 499 | s[np.where(s == 0)] = 1e-12 500 | r += 1e-12 501 | 502 | y_axis = np.log(r / s) 503 | x_axis = np.log(np.arange(1, len(y_axis) + 1)) 504 | x_axis = np.vstack([x_axis, np.ones(len(x_axis))]).T 505 | 506 | m, b = np.linalg.lstsq(x_axis, y_axis)[0] 507 | return m 508 | 509 | return np.apply_along_axis(apply_one, -1, X) 510 | 511 | 512 | class PFD(ApplyManyTransform): 513 | """ 514 | Petrosian fractal dimension per-channel 515 | 516 | Implementation derived from reading: 517 | http://arxiv.org/pdf/0804.3361.pdf 518 | F.S. Bao, D.Y.Lie,Y.Zhang,"A new approach to automated epileptic diagnosis using EEG 519 | and probabilistic neural network",ICTAI'08, pp. 482-486, 2008. 520 | """ 521 | def get_name(self): 522 | return 'pfd' 523 | 524 | def pfd_for_ch(self, ch): 525 | diff = np.diff(ch, n=1, axis=0) 526 | 527 | asign = np.sign(diff) 528 | sign_changes = ((np.roll(asign, 1) - asign) != 0).astype(int) 529 | N_delta = np.count_nonzero(sign_changes) 530 | 531 | n = len(ch) 532 | log10n = np.log10(n) 533 | return log10n / (log10n + np.log10(n / (n + 0.4 * N_delta))) 534 | 535 | def apply_one(self, X, meta=None): 536 | return to_np_array([self.pfd_for_ch(ch) for ch in X]) 537 | 538 | 539 | def hfd(X, kmax): 540 | N = len(X) 541 | Nm1 = float(N - 1) 542 | L = np.empty((kmax,)) 543 | L[0] = np.sum(abs(np.diff(X, n=1))) # shortcut :) 544 | for k in xrange(2, kmax + 1): 545 | Lmks = np.empty((k,)) 546 | for m in xrange(1, k + 1): 547 | i_end = (N - m) / k # int 548 | Lmk_sum = np.sum(abs(np.diff(X[np.arange(m - 1, m + (i_end + 1) * k - 1, k)], n=1))) 549 | Lmk = Lmk_sum * Nm1 / (i_end * k) 550 | Lmks[m-1] = Lmk 551 | 552 | L[k - 1] = np.mean(Lmks) 553 | 554 | a = np.empty((kmax, 2)) 555 | a[:, 0] = np.log(1.0 / np.arange(1.0, kmax + 1.0)) 556 | a[:, 1] = 1.0 557 | 558 | b = np.log(L) 559 | 560 | # find x by solving for ax = b 561 | x, residues, rank, s = np.linalg.lstsq(a, b) 562 | return x[0] 563 | 564 | 565 | class HFD(ApplyManyTransform): 566 | """ 567 | Higuchi fractal dimension per-channel 568 | 569 | Implementation derived from reading: 570 | http://arxiv.org/pdf/0804.3361.pdf 571 | F.S. Bao, D.Y.Lie,Y.Zhang,"A new approach to automated epileptic diagnosis using EEG 572 | and probabilistic neural network",ICTAI'08, pp. 482-486, 2008. 573 | """ 574 | def __init__(self, kmax): 575 | self.kmax = kmax 576 | 577 | def get_name(self): 578 | return 'hfd-%d' % self.kmax 579 | 580 | def apply_one(self, data, meta=None): 581 | return to_np_array([hfd(ch, self.kmax) for ch in data]) 582 | 583 | 584 | class Diff(ApplyManyTransform): 585 | """ 586 | Wrapper for np.diff 587 | """ 588 | def __init__(self, order): 589 | self.order = order 590 | 591 | def get_name(self): 592 | return 'diff-%d' % self.order 593 | 594 | def apply_one(self, data, meta=None): 595 | return np.diff(data, n=self.order, axis=data.ndim-1) 596 | 597 | 598 | class SpectralEntropy(ApplyManyTransform): 599 | """ 600 | Calculates Shannon entropy between the given frequency ranges. 601 | e.g. The probability density function of FFT magnitude is calculated, then 602 | given range [1,2,3], Shannon entropy is calculated between 1hz and 2hz, 2hz and 3hz 603 | in this case giving 2 values per channel. 
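    Each value is computed as -sum(p * log2(p)) over the in-band bins of the per-channel
    normalised power spectrum, divided by log2 of the number of in-band bins.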
604 | 605 | NOTE(mike): Input for this transform must be from (FFT(), Magnitude()) 606 | """ 607 | def __init__(self, freq_ranges, flatten=True): 608 | self.freq_ranges = freq_ranges 609 | self.flatten = flatten 610 | 611 | def get_name(self): 612 | return 'spec-ent-%s%s' % ('-'.join([str(f) for f in self.freq_ranges]), '-nf' if not self.flatten else '') 613 | 614 | def apply_one(self, fft_mag, meta): 615 | num_time_samples = (fft_mag.shape[-1] - 1) * 2 # revert FFT shape change 616 | 617 | X = fft_mag ** 2 618 | for ch in X: 619 | ch /= np.sum(ch + 1e-12) 620 | 621 | psd = X # pdf 622 | 623 | out = [] 624 | 625 | #[0,1,2] -> [[0,1], [1,2]] 626 | for start_freq, end_freq in zip(self.freq_ranges[:-1], self.freq_ranges[1:]): 627 | start_index = np.floor((start_freq / meta.sampling_frequency) * num_time_samples) 628 | end_index = np.floor((end_freq / meta.sampling_frequency) * num_time_samples) 629 | selected = psd[:, start_index:end_index] 630 | 631 | entropies = - np.sum(selected * np.log2(selected + 1e-12), axis=selected.ndim-1) / np.log2(end_index - start_index) 632 | if self.flatten: 633 | out.append(entropies.ravel()) 634 | else: 635 | out.append(entropies) 636 | 637 | if self.flatten: 638 | return np.concatenate(out) 639 | else: 640 | return to_np_array(out) 641 | 642 | 643 | class PIBSpectralEntropy(ApplyManyTransform): 644 | """ 645 | Similar to the calculations in SpectralEntropy transform, but instead power-in-band 646 | is calculated over the given freq_ranges, finally Shannon entropy is calculated on that. 647 | The output is a single entropy value per-channel. 648 | 649 | NOTE(mike): Input for this transform must be from (FFT(), Magnitude()) 650 | """ 651 | def __init__(self, freq_ranges): 652 | self.freq_ranges = freq_ranges 653 | 654 | def get_name(self): 655 | return 'pib-spec-ent-%s' % '-'.join([str(f) for f in self.freq_ranges]) 656 | 657 | def apply_one(self, data, meta=None): 658 | num_channels = data.shape[0] 659 | num_time_samples = float((data.shape[-1] - 1) * 2) # revert FFT shape change 660 | 661 | def norm(X): 662 | for ch in X: 663 | ch /= np.sum(ch + 1e-12) 664 | return X 665 | 666 | psd = data ** 2 667 | psd = norm(psd) 668 | 669 | # group into bins 670 | def binned_psd(psd, out): 671 | prev = freq_ranges[0] 672 | for i, cur in enumerate(freq_ranges[1:]): 673 | prev_index = np.floor((prev / meta.sampling_frequency) * num_time_samples) 674 | cur_index = np.floor((cur / meta.sampling_frequency) * num_time_samples) 675 | out[i] = np.sum(psd[prev_index:cur_index]) 676 | prev = cur 677 | 678 | freq_ranges = self.freq_ranges 679 | out = np.empty((num_channels, len(freq_ranges) - 1,)) 680 | for ch in range(num_channels): 681 | binned_psd(psd[ch], out[ch]) 682 | 683 | psd_per_bin = norm(out) 684 | 685 | def entropy_per_channel(psd): 686 | entropy_components = psd * np.log2(psd + 1e-12) 687 | entropy = -np.sum(entropy_components) / np.log2(psd.shape[-1]) 688 | return entropy 689 | 690 | out = np.empty((num_channels,)) 691 | for i, ch in enumerate(psd_per_bin): 692 | out[i] = entropy_per_channel(ch) 693 | 694 | return out 695 | 696 | 697 | class FreqBinning(ApplyManyTransform): 698 | """ 699 | Given spectral magnitude data, select a range of bins, and then choose a consolidation function 700 | to use to calculate each bin. The sum can be used, or the mean, or the standard deviation. 
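    For example, freq_ranges=[0.5, 4, 7, 12] with the default sum gives 3 values per channel,
    one per adjacent pair of band edges.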
701 | 702 | NOTE(mike): Input for this transform must be from (FFT(), Magnitude()) 703 | """ 704 | def __init__(self, freq_ranges, func=None): 705 | self.freq_ranges = freq_ranges 706 | assert func is None or func in ('sum', 'mean', 'std') 707 | self.func = func 708 | 709 | def get_name(self): 710 | return 'fbin%s%s' % ('' if self.func is None else '-' + self.func, '-' + '-'.join([str(f) for f in self.freq_ranges])) 711 | 712 | def apply_one(self, X, meta): 713 | num_channels = X.shape[0] 714 | num_time_samples = (X.shape[-1] - 1) * 2 # revert FFT shape change 715 | 716 | if self.func == 'mean': 717 | func = np.mean 718 | elif self.func == 'std': 719 | func = np.std 720 | else: 721 | func = np.sum 722 | 723 | # group into bins 724 | def binned_freq(data, out): 725 | prev = freq_ranges[0] 726 | for i, cur in enumerate(freq_ranges[1:]): 727 | prev_index = np.floor((prev / meta.sampling_frequency) * num_time_samples) 728 | cur_index = np.floor((cur / meta.sampling_frequency) * num_time_samples) 729 | out[i] = func(data[prev_index:cur_index]) 730 | prev = cur 731 | 732 | freq_ranges = self.freq_ranges 733 | out = np.empty((num_channels, len(freq_ranges) - 1,)) 734 | for ch in range(num_channels): 735 | binned_freq(X[ch], out[ch]) 736 | 737 | return out 738 | 739 | 740 | class AR(ApplyManyTransform): 741 | """ 742 | Auto-regressive model as suggested by: 743 | http://hdl.handle.net/1807/33224 744 | https://tspace.library.utoronto.ca/bitstream/1807/33224/1/Green_Adrian_CA_201211_MASc_thesis.pdf 745 | 746 | It is suggested to use a model order of 8. 747 | """ 748 | def __init__(self, order): 749 | self.order = order 750 | 751 | def get_name(self): 752 | return 'ar%d' % self.order 753 | 754 | def calc_for_ch(self, ch): 755 | ar_coeffs, dnr, reflection_coeffs = spectrum.aryule(ch, self.order) 756 | return np.abs(ar_coeffs) 757 | 758 | def apply_one(self, X, meta): 759 | return np.concatenate([self.calc_for_ch(ch) for ch in X], axis=0) 760 | 761 | 762 | class SubMean: 763 | """ 764 | For each feature, subtract from each channel the mean across all channels. 765 | This is to perform average reference montage. 766 | """ 767 | def get_name(self): 768 | return 'subm' 769 | 770 | def apply(self, X, meta): 771 | assert X.ndim == 2 772 | X -= X.mean(axis=0) 773 | return X 774 | 775 | 776 | def index_for_hz(X, hz, sampling_frequency): 777 | return int((float(hz) / sampling_frequency) * X.shape[-1]) 778 | 779 | 780 | class Preprocess: 781 | """ 782 | Data preprocessing stage to normalize the data across all patients. 783 | Data that has not had average reference montage applied needs it applied. 784 | """ 785 | def get_name(self): 786 | return 'pp' 787 | 788 | def apply(self, X, meta): 789 | # NOTE(mike): Patient 1 and 2 have not subtracted the average reference from their raw data 790 | # whereas Dogs 1 to 5 have. So bring these two patients into line to normalize the preprocessing 791 | # across ALL patients. 792 | if meta.target in ('Patient_1', 'Patient_2'): 793 | X = SubMean().apply(X, meta) 794 | return X 795 | 796 | 797 | class PhaseSynchrony(ApplyManyTransform): 798 | """ 799 | Calculate phase synchrony between channels using Hilbert transform and Shannon entropy. 
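    For each channel pair the instantaneous phase difference is histogrammed into num_bins bins
    and converted to the synchrony index rho = (Hmax - S) / Hmax, where S is the Shannon entropy
    of that histogram and Hmax = log(num_bins); rho is ~0 for uniformly spread phase differences
    (no synchrony) and ~1 for a sharply peaked distribution (phase locking).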
800 | 801 | Method described in: 802 | http://www.researchgate.net/publication/222567264_Comparison_of_Hilbert_transform_and_wavelet_methods_for_the_analysis_of_neuronal_synchrony/links/0deec52baa808a3812000000 803 | Le Van Quyen M, Foucher J, Lachaux J-P, Rodriguez E, Lutz A, Martinerie JM, Varela FJ (2001) 804 | Comparison of Hilbert transform and wavelet methods for the analysis of neural synchrony. 805 | J Neurosci Methods 111:83-98 806 | 807 | NOTE(mike): This seemed to work well in cross-validation, but I never got an increase 808 | on the leaderboard. 809 | """ 810 | def __init__(self, with_eigen=False, with_raw=True): 811 | assert with_eigen or with_raw 812 | self.with_raw = with_raw 813 | self.with_eigen = with_eigen 814 | 815 | def get_name(self): 816 | return 'phase-synchrony-%s%s' % ('-eigen' if self.with_eigen else '', '-noraw' if not self.with_raw else '') 817 | 818 | def apply_one(self, X, meta): 819 | h = X + (1j * hilbert(X)) 820 | phase = np.angle(h) 821 | 822 | num_bins = int(np.exp(0.626 + 0.4 * np.log(X.shape[-1] - 1))) 823 | Hmax = np.log(num_bins) 824 | 825 | num_channels = X.shape[0] 826 | if self.with_eigen: 827 | M = np.ones((num_channels, num_channels), dtype=np.float64) 828 | out = np.empty((num_channels * (num_channels - 1) / 2,), dtype=np.float64) 829 | count = 0 830 | for i in range(num_channels): 831 | for j in range(i + 1, num_channels): 832 | ch1_phase = phase[i] 833 | ch2_phase = phase[j] 834 | 835 | phase_diff = np.mod(np.abs(ch1_phase - ch2_phase), np.pi * 2.0) 836 | 837 | # convert phase_diff into a pdf of num_bins 838 | hist = np.histogram(phase_diff, bins=num_bins)[0] 839 | pdf = hist.astype(np.float64) / np.sum(hist) 840 | 841 | H = np.sum(pdf * np.log(pdf + 1e-12)) 842 | 843 | p = (H + Hmax) / Hmax 844 | 845 | if self.with_eigen: 846 | M[i][j] = p 847 | M[j][i] = p 848 | out[count] = p 849 | count += 1 850 | 851 | if self.with_eigen: 852 | eigen = Eigenvalues().apply_one(M) 853 | 854 | if self.with_eigen and self.with_raw: 855 | return np.concatenate((out, eigen)) 856 | 857 | if self.with_eigen: 858 | return eigen 859 | else: 860 | return out 861 | --------------------------------------------------------------------------------
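# NOTE: a minimal sketch (not part of the original repository) of how the transforms above are
# meant to be chained, mirroring the way pipeline.py composes them. The 16-channel, 400 Hz,
# 600 s segment and the meta fields below are assumptions for illustration; real segments and
# metadata come from the competition data via mat_to_hdf5.py.

import numpy as np

from common.data import jsdict
from seizure_prediction.transforms import FFT, Log10, Magnitude, Slice, Stats, Windower


def example_feature_chain():
    meta = jsdict({'data_length_sec': 600, 'sampling_frequency': 400.0, 'data_type': 'preictal'})
    X = np.random.randn(16, 600 * 400)                 # (num_channels, num_samples)
    windows = Windower(window_secs=60).apply(X, meta)  # -> (10, 16, 24000)
    for transform in (FFT(), Slice(1, 48), Magnitude(), Log10()):
        windows = transform.apply(windows, meta)       # spectral magnitude, bins 1-47, log10
    return Stats().apply(windows, meta)                # -> (10, 16, 3) per-window channel stats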