├── common
    ├── __init__.py
    ├── data.py
    └── time.py
├── seizure_prediction
    ├── __init__.py
    ├── cross_validation
    │   ├── __init__.py
    │   ├── sequences.py
    │   ├── legacy_strategy.py
    │   └── kfold_strategy.py
    ├── fft_bins.py
    ├── scores.py
    ├── settings.py
    ├── classifiers.py
    ├── hdf5.py
    ├── feature_selection.py
    ├── pipeline.py
    ├── data.py
    ├── tasks.py
    └── transforms.py
├── SETTINGS.json
├── submissions
    └── combine.py
├── .gitignore
├── LICENSE
├── examine_cv_strategies.py
├── mat_to_hdf5.py
├── ensemble.py
├── main.py
├── README.md
└── genetic.py


/common/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/seizure_prediction/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/seizure_prediction/cross_validation/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/SETTINGS.json:
--------------------------------------------------------------------------------
1 | {
2 |   "competition-data-dir": "data",
3 |   "data-cache-dir": "data-cache",
4 |   "submission-dir": "submissions",
5 |   "num-jobs": "auto"
6 | }
7 | 


--------------------------------------------------------------------------------
/seizure_prediction/fft_bins.py:
--------------------------------------------------------------------------------
1 | # NOTE(mike): FFT bin ranges I used a lot
2 | super_duper_bins = [0.5, 2, 3.5, 5, 6.5, 8, 10, 17, 24, 31, 39]
3 | super_bins = [0.5, 2, 3.5, 5, 6.5, 8, 10, 17, 24, 31, 39, 48]
4 | winning_bins = [0.5, 2.25, 4, 5.5, 7, 9.5, 12, 21, 30, 39, 48]
5 | original_bins = [0.5, 4, 7, 12, 30, 48]
6 | 


--------------------------------------------------------------------------------
/common/data.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import os.path
 3 | 
 4 | 
 5 | def makedirs(dir):
 6 |     try:
 7 |         os.makedirs(dir)
 8 |     except:
 9 |         pass
10 | 
11 | class jsdict(dict):
12 |     def __init__(self, *args, **kwargs):
13 |         super(jsdict, self).__init__(*args, **kwargs)
14 |         self.__dict__ = self
15 | 


--------------------------------------------------------------------------------
/seizure_prediction/scores.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | 
 3 | # helper methods for printing scores
 4 | 
 5 | 
 6 | def get_score_summary(name, scores):
 7 |     summary = 'mean=%.3f std=%.3f' % (np.mean(scores), np.std(scores))
 8 |     score_list = ['%.3f' % score for score in scores]
 9 |     return '%s [%s] %s' % (summary, ','.join(score_list), name)
10 | 
11 | 
12 | def print_results(summaries):
13 |     summaries.sort(cmp=lambda x,y: cmp(x[1], y[1]))
14 |     if len(summaries) > 1:
15 |         print 'summaries'
16 |         for s, mean in summaries:
17 |             print s
18 | 


--------------------------------------------------------------------------------
/submissions/combine.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python2.7
 2 | 
 3 | import numpy as np
 4 | import sys
 5 | import gzip
 6 | 
 7 | if len(sys.argv) != 3:
 8 |   print >>sys.err, 'Usage: ./combine.py submission0.csv.gz submission1.csv.gz | gzip >combined-0-1.csv.gz'
 9 | 
10 | filenames = sys.argv[1:]
11 | files = [gzip.open(filename, 'rb') for filename in filenames]
12 | 
13 | print [f.readline() for f in files][0],
14 | 
15 | def split(line):
16 |   t, p = line.split(',')
17 |   return t, float(p)
18 | 
19 | while True:
20 |   lines = [f.readline() for f in files]
21 |   if lines[0] == "":
22 |     break;
23 | 
24 |   t, p = zip(*[split(line) for line in lines])
25 | 
26 |   for tt in t:
27 |     assert(tt == t[0])
28 | 
29 |   p = np.mean(p)
30 |   print '%s,%.10f' % (t[0], p)
31 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Byte-compiled / optimized / DLL files
 2 | __pycache__/
 3 | *.py[cod]
 4 | 
 5 | # C extensions
 6 | *.so
 7 | 
 8 | # Distribution / packaging
 9 | .Python
10 | env/
11 | bin/
12 | build/
13 | develop-eggs/
14 | dist/
15 | eggs/
16 | lib/
17 | lib64/
18 | parts/
19 | sdist/
20 | var/
21 | *.egg-info/
22 | .installed.cfg
23 | *.egg
24 | 
25 | # Installer logs
26 | pip-log.txt
27 | pip-delete-this-directory.txt
28 | 
29 | # Unit test / coverage reports
30 | htmlcov/
31 | .tox/
32 | .coverage
33 | .cache
34 | nosetests.xml
35 | coverage.xml
36 | 
37 | # Translations
38 | *.mo
39 | 
40 | # Mr Developer
41 | .mr.developer.cfg
42 | .project
43 | .pydevproject
44 | 
45 | # Rope
46 | .ropeproject
47 | 
48 | # Django stuff:
49 | *.log
50 | *.pot
51 | 
52 | # Sphinx documentation
53 | docs/_build/
54 | 
55 | 


--------------------------------------------------------------------------------
/common/time.py:
--------------------------------------------------------------------------------
 1 | from datetime import datetime
 2 | 
 3 | def unix_time(dt):
 4 |     epoch = datetime.utcfromtimestamp(0)
 5 |     delta = dt - epoch
 6 |     return delta.total_seconds()
 7 | 
 8 | 
 9 | def unix_time_millis(dt):
10 |     return int(unix_time(dt) * 1000.0)
11 | 
12 | 
13 | def get_millis():
14 |     return unix_time_millis(datetime.now())
15 | 
16 | 
17 | def get_seconds():
18 |     return get_millis() / 1000.0
19 | 
20 | 
21 | class Timer:
22 |     def __init__(self):
23 |         self.start = get_millis()
24 | 
25 |     def elapsed_millis(self):
26 |         return get_millis() - self.start
27 | 
28 |     def elapsed_seconds(self):
29 |         return long(self.elapsed_millis() / 1000.0)
30 | 
31 |     def pretty_str(self):
32 |         ms = self.elapsed_millis()
33 |         if ms > 5000:
34 |             return '%ds' % long(ms / 1000.0)
35 |         return '%dms' % ms
36 | 


--------------------------------------------------------------------------------
/seizure_prediction/settings.py:
--------------------------------------------------------------------------------
 1 | from collections import namedtuple
 2 | import json
 3 | import multiprocessing
 4 | import os
 5 | 
 6 | Settings = namedtuple('Settings', ['data_dir', 'cache_dir', 'submission_dir', 'N_jobs'])
 7 | 
 8 | 
 9 | def load_settings():
10 |     with open('SETTINGS.json') as f:
11 |         settings = json.load(f)
12 | 
13 |     data_dir = str(settings['competition-data-dir'])
14 |     cache_dir = str(settings['data-cache-dir'])
15 |     submission_dir = str(settings['submission-dir'])
16 |     N_jobs = str(settings['num-jobs'])
17 |     N_jobs = multiprocessing.cpu_count() if N_jobs == 'auto' else int(N_jobs)
18 | 
19 |     for d in (cache_dir, submission_dir):
20 |         try:
21 |             os.makedirs(d)
22 |         except:
23 |             pass
24 | 
25 |     return Settings(data_dir=data_dir, cache_dir=cache_dir, submission_dir=submission_dir, N_jobs=N_jobs)
26 | 


--------------------------------------------------------------------------------
/seizure_prediction/cross_validation/sequences.py:
--------------------------------------------------------------------------------
 1 | import sklearn.utils
 2 | 
 3 | def collect_sequence_ranges(sequences):
 4 |     assert len(sequences) > 0
 5 |     seq_starts = [0]
 6 |     prev = sequences[0]
 7 |     for i, seq in enumerate(sequences[1:]):
 8 |         if seq != prev + 1:
 9 |             seq_starts.append(i + 1)
10 |         prev = seq
11 | 
12 |     seq_ranges = []
13 |     prev_start = seq_starts[0]
14 |     for start in seq_starts[1:]:
15 |         seq_ranges.append((prev_start, start))
16 |         prev_start = start
17 | 
18 |     seq_ranges.append((prev_start, len(sequences)))
19 | 
20 |     return seq_ranges
21 | 
22 | def collect_sequence_ranges_from_meta(meta, shuffle=True):
23 |     sequences = meta.sequence
24 |     seq_ranges = collect_sequence_ranges(sequences)
25 |     if shuffle:
26 |         seq_ranges = sklearn.utils.shuffle(seq_ranges, random_state=2)
27 |     return seq_ranges
28 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2014 Michael Hills
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/seizure_prediction/classifiers.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import sklearn
 3 | import sklearn.pipeline
 4 | from sklearn.linear_model import LogisticRegression, LinearRegression
 5 | from sklearn.preprocessing import StandardScaler
 6 | from sklearn.svm import SVC
 7 | 
 8 | 
 9 | # NOTE(mike): doesn't handle multi-class
10 | class SimpleLogisticRegression(LinearRegression):
11 |     def predict_proba(self, X):
12 |         predictions = self.predict(X)
13 |         predictions = sklearn.preprocessing.scale(predictions)
14 |         predictions = 1.0 / (1.0 + np.exp(-0.5 * predictions))
15 |         return np.vstack((1.0 - predictions, predictions)).T
16 | 
17 | 
18 | def make_svm(gamma, C):
19 |     cls = sklearn.pipeline.make_pipeline(StandardScaler(),
20 |         SVC(gamma=gamma, C=C, probability=True, cache_size=500, random_state=0))
21 |     name = 'ss-svc-g%.4f-C%.1f' % (gamma, C)
22 |     return (cls, name)
23 | 
24 | 
25 | def make_lr(C):
26 |     cls = sklearn.pipeline.make_pipeline(StandardScaler(), LogisticRegression(C=C))
27 |     name = 'ss-lr-C%.4f' % C
28 |     return (cls, name)
29 | 
30 | 
31 | def make_simple_lr():
32 |     return (sklearn.pipeline.make_pipeline(StandardScaler(), SimpleLogisticRegression()), 'ss-slr')
33 | 


--------------------------------------------------------------------------------
/seizure_prediction/cross_validation/legacy_strategy.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import sklearn.cross_validation
 3 | from seizure_prediction.cross_validation.sequences import collect_sequence_ranges_from_meta
 4 | 
 5 | 
 6 | class LegacyStrategy:
 7 |     """
 8 |     Hand-picked random folds maintaining sequence integrity with 80% train/cv split.
 9 |     See k_fold_strategy for docs on each method.
10 |     """
11 | 
12 |     def get_name(self):
13 |         return 'legacy'
14 | 
15 |     def get_folds(self, preictal_meta):
16 |         # hand-picked on my system to give a nice spread when num_sequences = 3,
17 |         # i.e. (0, 1), (0, 2), (1, 2) when using 3 folds
18 |         # The new way is to use k_fold.py instead of this
19 |         return [8, 11, 14]
20 | 
21 |     def get_sequence_ranges(self, meta, fold_number, interictal=None, shuffle=None):
22 |         train_size = 0.8
23 |         seq_ranges = collect_sequence_ranges_from_meta(meta, shuffle=False)
24 |         return sklearn.cross_validation.train_test_split(seq_ranges, train_size=train_size, random_state=fold_number)
25 | 
26 |     def split_train_cv(self, data, meta, fold_number, interictal=False):
27 | 
28 |         train_ranges, cv_ranges = self.get_sequence_ranges(meta, fold_number, interictal=interictal)
29 | 
30 |         train_data = []
31 |         for start, end in train_ranges:
32 |             train_data.append(data[start:end])
33 |         train_data = np.concatenate(train_data, axis=0)
34 | 
35 |         cv_data = []
36 |         for start, end in cv_ranges:
37 |             cv_data.append(data[start:end])
38 |         cv_data = np.concatenate(cv_data, axis=0)
39 | 
40 |         return train_data, cv_data
41 | 


--------------------------------------------------------------------------------
/seizure_prediction/hdf5.py:
--------------------------------------------------------------------------------
 1 | import h5py
 2 | import numpy as np
 3 | import re
 4 | from common.data import jsdict
 5 | 
 6 | # Helper method to dump a dictionary of ndarrays or primitives to hdf5, and then read them back.
 7 | # It looks like I also added list support, cool.
 8 | 
 9 | METADATA_TAG = '__metadata'
10 | 
11 | list_regex = re.compile(r"""__list_(.*)_(\d+)""")
12 | 
13 | 
14 | def write(filename, obj):
15 |     data = h5py.File(filename, 'w-', libver='latest')
16 |     meta_dataset = data.create_dataset(METADATA_TAG, shape=(1,))
17 | 
18 |     for key in obj.keys():
19 |         value = obj[key]
20 |         if isinstance(value, np.ndarray):
21 |             data.create_dataset(key, data=value)
22 |         elif isinstance(value, list):
23 |             for i, v in enumerate(value):
24 |                 assert isinstance(v, np.ndarray)
25 |                 data.create_dataset('__list_%s_%d' % (key, i), data=v)
26 |         else:
27 |             meta_dataset.attrs[key] = value
28 | 
29 |     data.close()
30 | 
31 | 
32 | def read(filename):
33 |     data = h5py.File(filename, 'r')
34 |     obj = {}
35 |     for key in data.keys():
36 |         value = data[key]
37 |         if key == METADATA_TAG:
38 |             for metakey in value.attrs.keys():
39 |                 obj[metakey] = value.attrs[metakey]
40 |         elif not key.startswith('__list'):
41 |             obj[key] = value[:]
42 | 
43 |     list_keys = [key for key in data.keys() if key.startswith('__list')]
44 |     if len(list_keys) > 0:
45 |         list_keys.sort()
46 |         for key in list_keys:
47 |             match = list_regex.match(key)
48 |             assert match is not None
49 |             list_key = match.group(1)
50 |             list_index = int(match.group(2))
51 |             out_list = obj.setdefault(list_key, [])
52 |             assert len(out_list) == list_index
53 |             out_list.append(data[key][:])
54 | 
55 |     data.close()
56 | 
57 |     return jsdict(obj)
58 | 
59 | 


--------------------------------------------------------------------------------
/examine_cv_strategies.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python2.7
 2 | 
 3 | import numpy as np
 4 | 
 5 | from seizure_prediction.cross_validation.kfold_strategy import KFoldStrategy
 6 | from seizure_prediction.cross_validation.legacy_strategy import LegacyStrategy
 7 | from seizure_prediction.cross_validation.sequences import collect_sequence_ranges
 8 | from seizure_prediction.pipeline import Pipeline, InputSource
 9 | from seizure_prediction.settings import load_settings
10 | from seizure_prediction.tasks import load_pipeline_data
11 | 
12 | 
13 | targets = [
14 |     'Dog_1',
15 |     'Dog_2',
16 |     'Dog_3',
17 |     'Dog_4',
18 |     'Dog_5',
19 |     'Patient_1',
20 |     'Patient_2'
21 | ]
22 | 
23 | class Zero:
24 |     def get_name(self):
25 |         return 'zero'
26 | 
27 |     def apply(self, X, meta):
28 |         return np.zeros(list(X.shape[:-1]) + [1])
29 | 
30 | settings = load_settings()
31 | pipeline = Pipeline(InputSource(), Zero())
32 | 
33 | strategies = [
34 |     LegacyStrategy(),
35 |     KFoldStrategy(),
36 | ]
37 | 
38 | for strategy in strategies:
39 |     print 'Strategy', strategy.get_name()
40 |     for target in targets:
41 |         _, preictal_meta = load_pipeline_data(settings, target, 'preictal', pipeline, check_only=False, quiet=True, meta_only=True)
42 |         # _, interictal_meta = load_pipeline_data(settings, target, 'interictal', pipeline, check_only=False, quiet=True, meta_only=True)
43 |         fold_numbers = strategy.get_folds(preictal_meta)
44 |         data = np.arange(0, preictal_meta.X_shape[0]).astype(np.int)
45 |         sequence_ranges = collect_sequence_ranges(preictal_meta.sequence)
46 |         print '%s\n%d folds from %d sequences %s' % (target, len(fold_numbers), len(sequence_ranges), sequence_ranges)
47 |         for fold_number in fold_numbers:
48 |             train_folds, cv_folds = strategy.get_sequence_ranges(preictal_meta, fold_number, interictal=False, shuffle=False)
49 |             print [list(f) for f in train_folds]
50 |         print
51 | 
52 | 


--------------------------------------------------------------------------------
/seizure_prediction/feature_selection.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | 
 3 | from seizure_prediction.tasks import load_pipeline_data
 4 | 
 5 | 
 6 | # Generate random feature masks using split_ratio as the rough guide to how many features are ON and how many are OFF.
 7 | def generate_feature_masks(settings, target, pipeline, num_masks, split_ratio, random_state, threshold=500, quiet=False):
 8 |     if not quiet: print target
 9 |     def get_pipeline_data(pipeline):
10 |         _, preictal_meta = \
11 |             load_pipeline_data(settings, target, 'preictal', pipeline, check_only=False, meta_only=True, quiet=quiet)
12 |         _, interictal_meta = \
13 |             load_pipeline_data(settings, target, 'interictal', pipeline, check_only=False, meta_only=True, quiet=quiet)
14 |         num_features = preictal_meta.X_shape[-1]
15 |         num_train_segments = preictal_meta.num_segments + interictal_meta.num_segments
16 |         return num_features, num_train_segments
17 | 
18 |     if len(pipeline.get_pipelines()) == 0:
19 |         return []
20 | 
21 |     num_features, num_training_examples = get_pipeline_data(pipeline)
22 | 
23 |     # NOTE(mike): Seemingly some patients benefit from these feature masks and some don't.
24 |     # Currently the only pattern is number of training examples but this may or may not hold
25 |     # true without doing further testing. Some manual testing against public leaderboard showed
26 |     # a negative effect on Patient 1 and 2 but positive effects on Dogs 3 and 4. Dog 1 seemed to
27 |     # have little to no effect and maybe a very slight positive effect on Dog 2.
28 |     if num_training_examples < threshold:
29 |         ratio = 1.0
30 |     else:
31 |         ratio = split_ratio
32 | 
33 |     if not quiet: print 'num features', num_features
34 |     if not quiet: print 'ratio', ratio
35 |     if not quiet: print 'num wanted features', int(num_features * ratio)
36 | 
37 |     if ratio == 1.0:
38 |         masks = np.ones((num_masks, num_features))
39 |     else:
40 |         prng = np.random.RandomState(random_state)
41 |         masks = (prng.random_sample((num_masks, num_features)) <= ratio)
42 | 
43 |     masks = list(masks.astype(np.int))
44 | 
45 |     if not quiet: print np.shape(masks)
46 |     if not quiet: print 'generated', [np.sum(mask) for mask in masks]
47 |     return list(masks)
48 | 
49 | 


--------------------------------------------------------------------------------
/seizure_prediction/pipeline.py:
--------------------------------------------------------------------------------
 1 | 
 2 | class Pipeline(object):
 3 |     """
 4 |     A Pipeline is an object representing the data transformations to make
 5 |     on the input data, finally outputting extracted features.
 6 | 
 7 |     input_source: Where to source the data from, InputSource() for original data or
 8 |                   InputSource(some_pipeline) to load the output of a pipeline.
 9 |     transforms: List of transforms to apply one by one to the input data.
10 |     """
11 |     def __init__(self, input_source, *transforms):
12 |         self.input_source = input_source
13 |         input_source_pipeline = input_source.get_pipeline()
14 |         self.transforms = transforms
15 |         full_pipeline = [input_source_pipeline] + list(transforms) if input_source_pipeline is not None else transforms
16 |         names = [t.get_name() for t in full_pipeline]
17 |         self.name = 'empty' if len(names) == 0 else '_'.join(names)
18 | 
19 |     def get_name(self):
20 |         return self.name
21 | 
22 |     def get_names(self):
23 |         return [self.name]
24 | 
25 |     def apply(self, data, meta):
26 |         for transform in self.transforms:
27 |             data = transform.apply(data, meta)
28 |         return data
29 | 
30 |     def get_input_source(self):
31 |         return self.input_source
32 | 
33 |     def get_pipelines(self):
34 |         return [self]
35 | 
36 | 
37 | class FeatureConcatPipeline(object):
38 |     """
39 |     Represents a list of pipelines with their features concatenated together.
40 |     Useful for combining separate feature sets to see if they combine well.
41 |     """
42 |     def __init__(self, *pipelines, **options):
43 |         pipelines = list(pipelines)
44 |         if 'sort' not in options or options['sort'] == True:
45 |             pipelines.sort(lambda x, y: cmp(x.get_name(), y.get_name()))
46 |         self.pipelines = pipelines
47 |         self.names = [p.get_name() for p in pipelines]
48 |         self.name = 'FCP_' + '_cc_'.join(self.names)
49 |         for p in pipelines:
50 |             assert isinstance(p, Pipeline)
51 | 
52 |     def get_name(self):
53 |         return self.name
54 | 
55 |     def get_names(self):
56 |         return self.names
57 | 
58 |     def apply(self, data, meta):
59 |         raise NotImplementedError()
60 | 
61 |     def get_pipelines(self):
62 |         return self.pipelines
63 | 
64 |     def get_input_source(self):
65 |         raise NotImplementedError()
66 | 
67 | 
68 | class InputSource:
69 |     """
70 |     Wraps a pipeline to represent it as a data-source.
71 |     """
72 |     def __init__(self, *transforms):
73 |         self.pipeline = Pipeline(InputSource(), *transforms) if len(transforms) > 0 else None
74 | 
75 |     def get_pipeline(self):
76 |         return self.pipeline
77 | 
78 | 


--------------------------------------------------------------------------------
/seizure_prediction/cross_validation/kfold_strategy.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import sklearn
 3 | from seizure_prediction.cross_validation.sequences import collect_sequence_ranges_from_meta
 4 | 
 5 | class KFoldStrategy:
 6 |     """
 7 |     Create a k-fold strategy focused on preictal segments. The idea is to create a small number of folds
 8 |     that maximise coverage of the training set. Small number of folds as to keep performance in check.
 9 |     If there are 3 preictal sequences, then do 3 folds of (0,1), (0,2), (1,2). If there are 6 sequences,
10 |     do 3 folds (0,1), (2,3), (4,5). The sequences are shuffled before being allocated to folds.
11 | 
12 |     However, interictal sequences are partitioned randomly as there are a lot more of them that random
13 |     should more or less be fine.
14 |     """
15 | 
16 |     def get_name(self):
17 |         return 'kfold'
18 | 
19 |     def get_folds(self, preictal_meta):
20 |         """
21 |         :param preictal_meta: metadata from preictal segments
22 |         :return: iterable of fold numbers to pass to split_train_cv
23 |         """
24 |         num_seqs = len(collect_sequence_ranges_from_meta(preictal_meta))
25 |         assert num_seqs >= 2
26 |         if num_seqs <= 2:
27 |             num_folds = 2
28 |         elif num_seqs <= 6:
29 |             num_folds = 3
30 |         else:
31 |             num_folds = num_seqs / 2
32 | 
33 |         return xrange(num_folds)
34 | 
35 |     def get_sequence_ranges(self, meta, fold_number, interictal=False, shuffle=True):
36 |         seq_ranges = collect_sequence_ranges_from_meta(meta, shuffle=shuffle)
37 |         num_seqs = len(seq_ranges)
38 | 
39 |         # calculate the split numbers for a fold
40 |         def get_num_train_seqs(num_seqs):
41 |             if num_seqs <= 3:
42 |                 return 2
43 |             else:
44 |                 return 3
45 | 
46 |         if interictal:
47 |             interictal_ratio = 0.8 if num_seqs <= 20 else 0.4
48 |             train_ranges, cv_ranges = sklearn.cross_validation.train_test_split(seq_ranges, train_size=interictal_ratio, random_state=fold_number)
49 |         else:
50 |             train_size = get_num_train_seqs(num_seqs)
51 |             if num_seqs == 3:
52 |                 combinations = [[0, 1], [0, 2], [1, 2]]
53 |             else:
54 |                 first_pass = [range(i, i + train_size) for i in range(0, num_seqs, train_size) if (i + train_size) <= num_seqs]
55 |                 remainder = num_seqs % train_size
56 |                 if remainder == 0:
57 |                     gap = []
58 |                 else:
59 |                     seq = range(num_seqs - remainder, num_seqs)
60 |                     needed = train_size - remainder
61 |                     gap_fillers = [i * train_size for i in range(needed)]
62 |                     gap_fillers = [x for x in gap_fillers if x < num_seqs]
63 |                     # print 'gf', gap_fillers
64 |                     if len(gap_fillers) < train_size:
65 |                         gap_fillers = [i * (train_size-1) for i in range(needed)]
66 |                         gap_fillers = [x for x in gap_fillers if x < num_seqs]
67 |                     gap = [gap_fillers + seq]
68 |                 second_pass = [range(i, i + train_size**2, train_size) for i in range(num_seqs)]
69 |                 second_pass = [x for x in second_pass if len(x) == train_size and x < num_seqs]
70 |                 third_pass = [range(i, i + train_size) for i in range(1, num_seqs, train_size) if (i + train_size) <= num_seqs]
71 |                 # third_pass = [range(i, i + train_size) for i in range(2, num_seqs, train_size) if (i + train_size) < num_seqs]
72 |                 combinations = first_pass + gap + second_pass + third_pass
73 |             indices = combinations[fold_number]
74 |             # print 'indices', indices
75 |             train_ranges = [seq_ranges[i] for i in indices]
76 |             cv_ranges = np.delete(seq_ranges, indices, axis=0)
77 | 
78 |         return train_ranges, cv_ranges
79 | 
80 |     def split_train_cv(self, data, meta, fold_number, interictal=False):
81 |         train_ranges, cv_ranges = self.get_sequence_ranges(meta, fold_number, interictal)
82 | 
83 |         train_data = []
84 |         for start, end in train_ranges:
85 |             train_data.append(data[start:end])
86 |         train_data = np.concatenate(train_data, axis=0)
87 | 
88 |         cv_data = []
89 |         for start, end in cv_ranges:
90 |             cv_data.append(data[start:end])
91 |         cv_data = np.concatenate(cv_data, axis=0)
92 | 
93 |         return train_data, cv_data
94 | 


--------------------------------------------------------------------------------
/mat_to_hdf5.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python2.7
  2 | 
  3 | from collections import namedtuple
  4 | from multiprocessing import Pool
  5 | from common.data import jsdict
  6 | from common.time import Timer
  7 | from seizure_prediction import hdf5
  8 | from seizure_prediction.data import accumulate_data
  9 | from seizure_prediction.settings import load_settings
 10 | import numpy as np
 11 | import scipy.io
 12 | import scipy.signal
 13 | import os.path
 14 | import sys
 15 | 
 16 | 
 17 | Reader = namedtuple('Reader', ['read', 'exists', 'filename'])
 18 | 
 19 | 
 20 | class Metadata(object):
 21 | 
 22 |     def __init__(self):
 23 |         self.shape = None
 24 |         self.data_length_sec = None
 25 |         self.sampling_frequency = None
 26 |         self.channels = None
 27 |         self.sequences = []
 28 | 
 29 |     def add_shape(self, shape):
 30 |         if self.shape is None:
 31 |             self.shape = shape
 32 |         else:
 33 |             assert shape == self.shape
 34 | 
 35 |     def add_data_length_sec(self, data_length_sec):
 36 |         if self.data_length_sec is None:
 37 |             self.data_length_sec = data_length_sec
 38 |         else:
 39 |             assert data_length_sec == self.data_length_sec
 40 | 
 41 |     def add_sampling_frequency(self, sampling_frequency):
 42 |         if self.sampling_frequency is None:
 43 |             self.sampling_frequency = sampling_frequency
 44 |         else:
 45 |             assert sampling_frequency == self.sampling_frequency
 46 | 
 47 |     def add_channels(self, channels):
 48 |         if self.channels is None:
 49 |             self.channels = channels
 50 |         else:
 51 |             assert np.alltrue(channels == self.channels)
 52 | 
 53 |     def add_sequence(self, sequence):
 54 |         if sequence is not None:
 55 |             self.sequences.append(sequence)
 56 | 
 57 |     def __str__(self):
 58 |         seq_groups = []
 59 |         prev = None
 60 |         prev_start = None
 61 |         for seq in self.sequences:
 62 |             if prev_start is None:
 63 |                 prev_start = seq
 64 |             else:
 65 |                 if seq != prev + 1:
 66 |                     if prev_start == prev:
 67 |                         seq_groups.append('%d' % prev)
 68 |                     else:
 69 |                         seq_groups.append('%d-%d' % (prev_start, prev))
 70 |                     prev_start = seq
 71 |             prev = seq
 72 |         if prev_start is not None:
 73 |             seq_groups.append('%d-%d' % (prev_start, prev))
 74 | 
 75 |         seq_mega_groups = []
 76 |         prev = None
 77 |         count = 1
 78 |         for group in seq_groups:
 79 |             if prev is not None:
 80 |                 if prev != group:
 81 |                     seq_mega_groups.append(('%d of %s' % (count, prev)) if count > 1 else prev)
 82 |                     count = 1
 83 |                 else:
 84 |                     count += 1
 85 |             prev = group
 86 |         if prev is not None:
 87 |             seq_mega_groups.append('%d of %s' % (count, prev) if count > 1 else prev)
 88 | 
 89 |         return str({
 90 |             'shape': self.shape,
 91 |             'data_length_sec': self.data_length_sec,
 92 |             'sampling_frequency': self.sampling_frequency,
 93 |             'channels': len(self.channels) if self.channels is not None else None,
 94 |             'sequences': seq_mega_groups
 95 |         })
 96 | 
 97 | 
 98 | def process_data_sub_job(settings, filename_in_fmt, filename_out_fmt, id, num_jobs):
 99 | 
100 |     pid = os.getpid()
101 |     reader = mat_reader(target, settings.data_dir)
102 | 
103 |     num_processed = 0
104 |     for i in xrange(id + 1, sys.maxint, num_jobs):
105 |         out_index = i - 1
106 |         filename_in = filename_in_fmt % i
107 |         filename_out = filename_out_fmt % out_index if filename_out_fmt is not None else None
108 |         filename_out_temp = '%s.pid.%d.tmp' % (filename_out, pid) if filename_out is not None else None
109 | 
110 |         if filename_out is not None and os.path.exists(filename_out):
111 |             num_processed += 1
112 |             continue
113 | 
114 |         if not reader.exists(filename_in):
115 |             if i == id + 1:
116 |                 print 'Could not find file', reader.filename(filename_in)
117 |                 return 0
118 |             break
119 | 
120 |         print 'Runner %d processing %s' % (id, reader.filename(filename_in))
121 | 
122 |         segment = reader.read(filename_in)
123 |         data = process_data(segment)
124 |         hdf5.write(filename_out_temp, data)
125 | 
126 |         os.rename(filename_out_temp, filename_out)
127 | 
128 |         num_processed += 1
129 | 
130 |     return num_processed
131 | 
132 | 
133 | def process_data(segment):
134 |     data_key = [key for key in segment.keys() if not key.startswith('_')][0]
135 |     data = segment[data_key][0][0]
136 | 
137 |     X = data[0]
138 |     data_length_sec = int(data[1][0][0])
139 |     sampling_frequency = float(data[2][0][0])
140 |     channels = [ch[0] for ch in data[3][0]]
141 |     sequence = int(data[4][0][0]) if len(data) >= 5 else None
142 | 
143 |     min_freq = 195.0
144 |     def find_q():
145 |         q = 2
146 |         while True:
147 |             f = sampling_frequency / q
148 |             if f < min_freq:
149 |                 return q - 1
150 |             q += 1
151 | 
152 |     if sampling_frequency > min_freq:
153 |         q = find_q()
154 |         if q > 1:
155 |             # if X.dtype != np.float64:
156 |             #     X = X.astype(np.float64)
157 |             # X -= X.mean(axis=0)
158 |             X = scipy.signal.decimate(X, q, ftype='fir', axis=X.ndim-1)
159 |             X = np.round(X).astype(np.int16)
160 |             # if X.dtype != np.float32:
161 |             #     X = X.astype(np.float32)
162 |             sampling_frequency /= q
163 | 
164 |     channels = np.array(channels, dtype=str(channels[0].dtype).replace('U', 'S'))
165 |     out = {
166 |         'X': X,
167 |         'data_length_sec': data_length_sec,
168 |         'sampling_frequency': sampling_frequency,
169 |         'num_channels': X.shape[0],
170 |         'channels': channels,
171 |         'target': target,
172 |         'data_type': data_type,
173 |     }
174 |     if sequence is not None:
175 |         out['sequence'] = sequence
176 | 
177 |     return jsdict(out)
178 | 
179 | 
180 | #used for verifying and printing
181 | def collect_metadata(data, metadata_accum):
182 |     metadata_accum.add_shape(data.X.shape)
183 |     metadata_accum.add_data_length_sec(data.data_length_sec)
184 |     metadata_accum.add_sampling_frequency(data.sampling_frequency)
185 |     metadata_accum.add_channels(data.channels)
186 |     if 'sequence' in data:
187 |         metadata_accum.add_sequence(data.sequence)
188 | 
189 | 
190 | def process_and_merge_segments(target, data_type, out_dir, metadata, N_jobs):
191 |     filename_out = os.path.join(out_dir, '%s_%s.hdf5' % (target, data_type))
192 | 
193 |     if os.path.exists(filename_out):
194 |         return 0
195 | 
196 |     print 'Processing %s ...' % filename_out
197 | 
198 |     filename_in_fmt = '%s_%s_segment_%%.4d' % (target, data_type)
199 |     filename_out_fmt = '%s/%s_%s_segment_%%d.hdf5' % (out_dir, target, data_type)
200 | 
201 |     # process_data_sub_job(settings, filename_in_fmt, filename_out_fmt, 0, 1)
202 |     pool = Pool(N_jobs)
203 |     results = [pool.apply_async(process_data_sub_job, [settings, filename_in_fmt, filename_out_fmt, id, N_jobs])
204 |         for id in range(N_jobs)]
205 |     pool.close()
206 |     pool.join()
207 | 
208 |     num_processed = np.sum([r.get() for r in results])
209 |     for i in xrange(num_processed):
210 |         data = hdf5.read(filename_out_fmt % i)
211 |         collect_metadata(data, metadata)
212 | 
213 |     _, accum_meta = accumulate_data(settings, target, data_type, tag=None,
214 |         output_to_original_data_dir=True, quiet=True)
215 | 
216 |     return accum_meta.num_segments
217 | 
218 | 
219 | def mat_reader(target, dir):
220 |     ext = '.mat'
221 |     expand_filename = lambda filename: os.path.join(dir, target, filename + ext)
222 |     read = lambda filename: scipy.io.loadmat(expand_filename(filename))
223 |     exists = lambda filename: os.path.exists(expand_filename(filename))
224 |     return Reader(read=read, exists=exists, filename=expand_filename)
225 | 
226 | 
227 | def process_mat_into_hdf5(settings, target, data_type, N_jobs):
228 |     assert data_type in ('preictal', 'interictal', 'test')
229 | 
230 |     print 'Loading data ...'
231 |     timer = Timer()
232 | 
233 |     out_dir = os.path.join(settings.data_dir)
234 |     metadata = Metadata()
235 |     segments_processed = process_and_merge_segments(target, data_type, out_dir, metadata, N_jobs)
236 | 
237 |     print 'Processed %d segments in %s' % (segments_processed, timer.pretty_str())
238 |     print data_type, 'Metadata', metadata
239 | 
240 | 
241 | if __name__ == "__main__":
242 | 
243 |     settings = load_settings()
244 |     N_jobs = 8
245 | 
246 |     data_types = [
247 |         'preictal',
248 |         'interictal',
249 |         'test'
250 |     ]
251 | 
252 |     targets = [
253 |         'Dog_1',
254 |         'Dog_2',
255 |         'Dog_3',
256 |         'Dog_4',
257 |         'Dog_5',
258 |         'Patient_1',
259 |         'Patient_2'
260 |     ]
261 | 
262 |     for target in targets:
263 |         for data_type in data_types:
264 |             process_mat_into_hdf5(settings, target, data_type, N_jobs)
265 | 


--------------------------------------------------------------------------------
/ensemble.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python2.7
  2 | 
  3 | from multiprocessing import Pool
  4 | import sys
  5 | 
  6 | import numpy as np
  7 | from sklearn.metrics import roc_auc_score
  8 | 
  9 | from seizure_prediction.classifiers import make_svm, make_simple_lr, make_lr
 10 | from seizure_prediction.feature_selection import generate_feature_masks
 11 | from seizure_prediction.fft_bins import *
 12 | from seizure_prediction.pipeline import Pipeline, FeatureConcatPipeline, InputSource
 13 | from seizure_prediction.scores import get_score_summary, print_results
 14 | from seizure_prediction.tasks import make_csv_for_target_predictions, write_submission_file, \
 15 |     cross_validation_score, check_training_data_loaded, check_test_data_loaded, make_submission_predictions
 16 | from seizure_prediction.transforms import Windower, Correlation, FreqCorrelation, FFT, \
 17 |     Magnitude, PIBSpectralEntropy, Log10, FreqBinning, FlattenChannels, Preprocess, HFD, PFD, Hurst
 18 | from seizure_prediction.settings import load_settings
 19 | from main import run_prepare_data_for_cross_validation
 20 | 
 21 | 
 22 | def run_make_submission(settings, targets_and_pipelines, split_ratio):
 23 |     pool = Pool(settings.N_jobs)
 24 |     for i, (target, pipeline, feature_masks, classifier, classifier_name) in enumerate(targets_and_pipelines):
 25 |         for j, feature_mask in enumerate(feature_masks):
 26 |             progress_str = 'T=%d/%d M=%d/%d' % (i+1, len(targets_and_pipelines), j+1, len(feature_masks))
 27 |             pool.apply_async(make_submission_predictions, [settings, target, pipeline, classifier, classifier_name],
 28 |                 {'feature_mask': feature_mask, 'progress_str': progress_str, 'quiet': True})
 29 |     pool.close()
 30 |     pool.join()
 31 | 
 32 |     guesses = ['clip,preictal']
 33 |     num_masks = None
 34 |     classifier_names = []
 35 |     for target, pipeline, feature_masks, classifier, classifier_name in targets_and_pipelines:
 36 |         classifier_names.append(classifier_name)
 37 |         if num_masks is None:
 38 |             num_masks = len(feature_masks)
 39 |         else:
 40 |             assert num_masks == len(feature_masks)
 41 | 
 42 |         test_predictions = []
 43 | 
 44 |         for feature_mask in feature_masks:
 45 |             data = make_submission_predictions(settings, target, pipeline, classifier, classifier_name, feature_mask=feature_mask)
 46 |             test_predictions.append(data.mean_predictions)
 47 | 
 48 |         predictions = np.mean(test_predictions, axis=0)
 49 |         guesses += make_csv_for_target_predictions(target, predictions)
 50 | 
 51 |     output = '\n'.join(guesses)
 52 |     write_submission_file(settings, output, 'ensemble n=%d split_ratio=%s' % (num_masks, split_ratio), None, str(classifier_names), targets_and_pipelines)
 53 | 
 54 | 
 55 | def run_prepare_data(settings, targets, pipelines, train=True, test=False, quiet=False):
 56 |     for pipeline in pipelines:
 57 |         for target in targets:
 58 |             print 'Preparing data for', target
 59 |             if train:
 60 |                 check_training_data_loaded(settings, target, pipeline, quiet=quiet)
 61 |             if test:
 62 |                 check_test_data_loaded(settings, target, pipeline, quiet=quiet)
 63 | 
 64 | 
 65 | def run_cross_validation(settings, targets, pipelines, mask_range, split_ratios, classifiers):
 66 |     pool = Pool(settings.N_jobs)
 67 |     for i, pipeline in enumerate(pipelines):
 68 |         for j, (classifier, classifier_name) in enumerate(classifiers):
 69 |             for k, target in enumerate(targets):
 70 |                 pool.apply_async(cross_validation_score, [settings, target, pipeline, classifier, classifier_name], {'quiet': True})
 71 |                 for split_num, split_ratio in enumerate(split_ratios):
 72 |                     masks = generate_feature_masks(settings, target, pipeline, np.max(mask_range), split_ratio, random_state=0, quiet=True)
 73 |                     for mask_num, mask in enumerate(masks):
 74 |                         progress_str = 'P=%d/%d C=%d/%d T=%d/%d S=%d/%d M=%d/%d' % (i+1, len(pipelines), j+1, len(classifiers), k+1, len(targets), split_num+1, len(split_ratios), mask_num+1, len(masks))
 75 |                         cross_validation_score(settings, target, pipeline, classifier, classifier_name, feature_mask=mask, quiet=True, return_data=False, pool=pool, progress_str=progress_str)
 76 |     pool.close()
 77 |     pool.join()
 78 |     print 'Finished cross validation mp'
 79 | 
 80 |     summaries = []
 81 |     for p_num, pipeline in enumerate(pipelines):
 82 |         for classifier, classifier_name in classifiers:
 83 |             scores_full = []
 84 |             scores_masked = [[[] for y in mask_range] for x in split_ratios]
 85 |             for i, target in enumerate(targets):
 86 |                 run_prepare_data_for_cross_validation(settings, [target], [pipeline], quiet=True)
 87 |                 data = cross_validation_score(settings, target, pipeline, classifier, classifier_name, pool=None, quiet=True)
 88 |                 scores_full.append(data.mean_score)
 89 | 
 90 |                 for split_index, split_ratio in enumerate(split_ratios):
 91 |                     masks = generate_feature_masks(settings, target, pipeline, np.max(mask_range), split_ratio, random_state=0, quiet=True)
 92 |                     for mask_index, num_masks in enumerate(mask_range):
 93 |                         predictions = []
 94 |                         y_cvs = None
 95 |                         for mask in masks[0:num_masks]:
 96 |                             data = cross_validation_score(settings, target, pipeline, classifier, classifier_name, feature_mask=mask, pool=None, quiet=True)
 97 |                             predictions.append(data.mean_predictions)
 98 |                             if y_cvs is None:
 99 |                                 y_cvs = data.y_cvs
100 |                             else:
101 |                                 for y_cv_1, y_cv_2 in zip(y_cvs, data.y_cvs):
102 |                                     assert np.alltrue(y_cv_1 == y_cv_2)
103 | 
104 |                         predictions = np.mean(predictions, axis=0)
105 |                         scores = [roc_auc_score(y_cv, p) for p, y_cv in zip(predictions, y_cvs)]
106 |                         score = np.mean(scores)
107 |                         scores_masked[split_index][mask_index].append(score)
108 | 
109 |             summary = get_score_summary('%s p=%d full' % (classifier_name, p_num), scores_full)
110 |             summaries.append((summary, np.mean(scores_full)))
111 |             for split_index, split_ratio in enumerate(split_ratios):
112 |                 for mask_index, num_masks in enumerate(mask_range):
113 |                     scores = scores_masked[split_index][mask_index]
114 |                     summary = get_score_summary('%s p=%d split_ratio=%s masks=%d' % (classifier_name, p_num, split_ratio, num_masks), scores)
115 |                     summaries.append((summary, np.mean(scores)))
116 |                     print summary
117 | 
118 |     print_results(summaries)
119 | 
120 | 
121 | def main():
122 |     settings = load_settings()
123 | 
124 |     pipelines = [
125 |         FeatureConcatPipeline(
126 |             Pipeline(InputSource(), Preprocess(), Windower(75), Correlation('none')),
127 |             Pipeline(InputSource(), Preprocess(), Windower(75), FreqCorrelation(1, None, 'none')),
128 |             Pipeline(InputSource(Preprocess(), Windower(75), FFT(), Magnitude()), FreqBinning(winning_bins, 'mean'), Log10(), FlattenChannels()),
129 |             Pipeline(InputSource(Preprocess(), Windower(75), FFT(), Magnitude()), PIBSpectralEntropy([0.25, 1, 1.75, 2.5, 3.25, 4, 5, 8.5, 12, 15.5, 19.5, 24])),
130 |             Pipeline(InputSource(Preprocess(), Windower(75), FFT(), Magnitude()), PIBSpectralEntropy([0.25, 2, 3.5, 6, 15, 24])),
131 |             Pipeline(InputSource(Preprocess(), Windower(75), FFT(), Magnitude()), PIBSpectralEntropy([0.25, 2, 3.5, 6, 15])),
132 |             Pipeline(InputSource(Preprocess(), Windower(75), FFT(), Magnitude()), PIBSpectralEntropy([0.25, 2, 3.5])),
133 |             Pipeline(InputSource(Preprocess(), Windower(75), FFT(), Magnitude()), PIBSpectralEntropy([6, 15, 24])),
134 |             Pipeline(InputSource(Preprocess(), Windower(75), FFT(), Magnitude()), PIBSpectralEntropy([2, 3.5, 6])),
135 |             Pipeline(InputSource(Preprocess(), Windower(75), FFT(), Magnitude()), PIBSpectralEntropy([3.5, 6, 15])),
136 |             Pipeline(InputSource(), Preprocess(), Windower(75), HFD(2)),
137 |             Pipeline(InputSource(), Preprocess(), Windower(75), PFD()),
138 |             Pipeline(InputSource(), Preprocess(), Windower(75), Hurst()),
139 |         ),
140 |     ]
141 | 
142 |     targets = [
143 |         'Dog_1',
144 |         'Dog_2',
145 |         'Dog_3',
146 |         'Dog_4',
147 |         'Dog_5',
148 |         'Patient_1',
149 |         'Patient_2'
150 |     ]
151 | 
152 |     classifiers = [
153 |         make_svm(gamma=0.0079, C=2.7),
154 |         make_svm(gamma=0.0068, C=2.0),
155 |         make_svm(gamma=0.003, C=150.0),
156 |         make_lr(C=0.04),
157 |         make_simple_lr(),
158 |     ]
159 | 
160 | 
161 |     make_submission = len(sys.argv) >= 2 and sys.argv[1] == 'submission'
162 |     do_cv = not make_submission
163 | 
164 |     if do_cv:
165 |         mask_range = [3]
166 |         split_ratios = [0.4, 0.525, 0.6]
167 |         run_prepare_data_for_cross_validation(settings, targets, pipelines)
168 |         run_cross_validation(settings, targets, pipelines, mask_range, split_ratios, classifiers)
169 | 
170 |     if make_submission:
171 |         num_masks = 10
172 |         split_ratio = 0.525
173 |         classifiers = [
174 |             # make_svm(gamma=0.0079, C=2.7),
175 |             make_svm(gamma=0.0068, C=2.0),
176 |             # make_svm(gamma=0.003, C=150.0),
177 |             # make_lr(C=0.04),
178 |             # make_simple_lr(),
179 |         ]
180 | 
181 |         targets_and_pipelines = []
182 |         pipeline = pipelines[0]
183 |         for classifier, classifier_name in classifiers:
184 |             for i, target in enumerate(targets):
185 |                 run_prepare_data(settings, [target], [pipeline], test=True)
186 |                 feature_masks = generate_feature_masks(settings, target, pipeline, num_masks, split_ratio, random_state=0, quiet=True)
187 |                 targets_and_pipelines.append((target, pipeline, feature_masks, classifier, classifier_name))
188 | 
189 |         run_make_submission(settings, targets_and_pipelines, split_ratio)
190 | 
191 | 
192 | if __name__ == "__main__":
193 |     main()
194 | 


--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python2.7
  2 | 
  3 | from multiprocessing import Pool
  4 | import sys
  5 | 
  6 | import numpy as np
  7 | 
  8 | from seizure_prediction.classifiers import make_svm, make_lr, make_simple_lr
  9 | from seizure_prediction.cross_validation.kfold_strategy import KFoldStrategy
 10 | from seizure_prediction.cross_validation.legacy_strategy import LegacyStrategy
 11 | from seizure_prediction.pipeline import Pipeline, FeatureConcatPipeline, InputSource
 12 | from seizure_prediction.scores import get_score_summary, print_results
 13 | from seizure_prediction.tasks import make_submission_csv, cross_validation_score, \
 14 |     write_submission_file, check_training_data_loaded, check_test_data_loaded
 15 | from seizure_prediction.transforms import FFT, Magnitude, Log10, Windower, \
 16 |     Correlation, FreqCorrelation, FlattenChannels, \
 17 |     Hurst, PFD, PIBSpectralEntropy, FreqBinning, HFD, Preprocess
 18 | from seizure_prediction.settings import load_settings
 19 | from seizure_prediction.fft_bins import *
 20 | 
 21 | 
 22 | # cross_validation_strategy = KFoldStrategy()
 23 | cross_validation_strategy = LegacyStrategy()
 24 | 
 25 | 
 26 | def run_prepare_data_for_cross_validation(settings, targets, pipelines, quiet=False):
 27 |     if not quiet: print '\n'.join([p.get_name() for p in pipelines])
 28 |     for i, pipeline in enumerate(pipelines):
 29 |         for j, target in enumerate(targets):
 30 |             if not quiet: print 'Running prepare data', 'P=%d/%d T=%d/%d' % (i+1, len(pipelines), j+1, len(targets))
 31 |             check_training_data_loaded(settings, target, pipeline)
 32 | 
 33 | 
 34 | def run_prepare_data_for_submission(settings, targets, pipelines):
 35 |     for pipeline in pipelines:
 36 |         for target in targets:
 37 |             print 'Running %s pipeline %s' % (target, pipeline.get_name())
 38 |             check_training_data_loaded(settings, target, pipeline)
 39 |             check_test_data_loaded(settings, target, pipeline)
 40 | 
 41 | 
 42 | def run_cross_validation(settings, targets, classifiers, pipelines):
 43 |     print 'Cross-validation task'
 44 |     print 'Targets', ', '.join(targets)
 45 |     print 'Pipelines:\n ', '\n  '.join([p.get_name() for p in pipelines])
 46 |     print 'Classifiers', ', '.join([c[1] for c in classifiers])
 47 | 
 48 |     run_prepare_data_for_cross_validation(settings, targets, pipelines)
 49 | 
 50 |     # run on pool first, then show results after
 51 |     pool = Pool(settings.N_jobs)
 52 |     for i, pipeline in enumerate(pipelines):
 53 |         for j, (classifier, classifier_name) in enumerate(classifiers):
 54 |             for k, target in enumerate(targets):
 55 |                 progress_str = 'P=%d/%d C=%d/%d T=%d/%d' % (i+1, len(pipelines), j+1, len(classifiers), k+1, len(targets))
 56 |                 cross_validation_score(settings, target, pipeline, classifier, classifier_name,
 57 |                     strategy=cross_validation_strategy, pool=pool, progress_str=progress_str, return_data=False, quiet=True)
 58 |     pool.close()
 59 |     pool.join()
 60 | 
 61 |     summaries = []
 62 |     best = {}
 63 |     for p_num, pipeline in enumerate(pipelines):
 64 |         for c_num, (classifier, classifier_name) in enumerate(classifiers):
 65 |             mean_scores = []
 66 |             median_scores = []
 67 |             datas = []
 68 |             for target in targets:
 69 |                 print 'Running %s pipeline %s classifier %s' % (target, pipeline.get_name(), classifier_name)
 70 |                 data = cross_validation_score(settings, target, pipeline, classifier, classifier_name,
 71 |                     strategy=cross_validation_strategy, quiet=True)
 72 |                 datas.append(data)
 73 |                 if data.mean_score != data.median_score:
 74 |                     print '%.3f (mean)' % data.mean_score, data.mean_scores
 75 |                     print '%.3f (median)' % data.median_score, data.median_scores
 76 |                 else:
 77 |                     print '%.3f' % data.mean_score
 78 |                 mean_scores.append(data.mean_score)
 79 |                 median_scores.append(data.median_score)
 80 | 
 81 |                 best_score = best.get(target, [0, None, None, None])[0]
 82 |                 cur_score = max(data.mean_score, data.median_score)
 83 |                 if cur_score > best_score:
 84 |                     best[target] = [cur_score, pipeline, classifier, classifier_name]
 85 | 
 86 |             name = 'p=%d c=%d %s mean %s' % (p_num, c_num, classifier_name, pipeline.get_name())
 87 |             summary = get_score_summary(name, mean_scores)
 88 |             summaries.append((summary, np.mean(mean_scores)))
 89 |             print summary
 90 |             name = 'p=%d c=%d %s median %s' % (p_num, c_num, classifier_name, pipeline.get_name())
 91 |             summary = get_score_summary(name, median_scores)
 92 |             summaries.append((summary, np.mean(median_scores)))
 93 |             print summary
 94 | 
 95 |     print_results(summaries)
 96 | 
 97 |     print '\nbest'
 98 |     for target in targets:
 99 |         pipeline = best[target][1]
100 |         classifier_name = best[target][3]
101 |         print target, best[target][0], classifier_name, pipeline.get_names()
102 | 
103 | 
104 | def run_make_submission(settings, targets, classifiers, pipelines):
105 |     print 'Submissions task'
106 |     print 'Targets', ', '.join(targets)
107 |     print 'Pipelines', ', '.join([p.get_name() for p in pipelines])
108 |     print 'Classifiers', ', '.join([c[1] for c in classifiers])
109 | 
110 |     run_prepare_data_for_submission(settings, targets, pipelines)
111 | 
112 |     pool = Pool(settings.N_jobs)
113 |     for pipeline in pipelines:
114 |         for classifier, classifier_name in classifiers:
115 |             for target in targets:
116 |                 pool.apply_async(make_submission_csv, [settings, target, pipeline, classifier, classifier_name])
117 |     pool.close()
118 |     pool.join()
119 | 
120 |     use_median_submissions = False
121 | 
122 |     for pipeline in pipelines:
123 |         for classifier, classifier_name in classifiers:
124 |             guesses_mean = ['clip,preictal']
125 |             guesses_median = ['clip,preictal']
126 |             for target in targets:
127 |                 print 'Target %s pipeline %s classifier %s' % (target, pipeline.get_name(), classifier_name)
128 |                 predictions_mean, predictions_median = make_submission_csv(settings, target, pipeline, classifier, classifier_name)
129 |                 guesses_mean += predictions_mean
130 |                 guesses_median += predictions_median
131 | 
132 |             mean_output = '\n'.join(guesses_mean)
133 |             median_output = '\n'.join(guesses_median)
134 | 
135 |             out = []
136 |             if use_median_submissions and mean_output != median_output:
137 |                 out.append((mean_output, 'mean'))
138 |                 out.append((median_output, 'median'))
139 |             else:
140 |                 out.append((mean_output, None))
141 | 
142 |             for guesses, name in out:
143 |                 write_submission_file(settings, guesses, name, pipeline, classifier_name)
144 | 
145 | 
146 | 
147 | def main():
148 | 
149 |     settings = load_settings()
150 | 
151 |     targets = [
152 |         'Dog_1',
153 |         'Dog_2',
154 |         'Dog_3',
155 |         'Dog_4',
156 |         'Dog_5',
157 |         'Patient_1',
158 |         'Patient_2'
159 |     ]
160 | 
161 |     pipelines = [
162 |         FeatureConcatPipeline(
163 |             Pipeline(InputSource(), Preprocess(), Windower(75), Correlation('none')),
164 |             Pipeline(InputSource(), Preprocess(), Windower(75), FreqCorrelation(1, None, 'none')),
165 | 
166 |             Pipeline(InputSource(Preprocess(), Windower(75), FFT(), Magnitude()), FreqBinning(winning_bins, 'mean'), Log10(), FlattenChannels()),
167 |             Pipeline(InputSource(Preprocess(), Windower(75), FFT(), Magnitude()), PIBSpectralEntropy([0.25, 1, 1.75, 2.5, 3.25, 4, 5, 8.5, 12, 15.5, 19.5, 24])),
168 |             Pipeline(InputSource(Preprocess(), Windower(75), FFT(), Magnitude()), PIBSpectralEntropy([0.25, 2, 3.5, 6, 15, 24])),
169 |             Pipeline(InputSource(Preprocess(), Windower(75), FFT(), Magnitude()), PIBSpectralEntropy([0.25, 2, 3.5, 6, 15])),
170 |             Pipeline(InputSource(Preprocess(), Windower(75), FFT(), Magnitude()), PIBSpectralEntropy([0.25, 2, 3.5])),
171 |             Pipeline(InputSource(Preprocess(), Windower(75), FFT(), Magnitude()), PIBSpectralEntropy([6, 15, 24])),
172 |             Pipeline(InputSource(Preprocess(), Windower(75), FFT(), Magnitude()), PIBSpectralEntropy([2, 3.5, 6])),
173 |             Pipeline(InputSource(Preprocess(), Windower(75), FFT(), Magnitude()), PIBSpectralEntropy([3.5, 6, 15])),
174 | 
175 |             Pipeline(InputSource(), Preprocess(), Windower(75), HFD(2)),
176 |             Pipeline(InputSource(), Preprocess(), Windower(75), PFD()),
177 |             Pipeline(InputSource(), Preprocess(), Windower(75), Hurst()),
178 |         ),
179 |     ]
180 | 
181 |     classifiers = [
182 |         make_svm(gamma=0.0079, C=2.7),
183 |         make_svm(gamma=0.0068, C=2.0),
184 |         make_svm(gamma=0.003, C=150.0),
185 |         make_lr(C=0.04),
186 |         make_simple_lr(),
187 |     ]
188 | 
189 |     submission_pipelines = [
190 |         FeatureConcatPipeline(
191 |             Pipeline(InputSource(), Preprocess(), Windower(75), Correlation('none')),
192 |             Pipeline(InputSource(), Preprocess(), Windower(75), FreqCorrelation(1, None, 'none')),
193 |             Pipeline(InputSource(Preprocess(), Windower(75), FFT(), Magnitude()), FreqBinning(winning_bins, 'mean'), Log10(), FlattenChannels()),
194 |             Pipeline(InputSource(Preprocess(), Windower(75), FFT(), Magnitude()), PIBSpectralEntropy([0.25, 1, 1.75, 2.5, 3.25, 4, 5, 8.5, 12, 15.5, 19.5, 24])),
195 |             Pipeline(InputSource(Preprocess(), Windower(75), FFT(), Magnitude()), PIBSpectralEntropy([0.25, 2, 3.5, 6, 15, 24])),
196 |             Pipeline(InputSource(Preprocess(), Windower(75), FFT(), Magnitude()), PIBSpectralEntropy([0.25, 2, 3.5, 6, 15])),
197 |             Pipeline(InputSource(Preprocess(), Windower(75), FFT(), Magnitude()), PIBSpectralEntropy([0.25, 2, 3.5])),
198 |             Pipeline(InputSource(Preprocess(), Windower(75), FFT(), Magnitude()), PIBSpectralEntropy([6, 15, 24])),
199 |             Pipeline(InputSource(Preprocess(), Windower(75), FFT(), Magnitude()), PIBSpectralEntropy([2, 3.5, 6])),
200 |             Pipeline(InputSource(Preprocess(), Windower(75), FFT(), Magnitude()), PIBSpectralEntropy([3.5, 6, 15])),
201 |             Pipeline(InputSource(), Preprocess(), Windower(75), HFD(2)),
202 |             Pipeline(InputSource(), Preprocess(), Windower(75), PFD()),
203 |             Pipeline(InputSource(), Preprocess(), Windower(75), Hurst()),
204 |         ),
205 |     ]
206 | 
207 |     submission_classifiers = [
208 |         make_simple_lr(),
209 |     ]
210 | 
211 |     if len(sys.argv) >= 2 and sys.argv[1] == 'submission':
212 |         run_make_submission(settings, targets, submission_classifiers, submission_pipelines)
213 |     else:
214 |         run_cross_validation(settings, targets, classifiers, pipelines)
215 | 
216 | 
217 | if __name__ == "__main__":
218 |     main()
219 | 
220 | 


--------------------------------------------------------------------------------
/seizure_prediction/data.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | from common.data import jsdict
  3 | from common.time import Timer
  4 | import os.path
  5 | from multiprocessing import Pool
  6 | import h5py
  7 | import sys
  8 | import re
  9 | import glob
 10 | 
 11 | 
 12 | def read_hdf5_segment(file, key, start=None, end=None):
 13 |     dset = file[key]
 14 |     meta = {}
 15 |     for key, value in dset.attrs.iteritems():
 16 |         meta[key] = value
 17 | 
 18 |     if start is None and end is None:
 19 |         X = dset[:]
 20 |     else:
 21 |         if start >= dset.shape[0]:
 22 |             return None
 23 |         if (start + 1 == end):
 24 |             X = dset[start]
 25 |         else:
 26 |             X = dset[start:end]
 27 | 
 28 |     return X, meta
 29 | 
 30 | 
 31 | def write_hdf5_segment(file, key, data, meta=None):
 32 |     dset = file.create_dataset(key, data=data)
 33 | 
 34 |     if meta is not None:
 35 |         for key, value in meta.iteritems():
 36 |             dset.attrs[key] = value
 37 |             # print key, value
 38 | 
 39 | 
 40 | # NOTE(mike): just doing np.array(list_of_numpy_arrays) seems really slow,
 41 | # This seems to be a bit faster. However I really need to do some benchmarking
 42 | # to determine what is the fastest method.
 43 | def to_np_array(X):
 44 |     if isinstance(X[0], np.ndarray):
 45 |         # return np.vstack(X)
 46 |         out = np.empty([len(X)] + list(X[0].shape), dtype=X[0].dtype)
 47 |         for i, x in enumerate(X):
 48 |             out[i] = x
 49 |         return out
 50 | 
 51 |     return np.array(X)
 52 | 
 53 | # The worker method for a process to work on it's subset of the data. It will push
 54 | # the data through the pipeline working on 1 segment at a time. Segments are pulled
 55 | # in 1 at a time to keep working-set of memory to a minimum.
 56 | def process_data_sub_job(filename_in, filename_out_fmt, id, num_jobs, process_data_fn):
 57 |     if not os.path.exists(filename_in):
 58 |         return 0
 59 | 
 60 |     pid = os.getpid()
 61 | 
 62 |     num_processed = 0
 63 |     for i in xrange(id, sys.maxint, num_jobs):
 64 | 
 65 |         filename_out = filename_out_fmt % i if filename_out_fmt is not None else None
 66 |         # Use temp filename then rename the completed file to the proper name.
 67 |         # This is more or less an atomic update. Cancelling the program should
 68 |         # never leave data in a half-written state. Hence only the tempfile
 69 |         # will be in a half-written state and the pid determines when the process
 70 |         # is still alive and still processing the data. An inactive pid means the
 71 |         # tempfile is trash and can be deleted.
 72 |         filename_out_temp = '%s.pid.%d.tmp' % (filename_out, pid) if filename_out is not None else None
 73 | 
 74 |         if filename_out is not None and os.path.exists(filename_out):
 75 |             num_processed += 1
 76 |             continue
 77 | 
 78 |         with h5py.File(filename_in, 'r') as f:
 79 |             segment = read_hdf5_segment(f, 'X', start=i, end=i+1)
 80 |             if segment is None:
 81 |                 break
 82 |             X, meta = segment
 83 | 
 84 |         data_obj = {}
 85 |         for k, v in meta.iteritems():
 86 |             data_obj[k] = v
 87 | 
 88 |         # save disk space
 89 |         if X.dtype != np.float32:
 90 |             X = X.astype(np.float32)
 91 | 
 92 |         X = process_data_fn(X, jsdict(data_obj))
 93 | 
 94 |         if filename_out is not None:
 95 |             with h5py.File(filename_out_temp, 'w', libver='latest') as f:
 96 |                 if X.dtype != np.float32:
 97 |                     X = X.astype(np.float32)
 98 |                 write_hdf5_segment(f, 'X', X)
 99 | 
100 |             os.rename(filename_out_temp, filename_out)
101 | 
102 |         num_processed += 1
103 | 
104 |     return num_processed
105 | 
106 | # filenames for single accumulated file
107 | def single_filename_builder(target, data_type, dir, tag=None):
108 |     if tag is not None:
109 |         filename = '%s_%s_%s.hdf5' % (target, data_type, tag)
110 |     else:
111 |         filename = '%s_%s.hdf5' % (target, data_type)
112 | 
113 |     return os.path.join(dir, filename)
114 | 
115 | 
116 | # filenames for individual segments before they get accumulated into one big file
117 | def segment_filename_builder(target, data_type, dir, tag=None):
118 |     if tag is not None:
119 |         filename = '%s_%s_%s_segment_%%d.hdf5' % (target, data_type, tag)
120 |     else:
121 |         filename = '%s_%s_segment_%%d.hdf5' % (target, data_type)
122 | 
123 |     return os.path.join(dir, filename)
124 | 
125 | # glue code around process_data_sub_job to setup input/output destinations and the
126 | # processing method (applying pipeline on input data)
127 | def process_data_job(settings, target, data_type, id, num_jobs, pipeline):
128 | 
129 |     def process(data, meta):
130 |         out = pipeline.apply(data, meta)
131 |         return out
132 | 
133 |     input_source = pipeline.get_input_source()
134 |     input_source_pipeline = input_source.get_pipeline()
135 |     input_tag = input_source_pipeline.get_name() if input_source_pipeline is not None else None
136 |     input_data_dir = settings.data_dir if input_tag is None else settings.cache_dir
137 |     filename_in = single_filename_builder(target, data_type, input_data_dir, input_tag)
138 |     filename_out_fmt = segment_filename_builder(target, data_type, settings.cache_dir, pipeline.get_name())
139 |     return process_data_sub_job(filename_in, filename_out_fmt, id, num_jobs, process_data_fn=process)
140 | 
141 | # Accumulates N segments into a single file as it is faster to load data this way.
142 | def accumulate_data(settings, target, data_type, tag, output_to_original_data_dir=False, quiet=False, meta_only=False):
143 |     output_dir = settings.data_dir if output_to_original_data_dir else settings.cache_dir
144 |     filename_out = single_filename_builder(target, data_type, output_dir, tag)
145 |     orig_filename_in = single_filename_builder(target, data_type, settings.data_dir)
146 | 
147 |     def collect_meta(filename):
148 |         meta = {}
149 |         with h5py.File(filename, 'r') as f:
150 |             meta['num_segments'] = f['X'].shape[0]
151 |             if 'sequence' in f.keys():
152 |                 meta['sequence'] = f['sequence'][:]
153 |             for k, v in f['X'].attrs.iteritems():
154 |                 meta[k] = v
155 |         return meta
156 | 
157 |     # load already processed output file
158 |     if os.path.exists(filename_out):
159 |         # pull meta off original data
160 |         meta = collect_meta(orig_filename_in)
161 | 
162 |         # pull X data off processed data
163 |         with h5py.File(filename_out, 'r') as f:
164 |             meta['X_shape'] = f['X'].shape
165 |             X = f['X'][:] if not meta_only else None
166 |             if not quiet: print 'from cache ...',
167 |             return X, jsdict(meta)
168 |     else:
169 |         # get ready to process all segments into 1 file, starting with getting the meta-data ready
170 |         if not quiet: print 'processing ...',
171 |         pid = os.getpid()
172 |         filename_in_fmt = segment_filename_builder(target, data_type, output_dir, tag)
173 | 
174 |         orig_filename_in = single_filename_builder(target, data_type, settings.data_dir)
175 | 
176 |         # meta-data is collected differently when doing the first data conversion from mat to hdf5
177 |         if output_to_original_data_dir:
178 |             print 'Collecting metadata...'
179 |             # Creating original files... pull metadata off first one, and also collect sequences
180 |             meta = None
181 |             sequence = []
182 |             num_segments = 0
183 |             for i in xrange(0, sys.maxint, 1):
184 |                 filename = filename_in_fmt % i
185 |                 if not os.path.exists(filename):
186 |                     if num_segments == 0:
187 |                         print 'Could not find file ', filename
188 |                         sys.exit(1)
189 |                     break
190 | 
191 |                 with h5py.File(filename, 'r') as f_in:
192 |                     meta_attrs = f_in['__metadata'].attrs
193 |                     if 'sequence' in meta_attrs:
194 |                         sequence.append(meta_attrs['sequence'])
195 | 
196 |                     if meta is None:
197 |                         meta = {}
198 |                         meta['channels'] = f_in['channels'][:]
199 |                         for key in meta_attrs.keys():
200 |                             if key != 'sequence':
201 |                                 meta[key] = meta_attrs[key]
202 |                 num_segments += 1
203 | 
204 |             if len(sequence) > 0:
205 |                 meta['sequence'] = sequence
206 | 
207 |             meta['num_segments'] = num_segments
208 | 
209 |             print 'Accumulating segments...'
210 |         else:
211 |             # pull metadata off the original data files
212 |             meta = collect_meta(orig_filename_in)
213 | 
214 |         # now accumulate X data to a single file
215 |         num_segments = meta['num_segments']
216 |         filename_out_temp = '%s.pid.%d.tmp' % (filename_out, pid) if filename_out is not None else None
217 |         with h5py.File(filename_out_temp, 'w-', libver='latest') as f_out:
218 |             X_out = None
219 |             for i in xrange(num_segments):
220 |                 with h5py.File(filename_in_fmt % i, 'r') as f_in:
221 |                     X_in = f_in['X']
222 |                     # init X_out
223 |                     if X_out is None:
224 |                         X_out = f_out.create_dataset('X', shape=[num_segments] + list(X_in.shape), dtype=X_in.dtype)
225 |                         meta['X_shape'] = X_out.shape
226 |                         for k, v in meta.iteritems():
227 |                             X_out.attrs[k] = v
228 | 
229 |                     X_out[i] = X_in[:]
230 |             X = X_out[:]
231 | 
232 |         # finalize
233 |         os.rename(filename_out_temp, filename_out)
234 |         # clean up
235 |         for i in xrange(num_segments):
236 |             try:
237 |                 os.remove(filename_in_fmt % i)
238 |             except:
239 |                 pass
240 | 
241 |         return X, jsdict(meta)
242 | 
243 | 
244 | # helper to check whether data exists in the data cache
245 | def data_exists(settings, target, data_type, pipeline):
246 |     filename_out = single_filename_builder(target, data_type, settings.cache_dir, pipeline.get_name())
247 |     return os.path.exists(filename_out)
248 | 
249 | 
250 | # Multi-process data loading, data segments are processed through the given pipeline, then are accumulated
251 | # to a single file.
252 | #
253 | # check_only: returns True if data exists else false
254 | # quiet: suppress prints if True
255 | # meta_only: Actual X data is not fetched if meta_only is True, useful for light-weight data-loading
256 | #            to check number of training samples or number of features.
257 | def load_data_mp(settings, target, data_type, pipeline, check_only=False, quiet=False, meta_only=False):
258 |     filename_out = single_filename_builder(target, data_type, settings.cache_dir, pipeline.get_name())
259 |     filename_out_exists = os.path.exists(filename_out)
260 |     if check_only:
261 |         return filename_out_exists
262 | 
263 |     input_source = pipeline.get_input_source()
264 |     input_source_pipeline = input_source.get_pipeline()
265 |     if input_source_pipeline is not None:
266 |         if not load_data_mp(settings, target, data_type, input_source_pipeline, check_only=True, quiet=quiet, meta_only=meta_only):
267 |             if not quiet: print 'Preparing input source', input_source_pipeline.get_name()
268 |             load_data_mp(settings, target, data_type, input_source_pipeline, check_only=False, quiet=quiet, meta_only=meta_only)
269 |             if not quiet: print 'Input source ready'
270 | 
271 | 
272 |     if not quiet: print 'Loading %s data ...' % data_type,
273 |     timer = Timer()
274 | 
275 |     # TODO(mike): re-implement tmpfile cleanup that isn't really slow in the face of the genetic algorithm
276 |     # spamming the disk with cross-validation score files.
277 | 
278 |     # clear cache of tmp files
279 |     # regex = re.compile(r""".*\.pid\.(\d+)""")
280 |     # for file in glob.glob(os.path.join(settings.cache_dir, '*.tmp')):
281 |     #     match = regex.match(file)
282 |     #     assert match is not None
283 |     #     pid = int(match.group(1))
284 |     #     try:
285 |     #         os.getpgid(pid)
286 |     #     except:
287 |     #         print 'Removing', file
288 |     #         os.remove(file)
289 | 
290 |     if not filename_out_exists:
291 |         # DEBUG
292 |         debug = False
293 |         # debug = True
294 |         if debug:
295 |             print 'DEBUG'
296 |             process_data_job(settings, target, data_type, 0, 1, pipeline)
297 |             print 'Done'
298 |         else:
299 |             pool = Pool(settings.N_jobs)
300 |             [pool.apply_async(process_data_job, [settings, target, data_type, i, settings.N_jobs, pipeline]) for i in range(settings.N_jobs)]
301 |             pool.close()
302 |             pool.join()
303 | 
304 |     accum, accum_meta = accumulate_data(settings, target, data_type, pipeline.get_name(), quiet=quiet, meta_only=meta_only)
305 | 
306 |     if not quiet: print 'prepared %d segments in %s %s %s' % (accum_meta.num_segments, timer.pretty_str(), accum_meta.X_shape, pipeline.get_name())
307 | 
308 |     return accum, accum_meta
309 | 
310 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Seizure Prediction
  2 | 
  3 | This repository contains the code I used for the American Epilepsy Society Seizure's
  4 | Prediction Challenge on Kaggle.
  5 | 
  6 | http://www.kaggle.com/c/seizure-prediction
  7 | 
  8 | As a side note this won't generate my exact submission as the randomness was affected
  9 | after cleaning up the code. It doesn't score as well which demonstrates the fragility
 10 | of my approach. I have also included the linear regression approach as used by
 11 | Jonathan Tapson. It makes my genetic algorithm and random feature mask ensembling a
 12 | little redundant, hence I use his approach in `main.py`, but demonstrate my own approaches
 13 | in `genetic.py` and `ensemble.py`
 14 | 
 15 | I discuss further down my genetic algorithm approach and the features I used. Taking a
 16 | look at the code might also yield more insights.
 17 | 
 18 | You probably need 100-150GB free disk space to run this code.
 19 | 
 20 | ###Hardware / OS platform used
 21 | 
 22 |  * 15" Retina MacBook Pro (Late 2013) 2.7GHz Core i7, 16GB RAM
 23 |  * OS X Mavericks
 24 |  * 512GB SSD
 25 | 
 26 | ###Dependencies
 27 | 
 28 | ####Required
 29 | 
 30 |  * Python 2.7 (I used built-in OS X Python 2.7.6)
 31 |  * scikit\_learn-0.15.2
 32 |  * numpy-1.9.0
 33 |  * pandas-0.14.1
 34 |  * scipy-0.14.0
 35 |  * h5py-2.3.1
 36 |  * hdf5 (see http://www.hdfgroup.org/HDF5)
 37 |  * deap-1.0
 38 | 
 39 | ####Optional (to try out various data transforms)
 40 | 
 41 |  * spectrum (for auto-regressive model)
 42 | 
 43 | ### SETTINGS.json
 44 | 
 45 | ```
 46 | {
 47 |   "competition-data-dir": "data",
 48 |   "data-cache-dir": "data-cache",
 49 |   "submission-dir": "submissions",
 50 |   "num-jobs": "auto"
 51 | }
 52 | ```
 53 | 
 54 |  * `competition-data-dir`: directory containing the downloaded competition data
 55 |  * `data-cache-dir`: directory the task framework will store cached data
 56 |  * `submission-dir`: directory submissions are written to
 57 |  * `num-jobs`: "auto" or integer specifying number of processes to use in multiprocessing Pool
 58 | 
 59 | ### Getting started
 60 | 
 61 | #### Preprocess data into hdf5 format
 62 | 
 63 | First place the competition data under ./data/ (or as specified in SETTINGS.json)
 64 | 
 65 | ```
 66 | data/Dog_1/Dog_1_preictal_segment_0001.mat
 67 | data/Dog_1/Dog_1_preictal_segment_0002.mat
 68 | ...
 69 | 
 70 | ```
 71 | 
 72 | Then run the `mat_to_hdf5.py` script.
 73 | 
 74 | ```
 75 | $ ./mat_to_hdf5.py
 76 | Loading data ...
 77 | Processing data/Dog_1_preictal.hdf5 ...
 78 | Runner 0 processing data/Dog_1/Dog_1_preictal_segment_0001.mat
 79 | Runner 1 processing data/Dog_1/Dog_1_preictal_segment_0002.mat
 80 | Runner 2 processing data/Dog_1/Dog_1_preictal_segment_0003.mat
 81 | Runner 3 processing data/Dog_1/Dog_1_preictal_segment_0004.mat
 82 | Runner 4 processing data/Dog_1/Dog_1_preictal_segment_0005.mat
 83 | Runner 5 processing data/Dog_1/Dog_1_preictal_segment_0006.mat
 84 | Runner 6 processing data/Dog_1/Dog_1_preictal_segment_0007.mat
 85 | Runner 7 processing data/Dog_1/Dog_1_preictal_segment_0008.mat
 86 | ...
 87 | ```
 88 | 
 89 | This took ~38 minutes to run on my machine to process all the patients. After this is done you
 90 | can feel free to delete the original matlab files as my code generates hdf5 files to replace them.
 91 | 
 92 | All patients have their signals decimated down to 200Hz to save disk space and improve processing times.
 93 | 
 94 | #### Run cross-validation with full-features
 95 | ```
 96 | ./main.py
 97 | ```
 98 | 
 99 | #### Make a submission
100 | ```
101 | ./main.py submission
102 | ```
103 | 
104 | This takes ~30 minutes on my machine with an empty data-cache.
105 | 
106 | ### Three build variants (main/ensemble/genetic)
107 | 
108 | ### main.py
109 | 
110 | This file contains the initial standard setup training per-patient models and not doing any
111 | sub-feature selection. The default selected classifier for submission is linear regression.
112 | A list of classifiers are used in cross-validation to compare scores.
113 | 
114 | ```
115 | ./main.py
116 | ./main.py submission
117 | ```
118 | 
119 | ### ensemble.py
120 | 
121 | This file contains the ensemble variant, generating N random feature masks, training N models
122 | per-patient, and then averaging those N models predictions. I did not find the cross-validation
123 | to be of much use but I left it in anyway. For submission this lead to better scores than
124 | `main.py` when using SVC with specific parameters `gamma=0.0079` and `C=2.7`. For these
125 | parameters `main.py` would achieve around 0.796 on public LB, and this ensembling approach
126 | would achieve around 0.829. I later learned that using different parameters `gamma=0.003` and
127 | `C=150.0` I could achieve similar scores around 0.829 without any ensembling.
128 | 
129 | I mostly used N=10 masks.
130 | 
131 | ```
132 | ./ensemble.py
133 | ./ensemble.py submission
134 | ```
135 | 
136 | ### genetic.py
137 | 
138 | This file contains my genetic algorithm approach. This is what I used for my 5th place submission.
139 | However the code as it is right now will not generate my exact submission as I renamed some of the
140 | transforms which changed some orderings and randomness which led to different CV results and
141 | ultimately different selected feature masks. It doesn't score too far off though.
142 | 
143 | The genetic algorithm starts with population size of 30 and runs for 10 generations. The population
144 | is initialised with random feature masks consisting of roughly 55% features activated and the other
145 | 45% masked away. The fitness function is simply CV ROC AUC score.
146 | 
147 | This is quite slow, taking on the order of 1-2 hours to run. I also ran 3 sets of genetic algorithm,
148 | each using a different subset of the features. I believe this to more or less just be myself optimising
149 | random chance against the public LB.
150 | 
151 | Other than the 3 feature groups, other features which appeared to not benefit from the genetic algorithm
152 | instead used random feature masks. Two masks were used for each feature group, 2 of the best masks
153 | for each of the GA groups and 2 random masks for the random groups. Again optimising against the
154 | leaderboard, a 52.5% active features ratio was used for the random feature masks.
155 | 
156 | To be honest this is all a bit of voodoo, and using the linear regression approach more or less makes
157 | all of this a waste of time. Later testing showed that only Dog\_3 and Dog\_4 really benefited from
158 | the sub-feature mask ensembling. Dog\_1 showed little change, Dog\_2 a very minor improvement, I
159 | didn't test Dog\_5. Patient\_1 and Patient\_2 actually always performed worse when using sub-feature
160 | masks whether genetic or random. There was correlation in training sample size and having a benefit
161 | from feature masks, so I didn't use feature masks when the number of training samples was less than
162 | 500 (excludes Patient 1 and 2). More testing needs to be done to actually verify that's the right
163 | thing to do.
164 | 
165 | ```
166 | ./genetic.py
167 | ./genetic.py submission
168 | ```
169 | 
170 | ### Features used
171 | 
172 |  * Time correlation matrix upper right triangle and sorted eigenvalues
173 |  * Frequency correlation matrix upper right triangle and sorted eigenvalues (omits 0Hz bucket)
174 |  * FFT Magnitude Log10 buckets for various ranges (see code below), where the power-in-band is calculated between the specified frequencies. The power-in-band is actually the average and not the sum. I saw minor boosts to perform Log10 after calculating power-in-band.
175 |  * Power-in-band spectral entropies
176 |  * Higuchi fractal dimension with kmax=2
177 |  * Petrosian fractal dimension
178 |  * Hurst exponent
179 | 
180 | Code doc in `seizure_prediction/transforms.py` contains more information.
181 | 
182 | In the code all these features are specified and joined together like so:
183 | ```
184 | FeatureConcatPipeline(
185 |     Pipeline(InputSource(), Preprocess(), Windower(75), Correlation('none')),
186 |     Pipeline(InputSource(), Preprocess(), Windower(75), FreqCorrelation(1, None, 'none')),
187 |     Pipeline(InputSource(Preprocess(), Windower(75), FFT(), Magnitude()), FreqBinning([0.5, 2.25, 4, 5.5, 7, 9.5, 12, 21, 30, 39, 48], 'mean'), Log10(), FlattenChannels()),
188 |     Pipeline(InputSource(Preprocess(), Windower(75), FFT(), Magnitude()), PIBSpectralEntropy([0.25, 1, 1.75, 2.5, 3.25, 4, 5, 8.5, 12, 15.5, 19.5, 24])),
189 |     Pipeline(InputSource(Preprocess(), Windower(75), FFT(), Magnitude()), PIBSpectralEntropy([0.25, 2, 3.5, 6, 15, 24])),
190 |     Pipeline(InputSource(Preprocess(), Windower(75), FFT(), Magnitude()), PIBSpectralEntropy([0.25, 2, 3.5, 6, 15])),
191 |     Pipeline(InputSource(Preprocess(), Windower(75), FFT(), Magnitude()), PIBSpectralEntropy([0.25, 2, 3.5])),
192 |     Pipeline(InputSource(Preprocess(), Windower(75), FFT(), Magnitude()), PIBSpectralEntropy([6, 15, 24])),
193 |     Pipeline(InputSource(Preprocess(), Windower(75), FFT(), Magnitude()), PIBSpectralEntropy([2, 3.5, 6])),
194 |     Pipeline(InputSource(Preprocess(), Windower(75), FFT(), Magnitude()), PIBSpectralEntropy([3.5, 6, 15])),
195 |     Pipeline(InputSource(), Preprocess(), Windower(75), HFD(2)),
196 |     Pipeline(InputSource(), Preprocess(), Windower(75), PFD()),
197 |     Pipeline(InputSource(), Preprocess(), Windower(75), Hurst()),
198 | )
199 | 
200 | ```
201 | 
202 | ### Pipelines and data transformations
203 | 
204 | #### Pipeline
205 | 
206 | A `Pipeline` is a series of transforms to be applied to the source data. All the transforms I've implemented
207 | can be found under `seizure_prediction/transforms.py`
208 | 
209 | Once data has been passed through the pipeline, the output is saved in the `data-cache` directory and
210 | can be reloaded almost instantly next time (a few millseconds on my machine).
211 | 
212 | ```
213 | Pipeline(Windower(75), Correlation())
214 | ```
215 | 
216 | One particularly useful pipeline is the FFT magnitude. It is generally the first step of many spectral
217 | transforms such as just raw magnitudes or spectral entropy. Recalculating the FFT for all of these
218 | pipelines over and over again is slow and wasteful. Which leads me to...
219 | 
220 | #### InputSource
221 | 
222 | It's much faster to load up previously processed data and reuse it than to compute it every time.
223 | The `InputSource()` class lets you specify where you want the data to be loaded from. No argument
224 | means the original time-series data. If you specify a pipeline, it will load it from there instead.
225 | If you look up a bit in the features section you can see the InputSource being used to load
226 | previously-computed FFT data.
227 | 
228 | I haven't found another use for this yet other than the FFT data, but it was worth it alone for that.
229 | The only time I don't use it for FFT data is for frequency correlation. I store everything in the data
230 | cache as float32, and this seems to cause issues with the `Correlation` transformation having more
231 | issues with NaNs etc. So for now `FreqCorrelation` does duplicate FFT work.
232 | 
233 | Replacing:
234 | 
235 | ```
236 | Pipeline(InputSource(Preprocess(), Windower(75), FFT(), Magnitude()), Slice(1, None), Correlation('none')),
237 | ```
238 | with
239 | ```
240 | Pipeline(InputSource(), Preprocess(), Windower(75), FreqCorrelation(1, None, 'none')),
241 | ```
242 | 
243 | is low-hanging fruit. It just needs to be verified that the classification performance is not worse.
244 | I was lazy in replacing it as I had already computed these transforms weeks earlier so it didn't
245 | bother me too much. It does however slow down from-scratch data processing which needs to do the
246 | extra work, such as when you clone this repo or if you clear the data cache to free up some disk space.
247 | 
248 | More examples:
249 | ```
250 | InputSource()
251 | InputSource(Preprocess(), FFT(), Magnitude())
252 | InputSource(Preprocess(), Windower(75), FFT(), Magnitude())
253 | ```
254 | 
255 | Also note that this can chew up a lot of disk space for caching these results.
256 | 
257 | #### FeatureConcatPipeline
258 | 
259 | It's nice and clean to specify individual transforms and pipelines. However it's very practical to combine features. The `FeatureConcatPipeline` does exactly this. It will load each pipeline individually, then concatenate all the features together.
260 | 
261 | ```
262 | FeatureConcatPipeline(
263 |     Pipeline(Windower(75), Correlation()),
264 |     Pipeline(Windower(75), Hurst())
265 | )
266 | ```
267 | 
268 | #### Safe to kill whenever you like
269 | 
270 | You can kill the program without fear of losing much progress. A unit of work for the data processing is a single segment (equivalent to one of the original matlab file segments) and a unit of work for the cross-validation is one fold. Results are saved to the data cache and things can pick up where they left off last time automatically.
271 | 
272 | There is one caveat however, there's a bug with Python multiprocessing pools and KeyboardInterrupt. I run my code from IntelliJ 14 Ultimate so I don't have a problem, but if you Ctrl-C from the commandline the pool doesn't exit properly so killing from the commandline is a bit of a pain and I have just been using `killall Python` for the time being to get around it. Not ideal, but not generally an issue for me given I use IntelliJ.
273 | 
274 | ### Cross-validation strategies
275 | 
276 | I have implemented two cross-validation strategies, both based on using folds.
277 | 
278 | #### LegacyStrategy
279 | 
280 | Found in `seizure_prediction/cross_validation/legacy_strategy.py`
281 | 
282 | This strategy uses 3 folds per target, using hand-picked random seeds that seemed to give good
283 | results on my system. I'm not sure this will even work well on other peoples' systems if the
284 | random seeds generate different folds. This is what I used for the whole competition hence the
285 | legacy name so I've left it in there.
286 | 
287 | #### KFoldStrategy
288 | 
289 | Found in `seizure_prediction/cross_validation/kfold_strategy.py`
290 | 
291 | This was a post-competition half-hearted attempt to build a more robust K-fold cross-validation
292 | setup. The selected sequences do not rely on random seeds, and instead I roughly hand-picked
293 | (via an algorithm) a good number of folds and also a good selection across the preictal sequences
294 | that somewhat maximises the coverage of the preictal set.
295 | 
296 | For example, given 3 sequences in the preictal set it will use 3 folds `[(0, 1), (0, 2), (1,2)]`.
297 | For 6 sequences and 3 folds it will use `[(0, 1), (2, 3), (4, 5)]`.
298 | 
299 | It seems to roughly work okay now, but I've never had much trust in the cross-validation scores
300 | versus the leaderboard scores given that the test set is generally much bigger than the given
301 | training data.
302 | 
303 | ### Misc
304 | 
305 | I haven't fully cleaned up the code as much as I could, nor documented it as much as I could.
306 | I cleaned it up enough and tried to describe enough that you could take this code base and try
307 | out new transforms etc without too much difficulty.
308 | 
309 | If you clone this repo, you will probably want to start looking at `main.py` and it should
310 | hopefully be straightforward to get things going.
311 | 
312 | Feel free to message me with any questions.
313 | 


--------------------------------------------------------------------------------
/genetic.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python2.7
  2 | 
  3 | import random
  4 | from multiprocessing import Pool
  5 | import sys
  6 | 
  7 | import numpy as np
  8 | from deap import creator, base, tools
  9 | 
 10 | from seizure_prediction.classifiers import make_svm
 11 | from seizure_prediction.cross_validation.legacy_strategy import LegacyStrategy
 12 | from seizure_prediction.feature_selection import generate_feature_masks
 13 | from seizure_prediction.pipeline import Pipeline, FeatureConcatPipeline, InputSource
 14 | from seizure_prediction.scores import get_score_summary, print_results
 15 | from seizure_prediction.tasks import load_training_data, make_csv_for_target_predictions, write_submission_file, \
 16 |     cross_validation_score, check_training_data_loaded, check_test_data_loaded, make_submission_predictions, \
 17 |     calc_feature_mask_string
 18 | from seizure_prediction.transforms import Windower, Correlation, FreqCorrelation, FFT, \
 19 |     Magnitude, PIBSpectralEntropy, Log10, FreqBinning, FlattenChannels, PFD, HFD, Hurst, Preprocess
 20 | from seizure_prediction.settings import load_settings
 21 | from main import run_prepare_data_for_cross_validation
 22 | from seizure_prediction.fft_bins import *
 23 | 
 24 | 
 25 | cross_validation_strategy = LegacyStrategy()
 26 | 
 27 | 
 28 | def evaluate_fitness_score(settings, target, pipeline, classifier, classifier_name, quiet, arg):
 29 |     individual, best_score = arg
 30 |     if np.sum(individual) == 0:
 31 |         score = 0.0
 32 |     else:
 33 |         score = float(cross_validation_score(settings, target, pipeline, classifier, classifier_name,
 34 |             strategy=cross_validation_strategy, feature_mask=individual, quiet=True).mean_score)
 35 | 
 36 |     if score > best_score:
 37 |         if not quiet: print score, np.sum(individual)
 38 |     return score,
 39 | 
 40 | 
 41 | creator.create("RocAucMax", base.Fitness, weights=(1.0,))
 42 | creator.create("Individual", list, fitness=creator.RocAucMax)
 43 | 
 44 | 
 45 | def random_bool(threshold):
 46 |     return 1 if random.random() <= threshold else 0
 47 | 
 48 | 
 49 | def get_pipeline_data(settings, target, pipeline):
 50 |     data = load_training_data(settings, target, pipeline, check_only=False, quiet=True)
 51 |     num_features = data.X_train.shape[data.X_train.ndim-1]
 52 |     return num_features, data.num_train_segments
 53 | 
 54 | 
 55 | def process_target(settings, target, pipeline, classifier, classifier_name, ratio, ngen, quiet, threshold=400):
 56 |     # make results repeatable
 57 |     random.seed(0)
 58 | 
 59 |     num_features, num_training_examples = get_pipeline_data(settings, target, pipeline)
 60 | 
 61 |     # Using sub-feature selection for the human patients appears to perform worse than
 62 |     # using full feature set. My guess is that perhaps there is not enough training samples
 63 |     # for this technique to work effectively. So do not run GA if there are too few training
 64 |     # samples. The threshold parameter can be tweaked with more testing.
 65 |     if num_training_examples < threshold:
 66 |         score = float(cross_validation_score(settings, target, pipeline, classifier, classifier_name,
 67 |             strategy=cross_validation_strategy, quiet=True).mean_score)
 68 |         return score, [[1] * num_features]
 69 | 
 70 |     num_wanted_features = int(num_features * ratio)
 71 |     if not quiet: print 'ratio', ratio
 72 |     if not quiet: print 'num features', num_features
 73 |     if not quiet: print 'num wanted features', num_wanted_features
 74 | 
 75 |     if not quiet: print target, classifier_name
 76 | 
 77 |     pool = Pool(settings.N_jobs)
 78 | 
 79 |     toolbox = base.Toolbox()
 80 |     toolbox.register("map", pool.map)
 81 |     toolbox.register("attr_bool", random_bool, ratio)
 82 |     toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_bool, num_features)
 83 |     toolbox.register("population", tools.initRepeat, list, toolbox.individual)
 84 | 
 85 |     toolbox.register("evaluate", evaluate_fitness_score, settings, target, pipeline, classifier, classifier_name, quiet)
 86 |     toolbox.register("mate", tools.cxTwoPoint)
 87 |     toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)
 88 |     toolbox.register("select", tools.selTournament, tournsize=3)
 89 | 
 90 |     pop = toolbox.population(n=30)
 91 |     CXPB, MUTPB, NGEN = 0.5, 0.2, ngen
 92 | 
 93 |     best_score = 0
 94 |     best_feature_mask = None
 95 |     all_feature_masks = {}
 96 | 
 97 |     # Evaluate the entire population
 98 |     if not quiet: print 'evaluating pop %d' % len(pop)
 99 |     fitnesses = toolbox.map(toolbox.evaluate, [(ind, 1.0) for ind in pop])
100 |     if not quiet: print 'done evaluating'
101 | 
102 |     for ind, fit in zip(pop, fitnesses):
103 |         ind.fitness.values = fit
104 |         all_feature_masks[calc_feature_mask_string(ind)] = (list(ind), fit[0])
105 | 
106 |     # calc first best
107 |     fits = [ind.fitness.values[0] for ind in pop]
108 |     best_index = np.argmax(fits)
109 |     score = fits[best_index]
110 |     if score > best_score:
111 |         best_score = score
112 |         best_feature_mask = pop[best_index]
113 |         if not quiet: print 'new best', best_score, np.sum(best_feature_mask)
114 | 
115 |     # Begin the evolution
116 |     for g in range(NGEN):
117 |         if not quiet: print("-- %s: Generation %i --" % (target, g))
118 | 
119 |         # Select the next generation individuals
120 |         offspring = toolbox.select(pop, int(len(pop)))
121 |         # Clone the selected individuals
122 |         offspring = list(toolbox.map(toolbox.clone, offspring))
123 | 
124 |         # Apply crossover and mutation on the offspring
125 |         for child1, child2 in zip(offspring[::2], offspring[1::2]):
126 |             if random.random() < CXPB:
127 |                 toolbox.mate(child1, child2)
128 |                 del child1.fitness.values
129 |                 del child2.fitness.values
130 | 
131 |         for mutant in offspring:
132 |             if random.random() < MUTPB:
133 |                 toolbox.mutate(mutant)
134 |                 del mutant.fitness.values
135 | 
136 |         # Evaluate the individuals with an invalid fitness
137 |         invalid_ind = [ind for ind in offspring if not ind.fitness.valid]
138 |         fitnesses = toolbox.map(toolbox.evaluate, [(ind, best_score) for ind in invalid_ind])
139 |         for ind, fit in zip(invalid_ind, fitnesses):
140 |             ind.fitness.values = fit
141 |             all_feature_masks[calc_feature_mask_string(ind)] = (list(ind), fit[0])
142 | 
143 |         if not quiet: print("  Evaluated %i individuals (pop size %d)" % (len(invalid_ind), len(offspring)))
144 | 
145 |         # The population is entirely replaced by the offspring
146 |         pop[:] = offspring
147 | 
148 |         # Gather all the fitnesses in one list and print the stats
149 |         fits = [ind.fitness.values[0] for ind in pop]
150 |         best_index = np.argmax(fits)
151 |         all_f = [np.sum(ind) for ind in pop]
152 |         if not quiet: print '  %s, %s, %s (%d-%d)' % (target, fits[best_index], np.sum(pop[best_index]), np.min(all_f), np.max(all_f))
153 | 
154 |         length = len(pop)
155 |         mean = sum(fits) / length
156 | 
157 |         if not quiet: print("  Min %s" % min(fits))
158 |         if not quiet: print("  Max %s" % max(fits))
159 |         if not quiet: print("  Avg %s" % mean)
160 | 
161 |         score = fits[best_index]
162 |         if score > best_score:
163 |             best_score = score
164 |             best_feature_mask = pop[best_index]
165 |             if not quiet: print 'new best', best_score, np.sum(best_feature_mask)
166 | 
167 |     if not quiet: print("-- End of (successful) evolution --")
168 | 
169 |     best_ind = tools.selBest(pop, 1)[0]
170 |     if not quiet: print "-- Finished --\n%s\n%s\n%s" % (target, best_ind.fitness.values[0], best_ind)
171 | 
172 |     pop = list(all_feature_masks.values())
173 |     pop.sort(cmp=lambda x1, x2: cmp(x2[1], x1[1]))
174 |     sorted_pop = [ind for ind, score in pop]
175 |     print target, 'best', pop[0][1], 'worst', pop[-1][1]
176 | 
177 |     return best_score, sorted_pop
178 | 
179 | 
180 | def run_make_submission(settings, targets_and_pipelines, classifier, classifier_name):
181 |     pool = Pool(settings.N_jobs)
182 |     for i, (target, pipeline, feature_masks) in enumerate(targets_and_pipelines):
183 |         for j, feature_mask in enumerate(feature_masks):
184 |             progress_str = 'T=%d/%d M=%d/%d' % (i+1, len(targets_and_pipelines), j+1, len(feature_masks))
185 |             pool.apply_async(make_submission_predictions, [settings, target, pipeline, classifier, classifier_name], {'feature_mask': feature_mask, 'quiet': True, 'progress_str': progress_str})
186 |     pool.close()
187 |     pool.join()
188 | 
189 |     guesses = ['clip,preictal']
190 |     for target, pipeline, feature_masks in targets_and_pipelines:
191 |         test_predictions = []
192 | 
193 |         for feature_mask in feature_masks:
194 |             data = make_submission_predictions(settings, target, pipeline, classifier, classifier_name, feature_mask=feature_mask)
195 |             test_predictions.append(data.mean_predictions)
196 | 
197 |         predictions = np.mean(test_predictions, axis=0)
198 |         guesses += make_csv_for_target_predictions(target, predictions)
199 | 
200 |     output = '\n'.join(guesses)
201 |     submission_targets_and_pipelines = [(target, pipeline, feature_masks, classifier, classifier_name)
202 |         for target, pipeline, feature_masks in targets_and_pipelines]
203 |     write_submission_file(settings, output, None, None, classifier_name, submission_targets_and_pipelines)
204 | 
205 | 
206 | def run_prepare_data(settings, targets_and_pipelines, train=True, test=False):
207 |     for target, pipeline, feature_masks in targets_and_pipelines:
208 |         if train:
209 |             check_training_data_loaded(settings, target, pipeline)
210 |         if test:
211 |             check_test_data_loaded(settings, target, pipeline)
212 | 
213 | 
214 | def extract_masks_for_pipeline_and_masks(settings, target, pipeline, masks):
215 |     outs = [{} for mask in masks]
216 |     offset = 0
217 |     for p in pipeline.get_pipelines():
218 |         num_features, _ = get_pipeline_data(settings, target, p)
219 |         for i, mask in enumerate(masks):
220 |             p_mask = mask[offset:offset + num_features]
221 |             outs[i][p.get_name()] = p_mask
222 |         offset += num_features
223 |     for mask in masks:
224 |         assert offset == len(mask)
225 |     return outs
226 | 
227 | 
228 | def merge_dicts(*dicts):
229 |     x = dicts[0].copy()
230 |     for d in dicts[1:]:
231 |         x.update(d)
232 |     return x
233 | 
234 | 
235 | def get_submission_targets_and_masks(settings, targets, classifier, classifier_name, pipeline_groups, random_pipelines, random_ratio=0.525, ngen=10, limit=2, random_limit=2):
236 |     assert random_limit % limit == 0
237 |     random_multiplier = random_limit / limit
238 |     quiet = True
239 | 
240 |     random_pipeline = FeatureConcatPipeline(*random_pipelines)
241 | 
242 |     all_pipelines = []
243 |     all_pipelines.extend(random_pipelines)
244 |     for pg, ratio in pipeline_groups:
245 |         all_pipelines.extend(pg)
246 |     full_pipeline = FeatureConcatPipeline(*all_pipelines)
247 |     run_prepare_data(settings, [(target, full_pipeline, []) for target in targets], test=True)
248 | 
249 |     def get_pipeline_and_feature_masks(target, pipelines, classifier, classifier_name, ratio, ngen):
250 |         print target, 'fetching GA pipelines', [p.get_name() for p in pipelines]
251 |         pipeline = FeatureConcatPipeline(*pipelines)
252 |         score, best_N = process_target(settings, target, pipeline, classifier, classifier_name, ratio=ratio, ngen=ngen, quiet=quiet)
253 |         return pipeline, best_N
254 | 
255 |     targets_and_pipelines = []
256 |     for target in targets:
257 |         # NOTE(mike): All this stuff is a bit nasty. It gets the random-masks and the genetic-masks
258 |         # for different pipelines, and then pulls out the mask for each individual pipeline. A single
259 |         # FeatureConcatPipeline is then created to represent all the features, and the masks for each
260 |         # member of the FCP are merged together to form the single feature mask across the whole FCP.
261 | 
262 |         random_masks = generate_feature_masks(settings, target, random_pipeline, random_limit, random_ratio, random_state=0, quiet=quiet)
263 |         # contains a list of pairs, (pipeline, mask)
264 |         ga_groups = [get_pipeline_and_feature_masks(target, p, classifier, classifier_name, ratio, ngen) for p, ratio in pipeline_groups]
265 |         ga_groups = [(p, masks[0:limit]) for p, masks in ga_groups]
266 | 
267 |         print target, 'extracting GA per-pipeline masks...'
268 |         # contains a list of mask dictionaries
269 |         ga_dicts = [extract_masks_for_pipeline_and_masks(settings, target, pipeline, masks) for pipeline, masks in ga_groups]
270 |         ga_dicts = [mask_dicts * random_multiplier for mask_dicts in ga_dicts]
271 | 
272 |         r_dicts = extract_masks_for_pipeline_and_masks(settings, target, random_pipeline, random_masks)
273 |         # this contains a list of dictionaries which maps pipeline names to masks
274 |         # e.g. [r_dicts, ga_dicts0, ga_dicts1, ...]
275 |         zip_group = [r_dicts] + ga_dicts
276 | 
277 |         print target, 'merging all masks...'
278 |         feature_mask_dicts = [merge_dicts(*x) for x in zip(*zip_group)]
279 | 
280 |         feature_masks = []
281 |         for feature_mask_dict in feature_mask_dicts:
282 |             mask = []
283 |             for p in full_pipeline.get_pipelines():
284 |                 mask.extend(feature_mask_dict[p.get_name()])
285 |             feature_masks.append(mask)
286 | 
287 |         targets_and_pipelines.append((target, full_pipeline, feature_masks))
288 |     return targets_and_pipelines
289 | 
290 | 
291 | def main():
292 |     settings = load_settings()
293 | 
294 |     targets = [
295 |         'Dog_1',
296 |         'Dog_2',
297 |         'Dog_3',
298 |         'Dog_4',
299 |         'Dog_5',
300 |         'Patient_1',
301 |         'Patient_2'
302 |     ]
303 | 
304 |     # The genetic algorithm will be run individually on each pipeline group
305 |     pipeline_groups = [
306 |         ([
307 |             Pipeline(InputSource(), Preprocess(), Windower(75), PFD()),
308 |         ], 0.55),
309 |         ([
310 |             Pipeline(InputSource(), Preprocess(), Windower(75), Hurst()),
311 |         ], 0.55),
312 |         ([
313 |             Pipeline(InputSource(Preprocess(), Windower(75), FFT(), Magnitude()), PIBSpectralEntropy([0.25, 1, 1.75, 2.5, 3.25, 4, 5, 8.5, 12, 15.5, 19.5, 24])),
314 |             Pipeline(InputSource(Preprocess(), Windower(75), FFT(), Magnitude()), PIBSpectralEntropy([0.25, 2, 3.5, 6, 15, 24])),
315 |             Pipeline(InputSource(Preprocess(), Windower(75), FFT(), Magnitude()), PIBSpectralEntropy([0.25, 2, 3.5, 6, 15])),
316 |             Pipeline(InputSource(Preprocess(), Windower(75), FFT(), Magnitude()), PIBSpectralEntropy([0.25, 2, 3.5])),
317 |             Pipeline(InputSource(Preprocess(), Windower(75), FFT(), Magnitude()), PIBSpectralEntropy([6, 15, 24])),
318 |             Pipeline(InputSource(Preprocess(), Windower(75), FFT(), Magnitude()), PIBSpectralEntropy([2, 3.5, 6])),
319 |             Pipeline(InputSource(Preprocess(), Windower(75), FFT(), Magnitude()), PIBSpectralEntropy([3.5, 6, 15])),
320 |             Pipeline(InputSource(), Preprocess(), Windower(75), HFD(2)),
321 |         ], 0.55),
322 |     ]
323 | 
324 |     make_submission = len(sys.argv) >= 2 and sys.argv[1] == 'submission'
325 |     run_ga = not make_submission
326 | 
327 |     # This classifier is used in the genetic algorithm
328 |     ga_classifier, ga_classifier_name = make_svm(gamma=0.0079, C=2.7)
329 | 
330 |     if run_ga:
331 |         quiet = False
332 |         summaries = []
333 |         for ngen in [10]:
334 |             for pipelines, ratio in pipeline_groups:
335 |                 out = []
336 |                 for target in targets:
337 |                     print 'Running target', target
338 |                     run_prepare_data_for_cross_validation(settings, [target], pipelines, quiet=True)
339 |                     pipeline = FeatureConcatPipeline(*pipelines)
340 |                     score, best_N = process_target(settings, target, pipeline, ga_classifier, ga_classifier_name, ratio=ratio, ngen=ngen, quiet=quiet)
341 |                     print target, score, [np.sum(mask) for mask in best_N[0:10]]
342 |                     out.append((target, score, pipeline, best_N))
343 | 
344 |             scores = np.array([score for _, score, _, _ in out])
345 |             summary = get_score_summary('%s ngen=%d' % (ga_classifier_name, ngen), scores)
346 |             summaries.append((summary, np.mean(scores)))
347 |             print summary
348 | 
349 |         print_results(summaries)
350 | 
351 |     if make_submission:
352 |         random_pipelines = [
353 |             Pipeline(InputSource(), Preprocess(), Windower(75), Correlation('none')),
354 |             Pipeline(InputSource(), Preprocess(), Windower(75), FreqCorrelation(1, None, 'none')),
355 |             Pipeline(InputSource(Preprocess(), Windower(75), FFT(), Magnitude()), FreqBinning(winning_bins, 'mean'), Log10(), FlattenChannels()),
356 |         ]
357 | 
358 |         # These classifiers are used to make the final predictions
359 |         final_classifiers = [
360 |             # make_svm(gamma=0.0079, C=2.7),
361 |             make_svm(gamma=0.0068, C=2.0),
362 |             # make_svm(gamma=0.003, C=150.0),
363 |             # make_lr(C=0.04),
364 |             # make_simple_lr(),
365 |         ]
366 |         targets_and_pipelines = get_submission_targets_and_masks(settings, targets, ga_classifier, ga_classifier_name, pipeline_groups, random_pipelines)
367 |         for classifier, classifier_name in final_classifiers:
368 |             run_make_submission(settings, targets_and_pipelines, classifier, classifier_name)
369 | 
370 | 
371 | if __name__ == "__main__":
372 |     main()
373 | 


--------------------------------------------------------------------------------
/seizure_prediction/tasks.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import sklearn.utils
  3 | from sklearn.metrics import roc_auc_score
  4 | from common import time
  5 | from common.data import jsdict
  6 | from seizure_prediction import hdf5
  7 | from seizure_prediction.cross_validation.kfold_strategy import KFoldStrategy
  8 | from seizure_prediction.data import to_np_array
  9 | import gzip
 10 | import os.path
 11 | import hashlib
 12 | from seizure_prediction.data import load_data_mp
 13 | from seizure_prediction.pipeline import FeatureConcatPipeline
 14 | 
 15 | 
 16 | # flatten data down to 2 dimensions for putting through a classifier
 17 | # supports input shapes:
 18 | #   (num_segments, num_features)
 19 | #   (num_segments, num_windows, num_features)
 20 | #   (num_segments, num_windows, num_channels, num_features)
 21 | def flatten(data):
 22 |     if data.ndim == 2:
 23 |         return data
 24 |     if not data.ndim >= 3:
 25 |         print 'data shape', data.shape
 26 |         assert data.ndim >= 3
 27 |     s = data.shape
 28 |     out = data.reshape((np.product(s[0:2]), np.product(s[2:])))
 29 | 
 30 |     return out
 31 | 
 32 | 
 33 | # Load data for a given pipeline. This wraps load_data_mp to also provide FeatureConcatPipeline support.
 34 | # See load_data_mp for description of check_only and meta_only parameters.
 35 | def load_pipeline_data(settings, target, data_type, pipeline, check_only, quiet=False, meta_only=False):
 36 |     if check_only:
 37 |         return np.alltrue([load_data_mp(settings, target, data_type, p, check_only=True, quiet=quiet)
 38 |             for p in pipeline.get_pipelines()])
 39 | 
 40 |     if isinstance(pipeline, FeatureConcatPipeline):
 41 |         data = []
 42 |         meta = None
 43 |         num_features = 0
 44 | 
 45 |         for p in pipeline.get_pipelines():
 46 |             _data, _meta = load_data_mp(settings, target, data_type, p, quiet=quiet, meta_only=meta_only)
 47 |             data.append(_data)
 48 |             if meta is None:
 49 |                 meta = _meta
 50 |             for k in meta.keys():
 51 |                 if k == 'X_shape':
 52 |                     assert meta[k][:-1] == _meta[k][:-1]
 53 |                     num_features += _meta[k][-1]
 54 |                 elif isinstance(_meta[k], np.ndarray):
 55 |                     assert np.alltrue(meta[k] == _meta[k])
 56 |                 else:
 57 |                     assert meta[k] == _meta[k]
 58 | 
 59 |         d0 = data[0]
 60 |         if meta_only:
 61 |             data = None
 62 |             # combine shapes
 63 |             meta['X_shape'] = list(meta['X_shape'][:-1]) + [num_features]
 64 |         else:
 65 |             for d in data[1:]:
 66 |                 if d0.ndim != d.ndim:
 67 |                     print pipeline.get_name()
 68 |                     print 'd0', d0.shape, 'other', d.shape
 69 |                     assert d0.ndim == d.ndim
 70 |                 assert d0.shape[:-1] == d.shape[:-1]
 71 |             data = np.concatenate(data, axis=data[0].ndim-1)
 72 |     else:
 73 |         data, meta = load_data_mp(settings, target, data_type, pipeline, quiet=quiet, meta_only=meta_only)
 74 | 
 75 |     return data, meta
 76 | 
 77 | 
 78 | # Load training data, this loads the preictal and interictal pipeline data, optionally separates the
 79 | # data into training set and cross-validation set, and generates labels.
 80 | #
 81 | # strategy: cross-validation strategy, see LegacyStrategy() and KFoldStrategy()
 82 | # cv_fold_number: None to specify no cross-validation set for when making a submission,
 83 | #                 otherwise a number generated by the cross-validation strategy.
 84 | def load_training_data(settings, target, pipeline, check_only, strategy=None, cv_fold_number=None, quiet=False):
 85 |     cv = cv_fold_number is not None
 86 |     if check_only:
 87 |         return load_pipeline_data(settings, target, 'preictal', pipeline, check_only=True, quiet=quiet) or \
 88 |                load_pipeline_data(settings, target, 'interictal', pipeline, check_only=True, quiet=quiet)
 89 | 
 90 |     preictal, preictal_meta = load_pipeline_data(settings, target, 'preictal', pipeline, check_only=False, quiet=quiet)
 91 |     interictal, interictal_meta = load_pipeline_data(settings, target, 'interictal', pipeline, check_only=False, quiet=quiet)
 92 | 
 93 |     total_segments = preictal_meta.num_segments + interictal_meta.num_segments
 94 |     # print 'total_segments', total_segments
 95 | 
 96 |     if not quiet: print 'Preparing data ...',
 97 |     start = time.get_seconds()
 98 | 
 99 |     def make_fold(preictal_X_train, preictal_X_cv, interictal_X_train, interictal_X_cv):
100 |         num_train_segments = preictal_X_train.shape[0] + interictal_X_train.shape[0]
101 |         num_cv_segments = preictal_X_cv.shape[0] + interictal_X_cv.shape[0]
102 |         assert (num_train_segments + num_cv_segments) == total_segments
103 | 
104 |         flattened_preictal_X_train = flatten(preictal_X_train)
105 |         flattened_interictal_X_train = flatten(interictal_X_train)
106 |         flattened_preictal_X_cv = flatten(preictal_X_cv) if cv else np.empty((0,))
107 |         flattened_interictal_X_cv = flatten(interictal_X_cv) if cv else np.empty((0,))
108 | 
109 |         X_train = np.concatenate((flattened_preictal_X_train, flattened_interictal_X_train), axis=0)
110 |         X_cv = np.concatenate((flattened_preictal_X_cv, flattened_interictal_X_cv), axis=0)
111 | 
112 |         preictal_y_train = np.ones((flattened_preictal_X_train.shape[0],))
113 |         preictal_y_cv = np.ones((preictal_X_cv.shape[0],))
114 |         interictal_y_train = np.zeros((flattened_interictal_X_train.shape[0],))
115 |         interictal_y_cv = np.zeros((interictal_X_cv.shape[0],))
116 | 
117 |         y_train = np.concatenate((preictal_y_train, interictal_y_train), axis=0)
118 |         y_cv = np.concatenate((preictal_y_cv, interictal_y_cv), axis=0)
119 | 
120 |         X_train, y_train = sklearn.utils.shuffle(X_train, y_train, random_state=0)
121 | 
122 |         return jsdict({
123 |             'X_train': X_train,
124 |             'y_train': y_train,
125 |             'X_cv': X_cv,
126 |             'y_cv': y_cv,
127 |             'num_train_segments': num_train_segments,
128 |             'num_cv_segments': num_cv_segments
129 |         })
130 | 
131 |     if cv:
132 |         preictal_X_train, preictal_X_cv = strategy.split_train_cv(preictal, preictal_meta, cv_fold_number)
133 |         interictal_X_train, interictal_X_cv = strategy.split_train_cv(interictal, interictal_meta, cv_fold_number, interictal=True)
134 |         data = make_fold(preictal_X_train, preictal_X_cv, interictal_X_train, interictal_X_cv)
135 |     else:
136 |         preictal_X_train = preictal
137 |         preictal_X_cv = np.empty((0,))
138 |         interictal_X_train = interictal
139 |         interictal_X_cv = np.empty((0,))
140 |         data = make_fold(preictal_X_train, preictal_X_cv, interictal_X_train, interictal_X_cv)
141 | 
142 |     if not quiet: print '%ds' % (time.get_seconds() - start)
143 | 
144 |     if not quiet: print 'X_train', data.X_train.shape, 'y_train', data.y_train.shape, 'X_cv', data.X_cv.shape, 'y_cv', data.y_cv.shape
145 | 
146 |     return data
147 | 
148 | 
149 | # Load the test data for a given pipeline
150 | def load_test_data(settings, target, pipeline, quiet=False):
151 |     test, meta = load_pipeline_data(settings, target, 'test', pipeline, check_only=False, quiet=quiet)
152 |     X_test = flatten(test)
153 |     if not quiet: print 'X_test', test.shape, 'num_segments', meta.num_segments
154 |     return jsdict({
155 |         'X_test': X_test,
156 |         'num_segments': meta.num_segments
157 |     })
158 | 
159 | 
160 | # Train a classifier
161 | def train(classifier, training_data, quiet=False):
162 |     X_train = training_data.X_train
163 |     y_train = training_data.y_train
164 |     if not quiet: print 'Training ...',
165 |     start = time.get_seconds()
166 |     classifier.fit(X_train, y_train)
167 |     if not quiet: print '%ds' % (time.get_seconds() - start)
168 | 
169 | 
170 | # Make predictions, and then combine the N predictions if using windows using mean and median.
171 | # Returns (mean_predictions, median_predictions, raw_predictions)
172 | def make_predictions(classifier, X, num_segments):
173 |     predictions = classifier.predict_proba(X)[:, 1]
174 |     split_data = np.split(predictions, num_segments)
175 |     return to_np_array([np.mean(ps) for ps in split_data]), to_np_array([np.median(ps) for ps in split_data]), predictions
176 | 
177 | 
178 | # Save the output of function fn to os.path.join(*paths) if it doesn't exist on disk,
179 | # otherwise load the data from disk. Note that this changes the current working directory
180 | # in order to deal with too-big filenames generated by a large number of concatenated
181 | # features in FeatureConcatPipeline.
182 | def memoize(fn, paths):
183 |     cwd = os.getcwd()
184 | 
185 |     def change_to_target_dir():
186 |         for dir in paths[:-1]:
187 |             try:
188 |                 os.mkdir(dir)
189 |             except OSError, e:
190 |                 pass
191 |             os.chdir(dir)
192 | 
193 |     change_to_target_dir()
194 |     filename = paths[-1]
195 |     if os.path.exists(filename):
196 |         data = hdf5.read(filename)
197 |         os.chdir(cwd)
198 |         return data
199 | 
200 |     os.chdir(cwd)
201 |     data = fn()
202 |     change_to_target_dir()
203 |     tmp = '%s.pid.%d.tmp' % (filename, os.getpid())
204 |     hdf5.write(tmp, data)
205 |     os.rename(tmp, filename)
206 |     os.chdir(cwd)
207 | 
208 |     return jsdict(data)
209 | 
210 | 
211 | # Fast process-if-not-yet-processed method for training data
212 | def check_training_data_loaded(settings, target, pipeline, quiet=False):
213 |     if not load_pipeline_data(settings, target, 'preictal', pipeline, check_only=True, quiet=quiet):
214 |         load_pipeline_data(settings, target, 'preictal', pipeline, check_only=False, quiet=quiet)
215 |     if not load_pipeline_data(settings, target, 'interictal', pipeline, check_only=True, quiet=quiet):
216 |         load_pipeline_data(settings, target, 'interictal', pipeline, check_only=False, quiet=quiet)
217 | 
218 | 
219 | # Fast process-if-not-yet-processed method for test data
220 | def check_test_data_loaded(settings, target, pipeline, quiet=False):
221 |     if not load_pipeline_data(settings, target, 'test', pipeline, check_only=True, quiet=quiet):
222 |         load_pipeline_data(settings, target, 'test', pipeline, check_only=False, quiet=quiet)
223 | 
224 | 
225 | # Represent a feature_mask e.g. [1,0,0,1,1,1,0] as binary.
226 | def calc_feature_mask_bigint(mask):
227 |     mask = [int(x) for x in mask]
228 |     out = 0
229 |     for i, x in enumerate(mask):
230 |         out += x << i
231 |     return out
232 | 
233 | 
234 | # Represent a feature_mask e.g. [1,0,0,1,1,1,0] as string by concatenating
235 | # md5 and sha1. Should be unique enough. Used to provide a short-name for
236 | # saving data to disk. Otherwise the filename would be way too long.
237 | def calc_feature_mask_string(mask):
238 |     if mask is None:
239 |         return None
240 | 
241 |     out = calc_feature_mask_bigint(mask)
242 | 
243 |     hex_str = hex(out)
244 |     md5 = hashlib.md5(hex_str).hexdigest()
245 |     sha1 = hashlib.sha1(hex_str).hexdigest()
246 |     return md5 + sha1
247 | 
248 | 
249 | # Calculate cross-validation score for a single cv fold.
250 | def cross_val_score_for_one_fold(settings, target, pipeline, classifier, classifier_name, fold, strategy, feature_mask=None, progress_str=None, quiet=False):
251 |     def process():
252 | 
253 |         data = load_training_data(settings, target, pipeline, strategy=strategy, cv_fold_number=fold, check_only=False, quiet=quiet)
254 | 
255 |         if feature_mask is not None:
256 |             s = [slice(None),] * data.X_train.ndim
257 |             s[-1] = np.where(np.array(feature_mask) == True)[0]
258 |             data['X_train'] = data.X_train[s]
259 |             data['X_cv'] = data.X_cv[s]
260 |             if not quiet: print ' feature mask', 'X_train', data.X_train.shape, 'y_train', data.y_train.shape, 'X_cv', data.X_cv.shape, 'y_cv', data.y_cv.shape
261 | 
262 |         train(classifier, data, quiet=quiet)
263 |         if not quiet: print "Making predictions...",
264 |         timer = time.Timer()
265 |         mean_predictions, median_predictions, raw_predictions = make_predictions(classifier, data.X_cv, data.num_cv_segments)
266 |         if not quiet: print timer.pretty_str()
267 | 
268 |         mean_score = roc_auc_score(data.y_cv, mean_predictions)
269 |         median_score = roc_auc_score(data.y_cv, median_predictions)
270 | 
271 |         return jsdict({
272 |             'mean_score': mean_score,
273 |             'median_score': median_score,
274 |             'mean_predictions': mean_predictions,
275 |             'median_predictions': median_predictions,
276 |             'y_cv': data.y_cv
277 |         })
278 | 
279 |     feature_mask_string = calc_feature_mask_string(feature_mask)
280 |     fm_path = [feature_mask_string] if feature_mask_string is not None else []
281 |     paths = [settings.cache_dir, target, classifier_name] + pipeline.get_names() + fm_path + ['cv_%s_fold%d.hdf5' % (strategy.get_name(), fold)]
282 | 
283 |     if progress_str is not None:
284 |         print 'Running', progress_str, 'fold %d' % fold
285 |     return memoize(process, paths)
286 | 
287 | 
288 | # Calculate the average cross-validation score across N folds.
289 | #
290 | # pool: Optional multi-processing pool to use to schedule the folds, otherwise folds
291 | #       will be processed one-by-one
292 | # strategy: cross-validation strategy, see LegacyStrategy() and KFoldStrategy()
293 | # feature_mask: The feature_mask to apply before training.
294 | # progress_str: helper string for printing progress inside multiprocessing pool
295 | # return_data: returns full result if True, otherwise simply processes the folds without doing an consolidation work
296 | def cross_validation_score(settings, target, pipeline, classifier, classifier_name, strategy=None, pool=None, progress_str=None, feature_mask=None, return_data=True, quiet=False):
297 |     if strategy is None:
298 |         strategy = KFoldStrategy()
299 | 
300 |     if feature_mask is not None and np.count_nonzero(feature_mask) == len(feature_mask):
301 |         feature_mask = None
302 | 
303 |     _, preictal_meta = load_pipeline_data(settings, target, 'preictal', pipeline, check_only=False, quiet=quiet, meta_only=True)
304 |     cv_folds = strategy.get_folds(preictal_meta)
305 | 
306 |     if pool is not None:
307 |         results = [pool.apply_async(cross_val_score_for_one_fold, [settings, target, pipeline, classifier, classifier_name, fold],
308 |             {'strategy': strategy, 'feature_mask': feature_mask, 'progress_str': progress_str, 'quiet': quiet})
309 |             for fold in cv_folds]
310 |         if return_data:
311 |             out = [r.get() for r in results]
312 |     else:
313 |         out = [cross_val_score_for_one_fold(settings, target, pipeline, classifier, classifier_name, strategy=strategy,
314 |             fold=fold, feature_mask=feature_mask, progress_str=progress_str, quiet=quiet) for fold in cv_folds]
315 | 
316 |     if return_data:
317 |         mean_scores = [d.mean_score for d in out]
318 |         median_scores = [d.median_score for d in out]
319 |         mean_predictions = [d.mean_predictions for d in out]
320 |         median_predictions = [d.median_predictions for d in out]
321 |         y_cvs = [d.y_cv for d in out]
322 | 
323 |         return jsdict({
324 |             'mean_score': np.mean(mean_scores),
325 |             'median_score': np.mean(median_scores),
326 |             'mean_scores': np.array(mean_scores),
327 |             'median_scores': np.array(median_scores),
328 |             'mean_predictions': mean_predictions,
329 |             'median_predictions': median_predictions,
330 |             'y_cvs': y_cvs
331 |         })
332 | 
333 | 
334 | # Make submission predictions for a given pipeline and classifier.
335 | #
336 | # feature_mask: The feature_mask to apply before training.
337 | # progress_str: helper string for printing progress inside multiprocessing pool
338 | def make_submission_predictions(settings, target, pipeline, classifier, classifier_name, feature_mask=None, quiet=False, progress_str=None):
339 |     if progress_str is not None:
340 |         print 'Running', progress_str
341 | 
342 |     feature_mask_string = calc_feature_mask_string(feature_mask)
343 | 
344 |     def process():
345 |         data = load_training_data(settings, target, pipeline, check_only=False, quiet=quiet)
346 | 
347 |         if feature_mask is not None:
348 |             s = [slice(None),] * data.X_train.ndim
349 |             s[-1] = np.where(np.array(feature_mask) == True)[0]
350 |             data['X_train'] = data.X_train[s]
351 |             if not quiet: print 'Feature mask', 'X_train', data.X_train.shape
352 | 
353 |         train(classifier, data, quiet=quiet)
354 |         train_predictions = classifier.predict_proba(data.X_train)[:, 1]
355 |         y_train = data.y_train
356 |         del data
357 | 
358 |         data = load_test_data(settings, target, pipeline, quiet=quiet)
359 | 
360 |         if feature_mask is not None:
361 |             s = [slice(None),] * data.X_test.ndim
362 |             s[-1] = np.where(np.array(feature_mask) == True)[0]
363 |             data['X_test'] = data.X_test[s]
364 |             if not quiet: print 'Feature mask', 'X_test', data.X_test.shape
365 | 
366 |         predictions = make_predictions(classifier, data.X_test, data.num_segments)
367 |         predictions_mean, predictions_median, test_predictions = predictions
368 | 
369 |         return {
370 |             'mean_predictions': predictions_mean,
371 |             'median_predictions': predictions_median,
372 |             'train_predictions': train_predictions,
373 |             'y_train': y_train,
374 |             'test_predictions': test_predictions,
375 |             'num_segments': data.num_segments
376 |         }
377 | 
378 |     fm_path = [feature_mask_string] if feature_mask_string is not None else []
379 |     paths = [settings.cache_dir, target, classifier_name] + pipeline.get_names() + fm_path + ['predictions.hdf5']
380 |     return memoize(process, paths)
381 | 
382 | 
383 | # Convert predictions into csv submission format
384 | def make_csv_for_target_predictions(target, predictions):
385 |     return ['%s_test_segment_%.4d.mat,%.10f' % (target, i+1, p) for i, p in enumerate(predictions)]
386 | 
387 | 
388 | # Wrapper to return both mean and median-combined predictions in csv submission format
389 | def make_submission_csv(settings, target, pipeline, classifier, classifier_name):
390 |     data = make_submission_predictions(settings, target, pipeline, classifier, classifier_name, quiet=True)
391 | 
392 |     csv_mean = make_csv_for_target_predictions(target, data.mean_predictions)
393 |     csv_median = make_csv_for_target_predictions(target, data.median_predictions)
394 | 
395 |     return csv_mean, csv_median
396 | 
397 | 
398 | # Write a submission file given the guesses either as a list of strings or already as the final string.
399 | # Filename is generated as submission%d.csv.gz with companion submission%d.txt where the number is
400 | # auto-increment given existing files in the submission directory. The companion txt file provides info
401 | # about what was used to generate that submission.
402 | def write_submission_file(settings, guesses, name, pipeline, classifier_name, targets_and_pipelines=None, target_pipelines=None):
403 |     guesses = '\n'.join(guesses) if isinstance(guesses, list) else guesses
404 |     id = 0
405 |     done = False
406 |     while not done:
407 |         try:
408 |             filename = os.path.join(settings.submission_dir, 'submission%d.csv.gz' % id)
409 |             # make the file to 'take it'
410 |             fd = os.open(filename, os.O_CREAT | os.O_EXCL | os.O_WRONLY, 0644)
411 |             os.close(fd)
412 | 
413 |             f = gzip.open(filename, 'wb')
414 |             f.write(guesses)
415 |             f.close()
416 | 
417 |             print 'wrote', filename
418 | 
419 |             filename = os.path.join(settings.submission_dir, 'submission%d.txt' % id)
420 |             with open(filename, 'w') as f:
421 |                 print >>f, classifier_name
422 |                 print >>f, name
423 |                 if target_pipelines is not None:
424 |                     for target in sorted(target_pipelines.keys()):
425 |                         pipeline = target_pipelines[target]
426 |                         print >>f, target
427 |                         print >>f, pipeline.get_name()
428 |                 if targets_and_pipelines is not None:
429 |                     for target, pipeline, feature_masks, _, _ in targets_and_pipelines:
430 |                         print >>f, target
431 |                         print >>f, 'FEATURE MASKS'
432 |                         print >>f, '\n'.join(pipeline.get_names())
433 |                         for i, mask in enumerate(feature_masks):
434 |                             print >>f, 'Mask %d' % i
435 |                             print >>f, mask
436 |                 else:
437 |                     for p_name in pipeline.get_names():
438 |                         print >>f, p_name
439 |             print 'wrote', filename
440 | 
441 |             done = True
442 | 
443 |         except OSError, e:
444 |             id += 1
445 | 


--------------------------------------------------------------------------------
/seizure_prediction/transforms.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | from scipy.signal import hilbert
  3 | from sklearn import preprocessing
  4 | import scipy.stats
  5 | import pandas as pd
  6 | 
  7 | from data import to_np_array
  8 | 
  9 | 
 10 | # optional modules for trying out different transforms
 11 | try:
 12 |     import pywt
 13 | except ImportError, e:
 14 |     pass
 15 | 
 16 | try:
 17 |     from scikits.talkbox.features import mfcc
 18 | except ImportError, e:
 19 |     pass
 20 | 
 21 | # for auto regressive model
 22 | try:
 23 |     import spectrum
 24 | except ImportError, e:
 25 |     pass
 26 | 
 27 | 
 28 | 
 29 | # NOTE(mike): Some transforms operate on the raw data in the shape (NUM_CHANNELS, NUM_FEATURES).
 30 | # Others operate on windowed data in the shape (NUM_WINDOWS, NUM_CHANNELS, NUM_FEATURES).
 31 | # I've been a bit lazy and just made the ApplyManyTransform base class helper... so if you intend
 32 | # a transform to work on pre-windowed data, just write a plain transform with apply method, if
 33 | # you intend to work on windowed-data, derive from ApplyManyTransform and implement apply_one method.
 34 | # Really this is just a problem of number of axes, and np.apply_along_axis could probably be used to
 35 | # clean up this mess. :) I haven't bothered updating it as things are working as they are.
 36 | 
 37 | class ApplyManyTransform(object):
 38 |     def apply(self, datas, meta):
 39 |         if datas.ndim >= 3:
 40 |             out = []
 41 |             for d in datas:
 42 |                 out.append(self.apply_one(d, meta))
 43 | 
 44 |             return to_np_array(out)
 45 |         else:
 46 |             return self.apply_one(datas, meta)
 47 | 
 48 | 
 49 | class FFT:
 50 |     """
 51 |     Apply Fast Fourier Transform to the last axis.
 52 |     """
 53 |     def get_name(self):
 54 |         return "fft"
 55 | 
 56 |     def apply(self, data, meta=None):
 57 |         axis = data.ndim - 1
 58 |         return np.fft.rfft(data, axis=axis)
 59 | 
 60 | 
 61 | class Slice:
 62 |     """
 63 |     Take a slice of the data on the last axis.
 64 |     e.g. Slice(1, 48) works like a normal python slice, that is 1-47 will be taken
 65 |     """
 66 |     def __init__(self, start, end=None):
 67 |         self.start = start
 68 |         self.end = end
 69 | 
 70 |     def get_name(self):
 71 |         return "slice%d%s" % (self.start, '-%d' % self.end if self.end is not None else '')
 72 | 
 73 |     def apply(self, data, meta=None):
 74 |         s = [slice(None),] * data.ndim
 75 |         s[-1] = slice(self.start, self.end)
 76 |         return data[s]
 77 | 
 78 | 
 79 | class MFCC:
 80 |     """
 81 |     Mel-frequency cepstrum coefficients
 82 |     """
 83 |     def get_name(self):
 84 |         return "mfcc"
 85 | 
 86 |     def apply(self, data, meta=None):
 87 |         all_ceps = []
 88 |         for ch in data:
 89 |             ceps, mspec, spec = mfcc(ch)
 90 |             all_ceps.append(ceps.ravel())
 91 | 
 92 |         return to_np_array(all_ceps)
 93 | 
 94 | 
 95 | class Magnitude:
 96 |     """
 97 |     Take magnitudes of Complex data
 98 |     """
 99 |     def get_name(self):
100 |         return "mag"
101 | 
102 |     def apply(self, data, meta=None):
103 |         return np.abs(data)
104 | 
105 | 
106 | class Log:
107 |     """
108 |     Apply LogE
109 |     """
110 |     def get_name(self):
111 |         return "log"
112 | 
113 |     def apply(self, data, meta=None):
114 |         indices = np.where(data <= 0)
115 |         data[indices] = np.max(data)
116 |         data[indices] = (np.min(data) * 0.1)
117 |         return np.log(data)
118 | 
119 | 
120 | class Log2:
121 |     """
122 |     Apply Log2
123 |     """
124 |     def get_name(self):
125 |         return "log2"
126 | 
127 |     def apply(self, data, meta=None):
128 |         indices = np.where(data <= 0)
129 |         data[indices] = np.max(data)
130 |         data[indices] = (np.min(data) * 0.1)
131 |         return np.log2(data)
132 | 
133 | 
134 | class Log10:
135 |     """
136 |     Apply Log10
137 |     """
138 |     def get_name(self):
139 |         return "log10"
140 | 
141 |     def apply(self, data, meta=None):
142 |         indices = np.where(data <= 0)
143 |         data[indices] = np.max(data)
144 |         data[indices] = (np.min(data) * 0.1)
145 |         return np.log10(data)
146 | 
147 | 
148 | class Stats(ApplyManyTransform):
149 |     """
150 |     Subtract the mean, then take (min, max, standard_deviation) for each channel.
151 |     """
152 |     def get_name(self):
153 |         return "stats"
154 | 
155 |     def apply_one(self, data, meta=None):
156 |         # data[ch][dim]
157 |         shape = data.shape
158 |         out = np.empty((shape[0], 3))
159 |         for i in range(len(data)):
160 |             ch_data = data[i]
161 |             ch_data -= np.mean(ch_data)
162 |             outi = out[i]
163 |             outi[0] = np.std(ch_data)
164 |             outi[1] = np.min(ch_data)
165 |             outi[2] = np.max(ch_data)
166 | 
167 |         return out
168 | 
169 | 
170 | class MomentPerChannel(ApplyManyTransform):
171 |     """
172 |     Calculate the Nth moment per channel.
173 |     """
174 |     def __init__(self, n):
175 |         self.n = n
176 | 
177 |     def get_name(self):
178 |         return "moment%d" % self.n
179 | 
180 |     def apply_one(self, data, meta=None):
181 |         return scipy.stats.moment(data, moment=self.n, axis=data.ndim-1)
182 | 
183 | 
184 | class UnitScale:
185 |     """
186 |     Scale across the last axis.
187 |     """
188 |     def get_name(self):
189 |         return 'unit-scale'
190 | 
191 |     def apply(self, data, meta=None):
192 |         return preprocessing.scale(data, axis=data.ndim-1)
193 | 
194 | 
195 | class UnitScaleFeat:
196 |     """
197 |     Scale across the first axis, i.e. scale each feature.
198 |     """
199 |     def get_name(self):
200 |         return 'unit-scale-feat'
201 | 
202 |     def apply(self, data, meta=None):
203 |         return preprocessing.scale(data.astype(np.float64), axis=0)
204 | 
205 | 
206 | class CorrelationMatrix(ApplyManyTransform):
207 |     """
208 |     Calculate correlation coefficients matrix across all EEG channels.
209 |     """
210 |     def get_name(self):
211 |         return 'corr-mat'
212 | 
213 |     def apply_one(self, data, meta=None):
214 |         return np.corrcoef(data)
215 | 
216 | 
217 | class Eigenvalues(ApplyManyTransform):
218 |     """
219 |     Take eigenvalues of a matrix, and sort them by magnitude in order to
220 |     make them useful as features (as they have no inherent order).
221 |     """
222 |     def get_name(self):
223 |         return 'eigen'
224 | 
225 |     def apply_one(self, data, meta=None):
226 |         w, v = np.linalg.eig(data)
227 |         w = np.absolute(w)
228 |         w.sort()
229 |         return w
230 | 
231 | 
232 | # Take the upper right triangle of a matrix
233 | def upper_right_triangle(matrix):
234 |     accum = []
235 |     for i in range(matrix.shape[0]):
236 |         for j in range(i+1, matrix.shape[1]):
237 |             accum.append(matrix[i, j])
238 | 
239 |     return to_np_array(accum)
240 | 
241 | 
242 | class UpperRightTriangle(ApplyManyTransform):
243 |     """
244 |     Take the upper right triangle of a matrix.
245 |     """
246 |     def get_name(self):
247 |         return 'urt'
248 | 
249 |     def apply_one(self, data, meta=None):
250 |         assert data.ndim == 2 and data.shape[0] == data.shape[1]
251 |         return upper_right_triangle(data)
252 | 
253 | 
254 | class FreqCorrelation(ApplyManyTransform):
255 |     """
256 |     Correlation in the frequency domain. First take FFT with (start, end) slice options,
257 |     then calculate correlation co-efficients on the FFT output, followed by calculating
258 |     eigenvalues on the correlation co-efficients matrix.
259 | 
260 |     The output features are (fft, upper_right_diagonal(correlation_coefficients), eigenvalues)
261 | 
262 |     Features can be selected/omitted using the constructor arguments.
263 |     """
264 |     def __init__(self, start_hz, end_hz, option, use_phase=False, with_fft=False, with_corr=True, with_eigen=True):
265 |         self.start_hz = start_hz
266 |         self.end_hz = end_hz
267 |         self.option = option
268 |         self.with_fft = with_fft
269 |         self.with_corr = with_corr
270 |         self.with_eigen = with_eigen
271 |         self.use_phase = use_phase
272 |         assert option in ('us', 'usf', 'none', 'fft_in')
273 |         assert with_corr or with_eigen
274 | 
275 |     def get_name(self):
276 |         selections = []
277 |         if self.option in ('us', 'usf', 'fft_in'):
278 |             selections.append(self.option)
279 |         if self.with_fft:
280 |             selections.append('fft')
281 |         if not self.with_corr:
282 |             selections.append('nocorr')
283 |         if not self.with_eigen:
284 |             selections.append('noeig')
285 |         if len(selections) > 0:
286 |             selection_str = '-' + '-'.join(selections)
287 |         else:
288 |             selection_str = ''
289 |         return 'freq-corr%s-%s-%s%s' % ('-phase' if self.use_phase else '', self.start_hz, self.end_hz, selection_str)
290 | 
291 |     def apply_one(self, data, meta=None):
292 |         num_time_samples = data.shape[-1] if self.option != 'fft_in' else (data.shape[-1] - 1) * 2 # revert FFT shape change
293 |         if self.start_hz == 1 and self.end_hz is None:
294 |             freq_slice = Slice(self.start_hz, self.end_hz)
295 |         else:
296 |             # FFT range is from 0Hz to 101Hz
297 |             def calc_index(f):
298 |                 return int((f / (meta.sampling_frequency/2.0)) * num_time_samples) if f is not None else num_time_samples
299 |             freq_slice = Slice(calc_index(self.start_hz), calc_index(self.end_hz))
300 |             # print data.shape, freq_slice.start, freq_slice.end
301 |             # import sys
302 |             # sys.exit(0)
303 | 
304 |         data1 = data
305 |         if self.option != 'fft_in':
306 |             data1 = FFT().apply(data1)
307 |         data1 = freq_slice.apply(data1)
308 |         if self.use_phase:
309 |             data1 = np.angle(data1)
310 |         else:
311 |             data1 = Magnitude().apply(data1)
312 |             data1 = Log10().apply(data1)
313 | 
314 |         data2 = data1
315 |         if self.option == 'usf':
316 |             data2 = UnitScaleFeat().apply(data2)
317 |         elif self.option == 'us':
318 |             data2 = UnitScale().apply(data2)
319 | 
320 |         data2 = CorrelationMatrix().apply_one(data2)
321 | 
322 |         if self.with_eigen:
323 |             w = Eigenvalues().apply_one(data2)
324 | 
325 |         out = []
326 |         if self.with_corr:
327 |             data2 = upper_right_triangle(data2)
328 |             out.append(data2)
329 |         if self.with_eigen:
330 |             out.append(w)
331 |         if self.with_fft:
332 |             data1 = data1.ravel()
333 |             out.append(data1)
334 |         for d in out:
335 |             assert d.ndim == 1
336 | 
337 |         return np.concatenate(out, axis=0)
338 | 
339 | 
340 | class Correlation(ApplyManyTransform):
341 |     """
342 |     Correlation in the time domain. Calculate correlation co-efficients
343 |     followed by calculating eigenvalues on the correlation co-efficients matrix.
344 | 
345 |     The output features are (upper_right_diagonal(correlation_coefficients), eigenvalues)
346 | 
347 |     Features can be selected/omitted using the constructor arguments.
348 |     """
349 |     def __init__(self, scale_option, with_corr=True, with_eigen=True):
350 |         self.scale_option = scale_option
351 |         self.with_corr = with_corr
352 |         self.with_eigen = with_eigen
353 |         assert scale_option in ('us', 'usf', 'none')
354 |         assert with_corr or with_eigen
355 | 
356 |     def get_name(self):
357 |         selections = []
358 |         if self.scale_option != 'none':
359 |             selections.append(self.scale_option)
360 |         if not self.with_corr:
361 |             selections.append('nocorr')
362 |         if not self.with_eigen:
363 |             selections.append('noeig')
364 |         if len(selections) > 0:
365 |             selection_str = '-' + '-'.join(selections)
366 |         else:
367 |             selection_str = ''
368 |         return 'corr%s' % (selection_str)
369 | 
370 |     def apply_one(self, data, meta=None):
371 |         data1 = data
372 |         if self.scale_option == 'usf':
373 |             data1 = UnitScaleFeat().apply(data1)
374 |         elif self.scale_option == 'us':
375 |             data1 = UnitScale().apply(data1)
376 | 
377 |         data1 = CorrelationMatrix().apply_one(data1)
378 | 
379 |         # patch nans
380 |         data1[np.where(np.isnan(data1))] = -2
381 | 
382 |         if self.with_eigen:
383 |             w = Eigenvalues().apply_one(data1)
384 | 
385 |         out = []
386 |         if self.with_corr:
387 |             data1 = upper_right_triangle(data1)
388 |             out.append(data1)
389 |         if self.with_eigen:
390 |             out.append(w)
391 | 
392 |         for d in out:
393 |             assert d.ndim == 1
394 | 
395 |         return np.concatenate(out, axis=0)
396 | 
397 | 
398 | class FlattenChannels(object):
399 |     """
400 |     Reshapes the data from (..., N_CHANNELS, N_FEATURES) to (..., N_CHANNELS * N_FEATURES)
401 |     """
402 |     def get_name(self):
403 |         return 'fch'
404 | 
405 |     def apply(self, data, meta=None):
406 |         if data.ndim == 2:
407 |             return data.ravel()
408 |         elif data.ndim == 3:
409 |             s = data.shape
410 |             return data.reshape((s[0], np.product(s[1:])))
411 |         else:
412 |             raise NotImplementedError()
413 | 
414 | 
415 | class Windower:
416 |     """
417 |     Breaks the time-series data into N second segments, for example 60s windows
418 |     will create 10 windows given a 600s segment. The output is the reshaped data
419 |     e.g. (600, 120000) -> (600, 10, 12000)
420 |     """
421 |     def __init__(self, window_secs=None):
422 |         self.window_secs = window_secs
423 |         self.name = 'w-%ds' % window_secs if window_secs is not None else 'w-whole'
424 | 
425 |     def get_name(self):
426 |         return self.name
427 | 
428 |     def apply(self, X, meta=None):
429 |         if self.window_secs is None:
430 |             return X.reshape([1] + list(X.shape))
431 | 
432 |         num_windows = meta.data_length_sec / self.window_secs
433 |         samples_per_window = self.window_secs * int(meta.sampling_frequency)
434 |         samples_used = num_windows * samples_per_window
435 |         samples_dropped = X.shape[-1] - samples_used
436 |         X = Slice(samples_dropped).apply(X)
437 |         out = np.split(X, num_windows, axis=X.ndim-1)
438 |         out = to_np_array(out)
439 |         return out
440 | 
441 | class PreictalWindowGenerator:
442 |     """
443 |     Experimental windower that generates overlapping windows for preictal segments only.
444 |     The window_secs parameter describes how long each window is, and gen_factor describes
445 |     how many extra windows you want as a multiplier.
446 | 
447 |     For example given a 600s segment, a window size of 60s will give you 10 windows,
448 |     this number is then multiplied by gen_factor, e.g. 20 windows if gen_factor is 2.
449 |     The window size is fixed and the starting point for each window will be evenly-spaced.
450 | 
451 |     It's been a while since I've used this, not even sure if it works properly...
452 |     """
453 |     def __init__(self, window_secs, gen_factor):
454 |         self.window_secs = window_secs
455 |         self.gen_factor = gen_factor
456 |         self.name = 'wg-%ds-%d' % (window_secs, gen_factor)
457 |         self.windower = Windower(window_secs)
458 | 
459 |     def get_name(self):
460 |         return self.name
461 | 
462 |     def apply(self, X, meta):
463 |         if meta.data_type == 'preictal':
464 |             num_windows = (meta.data_length_sec / self.window_secs) * self.gen_factor
465 |             samples_per_window = self.window_secs * int(meta.sampling_frequency) / self.gen_factor
466 |             samples_used = num_windows * samples_per_window
467 |             samples_dropped = X.shape[-1] - samples_used
468 |             X = Slice(samples_dropped).apply(X)
469 |             pieces = np.split(X, num_windows, axis=X.ndim-1)
470 |             pieces_per_window = self.gen_factor
471 |             gen = [np.concatenate(pieces[i:i+pieces_per_window], axis=pieces[0].ndim - 1) for i in range(0, num_windows - self.gen_factor + 1)]
472 |             gen = to_np_array(gen)
473 |             return gen
474 |         else:
475 |             return self.windower.apply(X, meta)
476 | 
477 | 
478 | class Hurst:
479 |     """
480 |     Hurst exponent per-channel, see http://en.wikipedia.org/wiki/Hurst_exponent
481 | 
482 |     Another description can be found here: http://www.ijetch.org/papers/698-W10024.pdf
483 |     Kavya Devarajan, S. Bagyaraj, Vinitha Balasampath, Jyostna. E. and Jayasri. K.,
484 |     "EEG-Based Epilepsy Detection and Prediction," International Journal of Engineering
485 |     and Technology vol. 6, no. 3, pp. 212-216, 2014.
486 | 
487 |     """
488 |     def get_name(self):
489 |         return 'hurst'
490 | 
491 |     def apply(self, X, meta):
492 |         def apply_one(x):
493 |             x -= x.mean()
494 |             z = np.cumsum(x)
495 |             r = (np.maximum.accumulate(z) - np.minimum.accumulate(z))[1:]
496 |             s = pd.expanding_std(x)[1:]
497 | 
498 |             # prevent division by 0
499 |             s[np.where(s == 0)] = 1e-12
500 |             r += 1e-12
501 | 
502 |             y_axis = np.log(r / s)
503 |             x_axis = np.log(np.arange(1, len(y_axis) + 1))
504 |             x_axis = np.vstack([x_axis, np.ones(len(x_axis))]).T
505 | 
506 |             m, b = np.linalg.lstsq(x_axis, y_axis)[0]
507 |             return m
508 | 
509 |         return np.apply_along_axis(apply_one, -1, X)
510 | 
511 | 
512 | class PFD(ApplyManyTransform):
513 |     """
514 |     Petrosian fractal dimension per-channel
515 | 
516 |     Implementation derived from reading:
517 |     http://arxiv.org/pdf/0804.3361.pdf
518 |     F.S. Bao, D.Y.Lie,Y.Zhang,"A new approach to automated epileptic diagnosis using EEG
519 |     and probabilistic neural network",ICTAI'08, pp. 482-486, 2008.
520 |     """
521 |     def get_name(self):
522 |         return 'pfd'
523 | 
524 |     def pfd_for_ch(self, ch):
525 |         diff = np.diff(ch, n=1, axis=0)
526 | 
527 |         asign = np.sign(diff)
528 |         sign_changes = ((np.roll(asign, 1) - asign) != 0).astype(int)
529 |         N_delta = np.count_nonzero(sign_changes)
530 | 
531 |         n = len(ch)
532 |         log10n = np.log10(n)
533 |         return log10n / (log10n + np.log10(n / (n + 0.4 * N_delta)))
534 | 
535 |     def apply_one(self, X, meta=None):
536 |         return to_np_array([self.pfd_for_ch(ch) for ch in X])
537 | 
538 | 
539 | def hfd(X, kmax):
540 |     N = len(X)
541 |     Nm1 = float(N - 1)
542 |     L = np.empty((kmax,))
543 |     L[0] = np.sum(abs(np.diff(X, n=1))) # shortcut :)
544 |     for k in xrange(2, kmax + 1):
545 |         Lmks = np.empty((k,))
546 |         for m in xrange(1, k + 1):
547 |             i_end = (N - m) / k # int
548 |             Lmk_sum = np.sum(abs(np.diff(X[np.arange(m - 1, m + (i_end + 1) * k - 1, k)], n=1)))
549 |             Lmk = Lmk_sum * Nm1 / (i_end * k)
550 |             Lmks[m-1] = Lmk
551 | 
552 |         L[k - 1] = np.mean(Lmks)
553 | 
554 |     a = np.empty((kmax, 2))
555 |     a[:, 0] = np.log(1.0 / np.arange(1.0, kmax + 1.0))
556 |     a[:, 1] = 1.0
557 | 
558 |     b = np.log(L)
559 | 
560 |     # find x by solving for ax = b
561 |     x, residues, rank, s = np.linalg.lstsq(a, b)
562 |     return x[0]
563 | 
564 | 
565 | class HFD(ApplyManyTransform):
566 |     """
567 |     Higuchi fractal dimension per-channel
568 | 
569 |     Implementation derived from reading:
570 |     http://arxiv.org/pdf/0804.3361.pdf
571 |     F.S. Bao, D.Y.Lie,Y.Zhang,"A new approach to automated epileptic diagnosis using EEG
572 |     and probabilistic neural network",ICTAI'08, pp. 482-486, 2008.
573 |     """
574 |     def __init__(self, kmax):
575 |         self.kmax = kmax
576 | 
577 |     def get_name(self):
578 |         return 'hfd-%d' % self.kmax
579 | 
580 |     def apply_one(self, data, meta=None):
581 |         return to_np_array([hfd(ch, self.kmax) for ch in data])
582 | 
583 | 
584 | class Diff(ApplyManyTransform):
585 |     """
586 |     Wrapper for np.diff
587 |     """
588 |     def __init__(self, order):
589 |         self.order = order
590 | 
591 |     def get_name(self):
592 |         return 'diff-%d' % self.order
593 | 
594 |     def apply_one(self, data, meta=None):
595 |         return np.diff(data, n=self.order, axis=data.ndim-1)
596 | 
597 | 
598 | class SpectralEntropy(ApplyManyTransform):
599 |     """
600 |     Calculates Shannon entropy between the given frequency ranges.
601 |     e.g. The probability density function of FFT magnitude is calculated, then
602 |     given range [1,2,3], Shannon entropy is calculated between 1hz and 2hz, 2hz and 3hz
603 |     in this case giving 2 values per channel.
604 | 
605 |     NOTE(mike): Input for this transform must be from (FFT(), Magnitude())
606 |     """
607 |     def __init__(self, freq_ranges, flatten=True):
608 |         self.freq_ranges = freq_ranges
609 |         self.flatten = flatten
610 | 
611 |     def get_name(self):
612 |         return 'spec-ent-%s%s' % ('-'.join([str(f) for f in self.freq_ranges]), '-nf' if not self.flatten else '')
613 | 
614 |     def apply_one(self, fft_mag, meta):
615 |         num_time_samples = (fft_mag.shape[-1] - 1) * 2 # revert FFT shape change
616 | 
617 |         X = fft_mag ** 2
618 |         for ch in X:
619 |             ch /= np.sum(ch + 1e-12)
620 | 
621 |         psd = X # pdf
622 | 
623 |         out = []
624 | 
625 |         #[0,1,2] -> [[0,1], [1,2]]
626 |         for start_freq, end_freq in zip(self.freq_ranges[:-1], self.freq_ranges[1:]):
627 |             start_index = np.floor((start_freq / meta.sampling_frequency) * num_time_samples)
628 |             end_index = np.floor((end_freq / meta.sampling_frequency) * num_time_samples)
629 |             selected = psd[:, start_index:end_index]
630 | 
631 |             entropies = - np.sum(selected * np.log2(selected + 1e-12), axis=selected.ndim-1) / np.log2(end_index - start_index)
632 |             if self.flatten:
633 |                 out.append(entropies.ravel())
634 |             else:
635 |                 out.append(entropies)
636 | 
637 |         if self.flatten:
638 |             return np.concatenate(out)
639 |         else:
640 |             return to_np_array(out)
641 | 
642 | 
643 | class PIBSpectralEntropy(ApplyManyTransform):
644 |     """
645 |     Similar to the calculations in SpectralEntropy transform, but instead power-in-band
646 |     is calculated over the given freq_ranges, finally Shannon entropy is calculated on that.
647 |     The output is a single entropy value per-channel.
648 | 
649 |     NOTE(mike): Input for this transform must be from (FFT(), Magnitude())
650 |     """
651 |     def __init__(self, freq_ranges):
652 |         self.freq_ranges = freq_ranges
653 | 
654 |     def get_name(self):
655 |         return 'pib-spec-ent-%s' % '-'.join([str(f) for f in self.freq_ranges])
656 | 
657 |     def apply_one(self, data, meta=None):
658 |         num_channels = data.shape[0]
659 |         num_time_samples = float((data.shape[-1] - 1) * 2) # revert FFT shape change
660 | 
661 |         def norm(X):
662 |             for ch in X:
663 |                 ch /= np.sum(ch + 1e-12)
664 |             return X
665 | 
666 |         psd = data ** 2
667 |         psd = norm(psd)
668 | 
669 |         # group into bins
670 |         def binned_psd(psd, out):
671 |             prev = freq_ranges[0]
672 |             for i, cur in enumerate(freq_ranges[1:]):
673 |                 prev_index = np.floor((prev / meta.sampling_frequency) * num_time_samples)
674 |                 cur_index = np.floor((cur / meta.sampling_frequency) * num_time_samples)
675 |                 out[i] = np.sum(psd[prev_index:cur_index])
676 |                 prev = cur
677 | 
678 |         freq_ranges = self.freq_ranges
679 |         out = np.empty((num_channels, len(freq_ranges) - 1,))
680 |         for ch in range(num_channels):
681 |             binned_psd(psd[ch], out[ch])
682 | 
683 |         psd_per_bin = norm(out)
684 | 
685 |         def entropy_per_channel(psd):
686 |             entropy_components = psd * np.log2(psd + 1e-12)
687 |             entropy = -np.sum(entropy_components) / np.log2(psd.shape[-1])
688 |             return entropy
689 | 
690 |         out = np.empty((num_channels,))
691 |         for i, ch in enumerate(psd_per_bin):
692 |             out[i] = entropy_per_channel(ch)
693 | 
694 |         return out
695 | 
696 | 
697 | class FreqBinning(ApplyManyTransform):
698 |     """
699 |     Given spectral magnitude data, select a range of bins, and then choose a consolidation function
700 |     to use to calculate each bin. The sum can be used, or the mean, or the standard deviation.
701 | 
702 |     NOTE(mike): Input for this transform must be from (FFT(), Magnitude())
703 |     """
704 |     def __init__(self, freq_ranges, func=None):
705 |         self.freq_ranges = freq_ranges
706 |         assert func is None or func in ('sum', 'mean', 'std')
707 |         self.func = func
708 | 
709 |     def get_name(self):
710 |         return 'fbin%s%s' % ('' if self.func is None else '-' + self.func, '-' + '-'.join([str(f) for f in self.freq_ranges]))
711 | 
712 |     def apply_one(self, X, meta):
713 |         num_channels = X.shape[0]
714 |         num_time_samples = (X.shape[-1] - 1) * 2 # revert FFT shape change
715 | 
716 |         if self.func == 'mean':
717 |             func = np.mean
718 |         elif self.func == 'std':
719 |             func = np.std
720 |         else:
721 |             func = np.sum
722 | 
723 |         # group into bins
724 |         def binned_freq(data, out):
725 |             prev = freq_ranges[0]
726 |             for i, cur in enumerate(freq_ranges[1:]):
727 |                 prev_index = np.floor((prev / meta.sampling_frequency) * num_time_samples)
728 |                 cur_index = np.floor((cur / meta.sampling_frequency) * num_time_samples)
729 |                 out[i] = func(data[prev_index:cur_index])
730 |                 prev = cur
731 | 
732 |         freq_ranges = self.freq_ranges
733 |         out = np.empty((num_channels, len(freq_ranges) - 1,))
734 |         for ch in range(num_channels):
735 |             binned_freq(X[ch], out[ch])
736 | 
737 |         return out
738 | 
739 | 
740 | class AR(ApplyManyTransform):
741 |     """
742 |     Auto-regressive model as suggested by:
743 |     http://hdl.handle.net/1807/33224
744 |     https://tspace.library.utoronto.ca/bitstream/1807/33224/1/Green_Adrian_CA_201211_MASc_thesis.pdf
745 | 
746 |     It is suggested to use a model order of 8.
747 |     """
748 |     def __init__(self, order):
749 |         self.order = order
750 | 
751 |     def get_name(self):
752 |         return 'ar%d' % self.order
753 | 
754 |     def calc_for_ch(self, ch):
755 |         ar_coeffs, dnr, reflection_coeffs = spectrum.aryule(ch, self.order)
756 |         return np.abs(ar_coeffs)
757 | 
758 |     def apply_one(self, X, meta):
759 |         return np.concatenate([self.calc_for_ch(ch) for ch in X], axis=0)
760 | 
761 | 
762 | class SubMean:
763 |     """
764 |     For each feature, subtract from each channel the mean across all channels.
765 |     This is to perform average reference montage.
766 |     """
767 |     def get_name(self):
768 |         return 'subm'
769 | 
770 |     def apply(self, X, meta):
771 |         assert X.ndim == 2
772 |         X -= X.mean(axis=0)
773 |         return X
774 | 
775 | 
776 | def index_for_hz(X, hz, sampling_frequency):
777 |     return int((float(hz) / sampling_frequency) * X.shape[-1])
778 | 
779 | 
780 | class Preprocess:
781 |     """
782 |     Data preprocessing stage to normalize the data across all patients.
783 |     Data that has not had average reference montage applied needs it applied.
784 |     """
785 |     def get_name(self):
786 |         return 'pp'
787 | 
788 |     def apply(self, X, meta):
789 |         # NOTE(mike): Patient 1 and 2 have not subtracted the average reference from their raw data
790 |         # whereas Dogs 1 to 5 have. So bring these two patients into line to normalize the preprocessing
791 |         # across ALL patients.
792 |         if meta.target in ('Patient_1', 'Patient_2'):
793 |             X = SubMean().apply(X, meta)
794 |         return X
795 | 
796 | 
797 | class PhaseSynchrony(ApplyManyTransform):
798 |     """
799 |     Calculate phase synchrony between channels using Hilbert transform and Shannon entropy.
800 | 
801 |     Method described in:
802 |     http://www.researchgate.net/publication/222567264_Comparison_of_Hilbert_transform_and_wavelet_methods_for_the_analysis_of_neuronal_synchrony/links/0deec52baa808a3812000000
803 |     Le Van Quyen M, Foucher J, Lachaux J-P, Rodriguez E, Lutz A, Martinerie JM, Varela FJ (2001)
804 |         Comparison of Hilbert transform and wavelet methods for the analysis of neural synchrony.
805 |         J Neurosci Methods 111:83-98
806 | 
807 |     NOTE(mike): This seemed to work well in cross-validation, but I never got an increased
808 |     on the leaderboard.
809 |     """
810 |     def __init__(self, with_eigen=False, with_raw=True):
811 |         assert with_eigen or with_raw
812 |         self.with_raw = with_raw
813 |         self.with_eigen = with_eigen
814 | 
815 |     def get_name(self):
816 |         return 'phase-synchrony-%s%s' % ('-eigen' if self.with_eigen else '', '-noraw' if not self.with_raw else '')
817 | 
818 |     def apply_one(self, X, meta):
819 |         h = X + (1j * hilbert(X))
820 |         phase = np.angle(h)
821 | 
822 |         num_bins = int(np.exp(0.626 + 0.4 * np.log(X.shape[-1] - 1)))
823 |         Hmax = np.log(num_bins)
824 | 
825 |         num_channels = X.shape[0]
826 |         if self.with_eigen:
827 |             M = np.ones((num_channels, num_channels), dtype=np.float64)
828 |         out = np.empty((num_channels * (num_channels - 1) / 2,), dtype=np.float64)
829 |         count = 0
830 |         for i in range(num_channels):
831 |             for j in range(i + 1, num_channels):
832 |                 ch1_phase = phase[i]
833 |                 ch2_phase = phase[j]
834 | 
835 |                 phase_diff = np.mod(np.abs(ch1_phase - ch2_phase), np.pi * 2.0)
836 | 
837 |                 # convert phase_diff into a pdf of num_bins
838 |                 hist = np.histogram(phase_diff, bins=num_bins)[0]
839 |                 pdf = hist.astype(np.float64) / np.sum(hist)
840 | 
841 |                 H = np.sum(pdf * np.log(pdf + 1e-12))
842 | 
843 |                 p = (H + Hmax) / Hmax
844 | 
845 |                 if self.with_eigen:
846 |                     M[i][j] = p
847 |                     M[j][i] = p
848 |                 out[count] = p
849 |                 count += 1
850 | 
851 |         if self.with_eigen:
852 |             eigen = Eigenvalues().apply_one(M)
853 | 
854 |         if self.with_eigen and self.with_raw:
855 |             return np.concatenate((out, eigen))
856 | 
857 |         if self.with_eigen:
858 |             return eigen
859 |         else:
860 |             return out
861 | 


--------------------------------------------------------------------------------