├── .gitignore
├── LICENSE
├── README.md
├── __init__.py
├── preprocessing_code
│   ├── mel.py
│   ├── sparKULee_loadRAWtestfiles.py
│   ├── sparKULee_loadmwffiles.py
│   ├── sparrKULee.py
│   └── split_and_normalize.py
├── requirements.txt
├── task1_match_mismatch
│   ├── __init__.py
│   ├── experiments
│   │   ├── __init__.py
│   │   ├── dilated_convolutional_model.py
│   │   └── test_match_mismatch.py
│   └── models
│       ├── __init__.py
│       └── dilated_convolutional_model.py
├── task2_regression
│   ├── __init__.py
│   ├── experiments
│   │   ├── __init__.py
│   │   ├── linear_baseline.py
│   │   ├── test_regression.py
│   │   └── vlaai_mel.py
│   └── models
│       ├── __init__.py
│       ├── linear.py
│       └── vlaai.py
└── util
    ├── config.json
    └── dataset_generator.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # From https://github.com/github/gitignore/blob/main/Python.gitignore
2 | # Byte-compiled / optimized / DLL files
3 | __pycache__/
4 | *.py[cod]
5 | *$py.class
6 |
7 | # C extensions
8 | *.so
9 |
10 | # Distribution / packaging
11 | .Python
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | wheels/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 | cover/
54 |
55 | # Translations
56 | *.mo
57 | *.pot
58 |
59 | # Django stuff:
60 | *.log
61 | local_settings.py
62 | db.sqlite3
63 | db.sqlite3-journal
64 |
65 | # Flask stuff:
66 | instance/
67 | .webassets-cache
68 |
69 | # Scrapy stuff:
70 | .scrapy
71 |
72 | # Sphinx documentation
73 | docs/_build/
74 |
75 | # PyBuilder
76 | .pybuilder/
77 | target/
78 |
79 | # Jupyter Notebook
80 | .ipynb_checkpoints
81 |
82 | # IPython
83 | profile_default/
84 | ipython_config.py
85 |
86 | # pyenv
87 | # For a library or package, you might want to ignore these files since the code is
88 | # intended to run in multiple environments; otherwise, check them in:
89 | # .python-version
90 |
91 | # pipenv
92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
95 | # install all needed dependencies.
96 | #Pipfile.lock
97 |
98 | # poetry
99 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
100 | # This is especially recommended for binary packages to ensure reproducibility, and is more
101 | # commonly ignored for libraries.
102 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
103 | #poetry.lock
104 |
105 | # pdm
106 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
107 | #pdm.lock
108 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
109 | # in version control.
110 | # https://pdm.fming.dev/#use-with-ide
111 | .pdm.toml
112 |
113 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
114 | __pypackages__/
115 |
116 | # Celery stuff
117 | celerybeat-schedule
118 | celerybeat.pid
119 |
120 | # SageMath parsed files
121 | *.sage.py
122 |
123 | # Environments
124 | .env
125 | .venv
126 | env/
127 | venv/
128 | ENV/
129 | env.bak/
130 | venv.bak/
131 |
132 | # Spyder project settings
133 | .spyderproject
134 | .spyproject
135 |
136 | # Rope project settings
137 | .ropeproject
138 |
139 | # mkdocs documentation
140 | /site
141 |
142 | # mypy
143 | .mypy_cache/
144 | .dmypy.json
145 | dmypy.json
146 |
147 | # Pyre type checker
148 | .pyre/
149 |
150 | # pytype static type analyzer
151 | .pytype/
152 |
153 | # Cython debug symbols
154 | cython_debug/
155 |
156 | # PyCharm
157 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
158 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
159 | # and can be added to the global gitignore or merged into this file. For a more nuclear
160 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
161 | .idea
162 |
163 | # Specific for this project
164 | task*/experiments/results*/
165 | *condor*
166 | .err
167 | .out
168 | .log
169 | .job
170 | speech-decoding/*
171 | speech-decoding
172 | .m

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Code for the Auditory EEG ICASSP Challenge 2024
2 |
3 | Copyright (C) 2022 ExpORL
4 |
5 | This program is free software: you can redistribute it and/or modify
6 | it under the terms of the GNU General Public License as published by
7 | the Free Software Foundation, either version 3 of the License, or
8 | (at your option) any later version.
9 |
10 | This program is distributed in the hope that it will be useful,
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | GNU General Public License for more details.
14 |
15 | You should have received a copy of the GNU General Public License
16 | along with this program. If not, see <https://www.gnu.org/licenses/>.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Auditory-eeg-challenge-2024-code
2 | ================================
3 | This is the codebase for the [2024 ICASSP Auditory EEG challenge](https://exporl.github.io/auditory-eeg-challenge-2024).
4 | This codebase contains baseline models and code to preprocess stimuli for both tasks.
5 |
6 | # Prerequisites
7 |
8 | Python >= 3.6
9 |
10 | # General setup
11 |
12 | Steps to get a working setup:
13 |
14 | ## 1. Clone this repository and install the [requirements.txt](requirements.txt)
15 | ```bash
16 | # Clone this repository
17 | git clone https://github.com/exporl/auditory-eeg-challenge-2024-code
18 |
19 | # Go to the root folder
20 | cd auditory-eeg-challenge-2024-code
21 |
22 | # Optional: install a virtual environment
23 | python3 -m venv venv # Optional
24 | source venv/bin/activate # Optional
25 |
26 | # Install the requirements
27 | python3 -m pip install -r requirements.txt
28 | ```
29 |
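You can optionally verify that the main dependencies import correctly (a quick sanity check, not part of the official setup):

```bash
python3 -c "import tensorflow, librosa, numpy, scipy; print(tensorflow.__version__)"
```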
30 | ## 2. [Download the data](https://homes.esat.kuleuven.be/~lbollens/)
31 |
32 | You will need a password, which you will receive when you [register](https://exporl.github.io/auditory-eeg-challenge-2024/registration/).
33 | The folder contains multiple folders (and `zip` files containing the same data as their corresponding folders). For bulk downloading, we recommend using the `zip` files.
34 |
35 | 1. `split_data(.zip)` contains already preprocessed, split and normalized data, ready for model training/evaluation.
36 | If you want to get started quickly, you can opt to only download this folder/zipfile.
37 |
38 | 2. `preprocessed_eeg(.zip)` and `preprocessed_stimuli(.zip)` contain preprocessed EEG and stimuli files (envelope and mel features), respectively.
39 | At this stage, the data has not yet been split into sets and normalized. To go from this to the data in `split_data`, run the `split_and_normalize.py` script ([preprocessing_code/split_and_normalize.py](./preprocessing_code/split_and_normalize.py)).
40 |
41 | 3. `sub_*(.zip)` and `stimuli(.zip)` contain the raw EEG and stimuli files.
42 | If you want to recreate the preprocessing steps, download these files, run `sparrKULee.py` ([preprocessing_code/sparrKULee.py](./preprocessing_code/sparrKULee.py)) to preprocess the EEG and stimuli, and then run the `split_and_normalize.py` script to split and normalize the data.
43 | You can adapt the preprocessing steps in `sparrKULee.py` to your own needs by adding/removing steps. For more detailed information on the pipeline, see the [brain_pipe documentation](https://exporl.github.io/brain_pipe/).
44 |
45 |
46 | Note that it is possible to use the same preprocessed (and split) dataset for both task 1 and task 2, but it is not required.
47 |
48 |
49 |
50 | ## 3. Adjust the `config.json` accordingly
51 |
52 | There is a general `config.json` defining the folder names and structure for the data (i.e. [util/config.json](./util/config.json)).
53 | Adjust `dataset_folder` in the `config.json` file from `null` to the absolute path to the folder containing all data (the `challenge_folder` from the previous point).
54 | If you download the whole dataset and keep the BIDS structure, the folders `preprocessed_eeg`, `preprocessed_stimuli` and `split_data` will be located inside the `derivatives` folder. If you only download these three folders, make sure they are placed in a `derivatives` subfolder, or change the `derivatives` folder in the config; otherwise you will get a file-not-found error when trying to run the experiments.
55 |
56 |
57 | OK, you should be all set up now!
58 |
59 |
60 |
61 | # Running the tasks
62 |
63 | Each task comes with ready-to-go experiment files that provide a
64 | baseline and get you acquainted with the problem. The experiment files live
65 | in the `experiments` subfolder of each task. The training log,
66 | best model and evaluation results will be stored in a folder called
67 | `results_{experiment_name}`. For general ideas, you might want to look at the winners of the
68 | [previous ICASSP auditory EEG challenge](https://exporl.github.io/auditory-eeg-challenge-2023).
69 |
70 | ## Task1: Match-mismatch
71 |
72 | By running [task1_match_mismatch/experiments/dilated_convolutional_model.py](./task1_match_mismatch/experiments/dilated_convolutional_model.py),
73 | you can train the dilated convolutional model introduced by Accou et al. [(2021a)](https://doi.org/10.23919/Eusipco47968.2020.9287417) and [(2021b)](https://doi.org/10.1088/1741-2552/ac33e9).
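For example, once `config.json` points at your data folder, the baseline can be run from the repository root (the script takes no command-line arguments; whether it trains a new model or only evaluates a stored one is controlled by the `only_evaluate` flag inside the script):

```bash
python3 task1_match_mismatch/experiments/dilated_convolutional_model.py
```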
74 |
75 | Other models you might find interesting are [de Cheveigné et al. (2018)](https://www.sciencedirect.com/science/article/pii/S1053811918300338), [Monesi et al. (2020)](https://ieeexplore.ieee.org/abstract/document/9054000), [Monesi et al. (2021)](https://arxiv.org/abs/2106.09622), ...
76 |
77 |
78 |
79 | ## Task2: Regression (reconstructing spectrogram from EEG)
80 |
81 | By running [task2_regression/experiments/linear_baseline.py](./task2_regression/experiments/linear_baseline.py), you can
82 | train and evaluate a simple linear baseline model with Pearson correlation as a loss function, similar to the baseline model used in [Accou et al. (2022)](https://www.biorxiv.org/content/10.1101/2022.09.28.509945).
83 |
84 | By running [task2_regression/experiments/vlaai_mel.py](./task2_regression/experiments/vlaai_mel.py), you can train/evaluate
85 | the VLAAI model as proposed by [Accou et al. (2022)](https://www.biorxiv.org/content/10.1101/2022.09.28.509945). You can find a pre-trained model at [VLAAI's github page](https://github.com/exporl/vlaai).
86 |
87 | Other models you might find interesting are: [Thornton et al. (2022)](https://iopscience.iop.org/article/10.1088/1741-2552/ac7976), ...
88 |

--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
1 | """Code for the ICASSP 2024 auditory EEG challenge."""
2 |

--------------------------------------------------------------------------------
/preprocessing_code/mel.py:
--------------------------------------------------------------------------------
1 | """Code to calculate mel spectrograms."""
2 | import math
3 |
4 | import librosa
5 | import numpy as np
6 | import scipy.signal
7 |
8 | def calculate_mel_spectrogram(
9 |     audio_path,
10 |     target_fs=64,
11 |     fmin=0,
12 |     fmax=5000,
13 |     nb_filters=10,
14 |     hop_length=None,
15 |     win_length=None,
16 | ):
17 |     """Calculates the mel spectrogram of a raw speech file. This function makes the same calculation as
18 |     in the sparrKULee pipeline and is the regression objective for task 2.
19 |
20 |     Parameters
21 |     ----------
22 |     audio_path: str
23 |         Audio file path.
24 |     target_fs: int
25 |         Sampling frequency of the calculated mel spectrogram.
26 |     fmin: Union[float, int]
27 |         Minimum center frequency used in the mel filter matrix.
28 |     fmax: Union[float, int]
29 |         Maximum center frequency used in the mel filter matrix.
30 |     nb_filters: int
31 |         Number of mel spectrogram frequency bands.
32 |     hop_length: int
33 |         Hop length (in samples) used for calculation of the spectrogram.
34 |     win_length: int
35 |         Window length (in samples) of each frame.
36 |
37 |     Returns
38 |     -------
39 |     numpy.ndarray
40 |         Mel spectrogram.
41 |     """
42 |
43 |     # Load the audio and its sampling rate from the .npz archive
44 |
45 |
46 |     speech = dict(np.load(audio_path))
47 |     audio, fs = speech["audio"], speech["fs"]
48 |     if not hop_length:
49 |         hop_length = int((1 / target_fs) * fs)  # this yields a frame rate of target_fs Hz
50 |     if not win_length:
51 |         win_length = int(0.025 * fs)  # 25 ms
52 |
53 |     # Find the smallest power of 2
54 |     # that is at least win_length
55 |     n_fft = int(math.pow(2, math.ceil(math.log2(win_length))))
56 |
57 |     # DC removal
58 |     audio = audio - np.mean(audio)
59 |
60 |     mel_spectrogram = librosa.feature.melspectrogram(y=audio, window='hann',
61 |                                                      sr=fs, n_fft=n_fft, hop_length=hop_length,
62 |                                                      win_length=win_length, fmin=fmin, fmax=fmax, htk=False, norm='slaney',
63 |                                                      n_mels=nb_filters, center=False)
64 |
65 |
66 |     return mel_spectrogram
67 |
68 |
69 |
70 | # 'Center freqs' of mel bands - uniformly spaced between limits
71 | # mel_f: [ 0. , 147.02442191, 324.92910187, 540.19997145,
72 | # 800.6852341 , 1115.88148983, 1497.27995596, 1958.78540639,
73 | # 2517.22310262, 3192.95219807, 4010.6079787 , 5000. ]
74 |

--------------------------------------------------------------------------------
/preprocessing_code/sparKULee_loadRAWtestfiles.py:
--------------------------------------------------------------------------------
1 | """Run the default preprocessing pipeline on sparrKULee.
2 | This script runs the necessary preprocessing steps on the sparrKULee dataset, starting from the raw caches,
3 | to arrive at the fully preprocessed files.
4 | The raw caches are downloaded from the challenge website and should be placed in the folder specified by the
5 | raw_eeg_dir variable.
6 | The preprocessed EEG will be saved in the folder specified by the preprocessed_eeg_dir variable.
7 | The caches have been synchronized with the stimulus data and should all have a length of 5 seconds.
8 |
9 | On the raw caches, the following preprocessing steps have been performed
10 | (i.e. synchronized with the stimulus data
11 | and loaded into Python):
12 |
13 | You are free to use these caches for your own preprocessing pipeline;
14 | however, if you want to use certain artifact steps, such as the artifact removal MWF,
15 | be aware that the output from these steps might differ when using input windows of just 5 seconds.
16 | If you want to use our artifact removal steps, we recommend using the MWF caches,
17 | on which the artifact removal MWF has already been performed, rather than computing it yourself.
18 | The MWF caches are saved at 1024 Hz; see sparKULee_loadmwffiles.py for more information on how to use these caches.
19 |
20 | eeg_steps = [
21 |     LinkStimulusToBrainResponse(
22 |         stimulus_data=stimulus_steps,
23 |         extract_stimuli_information_fn=BIDSAPRStimulusInfoExtractor(),
24 |         grouper=BIDSStimulusGrouper(
25 |             bids_root=root_dir,
26 |             mapping={"stim_file": "stimulus_path", "trigger_file": "trigger_path"},
27 |             subfolders=["stimuli", "eeg"],
28 |         ),
29 |     ),
30 |     LoadEEGNumpy(unit_multiplier=1e6, channels_to_select=list(range(64))),
31 |     AlignPeriodicBlockTriggers(biosemi_trigger_processing_fn),
32 |     DefaultSave(raw_eeg_dir,
33 |                 {'eeg': 'data'},
34 |                 filename_fn=bids_filename_fn,
35 |                 clear_output=True,
36 |                 overwrite=overwrite),
37 |
38 | ]
39 | """
40 | import argparse
41 | import datetime
42 | import gzip
43 | import json
44 | import logging
45 | import os
46 | import glob
47 | from typing import Any, Dict, Sequence
48 |
49 | import librosa
50 | import numpy as np
51 | import math
52 | import scipy.signal.windows
53 | from brain_pipe.dataloaders.path import GlobLoader
54 | from brain_pipe.pipeline.default import DefaultPipeline
55 | from brain_pipe.preprocessing.brain.artifact import (
56 |     InterpolateArtifacts,
57 |     ArtifactRemovalMWF,
58 | )
59 | from brain_pipe.preprocessing.brain.eeg.biosemi import (
60 |     biosemi_trigger_processing_fn,
61 | )
62 | from brain_pipe.preprocessing.brain.eeg.load import LoadEEGNumpy
63 | from brain_pipe.preprocessing.brain.epochs import SplitEpochs
64 | from brain_pipe.preprocessing.brain.link import (
65 |     LinkStimulusToBrainResponse,
66 |     BIDSStimulusInfoExtractor,
67 | )
68 | from brain_pipe.preprocessing.brain.rereference import CommonAverageRereference
69 | from brain_pipe.preprocessing.brain.trigger import (
70 |     AlignPeriodicBlockTriggers,
71 | )
72 | from brain_pipe.preprocessing.filter import SosFiltFilt
73 | from brain_pipe.preprocessing.resample import ResamplePoly
74 | from brain_pipe.preprocessing.stimulus.audio.envelope import GammatoneEnvelope
75 | from brain_pipe.preprocessing.stimulus.audio.spectrogram import LibrosaMelSpectrogram
76 |
77 | from brain_pipe.preprocessing.stimulus.load import LoadStimuli
78 | from brain_pipe.runner.default import DefaultRunner
79 | from brain_pipe.save.default import DefaultSave
80 | # from mel import DefaultSave
81 | from brain_pipe.utils.log import default_logging, DefaultFormatter
82 | from brain_pipe.utils.path import BIDSStimulusGrouper
83 |
84 | from typing import Dict, Any, Sequence, Optional, Union, Mapping
85 |
86 | import numpy as np
87 |
88 | from brain_pipe.pipeline.base import PipelineStep
89 |
90 |
91 | class LoadEEGNumpyTest(PipelineStep):
92 |     """Load EEG data from NumPy caches.
93 |
94 |     This step loads the raw EEG caches stored as .npy files.
95 |     """
96 |
97 |     def __init__(
98 |         self, keys={"data_path": "data"}, copy_data_dict=False, *mne_args, **mne_kwargs
99 |     ):
100 |         """Create a new LoadEEGNumpyTest instance.
101 |
102 |         Parameters
103 |         ----------
104 |         keys: Dict[str, str]
105 |             Mapping from the data dict key holding the path of the EEG cache
106 |             to the key under which the loaded data is stored.
107 |         copy_data_dict: bool
108 |         """
109 |         super().__init__(copy_data_dict=copy_data_dict)
110 |         self.keys = self.parse_dict_keys(keys, "keys")
111 |         self.mne_args = mne_args
112 |         self.mne_kwargs = mne_kwargs
113 |
114 |
115 |     def __call__(self, data_dict: Dict[str, Any]) -> Dict[str, Any]:
116 |         """Load EEG data from a .npy file.
117 |
118 |         Parameters
119 |         ----------
120 |         data_dict: Dict[str, Any]
121 |             The data dict containing the EEG path.
122 |
123 |         Returns
124 |         -------
125 |         Dict[str, Any]
126 |             The data dict with the EEG data and the EEG info.
127 | """ 128 | for from_key, to_key in self.keys.items(): 129 | path = data_dict[from_key] 130 | 131 | # Support for gzipped files. 132 | raw =np.load(path) 133 | # swap axes 134 | raw = np.swapaxes(raw, 0, 1) 135 | 136 | 137 | data_dict['data'] = raw 138 | data_dict['eeg_key'] = os.path.basename(path) 139 | 140 | data_dict['data_fs'] = 1024 141 | 142 | return data_dict 143 | 144 | class BIDSAPRStimulusInfoExtractor(BIDSStimulusInfoExtractor): 145 | """Extract BIDS compliant stimulus information from an .apr file.""" 146 | 147 | def __call__(self, brain_dict: Dict[str, Any]): 148 | """Extract BIDS compliant stimulus information from an events.tsv file. 149 | 150 | Parameters 151 | ---------- 152 | brain_dict: Dict[str, Any] 153 | The data dict containing the brain data path. 154 | 155 | Returns 156 | ------- 157 | Sequence[Dict[str, Any]] 158 | The extracted event information. Each dict contains the information 159 | of one row in the events.tsv file 160 | """ 161 | event_info = super().__call__(brain_dict) 162 | # Find the apr file 163 | path = brain_dict[self.brain_path_key] 164 | apr_path = "_".join(path.split("_")[:-1]) + "_eeg.apr" 165 | # Read apr file 166 | apr_data = self.get_apr_data(apr_path) 167 | # Add apr data to event info 168 | for e_i in event_info: 169 | e_i.update(apr_data) 170 | return event_info 171 | 172 | def get_apr_data(self, apr_path: str): 173 | """Get the SNR from an .apr file. 174 | 175 | Parameters 176 | ---------- 177 | apr_path: str 178 | Path to the .apr file. 179 | 180 | Returns 181 | ------- 182 | Dict[str, Any] 183 | The SNR. 184 | """ 185 | import xml.etree.ElementTree as ET 186 | 187 | apr_data = {} 188 | tree = ET.parse(apr_path) 189 | root = tree.getroot() 190 | 191 | # Get SNR 192 | interactive_elements = root.findall(".//interactive/entry") 193 | for element in interactive_elements: 194 | description_element = element.find("description") 195 | if description_element.text == "SNR": 196 | apr_data["snr"] = element.find("new_value").text 197 | if "snr" not in apr_data: 198 | logging.warning(f"Could not find SNR in {apr_path}.") 199 | apr_data["snr"] = 100.0 200 | return apr_data 201 | 202 | 203 | def test_filename_fn(data_dict, feature_name, set_name=None): 204 | """Default function to generate a filename for the data. 205 | 206 | Parameters 207 | ---------- 208 | data_dict: Dict[str, Any] 209 | The data dict containing the data to save. 210 | feature_name: str 211 | The name of the feature. 212 | set_name: Optional[str] 213 | The name of the set. If no set name is given, the set name is not 214 | included in the filename. 215 | 216 | Returns 217 | ------- 218 | str 219 | The filename. 220 | """ 221 | 222 | 223 | eeg_key = data_dict['eeg_key'] 224 | 225 | return eeg_key 226 | 227 | 228 | def temp_unpack_data(data_path): 229 | data = dict(np.load(data_path)) 230 | # save all keys, values in separate data path. TODO: fix this such that we don't have to do this 231 | for key, value in data.items(): 232 | np.save(os.path.dirname(data_path) + key + '.npy', value) 233 | 234 | 235 | def run_preprocessing_pipeline( 236 | root_dir, 237 | preprocessed_eeg_dir, 238 | nb_processes=4, 239 | overwrite=False, 240 | log_path="sparrKULee.log", 241 | ): 242 | """Construct and run the preprocessing on SparrKULee. 243 | 244 | Parameters 245 | ---------- 246 | root_dir: str 247 | The root directory of the dataset. 248 | preprocessed_eeg_dir: 249 | The directory where the preprocessed EEG should be saved. 250 | nb_processes: int 251 | The number of processes to use. 
If -1, the number of processes is 252 | automatically determined. 253 | overwrite: bool 254 | Whether to overwrite existing files. 255 | log_path: str 256 | The path to the log file. 257 | """ 258 | ######### 259 | # PATHS # 260 | ######### 261 | os.makedirs(preprocessed_eeg_dir, exist_ok=True) 262 | 263 | ########### 264 | # LOGGING # 265 | ########### 266 | handler = logging.FileHandler(log_path) 267 | handler.setLevel(logging.DEBUG) 268 | handler.setFormatter(DefaultFormatter()) 269 | default_logging(handlers=[handler]) 270 | 271 | ################ 272 | # DATA LOADING # 273 | ################ 274 | logging.info("Retrieving layout...") 275 | data_loader = GlobLoader( 276 | [os.path.join(root_dir, "sub*.npy")], 277 | filter_fns=[], 278 | key="data_path", 279 | ) 280 | 281 | 282 | ######################### 283 | # RUNNING THE PIPELINE # 284 | ######################### 285 | 286 | logging.info("Starting with the EEG preprocessing") 287 | logging.info("===================================") 288 | 289 | 290 | eeg_steps = [ 291 | LoadEEGNumpyTest(), 292 | SosFiltFilt( 293 | scipy.signal.butter(1, 0.5, "highpass", fs=1024, output="sos"), 294 | emulate_matlab=True, 295 | axis=1, 296 | ), 297 | InterpolateArtifacts(), 298 | ArtifactRemovalMWF(), 299 | CommonAverageRereference(), 300 | ResamplePoly(64, axis=1), 301 | DefaultSave( 302 | preprocessed_eeg_dir, 303 | {"eeg": "data"}, 304 | overwrite=overwrite, 305 | clear_output=True, 306 | filename_fn=test_filename_fn, 307 | ), 308 | ] 309 | 310 | ######################### 311 | # RUNNING THE PIPELINE # 312 | ######################### 313 | 314 | logging.info("Starting with the EEG preprocessing") 315 | logging.info("===================================") 316 | 317 | # Create data_dicts for the EEG files 318 | # Create the EEG pipeline 319 | eeg_pipeline = DefaultPipeline(steps=eeg_steps) 320 | 321 | DefaultRunner( 322 | nb_processes=nb_processes, 323 | logging_config=lambda: None, 324 | ).run( 325 | [(data_loader, eeg_pipeline)], 326 | 327 | ) 328 | 329 | 330 | if __name__ == "__main__": 331 | # Load the config 332 | # get the top folder of the dataset 333 | challenge_folder = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 334 | with open(os.path.join(challenge_folder, 'util', 'config.json'), "r") as f: 335 | config = json.load(f) 336 | 337 | # Set the correct paths as default arguments 338 | dataset_folder = config["dataset_folder"] 339 | test_folder = os.path.join(dataset_folder, config["test_folder"]) 340 | task = 'TASK1_match_mismatch' # [' TASK1_match_mismatch', 'TASK2_regression'] 341 | 342 | 343 | preprocessed_eeg_folder = os.path.join( 344 | test_folder, task, f'{config["preprocessed_eeg_folder"]}' 345 | ) 346 | raw_eeg_dir = os.path.join(test_folder, task, 'raw_eeg') 347 | # unpack the data 348 | 349 | raw_eeg_data = glob.glob(os.path.join(raw_eeg_dir, '*_eeg_raw.npz')) 350 | for data_path in raw_eeg_data: 351 | print(f'processing {data_path}') 352 | temp_unpack_data(data_path) 353 | 354 | 355 | default_log_folder = os.path.dirname(os.path.abspath(__file__)) 356 | 357 | # Parse arguments from the command line 358 | parser = argparse.ArgumentParser(description="Preprocess the auditory EEG dataset") 359 | parser.add_argument( 360 | "--nb_processes", 361 | type=int, 362 | default=1, 363 | help="Number of processes to use for the preprocessing. 
" 364 | "The default is to use all available cores (-1).", 365 | ) 366 | parser.add_argument( 367 | "--overwrite", action="store_true", help="Overwrite existing files" 368 | ) 369 | parser.add_argument( 370 | "--log_path", type=str, default=os.path.join( 371 | default_log_folder, 372 | "sparrKULee_{datetime}.log" 373 | ) 374 | ) 375 | parser.add_argument( 376 | "--dataset_folder", 377 | type=str, 378 | default=raw_eeg_dir, 379 | help="Path to the folder where the dataset is downloaded", 380 | ) 381 | 382 | parser.add_argument( 383 | "--preprocessed_raw_eeg_path", 384 | type=str, 385 | default=preprocessed_eeg_folder, 386 | help="Path to the folder where the preprocessed EEG will be saved", 387 | ) 388 | args = parser.parse_args() 389 | 390 | # Run the preprocessing pipeline 391 | run_preprocessing_pipeline( 392 | args.dataset_folder, 393 | args.preprocessed_raw_eeg_path, 394 | args.nb_processes, 395 | args.overwrite, 396 | args.log_path.format( 397 | datetime=datetime.datetime.now().strftime("%Y%m%d_%H%M%S") 398 | ), 399 | ) -------------------------------------------------------------------------------- /preprocessing_code/sparKULee_loadmwffiles.py: -------------------------------------------------------------------------------- 1 | """Run the default preprocessing pipeline on sparrKULee. 2 | This script runs the necessary prereprocessing steps on the sparrKULee dataset, starting from the MFW caches, 3 | to arrive at the fully preprocessed files. 4 | The MWF caches are downloaded from the challenge website and should be placed in the folder specified by the 5 | raw_eeg_dir variable. 6 | The preprocessed EEG will be saved in the folder specified by the preprocessed_eeg_dir variable. 7 | The caches have been synchronized with the stimulus data and should all have a length of 5 seconds. 
8 | 9 | On the MWF caches, the following preprocessing steps have been performed: 10 | ( see the original SparrKULee.py file for reference as to what these steps do) 11 | eeg_steps = [ 12 | LinkStimulusToBrainResponse( 13 | stimulus_data=stimulus_steps, 14 | extract_stimuli_information_fn=BIDSAPRStimulusInfoExtractor(), 15 | grouper=BIDSStimulusGrouper( 16 | bids_root=root_dir, 17 | mapping={"stim_file": "stimulus_path", "trigger_file": "trigger_path"}, 18 | subfolders=["stimuli", "eeg"], 19 | ), 20 | ), 21 | LoadEEGNumpy(unit_multiplier=1e6, channels_to_select=list(range(64))), 22 | SosFiltFilt( 23 | scipy.signal.butter(1, 0.5, "highpass", fs=1024, output="sos"), 24 | emulate_matlab=True, 25 | axis=1, 26 | ), 27 | InterpolateArtifacts(), 28 | AlignPeriodicBlockTriggers(biosemi_trigger_processing_fn), 29 | SplitEpochs(), 30 | ArtifactRemovalMWF(), 31 | DefaultSave(after_wiener_filter_dir, 32 | {'eeg': 'data'}, 33 | filename_fn=bids_filename_fn, 34 | clear_output=True, 35 | overwrite=overwrite), 36 | ] 37 | 38 | 39 | 40 | """ 41 | import argparse 42 | import datetime 43 | import gzip 44 | import json 45 | import logging 46 | import os 47 | from typing import Any, Dict, Sequence 48 | 49 | import librosa 50 | import numpy as np 51 | import math 52 | import scipy.signal.windows 53 | from brain_pipe.dataloaders.path import GlobLoader 54 | from brain_pipe.pipeline.default import DefaultPipeline 55 | from brain_pipe.preprocessing.brain.artifact import ( 56 | InterpolateArtifacts, 57 | ArtifactRemovalMWF, 58 | ) 59 | from brain_pipe.preprocessing.brain.eeg.biosemi import ( 60 | biosemi_trigger_processing_fn, 61 | ) 62 | from brain_pipe.preprocessing.brain.eeg.load import LoadEEGNumpy 63 | from brain_pipe.preprocessing.brain.epochs import SplitEpochs 64 | from brain_pipe.preprocessing.brain.link import ( 65 | LinkStimulusToBrainResponse, 66 | BIDSStimulusInfoExtractor, 67 | ) 68 | from brain_pipe.preprocessing.brain.rereference import CommonAverageRereference 69 | from brain_pipe.preprocessing.brain.trigger import ( 70 | AlignPeriodicBlockTriggers, 71 | ) 72 | from brain_pipe.preprocessing.filter import SosFiltFilt 73 | from brain_pipe.preprocessing.resample import ResamplePoly 74 | from brain_pipe.preprocessing.stimulus.audio.envelope import GammatoneEnvelope 75 | from brain_pipe.preprocessing.stimulus.audio.spectrogram import LibrosaMelSpectrogram 76 | 77 | from brain_pipe.preprocessing.stimulus.load import LoadStimuli 78 | from brain_pipe.runner.default import DefaultRunner 79 | from brain_pipe.save.default import DefaultSave 80 | # from mel import DefaultSave 81 | from brain_pipe.utils.log import default_logging, DefaultFormatter 82 | from brain_pipe.utils.path import BIDSStimulusGrouper 83 | 84 | from typing import Dict, Any, Sequence, Optional, Union, Mapping 85 | 86 | import numpy as np 87 | 88 | from brain_pipe.pipeline.base import PipelineStep 89 | import glob 90 | 91 | class LoadEEGNumpyTest(PipelineStep): 92 | """Load EEG data. 93 | 94 | This step uses MNE to load EEG data. 95 | """ 96 | 97 | def __init__( 98 | self, keys={"data_path": "data"}, copy_data_dict=False, *mne_args, **mne_kwargs 99 | ): 100 | """Create a new LoadEEG instance. 101 | 102 | Parameters 103 | ---------- 104 | eeg_path_key: str 105 | The key of the EEG path in the data dict. 106 | eeg_data_key: str 107 | The key of the EEG data in the data dict. 
108 | """ 109 | super().__init__(copy_data_dict=copy_data_dict) 110 | self.keys = self.parse_dict_keys(keys, "keys") 111 | self.mne_args = mne_args 112 | self.mne_kwargs = mne_kwargs 113 | 114 | 115 | def __call__(self, data_dict: Dict[str, Any]) -> Dict[str, Any]: 116 | """Load EEG data from a npy file. 117 | 118 | Parameters 119 | ---------- 120 | data_dict: Dict[str, Any] 121 | The data dict containing the EEG path. 122 | 123 | Returns 124 | ------- 125 | Dict[str, Any] 126 | The data dict with the EEG data and the EEG info. 127 | """ 128 | for from_key, to_key in self.keys.items(): 129 | path = data_dict[from_key] 130 | 131 | # Support for gzipped files. 132 | raw =np.load(path) 133 | # swap axes 134 | raw = np.swapaxes(raw, 0, 1) 135 | 136 | 137 | data_dict['data'] = raw 138 | data_dict['eeg_key'] = os.path.basename(path) 139 | 140 | data_dict['data_fs'] = 1024 141 | 142 | return data_dict 143 | 144 | class BIDSAPRStimulusInfoExtractor(BIDSStimulusInfoExtractor): 145 | """Extract BIDS compliant stimulus information from an .apr file.""" 146 | 147 | def __call__(self, brain_dict: Dict[str, Any]): 148 | """Extract BIDS compliant stimulus information from an events.tsv file. 149 | 150 | Parameters 151 | ---------- 152 | brain_dict: Dict[str, Any] 153 | The data dict containing the brain data path. 154 | 155 | Returns 156 | ------- 157 | Sequence[Dict[str, Any]] 158 | The extracted event information. Each dict contains the information 159 | of one row in the events.tsv file 160 | """ 161 | event_info = super().__call__(brain_dict) 162 | # Find the apr file 163 | path = brain_dict[self.brain_path_key] 164 | apr_path = "_".join(path.split("_")[:-1]) + "_eeg.apr" 165 | # Read apr file 166 | apr_data = self.get_apr_data(apr_path) 167 | # Add apr data to event info 168 | for e_i in event_info: 169 | e_i.update(apr_data) 170 | return event_info 171 | 172 | def get_apr_data(self, apr_path: str): 173 | """Get the SNR from an .apr file. 174 | 175 | Parameters 176 | ---------- 177 | apr_path: str 178 | Path to the .apr file. 179 | 180 | Returns 181 | ------- 182 | Dict[str, Any] 183 | The SNR. 184 | """ 185 | import xml.etree.ElementTree as ET 186 | 187 | apr_data = {} 188 | tree = ET.parse(apr_path) 189 | root = tree.getroot() 190 | 191 | # Get SNR 192 | interactive_elements = root.findall(".//interactive/entry") 193 | for element in interactive_elements: 194 | description_element = element.find("description") 195 | if description_element.text == "SNR": 196 | apr_data["snr"] = element.find("new_value").text 197 | if "snr" not in apr_data: 198 | logging.warning(f"Could not find SNR in {apr_path}.") 199 | apr_data["snr"] = 100.0 200 | return apr_data 201 | 202 | 203 | def test_filename_fn(data_dict, feature_name, set_name=None): 204 | """Default function to generate a filename for the data. 205 | 206 | Parameters 207 | ---------- 208 | data_dict: Dict[str, Any] 209 | The data dict containing the data to save. 210 | feature_name: str 211 | The name of the feature. 212 | set_name: Optional[str] 213 | The name of the set. If no set name is given, the set name is not 214 | included in the filename. 215 | 216 | Returns 217 | ------- 218 | str 219 | The filename. 220 | """ 221 | 222 | 223 | eeg_key = data_dict['eeg_key'] 224 | 225 | return eeg_key 226 | 227 | 228 | def temp_unpack_data(data_path): 229 | data = dict(np.load(data_path)) 230 | # save all keys, values in separate data path. 
231 | for key, value in data.items(): 232 | np.save(os.path.join(os.path.dirname(data_path) , key + '.npy'), value) 233 | 234 | 235 | def run_preprocessing_pipeline( 236 | root_dir, 237 | preprocessed_eeg_dir, 238 | nb_processes=4, 239 | overwrite=False, 240 | log_path="sparrKULee.log", 241 | ): 242 | """Construct and run the preprocessing on SparrKULee. 243 | 244 | Parameters 245 | ---------- 246 | root_dir: str 247 | The root directory of the dataset. 248 | preprocessed_eeg_dir: 249 | The directory where the preprocessed EEG should be saved. 250 | nb_processes: int 251 | The number of processes to use. If -1, the number of processes is 252 | automatically determined. 253 | overwrite: bool 254 | Whether to overwrite existing files. 255 | log_path: str 256 | The path to the log file. 257 | """ 258 | ######### 259 | # PATHS # 260 | ######### 261 | os.makedirs(preprocessed_eeg_dir, exist_ok=True) 262 | 263 | ########### 264 | # LOGGING # 265 | ########### 266 | handler = logging.FileHandler(log_path) 267 | handler.setLevel(logging.DEBUG) 268 | handler.setFormatter(DefaultFormatter()) 269 | default_logging(handlers=[handler]) 270 | 271 | ################ 272 | # DATA LOADING # 273 | ################ 274 | logging.info("Retrieving layout...") 275 | data_loader = GlobLoader( 276 | [os.path.join(root_dir, "sub*.npy")], 277 | filter_fns=[], 278 | key="data_path", 279 | ) 280 | 281 | ######################### 282 | # RUNNING THE PIPELINE # 283 | ######################### 284 | 285 | logging.info("Starting with the EEG preprocessing") 286 | logging.info("===================================") 287 | 288 | eeg_steps = [ 289 | LoadEEGNumpyTest(), 290 | CommonAverageRereference(), 291 | ResamplePoly(64, axis=1), 292 | DefaultSave( 293 | preprocessed_eeg_dir, 294 | {"eeg": "data"}, 295 | overwrite=overwrite, 296 | clear_output=True, 297 | filename_fn=test_filename_fn, 298 | ), 299 | ] 300 | 301 | ######################### 302 | # RUNNING THE PIPELINE # 303 | ######################### 304 | 305 | logging.info("Starting with the EEG preprocessing") 306 | logging.info("===================================") 307 | 308 | # Create data_dicts for the EEG files 309 | # Create the EEG pipeline 310 | eeg_pipeline = DefaultPipeline(steps=eeg_steps) 311 | 312 | DefaultRunner( 313 | nb_processes=nb_processes, 314 | logging_config=lambda: None, 315 | ).run( 316 | [(data_loader, eeg_pipeline)], 317 | 318 | ) 319 | 320 | 321 | if __name__ == "__main__": 322 | # Load the config 323 | # get the top folder of the dataset 324 | challenge_folder = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 325 | with open(os.path.join(challenge_folder, 'util', 'config.json'), "r") as f: 326 | config = json.load(f) 327 | 328 | # Set the correct paths as default arguments 329 | dataset_folder = config["dataset_folder"] 330 | test_folder = os.path.join(dataset_folder, config["test_folder"]) 331 | task = 'TASK1_match_mismatch' # [' TASK1_match_mismatch', 'TASK2_regression'] 332 | 333 | preprocessed_eeg_folder = os.path.join( 334 | test_folder, task, f'{config["preprocessed_eeg_folder"]}' 335 | ) 336 | raw_eeg_dir = os.path.join(test_folder, task, 'MWFilter_eeg') 337 | # unpack the data 338 | 339 | raw_eeg_data = glob.glob(os.path.join(raw_eeg_dir, '*_mwf.npz')) 340 | for data_path in raw_eeg_data: 341 | print(f'processing {data_path}') 342 | temp_unpack_data(data_path) 343 | 344 | default_log_folder = os.path.dirname(os.path.abspath(__file__)) 345 | # Parse arguments from the command line 346 | parser = 
argparse.ArgumentParser(description="Preprocess the auditory EEG dataset")
347 |     parser.add_argument(
348 |         "--nb_processes",
349 |         type=int,
350 |         default=1,
351 |         help="Number of processes to use for the preprocessing. "
352 |              "Defaults to 1; set to -1 to use all available cores.",
353 |     )
354 |     parser.add_argument(
355 |         "--overwrite", action="store_true", help="Overwrite existing files"
356 |     )
357 |     parser.add_argument(
358 |         "--log_path", type=str, default=os.path.join(
359 |             default_log_folder,
360 |             "sparrKULee_{datetime}.log"
361 |         )
362 |     )
363 |     parser.add_argument(
364 |         "--dataset_folder",
365 |         type=str,
366 |         default=raw_eeg_dir,
367 |         help="Path to the folder where the dataset is downloaded",
368 |     )
369 |
370 |     parser.add_argument(
371 |         "--preprocessed_raw_eeg_path",
372 |         type=str,
373 |         default=preprocessed_eeg_folder,
374 |         help="Path to the folder where the preprocessed EEG will be saved",
375 |     )
376 |     args = parser.parse_args()
377 |
378 |     # Run the preprocessing pipeline
379 |     run_preprocessing_pipeline(
380 |         args.dataset_folder,
381 |         args.preprocessed_raw_eeg_path,
382 |         args.nb_processes,
383 |         args.overwrite,
384 |         args.log_path.format(
385 |             datetime=datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
386 |         ),
387 |     )

--------------------------------------------------------------------------------
/preprocessing_code/sparrKULee.py:
--------------------------------------------------------------------------------
1 | """Run the default preprocessing pipeline on sparrKULee."""
2 | import argparse
3 | import datetime
4 | import gzip
5 | import json
6 | import logging
7 | import os
8 | from typing import Any, Dict, Sequence
9 |
10 | import librosa
11 | import numpy as np
12 | import math
13 | import scipy.signal.windows
14 | from brain_pipe.dataloaders.path import GlobLoader
15 | from brain_pipe.pipeline.default import DefaultPipeline
16 | from brain_pipe.preprocessing.brain.artifact import (
17 |     InterpolateArtifacts,
18 |     ArtifactRemovalMWF,
19 | )
20 | from brain_pipe.preprocessing.brain.eeg.biosemi import (
21 |     biosemi_trigger_processing_fn,
22 | )
23 | from brain_pipe.preprocessing.brain.eeg.load import LoadEEGNumpy
24 | from brain_pipe.preprocessing.brain.epochs import SplitEpochs
25 | from brain_pipe.preprocessing.brain.link import (
26 |     LinkStimulusToBrainResponse,
27 |     BIDSStimulusInfoExtractor,
28 | )
29 | from brain_pipe.preprocessing.brain.rereference import CommonAverageRereference
30 | from brain_pipe.preprocessing.brain.trigger import (
31 |     AlignPeriodicBlockTriggers,
32 | )
33 | from brain_pipe.preprocessing.filter import SosFiltFilt
34 | from brain_pipe.preprocessing.resample import ResamplePoly
35 | from brain_pipe.preprocessing.stimulus.audio.envelope import GammatoneEnvelope
36 | from brain_pipe.preprocessing.stimulus.audio.spectrogram import LibrosaMelSpectrogram
37 |
38 | from brain_pipe.preprocessing.stimulus.load import LoadStimuli
39 | from brain_pipe.runner.default import DefaultRunner
40 | from brain_pipe.save.default import DefaultSave
41 | # from mel import DefaultSave
42 | from brain_pipe.utils.log import default_logging, DefaultFormatter
43 | from brain_pipe.utils.path import BIDSStimulusGrouper
44 |
45 |
46 | class BIDSAPRStimulusInfoExtractor(BIDSStimulusInfoExtractor):
47 |     """Extract BIDS compliant stimulus information from an .apr file."""
48 |
49 |     def __call__(self, brain_dict: Dict[str, Any]):
50 |         """Extract BIDS compliant stimulus information from an events.tsv file.
51 | 52 | Parameters 53 | ---------- 54 | brain_dict: Dict[str, Any] 55 | The data dict containing the brain data path. 56 | 57 | Returns 58 | ------- 59 | Sequence[Dict[str, Any]] 60 | The extracted event information. Each dict contains the information 61 | of one row in the events.tsv file 62 | """ 63 | event_info = super().__call__(brain_dict) 64 | # Find the apr file 65 | path = brain_dict[self.brain_path_key] 66 | apr_path = "_".join(path.split("_")[:-1]) + "_eeg.apr" 67 | # Read apr file 68 | apr_data = self.get_apr_data(apr_path) 69 | # Add apr data to event info 70 | for e_i in event_info: 71 | e_i.update(apr_data) 72 | return event_info 73 | 74 | def get_apr_data(self, apr_path: str): 75 | """Get the SNR from an .apr file. 76 | 77 | Parameters 78 | ---------- 79 | apr_path: str 80 | Path to the .apr file. 81 | 82 | Returns 83 | ------- 84 | Dict[str, Any] 85 | The SNR. 86 | """ 87 | import xml.etree.ElementTree as ET 88 | 89 | apr_data = {} 90 | tree = ET.parse(apr_path) 91 | root = tree.getroot() 92 | 93 | # Get SNR 94 | interactive_elements = root.findall(".//interactive/entry") 95 | for element in interactive_elements: 96 | description_element = element.find("description") 97 | if description_element.text == "SNR": 98 | apr_data["snr"] = element.find("new_value").text 99 | if "snr" not in apr_data: 100 | logging.warning(f"Could not find SNR in {apr_path}.") 101 | apr_data["snr"] = 100.0 102 | return apr_data 103 | 104 | 105 | def default_librosa_load_fn(path): 106 | """Load a stimulus using librosa. 107 | 108 | Parameters 109 | ---------- 110 | path: str 111 | Path to the audio file. 112 | 113 | Returns 114 | ------- 115 | Dict[str, Any] 116 | The data and the sampling rate. 117 | """ 118 | data, sr = librosa.load(path, sr=None) 119 | return {"data": data, "sr": sr} 120 | 121 | 122 | def default_npz_load_fn(path): 123 | """Load a stimulus from a .npz file. 124 | 125 | Parameters 126 | ---------- 127 | path: str 128 | Path to the .npz file. 129 | 130 | Returns 131 | ------- 132 | Dict[str, Any] 133 | The data and the sampling rate. 134 | """ 135 | np_data = np.load(path) 136 | return { 137 | "data": np_data["audio"], 138 | "sr": np_data["fs"], 139 | } 140 | 141 | 142 | DEFAULT_LOAD_FNS = { 143 | ".wav": default_librosa_load_fn, 144 | ".mp3": default_librosa_load_fn, 145 | ".npz": default_npz_load_fn, 146 | } 147 | 148 | 149 | def temp_stimulus_load_fn(path): 150 | """Load stimuli from (Gzipped) files. 151 | 152 | Parameters 153 | ---------- 154 | path: str 155 | Path to the stimulus file. 156 | 157 | Returns 158 | ------- 159 | Dict[str, Any] 160 | Dict containing the data under the key "data" and the sampling rate 161 | under the key "sr". 162 | """ 163 | if path.endswith(".gz"): 164 | with gzip.open(path, "rb") as f_in: 165 | data = dict(np.load(f_in)) 166 | return { 167 | "data": data["audio"], 168 | "sr": data["fs"], 169 | } 170 | 171 | extension = "." + ".".join(path.split(".")[1:]) 172 | if extension not in DEFAULT_LOAD_FNS: 173 | raise ValueError( 174 | f"Can't find a load function for extension {extension}. " 175 | f"Available extensions are {str(list(DEFAULT_LOAD_FNS.keys()))}." 176 | ) 177 | load_fn = DEFAULT_LOAD_FNS[extension] 178 | return load_fn(path) 179 | 180 | 181 | def bids_filename_fn(data_dict, feature_name, set_name=None): 182 | """Default function to generate a filename for the data. 183 | 184 | Parameters 185 | ---------- 186 | data_dict: Dict[str, Any] 187 | The data dict containing the data to save. 188 | feature_name: str 189 | The name of the feature. 
190 | set_name: Optional[str] 191 | The name of the set. If no set name is given, the set name is not 192 | included in the filename. 193 | 194 | Returns 195 | ------- 196 | str 197 | The filename. 198 | """ 199 | 200 | filename = os.path.basename(data_dict["data_path"]).split("_eeg")[0] 201 | 202 | subject = filename.split("_")[0] 203 | session = filename.split("_")[1] 204 | filename += f"_desc-preproc-audio-{os.path.basename(data_dict.get('stimulus_path', '*.')).split('.')[0]}_{feature_name}" 205 | 206 | if set_name is not None: 207 | filename += f"_set-{set_name}" 208 | 209 | return os.path.join(subject, session, filename + ".npy") 210 | 211 | def get_hop_length(arg, data_dict): 212 | return int((1 / 128) * data_dict["stimulus_sr"]) 213 | def get_n_fft(arg, data_dict): 214 | return int(math.pow(2, math.ceil(math.log2(int(0.025 * data_dict["stimulus_sr"]))))) 215 | def get_win_length(arg, data_dict): 216 | return int(0.025 * data_dict["stimulus_sr"]) 217 | 218 | def get_default_librosa_kwargs(): 219 | 220 | librosa_kwargs = { 221 | "window": 'hann', 222 | "hop_length": get_hop_length, 223 | "n_fft": get_n_fft, 224 | "win_length": get_win_length, 225 | "fmin": 0, 226 | "fmax": 5000, 227 | "htk": False, 228 | "n_mels": 10, 229 | "center": False, 230 | "norm": 'slaney' 231 | } 232 | return librosa_kwargs 233 | 234 | def run_preprocessing_pipeline( 235 | root_dir, 236 | preprocessed_stimuli_dir, 237 | preprocessed_eeg_dir, 238 | nb_processes=4, 239 | overwrite=False, 240 | log_path="sparrKULee.log", 241 | ): 242 | """Construct and run the preprocessing on SparrKULee. 243 | 244 | Parameters 245 | ---------- 246 | root_dir: str 247 | The root directory of the dataset. 248 | preprocessed_stimuli_dir: 249 | The directory where the preprocessed stimuli should be saved. 250 | preprocessed_eeg_dir: 251 | The directory where the preprocessed EEG should be saved. 252 | nb_processes: int 253 | The number of processes to use. If -1, the number of processes is 254 | automatically determined. 255 | overwrite: bool 256 | Whether to overwrite existing files. 257 | log_path: str 258 | The path to the log file. 
259 | """ 260 | ######### 261 | # PATHS # 262 | ######### 263 | os.makedirs(preprocessed_eeg_dir, exist_ok=True) 264 | os.makedirs(preprocessed_stimuli_dir, exist_ok=True) 265 | 266 | ########### 267 | # LOGGING # 268 | ########### 269 | handler = logging.FileHandler(log_path) 270 | handler.setLevel(logging.DEBUG) 271 | handler.setFormatter(DefaultFormatter()) 272 | default_logging(handlers=[handler]) 273 | 274 | ################ 275 | # DATA LOADING # 276 | ################ 277 | logging.info("Retrieving BIDS layout...") 278 | data_loader = GlobLoader( 279 | [os.path.join(root_dir, "sub-*", "*", "eeg", "*.bdf*")], 280 | filter_fns=[lambda x: "restingState" not in x], 281 | key="data_path", 282 | ) 283 | 284 | ######### 285 | # STEPS # 286 | ######### 287 | 288 | stimulus_steps = DefaultPipeline( 289 | steps=[ 290 | LoadStimuli(load_fn=temp_stimulus_load_fn), 291 | GammatoneEnvelope(), 292 | LibrosaMelSpectrogram(librosa_kwargs=get_default_librosa_kwargs()), 293 | ResamplePoly(64, data_key = ['spectrogram_data', 'envelope_data'], sampling_frequency_key = ['spectrogram_sr', 'stimulus_sr'], axis=0), 294 | DefaultSave( 295 | preprocessed_stimuli_dir, 296 | to_save={'mel': 'spectrogram_data', 'envelope': 'envelope_data' }, 297 | overwrite=overwrite 298 | ), 299 | DefaultSave(preprocessed_stimuli_dir, overwrite=overwrite), 300 | ], 301 | on_error=DefaultPipeline.RAISE, 302 | ) 303 | 304 | eeg_steps = [ 305 | LinkStimulusToBrainResponse( 306 | stimulus_data=stimulus_steps, 307 | extract_stimuli_information_fn=BIDSAPRStimulusInfoExtractor(), 308 | grouper=BIDSStimulusGrouper( 309 | bids_root=root_dir, 310 | mapping={"stim_file": "stimulus_path", "trigger_file": "trigger_path"}, 311 | subfolders=["stimuli", "eeg"], 312 | ), 313 | ), 314 | LoadEEGNumpy(unit_multiplier=1e6, channels_to_select=list(range(64))), 315 | SosFiltFilt( 316 | scipy.signal.butter(1, 0.5, "highpass", fs=1024, output="sos"), 317 | emulate_matlab=True, 318 | axis=1, 319 | ), 320 | InterpolateArtifacts(), 321 | AlignPeriodicBlockTriggers(biosemi_trigger_processing_fn), 322 | SplitEpochs(), 323 | ArtifactRemovalMWF(), 324 | CommonAverageRereference(), 325 | ResamplePoly(64, axis=1), 326 | DefaultSave( 327 | preprocessed_eeg_dir, 328 | {"eeg": "data"}, 329 | overwrite=overwrite, 330 | clear_output=True, 331 | filename_fn=bids_filename_fn, 332 | ), 333 | ] 334 | 335 | ######################### 336 | # RUNNING THE PIPELINE # 337 | ######################### 338 | 339 | logging.info("Starting with the EEG preprocessing") 340 | logging.info("===================================") 341 | 342 | # Create data_dicts for the EEG files 343 | # Create the EEG pipeline 344 | eeg_pipeline = DefaultPipeline(steps=eeg_steps) 345 | 346 | DefaultRunner( 347 | nb_processes=nb_processes, 348 | logging_config=lambda: None, 349 | ).run( 350 | [(data_loader, eeg_pipeline)], 351 | 352 | ) 353 | 354 | 355 | if __name__ == "__main__": 356 | # Load the config 357 | # get the top folder of the dataset 358 | challenge_folder = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 359 | with open(os.path.join(challenge_folder, 'util', 'config.json'), "r") as f: 360 | config = json.load(f) 361 | 362 | # Set the correct paths as default arguments 363 | dataset_folder = config["dataset_folder"] 364 | derivatives_folder = os.path.join(dataset_folder, config["derivatives_folder"]) 365 | preprocessed_stimuli_folder = os.path.join( 366 | derivatives_folder, config["preprocessed_stimuli_folder"] 367 | ) 368 | preprocessed_eeg_folder = os.path.join( 369 | 
derivatives_folder, config["preprocessed_eeg_folder"] 370 | ) 371 | default_log_folder = os.path.dirname(os.path.abspath(__file__)) 372 | 373 | # Parse arguments from the command line 374 | parser = argparse.ArgumentParser(description="Preprocess the auditory EEG dataset") 375 | parser.add_argument( 376 | "--nb_processes", 377 | type=int, 378 | default=-1, 379 | help="Number of processes to use for the preprocessing. " 380 | "The default is to use all available cores (-1).", 381 | ) 382 | parser.add_argument( 383 | "--overwrite", action="store_true", help="Overwrite existing files" 384 | ) 385 | parser.add_argument( 386 | "--log_path", type=str, default=os.path.join( 387 | default_log_folder, 388 | "sparrKULee_{datetime}.log" 389 | ) 390 | ) 391 | parser.add_argument( 392 | "--dataset_folder", 393 | type=str, 394 | default=dataset_folder, 395 | help="Path to the folder where the dataset is downloaded", 396 | ) 397 | parser.add_argument( 398 | "--preprocessed_stimuli_path", 399 | type=str, 400 | default=preprocessed_stimuli_folder, 401 | help="Path to the folder where the preprocessed stimuli will be saved", 402 | ) 403 | parser.add_argument( 404 | "--preprocessed_eeg_path", 405 | type=str, 406 | default=preprocessed_eeg_folder, 407 | help="Path to the folder where the preprocessed EEG will be saved", 408 | ) 409 | args = parser.parse_args() 410 | 411 | # Run the preprocessing pipeline 412 | run_preprocessing_pipeline( 413 | args.dataset_folder, 414 | args.preprocessed_stimuli_path, 415 | args.preprocessed_eeg_path, 416 | args.nb_processes, 417 | args.overwrite, 418 | args.log_path.format( 419 | datetime=datetime.datetime.now().strftime("%Y%m%d_%H%M%S") 420 | ), 421 | ) 422 | -------------------------------------------------------------------------------- /preprocessing_code/split_and_normalize.py: -------------------------------------------------------------------------------- 1 | """Split data in sets and normalize (per recording).""" 2 | import glob 3 | import json 4 | import os 5 | import pickle 6 | 7 | 8 | import numpy as np 9 | 10 | 11 | if __name__ == "__main__": 12 | 13 | # Arguments for splitting and normalizing 14 | speech_features = ['envelope', 'mel'] 15 | splits = [80, 10, 10] 16 | split_names = ['train', 'val', 'test'] 17 | overwrite = False 18 | 19 | # Calculate the split fraction 20 | split_fractions = [x/sum(splits) for x in splits] 21 | 22 | # Get the path to the config file 23 | task_folder = os.path.dirname(os.path.dirname(__file__)) 24 | config_path = os.path.join(task_folder, 'util', 'config.json') 25 | 26 | # Load the config 27 | with open(config_path) as fp: 28 | config = json.load(fp) 29 | 30 | # Construct the necessary paths 31 | processed_eeg_folder = os.path.join(config["dataset_folder"],config['derivatives_folder'], f"{config['preprocessed_eeg_folder']}") 32 | processed_stimuli_folder = os.path.join(config["dataset_folder"],config['derivatives_folder'], f"{config['preprocessed_stimuli_folder']}") 33 | split_data_folder = os.path.join(config["dataset_folder"],config['derivatives_folder'], config["split_folder"]) 34 | 35 | # Create the output folder 36 | os.makedirs(split_data_folder, exist_ok=True) 37 | 38 | # Find all subjects 39 | all_subjects = glob.glob(os.path.join(processed_eeg_folder, "sub*")) 40 | nb_subjects = len(all_subjects) 41 | print(f"Found {nb_subjects} subjects to split/normalize") 42 | 43 | # Loop over subjects 44 | for subject_index, subject_path in enumerate(all_subjects): 45 | subject = os.path.basename(subject_path) 46 | print(f"Starting 
with subject {subject} ({subject_index + 1}/{nb_subjects})...") 47 | # Find all recordings 48 | all_recordings = glob.glob(os.path.join(subject_path, "*", "*.npy")) 49 | print(f"\tFound {len(all_recordings)} recordings for subject {subject}.") 50 | # Loop over recordings 51 | for recording_index, recording in enumerate(all_recordings): 52 | print(f"\tStarting with recording {recording} ({recording_index + 1}/{len(all_recordings)})...") 53 | 54 | # Load EEG from disk 55 | print(f"\t\tLoading EEG for {recording}") 56 | eeg = np.load(recording) 57 | 58 | # swap axes to have time as first dimension 59 | eeg = np.swapaxes(eeg, 0, 1) 60 | 61 | # keep only the 64 channels 62 | eeg = eeg[:, :64] 63 | 64 | # retrieve the stimulus name from the filename 65 | stimulus_filename = recording.split('_eeg.')[0].split('-audio-')[1] 66 | 67 | # Retrieve EEG data and pointer to the stimulus 68 | shortest_length = eeg.shape[0] 69 | 70 | # Create mapping between feature name and feature data 71 | all_data_for_recording = {"eeg": eeg} 72 | 73 | # Find corresponding stimuli for the EEG recording 74 | for feature_name in speech_features: 75 | # Load feature from disk 76 | print(f"\t\tLoading {feature_name} for recording {recording} ") 77 | stimulus_feature_path = os.path.join( 78 | processed_stimuli_folder, 79 | stimulus_filename + "_-_" + feature_name + ".npy", 80 | ) 81 | feature = np.load(stimulus_feature_path) 82 | # Calculate the shortest length 83 | shortest_length = min(feature.shape[0], shortest_length) 84 | # Update all_data_for_recording 85 | all_data_for_recording[feature_name] = feature 86 | 87 | # Do the actual splitting 88 | print(f"\t\tSplitting/normalizing recording {recording}...") 89 | for feature_name, feature in all_data_for_recording.items(): 90 | start_index = 0 91 | feature_mean = None 92 | feature_std = None 93 | 94 | for split_name, split_fraction in zip(split_names, split_fractions): 95 | end_index = start_index + int(shortest_length * split_fraction) 96 | 97 | # Cut the feature to the shortest length 98 | cut_feature = feature[start_index:end_index, ...] 
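# NOTE: feature_mean/feature_std are computed once, on the first fraction
# processed (the training split), and then reused for the validation and
# test fractions, so no statistics from the held-out sets leak into the
# normalization.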
99 |
100 |                 # Normalize the feature
101 |                 if feature_mean is None:
102 |                     feature_mean = np.mean(cut_feature, axis=0)
103 |                     feature_std = np.std(cut_feature, axis=0)
104 |                 norm_feature = (cut_feature - feature_mean) / feature_std
105 |
106 |                 # Save the normalized feature
107 |                 save_filename = f"{split_name}_-_{subject}_-_{stimulus_filename}_-_{feature_name}.npy"
108 |                 save_path = os.path.join(split_data_folder, save_filename)
109 |                 if not os.path.exists(save_path) or overwrite:
110 |                     np.save(save_path, norm_feature)
111 |                 else:
112 |                     print(f"\t\tSkipping {save_filename} because it already exists")
113 |                 start_index = end_index

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | tensorflow >=2.4.0
2 | numpy
3 | scipy
4 | brian2
5 | brian2hears
6 | librosa
7 | brain_pipe

--------------------------------------------------------------------------------
/task1_match_mismatch/__init__.py:
--------------------------------------------------------------------------------
1 | """Code for task 1 of the Auditory EEG ICASSP challenge."""
2 |

--------------------------------------------------------------------------------
/task1_match_mismatch/experiments/__init__.py:
--------------------------------------------------------------------------------
1 | """Experiments for task1."""
2 |

--------------------------------------------------------------------------------
/task1_match_mismatch/experiments/dilated_convolutional_model.py:
--------------------------------------------------------------------------------
1 | """Example experiment for the mismatched-segments dilation model."""
2 | import glob
3 | import json
4 | import logging
5 | import os, sys
6 | import tensorflow as tf
7 |
8 |
9 | # add base path to sys
10 | sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..'))
11 | from task1_match_mismatch.models.dilated_convolutional_model import dilation_model
12 |
13 | from util.dataset_generator import DataGenerator, batch_equalizer_fn, create_tf_dataset
14 |
15 |
16 | def evaluate_model(model, test_dict):
17 |     """Evaluate a model.
18 |
19 |     Parameters
20 |     ----------
21 |     model: tf.keras.Model
22 |         Model to evaluate.
23 |     test_dict: dict
24 |         Mapping between a subject and a tf.data.Dataset containing the test
25 |         set for the subject.

    Returns
    -------
    dict
        Mapping between a subject and the loss/evaluation score on the test set
    """
    evaluation = {}
    for subject, ds_test in test_dict.items():
        logging.info(f"Scores for subject {subject}:")
        results = model.evaluate(ds_test, verbose=2)
        metrics = model.metrics_names
        evaluation[subject] = dict(zip(metrics, results))
    return evaluation


if __name__ == "__main__":
    # Parameters
    # Length of the decision window
    window_length_s = 5
    fs = 64

    window_length = window_length_s * fs  # 5 seconds
    # Hop length between two consecutive decision windows
    hop_length = 64

    epochs = 100
    patience = 5
    batch_size = 64
    only_evaluate = True
    number_mismatch = 4  # 2 or 4

    training_log_filename = "training_log_{}_{}.csv".format(number_mismatch, window_length_s)

    # Get the path to the config file
    experiments_folder = os.path.dirname(__file__)
    task_folder = os.path.dirname(experiments_folder)
    util_folder = os.path.join(os.path.dirname(task_folder), "util")
    config_path = os.path.join(util_folder, 'config.json')

    # Load the config
    with open(config_path) as fp:
        config = json.load(fp)

    # Provide the path of the dataset,
    # which is already split into train, val and test
    data_folder = os.path.join(config["dataset_folder"], config['derivatives_folder'], config["split_folder"])

    # Stimulus feature which will be used for training the model. Can be either
    # 'envelope' (dimension 1) or 'mel' (dimension 10)
    stimulus_features = ["envelope"]
    stimulus_dimension = 1

    # Uncomment if you want to train with the mel spectrogram stimulus representation
    # stimulus_features = ["mel"]
    # stimulus_dimension = 10

    features = ["eeg"] + stimulus_features

    # Create a directory to store (intermediate) results
    results_folder = os.path.join(experiments_folder, "results_dilated_convolutional_model_{}_MM_{}_s_{}".format(number_mismatch, window_length_s, stimulus_features[0]))
    os.makedirs(results_folder, exist_ok=True)

    # Create the dilation model
    model = dilation_model(time_window=window_length, eeg_input_dimension=64, env_input_dimension=stimulus_dimension, num_mismatched_segments=number_mismatch)

    model_path = os.path.join(results_folder, "model_{}_MM_{}_s_{}.h5".format(number_mismatch, window_length_s, stimulus_features[0]))

    if only_evaluate:
        model = tf.keras.models.load_model(model_path)
    else:
        train_files = [x for x in glob.glob(os.path.join(data_folder, "train_-_*")) if os.path.basename(x).split("_-_")[-1].split(".")[0] in features]
        # Create the generator for the training set
        train_generator = DataGenerator(train_files, window_length)
        dataset_train = create_tf_dataset(train_generator, window_length, batch_equalizer_fn,
                                          hop_length, batch_size,
                                          number_mismatch=number_mismatch,
                                          data_types=(tf.float32, tf.float32),
                                          feature_dims=(64, stimulus_dimension))

        # Create the generator for the validation set
        val_files = [x for x in glob.glob(os.path.join(data_folder, "val_-_*")) if os.path.basename(x).split("_-_")[-1].split(".")[0] in features]
        val_generator = DataGenerator(val_files, window_length)
        dataset_val = create_tf_dataset(val_generator, window_length, batch_equalizer_fn,
                                        hop_length, batch_size,
                                        number_mismatch=number_mismatch,
                                        data_types=(tf.float32, tf.float32),
                                        feature_dims=(64, stimulus_dimension))

        # Train the model
        model.fit(
            dataset_train,
            epochs=epochs,
            validation_data=dataset_val,
            callbacks=[
                tf.keras.callbacks.ModelCheckpoint(model_path, save_best_only=True),
                tf.keras.callbacks.CSVLogger(os.path.join(results_folder, training_log_filename)),
                tf.keras.callbacks.EarlyStopping(patience=patience, restore_best_weights=True),
            ],
        )

    test_window_lengths = [3, 5]
    number_mismatch_test = [2, 3, 4, 8]
    for number_mismatch in number_mismatch_test:
        for window_length_s in test_window_lengths:
            window_length = window_length_s * fs
            results_filename = 'eval_{}_{}_s.json'.format(number_mismatch, window_length_s)

            # Rebuild the model for the current test configuration and load
            # the trained weights
            model = dilation_model(time_window=window_length, eeg_input_dimension=64,
                                   env_input_dimension=stimulus_dimension, num_mismatched_segments=number_mismatch)
            model.load_weights(model_path)

            # Evaluate the model on the test set:
            # create a dataset generator for each test subject
            test_files = [x for x in glob.glob(os.path.join(data_folder, "test_-_*")) if
                          os.path.basename(x).split("_-_")[-1].split(".")[0] in features]
            # Get all different subjects from the test set
            subjects = list(set([os.path.basename(x).split("_-_")[1] for x in test_files]))
            datasets_test = {}
            # Create a generator for each subject
            for sub in subjects:
                files_test_sub = [f for f in test_files if sub in os.path.basename(f)]
                test_generator = DataGenerator(files_test_sub, window_length)
                datasets_test[sub] = create_tf_dataset(test_generator, window_length, batch_equalizer_fn,
                                                       hop_length, batch_size=1,
                                                       number_mismatch=number_mismatch,
                                                       data_types=(tf.float32, tf.float32),
                                                       feature_dims=(64, stimulus_dimension))

            evaluation = evaluate_model(model, datasets_test)

            # We can save our results in a json encoded file
            results_path = os.path.join(results_folder, results_filename)
            with open(results_path, "w") as fp:
                json.dump(evaluation, fp)
            logging.info(f"Results saved at {results_path}")
--------------------------------------------------------------------------------
/task1_match_mismatch/experiments/test_match_mismatch.py:
--------------------------------------------------------------------------------
"""
Sample code to generate labels for the test dataset of the
match-mismatch task. The requested format for submitting the labels is
as follows:
for each subject a json file containing a python dictionary in the
format of ==> {'sample_id': prediction, ... }.
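Each prediction is the integer index (the argmax over the model's softmax
output) of the stimulus segment that the model judges to be the match.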

"""

import glob
import json
import logging
import os
import sys

import numpy as np
import tensorflow as tf

# Add the repository root to sys.path so the task/util packages are importable
sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..'))
from task1_match_mismatch.models.dilated_convolutional_model import dilation_model

from util.dataset_generator import DataGenerator, batch_equalizer_fn, create_tf_dataset


if __name__ == '__main__':

    # Parameters
    # Length of the decision window
    window_length_s = 5
    fs = 64

    window_length = window_length_s * fs  # 5 seconds
    # Hop length between two consecutive decision windows
    hop_length = 64

    epochs = 100
    patience = 5
    batch_size = 64
    number_mismatch = 4  # 2 or 4

    # Get the path to the config file
    experiments_folder = os.path.dirname(__file__)
    task_folder = os.path.dirname(experiments_folder)
    util_folder = os.path.join(os.path.dirname(task_folder), "util")
    config_path = os.path.join(util_folder, 'config.json')

    # Load the config
    with open(config_path) as fp:
        config = json.load(fp)

    # Provide the path of the test dataset
    data_folder = os.path.join(config["dataset_folder"], config["test_folder"], 'TASK1_match_mismatch')
    eeg_folder = os.path.join(data_folder, 'preprocessed_eeg')
    stimulus_folder = os.path.join(data_folder, 'stimulus')

    # # Stimulus feature which will be used by the model. Can be either
    # # 'envelope' (dimension 1) or 'mel' (dimension 10)
    # stimulus_features = ["envelope"]
    # stimulus_dimension = 1

    # Uncomment if you want to use the mel spectrogram stimulus representation
    stimulus_features = ["mel"]
    stimulus_dimension = 10

    features = ["eeg"] + stimulus_features

    # Directory that stores the (intermediate) results
    results_folder = os.path.join(experiments_folder,
                                  "results_dilated_convolutional_model_{}_MM_{}_s_{}".format(number_mismatch,
                                                                                             window_length_s,
                                                                                             stimulus_features[0]))

    # Create the dilation model
    model = dilation_model(time_window=window_length, eeg_input_dimension=64, env_input_dimension=stimulus_dimension,
                           num_mismatched_segments=number_mismatch)

    model_path = os.path.join(results_folder,
                              "model_{}_MM_{}_s_{}.h5".format(number_mismatch, window_length_s, stimulus_features[0]))
    model.load_weights(model_path)

    test_eeg_mapping = glob.glob(os.path.join(data_folder, 'sub*mapping.json'))

    test_stimuli = glob.glob(os.path.join(stimulus_folder, f'*{stimulus_features[0]}*chunks.npz'))

    # Load all test stimuli
    test_stimuli_data = {}
    for stimulus_path in test_stimuli:
        test_stimuli_data = dict(test_stimuli_data, **np.load(stimulus_path))

    for sub_stimulus_mapping in test_eeg_mapping:
        subject = os.path.basename(sub_stimulus_mapping).split('_')[0]

        # Load the stimulus mapping
        sub_stimulus_mapping = json.load(open(sub_stimulus_mapping))

        # Load the EEG data
        sub_path = os.path.join(eeg_folder, f'{subject}_eeg.npz')
        sub_eeg_data = dict(np.load(sub_path))

        data_eeg = np.stack([[sub_eeg_data[value['eeg']]] for key, value in sub_stimulus_mapping.items()])
        # Swap dim 0 and 1 of the EEG and unstack
        data_eeg = np.swapaxes(data_eeg, 0, 1)
        data_eeg = list(data_eeg)

        data_stimuli = np.stack([[test_stimuli_data[x] for x in value['stimulus']] for key, value in sub_stimulus_mapping.items()])
        # Swap dim 0 and 1 of the stimulus and unstack
        data_stimuli = np.swapaxes(data_stimuli, 0, 1)
        data_stimuli = list(data_stimuli)

        id_list = list(sub_stimulus_mapping.keys())

        predictions = model.predict(data_eeg + data_stimuli)
        labels = np.argmax(predictions, axis=1)

        sub = dict(zip(id_list, [int(x) for x in labels]))

        prediction_dir = os.path.join(os.path.dirname(__file__), 'predictions')
        os.makedirs(prediction_dir, exist_ok=True)
        with open(os.path.join(prediction_dir, subject + '.json'), 'w') as f:
            json.dump(sub, f)
--------------------------------------------------------------------------------
/task1_match_mismatch/models/__init__.py:
--------------------------------------------------------------------------------
"""Models for task1."""
--------------------------------------------------------------------------------
/task1_match_mismatch/models/dilated_convolutional_model.py:
--------------------------------------------------------------------------------
"""Dilation model for the match/mismatch task."""
import tensorflow as tf


def dilation_model(
    time_window=None,
    eeg_input_dimension=64,
    env_input_dimension=1,
    layers=3,
    kernel_size=3,
    spatial_filters=8,
    dilation_filters=16,
    activation="relu",
    compile=True,
    num_mismatched_segments=2
):
    """Convolutional dilation model.

    Code was taken and adapted from
    https://github.com/exporl/eeg-matching-eusipco2020

    Parameters
    ----------
    time_window : int or None
        Segment length. If None, the model will accept every time window input
        length.
    eeg_input_dimension : int
        Number of channels of the EEG.
    env_input_dimension : int
        Dimension of the stimulus representation:
        1 for the envelope, 10 for the mel spectrogram used in this codebase.
    layers : int
        Depth of the network/number of layers.
    kernel_size : int
        Size of the kernel for the dilation convolutions.
    spatial_filters : int
        Number of parallel filters to use in the spatial layer.
    dilation_filters : int
        Number of parallel filters to use in the dilation layers.
    activation : str or list or tuple
        Name of the non-linearity to apply after the dilation layers,
        or list/tuple of different non-linearities.
    compile : bool
        Whether the model should be compiled.
    num_mismatched_segments : int
        Number of mismatched (imposter) stimulus segments per matched segment.

    Returns
    -------
    tf.keras.Model
        The dilation model


    References
    ----------
    Accou, B., Jalilpour Monesi, M., Montoya, J., Van hamme, H. & Francart, T.
    Modeling the relationship between acoustic stimulus and EEG with a dilated
    convolutional neural network. In 2020 28th European Signal Processing
    Conference (EUSIPCO), 1175-1179, DOI: 10.23919/Eusipco47968.2020.9287417
    (2021). ISSN: 2076-1465.

    Accou, B., Monesi, M. J., hamme, H. V. & Francart, T.
    Predicting speech intelligibility from EEG in a non-linear classification
    paradigm. J. Neural Eng. 18, 066008, DOI: 10.1088/1741-2552/ac33e9 (2021).
    Publisher: IOP Publishing
    """

    eeg = tf.keras.layers.Input(shape=[time_window, eeg_input_dimension])
    stimuli_input = [tf.keras.layers.Input(shape=[time_window, env_input_dimension]) for _ in range(num_mismatched_segments + 1)]

    all_inputs = [eeg]
    all_inputs.extend(stimuli_input)

    stimuli_proj = [x for x in stimuli_input]

    # Activations to apply
    if isinstance(activation, str):
        activations = [activation] * layers
    else:
        activations = activation

    # Spatial convolution
    eeg_proj_1 = tf.keras.layers.Conv1D(spatial_filters, kernel_size=1)(eeg)

    # Construct dilation layers
    for layer_index in range(layers):
        # Dilation on EEG
        eeg_proj_1 = tf.keras.layers.Conv1D(
            dilation_filters,
            kernel_size=kernel_size,
            dilation_rate=kernel_size ** layer_index,
            strides=1,
            activation=activations[layer_index],
        )(eeg_proj_1)

        # Dilation on envelope data, with weights shared between all stimuli
        env_proj_layer = tf.keras.layers.Conv1D(
            dilation_filters,
            kernel_size=kernel_size,
            dilation_rate=kernel_size ** layer_index,
            strides=1,
            activation=activations[layer_index],
        )

        stimuli_proj = [env_proj_layer(stimulus_proj) for stimulus_proj in stimuli_proj]

    # Comparison: cosine similarity between the EEG and each stimulus projection
    cos = [tf.keras.layers.Dot(1, normalize=True)([eeg_proj_1, stimulus_proj]) for stimulus_proj in stimuli_proj]

    linear_proj_sim = tf.keras.layers.Dense(1, activation="linear")

    # Linear projection of similarity matrices
    cos_proj = [linear_proj_sim(tf.keras.layers.Flatten()(cos_i)) for cos_i in cos]

    # Classification
    out = tf.keras.activations.softmax(tf.keras.layers.Concatenate()(cos_proj))

    model = tf.keras.Model(inputs=all_inputs, outputs=[out])

    if compile:
        model.compile(
            optimizer=tf.keras.optimizers.Adam(),
            metrics=["accuracy"],
            loss=["categorical_crossentropy"],
        )
        model.summary()
    return model
--------------------------------------------------------------------------------
/task2_regression/__init__.py:
--------------------------------------------------------------------------------
"""Code for task 2 of the Auditory EEG ICASSP challenge."""
--------------------------------------------------------------------------------
/task2_regression/experiments/__init__.py:
--------------------------------------------------------------------------------
"""Experiments for task2."""
--------------------------------------------------------------------------------
/task2_regression/experiments/linear_baseline.py:
--------------------------------------------------------------------------------
"""Example experiment for a linear baseline method."""
import glob
import json
import logging
import os

# Set GPU thread mode to private
os.environ['TF_GPU_THREAD_MODE'] = 'gpu_private'

import tensorflow as tf

import sys
# Add the repository root to sys.path so the task/util packages are importable
sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..'))

import numpy as np

from task2_regression.models.linear import simple_linear_model, pearson_loss_cut, pearson_metric_cut, pearson_metric_cut_non_averaged
from util.dataset_generator import DataGenerator, create_tf_dataset


def evaluate_model(model, test_dict):
    """Evaluate a model.
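
    For each subject, this computes the compiled loss/metrics via
    model.evaluate, as well as the per-mel-band Pearson correlation of the
    reconstructions.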

    Parameters
    ----------
    model: tf.keras.Model
        Model to evaluate.
    test_dict: dict
        Mapping between a subject and a tf.data.Dataset containing the test
        set for the subject.

    Returns
    -------
    dict
        Mapping between a subject and the loss/evaluation score on the test set
    """
    evaluation = {}
    for subject, ds_test in test_dict.items():
        logging.info(f"Scores for subject {subject}:")
        # Collect the full test set for this subject
        ds = [x for x in ds_test]
        eeg = tf.concat([x[0] for x in ds], axis=0)
        labels = tf.concat([x[1] for x in ds], axis=0)

        reconstructions = model.predict(eeg)
        correlations = np.squeeze(pearson_metric_cut_non_averaged(labels, reconstructions))

        # Evaluate with the compiled loss/metrics
        results = model.evaluate(ds_test, verbose=2)
        metrics = model.metrics_names
        evaluation[subject] = dict(zip(metrics, results))

        # Pearson correlation per mel band, averaged over windows
        evaluation[subject]["pearson_correlation_per_band"] = np.mean(correlations, axis=0).tolist()
    return evaluation


if __name__ == "__main__":
    # Parameters
    # Length of the decision window
    fs = 64
    window_length = 60 * fs  # 60 seconds
    # Hop length between two consecutive decision windows
    hop_length = 30 * fs
    epochs = 100
    patience = 5
    batch_size = 64
    only_evaluate = True

    # Get the path to the config file
    experiments_folder = os.path.dirname(__file__)
    task_folder = os.path.dirname(experiments_folder)
    util_folder = os.path.join(os.path.dirname(task_folder), "util")
    config_path = os.path.join(util_folder, 'config.json')

    # Load the config
    with open(config_path) as fp:
        config = json.load(fp)

    # Provide the path of the dataset,
    # which is already split into train, val and test
    data_folder = os.path.join(config["dataset_folder"], config["derivatives_folder"], config["split_folder"])
    stimulus_features = ["mel"]
    features = ["eeg"] + stimulus_features

    # Create a directory to store (intermediate) results
    results_folder = os.path.join(experiments_folder, "results_linear_baseline")
    os.makedirs(results_folder, exist_ok=True)

    # Get all different subjects from the training set
    all_subs = list(
        set([os.path.basename(x).split("_-_")[1] for x in glob.glob(os.path.join(data_folder, "train_-_*"))]))

    # Create a simple linear model
    model = simple_linear_model(integration_window=int(fs * 0.25), nb_filters=10)
    model.summary()
    model_path = os.path.join(results_folder, "model.h5")
    training_log_filename = "training_log.csv"
    results_filename = 'eval.json'

    if only_evaluate:
        # Load the pretrained weights
        model.load_weights(model_path)
    else:
        train_files = [x for x in glob.glob(os.path.join(data_folder, "train_-_*")) if os.path.basename(x).split("_-_")[-1].split(".")[0] in features]
        # Create the generator for the training set
        train_generator = DataGenerator(train_files, window_length)
        dataset_train = create_tf_dataset(train_generator, window_length, None, hop_length, batch_size, data_types=(tf.float32, tf.float32), feature_dims=(64, 10))

        # Create the generator for the validation set
        val_files = [x for x in glob.glob(os.path.join(data_folder, "val_-_*")) if os.path.basename(x).split("_-_")[-1].split(".")[0] in features]
        val_generator = DataGenerator(val_files, window_length)
        dataset_val = create_tf_dataset(val_generator, window_length, None, hop_length, batch_size, data_types=(tf.float32, tf.float32), feature_dims=(64, 10))

        # Train the model
        model.fit(
            dataset_train,
            epochs=epochs,
            validation_data=dataset_val,
            callbacks=[
                tf.keras.callbacks.ModelCheckpoint(model_path, save_best_only=True),
                tf.keras.callbacks.CSVLogger(os.path.join(results_folder, training_log_filename)),
                tf.keras.callbacks.EarlyStopping(patience=patience, restore_best_weights=True),
            ],
            workers=tf.data.AUTOTUNE,
            use_multiprocessing=True
        )

    # Evaluate the model on the test set:
    # create a dataset generator for each test subject
    test_files = [x for x in glob.glob(os.path.join(data_folder, "test_-_*")) if os.path.basename(x).split("_-_")[-1].split(".")[0] in features]
    # Get all different subjects from the test set
    subjects = list(set([os.path.basename(x).split("_-_")[1] for x in test_files]))
    datasets_test = {}
    # Create a generator for each subject
    for sub in subjects:
        files_test_sub = [f for f in test_files if sub in os.path.basename(f)]
        test_generator = DataGenerator(files_test_sub, window_length)
        datasets_test[sub] = create_tf_dataset(test_generator, window_length, None, hop_length, batch_size=1, data_types=(tf.float32, tf.float32), feature_dims=(64, 10))

    # Evaluate the model
    evaluation = evaluate_model(model, datasets_test)

    # We can save our results in a json encoded file
    results_path = os.path.join(results_folder, results_filename)
    with open(results_path, "w") as fp:
        json.dump(evaluation, fp)
    logging.info(f"Results saved at {results_path}")
--------------------------------------------------------------------------------
/task2_regression/experiments/test_regression.py:
--------------------------------------------------------------------------------
"""
Sample code to generate test labels (reconstructed envelopes) for
the regression task. The requested format for submitting the reconstructed
envelopes is as follows:
for each subject a json file containing a python dictionary in the
format of ==> {'sample_id': reconstructed_envelope, ... }.
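Note that this baseline reconstructs the 10-band mel spectrogram rather than a
1-dimensional envelope, so each reconstruction is a (time, 10) nested list.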
"""

import glob
import json
import os
import sys

import numpy as np

# Add the repository root to sys.path so the task/util packages are importable
sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..'))

from task2_regression.models.linear import simple_linear_model
# from task2_regression.models.vlaai import vlaai, pearson_loss, pearson_metric, pearson_tf_non_averaged


if __name__ == '__main__':

    # Parameters
    # Length of the decision window
    fs = 64
    window_length_s = 30
    window_length = window_length_s * fs  # 30 seconds

    # Get the path to the config file
    experiments_folder = os.path.dirname(__file__)
    task_folder = os.path.dirname(experiments_folder)
    util_folder = os.path.join(os.path.dirname(task_folder), "util")
    config_path = os.path.join(util_folder, 'config.json')

    # Load the config
    with open(config_path) as fp:
        config = json.load(fp)

    # Provide the path of the test dataset
    data_folder = os.path.join(config["dataset_folder"], config["test_folder"], 'TASK2_regression')
    eeg_folder = os.path.join(data_folder, 'preprocessed_eeg')

    # The mel spectrogram stimulus representation is used for this baseline
    stimulus_features = ["mel"]
    stimulus_dimension = 10

    features = ["eeg"] + stimulus_features

    pretrained_model = os.path.join(os.path.dirname(__file__), 'results_linear_baseline', 'model.h5')

    # Define and load the pretrained model
    model = simple_linear_model(integration_window=int(fs * 0.25), nb_filters=10)
    model.load_weights(pretrained_model)

    test_eeg_mapping = glob.glob(os.path.join(data_folder, 'sub*mapping.json'))

    for sub_stimulus_mapping in test_eeg_mapping:
        subject = os.path.basename(sub_stimulus_mapping).split('_')[0]

        # Load the stimulus mapping
        sub_stimulus_mapping = json.load(open(sub_stimulus_mapping))

        # Load the EEG data
        sub_path = os.path.join(eeg_folder, f'{subject}_eeg.npz')
        sub_eeg_data = dict(np.load(sub_path))

        data_eeg = np.stack([sub_eeg_data[value['eeg']] for key, value in sub_stimulus_mapping.items()])

        id_list = list(sub_stimulus_mapping.keys())

        # Predict the stimulus features from the EEG
        predictions = model.predict(data_eeg)

        # Make predictions json-serializable
        predictions = [np.array(value).tolist() for value in np.squeeze(predictions)]

        # Create dictionary from id_list and predictions
        sub = dict(zip(id_list, predictions))

        prediction_dir = os.path.join(os.path.dirname(__file__), 'predictions')
        os.makedirs(prediction_dir, exist_ok=True)
        with open(os.path.join(prediction_dir, subject + '.json'), 'w') as f:
            json.dump(sub, f)
--------------------------------------------------------------------------------
/task2_regression/experiments/vlaai_mel.py:
--------------------------------------------------------------------------------
"""Example experiment for the VLAAI model."""
import glob
import json
import logging
import os

# Set GPU thread mode to private
os.environ['TF_GPU_THREAD_MODE'] = 'gpu_private'

import tensorflow as tf

import numpy as np
import sys
# Add the repository root to sys.path so the task/util packages are importable
sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..'))

from task2_regression.models.vlaai import vlaai, pearson_loss, pearson_metric, pearson_tf_non_averaged
from util.dataset_generator import DataGenerator, create_tf_dataset


def evaluate_model(model, test_dict):
    """Evaluate a model.

    For each subject, this computes the compiled loss/metrics via
    model.evaluate, as well as the per-mel-band Pearson correlation of the
    reconstructions.

    Parameters
    ----------
    model: tf.keras.Model
        Model to evaluate.
    test_dict: dict
        Mapping between a subject and a tf.data.Dataset containing the test
        set for the subject.

    Returns
    -------
    dict
        Mapping between a subject and the loss/evaluation score on the test set
    """
    evaluation = {}
    for subject, ds_test in test_dict.items():
        logging.info(f"Scores for subject {subject}:")
        # Collect the full test set for this subject
        ds = [x for x in ds_test]
        eeg = tf.concat([x[0] for x in ds], axis=0)
        labels = tf.concat([x[1] for x in ds], axis=0)

        reconstructions = model.predict(eeg)
        correlations = np.squeeze(pearson_tf_non_averaged(labels, reconstructions))

        # Evaluate with the compiled loss/metrics
        results = model.evaluate(ds_test, verbose=2)
        metrics = model.metrics_names
        evaluation[subject] = dict(zip(metrics, results))

        # Pearson correlation per mel band, averaged over windows
        evaluation[subject]["pearson_correlation_per_band"] = np.mean(correlations, axis=0).tolist()
    return evaluation


if __name__ == "__main__":
    # Parameters
    # Length of the decision window
    fs = 64
    window_length = 5 * fs  # 5 seconds
    # Hop length between two consecutive decision windows
    hop_length = 1 * fs
    epochs = 100
    patience = 10
    batch_size = 10
    only_evaluate = True
    training_log_filename = "training_log.csv"
    results_filename = 'eval.json'

    # Get the path to the config file
    experiments_folder = os.path.dirname(__file__)
    task_folder = os.path.dirname(experiments_folder)
    util_folder = os.path.join(os.path.dirname(task_folder), "util")
    config_path = os.path.join(util_folder, 'config.json')

    # Load the config
    with open(config_path) as fp:
        config = json.load(fp)

    # Provide the path of the dataset,
    # which is already split into train, val and test
    data_folder = os.path.join(config["dataset_folder"], config["derivatives_folder"], config["split_folder"])
    stimulus_features = ["mel"]
    features = ["eeg"] + stimulus_features

    # Create a directory to store (intermediate) results
    results_folder = os.path.join(experiments_folder, "results_vlaai_mel")
    os.makedirs(results_folder, exist_ok=True)

    # Create the model
    model = vlaai()
    model.compile(tf.keras.optimizers.Adam(), loss=pearson_loss, metrics=[pearson_metric])
    model_path = os.path.join(results_folder, "model.h5")

    if only_evaluate:
        model.load_weights(model_path)
    else:
        train_files = [x for x in glob.glob(os.path.join(data_folder, "train_-_*")) if os.path.basename(x).split("_-_")[-1].split(".")[0] in features]
        # Create the generator for the training set
        train_generator = DataGenerator(train_files, window_length)
        dataset_train = create_tf_dataset(train_generator, window_length, None, hop_length, batch_size, data_types=(tf.float32, tf.float32), feature_dims=(64, 10))

        # Create the generator for the validation set
        val_files = [x for x in glob.glob(os.path.join(data_folder, "val_-_*")) if os.path.basename(x).split("_-_")[-1].split(".")[0] in features]
        val_generator = DataGenerator(val_files, window_length)
        dataset_val = create_tf_dataset(val_generator, window_length, None, hop_length, batch_size, data_types=(tf.float32, tf.float32), feature_dims=(64, 10))

        # Train the model
        model.fit(
            dataset_train,
            epochs=epochs,
            validation_data=dataset_val,
            callbacks=[
                tf.keras.callbacks.ModelCheckpoint(model_path, save_best_only=True),
                tf.keras.callbacks.CSVLogger(os.path.join(results_folder, training_log_filename)),
                tf.keras.callbacks.EarlyStopping(patience=patience, restore_best_weights=True),
            ],
            workers=tf.data.AUTOTUNE,
            use_multiprocessing=True
        )

    # Evaluate the model on the test set:
    # create a dataset generator for each test subject
    test_files = [x for x in glob.glob(os.path.join(data_folder, "test_-_*")) if os.path.basename(x).split("_-_")[-1].split(".")[0] in features]
    # Get all different subjects from the test set
    subjects = list(set([os.path.basename(x).split("_-_")[1] for x in test_files]))
    datasets_test = {}
    # Create a generator for each subject
    for sub in subjects:
        files_test_sub = [f for f in test_files if sub in os.path.basename(f)]
        test_generator = DataGenerator(files_test_sub, window_length)
        datasets_test[sub] = create_tf_dataset(test_generator, window_length, None, hop_length, batch_size=64, data_types=(tf.float32, tf.float32), feature_dims=(64, 10))

    evaluation = evaluate_model(model, datasets_test)

    # We can save our results in a json encoded file
    results_path = os.path.join(results_folder, results_filename)
    with open(results_path, "w") as fp:
        json.dump(evaluation, fp)
    logging.info(f"Results saved at {results_path}")
--------------------------------------------------------------------------------
/task2_regression/models/__init__.py:
--------------------------------------------------------------------------------
"""Models for task2."""
--------------------------------------------------------------------------------
/task2_regression/models/linear.py:
--------------------------------------------------------------------------------
"""Linear backward (decoder) models for task 2."""
import tensorflow as tf

from task2_regression.models.vlaai import pearson_tf, pearson_tf_non_averaged


@tf.function
def pearson_loss_cut(y_true, y_pred, axis=1):
    """Pearson loss function.

    The true values are cut to the length of the predictions, since the
    'valid' convolution in the decoder shortens the output in time.

    Parameters
    ----------
    y_true: tf.Tensor
        True values. Shape is (batch_size, time_steps, n_features)
    y_pred: tf.Tensor
        Predicted values. Shape is (batch_size, time_steps, n_features)

    Returns
    -------
    tf.Tensor
        Pearson loss.
        Shape is (batch_size, 1)
    """
    return -pearson_tf(y_true[:, : tf.shape(y_pred)[1], :], y_pred, axis=axis)


@tf.function
def pearson_metric_cut(y_true, y_pred, axis=1):
    """Pearson metric function.

    Parameters
    ----------
    y_true: tf.Tensor
        True values. Shape is (batch_size, time_steps, n_features)
    y_pred: tf.Tensor
        Predicted values.
        Shape is (batch_size, time_steps, n_features)

    Returns
    -------
    tf.Tensor
        Pearson metric.
        Shape is (batch_size, 1)
    """
    return pearson_tf(y_true[:, : tf.shape(y_pred)[1], :], y_pred, axis=axis)


@tf.function
def pearson_metric_cut_non_averaged(y_true, y_pred, axis=1):
    """Pearson metric function, not averaged over the feature dimension.

    Parameters
    ----------
    y_true: tf.Tensor
        True values. Shape is (batch_size, time_steps, n_features)
    y_pred: tf.Tensor
        Predicted values. Shape is (batch_size, time_steps, n_features)

    Returns
    -------
    tf.Tensor
        Pearson metric.
        Shape is (batch_size, 1, n_features)
    """
    return pearson_tf_non_averaged(y_true[:, : tf.shape(y_pred)[1], :], y_pred, axis=axis)


def simple_linear_model(integration_window=32, nb_filters=1, nb_channels=64):
    """Simple linear decoder: a single Conv1D layer from EEG to stimulus."""
    inp = tf.keras.layers.Input(
        (
            None,
            nb_channels,
        )
    )
    out = tf.keras.layers.Conv1D(nb_filters, integration_window)(inp)
    model = tf.keras.models.Model(inputs=[inp], outputs=[out])
    model.compile(
        tf.keras.optimizers.Adam(),
        loss=pearson_loss_cut,
        metrics=[pearson_metric_cut]
    )
    return model


def simple_linear_model_stimulus(integration_window=32, nb_filters=1, nb_channels=64):
    """Variant of simple_linear_model, kept for experimentation."""
    inp = tf.keras.layers.Input(
        (
            None,
            nb_channels,
        )
    )
    # Scratch notes for complex stimulus reconstruction:
    #   env = abs(s), f0 = np.angle(s)
    #   reconstruct env and f0 separately, then
    #   s_hat = real(env_hat * exp(1j * f0_hat)) / np.max(abs(env_hat))
    out = tf.keras.layers.Conv1D(nb_filters, integration_window)(inp)
    model = tf.keras.models.Model(inputs=[inp], outputs=[out])
    model.compile(
        tf.keras.optimizers.Adam(),
        loss=pearson_loss_cut,
        metrics=[pearson_metric_cut]
    )
    return model
--------------------------------------------------------------------------------
/task2_regression/models/vlaai.py:
--------------------------------------------------------------------------------
"""Code to construct the VLAAI network.
Code was extracted from https://github.com/exporl/vlaai
"""
import tensorflow as tf


def extractor(
    filters=(256, 256, 256, 128, 128),
    kernels=(64,) * 5,
    dilation_rate=1,
    input_channels=64,
    normalization_fn=lambda x: tf.keras.layers.LayerNormalization()(x),
    activation_fn=lambda x: tf.keras.layers.LeakyReLU()(x),
    name="extractor",
):
    """Construct the extractor model.

    Parameters
    ----------
    filters: Sequence[int]
        Number of filters for each layer.
    kernels: Sequence[int]
        Kernel size for each layer.
    dilation_rate: int
        Dilation rate for the convolutional layers.
    input_channels: int
        Number of EEG channels in the input
    normalization_fn: Callable[[tf.Tensor], tf.Tensor]
        Function to normalize the contents of a tensor.
    activation_fn: Callable[[tf.Tensor], tf.Tensor]
        Function to apply an activation function to the contents of a tensor.
    name: str
        Name of the model.

    Returns
    -------
    tf.keras.models.Model
        The extractor model.
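
    Notes
    -----
    Each layer applies a 'valid' convolution followed by zero-padding at the
    end of the time axis, so (with the default dilation_rate of 1) the output
    keeps the same number of time steps as the input.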
    """
    eeg = tf.keras.layers.Input((None, input_channels))

    x = eeg

    if len(filters) != len(kernels):
        raise ValueError("'filters' and 'kernels' must have the same length")

    # Add the convolutional layers
    for filter_, kernel in zip(filters, kernels):
        # 'valid' padding for every layer; the time dimension is restored by
        # the ZeroPadding1D below
        x = tf.keras.layers.Conv1D(filter_, kernel, dilation_rate=dilation_rate, padding='valid')(x)
        x = normalization_fn(x)
        x = activation_fn(x)
        x = tf.keras.layers.ZeroPadding1D((0, kernel - 1))(x)

    return tf.keras.models.Model(inputs=[eeg], outputs=[x], name=name)


def output_context(
    filter_=64,
    kernel=64,
    input_channels=64,
    normalization_fn=lambda x: tf.keras.layers.LayerNormalization()(x),
    activation_fn=lambda x: tf.keras.layers.LeakyReLU()(x),
    name="output_context_model",
):
    """Construct the output context model.

    Parameters
    ----------
    filter_: int
        Number of filters for the convolutional layer.
    kernel: int
        Kernel size for the convolutional layer.
    input_channels: int
        Number of EEG channels in the input.
    normalization_fn: Callable[[tf.Tensor], tf.Tensor]
        Function to normalize the contents of a tensor.
    activation_fn: Callable[[tf.Tensor], tf.Tensor]
        Function to apply an activation function to the contents of a tensor.
    name: str
        Name of the model.

    Returns
    -------
    tf.keras.models.Model
        The output context model.
    """
    inp = tf.keras.layers.Input((None, input_channels))
    # Pad at the start of the time axis so the convolution is causal
    x = tf.keras.layers.ZeroPadding1D((kernel - 1, 0))(inp)
    x = tf.keras.layers.Conv1D(filter_, kernel)(x)
    x = normalization_fn(x)
    x = activation_fn(x)
    return tf.keras.models.Model(inputs=[inp], outputs=[x], name=name)


def vlaai(
    nb_blocks=4,
    extractor_model=None,
    output_context_model=None,
    use_skip=True,
    input_channels=64,
    output_dim=1,
    name="vlaai",
):
    """Construct the VLAAI model.

    Parameters
    ----------
    nb_blocks: int
        Number of repeated blocks to use.
    extractor_model: Callable[[tf.Tensor], tf.Tensor]
        The extractor model to use.
    output_context_model: Callable[[tf.Tensor], tf.Tensor]
        The output context model to use.
    use_skip: bool
        Whether to use skip connections.
    input_channels: int
        Number of EEG channels in the input.
    output_dim: int
        Number of output dimensions.
    name: str
        Name of the model.

    Returns
    -------
    tf.keras.models.Model
        The VLAAI model.
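
    Notes
    -----
    Each block refines the previous estimate: with use_skip=True the running
    estimate x is added back to the EEG input before being passed through the
    (shared) extractor and output context models again.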
    """
    if extractor_model is None:
        extractor_model = extractor()
    if output_context_model is None:
        output_context_model = output_context()

    eeg = tf.keras.layers.Input((None, input_channels))

    # If using skip connections: start with x set to zero
    if use_skip:
        x = tf.zeros_like(eeg)
    else:
        x = eeg

    # Iterate over the blocks
    for i in range(nb_blocks):
        if use_skip:
            x = extractor_model(eeg + x)
        else:
            x = extractor_model(x)
        x = tf.keras.layers.Dense(input_channels)(x)
        x = output_context_model(x)

    x = tf.keras.layers.Dense(output_dim)(x)

    return tf.keras.models.Model(inputs=[eeg], outputs=[x], name=name)


def pearson_tf(y_true, y_pred, axis=1):
    """Pearson correlation function implemented in tensorflow.

    Parameters
    ----------
    y_true: tf.Tensor
        Ground truth labels. Shape is (batch_size, time_steps, n_features)
    y_pred: tf.Tensor
        Predicted labels. Shape is (batch_size, time_steps, n_features)
    axis: int
        Axis along which to compute the pearson correlation. Default is 1.

    Returns
    -------
    tf.Tensor
        Pearson correlation, averaged over the feature dimension.
        Shape is (batch_size, 1) if axis is 1.
    """
    # Compute the mean of the true and predicted values
    y_true_mean = tf.reduce_mean(y_true, axis=axis, keepdims=True)
    y_pred_mean = tf.reduce_mean(y_pred, axis=axis, keepdims=True)

    # Compute the numerator and denominator of the pearson correlation
    numerator = tf.reduce_sum(
        (y_true - y_true_mean) * (y_pred - y_pred_mean),
        axis=axis,
        keepdims=True,
    )
    std_true = tf.reduce_sum(tf.square(y_true - y_true_mean), axis=axis, keepdims=True)
    std_pred = tf.reduce_sum(tf.square(y_pred - y_pred_mean), axis=axis, keepdims=True)
    denominator = tf.sqrt(std_true * std_pred)

    # Compute the pearson correlation, averaged over the last (feature) axis
    return tf.reduce_mean(tf.math.divide_no_nan(numerator, denominator), axis=-1)


def pearson_tf_non_averaged(y_true, y_pred, axis=1):
    """Pearson correlation function implemented in tensorflow.

    Parameters
    ----------
    y_true: tf.Tensor
        Ground truth labels. Shape is (batch_size, time_steps, n_features)
    y_pred: tf.Tensor
        Predicted labels. Shape is (batch_size, time_steps, n_features)
    axis: int
        Axis along which to compute the pearson correlation. Default is 1.

    Returns
    -------
    tf.Tensor
        Pearson correlation per feature.
        Shape is (batch_size, 1, n_features) if axis is 1.
    """
    # Compute the mean of the true and predicted values
    y_true_mean = tf.reduce_mean(y_true, axis=axis, keepdims=True)
    y_pred_mean = tf.reduce_mean(y_pred, axis=axis, keepdims=True)

    # Compute the numerator and denominator of the pearson correlation
    numerator = tf.reduce_sum(
        (y_true - y_true_mean) * (y_pred - y_pred_mean),
        axis=axis,
        keepdims=True,
    )
    std_true = tf.reduce_sum(tf.square(y_true - y_true_mean), axis=axis, keepdims=True)
    std_pred = tf.reduce_sum(tf.square(y_pred - y_pred_mean), axis=axis, keepdims=True)
    denominator = tf.sqrt(std_true * std_pred)

    # Compute the pearson correlation per feature
    return tf.math.divide_no_nan(numerator, denominator)


@tf.function
def pearson_loss(y_true, y_pred, axis=1):
    """Pearson loss function.
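
    Defined as the negative of pearson_tf, so minimizing this loss maximizes
    the Pearson correlation between predictions and targets.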

    Parameters
    ----------
    y_true: tf.Tensor
        True values. Shape is (batch_size, time_steps, n_features)
    y_pred: tf.Tensor
        Predicted values. Shape is (batch_size, time_steps, n_features)

    Returns
    -------
    tf.Tensor
        Pearson loss.
        Shape is (batch_size, 1)
    """
    return -pearson_tf(y_true, y_pred, axis=axis)


@tf.function
def pearson_metric(y_true, y_pred, axis=1):
    """Pearson metric function.

    Parameters
    ----------
    y_true: tf.Tensor
        True values. Shape is (batch_size, time_steps, n_features)
    y_pred: tf.Tensor
        Predicted values. Shape is (batch_size, time_steps, n_features)

    Returns
    -------
    tf.Tensor
        Pearson metric.
        Shape is (batch_size, 1)
    """
    return pearson_tf(y_true, y_pred, axis=axis)
--------------------------------------------------------------------------------
/util/config.json:
--------------------------------------------------------------------------------
{
    "dataset_folder": "PATH/TO/sparrKULee",
    "derivatives_folder": "derivatives",
    "preprocessed_eeg_folder": "preprocessed_eeg",
    "preprocessed_stimuli_folder": "preprocessed_stimuli",
    "split_folder": "split_data",
    "test_folder": "test_set"
}
--------------------------------------------------------------------------------
/util/dataset_generator.py:
--------------------------------------------------------------------------------
"""Code for the dataset_generator for both tasks."""
import itertools
import os

import numpy as np
import tensorflow as tf


@tf.function
def batch_equalizer_fn(*args):
    """Batch equalizer.

    Prepares the inputs for a model to be trained on the match-mismatch task.
    It makes sure that the matched and mismatched segments are presented
    equally often at each stimulus input position.

    Parameters
    ----------
    args : Sequence[tf.Tensor]
        List of tensors representing feature data

    Returns
    -------
    Tuple[Tuple[tf.Tensor], tf.Tensor]
        Tuple of the EEG/speech features serving as the input to the model and
        the labels for the match/mismatch task

    Notes
    -----
    This function will also multiply the batch size by the number of stimulus
    candidates. E.g. if the batch size of the elements in each of the args was
    32 and there are two candidate stimuli, the output features will have
    a batch size of 64.
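
    For example, with two stimulus candidates and one-hot labels over the
    candidate positions, a batch contributes equally many rows labeled [1, 0]
    (match at the first position) and rows labeled [0, 1] (match second).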
    """
    eeg = args[0]
    num_stimuli = len(args) - 1
    # Repeat the EEG once per stimulus candidate
    new_eeg = tf.concat([eeg] * num_stimuli, axis=0)
    all_features = [new_eeg]

    # Roll the stimulus candidates so each candidate appears at every input
    # position equally often
    args_to_zip = [args[i::num_stimuli] for i in range(1, num_stimuli + 1)]
    for stimuli_features in zip(*args_to_zip):
        for i in range(num_stimuli):
            stimulus_rolled = tf.roll(stimuli_features, shift=i, axis=0)
            # Reshape stimulus_rolled to merge the first two dimensions
            stimulus_rolled = tf.reshape(stimulus_rolled, [tf.shape(stimulus_rolled)[0] * tf.shape(stimulus_rolled)[1], stimuli_features[0].shape[-2], stimuli_features[0].shape[-1]])

            all_features.append(stimulus_rolled)
    labels = tf.concat(
        [
            tf.tile(tf.constant([[1 if ii == i else 0 for ii in range(num_stimuli)]]), [tf.shape(eeg)[0], 1]) for i in range(num_stimuli)
        ], axis=0
    )

    return tuple(all_features), labels


def shuffle_fn(args, number_mismatch):
    """Append number_mismatch shuffled copies of the last feature as imposters."""
    args = list(args)
    for _ in range(number_mismatch):
        args.append(tf.random.shuffle(args[-1]))
    return tuple(args)


def create_tf_dataset(
    data_generator,
    window_length,
    batch_equalizer_fn=None,
    hop_length=64,
    batch_size=64,
    data_types=(tf.float32, tf.float32),
    feature_dims=(64, 1),
    number_mismatch=None  # None for regression, 2 or 4 for match-mismatch
):
    """Creates a tf.data.Dataset.

    This will be used to create a dataset generator that will
    pass windowed data to a model in both tasks.

    Parameters
    ----------
    data_generator: DataGenerator
        A data generator.
    window_length: int
        Length of the decision window in samples.
    batch_equalizer_fn: Callable
        Function that will be applied on the data after batching (using
        the `map` method from tf.data.Dataset). In the match/mismatch task,
        this function creates the imposter segments and labels.
    hop_length: int
        Hop length between two consecutive decision windows.
    batch_size: Optional[int]
        If not None, specifies the batch size. In the match/mismatch task,
        this amount will be multiplied by the number of stimulus candidates
        by batch_equalizer_fn.
    data_types: Union[Sequence[tf.dtype], tf.dtype]
        The data types that the individual features of data_generator should
        be cast to. If you only specify a single datatype, it will be chosen
        for all EEG/speech features.
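    feature_dims: Sequence[int]
        Number of channels/features of each feature yielded by data_generator
        (e.g. 64 for EEG, 1 for the envelope, 10 for the mel spectrogram);
        used to build the output signature of the dataset.
    number_mismatch: Optional[int]
        None for the regression task; for match/mismatch, the number of
        mismatched (imposter) segments created by shuffling the stimulus.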

    Returns
    -------
    tf.data.Dataset
        A Dataset object that generates data to train/evaluate models
        efficiently
    """
    # Create tf dataset from generator
    dataset = tf.data.Dataset.from_generator(
        data_generator,
        output_signature=tuple(
            tf.TensorSpec(shape=(None, x), dtype=data_types[index])
            for index, x in enumerate(feature_dims)
        ),
    )
    # Window dataset
    dataset = dataset.map(
        lambda *args: [
            tf.signal.frame(arg, window_length, hop_length, axis=0)
            for arg in args
        ],
        num_parallel_calls=tf.data.AUTOTUNE
    )

    if number_mismatch is not None:
        # For match/mismatch: append shuffled copies of the stimulus as
        # mismatched (imposter) segments
        dataset = dataset.map(
            lambda *args: shuffle_fn(args, number_mismatch),
            num_parallel_calls=tf.data.AUTOTUNE
        )

    # Batch data
    dataset = dataset.interleave(
        lambda *args: tf.data.Dataset.from_tensor_slices(args),
        cycle_length=8,
        block_length=1,
        num_parallel_calls=tf.data.AUTOTUNE,
    )
    if batch_size is not None:
        dataset = dataset.batch(batch_size, drop_remainder=True)

    if batch_equalizer_fn is not None:
        # Create the labels and make sure classes are balanced
        dataset = dataset.map(batch_equalizer_fn,
                              num_parallel_calls=tf.data.AUTOTUNE)

    return dataset


class DataGenerator:
    """Generate data for the regression and match/mismatch tasks."""

    def __init__(
        self,
        files,
        window_length,
    ):
        """Initialize the DataGenerator.

        Parameters
        ----------
        files: Sequence[Union[str, pathlib.Path]]
            Files to load.
        window_length: int
            Length of the decision window.
        """
        self.window_length = window_length
        self.files = self.group_recordings(files)

    def group_recordings(self, files):
        """Group recordings and corresponding stimuli.

        Parameters
        ----------
        files : Sequence[Union[str, pathlib.Path]]
            List of filepaths to preprocessed and split EEG and speech features

        Returns
        -------
        list
            Files grouped per recording (split, subject and stimulus), with
            the EEG file sorted first within each group.
        """
        new_files = []
        grouped = itertools.groupby(sorted(files), lambda x: "_-_".join(os.path.basename(x).split("_-_")[:3]))
        for recording_name, feature_paths in grouped:
            # Sort the EEG file first, followed by the stimulus features
            new_files += [sorted(feature_paths, key=lambda x: "0" if os.path.basename(x).split("_-_")[-1].split(".")[0] == "eeg" else x)]
        return new_files

    def __len__(self):
        return len(self.files)

    def __getitem__(self, recording_index):
        """Get data for a certain recording.
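
        The features are returned as a tuple with the EEG first, followed by
        the stimulus feature(s), each shaped (n_samples, n_features).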

        Parameters
        ----------
        recording_index: int
            Index of the recording in this dataset

        Returns
        -------
        Union[Tuple[tf.Tensor,...], Tuple[np.ndarray,...]]
            The features corresponding to the recording_index recording
        """
        data = []
        for feature in self.files[recording_index]:
            f = np.load(feature).astype(np.float32)
            if f.ndim == 1:
                f = f[:, None]
            data += [f]
        data = self.prepare_data(data)
        return tuple(tf.constant(x) for x in data)

    def __call__(self):
        """Load data for the next recording.

        Yields
        -------
        Union[Tuple[tf.Tensor,...], Tuple[np.ndarray,...]]
            The features corresponding to the recording_index recording
        """
        for idx in range(self.__len__()):
            yield self.__getitem__(idx)

            if idx == self.__len__() - 1:
                self.on_epoch_end()

    def on_epoch_end(self):
        """Change state at the end of an epoch."""
        np.random.shuffle(self.files)

    def prepare_data(self, data):
        # Hook to adjust the data before it is returned; the features already
        # have dimensionality (n_samples, n_features) at this point
        return data
--------------------------------------------------------------------------------
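
As a quick orientation (not part of the repository), the pieces above compose
as follows; a minimal sketch, assuming `data_folder` points at the split data
produced by split_and_normalize.py and that the envelope feature is used:

    import glob
    import os

    import tensorflow as tf

    from util.dataset_generator import DataGenerator, batch_equalizer_fn, create_tf_dataset

    data_folder = "PATH/TO/sparrKULee/derivatives/split_data"  # assumed location
    window_length = 5 * 64  # 5 s decision windows at 64 Hz

    # Select the train EEG/envelope files; DataGenerator groups them per recording
    train_files = [
        x for x in glob.glob(os.path.join(data_folder, "train_-_*"))
        if os.path.basename(x).split("_-_")[-1].split(".")[0] in ("eeg", "envelope")
    ]
    generator = DataGenerator(train_files, window_length)

    # Windowed, batched match/mismatch dataset with 4 imposter segments
    dataset = create_tf_dataset(
        generator, window_length, batch_equalizer_fn,
        hop_length=64, batch_size=64, number_mismatch=4,
        data_types=(tf.float32, tf.float32), feature_dims=(64, 1),
    )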