├── .gitignore
├── LICENSE
├── README.md
├── __init__.py
├── preprocessing_code
│   ├── mel.py
│   ├── sparKULee_loadRAWtestfiles.py
│   ├── sparKULee_loadmwffiles.py
│   ├── sparrKULee.py
│   └── split_and_normalize.py
├── requirements.txt
├── task1_match_mismatch
│   ├── __init__.py
│   ├── experiments
│   │   ├── __init__.py
│   │   ├── dilated_convolutional_model.py
│   │   └── test_match_mismatch.py
│   └── models
│       ├── __init__.py
│       └── dilated_convolutional_model.py
├── task2_regression
│   ├── __init__.py
│   ├── experiments
│   │   ├── __init__.py
│   │   ├── linear_baseline.py
│   │   ├── test_regression.py
│   │   └── vlaai_mel.py
│   └── models
│       ├── __init__.py
│       ├── linear.py
│       └── vlaai.py
└── util
    ├── config.json
    └── dataset_generator.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # From https://github.com/github/gitignore/blob/main/Python.gitignore
2 | # Byte-compiled / optimized / DLL files
3 | __pycache__/
4 | *.py[cod]
5 | *$py.class
6 |
7 | # C extensions
8 | *.so
9 |
10 | # Distribution / packaging
11 | .Python
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | wheels/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 | cover/
54 |
55 | # Translations
56 | *.mo
57 | *.pot
58 |
59 | # Django stuff:
60 | *.log
61 | local_settings.py
62 | db.sqlite3
63 | db.sqlite3-journal
64 |
65 | # Flask stuff:
66 | instance/
67 | .webassets-cache
68 |
69 | # Scrapy stuff:
70 | .scrapy
71 |
72 | # Sphinx documentation
73 | docs/_build/
74 |
75 | # PyBuilder
76 | .pybuilder/
77 | target/
78 |
79 | # Jupyter Notebook
80 | .ipynb_checkpoints
81 |
82 | # IPython
83 | profile_default/
84 | ipython_config.py
85 |
86 | # pyenv
87 | # For a library or package, you might want to ignore these files since the code is
88 | # intended to run in multiple environments; otherwise, check them in:
89 | # .python-version
90 |
91 | # pipenv
92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
95 | # install all needed dependencies.
96 | #Pipfile.lock
97 |
98 | # poetry
99 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
100 | # This is especially recommended for binary packages to ensure reproducibility, and is more
101 | # commonly ignored for libraries.
102 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
103 | #poetry.lock
104 |
105 | # pdm
106 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
107 | #pdm.lock
108 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
109 | # in version control.
110 | # https://pdm.fming.dev/#use-with-ide
111 | .pdm.toml
112 |
113 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
114 | __pypackages__/
115 |
116 | # Celery stuff
117 | celerybeat-schedule
118 | celerybeat.pid
119 |
120 | # SageMath parsed files
121 | *.sage.py
122 |
123 | # Environments
124 | .env
125 | .venv
126 | env/
127 | venv/
128 | ENV/
129 | env.bak/
130 | venv.bak/
131 |
132 | # Spyder project settings
133 | .spyderproject
134 | .spyproject
135 |
136 | # Rope project settings
137 | .ropeproject
138 |
139 | # mkdocs documentation
140 | /site
141 |
142 | # mypy
143 | .mypy_cache/
144 | .dmypy.json
145 | dmypy.json
146 |
147 | # Pyre type checker
148 | .pyre/
149 |
150 | # pytype static type analyzer
151 | .pytype/
152 |
153 | # Cython debug symbols
154 | cython_debug/
155 |
156 | # PyCharm
157 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
158 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
159 | # and can be added to the global gitignore or merged into this file. For a more nuclear
160 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
161 | .idea
162 |
163 | # Specific for this project
164 | task*/experiments/results*/
165 | *condor*
166 | .err
167 | .out
168 | .log
169 | .job
170 | speech-decoding/*
171 | speech-decoding
172 | .m

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Code for the Auditory EEG ICASSP Challenge 2024
2 |
3 | Copyright (C) 2022 ExpORL
4 |
5 | This program is free software: you can redistribute it and/or modify
6 | it under the terms of the GNU General Public License as published by
7 | the Free Software Foundation, either version 3 of the License, or
8 | (at your option) any later version.
9 |
10 | This program is distributed in the hope that it will be useful,
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | GNU General Public License for more details.
14 |
15 | You should have received a copy of the GNU General Public License
16 | along with this program. If not, see <https://www.gnu.org/licenses/>.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Auditory-eeg-challenge-2024-code
2 | ================================
3 | This is the codebase for the [2024 ICASSP Auditory EEG challenge](https://exporl.github.io/auditory-eeg-challenge-2024).
4 | This codebase contains baseline models and code to preprocess stimuli for both tasks.
5 |
6 | # Prerequisites
7 |
8 | Python >= 3.6
9 |
10 | # General setup
11 |
12 | Steps to get a working setup:
13 |
14 | ## 1. Clone this repository and install the [requirements.txt](requirements.txt)
15 | ```bash
16 | # Clone this repository
17 | git clone https://github.com/exporl/auditory-eeg-challenge-2024-code
18 |
19 | # Go to the root folder
20 | cd auditory-eeg-challenge-2024-code
21 |
22 | # Optional: install a virtual environment
23 | python3 -m venv venv # Optional
24 | source venv/bin/activate # Optional
25 |
26 | # Install the requirements
27 | python3 -m pip install -r requirements.txt
28 | ```
29 |
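You can optionally verify that the main dependencies import correctly (a quick sanity check, not part of the official setup):

```bash
python3 -c "import tensorflow, librosa, numpy, scipy; print(tensorflow.__version__)"
```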
30 | ## 2. [Download the data](https://homes.esat.kuleuven.be/~lbollens/)
31 |
32 | You will need a password, which you will receive when you [register](https://exporl.github.io/auditory-eeg-challenge-2024/registration/).
33 | The folder contains multiple folders (and `zip` files containing the same data as their corresponding folders). For bulk downloading, we recommend using the `zip` files.
34 |
35 | 1. `split_data(.zip)` contains already preprocessed, split and normalized data, ready for model training/evaluation.
36 | If you want to get started quickly, you can opt to only download this folder/zipfile.
37 |
38 | 2. `preprocessed_eeg(.zip)` and `preprocessed_stimuli(.zip)` contain preprocessed EEG and stimuli files (envelope and mel features), respectively.
39 | At this stage, the data has not yet been split into sets and normalized. To go from this to the data in `split_data`, run the `split_and_normalize.py` script ([preprocessing_code/split_and_normalize.py](./preprocessing_code/split_and_normalize.py)).
40 |
41 | 3. `sub_*(.zip)` and `stimuli(.zip)` contain the raw EEG and stimuli files.
42 | If you want to recreate the preprocessing steps, download these files, run `sparrKULee.py` ([preprocessing_code/sparrKULee.py](./preprocessing_code/sparrKULee.py)) to preprocess the EEG and stimuli, and then run the `split_and_normalize.py` script to split and normalize the data.
43 | You can adapt the preprocessing steps in `sparrKULee.py` to your own needs by adding/removing steps. For more detailed information on the pipeline, see the [brain_pipe documentation](https://exporl.github.io/brain_pipe/).
44 |
45 |
46 | Note that it is possible to use the same preprocessed (and split) dataset for both task 1 and task 2, but it is not required.
47 |
48 |
49 |
50 | ## 3. Adjust the `config.json` accordingly
51 |
52 | There is a general `config.json` defining the folder names and structure for the data (i.e. [util/config.json](./util/config.json)).
53 | Adjust `dataset_folder` in the `config.json` file from `null` to the absolute path to the folder containing all data (the `challenge_folder` from the previous point).
54 | If you download the whole dataset and keep the BIDS structure, the folders `preprocessed_eeg`, `preprocessed_stimuli` and `split_data` will be located inside the `derivatives` folder. If you only download these three folders, make sure they are placed in a `derivatives` subfolder, or change the `derivatives` folder in the config; otherwise you will get a file-not-found error when trying to run the experiments.
55 |
56 |
57 | OK, you should be all set up now!
58 |
59 |
60 |
61 | # Running the tasks
62 |
63 | Each task comes with ready-to-go experiment files that provide a
64 | baseline and get you acquainted with the problem. The experiment files live
65 | in the `experiments` subfolder of each task. The training log,
66 | best model and evaluation results will be stored in a folder called
67 | `results_{experiment_name}`. For general ideas, you might want to look at the winners of the
68 | [previous ICASSP auditory EEG challenge](https://exporl.github.io/auditory-eeg-challenge-2023).
69 |
70 | ## Task1: Match-mismatch
71 |
72 | By running [task1_match_mismatch/experiments/dilated_convolutional_model.py](./task1_match_mismatch/experiments/dilated_convolutional_model.py),
73 | you can train the dilated convolutional model introduced by Accou et al. [(2021a)](https://doi.org/10.23919/Eusipco47968.2020.9287417) and [(2021b)](https://doi.org/10.1088/1741-2552/ac33e9).
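For example, once `config.json` points at your data folder, the baseline can be run from the repository root (the script takes no command-line arguments; whether it trains a new model or only evaluates a stored one is controlled by the `only_evaluate` flag inside the script):

```bash
python3 task1_match_mismatch/experiments/dilated_convolutional_model.py
```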
74 |
75 | Other models you might find interesting are [de Cheveigné et al. (2018)](https://www.sciencedirect.com/science/article/pii/S1053811918300338), [Monesi et al. (2020)](https://ieeexplore.ieee.org/abstract/document/9054000), [Monesi et al. (2021)](https://arxiv.org/abs/2106.09622), ...
76 |
77 |
78 |
79 | ## Task2: Regression (reconstructing spectrogram from EEG)
80 |
81 | By running [task2_regression/experiments/linear_baseline.py](./task2_regression/experiments/linear_baseline.py), you can
82 | train and evaluate a simple linear baseline model with Pearson correlation as a loss function, similar to the baseline model used in [Accou et al. (2022)](https://www.biorxiv.org/content/10.1101/2022.09.28.509945).
83 |
84 | By running [task2_regression/experiments/vlaai_mel.py](./task2_regression/experiments/vlaai_mel.py), you can train/evaluate
85 | the VLAAI model as proposed by [Accou et al. (2022)](https://www.biorxiv.org/content/10.1101/2022.09.28.509945). You can find a pre-trained model at [VLAAI's github page](https://github.com/exporl/vlaai).
86 |
87 | Other models you might find interesting are: [Thornton et al. (2022)](https://iopscience.iop.org/article/10.1088/1741-2552/ac7976), ...
88 |

--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
1 | """Code for the ICASSP 2024 auditory EEG challenge."""
2 |

--------------------------------------------------------------------------------
/preprocessing_code/mel.py:
--------------------------------------------------------------------------------
1 | """Code to calculate mel spectrograms."""
2 | import math
3 |
4 | import librosa
5 | import numpy as np
6 | import scipy.signal
7 |
8 | def calculate_mel_spectrogram(
9 |     audio_path,
10 |     target_fs=64,
11 |     fmin=0,
12 |     fmax=5000,
13 |     nb_filters=10,
14 |     hop_length=None,
15 |     win_length=None,
16 | ):
17 |     """Calculates the mel spectrogram of a raw speech file. This function makes the same calculation as
18 |     in the sparrKULee pipeline and is the regression objective for task 2.
19 |
20 |     Parameters
21 |     ----------
22 |     audio_path: str
23 |         Audio file path.
24 |     target_fs: int
25 |         Sampling frequency of the calculated mel spectrogram.
26 |     fmin: Union[float, int]
27 |         Minimum center frequency used in the mel filter matrix.
28 |     fmax: Union[float, int]
29 |         Maximum center frequency used in the mel filter matrix.
30 |     nb_filters: int
31 |         Number of mel spectrogram frequency bands.
32 |     hop_length: int
33 |         Hop length (in samples) used for calculation of the spectrogram.
34 |     win_length: int
35 |         Window length (in samples) of each frame.
36 |
37 |     Returns
38 |     -------
39 |     numpy.ndarray
40 |         Mel spectrogram.
41 |     """
42 |
43 |     # Load the audio and its sampling rate from the .npz archive
44 |
45 |
46 |     speech = dict(np.load(audio_path))
47 |     audio, fs = speech["audio"], speech["fs"]
48 |     if not hop_length:
49 |         hop_length = int((1 / target_fs) * fs)  # this yields a frame rate of target_fs Hz
50 |     if not win_length:
51 |         win_length = int(0.025 * fs)  # 25 ms
52 |
53 |     # Find the smallest power of 2
54 |     # that is at least win_length
55 |     n_fft = int(math.pow(2, math.ceil(math.log2(win_length))))
56 |
57 |     # DC removal
58 |     audio = audio - np.mean(audio)
59 |
60 |     mel_spectrogram = librosa.feature.melspectrogram(y=audio, window='hann',
61 |                                                      sr=fs, n_fft=n_fft, hop_length=hop_length,
62 |                                                      win_length=win_length, fmin=fmin, fmax=fmax, htk=False, norm='slaney',
63 |                                                      n_mels=nb_filters, center=False)
64 |
65 |
66 |     return mel_spectrogram
67 |
68 |
69 |
70 | # 'Center freqs' of mel bands - uniformly spaced between limits
71 | # mel_f: [ 0. , 147.02442191, 324.92910187, 540.19997145,
72 | # 800.6852341 , 1115.88148983, 1497.27995596, 1958.78540639,
73 | # 2517.22310262, 3192.95219807, 4010.6079787 , 5000. ]
74 |

--------------------------------------------------------------------------------
/preprocessing_code/sparKULee_loadRAWtestfiles.py:
--------------------------------------------------------------------------------
1 | """Run the default preprocessing pipeline on sparrKULee.
2 | This script runs the necessary preprocessing steps on the sparrKULee dataset, starting from the raw caches,
3 | to arrive at the fully preprocessed files.
4 | The raw caches are downloaded from the challenge website and should be placed in the folder specified by the
5 | raw_eeg_dir variable.
6 | The preprocessed EEG will be saved in the folder specified by the preprocessed_eeg_dir variable.
7 | The caches have been synchronized with the stimulus data and should all have a length of 5 seconds.
8 |
9 | On the raw caches, the following preprocessing steps have been performed
10 | (i.e. synchronized with the stimulus data
11 | and loaded into Python):
12 |
13 | You are free to use these caches for your own preprocessing pipeline;
14 | however, if you want to use certain artifact steps, such as the artifact removal MWF,
15 | be aware that the output from these steps might differ when using input windows of just 5 seconds.
16 | If you want to use our artifact removal steps, we recommend using the MWF caches,
17 | on which the artifact removal MWF has already been performed, rather than computing it yourself.
18 | The MWF caches are saved at 1024 Hz; see sparKULee_loadmwffiles.py for more information on how to use these caches.
19 |
20 | eeg_steps = [
21 |     LinkStimulusToBrainResponse(
22 |         stimulus_data=stimulus_steps,
23 |         extract_stimuli_information_fn=BIDSAPRStimulusInfoExtractor(),
24 |         grouper=BIDSStimulusGrouper(
25 |             bids_root=root_dir,
26 |             mapping={"stim_file": "stimulus_path", "trigger_file": "trigger_path"},
27 |             subfolders=["stimuli", "eeg"],
28 |         ),
29 |     ),
30 |     LoadEEGNumpy(unit_multiplier=1e6, channels_to_select=list(range(64))),
31 |     AlignPeriodicBlockTriggers(biosemi_trigger_processing_fn),
32 |     DefaultSave(raw_eeg_dir,
33 |                 {'eeg': 'data'},
34 |                 filename_fn=bids_filename_fn,
35 |                 clear_output=True,
36 |                 overwrite=overwrite),
37 |
38 | ]
39 | """
40 | import argparse
41 | import datetime
42 | import gzip
43 | import json
44 | import logging
45 | import os
46 | import glob
47 | from typing import Any, Dict, Sequence
48 |
49 | import librosa
50 | import numpy as np
51 | import math
52 | import scipy.signal.windows
53 | from brain_pipe.dataloaders.path import GlobLoader
54 | from brain_pipe.pipeline.default import DefaultPipeline
55 | from brain_pipe.preprocessing.brain.artifact import (
56 |     InterpolateArtifacts,
57 |     ArtifactRemovalMWF,
58 | )
59 | from brain_pipe.preprocessing.brain.eeg.biosemi import (
60 |     biosemi_trigger_processing_fn,
61 | )
62 | from brain_pipe.preprocessing.brain.eeg.load import LoadEEGNumpy
63 | from brain_pipe.preprocessing.brain.epochs import SplitEpochs
64 | from brain_pipe.preprocessing.brain.link import (
65 |     LinkStimulusToBrainResponse,
66 |     BIDSStimulusInfoExtractor,
67 | )
68 | from brain_pipe.preprocessing.brain.rereference import CommonAverageRereference
69 | from brain_pipe.preprocessing.brain.trigger import (
70 |     AlignPeriodicBlockTriggers,
71 | )
72 | from brain_pipe.preprocessing.filter import SosFiltFilt
73 | from brain_pipe.preprocessing.resample import ResamplePoly
74 | from brain_pipe.preprocessing.stimulus.audio.envelope import GammatoneEnvelope
75 | from brain_pipe.preprocessing.stimulus.audio.spectrogram import LibrosaMelSpectrogram
76 |
77 | from brain_pipe.preprocessing.stimulus.load import LoadStimuli
78 | from brain_pipe.runner.default import DefaultRunner
79 | from brain_pipe.save.default import DefaultSave
80 | # from mel import DefaultSave
81 | from brain_pipe.utils.log import default_logging, DefaultFormatter
82 | from brain_pipe.utils.path import BIDSStimulusGrouper
83 |
84 | from typing import Dict, Any, Sequence, Optional, Union, Mapping
85 |
86 | import numpy as np
87 |
88 | from brain_pipe.pipeline.base import PipelineStep
89 |
90 |
91 | class LoadEEGNumpyTest(PipelineStep):
92 |     """Load EEG data from NumPy caches.
93 |
94 |     This step loads the raw EEG caches stored as .npy files.
95 |     """
96 |
97 |     def __init__(
98 |         self, keys={"data_path": "data"}, copy_data_dict=False, *mne_args, **mne_kwargs
99 |     ):
100 |         """Create a new LoadEEGNumpyTest instance.
101 |
102 |         Parameters
103 |         ----------
104 |         keys: Dict[str, str]
105 |             Mapping from the data dict key holding the path of the EEG cache
106 |             to the key under which the loaded data is stored.
107 |         copy_data_dict: bool
108 |         """
109 |         super().__init__(copy_data_dict=copy_data_dict)
110 |         self.keys = self.parse_dict_keys(keys, "keys")
111 |         self.mne_args = mne_args
112 |         self.mne_kwargs = mne_kwargs
113 |
114 |
115 |     def __call__(self, data_dict: Dict[str, Any]) -> Dict[str, Any]:
116 |         """Load EEG data from a .npy file.
117 |
118 |         Parameters
119 |         ----------
120 |         data_dict: Dict[str, Any]
121 |             The data dict containing the EEG path.
122 |
123 |         Returns
124 |         -------
125 |         Dict[str, Any]
126 |             The data dict with the EEG data and the EEG info.
127 | """ 128 | for from_key, to_key in self.keys.items(): 129 | path = data_dict[from_key] 130 | 131 | # Support for gzipped files. 132 | raw =np.load(path) 133 | # swap axes 134 | raw = np.swapaxes(raw, 0, 1) 135 | 136 | 137 | data_dict['data'] = raw 138 | data_dict['eeg_key'] = os.path.basename(path) 139 | 140 | data_dict['data_fs'] = 1024 141 | 142 | return data_dict 143 | 144 | class BIDSAPRStimulusInfoExtractor(BIDSStimulusInfoExtractor): 145 | """Extract BIDS compliant stimulus information from an .apr file.""" 146 | 147 | def __call__(self, brain_dict: Dict[str, Any]): 148 | """Extract BIDS compliant stimulus information from an events.tsv file. 149 | 150 | Parameters 151 | ---------- 152 | brain_dict: Dict[str, Any] 153 | The data dict containing the brain data path. 154 | 155 | Returns 156 | ------- 157 | Sequence[Dict[str, Any]] 158 | The extracted event information. Each dict contains the information 159 | of one row in the events.tsv file 160 | """ 161 | event_info = super().__call__(brain_dict) 162 | # Find the apr file 163 | path = brain_dict[self.brain_path_key] 164 | apr_path = "_".join(path.split("_")[:-1]) + "_eeg.apr" 165 | # Read apr file 166 | apr_data = self.get_apr_data(apr_path) 167 | # Add apr data to event info 168 | for e_i in event_info: 169 | e_i.update(apr_data) 170 | return event_info 171 | 172 | def get_apr_data(self, apr_path: str): 173 | """Get the SNR from an .apr file. 174 | 175 | Parameters 176 | ---------- 177 | apr_path: str 178 | Path to the .apr file. 179 | 180 | Returns 181 | ------- 182 | Dict[str, Any] 183 | The SNR. 184 | """ 185 | import xml.etree.ElementTree as ET 186 | 187 | apr_data = {} 188 | tree = ET.parse(apr_path) 189 | root = tree.getroot() 190 | 191 | # Get SNR 192 | interactive_elements = root.findall(".//interactive/entry") 193 | for element in interactive_elements: 194 | description_element = element.find("description") 195 | if description_element.text == "SNR": 196 | apr_data["snr"] = element.find("new_value").text 197 | if "snr" not in apr_data: 198 | logging.warning(f"Could not find SNR in {apr_path}.") 199 | apr_data["snr"] = 100.0 200 | return apr_data 201 | 202 | 203 | def test_filename_fn(data_dict, feature_name, set_name=None): 204 | """Default function to generate a filename for the data. 205 | 206 | Parameters 207 | ---------- 208 | data_dict: Dict[str, Any] 209 | The data dict containing the data to save. 210 | feature_name: str 211 | The name of the feature. 212 | set_name: Optional[str] 213 | The name of the set. If no set name is given, the set name is not 214 | included in the filename. 215 | 216 | Returns 217 | ------- 218 | str 219 | The filename. 220 | """ 221 | 222 | 223 | eeg_key = data_dict['eeg_key'] 224 | 225 | return eeg_key 226 | 227 | 228 | def temp_unpack_data(data_path): 229 | data = dict(np.load(data_path)) 230 | # save all keys, values in separate data path. TODO: fix this such that we don't have to do this 231 | for key, value in data.items(): 232 | np.save(os.path.dirname(data_path) + key + '.npy', value) 233 | 234 | 235 | def run_preprocessing_pipeline( 236 | root_dir, 237 | preprocessed_eeg_dir, 238 | nb_processes=4, 239 | overwrite=False, 240 | log_path="sparrKULee.log", 241 | ): 242 | """Construct and run the preprocessing on SparrKULee. 243 | 244 | Parameters 245 | ---------- 246 | root_dir: str 247 | The root directory of the dataset. 248 | preprocessed_eeg_dir: 249 | The directory where the preprocessed EEG should be saved. 250 | nb_processes: int 251 | The number of processes to use. 
If -1, the number of processes is 252 | automatically determined. 253 | overwrite: bool 254 | Whether to overwrite existing files. 255 | log_path: str 256 | The path to the log file. 257 | """ 258 | ######### 259 | # PATHS # 260 | ######### 261 | os.makedirs(preprocessed_eeg_dir, exist_ok=True) 262 | 263 | ########### 264 | # LOGGING # 265 | ########### 266 | handler = logging.FileHandler(log_path) 267 | handler.setLevel(logging.DEBUG) 268 | handler.setFormatter(DefaultFormatter()) 269 | default_logging(handlers=[handler]) 270 | 271 | ################ 272 | # DATA LOADING # 273 | ################ 274 | logging.info("Retrieving layout...") 275 | data_loader = GlobLoader( 276 | [os.path.join(root_dir, "sub*.npy")], 277 | filter_fns=[], 278 | key="data_path", 279 | ) 280 | 281 | 282 | ######################### 283 | # RUNNING THE PIPELINE # 284 | ######################### 285 | 286 | logging.info("Starting with the EEG preprocessing") 287 | logging.info("===================================") 288 | 289 | 290 | eeg_steps = [ 291 | LoadEEGNumpyTest(), 292 | SosFiltFilt( 293 | scipy.signal.butter(1, 0.5, "highpass", fs=1024, output="sos"), 294 | emulate_matlab=True, 295 | axis=1, 296 | ), 297 | InterpolateArtifacts(), 298 | ArtifactRemovalMWF(), 299 | CommonAverageRereference(), 300 | ResamplePoly(64, axis=1), 301 | DefaultSave( 302 | preprocessed_eeg_dir, 303 | {"eeg": "data"}, 304 | overwrite=overwrite, 305 | clear_output=True, 306 | filename_fn=test_filename_fn, 307 | ), 308 | ] 309 | 310 | ######################### 311 | # RUNNING THE PIPELINE # 312 | ######################### 313 | 314 | logging.info("Starting with the EEG preprocessing") 315 | logging.info("===================================") 316 | 317 | # Create data_dicts for the EEG files 318 | # Create the EEG pipeline 319 | eeg_pipeline = DefaultPipeline(steps=eeg_steps) 320 | 321 | DefaultRunner( 322 | nb_processes=nb_processes, 323 | logging_config=lambda: None, 324 | ).run( 325 | [(data_loader, eeg_pipeline)], 326 | 327 | ) 328 | 329 | 330 | if __name__ == "__main__": 331 | # Load the config 332 | # get the top folder of the dataset 333 | challenge_folder = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 334 | with open(os.path.join(challenge_folder, 'util', 'config.json'), "r") as f: 335 | config = json.load(f) 336 | 337 | # Set the correct paths as default arguments 338 | dataset_folder = config["dataset_folder"] 339 | test_folder = os.path.join(dataset_folder, config["test_folder"]) 340 | task = 'TASK1_match_mismatch' # [' TASK1_match_mismatch', 'TASK2_regression'] 341 | 342 | 343 | preprocessed_eeg_folder = os.path.join( 344 | test_folder, task, f'{config["preprocessed_eeg_folder"]}' 345 | ) 346 | raw_eeg_dir = os.path.join(test_folder, task, 'raw_eeg') 347 | # unpack the data 348 | 349 | raw_eeg_data = glob.glob(os.path.join(raw_eeg_dir, '*_eeg_raw.npz')) 350 | for data_path in raw_eeg_data: 351 | print(f'processing {data_path}') 352 | temp_unpack_data(data_path) 353 | 354 | 355 | default_log_folder = os.path.dirname(os.path.abspath(__file__)) 356 | 357 | # Parse arguments from the command line 358 | parser = argparse.ArgumentParser(description="Preprocess the auditory EEG dataset") 359 | parser.add_argument( 360 | "--nb_processes", 361 | type=int, 362 | default=1, 363 | help="Number of processes to use for the preprocessing. 
" 364 | "The default is to use all available cores (-1).", 365 | ) 366 | parser.add_argument( 367 | "--overwrite", action="store_true", help="Overwrite existing files" 368 | ) 369 | parser.add_argument( 370 | "--log_path", type=str, default=os.path.join( 371 | default_log_folder, 372 | "sparrKULee_{datetime}.log" 373 | ) 374 | ) 375 | parser.add_argument( 376 | "--dataset_folder", 377 | type=str, 378 | default=raw_eeg_dir, 379 | help="Path to the folder where the dataset is downloaded", 380 | ) 381 | 382 | parser.add_argument( 383 | "--preprocessed_raw_eeg_path", 384 | type=str, 385 | default=preprocessed_eeg_folder, 386 | help="Path to the folder where the preprocessed EEG will be saved", 387 | ) 388 | args = parser.parse_args() 389 | 390 | # Run the preprocessing pipeline 391 | run_preprocessing_pipeline( 392 | args.dataset_folder, 393 | args.preprocessed_raw_eeg_path, 394 | args.nb_processes, 395 | args.overwrite, 396 | args.log_path.format( 397 | datetime=datetime.datetime.now().strftime("%Y%m%d_%H%M%S") 398 | ), 399 | ) -------------------------------------------------------------------------------- /preprocessing_code/sparKULee_loadmwffiles.py: -------------------------------------------------------------------------------- 1 | """Run the default preprocessing pipeline on sparrKULee. 2 | This script runs the necessary prereprocessing steps on the sparrKULee dataset, starting from the MFW caches, 3 | to arrive at the fully preprocessed files. 4 | The MWF caches are downloaded from the challenge website and should be placed in the folder specified by the 5 | raw_eeg_dir variable. 6 | The preprocessed EEG will be saved in the folder specified by the preprocessed_eeg_dir variable. 7 | The caches have been synchronized with the stimulus data and should all have a length of 5 seconds. 
8 | 9 | On the MWF caches, the following preprocessing steps have been performed: 10 | ( see the original SparrKULee.py file for reference as to what these steps do) 11 | eeg_steps = [ 12 | LinkStimulusToBrainResponse( 13 | stimulus_data=stimulus_steps, 14 | extract_stimuli_information_fn=BIDSAPRStimulusInfoExtractor(), 15 | grouper=BIDSStimulusGrouper( 16 | bids_root=root_dir, 17 | mapping={"stim_file": "stimulus_path", "trigger_file": "trigger_path"}, 18 | subfolders=["stimuli", "eeg"], 19 | ), 20 | ), 21 | LoadEEGNumpy(unit_multiplier=1e6, channels_to_select=list(range(64))), 22 | SosFiltFilt( 23 | scipy.signal.butter(1, 0.5, "highpass", fs=1024, output="sos"), 24 | emulate_matlab=True, 25 | axis=1, 26 | ), 27 | InterpolateArtifacts(), 28 | AlignPeriodicBlockTriggers(biosemi_trigger_processing_fn), 29 | SplitEpochs(), 30 | ArtifactRemovalMWF(), 31 | DefaultSave(after_wiener_filter_dir, 32 | {'eeg': 'data'}, 33 | filename_fn=bids_filename_fn, 34 | clear_output=True, 35 | overwrite=overwrite), 36 | ] 37 | 38 | 39 | 40 | """ 41 | import argparse 42 | import datetime 43 | import gzip 44 | import json 45 | import logging 46 | import os 47 | from typing import Any, Dict, Sequence 48 | 49 | import librosa 50 | import numpy as np 51 | import math 52 | import scipy.signal.windows 53 | from brain_pipe.dataloaders.path import GlobLoader 54 | from brain_pipe.pipeline.default import DefaultPipeline 55 | from brain_pipe.preprocessing.brain.artifact import ( 56 | InterpolateArtifacts, 57 | ArtifactRemovalMWF, 58 | ) 59 | from brain_pipe.preprocessing.brain.eeg.biosemi import ( 60 | biosemi_trigger_processing_fn, 61 | ) 62 | from brain_pipe.preprocessing.brain.eeg.load import LoadEEGNumpy 63 | from brain_pipe.preprocessing.brain.epochs import SplitEpochs 64 | from brain_pipe.preprocessing.brain.link import ( 65 | LinkStimulusToBrainResponse, 66 | BIDSStimulusInfoExtractor, 67 | ) 68 | from brain_pipe.preprocessing.brain.rereference import CommonAverageRereference 69 | from brain_pipe.preprocessing.brain.trigger import ( 70 | AlignPeriodicBlockTriggers, 71 | ) 72 | from brain_pipe.preprocessing.filter import SosFiltFilt 73 | from brain_pipe.preprocessing.resample import ResamplePoly 74 | from brain_pipe.preprocessing.stimulus.audio.envelope import GammatoneEnvelope 75 | from brain_pipe.preprocessing.stimulus.audio.spectrogram import LibrosaMelSpectrogram 76 | 77 | from brain_pipe.preprocessing.stimulus.load import LoadStimuli 78 | from brain_pipe.runner.default import DefaultRunner 79 | from brain_pipe.save.default import DefaultSave 80 | # from mel import DefaultSave 81 | from brain_pipe.utils.log import default_logging, DefaultFormatter 82 | from brain_pipe.utils.path import BIDSStimulusGrouper 83 | 84 | from typing import Dict, Any, Sequence, Optional, Union, Mapping 85 | 86 | import numpy as np 87 | 88 | from brain_pipe.pipeline.base import PipelineStep 89 | import glob 90 | 91 | class LoadEEGNumpyTest(PipelineStep): 92 | """Load EEG data. 93 | 94 | This step uses MNE to load EEG data. 95 | """ 96 | 97 | def __init__( 98 | self, keys={"data_path": "data"}, copy_data_dict=False, *mne_args, **mne_kwargs 99 | ): 100 | """Create a new LoadEEG instance. 101 | 102 | Parameters 103 | ---------- 104 | eeg_path_key: str 105 | The key of the EEG path in the data dict. 106 | eeg_data_key: str 107 | The key of the EEG data in the data dict. 
108 | """ 109 | super().__init__(copy_data_dict=copy_data_dict) 110 | self.keys = self.parse_dict_keys(keys, "keys") 111 | self.mne_args = mne_args 112 | self.mne_kwargs = mne_kwargs 113 | 114 | 115 | def __call__(self, data_dict: Dict[str, Any]) -> Dict[str, Any]: 116 | """Load EEG data from a npy file. 117 | 118 | Parameters 119 | ---------- 120 | data_dict: Dict[str, Any] 121 | The data dict containing the EEG path. 122 | 123 | Returns 124 | ------- 125 | Dict[str, Any] 126 | The data dict with the EEG data and the EEG info. 127 | """ 128 | for from_key, to_key in self.keys.items(): 129 | path = data_dict[from_key] 130 | 131 | # Support for gzipped files. 132 | raw =np.load(path) 133 | # swap axes 134 | raw = np.swapaxes(raw, 0, 1) 135 | 136 | 137 | data_dict['data'] = raw 138 | data_dict['eeg_key'] = os.path.basename(path) 139 | 140 | data_dict['data_fs'] = 1024 141 | 142 | return data_dict 143 | 144 | class BIDSAPRStimulusInfoExtractor(BIDSStimulusInfoExtractor): 145 | """Extract BIDS compliant stimulus information from an .apr file.""" 146 | 147 | def __call__(self, brain_dict: Dict[str, Any]): 148 | """Extract BIDS compliant stimulus information from an events.tsv file. 149 | 150 | Parameters 151 | ---------- 152 | brain_dict: Dict[str, Any] 153 | The data dict containing the brain data path. 154 | 155 | Returns 156 | ------- 157 | Sequence[Dict[str, Any]] 158 | The extracted event information. Each dict contains the information 159 | of one row in the events.tsv file 160 | """ 161 | event_info = super().__call__(brain_dict) 162 | # Find the apr file 163 | path = brain_dict[self.brain_path_key] 164 | apr_path = "_".join(path.split("_")[:-1]) + "_eeg.apr" 165 | # Read apr file 166 | apr_data = self.get_apr_data(apr_path) 167 | # Add apr data to event info 168 | for e_i in event_info: 169 | e_i.update(apr_data) 170 | return event_info 171 | 172 | def get_apr_data(self, apr_path: str): 173 | """Get the SNR from an .apr file. 174 | 175 | Parameters 176 | ---------- 177 | apr_path: str 178 | Path to the .apr file. 179 | 180 | Returns 181 | ------- 182 | Dict[str, Any] 183 | The SNR. 184 | """ 185 | import xml.etree.ElementTree as ET 186 | 187 | apr_data = {} 188 | tree = ET.parse(apr_path) 189 | root = tree.getroot() 190 | 191 | # Get SNR 192 | interactive_elements = root.findall(".//interactive/entry") 193 | for element in interactive_elements: 194 | description_element = element.find("description") 195 | if description_element.text == "SNR": 196 | apr_data["snr"] = element.find("new_value").text 197 | if "snr" not in apr_data: 198 | logging.warning(f"Could not find SNR in {apr_path}.") 199 | apr_data["snr"] = 100.0 200 | return apr_data 201 | 202 | 203 | def test_filename_fn(data_dict, feature_name, set_name=None): 204 | """Default function to generate a filename for the data. 205 | 206 | Parameters 207 | ---------- 208 | data_dict: Dict[str, Any] 209 | The data dict containing the data to save. 210 | feature_name: str 211 | The name of the feature. 212 | set_name: Optional[str] 213 | The name of the set. If no set name is given, the set name is not 214 | included in the filename. 215 | 216 | Returns 217 | ------- 218 | str 219 | The filename. 220 | """ 221 | 222 | 223 | eeg_key = data_dict['eeg_key'] 224 | 225 | return eeg_key 226 | 227 | 228 | def temp_unpack_data(data_path): 229 | data = dict(np.load(data_path)) 230 | # save all keys, values in separate data path. 
231 | for key, value in data.items(): 232 | np.save(os.path.join(os.path.dirname(data_path) , key + '.npy'), value) 233 | 234 | 235 | def run_preprocessing_pipeline( 236 | root_dir, 237 | preprocessed_eeg_dir, 238 | nb_processes=4, 239 | overwrite=False, 240 | log_path="sparrKULee.log", 241 | ): 242 | """Construct and run the preprocessing on SparrKULee. 243 | 244 | Parameters 245 | ---------- 246 | root_dir: str 247 | The root directory of the dataset. 248 | preprocessed_eeg_dir: 249 | The directory where the preprocessed EEG should be saved. 250 | nb_processes: int 251 | The number of processes to use. If -1, the number of processes is 252 | automatically determined. 253 | overwrite: bool 254 | Whether to overwrite existing files. 255 | log_path: str 256 | The path to the log file. 257 | """ 258 | ######### 259 | # PATHS # 260 | ######### 261 | os.makedirs(preprocessed_eeg_dir, exist_ok=True) 262 | 263 | ########### 264 | # LOGGING # 265 | ########### 266 | handler = logging.FileHandler(log_path) 267 | handler.setLevel(logging.DEBUG) 268 | handler.setFormatter(DefaultFormatter()) 269 | default_logging(handlers=[handler]) 270 | 271 | ################ 272 | # DATA LOADING # 273 | ################ 274 | logging.info("Retrieving layout...") 275 | data_loader = GlobLoader( 276 | [os.path.join(root_dir, "sub*.npy")], 277 | filter_fns=[], 278 | key="data_path", 279 | ) 280 | 281 | ######################### 282 | # RUNNING THE PIPELINE # 283 | ######################### 284 | 285 | logging.info("Starting with the EEG preprocessing") 286 | logging.info("===================================") 287 | 288 | eeg_steps = [ 289 | LoadEEGNumpyTest(), 290 | CommonAverageRereference(), 291 | ResamplePoly(64, axis=1), 292 | DefaultSave( 293 | preprocessed_eeg_dir, 294 | {"eeg": "data"}, 295 | overwrite=overwrite, 296 | clear_output=True, 297 | filename_fn=test_filename_fn, 298 | ), 299 | ] 300 | 301 | ######################### 302 | # RUNNING THE PIPELINE # 303 | ######################### 304 | 305 | logging.info("Starting with the EEG preprocessing") 306 | logging.info("===================================") 307 | 308 | # Create data_dicts for the EEG files 309 | # Create the EEG pipeline 310 | eeg_pipeline = DefaultPipeline(steps=eeg_steps) 311 | 312 | DefaultRunner( 313 | nb_processes=nb_processes, 314 | logging_config=lambda: None, 315 | ).run( 316 | [(data_loader, eeg_pipeline)], 317 | 318 | ) 319 | 320 | 321 | if __name__ == "__main__": 322 | # Load the config 323 | # get the top folder of the dataset 324 | challenge_folder = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 325 | with open(os.path.join(challenge_folder, 'util', 'config.json'), "r") as f: 326 | config = json.load(f) 327 | 328 | # Set the correct paths as default arguments 329 | dataset_folder = config["dataset_folder"] 330 | test_folder = os.path.join(dataset_folder, config["test_folder"]) 331 | task = 'TASK1_match_mismatch' # [' TASK1_match_mismatch', 'TASK2_regression'] 332 | 333 | preprocessed_eeg_folder = os.path.join( 334 | test_folder, task, f'{config["preprocessed_eeg_folder"]}' 335 | ) 336 | raw_eeg_dir = os.path.join(test_folder, task, 'MWFilter_eeg') 337 | # unpack the data 338 | 339 | raw_eeg_data = glob.glob(os.path.join(raw_eeg_dir, '*_mwf.npz')) 340 | for data_path in raw_eeg_data: 341 | print(f'processing {data_path}') 342 | temp_unpack_data(data_path) 343 | 344 | default_log_folder = os.path.dirname(os.path.abspath(__file__)) 345 | # Parse arguments from the command line 346 | parser = 
argparse.ArgumentParser(description="Preprocess the auditory EEG dataset")
347 |     parser.add_argument(
348 |         "--nb_processes",
349 |         type=int,
350 |         default=1,
351 |         help="Number of processes to use for the preprocessing. "
352 |              "Defaults to 1; set to -1 to use all available cores.",
353 |     )
354 |     parser.add_argument(
355 |         "--overwrite", action="store_true", help="Overwrite existing files"
356 |     )
357 |     parser.add_argument(
358 |         "--log_path", type=str, default=os.path.join(
359 |             default_log_folder,
360 |             "sparrKULee_{datetime}.log"
361 |         )
362 |     )
363 |     parser.add_argument(
364 |         "--dataset_folder",
365 |         type=str,
366 |         default=raw_eeg_dir,
367 |         help="Path to the folder where the dataset is downloaded",
368 |     )
369 |
370 |     parser.add_argument(
371 |         "--preprocessed_raw_eeg_path",
372 |         type=str,
373 |         default=preprocessed_eeg_folder,
374 |         help="Path to the folder where the preprocessed EEG will be saved",
375 |     )
376 |     args = parser.parse_args()
377 |
378 |     # Run the preprocessing pipeline
379 |     run_preprocessing_pipeline(
380 |         args.dataset_folder,
381 |         args.preprocessed_raw_eeg_path,
382 |         args.nb_processes,
383 |         args.overwrite,
384 |         args.log_path.format(
385 |             datetime=datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
386 |         ),
387 |     )

--------------------------------------------------------------------------------
/preprocessing_code/sparrKULee.py:
--------------------------------------------------------------------------------
1 | """Run the default preprocessing pipeline on sparrKULee."""
2 | import argparse
3 | import datetime
4 | import gzip
5 | import json
6 | import logging
7 | import os
8 | from typing import Any, Dict, Sequence
9 |
10 | import librosa
11 | import numpy as np
12 | import math
13 | import scipy.signal.windows
14 | from brain_pipe.dataloaders.path import GlobLoader
15 | from brain_pipe.pipeline.default import DefaultPipeline
16 | from brain_pipe.preprocessing.brain.artifact import (
17 |     InterpolateArtifacts,
18 |     ArtifactRemovalMWF,
19 | )
20 | from brain_pipe.preprocessing.brain.eeg.biosemi import (
21 |     biosemi_trigger_processing_fn,
22 | )
23 | from brain_pipe.preprocessing.brain.eeg.load import LoadEEGNumpy
24 | from brain_pipe.preprocessing.brain.epochs import SplitEpochs
25 | from brain_pipe.preprocessing.brain.link import (
26 |     LinkStimulusToBrainResponse,
27 |     BIDSStimulusInfoExtractor,
28 | )
29 | from brain_pipe.preprocessing.brain.rereference import CommonAverageRereference
30 | from brain_pipe.preprocessing.brain.trigger import (
31 |     AlignPeriodicBlockTriggers,
32 | )
33 | from brain_pipe.preprocessing.filter import SosFiltFilt
34 | from brain_pipe.preprocessing.resample import ResamplePoly
35 | from brain_pipe.preprocessing.stimulus.audio.envelope import GammatoneEnvelope
36 | from brain_pipe.preprocessing.stimulus.audio.spectrogram import LibrosaMelSpectrogram
37 |
38 | from brain_pipe.preprocessing.stimulus.load import LoadStimuli
39 | from brain_pipe.runner.default import DefaultRunner
40 | from brain_pipe.save.default import DefaultSave
41 | # from mel import DefaultSave
42 | from brain_pipe.utils.log import default_logging, DefaultFormatter
43 | from brain_pipe.utils.path import BIDSStimulusGrouper
44 |
45 |
46 | class BIDSAPRStimulusInfoExtractor(BIDSStimulusInfoExtractor):
47 |     """Extract BIDS compliant stimulus information from an .apr file."""
48 |
49 |     def __call__(self, brain_dict: Dict[str, Any]):
50 |         """Extract BIDS compliant stimulus information from an events.tsv file.
51 | 52 | Parameters 53 | ---------- 54 | brain_dict: Dict[str, Any] 55 | The data dict containing the brain data path. 56 | 57 | Returns 58 | ------- 59 | Sequence[Dict[str, Any]] 60 | The extracted event information. Each dict contains the information 61 | of one row in the events.tsv file 62 | """ 63 | event_info = super().__call__(brain_dict) 64 | # Find the apr file 65 | path = brain_dict[self.brain_path_key] 66 | apr_path = "_".join(path.split("_")[:-1]) + "_eeg.apr" 67 | # Read apr file 68 | apr_data = self.get_apr_data(apr_path) 69 | # Add apr data to event info 70 | for e_i in event_info: 71 | e_i.update(apr_data) 72 | return event_info 73 | 74 | def get_apr_data(self, apr_path: str): 75 | """Get the SNR from an .apr file. 76 | 77 | Parameters 78 | ---------- 79 | apr_path: str 80 | Path to the .apr file. 81 | 82 | Returns 83 | ------- 84 | Dict[str, Any] 85 | The SNR. 86 | """ 87 | import xml.etree.ElementTree as ET 88 | 89 | apr_data = {} 90 | tree = ET.parse(apr_path) 91 | root = tree.getroot() 92 | 93 | # Get SNR 94 | interactive_elements = root.findall(".//interactive/entry") 95 | for element in interactive_elements: 96 | description_element = element.find("description") 97 | if description_element.text == "SNR": 98 | apr_data["snr"] = element.find("new_value").text 99 | if "snr" not in apr_data: 100 | logging.warning(f"Could not find SNR in {apr_path}.") 101 | apr_data["snr"] = 100.0 102 | return apr_data 103 | 104 | 105 | def default_librosa_load_fn(path): 106 | """Load a stimulus using librosa. 107 | 108 | Parameters 109 | ---------- 110 | path: str 111 | Path to the audio file. 112 | 113 | Returns 114 | ------- 115 | Dict[str, Any] 116 | The data and the sampling rate. 117 | """ 118 | data, sr = librosa.load(path, sr=None) 119 | return {"data": data, "sr": sr} 120 | 121 | 122 | def default_npz_load_fn(path): 123 | """Load a stimulus from a .npz file. 124 | 125 | Parameters 126 | ---------- 127 | path: str 128 | Path to the .npz file. 129 | 130 | Returns 131 | ------- 132 | Dict[str, Any] 133 | The data and the sampling rate. 134 | """ 135 | np_data = np.load(path) 136 | return { 137 | "data": np_data["audio"], 138 | "sr": np_data["fs"], 139 | } 140 | 141 | 142 | DEFAULT_LOAD_FNS = { 143 | ".wav": default_librosa_load_fn, 144 | ".mp3": default_librosa_load_fn, 145 | ".npz": default_npz_load_fn, 146 | } 147 | 148 | 149 | def temp_stimulus_load_fn(path): 150 | """Load stimuli from (Gzipped) files. 151 | 152 | Parameters 153 | ---------- 154 | path: str 155 | Path to the stimulus file. 156 | 157 | Returns 158 | ------- 159 | Dict[str, Any] 160 | Dict containing the data under the key "data" and the sampling rate 161 | under the key "sr". 162 | """ 163 | if path.endswith(".gz"): 164 | with gzip.open(path, "rb") as f_in: 165 | data = dict(np.load(f_in)) 166 | return { 167 | "data": data["audio"], 168 | "sr": data["fs"], 169 | } 170 | 171 | extension = "." + ".".join(path.split(".")[1:]) 172 | if extension not in DEFAULT_LOAD_FNS: 173 | raise ValueError( 174 | f"Can't find a load function for extension {extension}. " 175 | f"Available extensions are {str(list(DEFAULT_LOAD_FNS.keys()))}." 176 | ) 177 | load_fn = DEFAULT_LOAD_FNS[extension] 178 | return load_fn(path) 179 | 180 | 181 | def bids_filename_fn(data_dict, feature_name, set_name=None): 182 | """Default function to generate a filename for the data. 183 | 184 | Parameters 185 | ---------- 186 | data_dict: Dict[str, Any] 187 | The data dict containing the data to save. 188 | feature_name: str 189 | The name of the feature. 
190 | set_name: Optional[str] 191 | The name of the set. If no set name is given, the set name is not 192 | included in the filename. 193 | 194 | Returns 195 | ------- 196 | str 197 | The filename. 198 | """ 199 | 200 | filename = os.path.basename(data_dict["data_path"]).split("_eeg")[0] 201 | 202 | subject = filename.split("_")[0] 203 | session = filename.split("_")[1] 204 | filename += f"_desc-preproc-audio-{os.path.basename(data_dict.get('stimulus_path', '*.')).split('.')[0]}_{feature_name}" 205 | 206 | if set_name is not None: 207 | filename += f"_set-{set_name}" 208 | 209 | return os.path.join(subject, session, filename + ".npy") 210 | 211 | def get_hop_length(arg, data_dict): 212 | return int((1 / 128) * data_dict["stimulus_sr"]) 213 | def get_n_fft(arg, data_dict): 214 | return int(math.pow(2, math.ceil(math.log2(int(0.025 * data_dict["stimulus_sr"]))))) 215 | def get_win_length(arg, data_dict): 216 | return int(0.025 * data_dict["stimulus_sr"]) 217 | 218 | def get_default_librosa_kwargs(): 219 | 220 | librosa_kwargs = { 221 | "window": 'hann', 222 | "hop_length": get_hop_length, 223 | "n_fft": get_n_fft, 224 | "win_length": get_win_length, 225 | "fmin": 0, 226 | "fmax": 5000, 227 | "htk": False, 228 | "n_mels": 10, 229 | "center": False, 230 | "norm": 'slaney' 231 | } 232 | return librosa_kwargs 233 | 234 | def run_preprocessing_pipeline( 235 | root_dir, 236 | preprocessed_stimuli_dir, 237 | preprocessed_eeg_dir, 238 | nb_processes=4, 239 | overwrite=False, 240 | log_path="sparrKULee.log", 241 | ): 242 | """Construct and run the preprocessing on SparrKULee. 243 | 244 | Parameters 245 | ---------- 246 | root_dir: str 247 | The root directory of the dataset. 248 | preprocessed_stimuli_dir: 249 | The directory where the preprocessed stimuli should be saved. 250 | preprocessed_eeg_dir: 251 | The directory where the preprocessed EEG should be saved. 252 | nb_processes: int 253 | The number of processes to use. If -1, the number of processes is 254 | automatically determined. 255 | overwrite: bool 256 | Whether to overwrite existing files. 257 | log_path: str 258 | The path to the log file. 
259 | """ 260 | ######### 261 | # PATHS # 262 | ######### 263 | os.makedirs(preprocessed_eeg_dir, exist_ok=True) 264 | os.makedirs(preprocessed_stimuli_dir, exist_ok=True) 265 | 266 | ########### 267 | # LOGGING # 268 | ########### 269 | handler = logging.FileHandler(log_path) 270 | handler.setLevel(logging.DEBUG) 271 | handler.setFormatter(DefaultFormatter()) 272 | default_logging(handlers=[handler]) 273 | 274 | ################ 275 | # DATA LOADING # 276 | ################ 277 | logging.info("Retrieving BIDS layout...") 278 | data_loader = GlobLoader( 279 | [os.path.join(root_dir, "sub-*", "*", "eeg", "*.bdf*")], 280 | filter_fns=[lambda x: "restingState" not in x], 281 | key="data_path", 282 | ) 283 | 284 | ######### 285 | # STEPS # 286 | ######### 287 | 288 | stimulus_steps = DefaultPipeline( 289 | steps=[ 290 | LoadStimuli(load_fn=temp_stimulus_load_fn), 291 | GammatoneEnvelope(), 292 | LibrosaMelSpectrogram(librosa_kwargs=get_default_librosa_kwargs()), 293 | ResamplePoly(64, data_key = ['spectrogram_data', 'envelope_data'], sampling_frequency_key = ['spectrogram_sr', 'stimulus_sr'], axis=0), 294 | DefaultSave( 295 | preprocessed_stimuli_dir, 296 | to_save={'mel': 'spectrogram_data', 'envelope': 'envelope_data' }, 297 | overwrite=overwrite 298 | ), 299 | DefaultSave(preprocessed_stimuli_dir, overwrite=overwrite), 300 | ], 301 | on_error=DefaultPipeline.RAISE, 302 | ) 303 | 304 | eeg_steps = [ 305 | LinkStimulusToBrainResponse( 306 | stimulus_data=stimulus_steps, 307 | extract_stimuli_information_fn=BIDSAPRStimulusInfoExtractor(), 308 | grouper=BIDSStimulusGrouper( 309 | bids_root=root_dir, 310 | mapping={"stim_file": "stimulus_path", "trigger_file": "trigger_path"}, 311 | subfolders=["stimuli", "eeg"], 312 | ), 313 | ), 314 | LoadEEGNumpy(unit_multiplier=1e6, channels_to_select=list(range(64))), 315 | SosFiltFilt( 316 | scipy.signal.butter(1, 0.5, "highpass", fs=1024, output="sos"), 317 | emulate_matlab=True, 318 | axis=1, 319 | ), 320 | InterpolateArtifacts(), 321 | AlignPeriodicBlockTriggers(biosemi_trigger_processing_fn), 322 | SplitEpochs(), 323 | ArtifactRemovalMWF(), 324 | CommonAverageRereference(), 325 | ResamplePoly(64, axis=1), 326 | DefaultSave( 327 | preprocessed_eeg_dir, 328 | {"eeg": "data"}, 329 | overwrite=overwrite, 330 | clear_output=True, 331 | filename_fn=bids_filename_fn, 332 | ), 333 | ] 334 | 335 | ######################### 336 | # RUNNING THE PIPELINE # 337 | ######################### 338 | 339 | logging.info("Starting with the EEG preprocessing") 340 | logging.info("===================================") 341 | 342 | # Create data_dicts for the EEG files 343 | # Create the EEG pipeline 344 | eeg_pipeline = DefaultPipeline(steps=eeg_steps) 345 | 346 | DefaultRunner( 347 | nb_processes=nb_processes, 348 | logging_config=lambda: None, 349 | ).run( 350 | [(data_loader, eeg_pipeline)], 351 | 352 | ) 353 | 354 | 355 | if __name__ == "__main__": 356 | # Load the config 357 | # get the top folder of the dataset 358 | challenge_folder = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 359 | with open(os.path.join(challenge_folder, 'util', 'config.json'), "r") as f: 360 | config = json.load(f) 361 | 362 | # Set the correct paths as default arguments 363 | dataset_folder = config["dataset_folder"] 364 | derivatives_folder = os.path.join(dataset_folder, config["derivatives_folder"]) 365 | preprocessed_stimuli_folder = os.path.join( 366 | derivatives_folder, config["preprocessed_stimuli_folder"] 367 | ) 368 | preprocessed_eeg_folder = os.path.join( 369 | 
derivatives_folder, config["preprocessed_eeg_folder"] 370 | ) 371 | default_log_folder = os.path.dirname(os.path.abspath(__file__)) 372 | 373 | # Parse arguments from the command line 374 | parser = argparse.ArgumentParser(description="Preprocess the auditory EEG dataset") 375 | parser.add_argument( 376 | "--nb_processes", 377 | type=int, 378 | default=-1, 379 | help="Number of processes to use for the preprocessing. " 380 | "The default is to use all available cores (-1).", 381 | ) 382 | parser.add_argument( 383 | "--overwrite", action="store_true", help="Overwrite existing files" 384 | ) 385 | parser.add_argument( 386 | "--log_path", type=str, default=os.path.join( 387 | default_log_folder, 388 | "sparrKULee_{datetime}.log" 389 | ) 390 | ) 391 | parser.add_argument( 392 | "--dataset_folder", 393 | type=str, 394 | default=dataset_folder, 395 | help="Path to the folder where the dataset is downloaded", 396 | ) 397 | parser.add_argument( 398 | "--preprocessed_stimuli_path", 399 | type=str, 400 | default=preprocessed_stimuli_folder, 401 | help="Path to the folder where the preprocessed stimuli will be saved", 402 | ) 403 | parser.add_argument( 404 | "--preprocessed_eeg_path", 405 | type=str, 406 | default=preprocessed_eeg_folder, 407 | help="Path to the folder where the preprocessed EEG will be saved", 408 | ) 409 | args = parser.parse_args() 410 | 411 | # Run the preprocessing pipeline 412 | run_preprocessing_pipeline( 413 | args.dataset_folder, 414 | args.preprocessed_stimuli_path, 415 | args.preprocessed_eeg_path, 416 | args.nb_processes, 417 | args.overwrite, 418 | args.log_path.format( 419 | datetime=datetime.datetime.now().strftime("%Y%m%d_%H%M%S") 420 | ), 421 | ) 422 | -------------------------------------------------------------------------------- /preprocessing_code/split_and_normalize.py: -------------------------------------------------------------------------------- 1 | """Split data in sets and normalize (per recording).""" 2 | import glob 3 | import json 4 | import os 5 | import pickle 6 | 7 | 8 | import numpy as np 9 | 10 | 11 | if __name__ == "__main__": 12 | 13 | # Arguments for splitting and normalizing 14 | speech_features = ['envelope', 'mel'] 15 | splits = [80, 10, 10] 16 | split_names = ['train', 'val', 'test'] 17 | overwrite = False 18 | 19 | # Calculate the split fraction 20 | split_fractions = [x/sum(splits) for x in splits] 21 | 22 | # Get the path to the config file 23 | task_folder = os.path.dirname(os.path.dirname(__file__)) 24 | config_path = os.path.join(task_folder, 'util', 'config.json') 25 | 26 | # Load the config 27 | with open(config_path) as fp: 28 | config = json.load(fp) 29 | 30 | # Construct the necessary paths 31 | processed_eeg_folder = os.path.join(config["dataset_folder"],config['derivatives_folder'], f"{config['preprocessed_eeg_folder']}") 32 | processed_stimuli_folder = os.path.join(config["dataset_folder"],config['derivatives_folder'], f"{config['preprocessed_stimuli_folder']}") 33 | split_data_folder = os.path.join(config["dataset_folder"],config['derivatives_folder'], config["split_folder"]) 34 | 35 | # Create the output folder 36 | os.makedirs(split_data_folder, exist_ok=True) 37 | 38 | # Find all subjects 39 | all_subjects = glob.glob(os.path.join(processed_eeg_folder, "sub*")) 40 | nb_subjects = len(all_subjects) 41 | print(f"Found {nb_subjects} subjects to split/normalize") 42 | 43 | # Loop over subjects 44 | for subject_index, subject_path in enumerate(all_subjects): 45 | subject = os.path.basename(subject_path) 46 | print(f"Starting 
with subject {subject} ({subject_index + 1}/{nb_subjects})...") 47 | # Find all recordings 48 | all_recordings = glob.glob(os.path.join(subject_path, "*", "*.npy")) 49 | print(f"\tFound {len(all_recordings)} recordings for subject {subject}.") 50 | # Loop over recordings 51 | for recording_index, recording in enumerate(all_recordings): 52 | print(f"\tStarting with recording {recording} ({recording_index + 1}/{len(all_recordings)})...") 53 | 54 | # Load EEG from disk 55 | print(f"\t\tLoading EEG for {recording}") 56 | eeg = np.load(recording) 57 | 58 | # swap axes to have time as first dimension 59 | eeg = np.swapaxes(eeg, 0, 1) 60 | 61 | # keep only the 64 channels 62 | eeg = eeg[:, :64] 63 | 64 | # retrieve the stimulus name from the filename 65 | stimulus_filename = recording.split('_eeg.')[0].split('-audio-')[1] 66 | 67 | # Retrieve EEG data and pointer to the stimulus 68 | shortest_length = eeg.shape[0] 69 | 70 | # Create mapping between feature name and feature data 71 | all_data_for_recording = {"eeg": eeg} 72 | 73 | # Find corresponding stimuli for the EEG recording 74 | for feature_name in speech_features: 75 | # Load feature from disk 76 | print(f"\t\tLoading {feature_name} for recording {recording} ") 77 | stimulus_feature_path = os.path.join( 78 | processed_stimuli_folder, 79 | stimulus_filename + "_-_" + feature_name + ".npy", 80 | ) 81 | feature = np.load(stimulus_feature_path) 82 | # Calculate the shortest length 83 | shortest_length = min(feature.shape[0], shortest_length) 84 | # Update all_data_for_recording 85 | all_data_for_recording[feature_name] = feature 86 | 87 | # Do the actual splitting 88 | print(f"\t\tSplitting/normalizing recording {recording}...") 89 | for feature_name, feature in all_data_for_recording.items(): 90 | start_index = 0 91 | feature_mean = None 92 | feature_std = None 93 | 94 | for split_name, split_fraction in zip(split_names, split_fractions): 95 | end_index = start_index + int(shortest_length * split_fraction) 96 | 97 | # Cut the feature to the shortest length 98 | cut_feature = feature[start_index:end_index, ...] 
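# NOTE: feature_mean/feature_std are computed once, on the first fraction
# processed (the training split), and then reused for the validation and
# test fractions, so no statistics from the held-out sets leak into the
# normalization.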
99 |
100 |                 # Normalize the feature
101 |                 if feature_mean is None:
102 |                     feature_mean = np.mean(cut_feature, axis=0)
103 |                     feature_std = np.std(cut_feature, axis=0)
104 |                 norm_feature = (cut_feature - feature_mean) / feature_std
105 |
106 |                 # Save the normalized feature
107 |                 save_filename = f"{split_name}_-_{subject}_-_{stimulus_filename}_-_{feature_name}.npy"
108 |                 save_path = os.path.join(split_data_folder, save_filename)
109 |                 if not os.path.exists(save_path) or overwrite:
110 |                     np.save(save_path, norm_feature)
111 |                 else:
112 |                     print(f"\t\tSkipping {save_filename} because it already exists")
113 |                 start_index = end_index

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | tensorflow >=2.4.0
2 | numpy
3 | scipy
4 | brian2
5 | brian2hears
6 | librosa
7 | brain_pipe

--------------------------------------------------------------------------------
/task1_match_mismatch/__init__.py:
--------------------------------------------------------------------------------
1 | """Code for task 1 of the Auditory EEG ICASSP challenge."""
2 |

--------------------------------------------------------------------------------
/task1_match_mismatch/experiments/__init__.py:
--------------------------------------------------------------------------------
1 | """Experiments for task1."""
2 |

--------------------------------------------------------------------------------
/task1_match_mismatch/experiments/dilated_convolutional_model.py:
--------------------------------------------------------------------------------
1 | """Example experiment for the mismatched-segments dilation model."""
2 | import glob
3 | import json
4 | import logging
5 | import os, sys
6 | import tensorflow as tf
7 |
8 |
9 | # add base path to sys
10 | sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..'))
11 | from task1_match_mismatch.models.dilated_convolutional_model import dilation_model
12 |
13 | from util.dataset_generator import DataGenerator, batch_equalizer_fn, create_tf_dataset
14 |
15 |
16 | def evaluate_model(model, test_dict):
17 |     """Evaluate a model.
18 |
19 |     Parameters
20 |     ----------
21 |     model: tf.keras.Model
22 |         Model to evaluate.
23 |     test_dict: dict
24 |         Mapping between a subject and a tf.data.Dataset containing the test
25 |         set for the subject.

    Returns
    -------
    dict
        Mapping between a subject and the loss/evaluation score on the test set
    """
    evaluation = {}
    for subject, ds_test in test_dict.items():
        logging.info(f"Scores for subject {subject}:")
        results = model.evaluate(ds_test, verbose=2)
        metrics = model.metrics_names
        evaluation[subject] = dict(zip(metrics, results))
    return evaluation


if __name__ == "__main__":
    # Parameters
    # Length of the decision window
    window_length_s = 5
    fs = 64

    window_length = window_length_s * fs  # 5 seconds
    # Hop length between two consecutive decision windows
    hop_length = 64

    epochs = 100
    patience = 5
    batch_size = 64
    only_evaluate = True
    number_mismatch = 4  # 2 or 4

    training_log_filename = "training_log_{}_{}.csv".format(number_mismatch, window_length_s)

    # Get the path to the config file
    experiments_folder = os.path.dirname(__file__)
    task_folder = os.path.dirname(experiments_folder)
    util_folder = os.path.join(os.path.dirname(task_folder), "util")
    config_path = os.path.join(util_folder, 'config.json')

    # Load the config
    with open(config_path) as fp:
        config = json.load(fp)

    # Provide the path of the dataset,
    # which is already split into train, val and test
    data_folder = os.path.join(config["dataset_folder"], config['derivatives_folder'], config["split_folder"])

    # Stimulus feature which will be used for training the model. Can be either
    # 'envelope' (dimension 1) or 'mel' (dimension 10)
    stimulus_features = ["envelope"]
    stimulus_dimension = 1

    # Uncomment if you want to train with the mel spectrogram stimulus representation
    # stimulus_features = ["mel"]
    # stimulus_dimension = 10

    features = ["eeg"] + stimulus_features

    # Create a directory to store (intermediate) results
    results_folder = os.path.join(experiments_folder, "results_dilated_convolutional_model_{}_MM_{}_s_{}".format(number_mismatch, window_length_s, stimulus_features[0]))
    os.makedirs(results_folder, exist_ok=True)

    # Create the dilation model
    model = dilation_model(time_window=window_length, eeg_input_dimension=64, env_input_dimension=stimulus_dimension, num_mismatched_segments=number_mismatch)

    model_path = os.path.join(results_folder, "model_{}_MM_{}_s_{}.h5".format(number_mismatch, window_length_s, stimulus_features[0]))

    if only_evaluate:
        model = tf.keras.models.load_model(model_path)
    else:
        train_files = [x for x in glob.glob(os.path.join(data_folder, "train_-_*")) if os.path.basename(x).split("_-_")[-1].split(".")[0] in features]
        # Create the generator for the training set
        train_generator = DataGenerator(train_files, window_length)
        dataset_train = create_tf_dataset(train_generator, window_length, batch_equalizer_fn,
                                          hop_length, batch_size,
                                          number_mismatch=number_mismatch,
                                          data_types=(tf.float32, tf.float32),
                                          feature_dims=(64, stimulus_dimension))

        # Create the generator for the validation set
        val_files = [x for x in glob.glob(os.path.join(data_folder, "val_-_*")) if os.path.basename(x).split("_-_")[-1].split(".")[0] in features]
        val_generator = DataGenerator(val_files, window_length)
        dataset_val = create_tf_dataset(val_generator, window_length, batch_equalizer_fn,
                                        hop_length, batch_size,
                                        number_mismatch=number_mismatch,
                                        data_types=(tf.float32, tf.float32),
                                        feature_dims=(64, stimulus_dimension))

        # Train the model
        model.fit(
            dataset_train,
            epochs=epochs,
            validation_data=dataset_val,
            callbacks=[
                tf.keras.callbacks.ModelCheckpoint(model_path, save_best_only=True),
                tf.keras.callbacks.CSVLogger(os.path.join(results_folder, training_log_filename)),
                tf.keras.callbacks.EarlyStopping(patience=patience, restore_best_weights=True),
            ],
        )

    test_window_lengths = [3, 5]
    number_mismatch_test = [2, 3, 4, 8]
    for number_mismatch in number_mismatch_test:
        for window_length_s in test_window_lengths:
            window_length = window_length_s * fs
            results_filename = 'eval_{}_{}_s.json'.format(number_mismatch, window_length_s)

            # Rebuild the model for the current test configuration and load
            # the trained weights
            model = dilation_model(time_window=window_length, eeg_input_dimension=64,
                                   env_input_dimension=stimulus_dimension, num_mismatched_segments=number_mismatch)
            model.load_weights(model_path)

            # Evaluate the model on the test set:
            # create a dataset generator for each test subject
            test_files = [x for x in glob.glob(os.path.join(data_folder, "test_-_*")) if
                          os.path.basename(x).split("_-_")[-1].split(".")[0] in features]
            # Get all different subjects from the test set
            subjects = list(set([os.path.basename(x).split("_-_")[1] for x in test_files]))
            datasets_test = {}
            # Create a generator for each subject
            for sub in subjects:
                files_test_sub = [f for f in test_files if sub in os.path.basename(f)]
                test_generator = DataGenerator(files_test_sub, window_length)
                datasets_test[sub] = create_tf_dataset(test_generator, window_length, batch_equalizer_fn,
                                                       hop_length, batch_size=1,
                                                       number_mismatch=number_mismatch,
                                                       data_types=(tf.float32, tf.float32),
                                                       feature_dims=(64, stimulus_dimension))

            evaluation = evaluate_model(model, datasets_test)

            # We can save our results in a json encoded file
            results_path = os.path.join(results_folder, results_filename)
            with open(results_path, "w") as fp:
                json.dump(evaluation, fp)
            logging.info(f"Results saved at {results_path}")
--------------------------------------------------------------------------------
/task1_match_mismatch/experiments/test_match_mismatch.py:
--------------------------------------------------------------------------------
"""
Sample code to generate labels for the test dataset of the
match-mismatch task. The requested format for submitting the labels is
as follows:
for each subject a json file containing a python dictionary in the
format of ==> {'sample_id': prediction, ... }.
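Each prediction is the integer index (the argmax over the model's softmax
output) of the stimulus segment that the model judges to be the match.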

"""

import glob
import json
import logging
import os
import sys

import numpy as np
import tensorflow as tf

# Add the repository root to sys.path so the task/util packages are importable
sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..'))
from task1_match_mismatch.models.dilated_convolutional_model import dilation_model

from util.dataset_generator import DataGenerator, batch_equalizer_fn, create_tf_dataset


if __name__ == '__main__':

    # Parameters
    # Length of the decision window
    window_length_s = 5
    fs = 64

    window_length = window_length_s * fs  # 5 seconds
    # Hop length between two consecutive decision windows
    hop_length = 64

    epochs = 100
    patience = 5
    batch_size = 64
    number_mismatch = 4  # 2 or 4

    # Get the path to the config file
    experiments_folder = os.path.dirname(__file__)
    task_folder = os.path.dirname(experiments_folder)
    util_folder = os.path.join(os.path.dirname(task_folder), "util")
    config_path = os.path.join(util_folder, 'config.json')

    # Load the config
    with open(config_path) as fp:
        config = json.load(fp)

    # Provide the path of the test dataset
    data_folder = os.path.join(config["dataset_folder"], config["test_folder"], 'TASK1_match_mismatch')
    eeg_folder = os.path.join(data_folder, 'preprocessed_eeg')
    stimulus_folder = os.path.join(data_folder, 'stimulus')

    # # Stimulus feature which will be used by the model. Can be either
    # # 'envelope' (dimension 1) or 'mel' (dimension 10)
    # stimulus_features = ["envelope"]
    # stimulus_dimension = 1

    # Uncomment if you want to use the mel spectrogram stimulus representation
    stimulus_features = ["mel"]
    stimulus_dimension = 10

    features = ["eeg"] + stimulus_features

    # Directory that stores the (intermediate) results
    results_folder = os.path.join(experiments_folder,
                                  "results_dilated_convolutional_model_{}_MM_{}_s_{}".format(number_mismatch,
                                                                                             window_length_s,
                                                                                             stimulus_features[0]))

    # Create the dilation model
    model = dilation_model(time_window=window_length, eeg_input_dimension=64, env_input_dimension=stimulus_dimension,
                           num_mismatched_segments=number_mismatch)

    model_path = os.path.join(results_folder,
                              "model_{}_MM_{}_s_{}.h5".format(number_mismatch, window_length_s, stimulus_features[0]))
    model.load_weights(model_path)

    test_eeg_mapping = glob.glob(os.path.join(data_folder, 'sub*mapping.json'))

    test_stimuli = glob.glob(os.path.join(stimulus_folder, f'*{stimulus_features[0]}*chunks.npz'))

    # Load all test stimuli
    test_stimuli_data = {}
    for stimulus_path in test_stimuli:
        test_stimuli_data = dict(test_stimuli_data, **np.load(stimulus_path))

    for sub_stimulus_mapping in test_eeg_mapping:
        subject = os.path.basename(sub_stimulus_mapping).split('_')[0]

        # Load the stimulus mapping
        sub_stimulus_mapping = json.load(open(sub_stimulus_mapping))

        # Load the EEG data
        sub_path = os.path.join(eeg_folder, f'{subject}_eeg.npz')
        sub_eeg_data = dict(np.load(sub_path))

        data_eeg = np.stack([[sub_eeg_data[value['eeg']]] for key, value in sub_stimulus_mapping.items()])
        # Swap dim 0 and 1 of the EEG and unstack
        data_eeg = np.swapaxes(data_eeg, 0, 1)
        data_eeg = list(data_eeg)

        data_stimuli = np.stack([[test_stimuli_data[x] for x in value['stimulus']] for key, value in sub_stimulus_mapping.items()])
        # Swap dim 0 and 1 of the stimulus and unstack
        data_stimuli = np.swapaxes(data_stimuli, 0, 1)
        data_stimuli = list(data_stimuli)

        id_list = list(sub_stimulus_mapping.keys())

        predictions = model.predict(data_eeg + data_stimuli)
        labels = np.argmax(predictions, axis=1)

        sub = dict(zip(id_list, [int(x) for x in labels]))

        prediction_dir = os.path.join(os.path.dirname(__file__), 'predictions')
        os.makedirs(prediction_dir, exist_ok=True)
        with open(os.path.join(prediction_dir, subject + '.json'), 'w') as f:
            json.dump(sub, f)
--------------------------------------------------------------------------------
/task1_match_mismatch/models/__init__.py:
--------------------------------------------------------------------------------
"""Models for task1."""
--------------------------------------------------------------------------------
/task1_match_mismatch/models/dilated_convolutional_model.py:
--------------------------------------------------------------------------------
"""Dilation model for the match/mismatch task."""
import tensorflow as tf


def dilation_model(
    time_window=None,
    eeg_input_dimension=64,
    env_input_dimension=1,
    layers=3,
    kernel_size=3,
    spatial_filters=8,
    dilation_filters=16,
    activation="relu",
    compile=True,
    num_mismatched_segments=2
):
    """Convolutional dilation model.

    Code was taken and adapted from
    https://github.com/exporl/eeg-matching-eusipco2020

    Parameters
    ----------
    time_window : int or None
        Segment length. If None, the model will accept every time window input
        length.
    eeg_input_dimension : int
        Number of channels of the EEG.
    env_input_dimension : int
        Dimension of the stimulus representation:
        1 for the envelope, 10 for the mel spectrogram used in this codebase.
    layers : int
        Depth of the network/number of layers.
    kernel_size : int
        Size of the kernel for the dilation convolutions.
    spatial_filters : int
        Number of parallel filters to use in the spatial layer.
    dilation_filters : int
        Number of parallel filters to use in the dilation layers.
    activation : str or list or tuple
        Name of the non-linearity to apply after the dilation layers,
        or list/tuple of different non-linearities.
    compile : bool
        Whether the model should be compiled.
    num_mismatched_segments : int
        Number of mismatched (imposter) stimulus segments per matched segment.

    Returns
    -------
    tf.keras.Model
        The dilation model


    References
    ----------
    Accou, B., Jalilpour Monesi, M., Montoya, J., Van hamme, H. & Francart, T.
    Modeling the relationship between acoustic stimulus and EEG with a dilated
    convolutional neural network. In 2020 28th European Signal Processing
    Conference (EUSIPCO), 1175-1179, DOI: 10.23919/Eusipco47968.2020.9287417
    (2021). ISSN: 2076-1465.

    Accou, B., Monesi, M. J., hamme, H. V. & Francart, T.
    Predicting speech intelligibility from EEG in a non-linear classification
    paradigm. J. Neural Eng. 18, 066008, DOI: 10.1088/1741-2552/ac33e9 (2021).
    Publisher: IOP Publishing
    """

    eeg = tf.keras.layers.Input(shape=[time_window, eeg_input_dimension])
    stimuli_input = [tf.keras.layers.Input(shape=[time_window, env_input_dimension]) for _ in range(num_mismatched_segments + 1)]

    all_inputs = [eeg]
    all_inputs.extend(stimuli_input)

    stimuli_proj = [x for x in stimuli_input]

    # Activations to apply
    if isinstance(activation, str):
        activations = [activation] * layers
    else:
        activations = activation

    # Spatial convolution
    eeg_proj_1 = tf.keras.layers.Conv1D(spatial_filters, kernel_size=1)(eeg)

    # Construct dilation layers
    for layer_index in range(layers):
        # Dilation on EEG
        eeg_proj_1 = tf.keras.layers.Conv1D(
            dilation_filters,
            kernel_size=kernel_size,
            dilation_rate=kernel_size ** layer_index,
            strides=1,
            activation=activations[layer_index],
        )(eeg_proj_1)

        # Dilation on envelope data, with weights shared between all stimuli
        env_proj_layer = tf.keras.layers.Conv1D(
            dilation_filters,
            kernel_size=kernel_size,
            dilation_rate=kernel_size ** layer_index,
            strides=1,
            activation=activations[layer_index],
        )

        stimuli_proj = [env_proj_layer(stimulus_proj) for stimulus_proj in stimuli_proj]

    # Comparison: cosine similarity between the EEG and each stimulus projection
    cos = [tf.keras.layers.Dot(1, normalize=True)([eeg_proj_1, stimulus_proj]) for stimulus_proj in stimuli_proj]

    linear_proj_sim = tf.keras.layers.Dense(1, activation="linear")

    # Linear projection of similarity matrices
    cos_proj = [linear_proj_sim(tf.keras.layers.Flatten()(cos_i)) for cos_i in cos]

    # Classification
    out = tf.keras.activations.softmax(tf.keras.layers.Concatenate()(cos_proj))

    model = tf.keras.Model(inputs=all_inputs, outputs=[out])

    if compile:
        model.compile(
            optimizer=tf.keras.optimizers.Adam(),
            metrics=["accuracy"],
            loss=["categorical_crossentropy"],
        )
        model.summary()
    return model
--------------------------------------------------------------------------------
/task2_regression/__init__.py:
--------------------------------------------------------------------------------
"""Code for task 2 of the Auditory EEG ICASSP challenge."""
--------------------------------------------------------------------------------
/task2_regression/experiments/__init__.py:
--------------------------------------------------------------------------------
"""Experiments for task2."""
--------------------------------------------------------------------------------
/task2_regression/experiments/linear_baseline.py:
--------------------------------------------------------------------------------
"""Example experiment for a linear baseline method."""
import glob
import json
import logging
import os

# Set GPU thread mode to private
os.environ['TF_GPU_THREAD_MODE'] = 'gpu_private'

import tensorflow as tf

import sys
# Add the repository root to sys.path so the task/util packages are importable
sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..'))

import numpy as np

from task2_regression.models.linear import simple_linear_model, pearson_loss_cut, pearson_metric_cut, pearson_metric_cut_non_averaged
from util.dataset_generator import DataGenerator, create_tf_dataset


def evaluate_model(model, test_dict):
    """Evaluate a model.
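
    For each subject, this computes the compiled loss/metrics via
    model.evaluate, as well as the per-mel-band Pearson correlation of the
    reconstructions.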

    Parameters
    ----------
    model: tf.keras.Model
        Model to evaluate.
    test_dict: dict
        Mapping between a subject and a tf.data.Dataset containing the test
        set for the subject.

    Returns
    -------
    dict
        Mapping between a subject and the loss/evaluation score on the test set
    """
    evaluation = {}
    for subject, ds_test in test_dict.items():
        logging.info(f"Scores for subject {subject}:")
        # Collect the full test set for this subject
        ds = [x for x in ds_test]
        eeg = tf.concat([x[0] for x in ds], axis=0)
        labels = tf.concat([x[1] for x in ds], axis=0)

        reconstructions = model.predict(eeg)
        correlations = np.squeeze(pearson_metric_cut_non_averaged(labels, reconstructions))

        # Evaluate with the compiled loss/metrics
        results = model.evaluate(ds_test, verbose=2)
        metrics = model.metrics_names
        evaluation[subject] = dict(zip(metrics, results))

        # Pearson correlation per mel band, averaged over windows
        evaluation[subject]["pearson_correlation_per_band"] = np.mean(correlations, axis=0).tolist()
    return evaluation


if __name__ == "__main__":
    # Parameters
    # Length of the decision window
    fs = 64
    window_length = 60 * fs  # 60 seconds
    # Hop length between two consecutive decision windows
    hop_length = 30 * fs
    epochs = 100
    patience = 5
    batch_size = 64
    only_evaluate = True

    # Get the path to the config file
    experiments_folder = os.path.dirname(__file__)
    task_folder = os.path.dirname(experiments_folder)
    util_folder = os.path.join(os.path.dirname(task_folder), "util")
    config_path = os.path.join(util_folder, 'config.json')

    # Load the config
    with open(config_path) as fp:
        config = json.load(fp)

    # Provide the path of the dataset,
    # which is already split into train, val and test
    data_folder = os.path.join(config["dataset_folder"], config["derivatives_folder"], config["split_folder"])
    stimulus_features = ["mel"]
    features = ["eeg"] + stimulus_features

    # Create a directory to store (intermediate) results
    results_folder = os.path.join(experiments_folder, "results_linear_baseline")
    os.makedirs(results_folder, exist_ok=True)

    # Get all different subjects from the training set
    all_subs = list(
        set([os.path.basename(x).split("_-_")[1] for x in glob.glob(os.path.join(data_folder, "train_-_*"))]))

    # Create a simple linear model
    model = simple_linear_model(integration_window=int(fs * 0.25), nb_filters=10)
    model.summary()
    model_path = os.path.join(results_folder, "model.h5")
    training_log_filename = "training_log.csv"
    results_filename = 'eval.json'

    if only_evaluate:
        # Load the pretrained weights
        model.load_weights(model_path)
    else:
        train_files = [x for x in glob.glob(os.path.join(data_folder, "train_-_*")) if os.path.basename(x).split("_-_")[-1].split(".")[0] in features]
        # Create the generator for the training set
        train_generator = DataGenerator(train_files, window_length)
        dataset_train = create_tf_dataset(train_generator, window_length, None, hop_length, batch_size, data_types=(tf.float32, tf.float32), feature_dims=(64, 10))

        # Create the generator for the validation set
        val_files = [x for x in glob.glob(os.path.join(data_folder, "val_-_*")) if os.path.basename(x).split("_-_")[-1].split(".")[0] in features]
        val_generator = DataGenerator(val_files, window_length)
        dataset_val = create_tf_dataset(val_generator, window_length, None, hop_length, batch_size, data_types=(tf.float32, tf.float32), feature_dims=(64, 10))

        # Train the model
        model.fit(
            dataset_train,
            epochs=epochs,
            validation_data=dataset_val,
            callbacks=[
                tf.keras.callbacks.ModelCheckpoint(model_path, save_best_only=True),
                tf.keras.callbacks.CSVLogger(os.path.join(results_folder, training_log_filename)),
                tf.keras.callbacks.EarlyStopping(patience=patience, restore_best_weights=True),
            ],
            workers=tf.data.AUTOTUNE,
            use_multiprocessing=True
        )

    # Evaluate the model on the test set:
    # create a dataset generator for each test subject
    test_files = [x for x in glob.glob(os.path.join(data_folder, "test_-_*")) if os.path.basename(x).split("_-_")[-1].split(".")[0] in features]
    # Get all different subjects from the test set
    subjects = list(set([os.path.basename(x).split("_-_")[1] for x in test_files]))
    datasets_test = {}
    # Create a generator for each subject
    for sub in subjects:
        files_test_sub = [f for f in test_files if sub in os.path.basename(f)]
        test_generator = DataGenerator(files_test_sub, window_length)
        datasets_test[sub] = create_tf_dataset(test_generator, window_length, None, hop_length, batch_size=1, data_types=(tf.float32, tf.float32), feature_dims=(64, 10))

    # Evaluate the model
    evaluation = evaluate_model(model, datasets_test)

    # We can save our results in a json encoded file
    results_path = os.path.join(results_folder, results_filename)
    with open(results_path, "w") as fp:
        json.dump(evaluation, fp)
    logging.info(f"Results saved at {results_path}")
--------------------------------------------------------------------------------
/task2_regression/experiments/test_regression.py:
--------------------------------------------------------------------------------
"""
Sample code to generate test labels (reconstructed envelopes) for
the regression task. The requested format for submitting the reconstructed
envelopes is as follows:
for each subject a json file containing a python dictionary in the
format of ==> {'sample_id': reconstructed_envelope, ... }.
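Note that this baseline reconstructs the 10-band mel spectrogram rather than a
1-dimensional envelope, so each reconstruction is a (time, 10) nested list.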
"""

import glob
import json
import os
import sys

import numpy as np

# Add the repository root to sys.path so the task/util packages are importable
sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..'))

from task2_regression.models.linear import simple_linear_model
# from task2_regression.models.vlaai import vlaai, pearson_loss, pearson_metric, pearson_tf_non_averaged


if __name__ == '__main__':

    # Parameters
    # Length of the decision window
    fs = 64
    window_length_s = 30
    window_length = window_length_s * fs  # 30 seconds

    # Get the path to the config file
    experiments_folder = os.path.dirname(__file__)
    task_folder = os.path.dirname(experiments_folder)
    util_folder = os.path.join(os.path.dirname(task_folder), "util")
    config_path = os.path.join(util_folder, 'config.json')

    # Load the config
    with open(config_path) as fp:
        config = json.load(fp)

    # Provide the path of the test dataset
    data_folder = os.path.join(config["dataset_folder"], config["test_folder"], 'TASK2_regression')
    eeg_folder = os.path.join(data_folder, 'preprocessed_eeg')

    # The mel spectrogram stimulus representation is used for this baseline
    stimulus_features = ["mel"]
    stimulus_dimension = 10

    features = ["eeg"] + stimulus_features

    pretrained_model = os.path.join(os.path.dirname(__file__), 'results_linear_baseline', 'model.h5')

    # Define and load the pretrained model
    model = simple_linear_model(integration_window=int(fs * 0.25), nb_filters=10)
    model.load_weights(pretrained_model)

    test_eeg_mapping = glob.glob(os.path.join(data_folder, 'sub*mapping.json'))

    for sub_stimulus_mapping in test_eeg_mapping:
        subject = os.path.basename(sub_stimulus_mapping).split('_')[0]

        # Load the stimulus mapping
        sub_stimulus_mapping = json.load(open(sub_stimulus_mapping))

        # Load the EEG data
        sub_path = os.path.join(eeg_folder, f'{subject}_eeg.npz')
        sub_eeg_data = dict(np.load(sub_path))

        data_eeg = np.stack([sub_eeg_data[value['eeg']] for key, value in sub_stimulus_mapping.items()])

        id_list = list(sub_stimulus_mapping.keys())

        # Predict the stimulus features from the EEG
        predictions = model.predict(data_eeg)

        # Make predictions json-serializable
        predictions = [np.array(value).tolist() for value in np.squeeze(predictions)]

        # Create dictionary from id_list and predictions
        sub = dict(zip(id_list, predictions))

        prediction_dir = os.path.join(os.path.dirname(__file__), 'predictions')
        os.makedirs(prediction_dir, exist_ok=True)
        with open(os.path.join(prediction_dir, subject + '.json'), 'w') as f:
            json.dump(sub, f)
--------------------------------------------------------------------------------
/task2_regression/experiments/vlaai_mel.py:
--------------------------------------------------------------------------------
"""Example experiment for the VLAAI model."""
import glob
import json
import logging
import os

# Set GPU thread mode to private
os.environ['TF_GPU_THREAD_MODE'] = 'gpu_private'

import tensorflow as tf

import numpy as np
import sys
# Add the repository root to sys.path so the task/util packages are importable
sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..'))

from task2_regression.models.vlaai import vlaai, pearson_loss, pearson_metric, pearson_tf_non_averaged
from util.dataset_generator import DataGenerator, create_tf_dataset


def evaluate_model(model, test_dict):
    """Evaluate a model.

    For each subject, this computes the compiled loss/metrics via
    model.evaluate, as well as the per-mel-band Pearson correlation of the
    reconstructions.

    Parameters
    ----------
    model: tf.keras.Model
        Model to evaluate.
    test_dict: dict
        Mapping between a subject and a tf.data.Dataset containing the test
        set for the subject.

    Returns
    -------
    dict
        Mapping between a subject and the loss/evaluation score on the test set
    """
    evaluation = {}
    for subject, ds_test in test_dict.items():
        logging.info(f"Scores for subject {subject}:")
        # Collect the full test set for this subject
        ds = [x for x in ds_test]
        eeg = tf.concat([x[0] for x in ds], axis=0)
        labels = tf.concat([x[1] for x in ds], axis=0)

        reconstructions = model.predict(eeg)
        correlations = np.squeeze(pearson_tf_non_averaged(labels, reconstructions))

        # Evaluate with the compiled loss/metrics
        results = model.evaluate(ds_test, verbose=2)
        metrics = model.metrics_names
        evaluation[subject] = dict(zip(metrics, results))

        # Pearson correlation per mel band, averaged over windows
        evaluation[subject]["pearson_correlation_per_band"] = np.mean(correlations, axis=0).tolist()
    return evaluation


if __name__ == "__main__":
    # Parameters
    # Length of the decision window
    fs = 64
    window_length = 5 * fs  # 5 seconds
    # Hop length between two consecutive decision windows
    hop_length = 1 * fs
    epochs = 100
    patience = 10
    batch_size = 10
    only_evaluate = True
    training_log_filename = "training_log.csv"
    results_filename = 'eval.json'

    # Get the path to the config file
    experiments_folder = os.path.dirname(__file__)
    task_folder = os.path.dirname(experiments_folder)
    util_folder = os.path.join(os.path.dirname(task_folder), "util")
    config_path = os.path.join(util_folder, 'config.json')

    # Load the config
    with open(config_path) as fp:
        config = json.load(fp)

    # Provide the path of the dataset,
    # which is already split into train, val and test
    data_folder = os.path.join(config["dataset_folder"], config["derivatives_folder"], config["split_folder"])
    stimulus_features = ["mel"]
    features = ["eeg"] + stimulus_features

    # Create a directory to store (intermediate) results
    results_folder = os.path.join(experiments_folder, "results_vlaai_mel")
    os.makedirs(results_folder, exist_ok=True)

    # Create the model
    model = vlaai()
    model.compile(tf.keras.optimizers.Adam(), loss=pearson_loss, metrics=[pearson_metric])
    model_path = os.path.join(results_folder, "model.h5")

    if only_evaluate:
        model.load_weights(model_path)
    else:
        train_files = [x for x in glob.glob(os.path.join(data_folder, "train_-_*")) if os.path.basename(x).split("_-_")[-1].split(".")[0] in features]
        # Create the generator for the training set
        train_generator = DataGenerator(train_files, window_length)
        dataset_train = create_tf_dataset(train_generator, window_length, None, hop_length, batch_size, data_types=(tf.float32, tf.float32), feature_dims=(64, 10))

        # Create the generator for the validation set
        val_files = [x for x in glob.glob(os.path.join(data_folder, "val_-_*")) if os.path.basename(x).split("_-_")[-1].split(".")[0] in features]
        val_generator = DataGenerator(val_files, window_length)
        dataset_val = create_tf_dataset(val_generator, window_length, None, hop_length, batch_size, data_types=(tf.float32, tf.float32), feature_dims=(64, 10))

        # Train the model
        model.fit(
            dataset_train,
            epochs=epochs,
            validation_data=dataset_val,
            callbacks=[
                tf.keras.callbacks.ModelCheckpoint(model_path, save_best_only=True),
                tf.keras.callbacks.CSVLogger(os.path.join(results_folder, training_log_filename)),
                tf.keras.callbacks.EarlyStopping(patience=patience, restore_best_weights=True),
            ],
            workers=tf.data.AUTOTUNE,
            use_multiprocessing=True
        )

    # Evaluate the model on the test set:
    # create a dataset generator for each test subject
    test_files = [x for x in glob.glob(os.path.join(data_folder, "test_-_*")) if os.path.basename(x).split("_-_")[-1].split(".")[0] in features]
    # Get all different subjects from the test set
    subjects = list(set([os.path.basename(x).split("_-_")[1] for x in test_files]))
    datasets_test = {}
    # Create a generator for each subject
    for sub in subjects:
        files_test_sub = [f for f in test_files if sub in os.path.basename(f)]
        test_generator = DataGenerator(files_test_sub, window_length)
        datasets_test[sub] = create_tf_dataset(test_generator, window_length, None, hop_length, batch_size=64, data_types=(tf.float32, tf.float32), feature_dims=(64, 10))

    evaluation = evaluate_model(model, datasets_test)

    # We can save our results in a json encoded file
    results_path = os.path.join(results_folder, results_filename)
    with open(results_path, "w") as fp:
        json.dump(evaluation, fp)
    logging.info(f"Results saved at {results_path}")
--------------------------------------------------------------------------------
/task2_regression/models/__init__.py:
--------------------------------------------------------------------------------
"""Models for task2."""
--------------------------------------------------------------------------------
/task2_regression/models/linear.py:
--------------------------------------------------------------------------------
"""Linear backward (decoder) models for task 2."""
import tensorflow as tf

from task2_regression.models.vlaai import pearson_tf, pearson_tf_non_averaged


@tf.function
def pearson_loss_cut(y_true, y_pred, axis=1):
    """Pearson loss function.

    The true values are cut to the length of the predictions, since the
    'valid' convolution in the decoder shortens the output in time.

    Parameters
    ----------
    y_true: tf.Tensor
        True values. Shape is (batch_size, time_steps, n_features)
    y_pred: tf.Tensor
        Predicted values. Shape is (batch_size, time_steps, n_features)

    Returns
    -------
    tf.Tensor
        Pearson loss.
        Shape is (batch_size, 1)
    """
    return -pearson_tf(y_true[:, : tf.shape(y_pred)[1], :], y_pred, axis=axis)


@tf.function
def pearson_metric_cut(y_true, y_pred, axis=1):
    """Pearson metric function.

    Parameters
    ----------
    y_true: tf.Tensor
        True values. Shape is (batch_size, time_steps, n_features)
    y_pred: tf.Tensor
        Predicted values.
        Shape is (batch_size, time_steps, n_features)

    Returns
    -------
    tf.Tensor
        Pearson metric.
        Shape is (batch_size, 1)
    """
    return pearson_tf(y_true[:, : tf.shape(y_pred)[1], :], y_pred, axis=axis)


@tf.function
def pearson_metric_cut_non_averaged(y_true, y_pred, axis=1):
    """Pearson metric function, not averaged over the feature dimension.

    Parameters
    ----------
    y_true: tf.Tensor
        True values. Shape is (batch_size, time_steps, n_features)
    y_pred: tf.Tensor
        Predicted values. Shape is (batch_size, time_steps, n_features)

    Returns
    -------
    tf.Tensor
        Pearson metric.
        Shape is (batch_size, 1, n_features)
    """
    return pearson_tf_non_averaged(y_true[:, : tf.shape(y_pred)[1], :], y_pred, axis=axis)


def simple_linear_model(integration_window=32, nb_filters=1, nb_channels=64):
    """Simple linear decoder: a single Conv1D layer from EEG to stimulus."""
    inp = tf.keras.layers.Input(
        (
            None,
            nb_channels,
        )
    )
    out = tf.keras.layers.Conv1D(nb_filters, integration_window)(inp)
    model = tf.keras.models.Model(inputs=[inp], outputs=[out])
    model.compile(
        tf.keras.optimizers.Adam(),
        loss=pearson_loss_cut,
        metrics=[pearson_metric_cut]
    )
    return model


def simple_linear_model_stimulus(integration_window=32, nb_filters=1, nb_channels=64):
    """Variant of simple_linear_model, kept for experimentation."""
    inp = tf.keras.layers.Input(
        (
            None,
            nb_channels,
        )
    )
    # Scratch notes for complex stimulus reconstruction:
    #   env = abs(s), f0 = np.angle(s)
    #   reconstruct env and f0 separately, then
    #   s_hat = real(env_hat * exp(1j * f0_hat)) / np.max(abs(env_hat))
    out = tf.keras.layers.Conv1D(nb_filters, integration_window)(inp)
    model = tf.keras.models.Model(inputs=[inp], outputs=[out])
    model.compile(
        tf.keras.optimizers.Adam(),
        loss=pearson_loss_cut,
        metrics=[pearson_metric_cut]
    )
    return model
--------------------------------------------------------------------------------
/task2_regression/models/vlaai.py:
--------------------------------------------------------------------------------
"""Code to construct the VLAAI network.
Code was extracted from https://github.com/exporl/vlaai
"""
import tensorflow as tf


def extractor(
    filters=(256, 256, 256, 128, 128),
    kernels=(64,) * 5,
    dilation_rate=1,
    input_channels=64,
    normalization_fn=lambda x: tf.keras.layers.LayerNormalization()(x),
    activation_fn=lambda x: tf.keras.layers.LeakyReLU()(x),
    name="extractor",
):
    """Construct the extractor model.

    Parameters
    ----------
    filters: Sequence[int]
        Number of filters for each layer.
    kernels: Sequence[int]
        Kernel size for each layer.
    dilation_rate: int
        Dilation rate for the convolutional layers.
    input_channels: int
        Number of EEG channels in the input
    normalization_fn: Callable[[tf.Tensor], tf.Tensor]
        Function to normalize the contents of a tensor.
    activation_fn: Callable[[tf.Tensor], tf.Tensor]
        Function to apply an activation function to the contents of a tensor.
    name: str
        Name of the model.

    Returns
    -------
    tf.keras.models.Model
        The extractor model.
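
    Notes
    -----
    Each layer applies a 'valid' convolution followed by zero-padding at the
    end of the time axis, so (with the default dilation_rate of 1) the output
    keeps the same number of time steps as the input.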
    """
    eeg = tf.keras.layers.Input((None, input_channels))

    x = eeg

    if len(filters) != len(kernels):
        raise ValueError("'filters' and 'kernels' must have the same length")

    # Add the convolutional layers
    for filter_, kernel in zip(filters, kernels):
        # 'valid' padding for every layer; the time dimension is restored by
        # the ZeroPadding1D below
        x = tf.keras.layers.Conv1D(filter_, kernel, dilation_rate=dilation_rate, padding='valid')(x)
        x = normalization_fn(x)
        x = activation_fn(x)
        x = tf.keras.layers.ZeroPadding1D((0, kernel - 1))(x)

    return tf.keras.models.Model(inputs=[eeg], outputs=[x], name=name)


def output_context(
    filter_=64,
    kernel=64,
    input_channels=64,
    normalization_fn=lambda x: tf.keras.layers.LayerNormalization()(x),
    activation_fn=lambda x: tf.keras.layers.LeakyReLU()(x),
    name="output_context_model",
):
    """Construct the output context model.

    Parameters
    ----------
    filter_: int
        Number of filters for the convolutional layer.
    kernel: int
        Kernel size for the convolutional layer.
    input_channels: int
        Number of EEG channels in the input.
    normalization_fn: Callable[[tf.Tensor], tf.Tensor]
        Function to normalize the contents of a tensor.
    activation_fn: Callable[[tf.Tensor], tf.Tensor]
        Function to apply an activation function to the contents of a tensor.
    name: str
        Name of the model.

    Returns
    -------
    tf.keras.models.Model
        The output context model.
    """
    inp = tf.keras.layers.Input((None, input_channels))
    # Pad at the start of the time axis so the convolution is causal
    x = tf.keras.layers.ZeroPadding1D((kernel - 1, 0))(inp)
    x = tf.keras.layers.Conv1D(filter_, kernel)(x)
    x = normalization_fn(x)
    x = activation_fn(x)
    return tf.keras.models.Model(inputs=[inp], outputs=[x], name=name)


def vlaai(
    nb_blocks=4,
    extractor_model=None,
    output_context_model=None,
    use_skip=True,
    input_channels=64,
    output_dim=1,
    name="vlaai",
):
    """Construct the VLAAI model.

    Parameters
    ----------
    nb_blocks: int
        Number of repeated blocks to use.
    extractor_model: Callable[[tf.Tensor], tf.Tensor]
        The extractor model to use.
    output_context_model: Callable[[tf.Tensor], tf.Tensor]
        The output context model to use.
    use_skip: bool
        Whether to use skip connections.
    input_channels: int
        Number of EEG channels in the input.
    output_dim: int
        Number of output dimensions.
    name: str
        Name of the model.

    Returns
    -------
    tf.keras.models.Model
        The VLAAI model.
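
    Notes
    -----
    Each block refines the previous estimate: with use_skip=True the running
    estimate x is added back to the EEG input before being passed through the
    (shared) extractor and output context models again.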
    """
    if extractor_model is None:
        extractor_model = extractor()
    if output_context_model is None:
        output_context_model = output_context()

    eeg = tf.keras.layers.Input((None, input_channels))

    # If using skip connections: start with x set to zero
    if use_skip:
        x = tf.zeros_like(eeg)
    else:
        x = eeg

    # Iterate over the blocks
    for i in range(nb_blocks):
        if use_skip:
            x = extractor_model(eeg + x)
        else:
            x = extractor_model(x)
        x = tf.keras.layers.Dense(input_channels)(x)
        x = output_context_model(x)

    x = tf.keras.layers.Dense(output_dim)(x)

    return tf.keras.models.Model(inputs=[eeg], outputs=[x], name=name)


def pearson_tf(y_true, y_pred, axis=1):
    """Pearson correlation function implemented in tensorflow.

    Parameters
    ----------
    y_true: tf.Tensor
        Ground truth labels. Shape is (batch_size, time_steps, n_features)
    y_pred: tf.Tensor
        Predicted labels. Shape is (batch_size, time_steps, n_features)
    axis: int
        Axis along which to compute the pearson correlation. Default is 1.

    Returns
    -------
    tf.Tensor
        Pearson correlation, averaged over the feature dimension.
        Shape is (batch_size, 1) if axis is 1.
    """
    # Compute the mean of the true and predicted values
    y_true_mean = tf.reduce_mean(y_true, axis=axis, keepdims=True)
    y_pred_mean = tf.reduce_mean(y_pred, axis=axis, keepdims=True)

    # Compute the numerator and denominator of the pearson correlation
    numerator = tf.reduce_sum(
        (y_true - y_true_mean) * (y_pred - y_pred_mean),
        axis=axis,
        keepdims=True,
    )
    std_true = tf.reduce_sum(tf.square(y_true - y_true_mean), axis=axis, keepdims=True)
    std_pred = tf.reduce_sum(tf.square(y_pred - y_pred_mean), axis=axis, keepdims=True)
    denominator = tf.sqrt(std_true * std_pred)

    # Compute the pearson correlation, averaged over the last (feature) axis
    return tf.reduce_mean(tf.math.divide_no_nan(numerator, denominator), axis=-1)


def pearson_tf_non_averaged(y_true, y_pred, axis=1):
    """Pearson correlation function implemented in tensorflow.

    Parameters
    ----------
    y_true: tf.Tensor
        Ground truth labels. Shape is (batch_size, time_steps, n_features)
    y_pred: tf.Tensor
        Predicted labels. Shape is (batch_size, time_steps, n_features)
    axis: int
        Axis along which to compute the pearson correlation. Default is 1.

    Returns
    -------
    tf.Tensor
        Pearson correlation per feature.
        Shape is (batch_size, 1, n_features) if axis is 1.
    """
    # Compute the mean of the true and predicted values
    y_true_mean = tf.reduce_mean(y_true, axis=axis, keepdims=True)
    y_pred_mean = tf.reduce_mean(y_pred, axis=axis, keepdims=True)

    # Compute the numerator and denominator of the pearson correlation
    numerator = tf.reduce_sum(
        (y_true - y_true_mean) * (y_pred - y_pred_mean),
        axis=axis,
        keepdims=True,
    )
    std_true = tf.reduce_sum(tf.square(y_true - y_true_mean), axis=axis, keepdims=True)
    std_pred = tf.reduce_sum(tf.square(y_pred - y_pred_mean), axis=axis, keepdims=True)
    denominator = tf.sqrt(std_true * std_pred)

    # Compute the pearson correlation per feature
    return tf.math.divide_no_nan(numerator, denominator)


@tf.function
def pearson_loss(y_true, y_pred, axis=1):
    """Pearson loss function.
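
    Defined as the negative of pearson_tf, so minimizing this loss maximizes
    the Pearson correlation between predictions and targets.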

    Parameters
    ----------
    y_true: tf.Tensor
        True values. Shape is (batch_size, time_steps, n_features)
    y_pred: tf.Tensor
        Predicted values. Shape is (batch_size, time_steps, n_features)

    Returns
    -------
    tf.Tensor
        Pearson loss.
        Shape is (batch_size, 1)
    """
    return -pearson_tf(y_true, y_pred, axis=axis)


@tf.function
def pearson_metric(y_true, y_pred, axis=1):
    """Pearson metric function.

    Parameters
    ----------
    y_true: tf.Tensor
        True values. Shape is (batch_size, time_steps, n_features)
    y_pred: tf.Tensor
        Predicted values. Shape is (batch_size, time_steps, n_features)

    Returns
    -------
    tf.Tensor
        Pearson metric.
        Shape is (batch_size, 1)
    """
    return pearson_tf(y_true, y_pred, axis=axis)
--------------------------------------------------------------------------------
/util/config.json:
--------------------------------------------------------------------------------
{
    "dataset_folder": "PATH/TO/sparrKULee",
    "derivatives_folder": "derivatives",
    "preprocessed_eeg_folder": "preprocessed_eeg",
    "preprocessed_stimuli_folder": "preprocessed_stimuli",
    "split_folder": "split_data",
    "test_folder": "test_set"
}
--------------------------------------------------------------------------------
/util/dataset_generator.py:
--------------------------------------------------------------------------------
"""Code for the dataset_generator for both tasks."""
import itertools
import os

import numpy as np
import tensorflow as tf


@tf.function
def batch_equalizer_fn(*args):
    """Batch equalizer.

    Prepares the inputs for a model to be trained on the match-mismatch task.
    It makes sure that the matched and mismatched segments are presented
    equally often at each stimulus input position.

    Parameters
    ----------
    args : Sequence[tf.Tensor]
        List of tensors representing feature data

    Returns
    -------
    Tuple[Tuple[tf.Tensor], tf.Tensor]
        Tuple of the EEG/speech features serving as the input to the model and
        the labels for the match/mismatch task

    Notes
    -----
    This function will also multiply the batch size by the number of stimulus
    candidates. E.g. if the batch size of the elements in each of the args was
    32 and there are two candidate stimuli, the output features will have
    a batch size of 64.
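
    For example, with two stimulus candidates and one-hot labels over the
    candidate positions, a batch contributes equally many rows labeled [1, 0]
    (match at the first position) and rows labeled [0, 1] (match second).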
    """
    eeg = args[0]
    num_stimuli = len(args) - 1
    # Repeat the EEG once per stimulus candidate
    new_eeg = tf.concat([eeg] * num_stimuli, axis=0)
    all_features = [new_eeg]

    # Roll the stimulus candidates so each candidate appears at every input
    # position equally often
    args_to_zip = [args[i::num_stimuli] for i in range(1, num_stimuli + 1)]
    for stimuli_features in zip(*args_to_zip):
        for i in range(num_stimuli):
            stimulus_rolled = tf.roll(stimuli_features, shift=i, axis=0)
            # Reshape stimulus_rolled to merge the first two dimensions
            stimulus_rolled = tf.reshape(stimulus_rolled, [tf.shape(stimulus_rolled)[0] * tf.shape(stimulus_rolled)[1], stimuli_features[0].shape[-2], stimuli_features[0].shape[-1]])

            all_features.append(stimulus_rolled)
    labels = tf.concat(
        [
            tf.tile(tf.constant([[1 if ii == i else 0 for ii in range(num_stimuli)]]), [tf.shape(eeg)[0], 1]) for i in range(num_stimuli)
        ], axis=0
    )

    return tuple(all_features), labels


def shuffle_fn(args, number_mismatch):
    """Append number_mismatch shuffled copies of the last feature as imposters."""
    args = list(args)
    for _ in range(number_mismatch):
        args.append(tf.random.shuffle(args[-1]))
    return tuple(args)


def create_tf_dataset(
    data_generator,
    window_length,
    batch_equalizer_fn=None,
    hop_length=64,
    batch_size=64,
    data_types=(tf.float32, tf.float32),
    feature_dims=(64, 1),
    number_mismatch=None  # None for regression, 2 or 4 for match-mismatch
):
    """Creates a tf.data.Dataset.

    This will be used to create a dataset generator that will
    pass windowed data to a model in both tasks.

    Parameters
    ----------
    data_generator: DataGenerator
        A data generator.
    window_length: int
        Length of the decision window in samples.
    batch_equalizer_fn: Callable
        Function that will be applied on the data after batching (using
        the `map` method from tf.data.Dataset). In the match/mismatch task,
        this function creates the imposter segments and labels.
    hop_length: int
        Hop length between two consecutive decision windows.
    batch_size: Optional[int]
        If not None, specifies the batch size. In the match/mismatch task,
        this amount will be multiplied by the number of stimulus candidates
        by batch_equalizer_fn.
    data_types: Union[Sequence[tf.dtype], tf.dtype]
        The data types that the individual features of data_generator should
        be cast to. If you only specify a single datatype, it will be chosen
        for all EEG/speech features.
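    feature_dims: Sequence[int]
        Number of channels/features of each feature yielded by data_generator
        (e.g. 64 for EEG, 1 for the envelope, 10 for the mel spectrogram);
        used to build the output signature of the dataset.
    number_mismatch: Optional[int]
        None for the regression task; for match/mismatch, the number of
        mismatched (imposter) segments created by shuffling the stimulus.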

    Returns
    -------
    tf.data.Dataset
        A Dataset object that generates data to train/evaluate models
        efficiently
    """
    # Create tf dataset from generator
    dataset = tf.data.Dataset.from_generator(
        data_generator,
        output_signature=tuple(
            tf.TensorSpec(shape=(None, x), dtype=data_types[index])
            for index, x in enumerate(feature_dims)
        ),
    )
    # Window dataset
    dataset = dataset.map(
        lambda *args: [
            tf.signal.frame(arg, window_length, hop_length, axis=0)
            for arg in args
        ],
        num_parallel_calls=tf.data.AUTOTUNE
    )

    if number_mismatch is not None:
        # For match/mismatch: append shuffled copies of the stimulus as
        # mismatched (imposter) segments
        dataset = dataset.map(
            lambda *args: shuffle_fn(args, number_mismatch),
            num_parallel_calls=tf.data.AUTOTUNE
        )

    # Batch data
    dataset = dataset.interleave(
        lambda *args: tf.data.Dataset.from_tensor_slices(args),
        cycle_length=8,
        block_length=1,
        num_parallel_calls=tf.data.AUTOTUNE,
    )
    if batch_size is not None:
        dataset = dataset.batch(batch_size, drop_remainder=True)

    if batch_equalizer_fn is not None:
        # Create the labels and make sure classes are balanced
        dataset = dataset.map(batch_equalizer_fn,
                              num_parallel_calls=tf.data.AUTOTUNE)

    return dataset


class DataGenerator:
    """Generate data for the regression and match/mismatch tasks."""

    def __init__(
        self,
        files,
        window_length,
    ):
        """Initialize the DataGenerator.

        Parameters
        ----------
        files: Sequence[Union[str, pathlib.Path]]
            Files to load.
        window_length: int
            Length of the decision window.
        """
        self.window_length = window_length
        self.files = self.group_recordings(files)

    def group_recordings(self, files):
        """Group recordings and corresponding stimuli.

        Parameters
        ----------
        files : Sequence[Union[str, pathlib.Path]]
            List of filepaths to preprocessed and split EEG and speech features

        Returns
        -------
        list
            Files grouped per recording (split, subject and stimulus), with
            the EEG file sorted first within each group.
        """
        new_files = []
        grouped = itertools.groupby(sorted(files), lambda x: "_-_".join(os.path.basename(x).split("_-_")[:3]))
        for recording_name, feature_paths in grouped:
            # Sort the EEG file first, followed by the stimulus features
            new_files += [sorted(feature_paths, key=lambda x: "0" if os.path.basename(x).split("_-_")[-1].split(".")[0] == "eeg" else x)]
        return new_files

    def __len__(self):
        return len(self.files)

    def __getitem__(self, recording_index):
        """Get data for a certain recording.
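
        The features are returned as a tuple with the EEG first, followed by
        the stimulus feature(s), each shaped (n_samples, n_features).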

        Parameters
        ----------
        recording_index: int
            Index of the recording in this dataset

        Returns
        -------
        Union[Tuple[tf.Tensor,...], Tuple[np.ndarray,...]]
            The features corresponding to the recording_index recording
        """
        data = []
        for feature in self.files[recording_index]:
            f = np.load(feature).astype(np.float32)
            if f.ndim == 1:
                f = f[:, None]
            data += [f]
        data = self.prepare_data(data)
        return tuple(tf.constant(x) for x in data)

    def __call__(self):
        """Load data for the next recording.

        Yields
        -------
        Union[Tuple[tf.Tensor,...], Tuple[np.ndarray,...]]
            The features corresponding to the recording_index recording
        """
        for idx in range(self.__len__()):
            yield self.__getitem__(idx)

            if idx == self.__len__() - 1:
                self.on_epoch_end()

    def on_epoch_end(self):
        """Change state at the end of an epoch."""
        np.random.shuffle(self.files)

    def prepare_data(self, data):
        # Hook to adjust the data before it is returned; the features already
        # have dimensionality (n_samples, n_features) at this point
        return data
--------------------------------------------------------------------------------
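
As a quick orientation (not part of the repository), the pieces above compose
as follows; a minimal sketch, assuming `data_folder` points at the split data
produced by split_and_normalize.py and that the envelope feature is used:

    import glob
    import os

    import tensorflow as tf

    from util.dataset_generator import DataGenerator, batch_equalizer_fn, create_tf_dataset

    data_folder = "PATH/TO/sparrKULee/derivatives/split_data"  # assumed location
    window_length = 5 * 64  # 5 s decision windows at 64 Hz

    # Select the train EEG/envelope files; DataGenerator groups them per recording
    train_files = [
        x for x in glob.glob(os.path.join(data_folder, "train_-_*"))
        if os.path.basename(x).split("_-_")[-1].split(".")[0] in ("eeg", "envelope")
    ]
    generator = DataGenerator(train_files, window_length)

    # Windowed, batched match/mismatch dataset with 4 imposter segments
    dataset = create_tf_dataset(
        generator, window_length, batch_equalizer_fn,
        hop_length=64, batch_size=64, number_mismatch=4,
        data_types=(tf.float32, tf.float32), feature_dims=(64, 1),
    )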