├── ecog2txt ├── auxiliary │ ├── vocab.demo2-reduced.125 │ ├── EFC │ │ ├── demo2_word_sequence.yaml │ │ ├── mocha-1_word_sequence.yaml │ │ ├── mochastar_word_sequence.yaml │ │ └── block_breakdowns.json │ └── vocab.mocha-timit.1806 ├── __init__.py ├── data_generators.py ├── subjects.py └── trainers.py ├── setup.py └── README.md /ecog2txt/auxiliary/vocab.demo2-reduced.125: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | guests_ 5 | used_ 6 | coming_ 7 | falling_ 8 | because_ 9 | eaten_ 10 | are_ 11 | by_ 12 | chaos_ 13 | i_ 14 | men_ 15 | bill_ 16 | high_ 17 | boy_ 18 | stool_ 19 | bird_ 20 | presents_ 21 | branch_ 22 | jar_ 23 | arrived_ 24 | cat_ 25 | doesnt_ 26 | is_ 27 | there_ 28 | broom_ 29 | how_ 30 | think_ 31 | sink_ 32 | old_ 33 | commotion_ 34 | under_ 35 | woman_ 36 | lit_ 37 | will_ 38 | man_ 39 | could_ 40 | their_ 41 | sister_ 42 | turning_ 43 | children_ 44 | only_ 45 | at_ 46 | hiding_ 47 | kitchen_ 48 | not_ 49 | down_ 50 | kids_ 51 | ate_ 52 | did_ 53 | steal_ 54 | a_ 55 | overflowing_ 56 | candles_ 57 | for_ 58 | of_ 59 | the_ 60 | if_ 61 | little_ 62 | and_ 63 | large_ 64 | dog_ 65 | tree_ 66 | window_ 67 | jumping_ 68 | water_ 69 | attention_ 70 | cake_ 71 | watching_ 72 | to_ 73 | pet_ 74 | table_ 75 | mother_ 76 | holding_ 77 | over_ 78 | in_ 79 | several_ 80 | years_ 81 | giggling_ 82 | seem_ 83 | stuck_ 84 | cookie_ 85 | her_ 86 | pay_ 87 | bushes_ 88 | him_ 89 | ladder_ 90 | barking_ 91 | tricycle_ 92 | angry_ 93 | with_ 94 | interested_ 95 | room_ 96 | riding_ 97 | off_ 98 | tipping_ 99 | which_ 100 | worried_ 101 | was_ 102 | get_ 103 | adults_ 104 | outside_ 105 | considers_ 106 | crying_ 107 | on_ 108 | sofa_ 109 | rescue_ 110 | from_ 111 | partially_ 112 | child_ 113 | firemen_ 114 | come_ 115 | reaching_ 116 | does_ 117 | be_ 118 | want_ 119 | while_ 120 | helping_ 121 | four_ 122 | girl_ 123 | grabs_ 124 | part_ 125 | his_ 126 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | import sys 3 | 4 | with open("README.md", "r") as fh: 5 | long_description = fh.read() 6 | 7 | 8 | python_subversion = int(sys.version.split('.')[1]) 9 | if python_subversion > 8: 10 | install_requires = [ 11 | 'numpy', 12 | 'scipy', 'matplotlib', 'pandas', 'seaborn', 'tikzplotlib', 13 | 'python_speech_features', 'pyyaml', 14 | 'hickle==5.0.2', 'protobuf==3.20.3', 15 | 'tensorflow', 'tensorflow-probability', 16 | ] 17 | else: 18 | install_requires = [ 19 | 'numpy==1.22.4', 20 | 'scipy', 'matplotlib', 'pandas', 'seaborn', 'tikzplotlib', 21 | 'python_speech_features', 'pyyaml', 22 | # 'dopamine-rl==2.0.5', 'jax==0.3.13', 'jaxlib==0.3.10', 'flax==0.4.2', 23 | # tensorflow won't like this; ignore it 24 | 'hickle==5.0.2', 'protobuf==3.20.3', 25 | 'tensor2tensor==1.15.7', 'tensorflow-probability==0.7', 26 | # 'samplerate', 27 | # 'tensorflow-gpu==1.15.3' # the cpu version will also work 28 | ] 29 | 30 | setuptools.setup( 31 | name="ecog2txt", 32 | version="0.7.0", 33 | author="J.G. 
Makin", 34 | author_email="jgmakin@gmail.com", 35 | description="Code for decoding speech as text from neural data", 36 | long_description=long_description, 37 | long_description_content_type="text/markdown", 38 | url="https://github.com/jgmakin/ecog2txt", 39 | packages=setuptools.find_packages(), 40 | package_data={ 41 | 'ecog2txt': [ 42 | 'auxiliary/block_breakdowns.json', 43 | 'auxiliary/example_experiment_manifest.yaml', 44 | 'auxiliary/vocab.mocha-timit.1806', 45 | ] 46 | }, 47 | classifiers=[ 48 | "Development Status :: 3 - Alpha", 49 | "Intended Audience :: Science/Research", 50 | "Topic :: Scientific/Engineering", 51 | "Programming Language :: Python :: 3", 52 | # "License :: OSI Approved :: MIT License", 53 | "Operating System :: OS Independent", 54 | ], 55 | install_requires=install_requires 56 | ) 57 | -------------------------------------------------------------------------------- /ecog2txt/__init__.py: -------------------------------------------------------------------------------- 1 | # standard libraries 2 | import os 3 | 4 | # third-party packages 5 | import pandas as pd 6 | 7 | # from tensor2tensor.data_generators import text_encoder 8 | 9 | # paths 10 | text_dir = os.path.join(os.path.dirname(__file__), 'auxiliary') 11 | 12 | # other useful variables 13 | EOS_token = '' # text_encoder.EOS 14 | pad_token = '' # text_encoder.PAD 15 | OOV_token = '' 16 | 17 | # useful sets 18 | TOKEN_TYPES = { 19 | 'phoneme', 'word', 'trial', 'word_sequence', 'word_piece_sequence', 20 | 'phoneme_sequence' 21 | } 22 | DATA_PARTITIONS = {'training', 'validation', 'testing'} 23 | 24 | # useful linguistic things 25 | consonant_dict = { 26 | 'phoneme': [ 27 | 'p', 'b', 't', 'd', 'k', 'g', 28 | 'f', 'v', '\u03B8', '\u00F0', 's', 'z', '\u0283', '\u0292', 'h', 29 | 't\u0283', 'd\u0292', 30 | 'm', 'n', '\u014b', 31 | 'l', 'r', # '\u0279', 32 | 'w', 'j', 33 | ], 34 | 'voicing': [ 35 | 'voiceless', 'voiced', 'voiceless', 'voiced', 'voiceless', 'voiced', 36 | 'voiceless', 'voiced', 'voiceless', 'voiced', 'voiceless', 37 | 'voiced', 'voiceless', 'voiced', 'voiceless', 38 | 'voiceless', 'voiced', 39 | 'voiced', 'voiced', 'voiced', 40 | 'voiced', 'voiced', 41 | 'voiced', 'voiced', 42 | ], 43 | 'place': [ 44 | 'bilabial', 'bilabial', 'alveolar', 'alveolar', 'velar', 'velar', 45 | 'labiodental', 'labiodental', 'dental', 'dental', 'alveolar', 46 | 'alveolar', 'palatal', 'palatal', 'glotal', 47 | 'palatal', 'palatal', 48 | 'bilabial', 'alveolar', 'velar', 49 | 'alveolar', 'palatal', 50 | 'labio-velar', 'palatal' 51 | ], 52 | 'manner': [ 53 | 'stop', 'stop', 'stop', 'stop', 'stop', 'stop', 54 | 'fricative', 'fricative', 'fricative', 'fricative', 'fricative', 55 | 'fricative', 'fricative', 'fricative', 'fricative', 56 | 'affricate', 'affricate', 57 | 'nasal', 'nasal', 'nasal', 58 | 'liquid', 'liquid', 59 | 'approximant', 'approximant', 60 | ], 61 | 'ARPABET': [ 62 | 'p', 'b', 't', 'd', 'k', 'g', 63 | 'f', 'v', 'th', 'dh', 's', 'z', 'sh', 'zh', 'hh', 64 | 'ch', 'jh', 65 | 'm', 'n', 'ng', 66 | 'l', 'r', 67 | 'w', 'y', 68 | ] 69 | } 70 | consonant_df = pd.DataFrame(consonant_dict) 71 | 72 | # "Acoustic Characteristics of American English Vowels" 73 | # Hillenbrand et al 74 | # J. Acoustic Soc. Am., 97(5), Pt. 
1 75 | # 1995 76 | vowel_dict = { 77 | 'phoneme': ['i', '\u026A', 'e', '\u025B', '\u00E6', '\u0251', '\u0252', '\u0254', 'o', '\u028A', 'u', '\u028C'], 78 | 'F1': [342, 427, 476, 580, 588, 768, 768, 652, 497, 469, 378, 623], 79 | 'F2': [2322, 2034, 2089, 1799, 1952, 1333, 1333, 997, 910, 1122, 997, 1200], 80 | } 81 | # '\u0259'? 'a'? 82 | vowel_df = pd.DataFrame(vowel_dict) 83 | -------------------------------------------------------------------------------- /ecog2txt/auxiliary/EFC/demo2_word_sequence.yaml: -------------------------------------------------------------------------------- 1 | 402: 2 | DataGenerator: &id001 !!python/name:ecog2txt.data_generators.ECoGDataGenerator '' 3 | EMA_decay: 0.99 4 | FF_dropout: 0.1 5 | N_epochs: 800 6 | REFERENCE_BIPOLAR: true 7 | RGB_color: !!python/tuple 8 | - 0.4588235294117647 9 | - 0.4392156862745098 10 | - 0.7019607843137254 11 | RNN_dropout: 0.5 12 | TEMPORALLY_CONVOLVE: true 13 | USE_FIELD_POTENTIALS: false 14 | USE_LOG_MELS: false 15 | USE_MFCC_DELTAS: false 16 | alias: participant c 17 | anatomy_grand_list: 18 | - middle frontal 19 | - inferior temporal 20 | - middle temporal 21 | - superior temporal 22 | - IFG 23 | - supramarginal 24 | - vSMC 25 | assessment_epoch_interval: 10 26 | audio_sequence_penalty_scale: 0.1 27 | azimuth: 180 28 | bad_electrodes_path: /home/makin/data/ecog/EFC402/bad_electrodes 29 | beam_width: 1 30 | block_descriptors: 31 | - type 32 | block_types: 33 | testing: !!set 34 | picture_description_birthday_reduced: null 35 | picture_description_kitchen_reduced: null 36 | picture_description_tree_reduced: null 37 | training: !!set 38 | picture_description_birthday_reduced: null 39 | picture_description_birthday_reduced_mimed: null 40 | picture_description_kitchen_reduced: null 41 | picture_description_kitchen_reduced_mimed: null 42 | picture_description_kitchen_reduced_tiny: null 43 | picture_description_reduced: null 44 | picture_description_tree_reduced: null 45 | picture_description_tree_reduced_mimed: null 46 | validation: !!set 47 | picture_description_birthday_reduced: null 48 | picture_description_kitchen_reduced: null 49 | picture_description_tree_reduced: null 50 | data_mapping: 51 | decoder_targets: text_sequence 52 | encoder_1_targets: phoneme_sequence 53 | encoder_inputs: ecog_sequence 54 | decimation_factor: null 55 | electrode_path: /home/makin/data/ecog/EFC402/TDT_elecs_all.mat 56 | elevation: 10 57 | encoder_1_targets_penalty_scale: 1.0 58 | grid_names: 59 | - InferiorGrid 60 | grid_size: 61 | - 8 62 | - 16 63 | grid_step: 1 64 | json_dir: /home/makin/code/ecog2txt/ecog2txt/auxiliary/EFC 65 | layer_sizes: 66 | decoder_embedding: 67 | - 150 68 | decoder_projection: [] 69 | decoder_rnn: 70 | - 800 71 | encoder_1_projection: 72 | - 225 73 | encoder_embedding: 74 | - 100 75 | encoder_rnn: 76 | - 400 77 | - 400 78 | - 400 79 | mfcc_winlen: 0.02 80 | model_class: null 81 | num_cepstral_coeffs: 0 82 | num_mel_features: 26 83 | num_unique_training_sentences: 30 84 | phoneme_sequence_vocab_file: vocab.phonemes.42 85 | png_partial_path: /home/makin/figures/word_sequence/EFC402/{0}.png 86 | project: EFC 87 | sampling_rate: 190.73486328125 88 | sampling_rate_decimated: 16.5 89 | saved_results_dir: /home/makin/data/ecog2txt/word_sequence/saved_results 90 | subject_name: EFC402 91 | temperature: 0.384 92 | text_sequence_vocab_file: vocab.demo2-reduced.125 93 | tf_record_partial_path: /home/makin/data/ecog2txt/word_sequence/tf_records/EFC402_B{0}.tfrecord 94 | tf_summaries_dir: 
/home/makin/data/ecog2txt/word_sequence/tf_summaries 95 | tikz_partial_path: /home/makin/Documents/#texs/tikzpics/word_sequence/EFC402/{0}.tex 96 | token_type: word_sequence 97 | 403: 98 | DataGenerator: *id001 99 | EMA_decay: 0.99 100 | FF_dropout: 0.1 101 | N_epochs: 800 102 | REFERENCE_BIPOLAR: true 103 | RGB_color: !!python/tuple 104 | - 0.6509803921568628 105 | - 0.4627450980392157 106 | - 0.11372549019607843 107 | RNN_dropout: 0.5 108 | TEMPORALLY_CONVOLVE: true 109 | USE_FIELD_POTENTIALS: false 110 | USE_LOG_MELS: false 111 | USE_MFCC_DELTAS: false 112 | alias: participant d 113 | anatomy_grand_list: 114 | - middle frontal 115 | - inferior temporal 116 | - middle temporal 117 | - superior temporal 118 | - IFG 119 | - supramarginal 120 | - vSMC 121 | assessment_epoch_interval: 10 122 | audio_sequence_penalty_scale: 0.1 123 | azimuth: 170 124 | bad_electrodes_path: /home/makin/data/ecog/EFC403/bad_electrodes 125 | beam_width: 1 126 | block_descriptors: 127 | - type 128 | block_types: 129 | testing: !!set 130 | picture_description_birthday_reduced: null 131 | picture_description_kitchen_reduced: null 132 | picture_description_tree_reduced: null 133 | training: !!set 134 | picture_description_birthday_reduced: null 135 | picture_description_birthday_reduced_mimed: null 136 | picture_description_kitchen_reduced: null 137 | picture_description_kitchen_reduced_mimed: null 138 | picture_description_kitchen_reduced_tiny: null 139 | picture_description_reduced: null 140 | picture_description_tree_reduced: null 141 | picture_description_tree_reduced_mimed: null 142 | validation: !!set 143 | picture_description_birthday_reduced: null 144 | picture_description_kitchen_reduced: null 145 | picture_description_tree_reduced: null 146 | data_mapping: 147 | decoder_targets: text_sequence 148 | encoder_1_targets: phoneme_sequence 149 | encoder_inputs: ecog_sequence 150 | decimation_factor: null 151 | electrode_path: /home/makin/data/ecog/EFC403/TDT_elecs_all.mat 152 | elevation: 0 153 | encoder_1_targets_penalty_scale: 1.0 154 | grid_names: 155 | - Grid 156 | grid_size: 157 | - 16 158 | - 16 159 | grid_step: 1 160 | json_dir: /home/makin/code/ecog2txt/ecog2txt/auxiliary/EFC 161 | layer_sizes: 162 | decoder_embedding: 163 | - 150 164 | decoder_projection: [] 165 | decoder_rnn: 166 | - 800 167 | encoder_1_projection: 168 | - 225 169 | encoder_embedding: 170 | - 100 171 | encoder_rnn: 172 | - 400 173 | - 400 174 | - 400 175 | mfcc_winlen: 0.02 176 | model_class: null 177 | num_cepstral_coeffs: 0 178 | num_mel_features: 26 179 | num_unique_training_sentences: 30 180 | phoneme_sequence_vocab_file: vocab.phonemes.42 181 | png_partial_path: /home/makin/figures/word_sequence/EFC403/{0}.png 182 | project: EFC 183 | sampling_rate: 190.73486328125 184 | sampling_rate_decimated: 16.5 185 | saved_results_dir: /home/makin/data/ecog2txt/word_sequence/saved_results 186 | subject_name: EFC403 187 | temperature: 0.384 188 | text_sequence_vocab_file: vocab.demo2-reduced.125 189 | tf_record_partial_path: /home/makin/data/ecog2txt/word_sequence/tf_records/EFC403_B{0}.tfrecord 190 | tf_summaries_dir: /home/makin/data/ecog2txt/word_sequence/tf_summaries 191 | tikz_partial_path: /home/makin/Documents/#texs/tikzpics/word_sequence/EFC403/{0}.tex 192 | token_type: word_sequence 193 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ecog2txt 2 | Code for decoding speech as text from neural data 3 | 4 
| This package contains Python code for the high-level aspects of decoding speech from neural data, including transfer learning across multiple subjects. It was used for all results in the paper "Machine translation of cortical activity to text with an encoder-decoder framework" (Makin et al., _Nature Neuroscience_, 2020). These high-level aspects include the structuring of the training, the organization by subjects, and the construction of [`TFRecord`](https://www.tensorflow.org/tutorials/load_data/tfrecord)s. The (low-level) training itself is done with the adjacent [`machine_learning` package](https://github.com/jgmakin/machine_learning), which implements sequence-to-sequence networks in [TensorFlow](https://www.tensorflow.org). 5 | 6 | ## Installation 7 | 1. Install [TensorFlow 1.15.5](https://www.tensorflow.org), the final version of TF1.x. 8 | ``` 9 | pip install tensorflow-gpu==1.15.5 10 | ``` 11 | If you don't have a GPU you should install the CPU version 12 | ``` 13 | pip install tensorflow==1.15.5 14 | ``` 15 | Please consult the Tensorflow installation documents. The most important facts to know are that TF1.15 requires CUDA 10.0, `libcudnn7>=7.6.5.32-1+cuda10.0`, and `libnccl2>=2.6.4-1+cuda10.0`. (I have only tested with up to, not beyond, the listed versions of these libraries). Make sure the driver for your GPU is compatible with these versions of the cudNN and NCCL libraries. And the latest version of Python supported by TF1.15 is 3.7. 16 | 17 | 2. Install the three required packages: 18 | ``` 19 | git clone https://github.com/jgmakin/utils_jgm.git 20 | pip install -e utils_jgm 21 | 22 | git clone https://github.com/jgmakin/machine_learning.git 23 | pip install -e machine_learning 24 | 25 | git clone https://github.com/jgmakin/ecog2txt.git 26 | pip install -e ecog2txt 27 | 28 | ``` 29 | Note that `utils_jgm` requires the user to set up a configuration file; please see the [README for that package](https://github.com/jgmakin/utils_jgm). 30 | 31 | 32 | ## Getting started 33 | In order to unify the vast set of parameters (paths, experimental block structure, neural-network hyperparameters, etc.), all experiments are organized with the help of two configuration files, `block_breakdowns.json`, and `YOUR_EXPERIMENT_manifest.yaml`, examples of each are [included in this repository](https://github.com/jgmakin/ecog2txt/tree/master/ecog2txt/auxiliary/EFC). 34 | 35 | 1. Edit the `block_breakdowns.json` to match your use case. The entries are 36 | 37 | ```SUBJECT_ID: {BLOCK: {"type: BLOCK_TYPE, "default_dataset": DEFAULT_DATASET_VALUE}}``` 38 | 39 | where the `DEFAULT_DATASET_VALUE` is one of `"training"`/`"validation"`/`"testing"`; and the `BLOCK_TYPE` is whatever descriptive title you want to give to your block (e.g., `"mocha-3"`). Assigning types to the blocks allows them to be filtered out of datasets, according to information provided in the `manifest` (see next item). 40 | Place your edited copy into a directory we will call `json_dir`. 41 | 42 | 2. Edit [one of the `.yaml` manifest files](https://github.com/jgmakin/ecog2txt/tree/master/ecog2txt/auxiliary/EFC) to something sensible for your case. The *most important thing to know* is that many of the classes in this package (and `machine_learning`) load their default attributes from this `manifest`. That means that, even though the keyword arguments for their constructors (`__init__()` methods) may appear to default to `None`, this `None` actually instructs the class to default to the argument's value in the `manifest`. 
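For intuition, here is a minimal sketch of that fallback behavior. It is an illustration only, not the actual `ecog2txt` mechanism (which uses the `auto_attribute(CHECK_MANIFEST=True)` decorator from `utils_jgm`); the class name and manifest keys below are made up for the example.
```
# Hypothetical illustration only -- not the real ecog2txt implementation.
# Keyword arguments left at None fall back to the entry of the same name in
# the manifest (the per-subject dictionary loaded from the .yaml file).
class ExampleManifestUser:
    def __init__(self, manifest, beam_width=None, temperature=None):
        # None means "use whatever the manifest says for this key"
        self.beam_width = manifest['beam_width'] if beam_width is None else beam_width
        self.temperature = manifest['temperature'] if temperature is None else temperature

# Omitting a keyword argument (or passing None) picks up the value from the
# .yaml manifest; passing an explicit value overrides the manifest.
```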
43 | 44 | You don't have to set all the values before your first run, but in the very least, you should: 45 | * Fix the paths/dirs. For the most part they are for writing, not reading, so you can set them wherever you like. For the three reading paths: 46 | * `json_dir` must point to the location of your `block_breakdowns.json` file (see previous item). 47 | * `bad_electrodes_path` must point to a (possibly empty) plain-text file listing (one entry per line) any bad channels. NB that these are assumed to be 1-indexed! (but will internally be converted to zero-indexing). Alternatively, you can provide (either via the manifest or as an argument to the `ECoGDataGenerator`) the `good_electrodes` directly. 48 | * `electrode_path`: you can ignore this unless you plan to plot results on the cortical surface (in which case contact me). 49 | * `block_types`: these set *necessary* conditions for membership in one of the datasets, `training`/`validation`/`testing`. For example, in the `mochastar_word_sequence.yaml` manifest file, the `testing` and `validation` sets are allowed to include only `mocha-1`, but the training set is allowed to include `mocha-1, ..., mocha-9`. So if a `mocha-3` block has `validation` as its `"default_dataset"` in the `block_breakdowns.json`, it would be excluded altogether. 50 | * `grid_size`: Set this to match the dimensions of your ECoG grid. 51 | * `text_sequence_vocab_file`: You can provide a file with a list, one word per line, of all words to be targeted by the decoder. This key specifies just the *name* of the file; the file itself must live in the `text_dir` specified in `__init__.py`. If you set this key to `None`, the package will attempt to build a list of unique targets directly from the `TFRecord`s. An example vocab_file, `vocab.mocha-timit.1806`, is included in this package. 52 | * `data_mapping`: Use this to set which data to use as inputs and outputs for the sequence-to-sequence network--see `_ecog_token_generator` below. 53 | * `DataGenerator`: In the `manifest`, this points to the `ECoGDataGenerator` in `data_generators.py`, but you will probably want to subclass this class and point to your new (sub)class instead--see next item. 54 | 55 | You can probably get away with leaving the rest of the values in the `.yaml` at their default values, at least for your first run. 56 | 57 | Finally, make sure `YOUR_EXPERIMENT_manifest.yaml` lives at the `text_dir` specified in `__init__.py` (you can change this as you like, but remember that the `text_sequence_vocab_file` must live in the same directory). 58 | 59 | 3. `ECoGDataGenerator`, found in `data_generators.py`, is a shell class for generating data--more particularly for writing out the `TFRecords` that will be used for training and assessing your model--that plays nicely with the other classes. However, three of its (required!) methods are unspecified because they depend on how *you* store *your* data. (Dummy versions appear in `ECoGDataGenerator`; you can inspect their input and outputs there.) You should subclass `ECoGDataGenerator` and fill in these methods: 60 | * `_ecog_token_generator`: a Python generator that yields data structures in the form of a `dict`, each entry of which corresponds to a set of inputs and outputs on a single trial. For example, the entries might be `ecog_sequence`,`text_sequence`, `audio_sequence`, and `phoneme_sequence`. The last two are not strictly necessary for speech decoding and can be left out--or you can add more. 
Just *make sure that you return at least the data structures requested in the `data_mapping` specified in the `manifest`*. So e.g. if the `data_mapping` is 61 | ```data_mapping = {'decoder_targets': 'text_sequence', 'encoder_inputs': 'ecog_sequence'}``` 62 | then `_ecog_token_generator` must yield dictionaries containing *at least* (but not limited to) a `text_sequence` and an `ecog_sequence`. The entire dictionary will be written to a `TFRecord` (one for each block), so it's better to yield more rather than fewer data structures, in case you change your mind later about the `data_mapping` but don't want to have to rewrite all the `TFRecord`s. 63 | 64 | And one more thing: the `text_sequence_vocab_file` key in the experiment manifest is linked to the `text_sequence` in this data mapping. So if you plan to call your `decoder_targets` something else, say `my_words`, then make sure to rename the key in the experiment manifest that points to a vocab file to `my_words_vocab_file`. 65 | * `_get_wav_data`: should return the `sampling_rate` and audio `signal` for one (e.g.) block of audio data. This will allow you to make use of the built-in `_get_MFCC_features` in constructing your `_ecog_token_generator`. If you're never going to generate an `audio_sequence`, however, you can ignore it. 66 | * `_query`: should return the total number of examples in a group of blocks. This will allow you to allocate memory efficiently when using the `get` method. However, the methods `_query` and `get` are not used elsewhere in the code; they are convenience functions for examining the data directly rather than through a `TFRecord`. 67 | 68 | 69 | ## Training a model 70 | The basic commands to train a model are as follows (you can e.g. run this in a Python notebook). 71 | 72 | ``` 73 | import ecog2txt.trainers as e2t_trainers 74 | import ecog2txt.data_generators 75 | 76 | # CREATE A NEW MODEL 77 | trainer = e2t_trainers.MultiSubjectTrainer( 78 | experiment_manifest_name=YOUR_EXPERIMENT_manifest.yaml, 79 | subject_ids=[400, 401], 80 | SN_kwargs={ 81 | 'FF_dropout': 0.4, # overwriting whatever is in the manifest 82 | 'TEMPORALLY_CONVOLVE': True # overwriting whatever is in the manifest 83 | }, 84 | DG_kwargs={ 85 | 'REFERENCE_BIPOLAR': True, # overwriting whatever is in the manifest 86 | }, 87 | ES_kwargs = { 88 | 'data_mapping': { # overwriting whatever is in the manifest 89 | 'encoder_inputs': 'ecog_sequence', 90 | 'decoder_targets': 'text_sequence', 91 | }, 92 | }, 93 | ) 94 | 95 | # MAKE SURE ALL THE TFRECORDS ARE WRITTEN 96 | for subject in trainer.ecog_subjects: 97 | subject.write_tf_records_maybe() 98 | trainer.subject_to_table() 99 | 100 | # TRAIN THE TWO SUBJECTS IN PARALLEL 101 | assessments = trainer.parallel_transfer_learn() 102 | ``` 103 | -------------------------------------------------------------------------------- /ecog2txt/auxiliary/EFC/mocha-1_word_sequence.yaml: -------------------------------------------------------------------------------- 1 | 400: 2 | # DataGenerator: &id001 !!python/name:ecog2txt.data_generators.ECoGDataGenerator '' 3 | # DataGenerator: &id001 !!python/name:ecog2txt.chang_lab_data_generators.BambooDataGenerator '' 4 | DataGenerator: &id001 !!python/name:ecog2txt.makin_lab_data_generators.SpeechDataGenerator '' 5 | EMA_decay: 0.99 6 | FF_dropout: 0.1 7 | N_epochs: 800 8 | REFERENCE_BIPOLAR: true 9 | RGB_color: !!python/tuple 10 | - 0.4 11 | - 0.6509803921568628 12 | - 0.11764705882352941 13 | RNN_dropout: 0.5 14 | TEMPORALLY_CONVOLVE: true 15 | USE_FIELD_POTENTIALS: false 16 
| USE_LOG_MELS: false 17 | USE_MFCC_DELTAS: false 18 | alias: participant a 19 | anatomy_grand_list: 20 | - middle frontal 21 | - inferior temporal 22 | - middle temporal 23 | - superior temporal 24 | - IFG 25 | - supramarginal 26 | - vSMC 27 | assessment_epoch_interval: 10 28 | audio_sequence_penalty_scale: 0.1 29 | azimuth: 0 30 | bad_electrodes_path: /home/makin/data/ecog/EFC400/bad_electrodes 31 | beam_width: 1 32 | block_descriptors: 33 | - type 34 | block_types: 35 | testing: !!set 36 | mocha-1: null 37 | training: !!set 38 | mocha-1: null 39 | validation: !!set 40 | mocha-1: null 41 | data_mapping: 42 | decoder_targets: text_sequence 43 | encoder_1_targets: phoneme_sequence 44 | encoder_inputs: ecog_sequence 45 | decimation_factor: null 46 | electrode_path: /home/makin/data/ecog/EFC400/TDT_elecs_all.mat 47 | elevation: 0 48 | encoder_1_targets_penalty_scale: 1.0 49 | grid_names: 50 | - R256GridElectrode 51 | grid_size: 52 | - 16 53 | - 16 54 | grid_step: 1 55 | json_dir: /home/makin/code/ecog2txt/ecog2txt/auxiliary/EFC 56 | layer_sizes: 57 | decoder_embedding: 58 | - 150 59 | decoder_projection: [] 60 | decoder_rnn: 61 | - 800 62 | encoder_1_projection: 63 | - 225 64 | encoder_embedding: 65 | - 100 66 | encoder_rnn: 67 | - 400 68 | - 400 69 | - 400 70 | mfcc_winlen: 0.02 71 | model_class: null 72 | num_cepstral_coeffs: 0 73 | num_mel_features: 26 74 | num_unique_training_sentences: 50 75 | phoneme_sequence_vocab_file: vocab.phonemes.42 76 | png_partial_path: /home/makin/figures/word_sequence/EFC400/{0}.png 77 | project: EFC 78 | sampling_rate: 200 79 | sampling_rate_decimated: 16.5 80 | saved_results_dir: /home/makin/data/ecog2txt/word_sequence/saved_results 81 | subject_name: EFC400 82 | temperature: 0.384 83 | text_sequence_vocab_file: vocab.mocha-timit.1806 84 | tf_record_partial_path: /home/makin/data/ecog2txt/word_sequence/tf_records/EFC400_B{0}.tfrecord 85 | tf_summaries_dir: /home/makin/data/ecog2txt/word_sequence/tf_summaries 86 | tikz_partial_path: /home/makin/Documents/#texs/tikzpics/word_sequence/EFC400/{0}.tex 87 | token_type: word_sequence 88 | 401: 89 | # DataGenerator: *id001 90 | DataGenerator: !!python/name:ecog2txt.makin_lab_data_generators.SpeechDataGenerator '' 91 | EMA_decay: 0.99 92 | FF_dropout: 0.1 93 | N_epochs: 800 94 | REFERENCE_BIPOLAR: true 95 | RGB_color: !!python/tuple 96 | - 0.9058823529411765 97 | - 0.1607843137254902 98 | - 0.5411764705882353 99 | RNN_dropout: 0.5 100 | TEMPORALLY_CONVOLVE: true 101 | USE_FIELD_POTENTIALS: false 102 | USE_LOG_MELS: false 103 | USE_MFCC_DELTAS: false 104 | alias: participant b 105 | anatomy_grand_list: 106 | - middle frontal 107 | - inferior temporal 108 | - middle temporal 109 | - superior temporal 110 | - IFG 111 | - supramarginal 112 | - vSMC 113 | assessment_epoch_interval: 10 114 | audio_sequence_penalty_scale: 0.1 115 | azimuth: 180 116 | bad_electrodes_path: /home/makin/data/ecog/EFC401/bad_electrodes 117 | beam_width: 1 118 | block_descriptors: 119 | - type 120 | block_types: 121 | testing: !!set 122 | mocha-1: null 123 | training: !!set 124 | mocha-1: null 125 | validation: !!set 126 | mocha-1: null 127 | data_mapping: 128 | decoder_targets: text_sequence 129 | encoder_1_targets: phoneme_sequence 130 | encoder_inputs: ecog_sequence 131 | decimation_factor: null 132 | electrode_path: /home/makin/data/ecog/EFC401/TDT_elecs_all.mat 133 | elevation: 0 134 | encoder_1_targets_penalty_scale: 1.0 135 | grid_names: 136 | - L256GridElectrode 137 | grid_size: 138 | - 16 139 | - 16 140 | grid_step: 1 141 | json_dir: 
/home/makin/code/ecog2txt/ecog2txt/auxiliary/EFC 142 | layer_sizes: 143 | decoder_embedding: 144 | - 150 145 | decoder_projection: [] 146 | decoder_rnn: 147 | - 800 148 | encoder_1_projection: 149 | - 225 150 | encoder_embedding: 151 | - 100 152 | encoder_rnn: 153 | - 400 154 | - 400 155 | - 400 156 | mfcc_winlen: 0.02 157 | model_class: null 158 | num_cepstral_coeffs: 0 159 | num_mel_features: 26 160 | num_unique_training_sentences: 50 161 | phoneme_sequence_vocab_file: vocab.phonemes.42 162 | png_partial_path: /home/makin/figures/word_sequence/EFC401/{0}.png 163 | project: EFC 164 | sampling_rate: 200 165 | sampling_rate_decimated: 16.5 166 | saved_results_dir: /home/makin/data/ecog2txt/word_sequence/saved_results 167 | subject_name: EFC401 168 | temperature: 0.384 169 | text_sequence_vocab_file: vocab.mocha-timit.1806 170 | tf_record_partial_path: /home/makin/data/ecog2txt/word_sequence/tf_records/EFC401_B{0}.tfrecord 171 | tf_summaries_dir: /home/makin/data/ecog2txt/word_sequence/tf_summaries 172 | tikz_partial_path: /home/makin/Documents/#texs/tikzpics/word_sequence/EFC401/{0}.tex 173 | token_type: word_sequence 174 | 402: 175 | DataGenerator: *id001 176 | EMA_decay: 0.99 177 | FF_dropout: 0.1 178 | N_epochs: 800 179 | REFERENCE_BIPOLAR: true 180 | RGB_color: !!python/tuple 181 | - 0.4588235294117647 182 | - 0.4392156862745098 183 | - 0.7019607843137254 184 | RNN_dropout: 0.5 185 | TEMPORALLY_CONVOLVE: true 186 | USE_FIELD_POTENTIALS: false 187 | USE_LOG_MELS: false 188 | USE_MFCC_DELTAS: false 189 | alias: participant c 190 | anatomy_grand_list: 191 | - middle frontal 192 | - inferior temporal 193 | - middle temporal 194 | - superior temporal 195 | - IFG 196 | - supramarginal 197 | - vSMC 198 | assessment_epoch_interval: 10 199 | audio_sequence_penalty_scale: 0.1 200 | azimuth: 180 201 | bad_electrodes_path: /home/makin/data/ecog/EFC402/bad_electrodes 202 | beam_width: 1 203 | block_descriptors: 204 | - type 205 | block_types: 206 | testing: !!set 207 | mocha-1: null 208 | training: !!set 209 | mocha-1: null 210 | validation: !!set 211 | mocha-1: null 212 | data_mapping: 213 | decoder_targets: text_sequence 214 | encoder_1_targets: phoneme_sequence 215 | encoder_inputs: ecog_sequence 216 | decimation_factor: null 217 | electrode_path: /home/makin/data/ecog/EFC402/TDT_elecs_all.mat 218 | elevation: 10 219 | encoder_1_targets_penalty_scale: 1.0 220 | grid_names: 221 | - InferiorGrid 222 | grid_size: 223 | - 8 224 | - 16 225 | grid_step: 1 226 | json_dir: /home/makin/code/ecog2txt/ecog2txt/auxiliary/EFC 227 | layer_sizes: 228 | decoder_embedding: 229 | - 150 230 | decoder_projection: [] 231 | decoder_rnn: 232 | - 800 233 | encoder_1_projection: 234 | - 225 235 | encoder_embedding: 236 | - 100 237 | encoder_rnn: 238 | - 400 239 | - 400 240 | - 400 241 | mfcc_winlen: 0.02 242 | model_class: null 243 | num_cepstral_coeffs: 0 244 | num_mel_features: 26 245 | num_unique_training_sentences: 50 246 | phoneme_sequence_vocab_file: vocab.phonemes.42 247 | png_partial_path: /home/makin/figures/word_sequence/EFC402/{0}.png 248 | project: EFC 249 | sampling_rate: 190.73486328125 250 | sampling_rate_decimated: 16.5 251 | saved_results_dir: /home/makin/data/ecog2txt/word_sequence/saved_results 252 | subject_name: EFC402 253 | temperature: 0.384 254 | text_sequence_vocab_file: vocab.mocha-timit.1806 255 | tf_record_partial_path: /home/makin/data/ecog2txt/word_sequence/tf_records/EFC402_B{0}.tfrecord 256 | tf_summaries_dir: /home/makin/data/ecog2txt/word_sequence/tf_summaries 257 | tikz_partial_path: 
/home/makin/Documents/#texs/tikzpics/word_sequence/EFC402/{0}.tex 258 | token_type: word_sequence 259 | 403: 260 | DataGenerator: *id001 261 | EMA_decay: 0.99 262 | FF_dropout: 0.1 263 | N_epochs: 800 264 | REFERENCE_BIPOLAR: true 265 | RGB_color: !!python/tuple 266 | - 0.6509803921568628 267 | - 0.4627450980392157 268 | - 0.11372549019607843 269 | RNN_dropout: 0.5 270 | TEMPORALLY_CONVOLVE: true 271 | USE_FIELD_POTENTIALS: false 272 | USE_LOG_MELS: false 273 | USE_MFCC_DELTAS: false 274 | alias: participant d 275 | anatomy_grand_list: 276 | - middle frontal 277 | - inferior temporal 278 | - middle temporal 279 | - superior temporal 280 | - IFG 281 | - supramarginal 282 | - vSMC 283 | assessment_epoch_interval: 10 284 | audio_sequence_penalty_scale: 0.1 285 | azimuth: 170 286 | bad_electrodes_path: /home/makin/data/ecog/EFC403/bad_electrodes 287 | beam_width: 1 288 | block_descriptors: 289 | - type 290 | block_types: 291 | testing: !!set 292 | mocha-1: null 293 | training: !!set 294 | mocha-1: null 295 | validation: !!set 296 | mocha-1: null 297 | data_mapping: 298 | decoder_targets: text_sequence 299 | encoder_1_targets: phoneme_sequence 300 | encoder_inputs: ecog_sequence 301 | decimation_factor: null 302 | electrode_path: /home/makin/data/ecog/EFC403/TDT_elecs_all.mat 303 | elevation: 0 304 | encoder_1_targets_penalty_scale: 1.0 305 | grid_names: 306 | - Grid 307 | grid_size: 308 | - 16 309 | - 16 310 | grid_step: 1 311 | json_dir: /home/makin/code/ecog2txt/ecog2txt/auxiliary/EFC 312 | layer_sizes: 313 | decoder_embedding: 314 | - 150 315 | decoder_projection: [] 316 | decoder_rnn: 317 | - 800 318 | encoder_1_projection: 319 | - 225 320 | encoder_embedding: 321 | - 100 322 | encoder_rnn: 323 | - 400 324 | - 400 325 | - 400 326 | mfcc_winlen: 0.02 327 | model_class: null 328 | num_cepstral_coeffs: 0 329 | num_mel_features: 26 330 | num_unique_training_sentences: 50 331 | phoneme_sequence_vocab_file: vocab.phonemes.42 332 | png_partial_path: /home/makin/figures/word_sequence/EFC403/{0}.png 333 | project: EFC 334 | sampling_rate: 190.73486328125 335 | sampling_rate_decimated: 16.5 336 | saved_results_dir: /home/makin/data/ecog2txt/word_sequence/saved_results 337 | subject_name: EFC403 338 | temperature: 0.384 339 | text_sequence_vocab_file: vocab.mocha-timit.1806 340 | tf_record_partial_path: /home/makin/data/ecog2txt/word_sequence/tf_records/EFC403_B{0}.tfrecord 341 | tf_summaries_dir: /home/makin/data/ecog2txt/word_sequence/tf_summaries 342 | tikz_partial_path: /home/makin/Documents/#texs/tikzpics/word_sequence/EFC403/{0}.tex 343 | token_type: word_sequence 344 | -------------------------------------------------------------------------------- /ecog2txt/auxiliary/EFC/mochastar_word_sequence.yaml: -------------------------------------------------------------------------------- 1 | 400: 2 | DataGenerator: &id001 !!python/name:ecog2txt.makin_lab_data_generators.SpeechDataGenerator '' 3 | EMA_decay: 0.99 4 | FF_dropout: 0.1 5 | N_epochs: 800 6 | REFERENCE_BIPOLAR: true 7 | RGB_color: !!python/tuple 8 | - 0.4 9 | - 0.6509803921568628 10 | - 0.11764705882352941 11 | RNN_dropout: 0.5 12 | TEMPORALLY_CONVOLVE: true 13 | USE_FIELD_POTENTIALS: false 14 | USE_LOG_MELS: false 15 | USE_MFCC_DELTAS: false 16 | alias: participant a 17 | anatomy_grand_list: 18 | - middle frontal 19 | - inferior temporal 20 | - middle temporal 21 | - superior temporal 22 | - IFG 23 | - supramarginal 24 | - vSMC 25 | assessment_epoch_interval: 10 26 | audio_sequence_penalty_scale: 0.1 27 | azimuth: 0 28 | 
bad_electrodes_path: /home/makin/data/ecog/EFC400/bad_electrodes 29 | beam_width: 1 30 | block_descriptors: 31 | - type 32 | block_types: 33 | testing: !!set 34 | mocha-1: null 35 | training: !!set 36 | mocha-1: null 37 | mocha-2: null 38 | mocha-3: null 39 | mocha-4: null 40 | mocha-5: null 41 | mocha-6: null 42 | mocha-7: null 43 | mocha-8: null 44 | mocha-9: null 45 | validation: !!set 46 | mocha-1: null 47 | data_mapping: 48 | decoder_targets: text_sequence 49 | encoder_1_targets: phoneme_sequence 50 | encoder_inputs: ecog_sequence 51 | decimation_factor: null 52 | electrode_path: /home/makin/data/ecog/EFC400/TDT_elecs_all.mat 53 | elevation: 0 54 | encoder_1_targets_penalty_scale: 1.0 55 | grid_names: 56 | - R256GridElectrode 57 | grid_size: 58 | - 16 59 | - 16 60 | grid_step: 1 61 | json_dir: /home/makin/code/ecog2txt/ecog2txt/auxiliary/EFC 62 | layer_sizes: 63 | decoder_embedding: 64 | - 150 65 | decoder_projection: [] 66 | decoder_rnn: 67 | - 800 68 | encoder_1_projection: 69 | - 225 70 | encoder_embedding: 71 | - 100 72 | encoder_rnn: 73 | - 400 74 | - 400 75 | - 400 76 | mfcc_winlen: 0.02 77 | model_class: null 78 | num_cepstral_coeffs: 0 79 | num_mel_features: 26 80 | num_unique_training_sentences: null 81 | phoneme_sequence_vocab_file: vocab.phonemes.42 82 | png_partial_path: /home/makin/figures/word_sequence/EFC400/{0}.png 83 | project: EFC 84 | sampling_rate: 200 85 | sampling_rate_decimated: 16.5 86 | saved_results_dir: /home/makin/data/ecog2txt/word_sequence/saved_results 87 | subject_name: EFC400 88 | temperature: 0.384 89 | text_sequence_vocab_file: vocab.mocha-timit.1806 90 | tf_record_partial_path: /home/makin/data/ecog2txt/word_sequence/tf_records/EFC400_B{0}.tfrecord 91 | tf_summaries_dir: /home/makin/data/ecog2txt/word_sequence/tf_summaries 92 | tikz_partial_path: /home/makin/Documents/#texs/tikzpics/word_sequence/EFC400/{0}.tex 93 | token_type: word_sequence 94 | 401: 95 | DataGenerator: *id001 96 | EMA_decay: 0.99 97 | FF_dropout: 0.1 98 | N_epochs: 800 99 | REFERENCE_BIPOLAR: true 100 | RGB_color: !!python/tuple 101 | - 0.9058823529411765 102 | - 0.1607843137254902 103 | - 0.5411764705882353 104 | RNN_dropout: 0.5 105 | TEMPORALLY_CONVOLVE: true 106 | USE_FIELD_POTENTIALS: false 107 | USE_LOG_MELS: false 108 | USE_MFCC_DELTAS: false 109 | alias: participant b 110 | anatomy_grand_list: 111 | - middle frontal 112 | - inferior temporal 113 | - middle temporal 114 | - superior temporal 115 | - IFG 116 | - supramarginal 117 | - vSMC 118 | assessment_epoch_interval: 10 119 | audio_sequence_penalty_scale: 0.1 120 | azimuth: 180 121 | bad_electrodes_path: /home/makin/data/ecog/EFC401/bad_electrodes 122 | beam_width: 1 123 | block_descriptors: 124 | - type 125 | block_types: 126 | testing: !!set 127 | mocha-1: null 128 | training: !!set 129 | mocha-1: null 130 | mocha-2: null 131 | mocha-3: null 132 | mocha-4: null 133 | mocha-5: null 134 | mocha-6: null 135 | mocha-7: null 136 | mocha-8: null 137 | mocha-9: null 138 | validation: !!set 139 | mocha-1: null 140 | data_mapping: 141 | decoder_targets: text_sequence 142 | encoder_1_targets: phoneme_sequence 143 | encoder_inputs: ecog_sequence 144 | decimation_factor: null 145 | electrode_path: /home/makin/data/ecog/EFC401/TDT_elecs_all.mat 146 | elevation: 0 147 | encoder_1_targets_penalty_scale: 1.0 148 | grid_names: 149 | - L256GridElectrode 150 | grid_size: 151 | - 16 152 | - 16 153 | grid_step: 1 154 | json_dir: /home/makin/code/ecog2txt/ecog2txt/auxiliary/EFC 155 | layer_sizes: 156 | decoder_embedding: 157 | - 150 158 | 
decoder_projection: [] 159 | decoder_rnn: 160 | - 800 161 | encoder_1_projection: 162 | - 225 163 | encoder_embedding: 164 | - 100 165 | encoder_rnn: 166 | - 400 167 | - 400 168 | - 400 169 | mfcc_winlen: 0.02 170 | model_class: null 171 | num_cepstral_coeffs: 0 172 | num_mel_features: 26 173 | num_unique_training_sentences: null 174 | phoneme_sequence_vocab_file: vocab.phonemes.42 175 | png_partial_path: /home/makin/figures/word_sequence/EFC401/{0}.png 176 | project: EFC 177 | sampling_rate: 200 178 | sampling_rate_decimated: 16.5 179 | saved_results_dir: /home/makin/data/ecog2txt/word_sequence/saved_results 180 | subject_name: EFC401 181 | temperature: 0.384 182 | text_sequence_vocab_file: vocab.mocha-timit.1806 183 | tf_record_partial_path: /home/makin/data/ecog2txt/word_sequence/tf_records/EFC401_B{0}.tfrecord 184 | tf_summaries_dir: /home/makin/data/ecog2txt/word_sequence/tf_summaries 185 | tikz_partial_path: /home/makin/Documents/#texs/tikzpics/word_sequence/EFC401/{0}.tex 186 | token_type: word_sequence 187 | 402: 188 | DataGenerator: *id001 189 | EMA_decay: 0.99 190 | FF_dropout: 0.1 191 | N_epochs: 800 192 | REFERENCE_BIPOLAR: true 193 | RGB_color: !!python/tuple 194 | - 0.4588235294117647 195 | - 0.4392156862745098 196 | - 0.7019607843137254 197 | RNN_dropout: 0.5 198 | TEMPORALLY_CONVOLVE: true 199 | USE_FIELD_POTENTIALS: false 200 | USE_LOG_MELS: false 201 | USE_MFCC_DELTAS: false 202 | alias: participant c 203 | anatomy_grand_list: 204 | - middle frontal 205 | - inferior temporal 206 | - middle temporal 207 | - superior temporal 208 | - IFG 209 | - supramarginal 210 | - vSMC 211 | assessment_epoch_interval: 10 212 | audio_sequence_penalty_scale: 0.1 213 | azimuth: 180 214 | bad_electrodes_path: /home/makin/data/ecog/EFC402/bad_electrodes 215 | beam_width: 1 216 | block_descriptors: 217 | - type 218 | block_types: 219 | testing: !!set 220 | mocha-1: null 221 | training: !!set 222 | mocha-1: null 223 | mocha-2: null 224 | mocha-3: null 225 | mocha-4: null 226 | mocha-5: null 227 | mocha-6: null 228 | mocha-7: null 229 | mocha-8: null 230 | mocha-9: null 231 | validation: !!set 232 | mocha-1: null 233 | data_mapping: 234 | decoder_targets: text_sequence 235 | encoder_1_targets: phoneme_sequence 236 | encoder_inputs: ecog_sequence 237 | decimation_factor: null 238 | electrode_path: /home/makin/data/ecog/EFC402/TDT_elecs_all.mat 239 | elevation: 10 240 | encoder_1_targets_penalty_scale: 1.0 241 | grid_names: 242 | - InferiorGrid 243 | grid_size: 244 | - 8 245 | - 16 246 | grid_step: 1 247 | json_dir: /home/makin/code/ecog2txt/ecog2txt/auxiliary/EFC 248 | layer_sizes: 249 | decoder_embedding: 250 | - 150 251 | decoder_projection: [] 252 | decoder_rnn: 253 | - 800 254 | encoder_1_projection: 255 | - 225 256 | encoder_embedding: 257 | - 100 258 | encoder_rnn: 259 | - 400 260 | - 400 261 | - 400 262 | mfcc_winlen: 0.02 263 | model_class: null 264 | num_cepstral_coeffs: 0 265 | num_mel_features: 26 266 | num_unique_training_sentences: null 267 | phoneme_sequence_vocab_file: vocab.phonemes.42 268 | png_partial_path: /home/makin/figures/word_sequence/EFC402/{0}.png 269 | project: EFC 270 | sampling_rate: 190.73486328125 271 | sampling_rate_decimated: 16.5 272 | saved_results_dir: /home/makin/data/ecog2txt/word_sequence/saved_results 273 | subject_name: EFC402 274 | temperature: 0.384 275 | text_sequence_vocab_file: vocab.mocha-timit.1806 276 | tf_record_partial_path: /home/makin/data/ecog2txt/word_sequence/tf_records/EFC402_B{0}.tfrecord 277 | tf_summaries_dir: 
/home/makin/data/ecog2txt/word_sequence/tf_summaries 278 | tikz_partial_path: /home/makin/Documents/#texs/tikzpics/word_sequence/EFC402/{0}.tex 279 | token_type: word_sequence 280 | 403: 281 | DataGenerator: *id001 282 | EMA_decay: 0.99 283 | FF_dropout: 0.1 284 | N_epochs: 800 285 | REFERENCE_BIPOLAR: true 286 | RGB_color: !!python/tuple 287 | - 0.6509803921568628 288 | - 0.4627450980392157 289 | - 0.11372549019607843 290 | RNN_dropout: 0.5 291 | TEMPORALLY_CONVOLVE: true 292 | USE_FIELD_POTENTIALS: false 293 | USE_LOG_MELS: false 294 | USE_MFCC_DELTAS: false 295 | alias: participant d 296 | anatomy_grand_list: 297 | - middle frontal 298 | - inferior temporal 299 | - middle temporal 300 | - superior temporal 301 | - IFG 302 | - supramarginal 303 | - vSMC 304 | assessment_epoch_interval: 10 305 | audio_sequence_penalty_scale: 0.1 306 | azimuth: 170 307 | bad_electrodes_path: /home/makin/data/ecog/EFC403/bad_electrodes 308 | beam_width: 1 309 | block_descriptors: 310 | - type 311 | block_types: 312 | testing: !!set 313 | mocha-1: null 314 | training: !!set 315 | mocha-1: null 316 | mocha-2: null 317 | mocha-3: null 318 | mocha-4: null 319 | mocha-5: null 320 | mocha-6: null 321 | mocha-7: null 322 | mocha-8: null 323 | mocha-9: null 324 | validation: !!set 325 | mocha-1: null 326 | data_mapping: 327 | decoder_targets: text_sequence 328 | encoder_1_targets: phoneme_sequence 329 | encoder_inputs: ecog_sequence 330 | decimation_factor: null 331 | electrode_path: /home/makin/data/ecog/EFC403/TDT_elecs_all.mat 332 | elevation: 0 333 | encoder_1_targets_penalty_scale: 1.0 334 | grid_names: 335 | - Grid 336 | grid_size: 337 | - 16 338 | - 16 339 | grid_step: 1 340 | json_dir: /home/makin/code/ecog2txt/ecog2txt/auxiliary/EFC 341 | layer_sizes: 342 | decoder_embedding: 343 | - 150 344 | decoder_projection: [] 345 | decoder_rnn: 346 | - 800 347 | encoder_1_projection: 348 | - 225 349 | encoder_embedding: 350 | - 100 351 | encoder_rnn: 352 | - 400 353 | - 400 354 | - 400 355 | mfcc_winlen: 0.02 356 | model_class: null 357 | num_cepstral_coeffs: 0 358 | num_mel_features: 26 359 | num_unique_training_sentences: null 360 | phoneme_sequence_vocab_file: vocab.phonemes.42 361 | png_partial_path: /home/makin/figures/word_sequence/EFC403/{0}.png 362 | project: EFC 363 | sampling_rate: 190.73486328125 364 | sampling_rate_decimated: 16.5 365 | saved_results_dir: /home/makin/data/ecog2txt/word_sequence/saved_results 366 | subject_name: EFC403 367 | temperature: 0.384 368 | text_sequence_vocab_file: vocab.mocha-timit.1806 369 | tf_record_partial_path: /home/makin/data/ecog2txt/word_sequence/tf_records/EFC403_B{0}.tfrecord 370 | tf_summaries_dir: /home/makin/data/ecog2txt/word_sequence/tf_summaries 371 | tikz_partial_path: /home/makin/Documents/#texs/tikzpics/word_sequence/EFC403/{0}.tex 372 | token_type: word_sequence 373 | -------------------------------------------------------------------------------- /ecog2txt/data_generators.py: -------------------------------------------------------------------------------- 1 | # standard libraries 2 | import pdb 3 | import os 4 | 5 | # third-party packages 6 | import numpy as np 7 | from scipy.fftpack import dct 8 | import tensorflow as tf 9 | try: 10 | from tensor2tensor.data_generators import text_encoder 11 | except ModuleNotFoundError: 12 | print('WARNING: tensor2tensor missing; skipping') 13 | try: 14 | from python_speech_features import delta, fbank, lifter 15 | except ModuleNotFoundError: 16 | pass 17 | 18 | # local 19 | from machine_learning.neural_networks import 
tf_helpers as tfh 20 | from utils_jgm.toolbox import auto_attribute 21 | from ecog2txt import text_dir 22 | 23 | 24 | '''' 25 | The ECoGDataGenerator class and related functions for assembling ECoG data, 26 | and then writing them out with a generator, to a numpy tensor, or 27 | to a tfrecord (tensorflow protobuf). 28 | 29 | :Author: J.G. Makin (except where otherwise noted) 30 | 31 | Created: July 2017 32 | Revised: 02/18/20 33 | ''' 34 | 35 | max_seconds_dict = { 36 | 'phoneme': 0.2, 37 | 'word': 1.0, 38 | 'word_sequence': 6.25, 39 | 'word_piece_sequence': 6.25, 40 | 'phoneme_sequence': 6.25, 41 | 'trial': 6.25 42 | } 43 | 44 | 45 | class ECoGDataGenerator: 46 | 47 | @auto_attribute(CHECK_MANIFEST=True) 48 | def __init__( 49 | self, 50 | manifest, 51 | subj_id, 52 | ##### 53 | # kwargs that will default to the manifest 54 | grid_step=None, 55 | num_cepstral_coeffs=None, 56 | mfcc_winlen=None, 57 | USE_LOG_MELS=None, 58 | USE_MFCC_DELTAS=None, 59 | USE_FIELD_POTENTIALS=None, 60 | REFERENCE_BIPOLAR=None, 61 | num_mel_features=None, 62 | sampling_rate=None, 63 | token_type=None, 64 | bad_electrodes_path=None, 65 | tf_record_partial_path=None, 66 | grid_size=None, 67 | max_seconds=None, 68 | max_samples=None, 69 | good_electrodes=None, 70 | ###### 71 | # private; don't assign these to self: 72 | # ... 73 | ): 74 | ''' 75 | A class shell for generating ECoG data and corresponding labels in a 76 | format suitable for a neural network to consume. There are two main 77 | methods to be used externally: 78 | get 79 | write_to_Protobuf_maybe 80 | 81 | To use this class, one should subclass it and provide at least these 82 | three methods: 83 | _get_wav_data 84 | _query 85 | _ecog_token_generator 86 | ''' 87 | 88 | # set this directly to None 89 | self._bipolar_to_elec_map = None 90 | 91 | # keys providing vocab file names end in _vocab_file; add them to self 92 | for key, value in manifest.items(): 93 | if key.endswith('_vocab_file'): 94 | setattr(self, key, value) 95 | 96 | @property 97 | def target_type(self): 98 | if 'sequence' in self.token_type: 99 | return 'Trial' 100 | else: 101 | return self.token_type.capitalize() 102 | 103 | @property 104 | def elec_layout(self): 105 | layout = np.arange(np.prod( 106 | self.grid_size)-1, -1, -1).reshape(self.grid_size).T 107 | 108 | # now correct for subsampling the grid 109 | return layout[::self.grid_step, ::self.grid_step] 110 | 111 | @property 112 | def bad_electrodes_path(self): 113 | if self._bad_electrodes_path is not None: 114 | return self._bad_electrodes_path 115 | else: 116 | return os.path.join(text_dir, 'bad_electrodes') 117 | 118 | @bad_electrodes_path.setter 119 | def bad_electrodes_path(self, bad_electrodes_path): 120 | self._bad_electrodes_path = bad_electrodes_path 121 | 122 | @property 123 | def tf_record_partial_path(self): 124 | # something of a hack: insert a subdir before the file name 125 | if self.REFERENCE_BIPOLAR and self.grid_step > 1: 126 | subdir = 'lowdensity_bipolar' 127 | return os.path.join( 128 | os.path.dirname(self._tf_record_partial_path), 129 | subdir, 130 | os.path.basename(self._tf_record_partial_path) 131 | ) 132 | else: 133 | return self._tf_record_partial_path 134 | 135 | @tf_record_partial_path.setter 136 | def tf_record_partial_path(self, tf_record_partial_path): 137 | self._tf_record_partial_path = tf_record_partial_path 138 | 139 | @property 140 | def max_seconds(self): 141 | # _max_seconds has precedence over the max_seconds_dict 142 | if self._max_seconds is not None: 143 | return self._max_seconds 144 | 
else: 145 | return max_seconds_dict.get(self.token_type, 0.2) 146 | 147 | @max_seconds.setter 148 | def max_seconds(self, max_seconds): 149 | self._max_seconds = max_seconds 150 | 151 | @property 152 | def max_samples(self): 153 | # _max_samples has precedence over max_seconds 154 | if self._max_samples is not None: 155 | return self._max_samples 156 | else: 157 | return int(np.floor(self.sampling_rate*self.max_seconds)) 158 | 159 | @max_samples.setter 160 | def max_samples(self, max_samples): 161 | self._max_samples = max_samples 162 | 163 | @property 164 | def num_MFCC_features(self): 165 | if self.USE_LOG_MELS: 166 | return self.num_mel_features + 1 167 | else: 168 | if self.USE_MFCC_DELTAS: 169 | return 2*self.num_cepstral_coeffs 170 | else: 171 | return self.num_cepstral_coeffs 172 | 173 | @property 174 | def good_electrodes(self): 175 | ''' 176 | NB!!! bad_electrodes are 1-indexed, good_electrodes are zero-indexed!! 177 | 178 | Since this is a set, it contains no order information. The canonical 179 | ordering is established with good_channels, since after all the data 180 | size is (... x Nchannels), not (... x Nelectrodes). 181 | ''' 182 | 183 | if self._good_electrodes is None: 184 | # construct by first loading the *bad*_electrodes 185 | with open(self.bad_electrodes_path, 'r') as f: 186 | bad_electrodes = f.readlines() 187 | bad_electrodes = [int(e.strip()) for e in bad_electrodes] 188 | return ( 189 | set(range(np.prod(self.grid_size))) - 190 | set(np.array(bad_electrodes)-1) 191 | ) 192 | else: 193 | return self._good_electrodes 194 | 195 | @good_electrodes.setter 196 | def good_electrodes(self, good_electrodes): 197 | self._good_electrodes = good_electrodes 198 | 199 | @property 200 | def good_channels(self): 201 | ''' 202 | Pseudo-channels, constructed (on the fly) from the physical electrodes. 203 | For now at least, we won't USE_FIELD_POTENTIALS if we want to 204 | REFERENCE_BIPOLAR. 205 | 206 | NB!!: The *order* of these channels matters--it determines the order of 207 | the input data, and therefore is required by the functions that plot 208 | electrode_contributions in plotters.py! And the order of these channels 209 | will be determined by the *elec_layout*. 210 | ''' 211 | 212 | # NB: this means that the electrodes are *not* in numerical order ('e1' 213 | # does not correspond to the 0th entry in all_electrodes): as you can 214 | # check, flattening the elec_layout does not yield an ordered list. 
215 | all_electrodes = self.elec_layout.flatten().tolist() 216 | 217 | if self.USE_FIELD_POTENTIALS: 218 | M = len(all_electrodes) 219 | return ( 220 | [e for e in all_electrodes if e in self.good_electrodes] + 221 | [e+M for e in all_electrodes if e in self.good_electrodes] 222 | ) 223 | elif self.REFERENCE_BIPOLAR: 224 | return [ 225 | ch for ch, elec_pair in enumerate(self.bipolar_to_elec_map) 226 | if all([e in self.good_electrodes for e in elec_pair]) 227 | ] 228 | else: 229 | return [e for e in all_electrodes if e in self.good_electrodes] 230 | 231 | @property 232 | def num_ECoG_channels(self): 233 | return len(self.good_channels) 234 | 235 | def sequence_type_to_vocab_file_path(self, sequence_type): 236 | # The vocab file *must* live in the text_dir 237 | vocab_file_key = '_'.join([sequence_type, 'vocab_file']) 238 | vocab_file = getattr(self, vocab_file_key, None) 239 | if vocab_file is not None: 240 | path = os.path.join(text_dir, vocab_file) 241 | if os.path.isfile(path): 242 | return path 243 | 244 | # if anything else failed, return None 245 | return None 246 | 247 | def get(self, block_set, sequence_types=None): 248 | '''Generate and pad data''' 249 | 250 | # init 251 | if sequence_types is None: 252 | sequence_types = ['ecog_sequence'] 253 | 254 | # The sequence_types 'ecog_sequence' and 'audio_sequence' are special: 255 | # their sizes are linked to properties of this data generator. The 256 | # others are assumed to be text or anyway sensibly stored in a list. 257 | # If other, non-text sequence_types are added, this preallocation 258 | # should be adjusted accordingly. 259 | 260 | # malloc the output_dict 261 | num_examples = self._query(block_set) 262 | output_dict = dict.fromkeys(sequence_types) 263 | for sequence_type in output_dict: 264 | if sequence_type == 'ecog_sequence': 265 | output_dict[sequence_type] = np.zeros( 266 | (num_examples, self.max_samples, self.num_ECoG_channels) 267 | ) 268 | elif sequence_type == 'audio_sequence': 269 | output_dict[sequence_type] = np.zeros( 270 | (num_examples, self.max_samples, self.num_MFCC_features) 271 | ) 272 | else: 273 | # presumably some kind of text.... 274 | output_dict[sequence_type] = [] 275 | 276 | # for each block... 277 | i_example = 0 278 | num_clipped = 0 279 | print('\nLoading data for tensor construction...') 280 | for block in block_set: 281 | 282 | # ...get a lazy iterator... 
283 | data_iterator = self._ecog_token_generator(block) 284 | 285 | # ...and iterate through it 286 | for element in data_iterator: 287 | 288 | # pack each entry of element into its output data_struct 289 | for sequence_type, data_struct in output_dict.items(): 290 | assert sequence_type in element, ( 291 | "The sequence_type {} in the sequence_types passed to" 292 | " this method (or defaulted to) is not in the generator" 293 | ).format(sequence_type) 294 | token = element[sequence_type] 295 | if type(data_struct) is list: 296 | data_struct.append(token) 297 | elif type(data_struct) is np.ndarray: 298 | excess = self.max_samples - token.shape[0] 299 | if excess == 0: 300 | num_clipped += 1 301 | token = np.pad(token, ((0, excess), (0, 0)), 'constant') 302 | data_struct[i_example, :, :] = np.expand_dims( 303 | token, axis=0) 304 | else: 305 | raise ValueError('Unexpected data structure!') 306 | 307 | i_example += 1 308 | 309 | # some information 310 | if num_clipped > 0: 311 | print('\n\n') 312 | print('WARNING: %i of %i sequences' % (num_clipped, i_example), end='') 313 | print(' (%.2f%%) have been clipped' % (100*num_clipped/i_example)) 314 | 315 | return output_dict 316 | 317 | def _write_to_Protobuf(self, block): 318 | ''' 319 | Collect the relevant ECoG data and then write to disk as a (google) 320 | protocol buffer. 321 | ''' 322 | writer = tf.io.TFRecordWriter( 323 | self.tf_record_partial_path.format(block)) 324 | for example_dict in self._ecog_token_generator(block): 325 | feature_example = tfh.make_feature_example(example_dict) 326 | writer.write(feature_example.SerializeToString()) 327 | 328 | def _get_MFCC_features(self, index, winstep, nfft=512): 329 | 330 | # first load the .wav file 331 | audio_sampling_rate, audio_signal = self._get_wav_data(index) 332 | 333 | # now convert to MFCCs 334 | if audio_signal is None: 335 | # No need to warn: that will have been done in _get_wav_data. 336 | # NB: This sets the MFCCs to *length-zero* sequences of vectors, 337 | # each *of length num_MFCC_features*. When called by self.get(), 338 | # the sequences will anyway be padded out to self.max_samples. But 339 | # when the generator is called directly, zero-length sequences 340 | # will indeed by returned. 341 | return np.zeros((0, self.num_MFCC_features)) 342 | elif self.num_MFCC_features == 0: 343 | print('WARNING: no MFCCs requested') 344 | # NB: You have yet to use this. That is, in theory this allows one 345 | # to request that no MFCCs be packaged with the other data; but in 346 | # practice when training a SequenceNetwork w/o encoder targetting, 347 | # you don't bother (you wouldn't want to have to re-create the tf 348 | # records), and instead just set encoder targets penalty=0. 
349 | Nsamples = int(audio_signal.shape[0]/audio_sampling_rate/winstep) 350 | return np.zeros((Nsamples, 0)) 351 | else: 352 | # unpack the log-mel calculations, because you may just use them 353 | lowfreq = 0 354 | highfreq = None 355 | preemph = 0.97 356 | ceplifter = 22 357 | features, energy = fbank( 358 | audio_signal, audio_sampling_rate, self.mfcc_winlen, winstep, 359 | self.num_mel_features, nfft, lowfreq, highfreq, preemph, 360 | lambda x: np.ones((x,)) 361 | ) 362 | features = np.log(features) 363 | 364 | # use MFCCs (as opposed to log-mels) 365 | if not self.USE_LOG_MELS: 366 | features = dct(features, type=2, axis=1, norm='ortho') 367 | features = features[:, :self.num_cepstral_coeffs] 368 | features = lifter(features, ceplifter) 369 | features[:, 0] = np.log(energy) 370 | else: 371 | features = np.concatenate( 372 | (features, np.log(energy)[:, None]), axis=1) 373 | 374 | # use deltas? 375 | mfccs = ( 376 | np.concatenate((features, delta(features, N=2)), axis=1) 377 | if self.USE_MFCC_DELTAS else features 378 | ) 379 | 380 | return mfccs 381 | 382 | def write_to_Protobuf_maybe(self, sequence_type, block_set): 383 | 384 | from ecog2txt.subjects import SequenceDataManifest 385 | 386 | # set up a data manifest for loading in the sequences 387 | manifest = SequenceDataManifest(sequence_type, num_features_raw=1) 388 | 389 | target_list = [] 390 | for block in block_set: 391 | # wtf? 392 | # if block == 46: 393 | # pass 394 | 395 | data_path = self.tf_record_partial_path.format(block) 396 | if not os.path.exists(data_path): 397 | self._write_to_Protobuf(block) 398 | 399 | # grab the contribution of this block to the target_list 400 | simple_graph = tf.Graph() 401 | with simple_graph.as_default(): 402 | dataset = tf.data.TFRecordDataset(data_path) 403 | dataset = dataset.map( 404 | lambda example_proto: tfh.parse_protobuf_seq2seq_example( 405 | example_proto, {'seq': manifest} 406 | ) 407 | ) 408 | next_example = tf.compat.v1.data.make_one_shot_iterator( 409 | dataset).get_next() 410 | with tf.compat.v1.Session(graph=simple_graph) as sess: 411 | while True: 412 | try: 413 | target_list.append( 414 | sess.run(next_example['seq'])[:, 0].tolist() 415 | ) 416 | except tf.errors.OutOfRangeError: 417 | # print('block %i is ready' % block) 418 | break 419 | print('.', end='') 420 | print() 421 | 422 | # bytes -> strings, and only return the unique elements 423 | return list(set( 424 | w.decode('utf-8') for word_list in target_list for w in word_list 425 | )) 426 | 427 | def get_class_list(self, sequence_type=None, block_set=None): 428 | if sequence_type is not None: 429 | vocab_file_path = self.sequence_type_to_vocab_file_path(sequence_type) 430 | if self.token_type == 'word_piece_sequence': 431 | class_list = self.TokenEncoder( 432 | vocab_file_path)._all_subtoken_strings 433 | else: 434 | with open(vocab_file_path, 'r') as f: 435 | class_list = [word for word in f.read().split()] 436 | elif block_set is not None: 437 | class_list = self.write_to_Protobuf_maybe(sequence_type, block_set) 438 | else: 439 | raise ValueError( 440 | 'get_class_list requires at least one of a sequence_type or a' 441 | ' block_set (the former has priority) as an input argument.' 442 | ) 443 | 444 | return class_list 445 | 446 | def _sentence_tokenize(self, token_list, sequence_type=None): 447 | # NB that conversion to UTF-8 (bytes objects) also happens here: 448 | # token_list is a list of *strings*, but the tokenized_sentence is a 449 | # list of *bytes*. 
450 | 451 | if self.token_type == 'word_piece_sequence': 452 | # get the encoder and unique_targets via tensor2tensor code 453 | vocab_file_path = self.sequence_type_to_vocab_file_path(sequence_type) 454 | token_encoder = self.TokenEncoder(vocab_file_path) 455 | unique_targets = token_encoder._all_subtoken_strings 456 | 457 | # we can't just encode, we must also break into subwords 458 | indices = token_encoder.encode(' '.join( 459 | [token.lower() for token in token_list])) 460 | tokenized_sentence = [ 461 | unique_targets[i].encode('utf-8') for i in indices] 462 | elif self.token_type == 'trial': 463 | # So that we can use vocab_files with (one-word) trials, we always 464 | # append an underscore to each word, before joining them together. 465 | tokenized_sentence = [' '.join( 466 | [token.lower() + '_' for token in token_list] 467 | ).encode('utf-8')] 468 | else: 469 | # all other token_types are lists (possibly of length-1) of 470 | # underscore-postfixed tokens 471 | tokenized_sentence = [ 472 | (token.lower() + '_').encode('utf-8') for token in token_list 473 | ] 474 | 475 | return tokenized_sentence 476 | 477 | def TokenEncoder(self, vocab_file_path): 478 | ''' 479 | if self.token_type == 'word_piece_sequence': 480 | return text_encoder.SubwordTextEncoder(vocab_file_path) 481 | else: 482 | return text_encoder.TokenTextEncoder( 483 | vocab_file_path, replace_oov=OOV_token) 484 | ''' 485 | return text_encoder.SubwordTextEncoder(vocab_file_path) 486 | 487 | ############# 488 | # DUMMY PROPERTIES AND METHODS 489 | @property 490 | def bipolar_to_elec_map(self): 491 | # print('WARNING!!!! MAKING UP bipolar_to_elec_map!!!') 492 | elec_map = [] 493 | layout = self.elec_layout # for short 494 | for i in range(layout.shape[0]): 495 | for j in range(layout.shape[1]): 496 | if j < layout.shape[1]-1: 497 | elec_map.append((layout[i, j], layout[i, j+1])) 498 | if i < layout.shape[0]-1: 499 | elec_map.append((layout[i, j], layout[i+1, j])) 500 | return np.array(elec_map) 501 | 502 | def _get_wav_data(self, index): 503 | sampling_rate = None 504 | signal = None 505 | return sampling_rate, signal 506 | 507 | def _query(self, block_set): 508 | ''' 509 | Get the number of examples for the purpose of memory pre-allocation 510 | ''' 511 | 512 | num_examples = None 513 | return num_examples 514 | 515 | def _ecog_token_generator(self, block): 516 | ''' 517 | A generator that yields a dictionary with: 518 | `ecog_sequence`: ECoG data, clipped to token(-sequence) length 519 | `text_sequence`: the corresponding text token(-sequence) 520 | `audio_sequence`: the corresponding audio (MFCC) token sequence 521 | `phoneme_sequence`: ditto for phonemes--with repeats 522 | ''' 523 | 524 | for i in range(0): 525 | yield { 526 | 'ecog_sequence': None, 527 | 'text_sequence': None, 528 | 'audio_sequence': None, 529 | 'phoneme_sequence': None, 530 | } 531 | ############# 532 | 533 | 534 | # deprecated 535 | def filter_to_common_targets(inputs_A, targets_A, inputs_B, targets_B): 536 | '''Filter out the examples that have targets not occurring in the 537 | other set. 
For example, if the word "horse" shows up in targets_A 538 | but not targets_B, remove that example from targets_A and inputs_A.''' 539 | 540 | common_targets = set(targets_A) & set(targets_B) 541 | inputs_A, targets_A = filter_to_common_targets_core( 542 | inputs_A, targets_A, common_targets) 543 | inputs_B, targets_B = filter_to_common_targets_core( 544 | inputs_B, targets_B, common_targets) 545 | print('Sets (A,B) now have (%d,%d) examples and (%d,%d) unique tokens' % ( 546 | len(targets_A), len(targets_B), 547 | len(set(targets_A)), len(set(targets_B)))) 548 | return inputs_A, targets_A, inputs_B, targets_B 549 | 550 | 551 | def filter_to_common_targets_core(inputs, targets, common_targets): 552 | 553 | # get the indices of training pairs whose targets are in the common set 554 | common_targets_indices = [ind for ind, val in enumerate(targets) 555 | for this_common_target in common_targets 556 | if val == this_common_target] 557 | 558 | # the inputs are a numpy array, the targets are a list (or list of lists) 559 | inputs = inputs[common_targets_indices, :, :] 560 | targets = [targets[ind] for ind in common_targets_indices] 561 | 562 | return inputs, targets 563 | 564 | -------------------------------------------------------------------------------- /ecog2txt/subjects.py: -------------------------------------------------------------------------------- 1 | # standard libraries 2 | import pdb 3 | import os 4 | import json 5 | import copy 6 | 7 | # third-party packages 8 | import numpy as np 9 | import tensorflow as tf 10 | from machine_learning.neural_networks.tf_helpers import ( 11 | parse_protobuf_seq2seq_example, fancy_indexing, string_seq_to_index_seq 12 | ) 13 | 14 | 15 | # local 16 | from utils_jgm.toolbox import wer_vector, auto_attribute, str2int_hook 17 | 18 | from ecog2txt import EOS_token, pad_token, OOV_token, DATA_PARTITIONS 19 | 20 | 21 | ''' 22 | :Author: J.G. Makin (except where otherwise noted) 23 | Split from trainers.py: 01/21/20 24 | ''' 25 | 26 | 27 | class ECoGSubject: 28 | @auto_attribute(CHECK_MANIFEST=True) 29 | def __init__( 30 | self, 31 | manifest, 32 | subj_id, 33 | pretrain_all_blocks=False, 34 | input_mask=None, 35 | target_specs=(), 36 | block_ids=(), 37 | ##### 38 | # in the manifest 39 | block_types=None, 40 | data_mapping=None, 41 | decimation_factor=None, # has priority over sampling_rate_decimated 42 | sampling_rate_decimated=None, 43 | json_dir=None, 44 | ##### 45 | # private; do no assign to self 46 | _DG_kwargs=(), 47 | ): 48 | ''' 49 | An ECoGSubject is mostly a collection of attributes, but also includes 50 | an ECoGDataGenerator. Most (but not all) of the attributes are intended 51 | for accesss by a SequenceNetwork; in particular, a list of ECoGSubjects 52 | (or any other object with the parameters listed below) can be passed as 53 | an element of a list `params` to the SequenceNetwork's fit method. Such 54 | a list is created by (and attributed to) a MultiSubjectTrainer. 55 | 56 | Attributes intended for a SequenceNetwork: 57 | subnet_id 58 | block_ids 59 | decimation_factor 60 | tf_record_partial_path 61 | input_mask*** 62 | target_specs*** 63 | 64 | Other attributes include: 65 | data_generators (for generating data!) 
66 | block_types (for constructing block_ids) 67 | pretrain_all_blocks (for constructing block_ids) 68 | ''' 69 | 70 | # get the block_breakdowns 71 | # json_dir = manifest['json_dir'] 72 | with open(os.path.join(self.json_dir, 'block_breakdowns.json')) as f: 73 | block_breakdowns = json.load(f, object_hook=str2int_hook)[subj_id] 74 | self._block_dict = block_breakdowns 75 | 76 | # these attributes will *not* be accessed by a SequenceNet 77 | DataGenerator = manifest['DataGenerator'] 78 | self.data_generator = DataGenerator(manifest, subj_id, **dict(_DG_kwargs)) 79 | 80 | # these attribute *will* be accessed by a SequenceNet 81 | self.target_specs = dict(target_specs) 82 | self.data_manifests = { 83 | data_key: 84 | SequenceDataManifest(**data_manifest_kwargs) 85 | if type(data_manifest_kwargs) is dict else 86 | SequenceDataManifest(data_manifest_kwargs) 87 | for data_key, data_manifest_kwargs in self.data_mapping.items() 88 | } 89 | 90 | # ATTRIBUTES THAT WILL *NOT* BE ACCESSED BY A SequenceNet 91 | @property 92 | def input_mask(self): 93 | return self._input_mask 94 | 95 | @input_mask.setter 96 | def input_mask(self, input_mask): 97 | 98 | # assign the shadow variable 99 | self._input_mask = input_mask 100 | 101 | # make this input mask consistent with this subject 102 | if self._input_mask is not None: 103 | self._input_mask.good_channels = self.data_generator.good_channels 104 | 105 | # ATTRIBUTES THAT *WILL* BE ACCESSED BY A SequenceNet 106 | @property 107 | def subnet_id(self): 108 | return self.subj_id 109 | 110 | @property 111 | def block_ids(self): 112 | 113 | if self._block_ids: 114 | return self._block_ids 115 | else: 116 | block_ids = { 117 | data_partition: { 118 | blk for blk in self._block_dict if 119 | self._block_dict[blk]['default_dataset'] == data_partition and 120 | self._block_dict[blk]['type'] in self.block_types[data_partition] 121 | } for data_partition in DATA_PARTITIONS 122 | } 123 | if self.pretrain_all_blocks: 124 | block_ids['training'] = { 125 | blk for blk_list in block_ids.values() for blk in blk_list 126 | } 127 | 128 | # if we will be specifying targets... 129 | if self.target_specs: 130 | # ... then we assign all blocks to all partitions 131 | blocks = {blk for blks in block_ids.values() for blk in blks} 132 | block_ids = {partition: blocks for partition in DATA_PARTITIONS} 133 | 134 | return block_ids 135 | 136 | @block_ids.setter 137 | def block_ids(self, block_ids): 138 | self._block_ids = block_ids 139 | 140 | @property 141 | def tf_record_partial_path(self): 142 | return self.data_generator.tf_record_partial_path 143 | 144 | @property 145 | def decimation_factor(self): 146 | if self._decimation_factor is None: 147 | factor = int(np.round( 148 | self.data_generator.sampling_rate/self.sampling_rate_decimated 149 | )) 150 | else: 151 | factor = self._decimation_factor 152 | 153 | return factor 154 | 155 | @decimation_factor.setter 156 | def decimation_factor(self, decimation_factor): 157 | self._decimation_factor = decimation_factor 158 | 159 | @property 160 | def data_manifests(self): 161 | # The sequence_types 'ecog_sequence' and 'audio_sequence' are special: 162 | # their sizes are linked to properties of this data generator. The 163 | # others are assumed to be text or anyway sensibly stored in a list. 164 | # If other, non-text sequence_types are added, this preallocation 165 | # should be adjusted accordingly. 
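        # A sketch with hypothetical values: if data_mapping were, say,
        #   {'encoder_inputs': 'ecog_sequence',
        #    'decoder_targets': 'word_sequence'}
        # the loop below would overwrite num_features on the 'ecog_sequence'
        # manifest with the generator's num_ECoG_channels and leave the
        # text-valued 'word_sequence' manifest untouched.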
166 | 167 | for data_manifest in self._data_manifests.values(): 168 | if data_manifest.sequence_type == 'ecog_sequence': 169 | data_manifest.num_features = \ 170 | self.data_generator.num_ECoG_channels 171 | elif data_manifest.sequence_type == 'audio_sequence': 172 | data_manifest.num_features = \ 173 | self.data_generator.num_MFCC_features 174 | else: 175 | pass 176 | 177 | return self._data_manifests 178 | 179 | @data_manifests.setter 180 | def data_manifests(self, data_manifests): 181 | self._data_manifests = data_manifests 182 | 183 | def write_tf_records_maybe( 184 | self, sequence_type=None, data_partitions=DATA_PARTITIONS 185 | ): 186 | # by default, provide the classes associated with the decoder_targets 187 | if sequence_type is None: 188 | sequence_type = self.data_manifests['decoder_targets'].sequence_type 189 | 190 | # Note that these class_list are overwriting each other: only the 191 | # one associated with the final data_partition is returned 192 | for data_partition in data_partitions: 193 | class_list = self.data_generator.write_to_Protobuf_maybe( 194 | sequence_type, self.block_ids[data_partition], 195 | ) 196 | return class_list 197 | 198 | def count_targets(self, unique_targets, threshold=0.4): 199 | 200 | # initialize 201 | target_counters = {} 202 | sequence_counters = {} 203 | unique_sequences = () 204 | 205 | # do *not* transform the saved strings into indices! 206 | target_manifest = copy.copy(self.data_manifests['decoder_targets']) 207 | target_manifest.transform = None 208 | 209 | # for each data_partition... 210 | for data_partition, blocks in self.block_ids.items(): 211 | # build two counters and apply them to *all* examples 212 | target_counter = TargetCounter(unique_targets) 213 | sequence_counter = SequenceCounter(unique_sequences, threshold) 214 | apply_to_all_tf_examples( 215 | [target_counter, sequence_counter], 216 | lambda example_proto: parse_protobuf_seq2seq_example( 217 | example_proto, {'decoder_targets': target_manifest}, 218 | ), 219 | blocks, self.tf_record_partial_path 220 | ) 221 | 222 | # say 223 | print('finished count for %17s' % (data_partition), end='') 224 | print(' with %5i tokens,' % sum(target_counter.types), end='') 225 | print(' with %5i types,' % sum(target_counter.types > 0), end='') 226 | print(' with %5i sequence types,' % sum( 227 | sequence_counter.types > 0), end='') 228 | print(' and %5i skipped tokens' % target_counter.skipped_tokens, 229 | end='') 230 | print(' in %5i examples' % target_counter.examples) 231 | 232 | # store 233 | target_counters[data_partition] = target_counter 234 | sequence_counters[data_partition] = sequence_counter 235 | 236 | # the next partition's sequence_counter should just add to the list 237 | unique_sequences = sequence_counter.unique_sequence_list 238 | 239 | # ... 240 | synchronize_sequence_counters(sequence_counters) 241 | 242 | return target_counters, sequence_counters 243 | 244 | def get_unique_target_lengths(self, threshold=0.4): 245 | 246 | # initialize 247 | sequence_counters = {} 248 | unique_sequence_list = () 249 | for data_partition, blks in self.block_ids.items(): 250 | 251 | # .... 
252 | sequence_counter = SequenceCounter( 253 | unique_sequence_list, threshold, protobuf_name='full_record') 254 | apply_to_all_tf_examples( 255 | [sequence_counter], 256 | lambda example_proto: parse_protobuf_seq2seq_example( 257 | example_proto, self.data_manifests 258 | ), 259 | blks, self.tf_record_partial_path 260 | ) 261 | 262 | # store 263 | sequence_counters[data_partition] = sequence_counter 264 | 265 | # the next partition's sequence_counter should just add to the list 266 | unique_sequence_list = sequence_counter.unique_sequence_list 267 | 268 | # ... 269 | synchronize_sequence_counters(sequence_counters) 270 | 271 | return sequence_counters 272 | 273 | 274 | class SequenceDataManifest: 275 | ''' 276 | A simple class to hold the information for unpacking the sequence data from 277 | tf records--plus a few other useful values. This function can automatically 278 | adjust for masks that extract only a subset of the stored data. 279 | ''' 280 | 281 | @auto_attribute 282 | def __init__( 283 | self, 284 | sequence_type, 285 | num_features=None, 286 | num_features_raw=None, 287 | transform=None, 288 | padding_value=None, 289 | penalty_scale=1.0, 290 | distribution=None, 291 | mask=None, 292 | get_feature_list=None, 293 | APPEND_EOS=False 294 | ): 295 | pass 296 | 297 | @property 298 | def feature_value(self): 299 | if self.sequence_type in ['ecog_sequence', 'audio_sequence']: 300 | return tf.io.VarLenFeature(tf.float32) 301 | else: 302 | return tf.io.VarLenFeature(tf.string) 303 | 304 | @property 305 | def num_features(self): 306 | if self.mask is not None: 307 | return len(self.mask.inds) 308 | elif self.get_feature_list is not None: 309 | feature_list = self.get_feature_list() 310 | # feature_lists are for the special case of categorical data that 311 | # will be converted into one-hot representations 312 | return len(feature_list) 313 | else: 314 | return self._num_features 315 | 316 | @num_features.setter 317 | def num_features(self, num_features): 318 | self._num_features = num_features 319 | 320 | @property 321 | def num_features_raw(self): 322 | if self._num_features_raw is not None: 323 | return self._num_features_raw 324 | elif self.mask is not None: 325 | # the shadow property wasn't reduced by the presence of a mask 326 | return self._num_features 327 | elif self.get_feature_list is not None: 328 | # feature_lists are for the special case of categorical data that 329 | # will be converted into one-hot representations 330 | return 1 331 | else: 332 | return self.num_features 333 | 334 | @num_features_raw.setter 335 | def num_features_raw(self, num_features_raw): 336 | self._num_features_raw = num_features_raw 337 | 338 | @property 339 | def transform(self): 340 | if self._transform is not None: 341 | return self._transform 342 | elif self.mask is not None: 343 | return lambda seq: fancy_indexing(seq, self.input_mask.inds, 1) 344 | elif self.get_feature_list is not None: 345 | feature_list = self.get_feature_list() 346 | # set the ids to some defaults if they're not in the UTL 347 | ######## 348 | OOV_id = ( 349 | feature_list.index(OOV_token) 350 | if OOV_token in feature_list else 2 351 | ) 352 | # Just making up "2" here can give some really weird errors... 
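            # Presumably (an assumption, not guaranteed anywhere here) the
            # fallback index 2 only makes sense if the first few entries of
            # feature_list are reserved for special tokens (pad, EOS, OOV);
            # otherwise out-of-vocabulary tokens get silently mapped onto an
            # ordinary vocabulary item.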
353 | ######## 354 | if self.APPEND_EOS: 355 | return lambda seq: string_seq_to_index_seq( 356 | seq, feature_list, [feature_list.index(EOS_token)], OOV_id, 357 | ) 358 | else: 359 | return lambda seq: string_seq_to_index_seq( 360 | seq, feature_list, [], OOV_id 361 | ) 362 | else: 363 | return lambda seq: seq 364 | 365 | @transform.setter 366 | def transform(self, transform): 367 | self._transform = transform 368 | 369 | @property 370 | def distribution(self): 371 | if self._distribution is not None: 372 | return self._distribution 373 | else: 374 | if self.sequence_type == 'ecog_sequence': 375 | # but you probably don't care about the ECoG distribution... 376 | return 'Rayleigh' 377 | elif self.sequence_type == 'audio_sequence': 378 | return 'Gaussian' 379 | else: 380 | return 'categorical' 381 | 382 | @distribution.setter 383 | def distribution(self, distribution): 384 | self._distribution = distribution 385 | 386 | @property 387 | def padding_value(self): 388 | if self._padding_value is None: 389 | if self.get_feature_list is None: 390 | return 0.0 391 | else: 392 | feature_list = self.get_feature_list() 393 | ######## 394 | return (feature_list.index(pad_token) 395 | if pad_token in feature_list else 0) 396 | # As above with "2," this 0 is highly dubious and will probably 397 | # create difficult-to-find bugs 398 | ######## 399 | else: 400 | return self._padding_value 401 | 402 | @padding_value.setter 403 | def padding_value(self, padding_value): 404 | self._padding_value = padding_value 405 | 406 | 407 | ################ 408 | # This is probably semi-broken and in any case should be brought up to date 409 | # with the rest of the package. 410 | ################ 411 | class SubgridParams: 412 | @auto_attribute 413 | def __init__( 414 | self, 415 | grid_size=[16, 16], 416 | subgrid_size=[8, 16], 417 | start=[0, 0], 418 | SUBSAMPLE=False, 419 | OCCLUDE=False, 420 | subj_id=None, 421 | good_channels=None, 422 | ): 423 | 424 | # set default values 425 | if grid_size is None: 426 | self.grid_size = [16, 16] 427 | if subgrid_size is None: 428 | self.subgrid_size = [8, 16] 429 | if start is None: 430 | self.start = [0, 0] 431 | 432 | self.inds = None 433 | 434 | @property 435 | def _electrodes(self): 436 | ########### 437 | # This should probably use elec_layout directly.... 
438 | ########### 439 | 440 | # arrange electrodes in a rectilinear grid (matrix) 441 | full_grid_electrodes = np.reshape( 442 | np.arange(np.prod(self.grid_size)), self.grid_size) 443 | 444 | # subgrid_size is a list of either two ints or strs specifying anatomy 445 | ###if isinstance(subgrid_size[0], str): 446 | 447 | # either subsample or take a section 448 | if self.SUBSAMPLE: 449 | stop = [i+j for i, j in zip(self.start, self.grid_size)] 450 | step = [M//N for M, N in zip(self.grid_size, self.subgrid_size)] 451 | else: 452 | stop = [i+j for i, j in zip(self.start, self.subgrid_size)] 453 | step = [1, 1] 454 | 455 | # if "tall," the matrix must be transposed before flattening 456 | if self.subgrid_size[0] > self.subgrid_size[1]: 457 | full_grid_electrodes = full_grid_electrodes.T 458 | self.start.reverse() 459 | stop.reverse() 460 | 461 | return np.reshape(full_grid_electrodes[ 462 | self.start[0]:stop[0]:step[0], self.start[1]:stop[1]:step[1]], -1) 463 | 464 | @property 465 | def inds(self): 466 | if self._inds is not None: 467 | return self._inds 468 | 469 | if self.good_channels is not None: 470 | if self.OCCLUDE: 471 | # only *exclude* the subgrid 472 | return [i for i, e in enumerate(self.good_channels) 473 | if e not in self._electrodes] 474 | else: 475 | # only *include* the subgrid 476 | return [i for i, e in enumerate(self.good_channels) 477 | if e in self._electrodes] 478 | else: 479 | return None 480 | 481 | @inds.setter 482 | def inds(self, inds): 483 | self._inds = inds 484 | 485 | 486 | class TargetCounter: 487 | @auto_attribute 488 | def __init__( 489 | self, 490 | unique_targets, 491 | ): 492 | # the dictionary that will be updated 493 | self.types = np.zeros(len(unique_targets), dtype=int) 494 | self.skipped_tokens = 0 495 | self.examples = 0 496 | 497 | def update(self, byte_sequence): 498 | 499 | # just clean it up a bit 500 | sequence = [b.decode('utf-8') for b in byte_sequence] 501 | 502 | # all examples are counted 503 | self.examples += 1 504 | 505 | # for all entries (probably words) in this list 506 | for entry in sequence: 507 | try: 508 | self.types[self.unique_targets.index(entry)] += 1 509 | except ValueError: 510 | self.skipped_tokens += 1 511 | 512 | 513 | class SequenceCounter: 514 | def __init__( 515 | self, 516 | unique_sequence_list=(), 517 | threshold=0.4, 518 | protobuf_name='decoder_targets_only' 519 | ): 520 | 521 | # attribute 522 | self.threshold = threshold 523 | self.unique_sequence_list = list(unique_sequence_list) 524 | self.types = np.array( 525 | [0 for _ in range(len(unique_sequence_list))], dtype=int) 526 | self.examples = 0 527 | self.protobuf_name = protobuf_name 528 | self.lengths = [[] for _ in range(len(unique_sequence_list))] 529 | 530 | def update(self, data_example): 531 | 532 | # extract the sequence as a list (of strings or indices) 533 | ### sequence = data_example['decoder_targets'][:, 0].tolist() 534 | sequence = data_example['decoder_targets'][:, 0]._numpy().tolist() 535 | if type(sequence[0]) is bytes: 536 | sequence = [b.decode('utf-8') for b in sequence] 537 | if type(sequence[0]) is str: 538 | sequence += [EOS_token] 539 | 540 | # all examples are counted 541 | self.examples += 1 542 | 543 | # if at least one sequence has been added to the list... 
544 | if self.unique_sequence_list: 545 | # ...then get their word error rate from the current sequence 546 | WERs = wer_vector( 547 | self.unique_sequence_list, 548 | [sequence]*len(self.unique_sequence_list) 549 | ) 550 | 551 | # if this sequence is close enough to an observed sequence... 552 | if np.min(WERs) < self.threshold: 553 | # ...assign it to that sequence 554 | self.types[np.argmin(WERs)] += 1 555 | if self.protobuf_name != 'decoder_targets_only': 556 | self.lengths[np.argmin(WERs)].append( 557 | data_example['encoder_inputs'].shape[0]) 558 | return 559 | 560 | # ...otherwise append this sequence and count it 561 | self.unique_sequence_list.append(sequence) 562 | self.types = np.append(self.types, [1]) 563 | if self.protobuf_name != 'decoder_targets_only': 564 | self.lengths.append([data_example['encoder_inputs'].shape[0]]) 565 | 566 | @property 567 | def lengths_means(self): 568 | return [np.mean(lengths) for lengths in self.lengths] 569 | 570 | @property 571 | def lengths_std_errs(self): 572 | return [(np.var(lengths)/len(lengths))**(1/2) 573 | for lengths in self.lengths] 574 | 575 | 576 | def synchronize_sequence_counters(sequence_counters): 577 | ''' 578 | Enforce consistency among all sequence counters in a dictionary 579 | ''' 580 | 581 | # find the partition with the longest unique_sequence_list 582 | max_length = -1 583 | for partition in DATA_PARTITIONS: 584 | num_sequences = len(sequence_counters[partition].unique_sequence_list) 585 | if num_sequences > max_length: 586 | max_length = num_sequences 587 | unique_sequences = sequence_counters[partition].unique_sequence_list 588 | 589 | for data_partition in DATA_PARTITIONS: 590 | # overwrite with the final unique_sequence_list 591 | sequence_counters[data_partition].unique_sequence_list = unique_sequences 592 | 593 | # pad out to length of final unique_sequence_list with 0s 594 | type_counts = sequence_counters[data_partition].types 595 | Npad = len(unique_sequences) - type_counts.shape[0] 596 | sequence_counters[data_partition].types = np.pad( 597 | type_counts, [0, Npad], mode='constant') 598 | 599 | # pad out to length of final unique_sequence_list with empty lists 600 | sequence_counters[data_partition].lengths.extend([[]]*Npad) 601 | 602 | 603 | def apply_to_all_tf_examples(examplers, map_fxn, blks, tf_record_partial_path): 604 | 605 | if int(tf.__version__.split('.')[0]) == 2: 606 | dataset = tf.data.TFRecordDataset([ 607 | tf_record_partial_path.format(blk) for blk in blks]) 608 | dataset = dataset.map(map_fxn) 609 | for example in dataset: 610 | for exampler in examplers: 611 | exampler.update(example) 612 | else: 613 | tf.compat.v1.reset_default_graph() 614 | data_graph = tf.Graph() 615 | with data_graph.as_default(): 616 | dataset = tf.data.TFRecordDataset([ 617 | tf_record_partial_path.format(blk) for blk in blks]) 618 | dataset = dataset.map(map_fxn, num_parallel_calls=32) 619 | get_next_example = dataset.make_one_shot_iterator().get_next() 620 | sess = tf.compat.v1.Session() 621 | while True: 622 | try: 623 | next_example = sess.run(get_next_example) 624 | for exampler in examplers: 625 | exampler.update(next_example) 626 | except tf.errors.OutOfRangeError: 627 | break 628 | -------------------------------------------------------------------------------- /ecog2txt/auxiliary/vocab.mocha-timit.1806: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | fangs_ 5 | medieval_ 6 | nothing_ 7 | lightbulbs_ 8 | antagonistic_ 9 | favour_ 10 | sure_ 11 | subdued_ 12 | 
signed_ 13 | rewarded_ 14 | take_ 15 | flower_ 16 | overwhelmed_ 17 | occurs_ 18 | caught_ 19 | then_ 20 | camp_ 21 | window_ 22 | stab_ 23 | noteworthy_ 24 | classical_ 25 | nan_ 26 | bells_ 27 | idly_ 28 | autumn_ 29 | feelings_ 30 | rescue_ 31 | wrap_ 32 | doll_ 33 | capable_ 34 | george_ 35 | chablis_ 36 | movie_ 37 | watch_ 38 | am_ 39 | aptitude_ 40 | worry_ 41 | schooner_ 42 | garbage_ 43 | burned_ 44 | bureaucracy_ 45 | judge_ 46 | view_ 47 | diagram_ 48 | mask_ 49 | damage_ 50 | scarf_ 51 | zips_ 52 | measured_ 53 | necklace_ 54 | participate_ 55 | motorists_ 56 | upbringing_ 57 | objects_ 58 | developing_ 59 | cigarettes_ 60 | dad_ 61 | acts_ 62 | paranoid_ 63 | hires_ 64 | slipped_ 65 | emblem_ 66 | makes_ 67 | crucial_ 68 | showers_ 69 | square_ 70 | red_ 71 | scoop_ 72 | kidnappers_ 73 | tycoons_ 74 | dime_ 75 | dolphins_ 76 | greatly_ 77 | notoriety_ 78 | degrees_ 79 | discussions_ 80 | jaw_ 81 | film_ 82 | disappeared_ 83 | arm_ 84 | exist_ 85 | cheating_ 86 | counted_ 87 | teach_ 88 | modelling_ 89 | rodents_ 90 | itemize_ 91 | buying_ 92 | contagious_ 93 | jane_ 94 | people_ 95 | basketball_ 96 | etiquette_ 97 | prowler_ 98 | symbols_ 99 | generous_ 100 | biologists_ 101 | thursday_ 102 | informative_ 103 | crayons_ 104 | students_ 105 | of_ 106 | overcharged_ 107 | small_ 108 | yesterday_ 109 | sun_ 110 | ideology_ 111 | forms_ 112 | out_ 113 | murals_ 114 | instructions_ 115 | confirm_ 116 | values_ 117 | cooperation_ 118 | aglow_ 119 | street_ 120 | so_ 121 | more_ 122 | todd_ 123 | jim_ 124 | robin_ 125 | throughout_ 126 | received_ 127 | furrier_ 128 | here_ 129 | geological_ 130 | first_ 131 | bidding_ 132 | exciting_ 133 | attitude_ 134 | impossible_ 135 | costumes_ 136 | lost_ 137 | psychological_ 138 | build_ 139 | evening_ 140 | leap_ 141 | walk_ 142 | academic_ 143 | shimmered_ 144 | hours_ 145 | contributed_ 146 | straight_ 147 | cutbacks_ 148 | burglar_ 149 | on_ 150 | cubic_ 151 | authorization_ 152 | failure_ 153 | john_ 154 | lessons_ 155 | ankle_ 156 | rose_ 157 | carl_ 158 | chipper_ 159 | church_ 160 | expertise_ 161 | water_ 162 | lifelong_ 163 | abbreviate_ 164 | requires_ 165 | finish_ 166 | reads_ 167 | fish_ 168 | michael_ 169 | remove_ 170 | bones_ 171 | off_ 172 | fundraisers_ 173 | handbag_ 174 | brother_ 175 | gowns_ 176 | woman_ 177 | precaution_ 178 | allowance_ 179 | exquisite_ 180 | outstanding_ 181 | scampered_ 182 | bracelet_ 183 | natural_ 184 | inferiority_ 185 | freely_ 186 | near_ 187 | pickpocket_ 188 | military_ 189 | galoshes_ 190 | cats_ 191 | jaguars_ 192 | alien_ 193 | marvellously_ 194 | effects_ 195 | screw_ 196 | classrooms_ 197 | snow_ 198 | muscles_ 199 | business_ 200 | one_ 201 | addition_ 202 | coast_ 203 | related_ 204 | status_ 205 | wood_ 206 | vaporization_ 207 | programs_ 208 | drunkard_ 209 | display_ 210 | glue_ 211 | assume_ 212 | rather_ 213 | corsage_ 214 | welfare_ 215 | due_ 216 | grow_ 217 | animals_ 218 | along_ 219 | rachel_ 220 | surplus_ 221 | through_ 222 | skirt_ 223 | eyestrain_ 224 | plate_ 225 | helped_ 226 | control_ 227 | don't_ 228 | strongly_ 229 | we'll_ 230 | microorganisms_ 231 | toy_ 232 | complex_ 233 | thermometer_ 234 | large_ 235 | neglect_ 236 | beds_ 237 | decorate_ 238 | paragraph_ 239 | know_ 240 | child_ 241 | fail_ 242 | slope_ 243 | dispute_ 244 | recuperating_ 245 | technical_ 246 | ready_ 247 | chases_ 248 | gunman_ 249 | sugar_ 250 | auburn_ 251 | social_ 252 | phony_ 253 | would_ 254 | plow_ 255 | vapour_ 256 | hat_ 257 | activities_ 258 | vietnamese_ 259 | subway_ 260 | 
recoiled_ 261 | expense_ 262 | buyer_ 263 | refurbishing_ 264 | begin_ 265 | society_ 266 | black_ 267 | suburbanites_ 268 | flurries_ 269 | does_ 270 | movies_ 271 | spring_ 272 | ice_ 273 | coleslaw_ 274 | activity_ 275 | seldom_ 276 | drugs_ 277 | stronghold_ 278 | discount_ 279 | documents_ 280 | penguins_ 281 | zinnias_ 282 | frost_ 283 | are_ 284 | argued_ 285 | steaming_ 286 | born_ 287 | joyce_ 288 | oily_ 289 | ate_ 290 | by_ 291 | rag_ 292 | vault_ 293 | angry_ 294 | mother_ 295 | dance_ 296 | five_ 297 | spilled_ 298 | save_ 299 | thursdays_ 300 | jokes_ 301 | subtitles_ 302 | chemicals_ 303 | some_ 304 | finding_ 305 | hyena_ 306 | jeep_ 307 | garden_ 308 | atypical_ 309 | shadow_ 310 | emergency_ 311 | ringing_ 312 | worn_ 313 | strength_ 314 | villains_ 315 | precincts_ 316 | reptiles_ 317 | boy_ 318 | dowager_ 319 | year_ 320 | two_ 321 | drugstore_ 322 | purple_ 323 | cuisine_ 324 | all_ 325 | my_ 326 | make_ 327 | cornered_ 328 | pays_ 329 | steep_ 330 | sweaters_ 331 | non-profit_ 332 | see_ 333 | juice_ 334 | musical_ 335 | security_ 336 | we_ 337 | needed_ 338 | became_ 339 | outdoors_ 340 | cartoon_ 341 | hot_ 342 | few_ 343 | safari_ 344 | stimulating_ 345 | recent_ 346 | famous_ 347 | crab_ 348 | cut_ 349 | huge_ 350 | lemon_ 351 | bandaged_ 352 | unlimited_ 353 | verbalize_ 354 | spherical_ 355 | thinner_ 356 | alfalfa_ 357 | pickpockets_ 358 | policy_ 359 | discouraging_ 360 | will_ 361 | worried_ 362 | well-kept_ 363 | barracuda_ 364 | simple_ 365 | thread_ 366 | progress_ 367 | countryside_ 368 | instruments_ 369 | article_ 370 | roll_ 371 | desert_ 372 | sheila_ 373 | cooperates_ 374 | lines_ 375 | pearls_ 376 | as_ 377 | lawyers_ 378 | experiment_ 379 | sugars_ 380 | sweet_ 381 | frequent_ 382 | solve_ 383 | seismic_ 384 | using_ 385 | wall_ 386 | smash_ 387 | please_ 388 | valuables_ 389 | antarctic_ 390 | peck_ 391 | compliance_ 392 | atheists_ 393 | corduroy_ 394 | york_ 395 | charmer_ 396 | pressure_ 397 | most_ 398 | couldn't_ 399 | fleecy_ 400 | upgrade_ 401 | masquerade_ 402 | attacked_ 403 | sleeping_ 404 | appointed_ 405 | unbeatable_ 406 | thoroughbred_ 407 | frequently_ 408 | felt_ 409 | cab_ 410 | exam_ 411 | diagnosis_ 412 | much_ 413 | further_ 414 | learn_ 415 | kayak_ 416 | brush_ 417 | oasis_ 418 | elderly_ 419 | goulash_ 420 | corner_ 421 | overlooked_ 422 | ride_ 423 | cast_ 424 | force_ 425 | essay_ 426 | expensive_ 427 | alimony_ 428 | candy_ 429 | planned_ 430 | treat_ 431 | light_ 432 | co-exist_ 433 | weatherproof_ 434 | disclaimer_ 435 | seeking_ 436 | he_ 437 | gab_ 438 | now_ 439 | soysauce_ 440 | items_ 441 | penalty_ 442 | extra_ 443 | contains_ 444 | lagoon_ 445 | guess_ 446 | be_ 447 | muscular_ 448 | night_ 449 | irving_ 450 | exchanged_ 451 | constantly_ 452 | audience_ 453 | clear_ 454 | major_ 455 | wear_ 456 | run_ 457 | horseradish_ 458 | women_ 459 | mango_ 460 | answered_ 461 | connoisseur_ 462 | papaya_ 463 | avoid_ 464 | annoying_ 465 | spurious_ 466 | trauma_ 467 | always_ 468 | cat_ 469 | each_ 470 | likes_ 471 | pie_ 472 | cashmere_ 473 | hook_ 474 | feet_ 475 | blouses_ 476 | swing_ 477 | cranberry_ 478 | thanksgiving_ 479 | shampooed_ 480 | plan_ 481 | serve_ 482 | ambled_ 483 | names_ 484 | smiths_ 485 | his_ 486 | hyenas_ 487 | cliff_ 488 | grandmother_ 489 | chop_ 490 | nectar_ 491 | eyedrops_ 492 | nice_ 493 | your_ 494 | aquatic_ 495 | sunshine_ 496 | calico_ 497 | put_ 498 | carpet_ 499 | nora_ 500 | county_ 501 | has_ 502 | provoked_ 503 | bank_ 504 | departure_ 505 | tim_ 506 | interpretation_ 507 | 
suggestion_ 508 | jewels_ 509 | shoes_ 510 | must_ 511 | wealth_ 512 | medical_ 513 | redwoods_ 514 | loss_ 515 | thick_ 516 | costume_ 517 | was_ 518 | retracted_ 519 | colored_ 520 | moment_ 521 | that_ 522 | calf_ 523 | based_ 524 | choosing_ 525 | urchins_ 526 | evaluate_ 527 | cured_ 528 | petticoats_ 529 | spotted_ 530 | seattle_ 531 | december_ 532 | stinging_ 533 | statuesque_ 534 | bluejay_ 535 | siamese_ 536 | answer_ 537 | consume_ 538 | quite_ 539 | miami_ 540 | nearest_ 541 | intelligible_ 542 | tranquilizers_ 543 | never_ 544 | goat_ 545 | cameo_ 546 | ears_ 547 | choices_ 548 | available_ 549 | home_ 550 | sky_ 551 | wealthy_ 552 | zoos_ 553 | porch_ 554 | swedish_ 555 | biblical_ 556 | growing_ 557 | and_ 558 | angora_ 559 | lamb_ 560 | bonfire_ 561 | audiovisual_ 562 | plymouth_ 563 | wardrobe_ 564 | forgery_ 565 | ocean_ 566 | perpendicular_ 567 | green_ 568 | brie_ 569 | times_ 570 | cheap_ 571 | hard_ 572 | seesaw_ 573 | exposure_ 574 | barometric_ 575 | forgot_ 576 | wound_ 577 | artists_ 578 | bobcat_ 579 | malnourished_ 580 | public_ 581 | several_ 582 | greg_ 583 | grades_ 584 | standby_ 585 | action_ 586 | friends_ 587 | making_ 588 | might_ 589 | clarification_ 590 | icicles_ 591 | underbrush_ 592 | funding_ 593 | therapy_ 594 | tongue_ 595 | farmers_ 596 | vocabulary_ 597 | need_ 598 | poor_ 599 | appreciated_ 600 | danny_ 601 | power_ 602 | beans_ 603 | lori_ 604 | graph_ 605 | surface_ 606 | events_ 607 | triumphant_ 608 | fruit_ 609 | bride_ 610 | pairs_ 611 | customer_ 612 | hindu_ 613 | predicament_ 614 | contained_ 615 | state_ 616 | postdate_ 617 | canteen_ 618 | formula_ 619 | keep_ 620 | company_ 621 | celebrates_ 622 | hungarian_ 623 | barbed_ 624 | open_ 625 | citizenship_ 626 | enter_ 627 | want_ 628 | edge_ 629 | these_ 630 | muskrat_ 631 | irish_ 632 | approach_ 633 | arriving_ 634 | hood_ 635 | gooseberry_ 636 | phil_ 637 | payments_ 638 | sat_ 639 | uses_ 640 | fjords_ 641 | ambiguous_ 642 | emphasized_ 643 | composure_ 644 | shape_ 645 | parties_ 646 | ointment_ 647 | began_ 648 | gas_ 649 | trespassing_ 650 | repainting_ 651 | present_ 652 | legislature_ 653 | rare_ 654 | affirmative_ 655 | myopia_ 656 | completely_ 657 | waste_ 658 | groundhog_ 659 | romantic_ 660 | easy_ 661 | needs_ 662 | sundaes_ 663 | flag_ 664 | poison_ 665 | unexpected_ 666 | prevented_ 667 | vegetable_ 668 | shaving_ 669 | aluminium_ 670 | prescribe_ 671 | deal_ 672 | pronunciation_ 673 | sudden_ 674 | eating_ 675 | shortage_ 676 | execution_ 677 | overweight_ 678 | herb_ 679 | primitive_ 680 | house_ 681 | best_ 682 | curiosity_ 683 | twice_ 684 | ron_ 685 | hull_ 686 | graduation_ 687 | accounts_ 688 | caused_ 689 | convenient_ 690 | tunafish_ 691 | destroy_ 692 | paper_ 693 | bright_ 694 | flew_ 695 | just_ 696 | drift_ 697 | files_ 698 | sea_ 699 | prison_ 700 | waiting_ 701 | beg_ 702 | him_ 703 | controlled_ 704 | men_ 705 | changes_ 706 | teaspoons_ 707 | products_ 708 | gifts_ 709 | avalanche_ 710 | rarely_ 711 | eleven_ 712 | who_ 713 | from_ 714 | fog_ 715 | appetizers_ 716 | clay_ 717 | lone_ 718 | wore_ 719 | wandered_ 720 | yacht_ 721 | over_ 722 | outer_ 723 | uninterrupted_ 724 | habit_ 725 | promote_ 726 | agricultural_ 727 | man_ 728 | stew_ 729 | lake_ 730 | peeling_ 731 | dirty_ 732 | exotic_ 733 | previous_ 734 | potatoes_ 735 | orders_ 736 | dressing_ 737 | turner_ 738 | lodge_ 739 | were_ 740 | purchased_ 741 | soon_ 742 | temperate_ 743 | serpent_ 744 | features_ 745 | cyclical_ 746 | equipment_ 747 | according_ 748 | intelligent_ 749 | work_ 750 | 
heating_ 751 | generals_ 752 | zones_ 753 | occasionally_ 754 | speech_ 755 | moth_ 756 | proof_ 757 | consuming_ 758 | nine_ 759 | bungalow_ 760 | aviaries_ 761 | honour_ 762 | ducks_ 763 | barb_ 764 | shawn_ 765 | bike_ 766 | household_ 767 | coincided_ 768 | this_ 769 | museum_ 770 | jennifer_ 771 | block_ 772 | under_ 773 | tribes_ 774 | execute_ 775 | gremlins_ 776 | often_ 777 | tears_ 778 | heroism_ 779 | those_ 780 | calcium_ 781 | play_ 782 | dish_ 783 | untimely_ 784 | fascinating_ 785 | blistered_ 786 | club_ 787 | artificial_ 788 | chronological_ 789 | food_ 790 | careful_ 791 | trish_ 792 | drawing_ 793 | idiotically_ 794 | can_ 795 | toothpaste_ 796 | stopwatch_ 797 | fixed_ 798 | store_ 799 | order_ 800 | older_ 801 | roger_ 802 | tweezers_ 803 | project_ 804 | zircons_ 805 | comes_ 806 | orange_ 807 | site_ 808 | interchangeably_ 809 | back_ 810 | handle_ 811 | regarding_ 812 | same_ 813 | nearly_ 814 | tugboats_ 815 | bottom_ 816 | practical_ 817 | corn_ 818 | chose_ 819 | early_ 820 | neoclassic_ 821 | course_ 822 | cheese_ 823 | dislikes_ 824 | locked_ 825 | get_ 826 | well_ 827 | fell_ 828 | lot_ 829 | way_ 830 | theatre_ 831 | set_ 832 | tomorrow_ 833 | others_ 834 | guarantees_ 835 | even_ 836 | hired_ 837 | smiles_ 838 | evidence_ 839 | suffer_ 840 | combine_ 841 | trees_ 842 | shoulder_ 843 | thomas_ 844 | fructose_ 845 | draw_ 846 | elm_ 847 | skill_ 848 | stems_ 849 | postponed_ 850 | interior_ 851 | tofu_ 852 | answers_ 853 | glistening_ 854 | bugle_ 855 | moisture_ 856 | they_ 857 | adjourned_ 858 | welcome_ 859 | ashtray_ 860 | skirts_ 861 | before_ 862 | living_ 863 | oysters_ 864 | three_ 865 | butterscotch_ 866 | within_ 867 | misplaced_ 868 | instead_ 869 | frightened_ 870 | such_ 871 | jam_ 872 | lively_ 873 | mum_ 874 | there_ 875 | distress_ 876 | publicity_ 877 | bog_ 878 | remember_ 879 | excluded_ 880 | popular_ 881 | money_ 882 | sandwich_ 883 | blue_ 884 | rhythm_ 885 | ability_ 886 | updating_ 887 | geese_ 888 | week_ 889 | zoologist_ 890 | appliances_ 891 | highway_ 892 | glistened_ 893 | which_ 894 | pure_ 895 | yards_ 896 | made_ 897 | anyone_ 898 | silly_ 899 | fortune_ 900 | blues_ 901 | turquoise_ 902 | jungle-like_ 903 | matched_ 904 | aches_ 905 | repertoire_ 906 | path_ 907 | appointment_ 908 | tell_ 909 | safe_ 910 | roof_ 911 | marine_ 912 | below_ 913 | colleges_ 914 | zebras_ 915 | breakfast_ 916 | income_ 917 | ironing_ 918 | garage_ 919 | vegetables_ 920 | thing_ 921 | use_ 922 | removal_ 923 | placed_ 924 | hit_ 925 | eight_ 926 | survive_ 927 | hear_ 928 | miles_ 929 | is_ 930 | chloride_ 931 | spanish_ 932 | takes_ 933 | birth_ 934 | boston_ 935 | excitement_ 936 | many_ 937 | or_ 938 | chlorine_ 939 | illegally_ 940 | cartoons_ 941 | number_ 942 | explicitly_ 943 | right_ 944 | rug_ 945 | drenched_ 946 | looking_ 947 | field_ 948 | both_ 949 | place_ 950 | reading_ 951 | when_ 952 | clams_ 953 | sketched_ 954 | disease_ 955 | gained_ 956 | mirage_ 957 | no_ 958 | why_ 959 | colourful_ 960 | afternoon_ 961 | skewers_ 962 | clearly_ 963 | obtain_ 964 | problem_ 965 | survey_ 966 | agency_ 967 | watches_ 968 | outgrew_ 969 | cheque_ 970 | thin_ 971 | smelled_ 972 | diane_ 973 | luxurious_ 974 | stole_ 975 | yet_ 976 | invest_ 977 | minor_ 978 | sport_ 979 | preparing_ 980 | next_ 981 | help_ 982 | had_ 983 | group_ 984 | toddler_ 985 | judged_ 986 | milk_ 987 | left_ 988 | toxic_ 989 | amoebas_ 990 | meeting_ 991 | alligators_ 992 | across_ 993 | abruptly_ 994 | oriental_ 995 | moon_ 996 | gift_ 997 | butcher_ 998 | cupcakes_ 999 | 
table_ 1000 | how_ 1001 | i_ 1002 | their_ 1003 | surely_ 1004 | saw_ 1005 | giant_ 1006 | abdomen_ 1007 | hats_ 1008 | spray_ 1009 | prospective_ 1010 | the_ 1011 | expression_ 1012 | collects_ 1013 | mayan_ 1014 | where_ 1015 | existing_ 1016 | lack_ 1017 | picked_ 1018 | us_ 1019 | correct_ 1020 | latest_ 1021 | cook_ 1022 | stray_ 1023 | spielberg_ 1024 | hundred_ 1025 | disguise_ 1026 | axis_ 1027 | obey_ 1028 | surveying_ 1029 | spend_ 1030 | radioactive_ 1031 | creole_ 1032 | regular_ 1033 | westchester_ 1034 | ballet_ 1035 | finds_ 1036 | fawn_ 1037 | rock-and-roll_ 1038 | ideal_ 1039 | severe_ 1040 | cory_ 1041 | ambidextrous_ 1042 | processed_ 1043 | pop_ 1044 | gus_ 1045 | assistance_ 1046 | balls_ 1047 | earthquake_ 1048 | viewpoint_ 1049 | sprained_ 1050 | triggered_ 1051 | data_ 1052 | broken_ 1053 | endurance_ 1054 | consists_ 1055 | tax_ 1056 | gregory_ 1057 | nancy_ 1058 | crooked_ 1059 | operates_ 1060 | escalator_ 1061 | once_ 1062 | causeway_ 1063 | holidays_ 1064 | opens_ 1065 | development_ 1066 | soothed_ 1067 | finger_ 1068 | ignored_ 1069 | become_ 1070 | undeniably_ 1071 | beautiful_ 1072 | opaque_ 1073 | fill_ 1074 | reminded_ 1075 | singer_ 1076 | rhubarb_ 1077 | economic_ 1078 | giraffes_ 1079 | new_ 1080 | brightly_ 1081 | lengthy_ 1082 | temper_ 1083 | employee_ 1084 | stylish_ 1085 | real_ 1086 | eat_ 1087 | emperor_ 1088 | garlic_ 1089 | cream_ 1090 | yellow_ 1091 | outcome_ 1092 | lots_ 1093 | unevenly_ 1094 | pretty_ 1095 | violence_ 1096 | alone_ 1097 | cloverleaf_ 1098 | high_ 1099 | fires_ 1100 | at_ 1101 | long_ 1102 | massage_ 1103 | suitable_ 1104 | loved_ 1105 | mandates_ 1106 | al_ 1107 | give_ 1108 | handed_ 1109 | clamshell_ 1110 | go-cart_ 1111 | parenthood_ 1112 | drop_ 1113 | foam_ 1114 | if_ 1115 | stung_ 1116 | buy_ 1117 | algebraic_ 1118 | dispensing_ 1119 | review_ 1120 | idiotic_ 1121 | cooking_ 1122 | bob_ 1123 | aggressive_ 1124 | yogurt_ 1125 | illegal_ 1126 | obtaining_ 1127 | dishes_ 1128 | gold_ 1129 | bedroom_ 1130 | move_ 1131 | coach_ 1132 | adult_ 1133 | rise_ 1134 | prestige_ 1135 | acclaim_ 1136 | outage_ 1137 | monday_ 1138 | i'll_ 1139 | broke_ 1140 | mammals_ 1141 | going_ 1142 | chain_ 1143 | today_ 1144 | item_ 1145 | bleachers_ 1146 | hauling_ 1147 | required_ 1148 | puree_ 1149 | doctor_ 1150 | grown_ 1151 | humid_ 1152 | overflowed_ 1153 | gave_ 1154 | you_ 1155 | misquote_ 1156 | our_ 1157 | whoever_ 1158 | gives_ 1159 | outcast_ 1160 | laugh_ 1161 | steph_ 1162 | results_ 1163 | pleasantly_ 1164 | wild_ 1165 | scalp_ 1166 | didn't_ 1167 | another_ 1168 | misprint_ 1169 | coyote_ 1170 | norwegian_ 1171 | shredded_ 1172 | top_ 1173 | regulations_ 1174 | isotopes_ 1175 | for_ 1176 | kindergarten_ 1177 | bibliographies_ 1178 | dark_ 1179 | priorities_ 1180 | success_ 1181 | nightly_ 1182 | accomplished_ 1183 | ruins_ 1184 | challenge_ 1185 | dessert_ 1186 | determination_ 1187 | healthy_ 1188 | day_ 1189 | worked_ 1190 | hispanic_ 1191 | withdraw_ 1192 | desires_ 1193 | question_ 1194 | mediocrity_ 1195 | vodka_ 1196 | not_ 1197 | zig-zagged_ 1198 | oozed_ 1199 | begins_ 1200 | speaker_ 1201 | co-authors_ 1202 | particularly_ 1203 | task_ 1204 | shore_ 1205 | rich_ 1206 | layoffs_ 1207 | farmland_ 1208 | oak_ 1209 | exclusive_ 1210 | learned_ 1211 | study_ 1212 | lunch_ 1213 | may_ 1214 | bongos_ 1215 | oyster_ 1216 | petrol_ 1217 | attach_ 1218 | patient_ 1219 | tests_ 1220 | flimsy_ 1221 | interview_ 1222 | rob_ 1223 | about_ 1224 | needle_ 1225 | preschooler_ 1226 | grievances_ 1227 | special_ 1228 | cows_ 1229 | 
with_ 1230 | frustration_ 1231 | abolish_ 1232 | mine_ 1233 | holiday_ 1234 | cottage_ 1235 | baboon_ 1236 | earn_ 1237 | scholars_ 1238 | months_ 1239 | voyage_ 1240 | good_ 1241 | team_ 1242 | fresh_ 1243 | elegant_ 1244 | future_ 1245 | screen_ 1246 | overalls_ 1247 | goose_ 1248 | else_ 1249 | coat_ 1250 | haunted_ 1251 | carol_ 1252 | lives_ 1253 | experience_ 1254 | price_ 1255 | scholastic_ 1256 | funny_ 1257 | noise_ 1258 | big_ 1259 | harmonize_ 1260 | harms_ 1261 | straw_ 1262 | chew_ 1263 | steps_ 1264 | diploma_ 1265 | companions_ 1266 | scholar_ 1267 | vagrants_ 1268 | twelfth_ 1269 | them_ 1270 | word_ 1271 | sauce_ 1272 | nearer_ 1273 | enjoy_ 1274 | clumsy_ 1275 | distance_ 1276 | tom_ 1277 | celebrate_ 1278 | tapestry_ 1279 | national_ 1280 | pizzerias_ 1281 | only_ 1282 | hand_ 1283 | you'll_ 1284 | variety_ 1285 | an_ 1286 | cleaners_ 1287 | gwen_ 1288 | ices_ 1289 | tea_ 1290 | heat_ 1291 | policeman_ 1292 | shorten_ 1293 | innocence_ 1294 | diseases_ 1295 | thoroughly_ 1296 | quality_ 1297 | snapper_ 1298 | love_ 1299 | beach_ 1300 | tiny_ 1301 | quick_ 1302 | book_ 1303 | lie_ 1304 | joint_ 1305 | distributed_ 1306 | oats_ 1307 | verse_ 1308 | her_ 1309 | remained_ 1310 | seamstresses_ 1311 | overly_ 1312 | let_ 1313 | create_ 1314 | beverage_ 1315 | fifth_ 1316 | reflects_ 1317 | parental_ 1318 | forest_ 1319 | useful_ 1320 | ended_ 1321 | what_ 1322 | immediate_ 1323 | cutlery_ 1324 | housewives_ 1325 | any_ 1326 | wounds_ 1327 | victor_ 1328 | charge_ 1329 | cleans_ 1330 | shrapnel_ 1331 | every_ 1332 | roll-ups_ 1333 | operation_ 1334 | employment_ 1335 | vanquished_ 1336 | squeezed_ 1337 | apples_ 1338 | woolen_ 1339 | although_ 1340 | slip_ 1341 | votes_ 1342 | i'd_ 1343 | january_ 1344 | ahead_ 1345 | tadpole_ 1346 | icy_ 1347 | mercilessly_ 1348 | departments_ 1349 | horse_ 1350 | bagpipes_ 1351 | tomboy_ 1352 | sequoia_ 1353 | tenant_ 1354 | worship_ 1355 | could_ 1356 | old_ 1357 | feathers_ 1358 | describe_ 1359 | pleasure_ 1360 | pathological_ 1361 | gunpowder_ 1362 | butterfly_ 1363 | engineering_ 1364 | yearly_ 1365 | they're_ 1366 | shipbuilding_ 1367 | board_ 1368 | clamp_ 1369 | healthier_ 1370 | guard_ 1371 | priority_ 1372 | rationalize_ 1373 | advertising_ 1374 | expected_ 1375 | vanilla_ 1376 | star_ 1377 | leaves_ 1378 | than_ 1379 | iris_ 1380 | walking_ 1381 | sculpture_ 1382 | challenged_ 1383 | charged_ 1384 | greasing_ 1385 | farmyard_ 1386 | encyclopedias_ 1387 | identical_ 1388 | books_ 1389 | room_ 1390 | wire_ 1391 | thinker_ 1392 | in_ 1393 | scared_ 1394 | male_ 1395 | fashion_ 1396 | roy_ 1397 | morning_ 1398 | meow_ 1399 | sherbet_ 1400 | into_ 1401 | splurge_ 1402 | thinks_ 1403 | audition_ 1404 | willowy_ 1405 | brochure_ 1406 | process_ 1407 | thought_ 1408 | sermon_ 1409 | splurged_ 1410 | remote_ 1411 | pewter_ 1412 | superb_ 1413 | youngsters_ 1414 | desperately_ 1415 | maids_ 1416 | larger_ 1417 | crisscrossed_ 1418 | tradition_ 1419 | overthrow_ 1420 | mergers_ 1421 | come_ 1422 | bayou_ 1423 | accusations_ 1424 | recognition_ 1425 | computer_ 1426 | perfume_ 1427 | courier_ 1428 | journalist_ 1429 | third_ 1430 | without_ 1431 | lily_ 1432 | grows_ 1433 | critical_ 1434 | gallon_ 1435 | irate_ 1436 | problems_ 1437 | errors_ 1438 | look_ 1439 | pill_ 1440 | getting_ 1441 | shellfish_ 1442 | porcupines_ 1443 | carefully_ 1444 | oil_ 1445 | shock_ 1446 | warm_ 1447 | autographs_ 1448 | catch_ 1449 | reflect_ 1450 | box_ 1451 | date_ 1452 | gloves_ 1453 | but_ 1454 | ship_ 1455 | prepared_ 1456 | wasp_ 1457 | competition_ 1458 
| kept_ 1459 | difficult_ 1460 | part_ 1461 | four_ 1462 | have_ 1463 | recall_ 1464 | curry_ 1465 | however_ 1466 | forbidden_ 1467 | thieves_ 1468 | dig_ 1469 | cash_ 1470 | intelligence_ 1471 | socks_ 1472 | plant_ 1473 | peaches_ 1474 | bowl_ 1475 | value_ 1476 | sound_ 1477 | mosquitoes_ 1478 | theme_ 1479 | supervision_ 1480 | millionaires_ 1481 | coconut_ 1482 | driving_ 1483 | increases_ 1484 | altruistic_ 1485 | catastrophic_ 1486 | like_ 1487 | official_ 1488 | authorized_ 1489 | education_ 1490 | around_ 1491 | depicts_ 1492 | played_ 1493 | haughty_ 1494 | upon_ 1495 | a_ 1496 | do_ 1497 | hair_ 1498 | salad_ 1499 | pam_ 1500 | live_ 1501 | lithographs_ 1502 | miraculously_ 1503 | encouraged_ 1504 | fails_ 1505 | think_ 1506 | jeff_ 1507 | aprons_ 1508 | away_ 1509 | tina_ 1510 | stayed_ 1511 | tips_ 1512 | deadline_ 1513 | dinner_ 1514 | jar_ 1515 | tropical_ 1516 | co-educational_ 1517 | iguanas_ 1518 | longer_ 1519 | me_ 1520 | line_ 1521 | dry_ 1522 | original_ 1523 | argue_ 1524 | sold_ 1525 | goes_ 1526 | novel_ 1527 | young_ 1528 | symposium_ 1529 | should_ 1530 | athletic_ 1531 | go_ 1532 | parfait_ 1533 | overcame_ 1534 | lower_ 1535 | haven't_ 1536 | conditions_ 1537 | spinach_ 1538 | bought_ 1539 | pick_ 1540 | too_ 1541 | alleviate_ 1542 | diminish_ 1543 | shocked_ 1544 | talk_ 1545 | poisonous_ 1546 | cloth_ 1547 | charlie_ 1548 | sing_ 1549 | shimmers_ 1550 | wool_ 1551 | points_ 1552 | meet_ 1553 | papered_ 1554 | got_ 1555 | chosen_ 1556 | gets_ 1557 | entertaining_ 1558 | sometimes_ 1559 | beggar_ 1560 | traffic_ 1561 | colorful_ 1562 | cement_ 1563 | repainted_ 1564 | journal_ 1565 | daphne_ 1566 | raccoons_ 1567 | situated_ 1568 | relish_ 1569 | worm_ 1570 | lay_ 1571 | dew_ 1572 | positive_ 1573 | birthday_ 1574 | met_ 1575 | gardens_ 1576 | shell_ 1577 | pledge_ 1578 | marriage_ 1579 | service_ 1580 | strong_ 1581 | imagination_ 1582 | accomplish_ 1583 | shone_ 1584 | tag_ 1585 | break_ 1586 | warrior_ 1587 | steven_ 1588 | saturday_ 1589 | teeth_ 1590 | flying_ 1591 | job_ 1592 | chamber_ 1593 | cost_ 1594 | contributions_ 1595 | foreign_ 1596 | rabbits_ 1597 | murky_ 1598 | eyes_ 1599 | taxicab_ 1600 | intuition_ 1601 | great_ 1602 | it_ 1603 | time_ 1604 | account_ 1605 | sense_ 1606 | juicy_ 1607 | boring_ 1608 | used_ 1609 | agree_ 1610 | ingredients_ 1611 | leather_ 1612 | call_ 1613 | doctors_ 1614 | clothing_ 1615 | sitting_ 1616 | raisins_ 1617 | surrounded_ 1618 | eastern_ 1619 | hallway_ 1620 | own_ 1621 | stockings_ 1622 | change_ 1623 | attendance_ 1624 | maintenance_ 1625 | looked_ 1626 | organizations_ 1627 | dwarf_ 1628 | halloween_ 1629 | synagogue_ 1630 | leeway_ 1631 | naive_ 1632 | offensive_ 1633 | hurts_ 1634 | slopes_ 1635 | divorced_ 1636 | alice_ 1637 | barely_ 1638 | shows_ 1639 | presented_ 1640 | exhibited_ 1641 | government_ 1642 | yell_ 1643 | hung_ 1644 | centrifuge_ 1645 | mean_ 1646 | steve_ 1647 | solves_ 1648 | chocolate_ 1649 | straightforward_ 1650 | cow_ 1651 | dutch_ 1652 | soybeans_ 1653 | higher_ 1654 | nevada_ 1655 | otto_ 1656 | door_ 1657 | victim_ 1658 | ever_ 1659 | try_ 1660 | whenever_ 1661 | apply_ 1662 | lawyer_ 1663 | ralph_ 1664 | audits_ 1665 | mouse_ 1666 | general_ 1667 | gently_ 1668 | permanent_ 1669 | antelope_ 1670 | history_ 1671 | records_ 1672 | took_ 1673 | slowly_ 1674 | working_ 1675 | wash_ 1676 | musicians_ 1677 | file_ 1678 | up_ 1679 | purists_ 1680 | sweater_ 1681 | zoo_ 1682 | frantically_ 1683 | ski_ 1684 | pond_ 1685 | apology_ 1686 | players_ 1687 | stomped_ 1688 | daytime_ 1689 | 
while_ 1690 | enough_ 1691 | hierarchies_ 1692 | approval_ 1693 | children_ 1694 | freeway_ 1695 | understanding_ 1696 | gorgeous_ 1697 | thirty_ 1698 | actor_ 1699 | drink_ 1700 | eggs_ 1701 | reorganization_ 1702 | after_ 1703 | possible_ 1704 | to_ 1705 | backed_ 1706 | scientific_ 1707 | music_ 1708 | did_ 1709 | canned_ 1710 | relaxed_ 1711 | twilight_ 1712 | surgeon_ 1713 | tube_ 1714 | tastes_ 1715 | personnel_ 1716 | add_ 1717 | until_ 1718 | needlepoint_ 1719 | thimble_ 1720 | compounded_ 1721 | skills_ 1722 | twins_ 1723 | proceeding_ 1724 | chip_ 1725 | delete_ 1726 | gigantic_ 1727 | writers_ 1728 | purchase_ 1729 | building_ 1730 | cannot_ 1731 | onto_ 1732 | archeological_ 1733 | arrange_ 1734 | gunpoint_ 1735 | equal_ 1736 | forces_ 1737 | web_ 1738 | upbeat_ 1739 | pair_ 1740 | full_ 1741 | local_ 1742 | last_ 1743 | webbed_ 1744 | techniques_ 1745 | compile_ 1746 | dog_ 1747 | splinter_ 1748 | ancient_ 1749 | common_ 1750 | suburban_ 1751 | sleigh_ 1752 | resemble_ 1753 | examples_ 1754 | climates_ 1755 | news_ 1756 | fudge_ 1757 | tornados_ 1758 | under-age_ 1759 | down_ 1760 | she_ 1761 | hostages_ 1762 | coins_ 1763 | acropolis_ 1764 | chives_ 1765 | clasp_ 1766 | continental_ 1767 | spider_ 1768 | stag_ 1769 | theory_ 1770 | roses_ 1771 | valley_ 1772 | attention_ 1773 | coffee_ 1774 | outlaws_ 1775 | tooth_ 1776 | glucose_ 1777 | sought_ 1778 | kippers_ 1779 | biology_ 1780 | standardized_ 1781 | football_ 1782 | ably_ 1783 | fairy_ 1784 | count_ 1785 | ambulance_ 1786 | rain_ 1787 | popularity_ 1788 | very_ 1789 | almost_ 1790 | delicious_ 1791 | proper_ 1792 | screwdriver_ 1793 | anecdotal_ 1794 | notices_ 1795 | subject_ 1796 | auditory_ 1797 | thoughtless_ 1798 | system_ 1799 | breakdown_ 1800 | loads_ 1801 | pine_ 1802 | found_ 1803 | illuminating_ 1804 | trouble_ 1805 | subtraction_ 1806 | allow_ 1807 | -------------------------------------------------------------------------------- /ecog2txt/trainers.py: -------------------------------------------------------------------------------- 1 | # standard libraries 2 | import pdb 3 | import os 4 | import re 5 | from functools import reduce, partial 6 | from collections import defaultdict 7 | 8 | # third-party packages 9 | import yaml 10 | import numpy as np 11 | import pickle 12 | import matplotlib.pyplot as plt 13 | import pandas as pd 14 | import tensorflow as tf 15 | from tensorflow.python import pywrap_tensorflow 16 | from tensorflow.python.tools.inspect_checkpoint import print_tensors_in_checkpoint_file 17 | 18 | # local 19 | from utils_jgm.toolbox import heatmap_confusions, MutableNamedTuple 20 | from machine_learning.neural_networks import tf_helpers as tfh 21 | from ecog2txt.subjects import ECoGSubject 22 | from ecog2txt import plotters, text_dir, TOKEN_TYPES, DATA_PARTITIONS 23 | from ecog2txt import EOS_token, pad_token, OOV_token 24 | if int(tf.__version__.split('.')[0]) == 2: 25 | # from machine_learning.neural_networks.tf_helpers_too import NeuralNetwork 26 | # from machine_learning.neural_networks.sequence_networks_too import Seq2Seq 27 | # pass 28 | from machine_learning.neural_networks.torch_sequence_networks import ( 29 | Sequence2Sequence, SequenceTrainer 30 | ) 31 | else: 32 | from machine_learning.neural_networks import basic_components as nn 33 | from machine_learning.neural_networks.sequence_networks import SequenceNetwork 34 | 35 | 36 | ''' 37 | :Author: J.G. 
Makin (except where otherwise noted) 38 | ''' 39 | 40 | 41 | class MultiSubjectTrainer: 42 | def __init__( 43 | self, 44 | experiment_manifest_name, 45 | subject_ids, 46 | checkpoint_dir='.', 47 | restore_epoch=None, 48 | SN_kwargs=(), 49 | DG_kwargs=(), 50 | RP_kwargs=(), 51 | ES_kwargs=(), 52 | VERBOSE=True, 53 | **kwargs 54 | ): 55 | 56 | # ... 57 | SN_kwargs = dict(SN_kwargs) 58 | 59 | # load the experiment_manifest 60 | with open(os.path.join(text_dir, experiment_manifest_name)) as file: 61 | self.experiment_manifest = yaml.full_load(file) 62 | 63 | # checks 64 | token_type = self.experiment_manifest[subject_ids[-1]]['token_type'] 65 | assert token_type in TOKEN_TYPES, 'Unrecognized token_type!! -- jgm' 66 | 67 | # attribute 68 | self._token_type = token_type # NB: changes will not propagate 69 | self._RP_kwargs = dict(RP_kwargs) 70 | 71 | # create ECoG subjects 72 | self.ecog_subjects = [ 73 | ECoGSubject( 74 | self.experiment_manifest[subject_id], 75 | subject_id, 76 | pretrain_all_blocks=(subject_id != subject_ids[-1]), 77 | **dict(ES_kwargs), 78 | _DG_kwargs=dict(DG_kwargs) 79 | ##### 80 | # target_specs=target_specs 81 | ##### 82 | ) for subject_id in subject_ids] 83 | 84 | # invoke some setters 85 | # NB: these attributes adjust self.ecog_subjects, so they must be 86 | # invoked *after* those are created (hence no auto_attribute). But 87 | # the changes to the ecog_subjects below in turn depend on the 88 | # self.checkpoint_dir, so they have to be set after these lines. 89 | self.VERBOSE = VERBOSE 90 | self.checkpoint_dir = checkpoint_dir 91 | self.restore_epoch = restore_epoch 92 | 93 | # update the data_manifests for our case 94 | for subject in self.ecog_subjects: 95 | for data_key, data_manifest in subject.data_manifests.items(): 96 | if data_key == 'decoder_targets' and 'sequence' in token_type: 97 | data_manifest.APPEND_EOS = True 98 | try: 99 | data_manifest.penalty_scale = self.experiment_manifest[ 100 | subject.subnet_id][data_key + '_penalty_scale'] 101 | except KeyError: 102 | pass 103 | self.set_feature_lists(**kwargs) 104 | 105 | # create the SequenceNetwork according to the experiment_manifest 106 | if int(tf.__version__.split('.')[0]) == 2: 107 | # remove SN_kwargs that aren't expected by Sequence2Sequence 108 | self.ST_kwargs = { 109 | key: SN_kwargs.pop(key) for key in { 110 | 'temperature', 'EMA_decay', 'beam_width', 111 | 'assessment_epoch_interval', 'tf_summaries_dir', 112 | 'N_cases', 113 | } if key in SN_kwargs 114 | } 115 | self.N_epochs = SN_kwargs.pop('N_epochs', None) 116 | self.net = Sequence2Sequence( 117 | self.experiment_manifest[subject_ids[-1]], 118 | self.ecog_subjects, 119 | EOS_token=EOS_token, 120 | pad_token=pad_token, 121 | TARGETS_ARE_SEQUENCES='sequence' in token_type, 122 | VERBOSE=VERBOSE, 123 | **dict(SN_kwargs) 124 | ) 125 | else: 126 | self.net = SequenceNetwork( 127 | self.experiment_manifest[subject_ids[-1]], 128 | EOS_token=EOS_token, 129 | pad_token=pad_token, 130 | OOV_token=OOV_token, 131 | training_GPUs=[0], 132 | TARGETS_ARE_SEQUENCES='sequence' in token_type, 133 | VERBOSE=VERBOSE, 134 | **dict(SN_kwargs) 135 | ) 136 | 137 | # re-run to set the net's checkpoint_path 138 | self.checkpoint_dir = checkpoint_dir 139 | 140 | # initialize 141 | self._results_plotter = None 142 | 143 | def vprint(self, *args, **kwargs): 144 | if self.VERBOSE: 145 | print(*args, **kwargs) 146 | 147 | def set_feature_lists(self, **kwargs): 148 | for subject in self.ecog_subjects: 149 | 150 | # adjust data_manifests for the specifics of this experiment 
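            # In outline, the loop below resolves each categorical manifest's
            # feature list in priority order: (1) an explicit
            # '<sequence_type>_vocab_list' keyword argument, (2) a vocab file
            # on disk for that sequence_type, (3) a pickled vocab file in
            # checkpoint_dir, and otherwise (4) the
            # training-intersection/validation-union computed from the
            # tf records.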
151 | for data_key, data_manifest in subject.data_manifests.items(): 152 | sequence_type = data_manifest.sequence_type 153 | 154 | # for categorical data, set get_feature_list 155 | if data_manifest.distribution == 'categorical': 156 | 157 | # useful string constants derived from the sequence_type 158 | vocab_list_name = '_'.join([sequence_type, 'vocab_list']) 159 | vocab_file_path = subject.data_generator.sequence_type_to_vocab_file_path( 160 | sequence_type) 161 | vocab_pkl_path = os.path.join( 162 | self.checkpoint_dir, '_'.join([sequence_type, 'vocab_file.pkl']) 163 | ) 164 | 165 | self.vprint( 166 | 'Setting feature_list for %s to ' % data_key, end='' 167 | ) 168 | 169 | # explicit vocab_list has priority 1 170 | if vocab_list_name in kwargs: 171 | self.vprint("argument passed w/key %s" % vocab_list_name) 172 | class_list = kwargs[vocab_list_name] 173 | 174 | # saved vocab_file has priority 2 175 | elif vocab_file_path is not None: 176 | self.vprint("vocab list stored in %s" % vocab_file_path) 177 | class_list = subject.data_generator.get_class_list( 178 | sequence_type 179 | ) 180 | 181 | # a pickled vocab file has priority 3 182 | elif os.path.isfile(vocab_pkl_path): 183 | self.vprint("vocab list stored in %s" % vocab_pkl_path) 184 | with open(vocab_pkl_path, 'rb') as fp: 185 | bytes_list = pickle.load(fp) 186 | class_list = [t.decode('utf-8') for t in bytes_list] 187 | 188 | # none of the above, yet the data are still categorical 189 | else: 190 | self.vprint("training-intersection/validation-union") 191 | special_tokens = ( 192 | [pad_token, EOS_token, OOV_token] 193 | if 'sequence' in self._token_type 194 | and 'encoder_' not in data_key 195 | else [pad_token, OOV_token] 196 | ) 197 | class_list = self._training_intersection_validation_union( 198 | sequence_type, special_tokens 199 | ) 200 | 201 | # and now set it (extremely verbosely because of python's 202 | # idiosyncratic late binding) 203 | # data_manifest.get_feature_list = ( 204 | # lambda class_list=class_list: class_list 205 | # ) 206 | # work-around because lambdas can't be pickled 207 | data_manifest.get_feature_list = partial(_identity, class_list) 208 | 209 | else: 210 | # don't do anything for non-categorical data 211 | pass 212 | 213 | @property 214 | def checkpoint_dir(self): 215 | 216 | # update the SequenceNetwork's checkpoint_path as well--if the net 217 | # has been created at this point: 218 | try: 219 | self.net.checkpoint_path = os.path.join( 220 | self._checkpoint_dir, 'model.ckpt' 221 | ) 222 | except AttributeError: 223 | pass 224 | return self._checkpoint_dir 225 | 226 | @checkpoint_dir.setter 227 | def checkpoint_dir(self, checkpoint_dir): 228 | 229 | # set the shadow variable 230 | self._checkpoint_dir = checkpoint_dir 231 | 232 | # make sure the self.net.checkpoint_path gets updated as well 233 | self.checkpoint_dir 234 | 235 | @property 236 | def restore_epoch(self): 237 | if self._restore_epoch is not None: 238 | return self._restore_epoch 239 | else: 240 | model_name = 'model.ckpt' 241 | restore_epochs = [ 242 | int(name.split('-')[1].split('.')[0]) 243 | for name in os.listdir(self.checkpoint_dir) 244 | if name.split('-')[0] == model_name and 245 | name.split('.')[-1] == 'index' 246 | ] 247 | restore_epochs.sort() 248 | if restore_epochs: 249 | return restore_epochs[-1] 250 | else: 251 | # no models have been trained yet! 
252 | return None 253 | 254 | @restore_epoch.setter 255 | def restore_epoch(self, restore_epoch): 256 | self._restore_epoch = restore_epoch 257 | 258 | @property 259 | def results_plotter(self): 260 | if self._results_plotter is None: 261 | subject = self.ecog_subjects[-1] 262 | self.results_plotter = plotters.ResultsPlotter( 263 | self.experiment_manifest[subject.subnet_id], subject, 264 | VERBOSE=self.VERBOSE, **self._RP_kwargs 265 | ) 266 | 267 | return self._results_plotter 268 | 269 | @results_plotter.setter 270 | def results_plotter(self, results_plotter): 271 | # set up methods 272 | results_plotter.get_saliencies = self.get_saliencies 273 | results_plotter.get_encoder_embedding = self.get_encoder_embedding 274 | results_plotter.get_internal_activations = self.get_internal_activations 275 | 276 | self._results_plotter = results_plotter 277 | 278 | def torch_learn(self): 279 | import torch 280 | # somewhat hacky way to shoehorn PyTorch version in here... 281 | 282 | device = 'cuda' if torch.cuda.is_available() else 'cpu' 283 | 284 | ######## 285 | # manifest from final subject only?? 286 | torch_trainer = SequenceTrainer( 287 | self.experiment_manifest[self.ecog_subjects[-1].subnet_id], 288 | self.ecog_subjects, 289 | **self.ST_kwargs, 290 | OOV_token=OOV_token, 291 | REPORT_TRAINING_LOSS=True, 292 | TARGETS_ARE_SEQUENCES=self.net.TARGETS_ARE_SEQUENCES, 293 | ) 294 | ######## 295 | 296 | # something of a hack here for multi_trainers 297 | assessments = torch_trainer.train_and_assess( 298 | self.N_epochs, self.net, device 299 | ) 300 | 301 | return assessments 302 | 303 | def parallel_transfer_learn(self, RESUME=False, fit_kwargs=()): 304 | ''' 305 | Parallel transfer learning 306 | ''' 307 | 308 | if RESUME: 309 | fit_kwargs = { 310 | '_restore_epoch': self.restore_epoch, 311 | **dict(fit_kwargs), 312 | 'train_vars_scope': 'seq2seq', 313 | 'reuse_vars_scope': 'seq2seq', 314 | } 315 | self.ecog_subjects = [self.ecog_subjects[-1]] 316 | 317 | # fit and save the results 318 | assessments = self.net.fit(self.ecog_subjects, **dict(fit_kwargs)) 319 | self._save_results(assessments) 320 | 321 | # to facilitate restoring/assessing, update hard-coded restore_epochs 322 | if self._restore_epoch is not None: 323 | self.restore_epoch = ( 324 | self.restore_epoch + self.net.N_epochs if RESUME else self.net.N_epochs 325 | ) 326 | 327 | return assessments 328 | 329 | def sequential_transfer_learn( 330 | self, pretraining_epochs=60, training_epochs=200, posttraining_epochs=340 331 | ): 332 | ''' 333 | Sequential transfer learning. 
334 |         '''
335 |
336 |         # set which layers are frozen, reused, reinitialized
337 |         proprietary_scopes = 'seq2seq/subnet'
338 |         reusable_scopes = 'seq2seq/(?!subnet)'  # negative lookahead
339 |
340 |         # train on each subject sequentially
341 |         fit_kwargs = {}
342 |         for subject in self.ecog_subjects:
343 |
344 |             # pre-training
345 |             if subject == self.ecog_subjects[0]:
346 |                 # first subject; do nothing but set up for next training phase
347 |                 latest_epoch = 0
348 |                 fit_kwargs['reuse_vars_scope'] = None
349 |             else:
350 |                 # first acquire this subject's encoder embedding
351 |                 self.net.N_epochs = pretraining_epochs
352 |                 fit_kwargs['train_vars_scope'] = proprietary_scopes
353 |                 fit_kwargs['reuse_vars_scope'] = reusable_scopes
354 |                 fit_kwargs['_restore_epoch'] = latest_epoch
355 |                 self.net.fit([subject], **fit_kwargs)
356 |
357 |                 # then set up for the next training phase
358 |                 latest_epoch += self.net.N_epochs
359 |                 fit_kwargs['_restore_epoch'] = latest_epoch
360 |                 fit_kwargs['reuse_vars_scope'] = 'seq2seq'
361 |
362 |             # full training
363 |             if subject == self.ecog_subjects[-1]:
364 |                 training_epochs += posttraining_epochs
365 |             self.net.N_epochs = training_epochs
366 |             fit_kwargs['train_vars_scope'] = 'seq2seq'
367 |             assessments = self.net.fit([subject], **fit_kwargs)
368 |             latest_epoch += self.net.N_epochs
369 |             self._save_results(assessments)
370 |
371 |         # to facilitate restoring and assessing, store this
372 |         self.restore_epoch = latest_epoch
373 |
374 |         return assessments
375 |
376 |     def assess_saved_model(self):
377 |
378 |         self.update_net_from_saved_model()
379 |         assessment_dict = self.net.restore_and_assess(
380 |             self.ecog_subjects, self.restore_epoch)
381 |         return assessment_dict
382 |
383 |     def update_net_from_saved_model(self):
384 |         # pull the model sizes from the saved file
385 |         self.net.layer_sizes, data_sizes, strides, EMA = self.recover_model_sizes()
386 |         self.net.TEMPORALLY_CONVOLVE = len(strides)
387 |         self.net.EMA_decay = 0.99*EMA
388 |
389 |         # these vary by subject
390 |         for subject in self.ecog_subjects:
391 |             s_id = subject.subnet_id
392 |             manifests = subject.data_manifests
393 |
394 |             #######
395 |             # This can fail for a non-standard data_mapping
396 |             for key, data_size in data_sizes[s_id].items():
397 |                 manifests[key].num_features = data_size
398 |             #######
399 |
400 |             # data sizes that hold for all subjects use the key None
401 |             if None in data_sizes.keys():
402 |                 for key, data_size in data_sizes[None].items():
403 |                     manifests[key].num_features = data_size
404 |
405 |             # convolutional?
406 |             if strides[s_id]:
407 |                 subject.decimation_factor = np.prod(strides[s_id])
408 |             # otherwise go with the default value
409 |
410 |     def _training_intersection_validation_union(
411 |         self, sequence_type, special_tokens=[]
412 |     ):
413 |         '''
414 |         Typically used when neither a vocab_list nor a vocab_file has been
415 |         provided, and no vocab_file.pkl has been found.
416 |         '''
417 |
418 |         # to get the class_list...
419 |         targets_list = list(reduce(
420 |             # ...reduce via the union across the DATA_PARTITIONS...
421 |             lambda A, B: A | B, [
422 |                 reduce(
423 |                     # ...of the reductions across the intersection or union...
424 |                     (lambda A, B: A & B) if data_partition == 'training' else (
425 |                         lambda A, B: A | B),
426 |                     # ...of the class_list of this data_partition
427 |                     [
428 |                         set(s.write_tf_records_maybe(
429 |                             sequence_type, [data_partition]
430 |                         )) for s in self.ecog_subjects
431 |                     ]
432 |                 ) for data_partition in DATA_PARTITIONS
433 |             ]
434 |         ))
435 |         self.vprint('All tf_records have been written...')
436 |
437 |         # insert at the beginning, and in order, any special_tokens
438 |         targets_list = [t for t in targets_list if t not in special_tokens]
439 |         for token in reversed(special_tokens):
440 |             targets_list.insert(0, token)
441 |
442 |         return targets_list
443 |
444 |     def recover_model_sizes(self):
445 |         #####
446 |         # TO DO:
447 |         # (1) maybe this should be put into SequenceNets, since it's hard-coded
448 |         # for that particular network....
449 |         #####
450 |
451 |         # extract the dictionary mapping long var names to shapes
452 |         reader = pywrap_tensorflow.NewCheckpointReader(os.path.join(
453 |             self.checkpoint_dir, 'model.ckpt') + '-%i' % self.restore_epoch)
454 |         var_to_shape = reader.get_variable_to_shape_map()
455 |
456 |         # Accumulate a useful structure of network sizes. You have to assemble
457 |         # the intermediate data structure before unpacking into
458 |         # the ones that will be returned because to put the layer sizes
459 |         # in order you need to collect all of them first.
460 |         net_info = defaultdict(lambda: defaultdict(dict))
461 |         EMA = False
462 |         for var_name, var_shape in var_to_shape.items():
463 |             name_scopes = var_name.split('/')
464 |             outer_scope = name_scopes.pop(0)
465 |
466 |             # note if an exponential moving average was used
467 |             if name_scopes[-1] == 'ExponentialMovingAverage':
468 |                 EMA = True
469 |
470 |             if outer_scope == 'seq2seq':
471 |                 subsubnet = name_scopes.pop(0)
472 |
473 |                 # if this is a subnetwork, find out which one
474 |                 if re.match(r'subnet_\d*', subsubnet):
475 |                     subnet_id = subsubnet.split('_')[1]
476 |                     subsubnet = name_scopes.pop(0)
477 |                 else:
478 |                     subnet_id = None
479 |
480 |                 # check if it's an RNN
481 |                 for scope in name_scopes:
482 |                     match_obj = re.match(r'cell_\d*', scope)
483 |                     if match_obj:
484 |                         layer_number = int(match_obj[0].split('_')[-1])
485 |                         break
486 |                 else:
487 |                     # it's not an RNN
488 |                     if name_scopes[0] == 'weights':
489 |                         # there are three numbers appended to each name
490 |                         subsubnet, _, _, layer_number = subsubnet.rsplit('_', 3)
491 |                         layer_number = int(layer_number)
492 |                     else:
493 |                         continue
494 |
495 |                 # store
496 |                 net_info[subnet_id][subsubnet][layer_number] = var_shape
497 |
498 |         # Now unpack into data structures useful for a SequenceNetwork
499 |         layer_sizes = {}
500 |         data_sizes = defaultdict(dict)
501 |         encoder_strides = defaultdict(list)
502 |
503 |         for subnet_id, subnet_info in net_info.items():
504 |             for subsubnet, subsubnet_info in subnet_info.items():
505 |
506 |                 # We *assume* (given the implementation of SequenceNets) that
507 |                 # the layer_sizes do not vary across subjects/subnet_ids! so
508 |                 # only the last subnet_id will count.
509 |                 layer_sizes[subsubnet] = []
510 |
511 |                 for layer_number in sorted(subsubnet_info.keys()):
512 |
513 |                     # the final projection layer is special
514 |                     if '_projection' in subsubnet and layer_number == max(
515 |                         subsubnet_info.keys()
516 |                     ):
517 |                         # The only relevant info is the output size. NB that
518 |                         # the weight matrix of this layer is *transposed*
519 |                         data_sizes[subnet_id][subsubnet.replace('_projection', '_targets')] = \
520 |                             subsubnet_info[layer_number][0]
521 |                     else:
522 |                         weight_shape = subsubnet_info[layer_number]
523 |
524 |                         # the base layer size (may need to be adjusted)
525 |                         layer_size = weight_shape[-1]
526 |
527 |                         # the LSTM variables pack together 4 weight matrices
528 |                         if '_rnn' in subsubnet:
529 |                             layer_size //= 4
530 |
531 |                         # add this layer size to the current list
532 |                         layer_sizes[subsubnet].append(layer_size)
533 |
534 |                         # the encoder_embedding is special
535 |                         if subsubnet == 'encoder_embedding':
536 |                             if len(weight_shape) == 4:
537 |                                 encoder_strides[subnet_id].append(weight_shape[1])
538 |
539 |                             # 1st encoder_embedding layer has info about input size
540 |                             if layer_number == min(subsubnet_info.keys()):
541 |                                 data_sizes[subnet_id]['encoder_inputs'] = weight_shape[-2]
542 |
543 |         # In SequenceNets, the encoder RNN is constructed in a python loop,
544 |         # rather than within a tf function, so it is the 'encoder_rnn' scope
545 |         # that gets numbered, rather than the cells. Here you convert all
546 |         # these 'encoder_rnn_n' keys to a single key, 'encoder_rnn'.
547 |         encoder_rnn_sizes = []
548 |         for layer_name, layer_size in sorted(layer_sizes.items()):
549 |             if layer_name.startswith('encoder_rnn'):
550 |                 encoder_rnn_sizes += layer_size
551 |                 layer_sizes.pop(layer_name)
552 |         layer_sizes['encoder_rnn'] = encoder_rnn_sizes
553 |
554 |         return layer_sizes, data_sizes, encoder_strides, EMA
555 |
556 |     def _save_results(self, assessments):
557 |         '''
558 |         Write out to a text file
559 |         '''
560 |
561 |         # the save-file path/name
562 |         subject = self.ecog_subjects[-1]
563 |         experiment_manifest = self.experiment_manifest[subject.subnet_id]
564 |         save_file_dir = experiment_manifest['saved_results_dir']
565 |         project = experiment_manifest['project']
566 |         save_file_path = os.path.join(
567 |             save_file_dir,
568 |             '_'.join(
569 |                 [
570 |                     'accuracies',
571 |                     project + '-'.join(str(s.subnet_id) for s in self.ecog_subjects),
572 |                     str(self.net.FF_dropout),
573 |                     str(self.net.RNN_dropout),
574 |                 ] + [
575 |                     '-'.join(str(N) for N in sizes)
576 |                     for key, sizes in sorted(self.net.layer_sizes.items())
577 |                 ]
578 |             )
579 |         )
580 |         print('save file is ' + save_file_path)
581 |
582 |         # variables used for plotting
583 |         plot_interval = self.net.assessment_epoch_interval
584 |         max_epoch = len(assessments['training'].decoder_accuracies)*plot_interval
585 |         accuracies_epochs = [epoch for epoch in range(0, max_epoch, plot_interval)]
586 |
587 |         # save the accuracies to a text file
588 |         np.savetxt(
589 |             save_file_path,
590 |             np.stack([
591 |                 assessments['training'].decoder_accuracies,
592 |                 assessments['training'].decoder_word_error_rates,
593 |                 assessments['validation'].decoder_accuracies,
594 |                 assessments['validation'].decoder_word_error_rates,
595 |                 np.array(accuracies_epochs)
596 |             ], axis=1),
597 |             fmt="%.4f",
598 |             header=(
599 |                 'training accs | training WERs | '
600 |                 'validation accs | validation WERs | epochs'
601 |             )
602 |         )
603 |
604 |         # confusion matrix looks bad in tensorboard, so rebuild here
605 |         decoder_targets_list = subject.data_manifests[
606 |             'decoder_targets'].get_feature_list()
607 |         N = subject.data_manifests['decoder_targets'].num_features
608 |         if N < 100:
609 |             fig_dimension = N//6
610 |             confusions = assessments['validation'].decoder_confusions
611 |             if confusions is not None:
612 |                 fig = heatmap_confusions(
613 |
plt.figure(figsize=(fig_dimension, fig_dimension)), 614 | confusions, 615 | x_axis_labels=decoder_targets_list, 616 | y_axis_labels=decoder_targets_list, 617 | ) 618 | fig.savefig(os.path.join( 619 | save_file_dir, '%s_confusions.pdf' % self._token_type), 620 | bbox_inches='tight') 621 | 622 | def count_all_targets(self, data_key='decoder_targets', threshold=0.4): 623 | 624 | # which targets do you want to count? 625 | targets_list = self.ecog_subjects[-1].data_manifests[ 626 | data_key].get_feature_list() 627 | 628 | # dump into two tuples (each entry in a tuple corresponds to a subject) 629 | target_counters, sequence_counters = zip(*[ 630 | subj.count_targets(targets_list, threshold) 631 | for subj in self.ecog_subjects 632 | ]) 633 | 634 | # convert tuples into dictionaries so we know which subject is which 635 | def tuple_to_dict(tpl): 636 | return {s.subnet_id: t for (s, t) in zip(self.ecog_subjects, tpl)} 637 | return tuple_to_dict(target_counters), tuple_to_dict(sequence_counters) 638 | 639 | def subject_to_table(self): 640 | subject_attributes = { 641 | 'block_types', 642 | 'block_ids', 643 | 'decimation_factor', 644 | } 645 | trainer_attributes = { 646 | # 'checkpoint_dir', 647 | 'restore_epoch', 648 | # 'vocab_file', 649 | } 650 | 651 | params_series = [pd.Series( 652 | { 653 | # **{k: v for k, v in s.__dict__.items() if not k.startswith('_')}, 654 | **{key: getattr(manifest, 'num_features') 655 | for key, manifest in s.data_manifests.items()}, 656 | **{'_'.join([manifest.sequence_type, 'vocab_list']): 657 | manifest.get_feature_list() 658 | for manifest in s.data_manifests.values() 659 | if manifest.distribution == 'categorical'}, 660 | **{attr: getattr(s, attr) for attr in subject_attributes}, 661 | **{attr: getattr(self, attr) for attr in trainer_attributes}, 662 | }, 663 | name=s.subnet_id) for s in self.ecog_subjects 664 | ] 665 | return pd.concat(params_series, axis=1).transpose() 666 | 667 | def print_tensor_names(self): 668 | ckpt = os.path.join(self.checkpoint_dir, 'model.ckpt') + '-' + repr( 669 | self.restore_epoch) 670 | print_tensors_in_checkpoint_file( 671 | file_name=ckpt, 672 | tensor_name='', 673 | all_tensors=False, 674 | all_tensor_names=False 675 | ) 676 | 677 | def cluster_embedded_words(self, weights_name, cluster_embeddings_kwargs=()): 678 | W = self._retrieve_layer_weights(weights_name) 679 | return plotters.cluster_embeddings(W, **cluster_embeddings_kwargs) 680 | 681 | def _retrieve_layer_weights(self, weights_name): 682 | 683 | # assemble the full name of the weights 684 | reader = pywrap_tensorflow.NewCheckpointReader( 685 | self.net.checkpoint_path + '-%i' % self.restore_epoch) 686 | var_to_shape = reader.get_variable_to_shape_map() 687 | weights_full_name = None 688 | for key in sorted(var_to_shape): 689 | #### 690 | # This isn't really right: the 0 says to use the *first* layer of 691 | # whatever part of the network, but you really should use the first 692 | # for the embedding and the last for the projection.... 693 | if re.match('.*{0}.*0/weights/ExponentialMovingAverage'.format( 694 | weights_name), key): 695 | weights_full_name = key 696 | assert weights_full_name, "Uh-oh, no such weights found! -- jgm" 697 | 698 | # extract this weight 699 | W = self.net.get_weights_as_numpy_array( 700 | weights_full_name, self.restore_epoch) 701 | return W 702 | 703 | def get_saliencies(self, contrib_method, assessment_type='norms'): 704 | ''' 705 | Compute average "saliency" of input electrodes by backpropagating 706 | error gradients into the inputs. 
707 | ''' 708 | 709 | # save the original penalties in a temporary variable 710 | old_penalties = {} 711 | subject = self.ecog_subjects[-1] 712 | for key, manifest in subject.data_manifests.items(): 713 | if '_targets' in key: 714 | old_penalties[key] = manifest.penalty_scale 715 | manifest.penalty_scale = 0.0 716 | 717 | # set the penalty for the output under consideration to 1.0 718 | key = contrib_method.replace('saliency_map', 'targets') 719 | subject.data_manifests[key].penalty_scale = 1.0 720 | 721 | # backpropagate error derivatives into the inputs 722 | contributions = self.net.restore_and_get_saliencies( 723 | [subject], self.restore_epoch, 724 | data_partition='validation', assessment_type=assessment_type 725 | ) 726 | 727 | # set the penalties back to their original value 728 | for key, manifest in subject.data_manifests.items(): 729 | if '_targets' in key: 730 | manifest.penalty_scale = old_penalties[key] 731 | 732 | return contributions 733 | 734 | def get_encoder_embedding(self): 735 | # fixed properties 736 | embedding_partial_name = ( 737 | 'seq2seq/subnet_{0}/encoder_embedding_{1}_{2}_0' 738 | '/weights/ExponentialMovingAverage' 739 | ) 740 | 741 | # first get the *name* of the weight matrix, based on its size 742 | layer_sizes, data_sizes, _, _ = self.recover_model_sizes() 743 | embedding_name = embedding_partial_name.format( 744 | self.subj_id, 745 | data_sizes[self.subj_id]['encoder_input'], 746 | layer_sizes['encoder_embedding'][0] 747 | ) 748 | 749 | # then get that matrix 750 | return self.net.get_weights_as_numpy_array( 751 | embedding_name, self.restore_epoch) 752 | 753 | ###### 754 | # You should make it easier to do what you do here. E.g., there should be 755 | # a more general way to make an appropriate AssessmentTuple. 756 | ###### 757 | def get_internal_activations(self): 758 | # You should make these arguments--although that would require getting 759 | # some other things to work.... 760 | op_strings = [ 761 | 'convolved_inputs', 762 | 'reversed_inputs', 763 | 'decimated_reversed_targets', 764 | 'final_RNN_state', 765 | ] 766 | 767 | # ... 
768 | subnet_params = self.ecog_subjects[-1] 769 | 770 | class BriefAssessmentTuple(MutableNamedTuple): 771 | __slots__ = ['initializer'] + op_strings 772 | 773 | def assessment_data_fxn(num_epochs): 774 | GPU_op_dict, CPU_op_dict, assessments = \ 775 | self.net._generate_oneshot_datasets(subnet_params, 0) 776 | brief_assessments = { 777 | 'validation': BriefAssessmentTuple( 778 | initializer=assessments['validation'].initializer, 779 | **{op_string: None for op_string in op_strings} 780 | ) 781 | } 782 | return GPU_op_dict, CPU_op_dict, brief_assessments 783 | 784 | def assessment_net_builder(GPU_op_dict, CPU_op_dict): 785 | with tf.variable_scope('seq2seq', reuse=tf.compat.v1.AUTO_REUSE): 786 | # reverse and decimate encoder targets 787 | #### 788 | # HARD-CODED for 'encoder_1_targets' 789 | _, get_targets_lengths = nn.sequences_tools( 790 | GPU_op_dict['encoder_1_targets']) 791 | reverse_targets = tf.reverse_sequence( 792 | GPU_op_dict['encoder_1_targets'], get_targets_lengths, 793 | seq_axis=1, batch_axis=0) 794 | decimate_reversed_targets = reverse_targets[ 795 | :, 0::subnet_params.decimation_factor, :] 796 | #### 797 | 798 | self.net._prepare_encoder_targets( 799 | GPU_op_dict, 0, subnet_params.decimation_factor) 800 | 801 | with tf.compat.v1.variable_scope( 802 | 'subnet_{}'.format(subnet_params.subnet_id,), 803 | reuse=tf.compat.v1.AUTO_REUSE 804 | ): 805 | # reverse inputs 806 | _, get_lengths = nn.sequences_tools(tfh.hide_shape( 807 | GPU_op_dict['encoder_inputs'])) 808 | reverse_inputs = tf.reverse_sequence( 809 | GPU_op_dict['encoder_inputs'], get_lengths, 810 | seq_axis=1, batch_axis=0) 811 | 812 | # convolve inputs 813 | convolve_reversed_inputs, _ = self.net._convolve_sequences( 814 | reverse_inputs, subnet_params.decimation_factor, 815 | subnet_params.data_manifests['encoder_inputs'].num_features, 816 | self.net.layer_sizes['encoder_embedding'], 0.0, 817 | 'encoder_embedding', tower_name='' 818 | ) 819 | 820 | # get the encoder state 821 | _, get_final_state, _, _ = self.net._encode_sequences( 822 | GPU_op_dict, subnet_params, 0.0, 0.0, set_initial_ind=0, 823 | ) 824 | 825 | # give names to these so you can recover them later 826 | decimate_reversed_targets = tf.identity( 827 | decimate_reversed_targets, 'assess_decimated_reversed_targets') 828 | convolve_reversed_inputs = tf.identity( 829 | convolve_reversed_inputs, 'assess_convolved_inputs') 830 | reverse_inputs = tf.identity( 831 | reverse_inputs, 'assess_reversed_inputs') 832 | get_final_state = tf.identity( 833 | get_final_state, 'assess_final_RNN_state') 834 | 835 | # one day you will be able to get rid of these... 
836 | return None, None 837 | 838 | def assessor( 839 | sess, assessment_struct, epoch, assessment_step, data_partition 840 | ): 841 | sess.run(assessment_struct.initializer) 842 | assessments = sess.run([ 843 | sess.graph.get_operation_by_name('assess_' + op_string).outputs[0] 844 | for op_string in op_strings] 845 | ) 846 | for op_string, assessment in zip(op_strings, assessments): 847 | setattr(assessment_struct, op_string, assessment) 848 | 849 | return assessment_struct 850 | 851 | # use the general graph build to assemble these pieces 852 | graph_builder = tfh.GraphBuilder( 853 | None, assessment_data_fxn, None, assessment_net_builder, None, 854 | assessor, self.net.checkpoint_path, self.restore_epoch, 855 | self.restore_epoch-1, EMA_decay=self.net.EMA_decay, 856 | assessment_GPU=self.net.assessment_GPU, 857 | ) 858 | 859 | return graph_builder.assess() 860 | 861 | def tf_record_to_numpy_data(self, subj_id, block_id): 862 | ''' 863 | It is frequently useful to inspect the content of the tf_records. 864 | 865 | NB: this method *does* reshape flattened ECoG data, but does *not* 866 | substitute indices for strings. 867 | 868 | USAGE: 869 | for example in trainer.tf_record_to_numpy_data(401, 4): 870 | print(example.keys()) 871 | ''' 872 | 873 | tf.compat.v1.disable_eager_execution() 874 | tf.compat.v1.reset_default_graph() 875 | 876 | # get the requested ECoGSubject 877 | for subject in self.ecog_subjects: 878 | if subject.subj_id == subj_id: 879 | break 880 | else: 881 | raise ValueError('Requested subject not in this trainer') 882 | 883 | # block default transforms, e.g. of strings to indices; see subjects.py 884 | None_transforms = [] 885 | for key, data_manifest in subject.data_manifests.items(): 886 | if data_manifest._transform is None: 887 | None_transforms.append(key) 888 | subject.data_manifests[key]._transform = lambda seq: seq 889 | 890 | # pull the tf_record into a TF dataset 891 | dataset = tf.data.TFRecordDataset( 892 | [subject.tf_record_partial_path.format(block_id)] 893 | ) 894 | 895 | # parse according to the info in the data_manifests 896 | dataset = dataset.map( 897 | lambda example_proto: tfh.parse_protobuf_seq2seq_example( 898 | example_proto, subject.data_manifests 899 | ), 900 | num_parallel_calls=tf.data.experimental.AUTOTUNE 901 | ) 902 | 903 | # remove transform blocking by restoring original transforms 904 | for key in None_transforms: 905 | subject.data_manifests[key]._transform = None 906 | 907 | # set up the one-shot iterator 908 | iterator = tf.compat.v1.data.Iterator.from_structure( 909 | tf.compat.v1.data.get_output_types(dataset), 910 | tf.compat.v1.data.get_output_shapes(dataset) 911 | ) 912 | initializer = iterator.make_initializer(dataset) 913 | sequenced_op_dict = iterator.get_next() 914 | 915 | # finally, transform to numpy data 916 | with tf.compat.v1.Session() as sess: 917 | sess.run(initializer) 918 | while True: 919 | try: 920 | yield sess.run(sequenced_op_dict) 921 | except tf.errors.OutOfRangeError: 922 | break 923 | 924 | 925 | def construct_online_predictor( 926 | restore_dir, targets_list=None, TARGETS_ARE_SEQUENCES=False 927 | ): 928 | 929 | # open a session with the saved_model loaded into it 930 | sess = tfh.get_session_with_saved_model(restore_dir) 931 | 932 | # create a function which uses this session to decode 933 | def predict(inputs): 934 | decoded_probs, sequenced_decoder_outputs = sess.run( 935 | ['decoder_probs:0', 'decoder_outputs:0'], 936 | feed_dict={'encoder_inputs:0': inputs} 937 | ) 938 | if targets_list: 939 | 
tokens_list = ( 940 | targets_list if TARGETS_ARE_SEQUENCES else 941 | nn.targets_to_tokens(targets_list, pad_token) 942 | ) 943 | hypotheses = target_inds_to_sequences( 944 | sequenced_decoder_outputs, tokens_list)[0] 945 | return hypotheses 946 | else: 947 | return decoded_probs 948 | 949 | return predict 950 | 951 | 952 | def target_inds_to_sequences(hypotheses, targets_list, iExample=0): 953 | ###### 954 | # This is redundant with the one in SequenceNets. Think about 955 | # the best place to put a single version.... 956 | ###### 957 | predicted_tokens = [ 958 | ''.join([targets_list[ind] for ind in hypothesis]).replace( 959 | '_', ' ').replace(pad_token, '').replace( 960 | EOS_token, '').rstrip() 961 | for hypothesis in hypotheses[iExample] 962 | ] 963 | return predicted_tokens 964 | 965 | 966 | def _identity(x): 967 | return x 968 | -------------------------------------------------------------------------------- /ecog2txt/auxiliary/EFC/block_breakdowns.json: -------------------------------------------------------------------------------- 1 | { 2 | "397": { 3 | "20": { 4 | "audio": false, 5 | "bipolar": false, 6 | "default_dataset": "training", 7 | "type": "mocha-1" 8 | }, 9 | "24": { 10 | "audio": false, 11 | "bipolar": false, 12 | "default_dataset": "training", 13 | "type": "mocha-2" 14 | }, 15 | "40": { 16 | "audio": false, 17 | "bipolar": false, 18 | "default_dataset": "training", 19 | "type": "mocha-5" 20 | }, 21 | "45": { 22 | "audio": false, 23 | "bipolar": false, 24 | "default_dataset": "training", 25 | "type": "mocha-6" 26 | }, 27 | "46": { 28 | "audio": false, 29 | "bipolar": false, 30 | "default_dataset": "training", 31 | "type": "mocha-7" 32 | }, 33 | "48": { 34 | "audio": false, 35 | "bipolar": false, 36 | "default_dataset": "training", 37 | "type": "mocha-8" 38 | }, 39 | "54": { 40 | "audio": false, 41 | "bipolar": false, 42 | "default_dataset": "training", 43 | "type": "mocha-9" 44 | }, 45 | "55": { 46 | "audio": false, 47 | "bipolar": false, 48 | "default_dataset": "validation", 49 | "type": "mocha-1" 50 | }, 51 | "57": { 52 | "audio": false, 53 | "bipolar": false, 54 | "default_dataset": "training", 55 | "type": "mocha-2" 56 | }, 57 | "58": { 58 | "audio": false, 59 | "bipolar": false, 60 | "default_dataset": "training", 61 | "type": "mocha-3" 62 | }, 63 | "60": { 64 | "audio": false, 65 | "bipolar": false, 66 | "default_dataset": "training", 67 | "type": "mocha-4" 68 | }, 69 | "61": { 70 | "audio": false, 71 | "bipolar": false, 72 | "default_dataset": "training", 73 | "type": "mocha-5" 74 | }, 75 | "63": { 76 | "audio": false, 77 | "bipolar": false, 78 | "default_dataset": "training", 79 | "type": "mocha-6" 80 | }, 81 | "64": { 82 | "audio": false, 83 | "bipolar": false, 84 | "default_dataset": "training", 85 | "type": "mocha-7" 86 | }, 87 | "66": { 88 | "audio": false, 89 | "bipolar": false, 90 | "default_dataset": "training", 91 | "type": "mocha-8" 92 | }, 93 | "67": { 94 | "audio": false, 95 | "bipolar": false, 96 | "default_dataset": "training", 97 | "type": "mocha-9" 98 | } 99 | }, 100 | "398": { 101 | "3": { 102 | "audio": false, 103 | "bipolar": false, 104 | "default_dataset": "training", 105 | "type": "mocha-1" 106 | }, 107 | "4": { 108 | "audio": false, 109 | "bipolar": false, 110 | "default_dataset": "training", 111 | "type": "mocha-2" 112 | }, 113 | "8": { 114 | "audio": false, 115 | "bipolar": false, 116 | "default_dataset": "training", 117 | "type": "mocha-3" 118 | }, 119 | "11": { 120 | "audio": false, 121 | "bipolar": false, 122 | "default_dataset": 
"training", 123 | "type": "mocha-4" 124 | }, 125 | "12": { 126 | "audio": false, 127 | "bipolar": false, 128 | "default_dataset": "training", 129 | "type": "mocha-5" 130 | }, 131 | "13": { 132 | "audio": false, 133 | "bipolar": false, 134 | "default_dataset": "training", 135 | "type": "mocha-6" 136 | }, 137 | "14": { 138 | "audio": false, 139 | "bipolar": false, 140 | "default_dataset": "training", 141 | "type": "mocha-7" 142 | }, 143 | "15": { 144 | "audio": false, 145 | "bipolar": false, 146 | "default_dataset": "training", 147 | "type": "mocha-8" 148 | }, 149 | "16": { 150 | "audio": false, 151 | "bipolar": false, 152 | "default_dataset": "training", 153 | "type": "mocha-9" 154 | }, 155 | "20": { 156 | "audio": false, 157 | "bipolar": false, 158 | "default_dataset": "validation", 159 | "type": "mocha-1" 160 | }, 161 | "22": { 162 | "audio": false, 163 | "bipolar": false, 164 | "default_dataset": "training", 165 | "type": "mocha-2" 166 | }, 167 | "27": { 168 | "audio": false, 169 | "bipolar": false, 170 | "default_dataset": "training", 171 | "type": "mocha-3" 172 | }, 173 | "29": { 174 | "audio": false, 175 | "bipolar": false, 176 | "default_dataset": "training", 177 | "type": "mocha-4" 178 | }, 179 | "36": { 180 | "audio": false, 181 | "bipolar": false, 182 | "default_dataset": "training", 183 | "type": "mocha-5" 184 | }, 185 | "38": { 186 | "audio": false, 187 | "bipolar": false, 188 | "default_dataset": "training", 189 | "type": "mocha-6" 190 | }, 191 | "40": { 192 | "audio": false, 193 | "bipolar": false, 194 | "default_dataset": "training", 195 | "type": "mocha-7" 196 | }, 197 | "41": { 198 | "audio": false, 199 | "bipolar": false, 200 | "default_dataset": "training", 201 | "type": "mocha-8" 202 | }, 203 | "42": { 204 | "audio": false, 205 | "bipolar": false, 206 | "default_dataset": "training", 207 | "type": "mocha-9" 208 | } 209 | }, 210 | "399": { 211 | "26": { 212 | "audio": false, 213 | "bipolar": false, 214 | "default_dataset": "training", 215 | "type": "mocha-1" 216 | }, 217 | "27": { 218 | "audio": false, 219 | "bipolar": false, 220 | "default_dataset": "training", 221 | "type": "mocha-2" 222 | }, 223 | "28": { 224 | "audio": false, 225 | "bipolar": false, 226 | "default_dataset": "training", 227 | "type": "mocha-3" 228 | }, 229 | "38": { 230 | "audio": false, 231 | "bipolar": false, 232 | "default_dataset": "training", 233 | "type": "mocha-4" 234 | }, 235 | "39": { 236 | "audio": false, 237 | "bipolar": false, 238 | "default_dataset": "training", 239 | "type": "mocha-5" 240 | }, 241 | "42": { 242 | "audio": false, 243 | "bipolar": false, 244 | "default_dataset": "training", 245 | "type": "mocha-6" 246 | }, 247 | "43": { 248 | "audio": false, 249 | "bipolar": false, 250 | "default_dataset": "training", 251 | "type": "mocha-7" 252 | }, 253 | "46": { 254 | "audio": false, 255 | "bipolar": false, 256 | "default_dataset": "training", 257 | "type": "mocha-9" 258 | } 259 | }, 260 | "400": { 261 | "3": { 262 | "audio": true, 263 | "bipolar": true, 264 | "default_dataset": "training", 265 | "type": "mocha-1" 266 | }, 267 | "4": { 268 | "audio": true, 269 | "bipolar": true, 270 | "default_dataset": "training", 271 | "type": "mocha-2" 272 | }, 273 | "6": { 274 | "audio": true, 275 | "bipolar": true, 276 | "default_dataset": "training", 277 | "type": "mocha-3" 278 | }, 279 | "8": { 280 | "audio": true, 281 | "bipolar": true, 282 | "default_dataset": "training", 283 | "type": "mocha-4" 284 | }, 285 | "10": { 286 | "audio": true, 287 | "bipolar": true, 288 | "default_dataset": 
"training", 289 | "type": "mocha-5" 290 | }, 291 | "12": { 292 | "audio": true, 293 | "bipolar": true, 294 | "default_dataset": "training", 295 | "type": "mocha-6" 296 | }, 297 | "14": { 298 | "audio": true, 299 | "bipolar": true, 300 | "default_dataset": "training", 301 | "type": "mocha-7" 302 | }, 303 | "15": { 304 | "audio": true, 305 | "bipolar": true, 306 | "default_dataset": "training", 307 | "type": "mocha-8" 308 | }, 309 | "19": { 310 | "audio": true, 311 | "bipolar": true, 312 | "default_dataset": "training", 313 | "type": "mocha-9" 314 | }, 315 | "23": { 316 | "audio": true, 317 | "bipolar": true, 318 | "default_dataset": "training", 319 | "type": "mocha-1" 320 | }, 321 | "28": { 322 | "audio": true, 323 | "bipolar": true, 324 | "default_dataset": "training", 325 | "type": "mocha-2" 326 | }, 327 | "30": { 328 | "audio": true, 329 | "bipolar": true, 330 | "default_dataset": "training", 331 | "type": "mocha-3" 332 | }, 333 | "38": { 334 | "audio": true, 335 | "bipolar": true, 336 | "default_dataset": "training", 337 | "type": "mocha-4" 338 | }, 339 | "40": { 340 | "audio": true, 341 | "bipolar": true, 342 | "default_dataset": "training", 343 | "type": "mocha-5" 344 | }, 345 | "42": { 346 | "audio": true, 347 | "bipolar": true, 348 | "default_dataset": "training", 349 | "type": "mocha-6" 350 | }, 351 | "46": { 352 | "audio": true, 353 | "bipolar": true, 354 | "default_dataset": "training", 355 | "type": "mocha-7" 356 | }, 357 | "57": { 358 | "audio": true, 359 | "bipolar": true, 360 | "default_dataset": "training", 361 | "type": "mocha-8" 362 | }, 363 | "61": { 364 | "audio": true, 365 | "bipolar": true, 366 | "default_dataset": "training", 367 | "type": "mocha-9" 368 | }, 369 | "72": { 370 | "audio": true, 371 | "bipolar": true, 372 | "default_dataset": "validation", 373 | "type": "mocha-1" 374 | }, 375 | "73": { 376 | "audio": false, 377 | "bipolar": false, 378 | "default_dataset": "extra", 379 | "type": "mocha-2" 380 | }, 381 | "82": { 382 | "audio": false, 383 | "bipolar": false, 384 | "default_dataset": "extra", 385 | "type": "mocha-3" 386 | } 387 | }, 388 | "401": { 389 | "4": { 390 | "audio": true, 391 | "bipolar": true, 392 | "default_dataset": "training", 393 | "type": "mocha-1" 394 | }, 395 | "6": { 396 | "audio": true, 397 | "bipolar": true, 398 | "default_dataset": "training", 399 | "type": "mocha-2" 400 | }, 401 | "8": { 402 | "audio": true, 403 | "bipolar": true, 404 | "default_dataset": "training", 405 | "type": "mocha-3" 406 | }, 407 | "12": { 408 | "audio": true, 409 | "bipolar": true, 410 | "default_dataset": "training", 411 | "type": "mocha-4" 412 | }, 413 | "17": { 414 | "audio": true, 415 | "bipolar": true, 416 | "default_dataset": "training", 417 | "type": "mocha-5" 418 | }, 419 | "18": { 420 | "audio": true, 421 | "bipolar": true, 422 | "default_dataset": "training", 423 | "type": "mocha-6" 424 | }, 425 | "20": { 426 | "audio": true, 427 | "bipolar": true, 428 | "default_dataset": "training", 429 | "type": "mocha-7" 430 | }, 431 | "32": { 432 | "audio": true, 433 | "bipolar": true, 434 | "default_dataset": "training", 435 | "type": "mocha-8" 436 | }, 437 | "34": { 438 | "audio": true, 439 | "bipolar": true, 440 | "default_dataset": "training", 441 | "type": "mocha-9" 442 | }, 443 | "41": { 444 | "audio": true, 445 | "bipolar": true, 446 | "default_dataset": "training", 447 | "type": "mocha-1" 448 | }, 449 | "57": { 450 | "audio": true, 451 | "bipolar": true, 452 | "default_dataset": "training", 453 | "type": "mocha-1" 454 | }, 455 | "61": { 456 | "audio": 
true, 457 | "bipolar": true, 458 | "default_dataset": "training", 459 | "type": "mocha-1" 460 | }, 461 | "66": { 462 | "audio": true, 463 | "bipolar": true, 464 | "default_dataset": "training", 465 | "type": "mocha-1" 466 | }, 467 | "69": { 468 | "audio": true, 469 | "bipolar": true, 470 | "default_dataset": "training", 471 | "type": "mocha-1" 472 | }, 473 | "73": { 474 | "audio": true, 475 | "bipolar": true, 476 | "default_dataset": "training", 477 | "type": "mocha-1" 478 | }, 479 | "77": { 480 | "audio": true, 481 | "bipolar": true, 482 | "default_dataset": "training", 483 | "type": "mocha-1" 484 | }, 485 | "83": { 486 | "audio": true, 487 | "bipolar": true, 488 | "default_dataset": "validation", 489 | "type": "mocha-1" 490 | }, 491 | "87": { 492 | "audio": true, 493 | "bipolar": true, 494 | "default_dataset": "testing", 495 | "type": "mocha-1" 496 | } 497 | }, 498 | "402": { 499 | "3": { 500 | "audio": true, 501 | "bipolar": true, 502 | "day": 0, 503 | "default_dataset": "training", 504 | "hour": 11, 505 | "type": "demo_2_sentences" 506 | }, 507 | "4": { 508 | "audio": true, 509 | "bipolar": true, 510 | "day": 0, 511 | "default_dataset": "training", 512 | "hour": 11, 513 | "type": "picture_description_reduced" 514 | }, 515 | "5": { 516 | "audio": true, 517 | "bipolar": true, 518 | "day": 0, 519 | "default_dataset": "training", 520 | "hour": 16, 521 | "type": "picture_description_reduced" 522 | }, 523 | "6": { 524 | "audio": true, 525 | "bipolar": true, 526 | "day": 0, 527 | "default_dataset": "training", 528 | "hour": 16, 529 | "type": "picture_description_reduced" 530 | }, 531 | "7": { 532 | "audio": true, 533 | "bipolar": true, 534 | "day": 0, 535 | "default_dataset": "training", 536 | "hour": 16, 537 | "type": "picture_description_reduced" 538 | }, 539 | "8": { 540 | "audio": true, 541 | "bipolar": true, 542 | "day": 0, 543 | "default_dataset": "training", 544 | "hour": 16, 545 | "type": "picture_description_reduced" 546 | }, 547 | "9": { 548 | "audio": true, 549 | "bipolar": true, 550 | "day": 0, 551 | "default_dataset": "training", 552 | "hour": 16, 553 | "type": "picture_description_reduced" 554 | }, 555 | "11": { 556 | "audio": true, 557 | "bipolar": true, 558 | "day": 0, 559 | "default_dataset": "validation", 560 | "hour": 16, 561 | "type": "mocha-1" 562 | }, 563 | "12": { 564 | "audio": true, 565 | "bipolar": true, 566 | "day": 0, 567 | "default_dataset": "training", 568 | "hour": 16, 569 | "type": "mocha-2" 570 | }, 571 | "13": { 572 | "audio": true, 573 | "bipolar": true, 574 | "day": 0, 575 | "default_dataset": "training", 576 | "hour": 16, 577 | "type": "picture_description_reduced" 578 | }, 579 | "14": { 580 | "audio": true, 581 | "bipolar": true, 582 | "day": 0, 583 | "default_dataset": "training", 584 | "hour": 16, 585 | "type": "picture_description_reduced" 586 | }, 587 | "15": { 588 | "audio": true, 589 | "bipolar": true, 590 | "day": 0, 591 | "default_dataset": "training", 592 | "hour": 16, 593 | "type": "picture_description_reduced" 594 | }, 595 | "16": { 596 | "audio": true, 597 | "bipolar": true, 598 | "day": 0, 599 | "default_dataset": "training", 600 | "hour": 16, 601 | "type": "picture_description_reduced" 602 | }, 603 | "17": { 604 | "audio": true, 605 | "bipolar": true, 606 | "day": 1, 607 | "default_dataset": "training", 608 | "hour": 11, 609 | "type": "picture_description_kitchen_reduced" 610 | }, 611 | "18": { 612 | "audio": true, 613 | "bipolar": true, 614 | "day": 1, 615 | "default_dataset": "training", 616 | "hour": 11, 617 | "type": 
"picture_description_birthday_reduced" 618 | }, 619 | "19": { 620 | "audio": true, 621 | "bipolar": true, 622 | "day": 1, 623 | "default_dataset": "training", 624 | "hour": 11, 625 | "type": "picture_description_tree_reduced" 626 | }, 627 | "20": { 628 | "audio": true, 629 | "bipolar": true, 630 | "day": 1, 631 | "default_dataset": "training", 632 | "hour": 11, 633 | "type": "mocha-3" 634 | }, 635 | "21": { 636 | "audio": true, 637 | "bipolar": true, 638 | "day": 1, 639 | "default_dataset": "training", 640 | "hour": 11, 641 | "type": "mocha-4" 642 | }, 643 | "22": { 644 | "audio": true, 645 | "bipolar": true, 646 | "day": 1, 647 | "default_dataset": "training", 648 | "hour": 11, 649 | "type": "mocha-5" 650 | }, 651 | "25": { 652 | "audio": true, 653 | "bipolar": true, 654 | "day": 1, 655 | "default_dataset": "training", 656 | "hour": 15, 657 | "type": "picture_description_kitchen_reduced" 658 | }, 659 | "26": { 660 | "audio": true, 661 | "bipolar": true, 662 | "day": 1, 663 | "default_dataset": "training", 664 | "hour": 15, 665 | "type": "picture_description_tree_reduced" 666 | }, 667 | "27": { 668 | "audio": true, 669 | "bipolar": true, 670 | "day": 1, 671 | "default_dataset": "training", 672 | "hour": 15, 673 | "type": "picture_description_birthday_reduced" 674 | }, 675 | "28": { 676 | "audio": true, 677 | "bipolar": true, 678 | "day": 1, 679 | "default_dataset": "training", 680 | "hour": 15, 681 | "type": "mocha-6" 682 | }, 683 | "29": { 684 | "audio": true, 685 | "bipolar": true, 686 | "day": 1, 687 | "default_dataset": "training", 688 | "hour": 15, 689 | "type": "mocha-7" 690 | }, 691 | "30": { 692 | "audio": true, 693 | "bipolar": true, 694 | "day": 1, 695 | "default_dataset": "training", 696 | "hour": 15, 697 | "type": "mocha-8" 698 | }, 699 | "31": { 700 | "audio": true, 701 | "bipolar": true, 702 | "day": 1, 703 | "default_dataset": "training", 704 | "hour": 15, 705 | "type": "mocha-9" 706 | }, 707 | "33": { 708 | "audio": true, 709 | "bipolar": true, 710 | "day": 2, 711 | "default_dataset": "training", 712 | "hour": 16, 713 | "type": "picture_description_kitchen_reduced" 714 | }, 715 | "34": { 716 | "audio": true, 717 | "bipolar": true, 718 | "day": 2, 719 | "default_dataset": "training", 720 | "hour": 16, 721 | "type": "picture_description_birthday_reduced" 722 | }, 723 | "35": { 724 | "audio": true, 725 | "bipolar": true, 726 | "day": 2, 727 | "default_dataset": "training", 728 | "hour": 16, 729 | "type": "picture_description_tree_reduced" 730 | }, 731 | "41": { 732 | "audio": true, 733 | "bipolar": true, 734 | "day": 4, 735 | "default_dataset": "training", 736 | "hour": 13.25, 737 | "type": "mocha-1" 738 | }, 739 | "44": { 740 | "audio": true, 741 | "bipolar": true, 742 | "day": 4, 743 | "default_dataset": "training", 744 | "hour": 13.5, 745 | "type": "picture_description_kitchen_reduced" 746 | }, 747 | "45": { 748 | "audio": true, 749 | "bipolar": true, 750 | "day": 4, 751 | "default_dataset": "validation", 752 | "hour": 13.5, 753 | "type": "picture_description_kitchen_reduced" 754 | }, 755 | "46": { 756 | "audio": true, 757 | "bipolar": true, 758 | "day": 4, 759 | "default_dataset": "validation", 760 | "hour": 13.5, 761 | "type": "picture_description_birthday_reduced" 762 | }, 763 | "47": { 764 | "audio": true, 765 | "bipolar": true, 766 | "day": 4, 767 | "default_dataset": "validation", 768 | "hour": 13.5, 769 | "type": "picture_description_tree_reduced" 770 | }, 771 | "48": { 772 | "audio": true, 773 | "bipolar": true, 774 | "day": 4, 775 | "default_dataset": "training", 
776 | "hour": 13.5, 777 | "type": "picture_description_kitchen_reduced" 778 | }, 779 | "49": { 780 | "audio": true, 781 | "bipolar": true, 782 | "day": 4, 783 | "default_dataset": "training", 784 | "hour": 13.5, 785 | "type": "picture_description_kitchen_reduced" 786 | }, 787 | "50": { 788 | "audio": true, 789 | "bipolar": true, 790 | "day": 4, 791 | "default_dataset": "training", 792 | "hour": 14, 793 | "type": "mocha-2" 794 | }, 795 | "51": { 796 | "audio": true, 797 | "bipolar": true, 798 | "day": 5, 799 | "default_dataset": "training", 800 | "hour": 15, 801 | "type": "mocha-3" 802 | }, 803 | "52": { 804 | "audio": true, 805 | "bipolar": true, 806 | "day": 5, 807 | "default_dataset": "training", 808 | "hour": 15, 809 | "type": "mocha-4" 810 | }, 811 | "53": { 812 | "audio": true, 813 | "bipolar": true, 814 | "day": 5, 815 | "default_dataset": "training", 816 | "hour": 15, 817 | "type": "mocha-5" 818 | }, 819 | "54": { 820 | "audio": true, 821 | "bipolar": true, 822 | "day": 5, 823 | "default_dataset": "training", 824 | "hour": 15, 825 | "type": "mocha-6" 826 | }, 827 | "55": { 828 | "audio": true, 829 | "bipolar": true, 830 | "day": 5, 831 | "default_dataset": "training", 832 | "hour": 15, 833 | "type": "mocha-7" 834 | }, 835 | "56": { 836 | "audio": false, 837 | "bipolar": true, 838 | "day": 6, 839 | "default_dataset": "testing", 840 | "hour": 9, 841 | "type": "mocha-8" 842 | }, 843 | "57": { 844 | "audio": false, 845 | "bipolar": true, 846 | "day": 6, 847 | "default_dataset": "testing", 848 | "hour": 9.3, 849 | "type": "mocha-9" 850 | }, 851 | "58": { 852 | "audio": true, 853 | "bipolar": true, 854 | "day": 6, 855 | "default_dataset": "testing", 856 | "hour": 9.3, 857 | "type": "picture_description_kitchen_reduced" 858 | }, 859 | "59": { 860 | "audio": true, 861 | "bipolar": true, 862 | "day": 6, 863 | "default_dataset": "training", 864 | "hour": 9.3, 865 | "type": "picture_description_kitchen_reduced_mimed" 866 | }, 867 | "60": { 868 | "audio": true, 869 | "bipolar": true, 870 | "day": 6, 871 | "default_dataset": "training", 872 | "hour": 9.3, 873 | "type": "picture_description_kitchen_reduced_mimed" 874 | } 875 | }, 876 | "403": { 877 | "3": { 878 | "audio": true, 879 | "bipolar": true, 880 | "default_dataset": "training", 881 | "type": "mocha-1" 882 | }, 883 | "4": { 884 | "audio": true, 885 | "bipolar": true, 886 | "default_dataset": "training", 887 | "type": "picture_description_reduced" 888 | }, 889 | "5": { 890 | "audio": false, 891 | "bipolar": true, 892 | "default_dataset": "training", 893 | "type": "interview" 894 | }, 895 | "6": { 896 | "audio": true, 897 | "bipolar": true, 898 | "default_dataset": "training", 899 | "type": "mocha-2" 900 | }, 901 | "7": { 902 | "audio": true, 903 | "bipolar": true, 904 | "default_dataset": "training", 905 | "type": "picture_description_reduced" 906 | }, 907 | "8": { 908 | "audio": false, 909 | "bipolar": true, 910 | "default_dataset": "training", 911 | "type": "picture_description" 912 | }, 913 | "9": { 914 | "audio": true, 915 | "bipolar": true, 916 | "default_dataset": "training", 917 | "type": "mocha-3" 918 | }, 919 | "10": { 920 | "audio": true, 921 | "bipolar": true, 922 | "default_dataset": "training", 923 | "type": "picture_description_reduced" 924 | }, 925 | "11": { 926 | "audio": false, 927 | "bipolar": true, 928 | "default_dataset": "training", 929 | "type": "interview" 930 | }, 931 | "12": { 932 | "audio": true, 933 | "bipolar": true, 934 | "default_dataset": "training", 935 | "type": "mocha-4" 936 | }, 937 | "13": { 938 | 
"audio": true, 939 | "bipolar": true, 940 | "default_dataset": "training", 941 | "type": "picture_description_reduced" 942 | }, 943 | "14": { 944 | "audio": false, 945 | "bipolar": true, 946 | "default_dataset": "training", 947 | "type": "picture_description" 948 | }, 949 | "15": { 950 | "audio": true, 951 | "bipolar": true, 952 | "default_dataset": "training", 953 | "type": "mocha-5" 954 | }, 955 | "16": { 956 | "audio": false, 957 | "bipolar": true, 958 | "default_dataset": "training", 959 | "type": "interview" 960 | }, 961 | "17": { 962 | "audio": true, 963 | "bipolar": true, 964 | "default_dataset": "training", 965 | "type": "mocha-6" 966 | }, 967 | "18": { 968 | "audio": true, 969 | "bipolar": true, 970 | "default_dataset": "training", 971 | "type": "mocha-7" 972 | }, 973 | "19": { 974 | "audio": true, 975 | "bipolar": true, 976 | "default_dataset": "training", 977 | "type": "picture_description_reduced" 978 | }, 979 | "20": { 980 | "audio": true, 981 | "bipolar": true, 982 | "default_dataset": "training", 983 | "type": "picture_description_reduced" 984 | }, 985 | "21": { 986 | "audio": true, 987 | "bipolar": true, 988 | "default_dataset": "training", 989 | "type": "picture_description_reduced" 990 | }, 991 | "22": { 992 | "audio": true, 993 | "bipolar": true, 994 | "default_dataset": "training", 995 | "type": "mocha-8" 996 | }, 997 | "25": { 998 | "audio": false, 999 | "bipolar": true, 1000 | "default_dataset": "training", 1001 | "type": "interview" 1002 | }, 1003 | "27": { 1004 | "audio": true, 1005 | "bipolar": true, 1006 | "default_dataset": "training", 1007 | "type": "mocha-9" 1008 | }, 1009 | "28": { 1010 | "audio": true, 1011 | "bipolar": true, 1012 | "default_dataset": "training", 1013 | "type": "picture_description_birthday_reduced_mimed" 1014 | }, 1015 | "29": { 1016 | "audio": false, 1017 | "bipolar": true, 1018 | "default_dataset": "training", 1019 | "type": "picture_description" 1020 | }, 1021 | "30": { 1022 | "audio": true, 1023 | "bipolar": true, 1024 | "default_dataset": "validation", 1025 | "type": "mocha-1" 1026 | }, 1027 | "31": { 1028 | "audio": false, 1029 | "bipolar": true, 1030 | "default_dataset": "training", 1031 | "type": "interview" 1032 | }, 1033 | "33": { 1034 | "audio": true, 1035 | "bipolar": true, 1036 | "default_dataset": "training", 1037 | "type": "mocha-2" 1038 | }, 1039 | "34": { 1040 | "audio": false, 1041 | "bipolar": true, 1042 | "default_dataset": "training", 1043 | "type": "interview" 1044 | }, 1045 | "35": { 1046 | "audio": true, 1047 | "bipolar": true, 1048 | "default_dataset": "training", 1049 | "type": "picture_description_kitchen_reduced_mimed" 1050 | }, 1051 | "36": { 1052 | "audio": false, 1053 | "bipolar": true, 1054 | "default_dataset": "training", 1055 | "type": "picture_description" 1056 | }, 1057 | "37": { 1058 | "audio": false, 1059 | "bipolar": true, 1060 | "default_dataset": "training", 1061 | "type": "interview" 1062 | }, 1063 | "38": { 1064 | "audio": true, 1065 | "bipolar": true, 1066 | "default_dataset": "training", 1067 | "type": "mocha-3" 1068 | }, 1069 | "39": { 1070 | "audio": true, 1071 | "bipolar": true, 1072 | "default_dataset": "training", 1073 | "type": "picture_description_birthday_reduced_mimed" 1074 | }, 1075 | "40": { 1076 | "audio": true, 1077 | "bipolar": true, 1078 | "default_dataset": "training", 1079 | "type": "mocha-4" 1080 | }, 1081 | "41": { 1082 | "audio": false, 1083 | "bipolar": true, 1084 | "default_dataset": "training", 1085 | "type": "picture_description" 1086 | }, 1087 | "42": { 1088 | "audio": 
true, 1089 | "bipolar": true, 1090 | "default_dataset": "training", 1091 | "type": "mocha-5" 1092 | }, 1093 | "44": { 1094 | "audio": true, 1095 | "bipolar": true, 1096 | "default_dataset": "training", 1097 | "type": "mocha-6" 1098 | }, 1099 | "46": { 1100 | "audio": true, 1101 | "bipolar": true, 1102 | "default_dataset": "training", 1103 | "type": "mocha-6" 1104 | }, 1105 | "47": { 1106 | "audio": false, 1107 | "bipolar": true, 1108 | "default_dataset": "defective", 1109 | "type": "unknown" 1110 | }, 1111 | "48": { 1112 | "audio": true, 1113 | "bipolar": true, 1114 | "default_dataset": "training", 1115 | "type": "mocha-7" 1116 | }, 1117 | "50": { 1118 | "audio": true, 1119 | "bipolar": true, 1120 | "default_dataset": "training", 1121 | "type": "mocha-8" 1122 | }, 1123 | "51": { 1124 | "audio": false, 1125 | "bipolar": true, 1126 | "default_dataset": "defective", 1127 | "type": "mocha-9" 1128 | }, 1129 | "52": { 1130 | "audio": true, 1131 | "bipolar": true, 1132 | "default_dataset": "validation", 1133 | "type": "picture_description_tree_reduced" 1134 | }, 1135 | "53": { 1136 | "audio": true, 1137 | "bipolar": true, 1138 | "default_dataset": "training", 1139 | "type": "picture_description_tree_reduced" 1140 | }, 1141 | "54": { 1142 | "audio": true, 1143 | "bipolar": true, 1144 | "default_dataset": "validation", 1145 | "type": "picture_description_kitchen_reduced" 1146 | }, 1147 | "55": { 1148 | "audio": true, 1149 | "bipolar": true, 1150 | "default_dataset": "training", 1151 | "type": "picture_description_kitchen_reduced" 1152 | }, 1153 | "56": { 1154 | "audio": true, 1155 | "bipolar": true, 1156 | "default_dataset": "validation", 1157 | "type": "picture_description_birthday_reduced" 1158 | }, 1159 | "59": { 1160 | "audio": true, 1161 | "bipolar": true, 1162 | "default_dataset": "training", 1163 | "type": "picture_description_reduced" 1164 | }, 1165 | "60": { 1166 | "audio": true, 1167 | "bipolar": true, 1168 | "default_dataset": "training", 1169 | "type": "picture_description_kitchen_reduced" 1170 | }, 1171 | "61": { 1172 | "audio": true, 1173 | "bipolar": true, 1174 | "default_dataset": "testing", 1175 | "type": "picture_description_tree_reduced" 1176 | }, 1177 | "62": { 1178 | "audio": true, 1179 | "bipolar": true, 1180 | "default_dataset": "testing", 1181 | "type": "picture_description_birthday_reduced" 1182 | }, 1183 | "63": { 1184 | "audio": true, 1185 | "bipolar": true, 1186 | "default_dataset": "training", 1187 | "type": "picture_description_reduced" 1188 | }, 1189 | "64": { 1190 | "audio": true, 1191 | "bipolar": true, 1192 | "default_dataset": "training", 1193 | "type": "picture_description_reduced" 1194 | }, 1195 | "65": { 1196 | "audio": true, 1197 | "bipolar": true, 1198 | "default_dataset": "training", 1199 | "type": "mocha-1" 1200 | }, 1201 | "66": { 1202 | "audio": false, 1203 | "bipolar": true, 1204 | "default_dataset": "extra", 1205 | "type": "mocha-2" 1206 | }, 1207 | "69": { 1208 | "audio": false, 1209 | "bipolar": true, 1210 | "default_dataset": "extra", 1211 | "type": "mocha-3" 1212 | }, 1213 | "70": { 1214 | "audio": true, 1215 | "bipolar": true, 1216 | "default_dataset": "training", 1217 | "type": "picture_description_reduced" 1218 | }, 1219 | "73": { 1220 | "audio": true, 1221 | "bipolar": true, 1222 | "default_dataset": "training", 1223 | "type": "picture_description_kitchen_reduced" 1224 | }, 1225 | "74": { 1226 | "audio": true, 1227 | "bipolar": true, 1228 | "default_dataset": "training", 1229 | "type": "picture_description_kitchen_reduced" 1230 | }, 1231 | "75": 
{ 1232 | "audio": true, 1233 | "bipolar": true, 1234 | "default_dataset": "training", 1235 | "type": "picture_description_tree_reduced" 1236 | }, 1237 | "76": { 1238 | "audio": true, 1239 | "bipolar": true, 1240 | "default_dataset": "training", 1241 | "type": "picture_description_birthday_reduced" 1242 | }, 1243 | "77": { 1244 | "audio": true, 1245 | "bipolar": true, 1246 | "default_dataset": "training", 1247 | "type": "picture_description_tree_reduced" 1248 | }, 1249 | "83": { 1250 | "audio": true, 1251 | "bipolar": true, 1252 | "default_dataset": "training", 1253 | "type": "picture_description_reduced" 1254 | }, 1255 | "92": { 1256 | "audio": true, 1257 | "bipolar": true, 1258 | "default_dataset": "training", 1259 | "type": "picture_description_kitchen_reduced_mimed" 1260 | }, 1261 | "93": { 1262 | "audio": true, 1263 | "bipolar": true, 1264 | "default_dataset": "training", 1265 | "type": "picture_description_tree_reduced_mimed" 1266 | }, 1267 | "94": { 1268 | "audio": true, 1269 | "bipolar": true, 1270 | "default_dataset": "training", 1271 | "type": "picture_description_tree_reduced_mimed" 1272 | }, 1273 | "95": { 1274 | "audio": true, 1275 | "bipolar": true, 1276 | "default_dataset": "training", 1277 | "type": "picture_description_birthday_reduced_mimed" 1278 | }, 1279 | "96": { 1280 | "audio": false, 1281 | "bipolar": true, 1282 | "default_dataset": "defective", 1283 | "type": "picture_description_kitchen_reduced_mimed" 1284 | }, 1285 | "97": { 1286 | "audio": true, 1287 | "bipolar": true, 1288 | "default_dataset": "training", 1289 | "type": "picture_description_kitchen_reduced_mimed" 1290 | }, 1291 | "98": { 1292 | "audio": true, 1293 | "bipolar": true, 1294 | "default_dataset": "training", 1295 | "type": "picture_description_tree_reduced_mimed" 1296 | }, 1297 | "99": { 1298 | "audio": true, 1299 | "bipolar": true, 1300 | "default_dataset": "training", 1301 | "type": "picture_description_kitchen_reduced_mimed" 1302 | }, 1303 | "100": { 1304 | "audio": true, 1305 | "bipolar": true, 1306 | "default_dataset": "training", 1307 | "type": "picture_description_birthday_reduced_mimed" 1308 | }, 1309 | "101": { 1310 | "audio": true, 1311 | "bipolar": true, 1312 | "default_dataset": "training", 1313 | "type": "picture_description_reduced" 1314 | }, 1315 | "102": { 1316 | "audio": false, 1317 | "bipolar": true, 1318 | "default_dataset": "extra", 1319 | "type": "mocha-4" 1320 | }, 1321 | "104": { 1322 | "audio": false, 1323 | "bipolar": true, 1324 | "default_dataset": "extra", 1325 | "type": "mocha-5" 1326 | }, 1327 | "105": { 1328 | "audio": false, 1329 | "bipolar": true, 1330 | "default_dataset": "defective", 1331 | "type": "unknown" 1332 | }, 1333 | "106": { 1334 | "audio": false, 1335 | "bipolar": true, 1336 | "default_dataset": "defective", 1337 | "type": "unknown" 1338 | }, 1339 | "108": { 1340 | "audio": true, 1341 | "bipolar": true, 1342 | "default_dataset": "validation", 1343 | "type": "picture_description_birthday_reduced" 1344 | }, 1345 | "109": { 1346 | "audio": true, 1347 | "bipolar": true, 1348 | "default_dataset": "training", 1349 | "type": "picture_description_birthday_reduced" 1350 | }, 1351 | "110": { 1352 | "audio": true, 1353 | "bipolar": true, 1354 | "default_dataset": "validation", 1355 | "type": "picture_description_tree_reduced" 1356 | }, 1357 | "111": { 1358 | "audio": true, 1359 | "bipolar": true, 1360 | "default_dataset": "validation", 1361 | "type": "picture_description_kitchen_reduced" 1362 | }, 1363 | "112": { 1364 | "audio": true, 1365 | "bipolar": true, 1366 | 
"default_dataset": "training", 1367 | "type": "picture_description_birthday_reduced" 1368 | }, 1369 | "113": { 1370 | "audio": true, 1371 | "bipolar": true, 1372 | "default_dataset": "training", 1373 | "type": "picture_description_kitchen_reduced_tiny" 1374 | }, 1375 | "114": { 1376 | "audio": true, 1377 | "bipolar": true, 1378 | "default_dataset": "training", 1379 | "type": "picture_description_kitchen_reduced_tiny" 1380 | }, 1381 | "115": { 1382 | "audio": true, 1383 | "bipolar": true, 1384 | "default_dataset": "training", 1385 | "type": "picture_description_kitchen_reduced_mimed" 1386 | } 1387 | }, 1388 | "405": { 1389 | "5": { 1390 | "audio": false, 1391 | "bipolar": true, 1392 | "default_dataset": "training", 1393 | "type": "demo_2_sentences" 1394 | }, 1395 | "6": { 1396 | "audio": false, 1397 | "bipolar": true, 1398 | "default_dataset": "training", 1399 | "type": "demo_2_sentences" 1400 | }, 1401 | "13": { 1402 | "audio": false, 1403 | "bipolar": true, 1404 | "default_dataset": "training", 1405 | "type": "picture_description_kitchen" 1406 | }, 1407 | "14": { 1408 | "audio": false, 1409 | "bipolar": true, 1410 | "default_dataset": "training", 1411 | "type": "picture_description_tree" 1412 | }, 1413 | "15": { 1414 | "audio": false, 1415 | "bipolar": true, 1416 | "default_dataset": "training", 1417 | "type": "picture_description_birthday" 1418 | }, 1419 | "24": { 1420 | "audio": false, 1421 | "bipolar": true, 1422 | "default_dataset": "training", 1423 | "type": "picture_description_kitchen" 1424 | }, 1425 | "25": { 1426 | "audio": false, 1427 | "bipolar": true, 1428 | "default_dataset": "training", 1429 | "type": "picture_description_tree" 1430 | }, 1431 | "26": { 1432 | "audio": false, 1433 | "bipolar": true, 1434 | "default_dataset": "training", 1435 | "type": "picture_description_birthday" 1436 | }, 1437 | "32": { 1438 | "audio": false, 1439 | "bipolar": true, 1440 | "default_dataset": "training", 1441 | "type": "picture_description_kitchen" 1442 | }, 1443 | "33": { 1444 | "audio": false, 1445 | "bipolar": true, 1446 | "default_dataset": "training", 1447 | "type": "picture_description_tree" 1448 | }, 1449 | "34": { 1450 | "audio": false, 1451 | "bipolar": true, 1452 | "default_dataset": "training", 1453 | "type": "picture_description_birthday" 1454 | }, 1455 | "38": { 1456 | "audio": false, 1457 | "bipolar": true, 1458 | "default_dataset": "training", 1459 | "type": "edgefield_sentences_1" 1460 | }, 1461 | "39": { 1462 | "audio": false, 1463 | "bipolar": true, 1464 | "default_dataset": "training", 1465 | "type": "edgefield_sentences_2" 1466 | }, 1467 | "40": { 1468 | "audio": false, 1469 | "bipolar": true, 1470 | "default_dataset": "validation", 1471 | "type": "edgefield_sentences_3" 1472 | }, 1473 | "47": { 1474 | "audio": false, 1475 | "bipolar": true, 1476 | "default_dataset": "training", 1477 | "type": "edgefield_sentences_4" 1478 | }, 1479 | "48": { 1480 | "audio": false, 1481 | "bipolar": true, 1482 | "default_dataset": "training", 1483 | "type": "edgefield_sentences_5" 1484 | }, 1485 | "51": { 1486 | "audio": false, 1487 | "bipolar": true, 1488 | "default_dataset": "training", 1489 | "type": "edgefield_sentences_7" 1490 | }, 1491 | "52": { 1492 | "audio": false, 1493 | "bipolar": true, 1494 | "default_dataset": "training", 1495 | "type": "edgefield_sentences_8" 1496 | }, 1497 | "53": { 1498 | "audio": false, 1499 | "bipolar": true, 1500 | "default_dataset": "training", 1501 | "type": "edgefield_sentences_9" 1502 | }, 1503 | "54": { 1504 | "audio": false, 1505 | "bipolar": 
true, 1506 | "default_dataset": "validation", 1507 | "type": "picture_description_kitchen" 1508 | }, 1509 | "55": { 1510 | "audio": false, 1511 | "bipolar": true, 1512 | "default_dataset": "validation", 1513 | "type": "picture_description_tree" 1514 | }, 1515 | "56": { 1516 | "audio": false, 1517 | "bipolar": true, 1518 | "default_dataset": "validation", 1519 | "type": "picture_description_birthday" 1520 | }, 1521 | "75": { 1522 | "audio": false, 1523 | "bipolar": true, 1524 | "default_dataset": "testing", 1525 | "type": "picture_description_kitchen" 1526 | }, 1527 | "76": { 1528 | "audio": false, 1529 | "bipolar": true, 1530 | "default_dataset": "testing", 1531 | "type": "picture_description_tree" 1532 | }, 1533 | "77": { 1534 | "audio": false, 1535 | "bipolar": true, 1536 | "default_dataset": "testing", 1537 | "type": "picture_description_birthday" 1538 | } 1539 | } 1540 | } --------------------------------------------------------------------------------
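
Note on the file above: each top-level key in block_breakdowns.json is a subject ID, each nested key is a block number, and each block records an "audio" flag, a "bipolar" flag, a "default_dataset" partition label ("training", "validation", "testing", or a non-standard label such as "defective" or "extra"), and a task "type". Below is a minimal sketch of how such a file could be grouped by partition; the function name, file path, and subject ID are illustrative only and are not part of the package's API.

import json
from collections import defaultdict


def blocks_by_partition(path, subject_id):
    """Group a subject's blocks by their default_dataset label.

    Returns a dict like {'training': [...], 'validation': [...], ...},
    including any non-standard labels such as 'defective' or 'extra'.
    """
    with open(path, 'r') as f:
        breakdowns = json.load(f)

    partitions = defaultdict(list)
    for block, info in breakdowns[str(subject_id)].items():
        partitions[info['default_dataset']].append(int(block))
    return dict(partitions)


# Example usage (path and subject ID are illustrative):
# parts = blocks_by_partition('ecog2txt/auxiliary/EFC/block_breakdowns.json', 405)
# training_blocks = sorted(parts.get('training', []))

Grouping this way makes it easy to drop blocks labeled "defective" or "extra" when assembling the standard training/validation/testing splits.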