├── .gitignore ├── LICENSE ├── README.md ├── chordrec ├── __init__.py ├── augmenters.py ├── chroma.py ├── classify.py ├── data.py ├── experiment.py ├── features.py ├── models │ ├── __init__.py │ ├── avg_gap_feature.py │ ├── blocks.py │ ├── chroma_dnn.py │ ├── crf.py │ ├── dnn.py │ └── rnn.py ├── targets.py └── test.py ├── experiments ├── feature_cache │ └── README ├── ismir2016 │ ├── chroma.yaml │ ├── chroma_wlog.yaml │ ├── data │ ├── deep_chroma.yaml │ ├── feature_cache │ ├── logfiltspec.yaml │ └── run.sh ├── madmom2016 │ ├── README.md │ ├── chord_feature_convnet.yaml │ ├── create_crf_init_params.py │ ├── create_madmom_convnet_model.py │ ├── create_madmom_crf_model.py │ ├── create_madmom_deep_chroma_model.py │ ├── crf_chord_rec.yaml │ └── deep_chroma.yaml └── mlsp2016 │ ├── README.md │ ├── convnet.yaml │ ├── create_crf_init_params.py │ ├── crf.yaml │ ├── feature_cache │ └── to_madmom_crf.py └── tools ├── evaluate.py ├── extract_perfect_chroma.py └── post_process.py /.gitignore: -------------------------------------------------------------------------------- 1 | ### Python template 2 | # Byte-compiled / optimized / DLL files 3 | __pycache__/ 4 | *.py[cod] 5 | *$py.class 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | env/ 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *,cover 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | 55 | # Sphinx documentation 56 | docs/_build/ 57 | 58 | # PyBuilder 59 | target/ 60 | ### Vim template 61 | [._]*.s[a-w][a-z] 62 | [._]s[a-w][a-z] 63 | *.un~ 64 | Session.vim 65 | .netrwhist 66 | *~ 67 | ### IPythonNotebook template 68 | # Temporary data 69 | .ipynb_checkpoints/ 70 | ### JetBrains template 71 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio 72 | 73 | *.iml 74 | 75 | ## Directory-based project format: 76 | .idea/ 77 | # if you remove the above rule, at least ignore the following: 78 | 79 | # User-specific stuff: 80 | # .idea/workspace.xml 81 | # .idea/tasks.xml 82 | # .idea/dictionaries 83 | 84 | # Sensitive or high-churn files: 85 | # .idea/dataSources.ids 86 | # .idea/dataSources.xml 87 | # .idea/sqlDataSources.xml 88 | # .idea/dynamic.xml 89 | # .idea/uiDesigner.xml 90 | 91 | # Gradle: 92 | # .idea/gradle.xml 93 | # .idea/libraries 94 | 95 | # Mongo Explorer plugin: 96 | # .idea/mongoSettings.xml 97 | 98 | ## File-based project format: 99 | *.ipr 100 | *.iws 101 | 102 | ## Plugin-specific files: 103 | 104 | # IntelliJ 105 | /out/ 106 | 107 | # mpeltonen/sbt-idea plugin 108 | .idea_modules/ 109 | 110 | # JIRA plugin 111 | atlassian-ide-plugin.xml 112 | 113 | # Crashlytics plugin (for Android Studio and IntelliJ) 114 | com_crashlytics_export_strings.xml 115 | crashlytics.properties 116 | crashlytics-build.properties 117 | 118 | # Created by .ignore support plugin (hsz.mobi) 119 | 120 | # Own ignores 121 | experiments/data 122 | notes/ 123 | 
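# keep the feature cache directory tracked via its README, but ignore any cached features in it: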
experiments/feature_cache/* 124 | !experiments/feature_cache/README -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 Filip Korzeniowski 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # chordrec 2 | 3 | This is the code I use for my chord recognition experiments. 4 | 5 | ## Requirements & Installation 6 | 7 | I assume a standard "scientific Python" environment with NumPy, SciPy, etc. 8 | Additionally, the following libraries are required: 9 | 10 | - [Theano](https://github.com/Theano/Theano) 11 | - [Lasagne](https://github.com/Lasagne/Lasagne) 12 | - [dmgr](https://github.com/fdlm/dmgr) 13 | - [nn](https://github.com/fdlm/nn) 14 | - [Spaghetti](https://github.com/fdlm/Spaghetti) 15 | - [madmom](https://github.com/CPJKU/madmom)* 16 | - [librosa](https://github.com/librosa/librosa) (version 0.4.1)* 17 | - [mir_eval](https://github.com/craffel/mir_eval)* 18 | - [pyyaml](https://bitbucket.org/xi/pyyaml)* 19 | - [sacred](https://github.com/IDSIA/sacred)* 20 | 21 | Packages marked with a * can be installed using `pip`; the others are either 22 | not available on PyPI or are best installed from source. If I missed any 23 | dependency, please let me know. 24 | 25 | Once you have all libraries installed, clone this repository and add its path 26 | to the `$PYTHONPATH` environment variable. 27 | 28 | ## Data Setup 29 | 30 | Different experiments might require different datasets to be present (you can 31 | find detailed information on the sites describing the experiments on my 32 | [website](http://fdlm.github.io)). The directory structure for each dataset, 33 | however, is the same. 34 | 35 | Put all datasets into respective subdirectories under 36 | `chordrec/experiments/data`. Each dataset has to contain three types of data: 37 | audio files in `.flac` format, corresponding chord annotations in lab format 38 | with the file extension `.chords`, and the cross-validation split definitions. 39 | Audio and annotation files can be organised in a directory hierarchy, but do 40 | not need to be; the programs will look for any `.flac` and `.chords` files in all 41 | directories recursively. However, the split definition 42 | files must be in a `splits` sub-directory in each dataset directory (e.g. 43 | `beatles/splits`). File names of audio and annotation files must correspond to 44 | the names given in the split definition files. 45 |
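For reference, a `.chords` annotation file is a plain-text lab file with one
`start end label` triple per line, with times given in seconds. The excerpt
below is made up purely for illustration:

```
0.000000    2.612267    N
2.612267    11.459070   E
11.459070   12.921927   A
```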
46 | The `data` directory including some example datasets should look like this; 47 | the internal structure of the `queen`, `robbie_williams`, `rwc` and `zweieck` 48 | directories follows that of `beatles`: 49 | 50 | ``` 51 | experiments 52 | +-- data 53 | +-- beatles 54 | +-- *.flac 55 | +-- *.chords 56 | +-- splits 57 | +-- 8-fold_cv_album_distributed_*.fold 58 | +-- queen 59 | +-- robbie_williams 60 | +-- rwc 61 | +-- zweieck 62 | ``` 63 | 64 | Refer to the websites for each individual experiment for more information on 65 | the data and how to obtain it. 66 | 67 | ## Experiments 68 | 69 | The `experiments` sub-directory contains scripts and configurations to 70 | reproduce the results of all my papers on chord recognition (plus some more). 71 | Since neural networks are initialised randomly, and I usually do not save the 72 | seed, the results might differ slightly from the ones in the papers. 73 | 74 | - `experiments/ismir2016`: Reproduces the final results for all features 75 | compared in the following paper: 76 | 77 | F. Korzeniowski and G. Widmer. ["Feature Learning for Chord Recognition: The 78 | Deep Chroma Extractor"](https://drive.google.com/open?id=0B0gBhdh1fIPKZUwtdnJpeDBjdlk). In *Proceedings of the 17th International Society 79 | for Music Information Retrieval Conference (ISMIR 2016)*, New York, USA. 80 | 81 | See [here](http://fdlm.github.io/post/deepchroma) for more 82 | information on the model and the necessary data. 83 | 84 | - `experiments/madmom2016`: Configurations to train the chord recognition 85 | models of the [madmom](https://github.com/CPJKU/madmom) audio processing 86 | library. 87 | 88 | - `experiments/mlsp2016`: Reproduces the results of the chord recognition 89 | system presented in the following paper: 90 | 91 | F. Korzeniowski and G. Widmer. ["A Fully Convolutional Deep Auditory Model 92 | for Musical Chord Recognition"](https://drive.google.com/open?id=0B0gBhdh1fIPKNXE5Z3VpQ2pjcE0). 93 | In *Proceedings of the IEEE International Workshop on Machine Learning for 94 | Signal Processing (MLSP 2016)*, Salerno, Italy, 2016. 95 | 96 | See [here](http://fdlm.github.io/post/auditorymodel) for more 97 | information on the model and the necessary data. 98 | -------------------------------------------------------------------------------- /chordrec/__init__.py: -------------------------------------------------------------------------------- 1 | from . import (augmenters, chroma, classify, data, experiment, features, 2 | targets, test) 3 | -------------------------------------------------------------------------------- /chordrec/augmenters.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy.ndimage import shift 3 | import random 4 | from targets import one_hot 5 | 6 | 7 | class SemitoneShift(object): 8 | 9 | def __init__(self, p, max_shift, bins_per_semitone, 10 | target_type='chords_maj_min'): 11 | """ 12 | Augmenter that shifts a spectrum with logarithmically spaced 13 | frequency bins by whole semitones.
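Shifting is done by rolling the data along the frequency axis; the chord (or chroma) targets are adapted to match the shifted roots.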
14 | 15 | :param p: percentage of data to be shifted 16 | :param max_shift: maximum number of semitones to shift 17 | :param bins_per_semitone: number of spectrogram bins per semitone 18 | :param target_type: specifies target type 19 | """ 20 | self.p = p 21 | self.max_shift = max_shift 22 | self.bins_per_semitone = bins_per_semitone 23 | 24 | if target_type == 'chords_maj_min': 25 | self.adapt_targets = self._adapt_targets_chords_maj_min 26 | elif target_type == 'chroma': 27 | self.adapt_targets = self._adapt_targets_chroma 28 | 29 | def _adapt_targets_chords_maj_min(self, targets, shifts): 30 | chord_classes = targets.argmax(-1) 31 | no_chord_class = targets.shape[-1] - 1 32 | no_chords = (chord_classes == no_chord_class) 33 | chord_roots = chord_classes % 12 34 | chord_majmin = chord_classes / 12 35 | 36 | new_chord_roots = (chord_roots + shifts) % 12 37 | new_chord_classes = new_chord_roots + chord_majmin * 12 38 | new_chord_classes[no_chords] = no_chord_class 39 | new_targets = one_hot(new_chord_classes, no_chord_class + 1) 40 | return new_targets 41 | 42 | def _adapt_targets_chroma(self, targets, shifts): 43 | new_targets = np.empty_like(targets) 44 | for i in range(len(targets)): 45 | new_targets[i] = np.roll(targets[i], shifts[i], axis=-1) 46 | return new_targets 47 | 48 | def __call__(self, batch_iterator): 49 | """ 50 | :param batch_iterator: data iterator that yields the data to be 51 | augmented 52 | :return: augmented data/target pairs 53 | """ 54 | 55 | for data, targets in batch_iterator: 56 | batch_size = len(data) 57 | 58 | shifts = np.random.randint(-self.max_shift, 59 | self.max_shift + 1, batch_size) 60 | 61 | # zero out shifts for 1-p percentage 62 | no_shift = random.sample(range(batch_size), 63 | int(batch_size * (1 - self.p))) 64 | shifts[no_shift] = 0 65 | 66 | new_targets = self.adapt_targets(targets, shifts) 67 | 68 | 69 | new_data = np.empty_like(data) 70 | for i in range(batch_size): 71 | # TODO: remove data from upper and lower parts that got 72 | # rolled (?) 
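# a shift of k semitones moves the spectrum by k * bins_per_semitone bins; np.roll wraps the shifted-out bins around at the edges, hence the TODO above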
73 | new_data[i] = np.roll( 74 | data[i], shifts[i] * self.bins_per_semitone, axis=-1) 75 | 76 | yield new_data, new_targets 77 | 78 | 79 | class Detuning(object): 80 | 81 | def __init__(self, p, max_shift, bins_per_semitone): 82 | """ 83 | Augmenter that shifts a spectrogram with logarithmically spaced 84 | frequency bins by up to half a semitone. 85 | :param p: percentage of data to be shifted 86 | :param max_shift: maximum fraction of a semitone to shift (< 0.5) 87 | :param bins_per_semitone: number of spectrogram bins per semitone 88 | """ 89 | if max_shift >= 0.5: 90 | raise ValueError('Detuning only works up to half a semitone!') 91 | self.p = p 92 | self.max_shift = max_shift 93 | self.bins_per_semitone = bins_per_semitone 94 | 95 | def __call__(self, batch_iterator): 96 | """ 97 | :param batch_iterator: data iterator that yields the data to be 98 | augmented 99 | :return: augmented data/target pairs 100 | """ 101 | for data, targets in batch_iterator: 102 | batch_size = len(data) 103 | 104 | shifts = np.random.rand(batch_size) * 2 * self.max_shift - \ 105 | self.max_shift 106 | 107 | # zero out shifts for 1-p percentage 108 | no_shift = random.sample(range(batch_size), 109 | int(batch_size * (1 - self.p))) 110 | shifts[no_shift] = 0 111 | 112 | new_data = np.empty_like(data) 113 | for i in range(batch_size): 114 | new_data[i] = shift( 115 | data[i], (shifts[i] * self.bins_per_semitone, 0)) 116 | 117 | yield new_data, targets 118 | 119 | 120 | def create_augmenters(augmentation): 121 | return [globals()[name](**params) 122 | for name, params in augmentation.iteritems()] 123 | 124 | 125 | def add_sacred_config(ex): 126 | ex.add_named_config( 127 | 'augmentation', 128 | augmentation=dict( 129 | SemitoneShift=dict( 130 | p=1.0, 131 | max_shift=4, 132 | bins_per_semitone=2 133 | ), 134 | Detuning=dict( 135 | p=1.0, 136 | max_shift=0.4, 137 | bins_per_semitone=2 138 | ) 139 | ) 140 | ) 141 | -------------------------------------------------------------------------------- /chordrec/chroma.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import os 4 | import numpy as np 5 | import yaml 6 | 7 | from nn.utils import Colors 8 | 9 | import data 10 | import dmgr 11 | import features 12 | import nn 13 | import targets 14 | import test 15 | from experiment import TempDir, create_optimiser, setup 16 | from models import chroma_dnn 17 | 18 | 19 | def compute_chroma(process_fn, agg_dataset, dest_dir, batch_size, 20 | extension='.features.npy'): 21 | if not os.path.exists(dest_dir): 22 | os.makedirs(dest_dir) 23 | 24 | chroma_files = [] 25 | 26 | for ds_idx in range(agg_dataset.n_datasources): 27 | ds = agg_dataset.datasource(ds_idx) 28 | 29 | chromas = [] 30 | 31 | for data, _ in dmgr.iterators.iterate_batches(ds, batch_size, 32 | randomise=False, 33 | expand=False): 34 | chromas.append(process_fn(data)) 35 | 36 | chromas = np.concatenate(chromas) 37 | chroma_file = os.path.join(dest_dir, ds.name + extension) 38 | np.save(chroma_file, chromas) 39 | chroma_files.append(chroma_file) 40 | 41 | return chroma_files 42 | 43 | 44 | # Initialise Sacred experiment 45 | ex = setup('Deep Chroma Extractor') 46 | 47 | 48 | # Standard config 49 | @ex.config 50 | def _cfg(): 51 | observations = 'results' 52 | feature_extractor = None 53 | target = None 54 | chroma_network = None 55 | optimiser = None 56 | training = None 57 | regularisation = None 58 | testing = None 59 | augmentation = None 60 | 61 | 62 | # add models 63 |
chroma_dnn.add_sacred_config(ex) 64 | 65 | 66 | @ex.automain 67 | def main(datasource, feature_extractor, target, chroma_network, 68 | optimiser, training, regularisation, augmentation, testing): 69 | 70 | err = False 71 | if chroma_network is None: 72 | print(Colors.red('ERROR: Specify a chroma extractor!')) 73 | err = True 74 | if feature_extractor is None: 75 | print(Colors.red('ERROR: Specify a feature extractor!')) 76 | err = True 77 | if target is None: 78 | print(Colors.red('ERROR: Specify a target!')) 79 | err = True 83 | if err: 84 | return 1 85 | 86 | # intermediate target is always chroma vectors 87 | target_chroma = targets.ChromaTarget( 88 | feature_extractor['params']['fps']) 89 | 90 | target_chords = targets.create_target( 91 | feature_extractor['params']['fps'], 92 | target 93 | ) 94 | 95 | if not isinstance(datasource['test_fold'], list): 96 | datasource['test_fold'] = [datasource['test_fold']] 97 | 98 | if not isinstance(datasource['val_fold'], list): 99 | datasource['val_fold'] = [datasource['val_fold']] 100 | 101 | # if no validation folds are specified, always use the 102 | # 'None' and determine validation fold automatically 103 | if datasource['val_fold'][0] is None: 104 | datasource['val_fold'] *= len(datasource['test_fold']) 105 | 106 | if len(datasource['test_fold']) != len(datasource['val_fold']): 107 | print(Colors.red('ERROR: Need same number of validation and ' 108 | 'test folds')) 109 | return 1 110 | 111 | all_pred_files = [] 112 | all_gt_files = [] 113 | 114 | print(Colors.magenta('\nStarting experiment ' + ex.observers[0].hash())) 115 | 116 | with TempDir() as exp_dir: 117 | for test_fold, val_fold in zip(datasource['test_fold'], 118 | datasource['val_fold']): 119 | print('') 120 | print(Colors.yellow( 121 | '=' * 20 + ' FOLD {} '.format(test_fold) + '=' * 20)) 122 | # Load data sets 123 | print(Colors.red('\nLoading data...\n')) 124 | 125 | feature_ext = features.create_extractor(feature_extractor, 126 | test_fold) 127 | train_set, val_set, test_set, gt_files = data.create_datasources( 128 | dataset_names=datasource['datasets'], 129 | preprocessors=datasource['preprocessors'], 130 | compute_features=feature_ext, 131 | compute_targets=target_chroma, 132 | context_size=datasource['context_size'], 133 | test_fold=test_fold, 134 | val_fold=val_fold, 135 | cached=datasource['cached'] 136 | ) 137 | 138 | if testing['test_on_val']: 139 | test_set = val_set 140 | 141 | print(Colors.blue('Train Set:')) 142 | print('\t', train_set) 143 | 144 | print(Colors.blue('Validation Set:')) 145 | print('\t', val_set) 146 | 147 | print(Colors.blue('Test Set:')) 148 | print('\t', test_set) 149 | print('') 150 | 151 | # build network 152 | print(Colors.red('Building network...\n')) 153 | 154 | model_type = globals()[chroma_network['model']['type']] 155 | mdl = model_type.build_model(in_shape=train_set.dshape, 156 | out_size_chroma=train_set.tshape[0], 157 | out_size=target_chords.num_classes, 158 | model=chroma_network['model']) 159 | 160 | chroma_neural_net = mdl['chroma_network'] 161 | chord_neural_net = mdl['chord_network'] 162 | input_var = mdl['input_var'] 163 | chroma_target_var = mdl['chroma_target_var'] 164 | chord_target_var = mdl['chord_target_var'] 165 | chroma_loss_fn = mdl['chroma_loss_fn'] 166 | chord_loss_fn = mdl['chord_loss_fn'] 167 | 168 | chroma_opt, chroma_lrs = create_optimiser(chroma_network['optimiser']) 169 | chord_opt, chord_lrs =
create_optimiser(optimiser) 170 | 171 | chroma_train_fn = nn.compile_train_fn( 172 | chroma_neural_net, input_var, chroma_target_var, 173 | loss_fn=chroma_loss_fn, opt_fn=chroma_opt, 174 | **chroma_network['regularisation'] 175 | ) 176 | 177 | chroma_test_fn = nn.compile_test_func( 178 | chroma_neural_net, input_var, chroma_target_var, 179 | loss_fn=chroma_loss_fn, 180 | **chroma_network['regularisation'] 181 | ) 182 | 183 | chroma_process_fn = nn.compile_process_func( 184 | chroma_neural_net, input_var 185 | ) 186 | 187 | chord_train_fn = nn.compile_train_fn( 188 | chord_neural_net, input_var, chord_target_var, 189 | loss_fn=chord_loss_fn, opt_fn=chord_opt, tags={'chord': True}, 190 | **regularisation 191 | ) 192 | 193 | chord_test_fn = nn.compile_test_func( 194 | chord_neural_net, input_var, chord_target_var, 195 | loss_fn=chord_loss_fn, tags={'chord': True}, 196 | **regularisation 197 | ) 198 | 199 | chord_process_fn = nn.compile_process_func( 200 | chord_neural_net, input_var 201 | ) 202 | 203 | print(Colors.blue('Chroma Network:')) 204 | print(nn.to_string(chroma_neural_net)) 205 | print('') 206 | 207 | print(Colors.blue('Chords Network:')) 208 | print(nn.to_string(chord_neural_net)) 209 | print('') 210 | 211 | print(Colors.red('Starting training chroma network...\n')) 212 | 213 | chroma_training = chroma_network['training'] 214 | chroma_train_batches, chroma_validation_batches = \ 215 | model_type.create_iterators(train_set, val_set, 216 | chroma_training, augmentation) 217 | crm_train_losses, crm_val_losses, _, crm_val_accs = nn.train( 218 | network=chroma_neural_net, 219 | train_fn=chroma_train_fn, train_batches=chroma_train_batches, 220 | test_fn=chroma_test_fn, 221 | validation_batches=chroma_validation_batches, 222 | threads=10, callbacks=[chroma_lrs] if chroma_lrs else [], 223 | num_epochs=chroma_training['num_epochs'], 224 | early_stop=chroma_training['early_stop'], 225 | early_stop_acc=chroma_training['early_stop_acc'], 226 | acc_func=nn.nn.elemwise_acc 227 | ) 228 | 229 | # we need to create a new dataset with a new target (chords) 230 | del train_set 231 | del val_set 232 | del test_set 233 | del gt_files 234 | 235 | train_set, val_set, test_set, gt_files = data.create_datasources( 236 | dataset_names=datasource['datasets'], 237 | preprocessors=datasource['preprocessors'], 238 | compute_features=feature_ext, 239 | compute_targets=target_chords, 240 | context_size=datasource['context_size'], 241 | test_fold=test_fold, 242 | val_fold=val_fold, 243 | cached=datasource['cached'] 244 | ) 245 | 246 | if testing['test_on_val']: 247 | test_set = val_set 248 | 249 | print(Colors.blue('Train Set:')) 250 | print('\t', train_set) 251 | 252 | print(Colors.blue('Validation Set:')) 253 | print('\t', val_set) 254 | 255 | print(Colors.blue('Test Set:')) 256 | print('\t', test_set) 257 | print('') 258 | 259 | print(Colors.red('Starting training chord network...\n')) 260 | 261 | chord_train_batches, chord_validation_batches = \ 262 | model_type.create_iterators(train_set, val_set, training, 263 | augmentation) 264 | 265 | crd_train_losses, crd_val_losses, _, crd_val_accs = nn.train( 266 | network=chord_neural_net, 267 | train_fn=chord_train_fn, train_batches=chord_train_batches, 268 | test_fn=chord_test_fn, 269 | validation_batches=chord_validation_batches, 270 | threads=10, callbacks=[chord_lrs] if chord_lrs else [], 271 | num_epochs=training['num_epochs'], 272 | early_stop=training['early_stop'], 273 | early_stop_acc=training['early_stop_acc'], 274 | ) 275 | 276 | 
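# both networks are trained at this point; what follows saves the parameters, predicts chord labels on the test set and stores the computed chroma vectors as artifacts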
print(Colors.red('\nStarting testing...\n')) 277 | 278 | param_file = os.path.join( 279 | exp_dir, 'params_fold_{}.pkl'.format(test_fold)) 280 | nn.save_params(chord_neural_net, param_file) 281 | ex.add_artifact(param_file) 282 | 283 | pred_files = test.compute_labeling( 284 | chord_process_fn, target_chords, test_set, dest_dir=exp_dir, 285 | use_mask=False, batch_size=testing['batch_size'] 286 | ) 287 | 288 | # compute chroma vectors for the test set 289 | # TODO: replace this with experiment.compute_features 290 | for cf in compute_chroma(chroma_process_fn, test_set, 291 | batch_size=training['batch_size'], 292 | dest_dir=exp_dir): 293 | ex.add_artifact(cf) 294 | 295 | test_gt_files = dmgr.files.match_files( 296 | pred_files, test.PREDICTION_EXT, gt_files, data.GT_EXT 297 | ) 298 | 299 | all_pred_files += pred_files 300 | all_gt_files += test_gt_files 301 | 302 | print(Colors.blue('Results:')) 303 | scores = test.compute_average_scores(test_gt_files, pred_files) 304 | test.print_scores(scores) 305 | result_file = os.path.join( 306 | exp_dir, 'results_fold_{}.yaml'.format(test_fold)) 307 | yaml.dump(dict(scores=scores, 308 | chord_train_losses=map(float, crd_train_losses), 309 | chord_val_losses=map(float, crd_val_losses), 310 | chord_val_accs=map(float, crd_val_accs), 311 | chroma_train_losses=map(float, crm_train_losses), 312 | chroma_val_losses=map(float, crm_val_losses), 313 | chroma_val_accs=map(float, crm_val_accs)), 314 | open(result_file, 'w')) 315 | ex.add_artifact(result_file) 316 | 317 | # close all files 318 | del train_set 319 | del val_set 320 | del test_set 321 | del gt_files 322 | 323 | # if there is something to aggregate 324 | if len(datasource['test_fold']) > 1: 325 | print(Colors.yellow('\nAggregated Results:\n')) 326 | scores = test.compute_average_scores(all_gt_files, all_pred_files) 327 | test.print_scores(scores) 328 | result_file = os.path.join(exp_dir, 'results.yaml') 329 | yaml.dump(dict(scores=scores), open(result_file, 'w')) 330 | ex.add_artifact(result_file) 331 | 332 | for pf in all_pred_files: 333 | ex.add_artifact(pf) 334 | 335 | print(Colors.magenta('Stopping experiment ' + ex.observers[0].hash())) 336 | -------------------------------------------------------------------------------- /chordrec/classify.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import os 4 | 5 | import yaml 6 | 7 | import data 8 | import dmgr 9 | import features 10 | import nn 11 | import targets 12 | import test 13 | 14 | from nn.utils import Colors 15 | from models import dnn, avg_gap_feature, crf, rnn 16 | from experiment import TempDir, create_optimiser, setup, compute_features 17 | 18 | # Initialise Sacred experiment 19 | ex = setup('Classify Chords') 20 | 21 | 22 | # Standard config 23 | @ex.config 24 | def _cfg(): 25 | observations = 'results' 26 | feature_extractor = None 27 | target = None 28 | model = None 29 | optimiser = None 30 | training = None 31 | regularisation = None 32 | testing = None 33 | augmentation = None 34 | 35 | 36 | # add models 37 | dnn.add_sacred_config(ex) 38 | avg_gap_feature.add_sacred_config(ex) 39 | crf.add_sacred_config(ex) 40 | rnn.add_sacred_config(ex) 41 | 42 | 43 | # add general configs 44 | @ex.named_config 45 | def learn_rate_schedule(): 46 | optimiser = dict( 47 | schedule=dict( 48 | interval=10, 49 | factor=0.5 50 | ) 51 | ) 52 | 53 | 54 | @ex.automain 55 | def main(_log, datasource, feature_extractor, target, model, optimiser, 56 | training, 
regularisation, augmentation, testing): 57 | 58 | err = False 59 | if model is None or not model or 'type' not in model: 60 | _log.error(Colors.red('Specify a model!')) 61 | err = True 62 | if feature_extractor is None: 63 | _log.error(Colors.red('Specify a feature extractor!')) 64 | err = True 65 | if target is None: 66 | _log.error(Colors.red('Specify a target!')) 67 | err = True 68 | if err: 69 | return 1 70 | 71 | target_computer = targets.create_target( 72 | feature_extractor['params']['fps'], 73 | target 74 | ) 75 | 76 | if not isinstance(datasource['test_fold'], list): 77 | datasource['test_fold'] = [datasource['test_fold']] 78 | 79 | if not isinstance(datasource['val_fold'], list): 80 | datasource['val_fold'] = [datasource['val_fold']] 81 | 82 | # if no validation folds are specified, always use the 83 | # 'None' and determine validation fold automatically 84 | if datasource['val_fold'][0] is None: 85 | datasource['val_fold'] *= len(datasource['test_fold']) 86 | 87 | if len(datasource['test_fold']) != len(datasource['val_fold']): 88 | _log.error(Colors.red('Need same number of validation and test folds')) 89 | return 1 90 | 91 | all_pred_files = [] 92 | all_gt_files = [] 93 | 94 | print(Colors.magenta('\nStarting experiment ' + ex.observers[0].hash())) 95 | 96 | with TempDir() as exp_dir: 97 | for test_fold, val_fold in zip(datasource['test_fold'], 98 | datasource['val_fold']): 99 | print('') 100 | print(Colors.yellow( 101 | '=' * 20 + ' FOLD {} '.format(test_fold) + '=' * 20)) 102 | # Load data sets 103 | print(Colors.red('\nLoading data...\n')) 104 | 105 | train_set, val_set, test_set, gt_files = data.create_datasources( 106 | dataset_names=datasource['datasets'], 107 | preprocessors=datasource['preprocessors'], 108 | compute_features=features.create_extractor(feature_extractor, 109 | test_fold), 110 | compute_targets=target_computer, 111 | context_size=datasource['context_size'], 112 | test_fold=test_fold, 113 | val_fold=val_fold, 114 | cached=datasource['cached'], 115 | ) 116 | 117 | if testing['test_on_val']: 118 | test_set = val_set 119 | 120 | print(Colors.blue('Train Set:')) 121 | print('\t', train_set) 122 | print(Colors.blue('Validation Set:')) 123 | print('\t', val_set) 124 | print(Colors.blue('Test Set:')) 125 | print('\t', test_set) 126 | print('') 127 | 128 | # build network 129 | print(Colors.red('Building network...\n')) 130 | 131 | model_type = globals()[model['type']] 132 | mdl = model_type.build_model(in_shape=train_set.dshape, 133 | out_size=train_set.tshape[0], 134 | model=model) 135 | 136 | # mandatory parts of the model 137 | neural_net = mdl['network'] 138 | input_var = mdl['input_var'] 139 | target_var = mdl['target_var'] 140 | loss_fn = mdl['loss_fn'] 141 | 142 | # optional parts 143 | mask_var = mdl.get('mask_var') 144 | feature_out = mdl.get('feature_out') 145 | 146 | train_batches, validation_batches = model_type.create_iterators( 147 | train_set, val_set, training, augmentation 148 | ) 149 | 150 | opt, lrs = create_optimiser(optimiser) 151 | 152 | train_fn = nn.compile_train_fn( 153 | neural_net, input_var, target_var, 154 | loss_fn=loss_fn, opt_fn=opt, mask_var=mask_var, 155 | **regularisation 156 | ) 157 | 158 | test_fn = nn.compile_test_func( 159 | neural_net, input_var, target_var, 160 | loss_fn=loss_fn, mask_var=mask_var, 161 | **regularisation 162 | ) 163 | 164 | process_fn = nn.compile_process_func( 165 | neural_net, input_var, mask_var=mask_var) 166 | 167 | if feature_out is not None: 168 | feature_fn = nn.compile_process_func( 169 | 
feature_out, input_var, mask_var=mask_var 170 | ) 171 | else: 172 | feature_fn = None 173 | 174 | print(Colors.blue('Neural Network:')) 175 | print(nn.to_string(neural_net)) 176 | print('') 177 | 178 | if 'param_file' in training: 179 | nn.load_params(neural_net, 180 | training['param_file'].format(test_fold)) 181 | train_losses = [] 182 | val_losses = [] 183 | val_accs = [] 184 | else: 185 | if 'init_file' in training: 186 | print('initialising') 187 | nn.load_params(neural_net, 188 | training['init_file'].format(test_fold)) 189 | print(Colors.red('Starting training...\n')) 190 | train_losses, val_losses, _, val_accs = nn.train( 191 | network=neural_net, 192 | train_fn=train_fn, train_batches=train_batches, 193 | test_fn=test_fn, validation_batches=validation_batches, 194 | threads=10, callbacks=[lrs] if lrs else [], 195 | num_epochs=training['num_epochs'], 196 | early_stop=training['early_stop'], 197 | early_stop_acc=training['early_stop_acc'] 198 | ) 199 | param_file = os.path.join( 200 | exp_dir, 'params_fold_{}.pkl'.format(test_fold)) 201 | nn.save_params(neural_net, param_file) 202 | ex.add_artifact(param_file) 203 | 204 | print(Colors.red('\nStarting testing...\n')) 205 | 206 | if feature_fn is not None: 207 | dest_dir = os.path.join(exp_dir, 208 | 'features_fold_{}'.format(test_fold)) 209 | compute_features( 210 | feature_fn, train_set, batch_size=testing['batch_size'], 211 | dest_dir=dest_dir, extension='.features.npy', 212 | use_mask=mask_var is not None) 213 | compute_features( 214 | feature_fn, val_set, batch_size=testing['batch_size'], 215 | dest_dir=dest_dir, extension='.features.npy', 216 | use_mask=mask_var is not None) 217 | compute_features( 218 | feature_fn, test_set, batch_size=testing['batch_size'], 219 | dest_dir=dest_dir, extension='.features.npy', 220 | use_mask=mask_var is not None) 221 | ex.add_artifact(dest_dir) 222 | 223 | pred_files = test.compute_labeling( 224 | process_fn, target_computer, test_set, dest_dir=exp_dir, 225 | use_mask=mask_var is not None, batch_size=testing['batch_size'] 226 | ) 227 | 228 | test_gt_files = dmgr.files.match_files( 229 | pred_files, test.PREDICTION_EXT, gt_files, data.GT_EXT 230 | ) 231 | 232 | all_pred_files += pred_files 233 | all_gt_files += test_gt_files 234 | 235 | print(Colors.blue('Results:')) 236 | scores = test.compute_average_scores(test_gt_files, pred_files) 237 | test.print_scores(scores) 238 | result_file = os.path.join( 239 | exp_dir, 'results_fold_{}.yaml'.format(test_fold)) 240 | yaml.dump(dict(scores=scores, 241 | train_losses=map(float, train_losses), 242 | val_losses=map(float, val_losses), 243 | val_accs=map(float, val_accs)), 244 | open(result_file, 'w')) 245 | ex.add_artifact(result_file) 246 | 247 | # delete datasets so disk space is free 248 | del train_set 249 | del val_set 250 | del test_set 251 | 252 | # if there is something to aggregate 253 | if len(datasource['test_fold']) > 1: 254 | print(Colors.yellow('\nAggregated Results:\n')) 255 | scores = test.compute_average_scores(all_gt_files, all_pred_files) 256 | test.print_scores(scores) 257 | result_file = os.path.join(exp_dir, 'results.yaml') 258 | yaml.dump(dict(scores=scores), open(result_file, 'w')) 259 | ex.add_artifact(result_file) 260 | 261 | for pf in all_pred_files: 262 | ex.add_artifact(pf) 263 | 264 | print(Colors.magenta('Stopping experiment ' + ex.observers[0].hash())) 265 | -------------------------------------------------------------------------------- /chordrec/data.py: 
-------------------------------------------------------------------------------- 1 | from operator import eq 2 | import os 3 | import dmgr 4 | 5 | DATA_DIR = 'data' 6 | CACHE_DIR = 'feature_cache' 7 | SRC_EXT = '.flac' 8 | GT_EXT = '.chords' 9 | 10 | 11 | def combine_files(*args): 12 | """ 13 | Combines file dictionaries as returned by the methods of Dataset. 14 | :param args: file dictionaries 15 | :return: combined file dictionaries 16 | """ 17 | if len(args) < 1: 18 | raise ValueError('Pass at least one argument!') 19 | 20 | # make sure all elements contain the same number of splits 21 | if len(set(len(a) for a in args)) > 1: 22 | raise ValueError('Arguments must contain the same number of splits!') 23 | 24 | combined = [{'feat': [], 'targ': []} for _ in range(len(args[0]))] 25 | 26 | for fs in args: 27 | for s in range(len(combined)): 28 | for t in combined[s]: 29 | combined[s][t] += fs[s][t] 30 | 31 | return combined 32 | 33 | 34 | DATASET_DEFS = { 35 | 'beatles': { 36 | 'data_dir': 'beatles', 37 | 'split_filename': '8-fold_cv_album_distributed_{}.fold' 38 | }, 39 | 'queen': { 40 | 'data_dir': 'queen', 41 | 'split_filename': '8-fold_cv_random_{}.fold' 42 | }, 43 | 'zweieck': { 44 | 'data_dir': 'zweieck', 45 | 'split_filename': '8-fold_cv_random_{}.fold' 46 | }, 47 | 'robbie_williams': { 48 | 'data_dir': 'robbie_williams', 49 | 'split_filename': '8-fold_cv_random_{}.fold' 50 | }, 51 | 'rwc': { 52 | 'data_dir': 'rwc', 53 | 'split_filename': '8-fold_cv_random_{}.fold' 54 | }, 55 | 'billboard': { 56 | 'data_dir': os.path.join('mcgill-billboard', 'unique'), 57 | 'split_filename': '8-fold_cv_random_{}.fold' 58 | } 59 | } 60 | 61 | 62 | def load_dataset(name, data_dir, feature_cache_dir, 63 | compute_features, compute_targets): 64 | 65 | assert name in DATASET_DEFS.keys(), 'Unknown dataset {}'.format(name) 66 | 67 | data_dir = os.path.join(data_dir, DATASET_DEFS[name]['data_dir']) 68 | split_filename = os.path.join(data_dir, 'splits', 69 | DATASET_DEFS[name]['split_filename']) 70 | 71 | return dmgr.Dataset( 72 | data_dir, 73 | os.path.join(feature_cache_dir, name), 74 | [split_filename.format(f) for f in range(8)], 75 | source_ext=SRC_EXT, 76 | gt_ext=GT_EXT, 77 | compute_features=compute_features, 78 | compute_targets=compute_targets, 79 | ) 80 | 81 | 82 | def create_preprocessors(preproc_defs): 83 | preprocessors = [] 84 | for pp in preproc_defs: 85 | preprocessors.append( 86 | getattr(dmgr.preprocessing, pp['name'])(**pp['params'])) 87 | return preprocessors 88 | 89 | 90 | def create_datasources(dataset_names, preprocessors, 91 | compute_features, compute_targets, context_size, 92 | data_dir=DATA_DIR, feature_cache_dir=CACHE_DIR, 93 | test_fold=0, val_fold=None, 94 | **kwargs): 95 | 96 | if test_fold is not None and val_fold is None: 97 | val_fold = test_fold - 1 98 | 99 | preprocessors = create_preprocessors(preprocessors) 100 | 101 | if context_size > 0: 102 | data_source_type = dmgr.datasources.ContextDataSource 103 | kwargs['context_size'] = context_size 104 | else: 105 | data_source_type = dmgr.datasources.DataSource 106 | 107 | # load all datasets 108 | datasets = [load_dataset(name, data_dir, feature_cache_dir, 109 | compute_features, compute_targets) 110 | for name in dataset_names] 111 | 112 | if test_fold is not None: 113 | files = combine_files(*[ds.fold_split(val_fold, test_fold) 114 | for ds in datasets]) 115 | else: 116 | # use a single split with all files; train, validation and test set will then be the same 117 | files = combine_files(*[[ds.all_files()] 118 | for ds in datasets]) 119 | 120 |
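# wrap the combined file lists in data sources (context data sources if a context size was given), passing the preprocessors on to dmgr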
ds = dmgr.datasources.get_datasources( 121 | files, preprocessors=preprocessors, data_source_type=data_source_type, 122 | **kwargs 123 | ) 124 | 125 | if len(ds) == 3: 126 | train, val, test = ds 127 | elif len(ds) == 1: 128 | train = ds[0] 129 | val = ds[0] 130 | test = ds[0] 131 | else: 132 | raise RuntimeError('Got {} datasources,' 133 | ' expected 1 or 3.'.format(len(ds))) 134 | 135 | return train, val, test, sum((ds.gt_files for ds in datasets), []) 136 | 137 | 138 | def add_sacred_config(ex): 139 | ex.add_config( 140 | datasource=dict( 141 | datasets=['beatles', 'queen', 'zweieck', 'robbie_williams', 'rwc'], 142 | context_size=0, 143 | preprocessors=[], 144 | # fold 6 overestimates the score, but has highest correlation 145 | # with the total score 146 | test_fold=6, 147 | val_fold=None, 148 | cached=True 149 | ) 150 | ) 151 | -------------------------------------------------------------------------------- /chordrec/experiment.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import os 3 | import yaml 4 | import pickle 5 | import shutil 6 | import hashlib 7 | import tempfile 8 | import sys 9 | from functools import partial 10 | from sacred import Experiment 11 | from sacred.observers import RunObserver 12 | import lasagne as lnn 13 | import theano 14 | import numpy as np 15 | 16 | import nn 17 | import dmgr 18 | from nn.utils import Colors 19 | 20 | import data 21 | import features 22 | import targets 23 | import augmenters 24 | 25 | 26 | class TempDir: 27 | """ 28 | Creates a temporary directory to save stuff to 29 | """ 30 | def __enter__(self): 31 | self._tmp_dir_path = tempfile.mkdtemp() 32 | return self._tmp_dir_path 33 | 34 | def __exit__(self, type, value, traceback): 35 | shutil.rmtree(self._tmp_dir_path) 36 | 37 | 38 | def compute_features(process_fn, agg_dataset, dest_dir, use_mask, 39 | batch_size, extension): 40 | if not os.path.exists(dest_dir): 41 | os.makedirs(dest_dir) 42 | else: 43 | if not os.path.isdir(dest_dir): 44 | print(Colors.red('Destination path exists but is not a directory!'), 45 | file=sys.stderr) 46 | return 47 | 48 | iterate_batches = dmgr.iterators.iterate_batches 49 | 50 | feature_files = [] 51 | 52 | for ds_idx in range(agg_dataset.n_datasources): 53 | ds = agg_dataset.datasource(ds_idx) 54 | 55 | feats = [] 56 | for data, _ in iterate_batches(ds, batch_size or ds.n_data, 57 | randomise=False, expand=False): 58 | if use_mask: 59 | data = data[np.newaxis, :] 60 | mask = np.ones(data.shape[:2], dtype=np.float32) 61 | 62 | f = process_fn(data, mask)[0] 63 | else: 64 | f = process_fn(data) 65 | feats.append(f) 66 | 67 | feats = np.concatenate(feats) 68 | feat_file = os.path.join(dest_dir, ds.name + extension) 69 | np.save(feat_file, feats) 70 | feature_files.append(feat_file) 71 | 72 | return feature_files 73 | 74 | 75 | def create_optimiser(optimiser): 76 | """ 77 | Creates a function that returns an optimiser and (optional) a learn 78 | rate schedule 79 | """ 80 | 81 | if optimiser['schedule'] is not None: 82 | # if we have a learn rate schedule, create a theano shared 83 | # variable and a corresponding update 84 | lr = theano.shared(np.float32(optimiser['params']['learning_rate'])) 85 | 86 | # create a copy of the optimiser config dict so we do not change 87 | # it 88 | from copy import deepcopy 89 | optimiser = deepcopy(optimiser) 90 | optimiser['params']['learning_rate'] = lr 91 | lrs = nn.LearnRateSchedule(learning_rate=lr, **optimiser['schedule']) 92 | else: 93 | lrs 
= None 94 | 95 | return partial(getattr(lnn.updates, optimiser['name']), 96 | **optimiser['params']), lrs 97 | 98 | 99 | def rhash(d): 100 | """ 101 | Computes the recursive hash of a dictionary 102 | :param d: dictionary to hash 103 | :return: hash of dictionary 104 | """ 105 | m = hashlib.sha1() 106 | 107 | if isinstance(d, dict): 108 | for _, value in sorted(d.items(), key=lambda (k, v): k): 109 | m.update(rhash(value)) 110 | else: 111 | m.update(str(d)) 112 | 113 | return m.hexdigest() 114 | 115 | 116 | def fhash(filename): 117 | """ 118 | Computes the hash of a file 119 | :param filename: file to hash 120 | :return: hash value of file 121 | """ 122 | md5 = hashlib.md5() 123 | with open(filename, 'rb') as f: 124 | # this needs an empty *byte* string b'' as a sentinel value 125 | for chunk in iter(lambda: f.read(128 * md5.block_size), b''): 126 | md5.update(chunk) 127 | return md5.hexdigest() 128 | 129 | 130 | class PickleAndSymlinkObserver(RunObserver): 131 | 132 | def __init__(self): 133 | self.config = None 134 | self.run = None 135 | self._hash = None 136 | 137 | def started_event(self, ex_info, host_info, start_time, config, comment): 138 | self.config = config 139 | 140 | # remember the *exact* configuration used for this run 141 | config_file = os.path.join(self.config_path(), 'config.yaml') 142 | with open(config_file, 'w') as f: 143 | f.write(yaml.dump(self.config)) 144 | 145 | self.run = { 146 | 'ex_info': ex_info, 147 | 'host_info': host_info, 148 | 'start_time': start_time, 149 | 'comment': comment 150 | } 151 | 152 | def hash(self): 153 | if self._hash is None: 154 | self._hash = rhash(self.config) 155 | 156 | return self._hash 157 | 158 | def config_path(self): 159 | if self.config is None: 160 | raise RuntimeError('tried to get a path without a configuration!') 161 | 162 | config_save_path = os.path.join(self.config['observations'], 163 | self.hash()) 164 | 165 | if not os.path.exists(config_save_path): 166 | os.makedirs(os.path.join(config_save_path, 'resources')) 167 | os.makedirs(os.path.join(config_save_path, 'artifacts')) 168 | return config_save_path 169 | 170 | def heartbeat_event(self, info, captured_out, beat_time): 171 | self.run['info'] = info 172 | self.run['captured_out'] = captured_out 173 | self.run['beat_time'] = beat_time 174 | 175 | def completed_event(self, stop_time, result): 176 | run_file = os.path.join(self.config_path(), 'completed.pkl') 177 | with open(run_file, 'w') as f: 178 | pickle.dump(self.run, f) 179 | 180 | def interrupted_event(self, interrupt_time): 181 | self.run['interrupt_time'] = interrupt_time 182 | interrupted_file = os.path.join(self.config_path(), 'interrupted.pkl') 183 | with open(interrupted_file, 'w') as f: 184 | pickle.dump(self.run, f) 185 | 186 | def failed_event(self, fail_time, fail_trace): 187 | self.run['fail_time'] = fail_time 188 | self.run['fail_trace'] = fail_trace 189 | 190 | fail_file = os.path.join(self.config_path(), 'failed.pkl') 191 | with open(fail_file, 'w') as f: 192 | pickle.dump(self.run, f) 193 | 194 | fail_file = os.path.join(self.config_path(), 'failed_trace.txt') 195 | with open(fail_file, 'w') as f: 196 | f.write(''.join(fail_trace)) 197 | 198 | def resource_event(self, filename): 199 | """ 200 | link a used file (this is where we could have distributed storage)...
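The link is named after the file's hash, so each resource is linked at most once.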
201 | """ 202 | linkname = os.path.join(self.config_path(), 'resources', 203 | fhash(filename)) 204 | if not os.path.exists(linkname): 205 | os.symlink(filename, linkname) 206 | 207 | def artifact_event(self, filename): 208 | """ 209 | move an artifact from a temporary space to the actual observations 210 | directory for this run 211 | """ 212 | newname = os.path.join(self.config_path(), 'artifacts', 213 | os.path.basename(filename)) 214 | shutil.move(filename, newname) 215 | 216 | def get_artifact_path(self, path): 217 | return os.path.join(self.config_path(), 'artifacts', path) 218 | 219 | 220 | class ParamSaver: 221 | 222 | def __init__(self, ex, net, tmp_dir): 223 | self.ex = ex 224 | self.tmp_dir = tmp_dir 225 | self.net = net 226 | 227 | def __call__(self, epoch): 228 | fn = os.path.join(self.tmp_dir, 'params_{}.pkl'.format(epoch)) 229 | self.net.save_parameters(fn) 230 | self.ex.add_artifact(fn) 231 | 232 | 233 | def setup(name): 234 | ex = Experiment(name) 235 | ex.observers.append(PickleAndSymlinkObserver()) 236 | data.add_sacred_config(ex) 237 | features.add_sacred_config(ex) 238 | targets.add_sacred_config(ex) 239 | augmenters.add_sacred_config(ex) 240 | return ex 241 | 242 | 243 | -------------------------------------------------------------------------------- /chordrec/features.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import madmom as mm 3 | import pickle 4 | 5 | 6 | class ConstantQ: 7 | 8 | def __init__(self, num_bands, fmin, num_octaves, fps, align, log_div, 9 | sample_rate=44100, fold=None): 10 | 11 | self.fps = fps 12 | self.num_bands = num_bands 13 | self.align = align 14 | self.fmin = fmin 15 | self.num_octaves = num_octaves 16 | self.log_div = log_div 17 | 18 | self.sample_rate = sample_rate 19 | 20 | from yaafelib import FeaturePlan, Engine 21 | 22 | fp = FeaturePlan(sample_rate=sample_rate) 23 | 24 | cqt_config = " ".join(['cqt: CQT', 25 | 'CQTAlign={}'.format(align), 26 | 'CQTBinsPerOctave={}'.format(num_bands), 27 | 'CQTMinFreq={}'.format(fmin), 28 | 'CQTNbOctaves={}'.format(num_octaves), 29 | 'stepSize={}'.format(sample_rate / fps) 30 | ]) 31 | 32 | fp.addFeature(cqt_config) 33 | 34 | df = fp.getDataFlow() 35 | self.engine = Engine() 36 | self.engine.load(df) 37 | 38 | @property 39 | def name(self): 40 | return 'cqt_fps={}_num-bands={}_align={}_fmin={}_num_oct={}'\ 41 | '_logdiv={}'.format(self.fps, self.num_bands, self.align, 42 | self.fmin, self.num_octaves, self.log_div) 43 | 44 | def __call__(self, audio_file): 45 | 46 | audio = mm.audio.signal.Signal(audio_file, 47 | sample_rate=self.sample_rate, 48 | num_channels=1).astype(np.float64) 49 | 50 | cqt = self.engine.processAudio(audio.reshape((1, -1)))['cqt'] 51 | # compensate for different padding in madmom vs. 
yaafe and convert 52 | # to float32 53 | cqt = np.vstack((cqt, np.zeros(cqt.shape[1:]))).astype(np.float32) 54 | 55 | if self.log_div: 56 | return np.log(cqt / self.log_div + 1) 57 | else: 58 | return cqt 59 | 60 | 61 | class LogFiltSpec: 62 | 63 | def __init__(self, frame_sizes, num_bands, fmin, fmax, fps, unique_filters, 64 | sample_rate=44100, fold=None): 65 | 66 | self.frame_sizes = frame_sizes 67 | self.num_bands = num_bands 68 | self.fmax = fmax 69 | self.fmin = fmin 70 | self.fps = fps 71 | self.unique_filters = unique_filters 72 | self.sample_rate = sample_rate 73 | 74 | @property 75 | def name(self): 76 | return 'lfs_fps={}_num-bands={}_fmin={}_fmax={}_frame_sizes=[{}]'.format( 77 | self.fps, self.num_bands, self.fmin, self.fmax, 78 | '-'.join(map(str, self.frame_sizes)) 79 | ) + ('_uf' if self.unique_filters else '') 80 | 81 | def __call__(self, audio_file): 82 | # do not resample because ffmpeg/avconv creates terrible sampling 83 | # artifacts 84 | specs = [ 85 | mm.audio.spectrogram.LogarithmicFilteredSpectrogram( 86 | audio_file, num_channels=1, sample_rate=self.sample_rate, 87 | fps=self.fps, frame_size=ffts, 88 | num_bands=self.num_bands, fmin=self.fmin, fmax=self.fmax, 89 | unique_filters=self.unique_filters) 90 | for ffts in self.frame_sizes 91 | ] 92 | 93 | return np.hstack(specs).astype(np.float32) 94 | 95 | 96 | class Chroma: 97 | 98 | def __init__(self, frame_size, fmax, fps, oct_width, center_note, log_eta, 99 | sample_rate=44100, fold=None): 100 | self.fps = fps 101 | self.fmax = fmax 102 | self.sample_rate = sample_rate 103 | self.oct_width = oct_width 104 | self.center_note = center_note 105 | self.frame_size = frame_size 106 | self.log_eta = log_eta 107 | 108 | # parameters are based on Cho and Bello, 2014. 109 | import librosa 110 | ctroct = (librosa.hz_to_octs(librosa.note_to_hz(center_note)) 111 | if center_note is not None else None) 112 | 113 | self.filterbank = librosa.filters.chroma( 114 | sr=sample_rate, n_fft=frame_size, octwidth=oct_width, 115 | ctroct=ctroct).T[:-1] 116 | 117 | # mask out everything above fmax 118 | from bottleneck import move_mean 119 | m = np.fft.fftfreq( 120 | frame_size, 1. / sample_rate)[:frame_size / 2] < fmax 121 | mask_smooth = move_mean(m, window=10, min_count=1) 122 | self.filterbank *= mask_smooth[:, np.newaxis] 123 | 124 | @property 125 | def name(self): 126 | if self.oct_width is not None: 127 | gauss_str = '_octwidth={:g}_cnote={}'.format(self.oct_width, 128 | self.center_note) 129 | else: 130 | gauss_str = '' 131 | 132 | if self.log_eta is not None: 133 | log_str = '_log={}'.format(self.log_eta) 134 | else: 135 | log_str = '' 136 | 137 | return 'chroma_fps={}_fmax={}_frame_size={}'.format( 138 | self.fps, self.fmax, self.frame_size) + gauss_str + log_str 139 | 140 | def __call__(self, audio_file): 141 | spec = mm.audio.spectrogram.Spectrogram( 142 | audio_file, num_channels=1, sample_rate=self.sample_rate, 143 | fps=self.fps, frame_size=4096, 144 | ) 145 | 146 | if self.log_eta is not None: 147 | spec = np.log(self.log_eta * spec / spec.max() + 1) 148 | 149 | chroma = np.dot(spec, self.filterbank) 150 | norm = np.sqrt(np.sum(chroma ** 2, axis=1)) 151 | norm[norm < 1e-20] = 1. 152 | return (chroma / norm[:, np.newaxis]).astype(np.float32) 153 | 154 | 155 | class ChromaCq: 156 | 157 | def __init__(self, fps, win_center, win_width, log_eta, 158 | sample_rate=44100, fold=None): 159 | """ 160 | Computes Chromas from a constant q transform. 
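If a window centre is given, the CQT bins are weighted with a Gaussian window before being folded into a chromagram.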
161 | :param fps: frames per second 162 | :param win_center: midi number of window center note 163 | :param win_width: width of weighting window 164 | :param log_eta: scaling parameter for log 165 | :param sample_rate: sample rate of the audio 166 | """ 167 | self.fps = fps 168 | self.sample_rate = sample_rate 169 | self.num_bins = 84 170 | self.log_eta = log_eta 171 | 172 | if win_center is None: 173 | self.win = None 174 | self.win_center = None 175 | self.win_width = None 176 | else: 177 | # cq spec starts at C1, which is midi pitch 24. the zeroth bin thus 178 | # corresponds to midi note 24, and we have to adjust win_center 179 | self.win_center = float(win_center - 24) 180 | self.win_width = float(win_width) 181 | self.win = np.exp( 182 | -0.5 * ((self.win_center - np.arange(self.num_bins)) / 183 | self.win_width) ** 2 184 | ) 185 | 186 | @property 187 | def name(self): 188 | if self.win is not None: 189 | win_str = '_winc={}_winw={}'.format(self.win_center, 190 | self.win_width) 191 | else: 192 | win_str = '' 193 | 194 | log_str = '_log_eta={}'.format(self.log_eta) if self.log_eta else '' 195 | return 'chroma_cq_fps={}'.format(self.fps) + win_str + log_str 196 | 197 | def __call__(self, audio_file): 198 | import librosa 199 | y = mm.audio.signal.Signal(audio_file, num_channels=1, 200 | sample_rate=self.sample_rate) 201 | 202 | cq = librosa.core.cqt(y, sr=y.sample_rate, tuning=0, 203 | fmin=mm.audio.filters.midi2hz(24), 204 | n_bins=self.num_bins, 205 | hop_length=int(self.sample_rate / self.fps)) 206 | 207 | if self.log_eta is not None: 208 | cq = np.log(self.log_eta * cq / cq.max() + 1) 209 | 210 | if self.win is not None: 211 | cq *= self.win[:, np.newaxis] 212 | 213 | return librosa.feature.chroma_cqt(y=None, C=cq, tuning=0, 214 | norm=2).T.astype(np.float32) 215 | 216 | 217 | class HarmonicPitchClassProfile: 218 | 219 | def __init__(self, fps, frame_size, fmax, num_bands, 220 | sample_rate=44100, fold=None): 221 | self.fps = fps 222 | self.frame_size = frame_size 223 | self.fmax = fmax 224 | self.sample_rate = sample_rate 225 | self.num_bands = num_bands 226 | 227 | @property 228 | def name(self): 229 | return 'hpcp_fps={}_fmax={}_nbands={}_frame_size={}'.format( 230 | self.fps, self.fmax, self.num_bands, self.frame_size 231 | ) 232 | 233 | def __call__(self, audio_file): 234 | from madmom.audio import chroma 235 | 236 | hpcp = chroma.HarmonicPitchClassProfile( 237 | audio_file, fps=self.fps, fmax=self.fmax, 238 | num_classes=self.num_bands, sample_rate=self.sample_rate 239 | ) 240 | 241 | norm = np.sqrt(np.sum(hpcp ** 2, axis=1)) 242 | norm[norm < 1e-20] = 1. 243 | return (hpcp / norm[:, np.newaxis]).astype(np.float32) 244 | 245 | 246 | class DeepChroma: 247 | 248 | def __init__(self, fps, fmin=65, fmax=2100, unique_filters=True, 249 | models=None, sample_rate=44100, fold=None): 250 | assert fps == 10, 'Cannot handle fps different from 10 yet.' 
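# the bundled madmom models expect spectrograms at 10 fps, so other frame rates would need retrained models, hence the assertion above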
251 | from madmom.audio.chroma import DeepChromaProcessor 252 | from hashlib import sha1 253 | self.fps = fps 254 | self.fmin = fmin 255 | self.fmax = fmax 256 | self.unique_filters = unique_filters 257 | self.dcp = DeepChromaProcessor( 258 | fmin=fmin, fmax=fmax, unique_filters=unique_filters, models=models 259 | ) 260 | self.model_hash = sha1(pickle.dumps(self.dcp)).hexdigest() 261 | 262 | @property 263 | def name(self): 264 | return 'deepchroma_fps={}_fmin={}_fmax={}_uf={}_mdlhsh={}'.format( 265 | self.fps, self.fmin, self.fmax, self.unique_filters, 266 | self.model_hash 267 | ) 268 | 269 | def __call__(self, audio_file): 270 | return self.dcp(audio_file) 271 | 272 | 273 | class PrecomputedFeature: 274 | 275 | def __init__(self, name, fps, fold): 276 | self._name = name 277 | self.fps = fps 278 | self.fold = fold 279 | 280 | @property 281 | def name(self): 282 | return self._name.format(fps=self.fps, fold=self.fold) 283 | 284 | def __call__(self, audio_file): 285 | raise NotImplementedError( 286 | 'Cannot compute features for {}. ' 287 | 'This feature is only precomputed!'.format(audio_file)) 288 | 289 | 290 | def add_sacred_config(ex): 291 | ex.add_named_config( 292 | 'constant_q', 293 | feature_extractor=dict( 294 | name='ConstantQ', 295 | params=dict( 296 | fps=10, 297 | num_bands=24, 298 | fmin=30, 299 | num_octaves=8, 300 | log_div=500., 301 | align='c' 302 | ) 303 | ) 304 | ) 305 | 306 | ex.add_named_config( 307 | 'log_filt_spec', 308 | feature_extractor=dict( 309 | name='LogFiltSpec', 310 | params=dict( 311 | fps=10, 312 | frame_sizes=[8192], 313 | num_bands=24, 314 | fmin=65, 315 | fmax=2100, 316 | unique_filters=True, 317 | ) 318 | ) 319 | ) 320 | 321 | ex.add_named_config( 322 | 'chroma_clp', 323 | feature_extractor=dict( 324 | name='PrecomputedFeature', 325 | params=dict( 326 | name='chroma_clp_fps={fps}', 327 | fps=10, 328 | ) 329 | ) 330 | ) 331 | 332 | ex.add_named_config( 333 | 'perfect_chroma', 334 | feature_extractor=dict( 335 | name='PrecomputedFeature', 336 | params=dict( 337 | name='perfect_chroma_fps={fps}', 338 | fps=10 339 | ) 340 | ) 341 | ) 342 | 343 | ex.add_named_config( 344 | 'gap_feature', 345 | feature_extractor=dict( 346 | name='PrecomputedFeature', 347 | params=dict( 348 | name='gap_feature/features_fold_{fold}', 349 | fps=10, 350 | ) 351 | ) 352 | ) 353 | 354 | ex.add_named_config( 355 | 'deep_chroma_pc', 356 | feature_extractor=dict( 357 | name='PrecomputedFeature', 358 | params=dict( 359 | name='deep_chroma_pc', 360 | fps=10 361 | ) 362 | ) 363 | ) 364 | 365 | ex.add_named_config( 366 | 'deep_chroma', 367 | feature_extractor=dict( 368 | name='DeepChroma', 369 | params=dict( 370 | fps=10 371 | ) 372 | ) 373 | ) 374 | 375 | ex.add_named_config( 376 | 'hpcp', 377 | feature_extractor=dict( 378 | name='HarmonicPitchClassProfile', 379 | params=dict( 380 | fps=10, 381 | frame_size=8192, 382 | fmax=5500, 383 | num_bands=36, 384 | ) 385 | ) 386 | ) 387 | 388 | ex.add_named_config( 389 | 'chroma_hpcp', 390 | feature_extractor=dict( 391 | name='HarmonicPitchClassProfile', 392 | params=dict( 393 | fps=10, 394 | frame_size=8192, 395 | fmax=5500, 396 | num_bands=12, 397 | ) 398 | ) 399 | ) 400 | 401 | ex.add_named_config( 402 | 'chroma', 403 | feature_extractor=dict( 404 | name='Chroma', 405 | params=dict( 406 | fps=10, 407 | frame_size=4096, 408 | fmax=5500, 409 | oct_width=None, 410 | center_note=None, 411 | log_eta=None 412 | ) 413 | ) 414 | ) 415 | 416 | ex.add_named_config( 417 | 'chroma_w_log', 418 | feature_extractor=dict( 419 | name='Chroma', 420 | 
params=dict( 421 | fps=10, 422 | frame_size=4096, 423 | fmax=5500, 424 | oct_width=15./12, 425 | center_note='C4', 426 | log_eta=1000 427 | ) 428 | ) 429 | ) 430 | 431 | ex.add_named_config( 432 | 'chroma_cq', 433 | feature_extractor=dict( 434 | name='ChromaCq', 435 | params=dict( 436 | fps=9.98641304347826086957, 437 | win_center=None, 438 | win_width=None, 439 | log_eta=None 440 | ) 441 | ) 442 | ) 443 | 444 | ex.add_named_config( 445 | 'chroma_cq_w_log', 446 | feature_extractor=dict( 447 | name='ChromaCq', 448 | params=dict( 449 | fps=9.98641304347826086957, 450 | # parameters taken from Cho's paper 451 | win_center=60, 452 | win_width=15, 453 | log_eta=1000 454 | ) 455 | ) 456 | ) 457 | 458 | 459 | def create_extractor(config, fold): 460 | return globals()[config['name']](fold=fold, **config['params']) 461 | -------------------------------------------------------------------------------- /chordrec/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fdlm/chordrec/1acb97e5efdd9474e7abfe4b741f94b5452499d5/chordrec/models/__init__.py -------------------------------------------------------------------------------- /chordrec/models/avg_gap_feature.py: -------------------------------------------------------------------------------- 1 | from dnn import * 2 | 3 | 4 | def build_model(in_shape, out_size, model): 5 | network, input_var, target_var = build_net(in_shape, out_size, model) 6 | 7 | # this goes back to the nonlinearity layer of the penultimate conv layer 8 | # (after batchnorm!) 9 | feature_layer = network 10 | for _ in range(7): 11 | feature_layer = feature_layer.input_layer 12 | 13 | # average the feature maps of this conv layer 14 | feature_out = lnn.layers.get_output(feature_layer, deterministic=True) 15 | feature_out = tt.mean(feature_out, axis=(2, 3)) 16 | 17 | return dict(network=network, input_var=input_var, target_var=target_var, 18 | loss_fn=categorical_crossentropy, feature_out=feature_out) 19 | 20 | 21 | def add_sacred_config(ex): 22 | # ======================================================= conv net with gap 23 | 24 | ex.add_named_config( 25 | name='gap_feature_extractor', 26 | datasource=dict( 27 | context_size=7, 28 | ), 29 | model=dict( 30 | type='avg_gap_feature', 31 | conv=dict( 32 | conv1=dict( 33 | num_layers=4, 34 | num_filters=32, 35 | filter_size=(3, 3), 36 | pool_size=(1, 2), 37 | dropout=0.5, 38 | pad='same', 39 | batch_norm=True, 40 | ), 41 | conv2=dict( 42 | num_layers=2, 43 | num_filters=64, 44 | filter_size=(3, 3), 45 | pool_size=(1, 2), 46 | dropout=0.5, 47 | pad='valid', 48 | batch_norm=True, 49 | ), 50 | conv3=dict( 51 | num_layers=1, 52 | num_filters=128, 53 | filter_size=(9, 12), 54 | pool_size=None, 55 | dropout=0.5, 56 | pad='valid', 57 | batch_norm=True 58 | ) 59 | ), 60 | gap=dict( 61 | batch_norm=True, 62 | gap_nonlinearity='linear', 63 | ), 64 | out_nonlinearity='softmax' 65 | ), 66 | optimiser=dict( 67 | name='adam', 68 | params=dict( 69 | learning_rate=0.001 70 | ), 71 | schedule=None 72 | ), 73 | training=dict( 74 | num_epochs=500, 75 | early_stop=5, 76 | early_stop_acc=True, 77 | batch_size=512, 78 | ), 79 | regularisation=dict( 80 | l2=1e-7, 81 | l1=0 82 | ), 83 | testing=dict( 84 | test_on_val=False, 85 | batch_size=512 86 | ) 87 | ) 88 | 89 | ex.add_named_config( 90 | name='gap_feature_extractor_mm_2016', 91 | datasource=dict( 92 | context_size=11, 93 | ), 94 | model=dict( 95 | type='avg_gap_feature', 96 | conv=dict( 97 | conv1=dict( 98 | num_layers=4, 99 |
num_filters=32, 100 | filter_size=(3, 3), 101 | pool_size=(1, 2), 102 | dropout=0.5, 103 | pad='valid', 104 | batch_norm=True, 105 | ), 106 | conv2=dict( 107 | num_layers=2, 108 | num_filters=64, 109 | filter_size=(3, 3), 110 | pool_size=(1, 2), 111 | dropout=0.5, 112 | pad='valid', 113 | batch_norm=True, 114 | ), 115 | conv3=dict( 116 | num_layers=1, 117 | num_filters=128, 118 | filter_size=(9, 12), 119 | pool_size=None, 120 | dropout=0.5, 121 | pad='valid', 122 | batch_norm=True 123 | ) 124 | ), 125 | gap=dict( 126 | batch_norm=True, 127 | gap_nonlinearity='linear', 128 | ), 129 | out_nonlinearity='softmax' 130 | ), 131 | optimiser=dict( 132 | name='adam', 133 | params=dict( 134 | learning_rate=0.001 135 | ), 136 | schedule=None 137 | ), 138 | training=dict( 139 | num_epochs=500, 140 | early_stop=5, 141 | early_stop_acc=True, 142 | batch_size=512, 143 | ), 144 | regularisation=dict( 145 | l2=1e-7, 146 | l1=0 147 | ), 148 | testing=dict( 149 | test_on_val=False, 150 | batch_size=512 151 | ) 152 | ) 153 | -------------------------------------------------------------------------------- /chordrec/models/blocks.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import lasagne as lnn 3 | 4 | 5 | def conv(network, batch_norm, num_layers, num_filters, filter_size, pad, 6 | pool_size, dropout): 7 | for k in range(num_layers): 8 | network = lnn.layers.Conv2DLayer( 9 | network, num_filters=num_filters, 10 | filter_size=filter_size, 11 | W=lnn.init.Orthogonal(gain=np.sqrt(2 / (1 + .1 ** 2))), 12 | pad=pad, 13 | nonlinearity=lnn.nonlinearities.rectify, 14 | name='Conv_{}'.format(k)) 15 | if batch_norm: 16 | network = lnn.layers.batch_norm(network) 17 | 18 | if pool_size: 19 | network = lnn.layers.MaxPool2DLayer(network, pool_size=pool_size, 20 | name='Pool') 21 | if dropout > 0.0: 22 | network = lnn.layers.DropoutLayer(network, p=dropout) 23 | 24 | return network 25 | 26 | 27 | def gap(network, out_size, batch_norm, 28 | gap_nonlinearity, out_nonlinearity): 29 | 30 | gap_nonlinearity = getattr(lnn.nonlinearities, gap_nonlinearity) 31 | out_nonlinearity = getattr(lnn.nonlinearities, out_nonlinearity) 32 | 33 | # output classification layer 34 | network = lnn.layers.Conv2DLayer( 35 | network, num_filters=out_size, filter_size=1, 36 | nonlinearity=gap_nonlinearity, name='Output_Conv') 37 | if batch_norm: 38 | network = lnn.layers.batch_norm(network) 39 | 40 | network = lnn.layers.Pool2DLayer( 41 | network, pool_size=network.output_shape[-2:], ignore_border=False, 42 | mode='average_exc_pad', name='GlobalAveragePool') 43 | network = lnn.layers.FlattenLayer(network, name='Flatten') 44 | 45 | network = lnn.layers.NonlinearityLayer( 46 | network, nonlinearity=out_nonlinearity, name='output') 47 | 48 | return network 49 | 50 | 51 | def dense(network, batch_norm, nonlinearity, num_layers, num_units, 52 | dropout): 53 | 54 | nl = getattr(lnn.nonlinearities, nonlinearity) 55 | 56 | for i in range(num_layers): 57 | network = lnn.layers.DenseLayer( 58 | network, num_units=num_units, nonlinearity=nl, 59 | name='fc-{}'.format(i) 60 | ) 61 | if batch_norm: 62 | network = lnn.layers.batch_norm(network) 63 | if dropout > 0.0: 64 | network = lnn.layers.DropoutLayer(network, p=dropout) 65 | 66 | return network 67 | 68 | 69 | def recurrent(network, mask_in, num_rec_units, num_layers, dropout, 70 | bidirectional, nonlinearity): 71 | 72 | if nonlinearity != 'LSTM': 73 | nl = getattr(lnn.nonlinearities, nonlinearity) 74 | 75 | def add_layer(prev_layer, **kwargs): 
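# plain (vanilla) recurrent layer; the orthogonal hidden-to-hidden
# initialisation with gain sqrt(2)/2 is meant to keep the recurrence
# stable under the rectify nonlinearity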
76 | return lnn.layers.RecurrentLayer( 77 | prev_layer, num_units=num_rec_units, mask_input=mask_in, 78 | nonlinearity=nl, 79 | W_in_to_hid=lnn.init.GlorotUniform(), 80 | W_hid_to_hid=lnn.init.Orthogonal(gain=np.sqrt(2) / 2), 81 | **kwargs) 82 | 83 | else: 84 | def add_layer(prev_layer, **kwargs): 85 | return lnn.layers.LSTMLayer( 86 | prev_layer, num_units=num_rec_units, mask_input=mask_in, 87 | **kwargs 88 | ) 89 | 90 | fwd = network 91 | for i in range(num_layers): 92 | fwd = add_layer(fwd, name='rec_fwd_{}'.format(i)) 93 | if dropout > 0.: 94 | fwd = lnn.layers.DropoutLayer(fwd, p=dropout) 95 | 96 | if not bidirectional: 97 | return fwd 98 | 99 | bck = network 100 | for i in range(num_layers): 101 | bck = add_layer(bck, name='rec_bck_{}'.format(i), backwards=True) 102 | if dropout > 0: 103 | bck = lnn.layers.DropoutLayer(bck, p=dropout) 104 | 105 | # combine the forward and backward recurrent layers... 106 | network = lnn.layers.ConcatLayer([fwd, bck], name='fwd + bck', axis=-1) 107 | return network 108 | 109 | -------------------------------------------------------------------------------- /chordrec/models/chroma_dnn.py: -------------------------------------------------------------------------------- 1 | import theano.tensor as tt 2 | import lasagne as lnn 3 | 4 | from . import dnn 5 | 6 | 7 | def compute_loss(prediction, target): 8 | # need to clip predictions for numerical stability 9 | eps = 1e-7 10 | pred_clip = tt.clip(prediction, eps, 1.-eps) 11 | return lnn.objectives.binary_crossentropy(pred_clip, target).mean() 12 | 13 | 14 | def build_net(in_shape, out_size_chroma, out_size, model): 15 | # first, stack the dnn chroma extractor 16 | chroma_network, input_var, crm_target_var = dnn.build_net( 17 | in_shape, out_size_chroma, model 18 | ) 19 | 20 | # then, add the logistic regression chord classifier 21 | crd_target_var = tt.matrix('target_output', dtype='float32') 22 | 23 | chord_network = lnn.layers.DenseLayer( 24 | chroma_network, name='chords', num_units=out_size, 25 | nonlinearity=lnn.nonlinearities.softmax) 26 | 27 | # tag chord classification parameters so we can distinguish them later 28 | for p in chord_network.get_params(): 29 | chord_network.params[p].add('chord') 30 | 31 | return (chroma_network, chord_network, 32 | input_var, crm_target_var, crd_target_var) 33 | 34 | 35 | def build_model(in_shape, out_size_chroma, out_size, model): 36 | (crm, crd, inv, crmv, crdv) = build_net(in_shape, out_size_chroma, 37 | out_size, model) 38 | return dict(chroma_network=crm, chord_network=crd, 39 | input_var=inv, chroma_target_var=crmv, chord_target_var=crdv, 40 | chroma_loss_fn=compute_loss, 41 | chord_loss_fn=dnn.categorical_crossentropy) 42 | 43 | 44 | create_iterators = dnn.create_iterators 45 | 46 | 47 | def add_sacred_config(ex): 48 | 49 | # =============================================================== dense net 50 | 51 | ex.add_named_config( 52 | name='dense_net', 53 | datasource=dict( 54 | context_size=7, 55 | ), 56 | chroma_network=dict( 57 | model=dict( 58 | type='chroma_dnn', 59 | dense=dict( 60 | num_layers=3, 61 | num_units=512, 62 | dropout=0.5, 63 | nonlinearity='rectify', 64 | batch_norm=False, 65 | ), 66 | out_nonlinearity='sigmoid' 67 | ), 68 | optimiser=dict( 69 | name='adam', 70 | params=dict( 71 | learning_rate=0.0001 72 | ), 73 | schedule=None 74 | ), 75 | training=dict( 76 | iterator='BatchIterator', 77 | batch_size=512, 78 | num_epochs=500, 79 | early_stop=20, 80 | early_stop_acc=False, 81 | ), 82 | regularisation=dict( 83 | l2=1e-4, 84 | l1=0.0, 85 |
), 86 | ), 87 | optimiser=dict( 88 | name='adam', 89 | params=dict( 90 | learning_rate=0.001 91 | ), 92 | schedule=None 93 | ), 94 | training=dict( 95 | iterator='BatchIterator', 96 | batch_size=512, 97 | num_epochs=500, 98 | early_stop=20, 99 | early_stop_acc=True 100 | ), 101 | regularisation=dict( 102 | l2=1e-4, 103 | l1=0.0, 104 | ), 105 | testing=dict( 106 | test_on_val=False, 107 | batch_size=None 108 | ) 109 | ) 110 | 111 | -------------------------------------------------------------------------------- /chordrec/models/crf.py: -------------------------------------------------------------------------------- 1 | import theano.tensor as tt 2 | 3 | import dmgr 4 | import lasagne as lnn 5 | import spaghetti as spg 6 | 7 | from .. import augmenters 8 | 9 | 10 | class CrfLoss: 11 | 12 | def __init__(self, crf): 13 | self.crf = crf 14 | 15 | def __call__(self, prediction, target, mask): 16 | loss = spg.objectives.neg_log_likelihood(self.crf, target, mask) 17 | loss /= mask.sum(axis=1) # normalise to sequence length 18 | return lnn.objectives.aggregate(loss, mode='mean') 19 | 20 | 21 | def build_net(in_shape, out_size, model): 22 | # input variables 23 | input_var = (tt.tensor4('input', dtype='float32') 24 | if len(in_shape) > 1 else 25 | tt.tensor3('input', dtype='float32')) 26 | target_var = tt.tensor3('target_output', dtype='float32') 27 | mask_var = tt.matrix('mask_input', dtype='float32') 28 | 29 | # stack more layers 30 | network = lnn.layers.InputLayer( 31 | name='input', shape=(None, None) + in_shape, 32 | input_var=input_var 33 | ) 34 | 35 | mask_in = lnn.layers.InputLayer(name='mask', 36 | input_var=mask_var, 37 | shape=(None, None)) 38 | 39 | network = spg.layers.CrfLayer( 40 | network, mask_input=mask_in, num_states=out_size, name='CRF') 41 | 42 | return network, input_var, target_var, mask_var 43 | 44 | 45 | def build_model(in_shape, out_size, model): 46 | network, input_var, target_var, mask_var = build_net(in_shape, out_size, 47 | model) 48 | loss_fn = CrfLoss(network) 49 | return dict(network=network, input_var=input_var, target_var=target_var, 50 | mask_var=mask_var, loss_fn=loss_fn) 51 | 52 | 53 | def create_iterators(train_set, val_set, training, augmentation): 54 | train_batches = dmgr.iterators.SequenceIterator( 55 | train_set, training['batch_size'], randomise=True, 56 | expand=True, max_seq_len=training['max_seq_len'] 57 | ) 58 | 59 | val_batches = dmgr.iterators.SequenceIterator( 60 | val_set, training['batch_size'], randomise=False, 61 | expand=False 62 | ) 63 | 64 | if augmentation is not None: 65 | train_batches = dmgr.iterators.AugmentedIterator( 66 | train_batches, *augmenters.create_augmenters(augmentation) 67 | ) 68 | 69 | return train_batches, val_batches 70 | 71 | 72 | def add_sacred_config(ex): 73 | ex.add_named_config( 74 | name='crf', 75 | datasource=dict( 76 | context_size=0, 77 | ), 78 | model=dict( 79 | type='crf' 80 | ), 81 | optimiser=dict( 82 | name='adam', 83 | params=dict( 84 | learning_rate=0.01 85 | ), 86 | schedule=None 87 | ), 88 | training=dict( 89 | batch_size=32, 90 | max_seq_len=1024, 91 | num_epochs=500, 92 | early_stop=20, 93 | early_stop_acc=True, 94 | ), 95 | regularisation=dict( 96 | l1=1e-4, 97 | l2=0.0, 98 | ), 99 | testing=dict( 100 | test_on_val=False, 101 | batch_size=None, 102 | ) 103 | ) 104 | -------------------------------------------------------------------------------- /chordrec/models/dnn.py: -------------------------------------------------------------------------------- 1 | import theano.tensor as tt 2 | import lasagne 
as lnn 3 | 4 | import dmgr 5 | 6 | from .. import augmenters 7 | from . import blocks 8 | 9 | 10 | def categorical_crossentropy(prediction, target): 11 | # need to clip predictions for numerical stability 12 | eps = 1e-7 13 | pred_clip = tt.clip(prediction, eps, 1.-eps) 14 | return lnn.objectives.categorical_crossentropy(pred_clip, target).mean() 15 | 16 | 17 | def categorical_mse(predictions, targets): 18 | """ Mean squared error on class targets """ 19 | return tt.mean( 20 | (1.0 - predictions[tt.arange(targets.shape[0]), targets]) ** 2) 21 | 22 | 23 | def build_net(in_shape, out_size, model): 24 | # input variables 25 | input_var = (tt.tensor3('input', dtype='float32') 26 | if len(in_shape) > 1 else 27 | tt.matrix('input', dtype='float32')) 28 | target_var = tt.matrix('target_output', dtype='float32') 29 | 30 | # stack more layers 31 | network = lnn.layers.InputLayer( 32 | name='input', shape=(None,) + in_shape, input_var=input_var) 33 | 34 | if 'conv' in model and model['conv']: 35 | # reshape to 1 "color" channel 36 | network = lnn.layers.reshape( 37 | network, shape=(-1, 1) + in_shape, name='reshape') 38 | 39 | for c in sorted(model['conv'].keys()): 40 | network = blocks.conv(network, **model['conv'][c]) 41 | 42 | # no more output layer if gap is already there! 43 | if 'gap' in model and model['gap']: 44 | network = blocks.gap(network, out_size=out_size, 45 | out_nonlinearity=model['out_nonlinearity'], 46 | **model['gap']) 47 | else: 48 | if 'dense' in model and model['dense']: 49 | network = blocks.dense(network, **model['dense']) 50 | 51 | # output layer 52 | out_nl = getattr(lnn.nonlinearities, model['out_nonlinearity']) 53 | network = lnn.layers.DenseLayer( 54 | network, name='output', num_units=out_size, 55 | nonlinearity=out_nl) 56 | 57 | return network, input_var, target_var 58 | 59 | 60 | def train_iterator(train_set, training): 61 | it = training.get('iterator', 'BatchIterator') 62 | 63 | if it == 'BatchIterator': 64 | return dmgr.iterators.BatchIterator( 65 | train_set, training['batch_size'], randomise=True, 66 | expand=True 67 | ) 68 | elif it == 'ClassBalancedIterator': 69 | return dmgr.iterators.UniformClassIterator( 70 | train_set, training['batch_size'] 71 | ) 72 | else: 73 | raise ValueError('Unknown Batch Iterator: {}'.format(it)) 74 | 75 | 76 | def build_model(in_shape, out_size, model): 77 | network, input_var, target_var = build_net(in_shape, out_size, model) 78 | return dict(network=network, input_var=input_var, target_var=target_var, 79 | loss_fn=categorical_crossentropy) 80 | 81 | 82 | def create_iterators(train_set, val_set, training, augmentation): 83 | train_batches = train_iterator(train_set, training) 84 | val_batches = dmgr.iterators.BatchIterator( 85 | val_set, training['batch_size'], randomise=False, expand=True 86 | ) 87 | 88 | if augmentation is not None: 89 | train_batches = dmgr.iterators.AugmentedIterator( 90 | train_batches, *augmenters.create_augmenters(augmentation) 91 | ) 92 | 93 | return train_batches, val_batches 94 | 95 | 96 | def add_sacred_config(ex): 97 | 98 | # =============================================================== dense net 99 | 100 | ex.add_named_config( 101 | name='dense_net', 102 | datasource=dict( 103 | context_size=7, 104 | ), 105 | model=dict( 106 | type='dnn', 107 | dense=dict( 108 | num_layers=3, 109 | num_units=512, 110 | nonlinearity='rectify', 111 | batch_norm=False, 112 | dropout=0.5, 113 | ), 114 | out_nonlinearity='softmax' 115 | ), 116 | optimiser=dict( 117 | name='adam', 118 | params=dict( 119 | 
learning_rate=0.0001 120 | ), 121 | schedule=None 122 | ), 123 | training=dict( 124 | iterator='BatchIterator', 125 | batch_size=512, 126 | num_epochs=500, 127 | early_stop=20, 128 | early_stop_acc=True, 129 | ), 130 | regularisation=dict( 131 | l2=1e-4, 132 | l1=0.0, 133 | ), 134 | testing=dict( 135 | test_on_val=False, 136 | batch_size=None 137 | ) 138 | ) 139 | 140 | # ================================================================ conv net 141 | 142 | ex.add_named_config( 143 | name='conv_net', 144 | datasource=dict( 145 | context_size=7, 146 | ), 147 | model=dict( 148 | type='dnn', 149 | conv=dict( 150 | conv1=dict( 151 | num_layers=4, 152 | num_filters=32, 153 | filter_size=(3, 3), 154 | pool_size=(1, 2), 155 | dropout=0.5, 156 | pad='same', 157 | batch_norm=True, 158 | ), 159 | conv2=dict( 160 | num_layers=2, 161 | num_filters=64, 162 | filter_size=(3, 3), 163 | pool_size=(1, 2), 164 | dropout=0.5, 165 | pad='valid', 166 | batch_norm=True, 167 | ), 168 | conv3=dict( 169 | num_layers=1, 170 | num_filters=128, 171 | filter_size=(9, 12), 172 | pool_size=None, 173 | dropout=0.5, 174 | pad='valid', 175 | batch_norm=True 176 | ) 177 | ), 178 | out_nonlinearity='softmax' 179 | ), 180 | optimiser=dict( 181 | name='adam', 182 | params=dict( 183 | learning_rate=0.001 184 | ), 185 | schedule=None 186 | ), 187 | training=dict( 188 | num_epochs=500, 189 | early_stop=5, 190 | early_stop_acc=True, 191 | batch_size=512, 192 | ), 193 | regularisation=dict( 194 | l2=1e-7, 195 | l1=0 196 | ), 197 | testing=dict( 198 | test_on_val=False, 199 | batch_size=512 200 | ) 201 | ) 202 | 203 | @ex.named_config 204 | def dense_classifier(): 205 | model = dict( 206 | dense=dict( 207 | num_layers=1, 208 | num_units=512, 209 | dropout=0.5, 210 | nonlinearity='rectify', 211 | batch_norm=False 212 | ) 213 | ) 214 | 215 | @ex.named_config 216 | def gap_classifier(): 217 | model = dict( 218 | gap=dict( 219 | batch_norm=True, 220 | gap_nonlinearity='linear', 221 | ) 222 | ) 223 | 224 | 225 | 226 | 227 | -------------------------------------------------------------------------------- /chordrec/models/rnn.py: -------------------------------------------------------------------------------- 1 | import theano.tensor as tt 2 | 3 | import dmgr 4 | import lasagne as lnn 5 | 6 | from .. import augmenters 7 | from . import blocks 8 | 9 | 10 | def compute_loss(prediction, target, mask): 11 | # need to clip predictions for numerical stability 12 | eps = 1e-7 13 | pred_clip = tt.clip(prediction, eps, 1. 
- eps) 14 | loss = lnn.objectives.categorical_crossentropy(pred_clip, target) 15 | return lnn.objectives.aggregate(loss, mask, mode='normalized_sum') 16 | 17 | 18 | def build_net(in_shape, out_size, model): 19 | # input variables 20 | input_var = tt.tensor3('input', dtype='float32') 21 | target_var = tt.tensor3('target_output', dtype='float32') 22 | mask_var = tt.matrix('mask_input', dtype='float32') 23 | 24 | # stack more layers 25 | network = lnn.layers.InputLayer( 26 | name='input', shape=(None, None) + in_shape, 27 | input_var=input_var 28 | ) 29 | 30 | true_batch_size, true_seq_len, _ = input_var.shape 31 | 32 | mask_in = lnn.layers.InputLayer(name='mask', 33 | input_var=mask_var, 34 | shape=(None, None)) 35 | 36 | network = blocks.recurrent(network, mask_in, **model['recurrent']) 37 | 38 | # In order to connect a recurrent layer to a dense layer, we need to 39 | # flatten the first two dimensions (our "sample dimensions"); this will 40 | # cause each time step of each sequence to be processed independently 41 | network = lnn.layers.ReshapeLayer( 42 | network, (-1, lnn.layers.get_output_shape(network)[-1]), 43 | name='reshape to single') 44 | 45 | network = lnn.layers.DenseLayer( 46 | network, num_units=out_size, nonlinearity=lnn.nonlinearities.softmax, 47 | name='output') 48 | 49 | # To reshape back to our original shape, we can use the symbolic shape 50 | # variables we retrieved above. 51 | network = lnn.layers.ReshapeLayer( 52 | network, (true_batch_size, true_seq_len, out_size), 53 | name='output-reshape') 54 | 55 | return network, input_var, target_var, mask_var 56 | 57 | 58 | def build_model(in_shape, out_size, model): 59 | network, input_var, target_var, mask_var = build_net(in_shape, out_size, 60 | model) 61 | return dict(network=network, input_var=input_var, target_var=target_var, 62 | mask_var=mask_var, loss_fn=compute_loss) 63 | 64 | 65 | def create_iterators(train_set, val_set, training, augmentation): 66 | train_batches = dmgr.iterators.SequenceIterator( 67 | train_set, training['batch_size'], randomise=True, 68 | expand=True, max_seq_len=training['max_seq_len'] 69 | ) 70 | 71 | val_batches = dmgr.iterators.SequenceIterator( 72 | val_set, training['batch_size'], randomise=False, 73 | expand=False 74 | ) 75 | 76 | if augmentation is not None: 77 | train_batches = dmgr.iterators.AugmentedIterator( 78 | train_batches, *augmenters.create_augmenters(augmentation) 79 | ) 80 | 81 | return train_batches, val_batches 82 | 83 | 84 | def add_sacred_config(ex): 85 | ex.add_named_config( 86 | name='recurrent', 87 | model=dict( 88 | type='rnn', 89 | recurrent=dict( 90 | num_rec_units=128, 91 | num_layers=3, 92 | dropout=0.3, 93 | bidirectional=True, 94 | nonlinearity='rectify' 95 | ) 96 | ), 97 | optimiser=dict( 98 | name='adam', 99 | params=dict( 100 | learning_rate=0.0001 101 | ), 102 | schedule=None 103 | ), 104 | training=dict( 105 | iterator='BatchIterator', 106 | batch_size=8, 107 | max_seq_len=64, 108 | num_epochs=1000, 109 | early_stop=20, 110 | early_stop_acc=True, 111 | ), 112 | regularisation=dict( 113 | l1=0.0, 114 | l2=1e-4, 115 | ), 116 | testing=dict( 117 | test_on_val=False 118 | ) 119 | ) 120 | 121 | @ex.named_config 122 | def lstm(): 123 | net = dict( 124 | nonlinearity='LSTM', 125 | num_rec_units=64, 126 | ) 127 | -------------------------------------------------------------------------------- /chordrec/targets.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import string 3 | import mir_eval 4 | 5 | 6 | 
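# a small usage sketch (illustrative values):
#   one_hot(np.array([1, 0]), num_classes=3)
#   -> array([[0., 1., 0.],
#             [1., 0., 0.]], dtype=float32)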
def one_hot(class_ids, num_classes): 7 | """ 8 | Create one-hot encoding of class ids 9 | :param class_ids: ids of classes to map 10 | :param num_classes: number of classes 11 | :return: one-hot encoding of class ids 12 | """ 13 | oh = np.zeros((len(class_ids), num_classes), dtype=np.float32) 14 | oh[np.arange(len(class_ids)), class_ids] = 1 15 | 16 | # make sure one-hot encoding corresponds to class ids 17 | assert (oh.argmax(axis=1) == class_ids).all() 18 | # make sure there is only one id set per vector 19 | assert (oh.sum(axis=1) == 1).all() 20 | 21 | return oh 22 | 23 | 24 | class IntervalAnnotationTarget(object): 25 | 26 | def __init__(self, fps, num_classes): 27 | self.fps = fps 28 | self.num_classes = num_classes 29 | 30 | def _annotations_to_targets(self, annotations): 31 | """ 32 | Class ID of 'no chord' should always be last! 33 | :param annotations: 34 | :return: 35 | """ 36 | raise NotImplementedError('Implement this.') 37 | 38 | def _targets_to_annotations(self, targets): 39 | raise NotImplementedError('Implement this.') 40 | 41 | def _dummy_target(self): 42 | raise NotImplementedError('Implement this.') 43 | 44 | def __call__(self, target_file, num_frames=None): 45 | """ 46 | Creates one-hot encodings from an annotation file. 47 | 48 | :param target_file: file containing time annotations 49 | :param num_frames: number of frames in the audio file. if None, 50 | estimated from the end of the last annotation 51 | :return: one-hot ground truth per frame 52 | """ 53 | ann = np.loadtxt(target_file, 54 | comments=None, 55 | dtype=[('start', np.float), 56 | ('end', np.float), 57 | # assumes chord descriptions are 58 | # shorter than 50 characters 59 | ('label', 'S50')]) 60 | 61 | if num_frames is None: 62 | num_frames = np.ceil(ann['end'][-1] * self.fps) 63 | 64 | # we will add a dummy class at the beginning and at the end, because 65 | # some annotations do not cover the complete audio file: they may 66 | # start late, end early, or be slightly misaligned at the end 67 | targets = np.vstack((self._dummy_target(), 68 | self._annotations_to_targets(ann['label']), 69 | self._dummy_target())) 70 | 71 | # add the times for the dummy events 72 | start = np.hstack(([-np.inf], ann['start'], ann['end'][-1])) 73 | end = np.hstack((ann['start'][0], ann['end'], [np.inf])) 74 | 75 | # next, we have to assign each frame a target. first, compute the 76 | # frame times 77 | frame_times = np.arange(num_frames, dtype=np.float) / self.fps 78 | 79 | # IMPORTANT: round everything to milliseconds to prevent errors caused 80 | # by floating point hell. Ideally, we would round everything to 81 | # possible *frame times*, but it is easier this way.
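# (illustrative) with fps=10, frame 25 lies at 2.5 s; an annotation
# boundary stored as 2.4999999999 s must still compare equal to that
# frame time, which the rounding below ensures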
82 | start = np.round(start, decimals=3) 83 | end = np.round(end, decimals=3) 84 | frame_times = np.round(frame_times, decimals=3) 85 | 86 | target_per_frame = ((start <= frame_times[:, np.newaxis]) & 87 | (frame_times[:, np.newaxis] < end)) 88 | 89 | # make sure each frame is assigned to only one target vector 90 | assert (target_per_frame.sum(axis=1) == 1).all() 91 | 92 | # create the one hot vectors per frame 93 | return targets[np.nonzero(target_per_frame)[1]].astype(np.float32) 94 | 95 | def write_chord_predictions(self, filename, predictions): 96 | with open(filename, 'w') as f: 97 | f.writelines(['{:.3f}\t{:.3f}\t{}\n'.format(*p) 98 | for p in self._targets_to_annotations(predictions)]) 99 | 100 | 101 | class ChordsMajMin(IntervalAnnotationTarget): 102 | 103 | def __init__(self, fps): 104 | # 25 classes - 12 minor, 12 major, one "No Chord" 105 | super(ChordsMajMin, self).__init__(fps, 25) 106 | 107 | @property 108 | def name(self): 109 | return 'chords_majmin_fps={}'.format(self.fps) 110 | 111 | def _dummy_target(self): 112 | dt = np.zeros(self.num_classes, dtype=np.float32) 113 | dt[-1] = 1 114 | return dt 115 | 116 | def _annotations_to_targets(self, labels): 117 | """ 118 | Maps chord annotations to 25 classes (12 major, 12 minor, 1 no chord) 119 | 120 | :param labels: chord labels 121 | :return: one-hot encoding of class id per annotation 122 | """ 123 | # first, create chord/class mapping. root note 'A' has id 0, increasing 124 | # with each semitone. we have duplicate mappings for flat and sharp 125 | # notes, just to be sure. 126 | natural = zip(string.uppercase[:7], [0, 2, 3, 5, 7, 8, 10]) 127 | sharp = map(lambda v: (v[0] + '#', (v[1] + 1) % 12), natural) 128 | flat = map(lambda v: (v[0] + 'b', (v[1] - 1) % 12), natural) 129 | 130 | # 'no chord' is coded as 'N'. The class ID of 'N' is 24, after all 131 | # major and minor chords. Sometimes there is also an 'X' annotation, 132 | # meaning that the chord cannot be properly determined on beat-level 133 | # (too much going on in the audio). We will treat this also as 134 | # 'no chord' 135 | root_note_map = dict(natural + sharp + flat + [('N', 24), ('X', 24)]) 136 | 137 | # then, we load the annotations, map the chords to class ids, and 138 | # finally map class ids to a one-hot encoding. first, map the root 139 | # notes. 140 | chord_root_notes = [c.split(':')[0].split('/')[0] for c in labels] 141 | chord_root_note_ids = np.array([root_note_map[crn] 142 | for crn in chord_root_notes]) 143 | 144 | # then, map the chords to major and minor.
chords with a 145 | # minor third as their first interval are considered minor chords, 146 | # the rest are major chords, following MIREX, as stated in 147 | # Taemin Cho, Juan Bello: "On the relative importance of Individual 148 | # Components of Chord Recognition Systems" 149 | 150 | chord_type = [c.split(':')[1] if ':' in c else '' for c in labels] 151 | 152 | # we will shift the class ids for all minor notes by 12 153 | # (num major chords) 154 | chord_type_shift = np.array( 155 | map(lambda x: 12 if 'min' in x or 'dim' in x else 0, chord_type) 156 | ) 157 | 158 | # now we can compute the final chord class id 159 | return one_hot(chord_root_note_ids + chord_type_shift, 160 | self.num_classes) 161 | 162 | def _targets_to_annotations(self, targets): 163 | natural = zip([0, 2, 3, 5, 7, 8, 10], string.uppercase[:7]) 164 | sharp = map(lambda v: ((v[0] + 1) % 12, v[1] + '#'), natural) 165 | 166 | semitone_to_label = dict(sharp + natural) 167 | 168 | def pred_to_label(pred): 169 | if pred == 24: 170 | return 'N' 171 | return '{}:{}'.format(semitone_to_label[pred % 12], 172 | 'maj' if pred < 12 else 'min') 173 | 174 | spf = 1. / self.fps 175 | labels = [(i * spf, pred_to_label(p)) for i, p in enumerate(targets)] 176 | 177 | # join same consecutive predictions 178 | prev_label = (None, None) 179 | uniq_labels = [] 180 | 181 | for label in labels: 182 | if label[1] != prev_label[1]: 183 | uniq_labels.append(label) 184 | prev_label = label 185 | 186 | # end time of last label is one frame duration after 187 | # the last prediction time 188 | start_times, chord_labels = zip(*uniq_labels) 189 | end_times = start_times[1:] + (labels[-1][0] + spf,) 190 | 191 | return zip(start_times, end_times, chord_labels) 192 | 193 | 194 | class ChordsRoot(IntervalAnnotationTarget): 195 | 196 | def __init__(self, fps): 197 | # 13 classes - 12 semitones and "no chord" 198 | super(ChordsRoot, self).__init__(fps, 13) 199 | 200 | @property 201 | def name(self): 202 | return 'chords_root_fps={}'.format(self.fps) 203 | 204 | def _dummy_target(self): 205 | dt = np.zeros(self.num_classes, dtype=np.float32) 206 | dt[-1] = 1 207 | return dt 208 | 209 | def _annotations_to_targets(self, labels): 210 | """ 211 | Maps chord annotations to 13 classes (12 root tones, 1 no chord) 212 | 213 | :param labels: chord label 214 | :return: class id per annotation 215 | """ 216 | # first, create chord/class mapping. root note 'A' has id 0, increasing 217 | # with each semitone. we have duplicate mappings for flat and sharp 218 | # notes, just to be sure. 219 | natural = zip(string.uppercase[:7], [0, 2, 3, 5, 7, 8, 10]) 220 | sharp = map(lambda v: (v[0] + '#', (v[1] + 1) % 12), natural) 221 | flat = map(lambda v: (v[0] + 'b', (v[1] - 1) % 12), natural) 222 | 223 | # 'no chord' is coded as 'N'. The class ID of 'N' is 12, after all 224 | # root notes. Sometimes there is also an 'X' annotation, 225 | # meaning that the chord cannot be properly determined on beat-level 226 | # (too much going on in the audio). We will treat this also as 227 | # 'no chord' 228 | root_note_map = dict(natural + sharp + flat + [('N', 12), ('X', 12)]) 229 | 230 | # then, we load the annotations, map the chords to class ids, and 231 | # finally map class ids to a one-hot encoding. first, map the root 232 | # notes.
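# (illustrative) e.g. 'Db:maj7/5' -> root note 'Db' -> class id 4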
233 | chord_root_notes = [c.split(':')[0].split('/')[0] for c in labels] 234 | chord_root_note_ids = np.array([root_note_map[crn] 235 | for crn in chord_root_notes]) 236 | 237 | return one_hot(chord_root_note_ids, self.num_classes) 238 | 239 | def _targets_to_annotations(self, targets): 240 | natural = zip([0, 2, 3, 5, 7, 8, 10], string.uppercase[:7]) 241 | sharp = map(lambda v: ((v[0] + 1) % 12, v[1] + '#'), natural) 242 | 243 | semitone_to_label = dict(sharp + natural + [(12, 'N')]) 244 | spf = 1. / self.fps 245 | labels = [(i * spf, semitone_to_label[p]) 246 | for i, p in enumerate(targets)] 247 | 248 | # join same consecutive predictions 249 | prev_label = (None, None) 250 | uniq_labels = [] 251 | 252 | for label in labels: 253 | if label[1] != prev_label[1]: 254 | uniq_labels.append(label) 255 | prev_label = label 256 | 257 | # end time of last label is one frame duration after 258 | # the last prediction time 259 | start_times, chord_labels = zip(*uniq_labels) 260 | end_times = start_times[1:] + (labels[-1][0] + spf,) 261 | 262 | return zip(start_times, end_times, chord_labels) 263 | 264 | 265 | class ChordsMajMinSevenths(IntervalAnnotationTarget): 266 | 267 | def __init__(self, fps): 268 | # 73 classes - maj, 7, maj7, min, min7, minmaj7 with 12 each, 1 no chord 269 | super(ChordsMajMinSevenths, self).__init__(fps, 73) 270 | 271 | @property 272 | def name(self): 273 | return 'chords_majminsevenths_fps={}'.format(self.fps) 274 | 275 | def _dummy_target(self): 276 | dt = np.zeros(self.num_classes, dtype=np.float32) 277 | dt[-1] = 1 278 | return dt 279 | 280 | def _annotations_to_targets(self, labels): 281 | root, semis, _ = mir_eval.chord.encode_many(labels, True) 282 | class_ids = root.copy() 283 | 284 | # 'no chord' is last class 285 | class_ids[class_ids == -1] = self.num_classes - 1 286 | 287 | # minor chords start at idx 36 288 | class_ids[semis[:, 3] == 1] += 36 289 | 290 | # seventh shift 291 | seventh = semis[:, 10] == 1 292 | maj_seventh = semis[:, 11] == 1 293 | 294 | # this weirdness is necessary because of a B:sus4(b7)/7 annotation 295 | # in the RWC corpus... 296 | maj_seventh &= ~seventh 297 | assert (seventh & maj_seventh).sum() == 0 298 | 299 | class_ids[seventh] += 12 300 | class_ids[maj_seventh] += 24 301 | 302 | return one_hot(class_ids, self.num_classes) 303 | 304 | def _targets_to_annotations(self, targets): 305 | natural = zip([0, 2, 3, 5, 7, 8, 10], string.uppercase[:7]) 306 | sharp = map(lambda v: ((v[0] + 1) % 12, v[1] + '#'), natural) 307 | roots = {(a - 3) % 12: b for a, b in dict(sharp + natural).iteritems()} 308 | ext = ['maj', '7', 'maj7', 'min', 'min7', 'minmaj7'] 309 | 310 | def pred_to_label(pred): 311 | if pred == self.num_classes - 1: 312 | return 'N' 313 | 314 | return '{root}:{ext}'.format( 315 | root=roots[pred % 12], 316 | ext=ext[pred / 12] 317 | ) 318 | 319 | spf = 1.
/ self.fps 320 | labels = [(i * spf, pred_to_label(p)) for i, p in enumerate(targets)] 321 | 322 | # join same consecutive predictions 323 | prev_label = (None, None) 324 | uniq_labels = [] 325 | 326 | for label in labels: 327 | if label[1] != prev_label[1]: 328 | uniq_labels.append(label) 329 | prev_label = label 330 | 331 | # end time of last label is one frame duration after 332 | # the last prediction time 333 | start_times, chord_labels = zip(*uniq_labels) 334 | end_times = start_times[1:] + (labels[-1][0] + spf,) 335 | 336 | return zip(start_times, end_times, chord_labels) 337 | 338 | 339 | class ChromaTarget(IntervalAnnotationTarget): 340 | 341 | def __init__(self, fps): 342 | # vector of 12 semitones 343 | super(ChromaTarget, self).__init__(fps, 12) 344 | 345 | @property 346 | def name(self): 347 | return 'chroma_target_fps={}'.format(self.fps) 348 | 349 | def _dummy_target(self): 350 | return mir_eval.chord.NO_CHORD_ENCODED[1] 351 | 352 | def _annotations_to_targets(self, labels): 353 | roots, bitmaps, _ = mir_eval.chord.encode_many(labels) 354 | chromas = mir_eval.chord.rotate_bitmaps_to_roots(bitmaps, roots) 355 | return chromas 356 | 357 | def _targets_to_annotations(self, targets): 358 | raise RuntimeError('Does not work with this target.') 359 | 360 | 361 | def add_sacred_config(ex): 362 | ex.add_named_config( 363 | 'chords_maj_min', 364 | target=dict( 365 | name='ChordsMajMin', 366 | params=dict() 367 | ) 368 | ) 369 | ex.add_named_config( 370 | 'chords_root', 371 | target=dict( 372 | name='ChordsRoot', 373 | params=dict() 374 | ) 375 | ) 376 | ex.add_named_config( 377 | 'chords_maj_min_sevenths', 378 | target=dict( 379 | name='ChordsMajMinSevenths', 380 | params=dict() 381 | ) 382 | ) 383 | 384 | 385 | def create_target(fps, config): 386 | return globals()[config['name']](fps=fps, **config['params']) 387 | 388 | -------------------------------------------------------------------------------- /chordrec/test.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import sys 3 | import os 4 | import numpy as np 5 | 6 | from dmgr.iterators import iterate_batches 7 | from nn.utils import Colors 8 | 9 | 10 | PREDICTION_EXT = '.chords.txt' 11 | 12 | 13 | def compute_labeling(process_fn, target, agg_dataset, dest_dir, use_mask, 14 | batch_size=None, extension='.chords.txt'): 15 | """ 16 | Computes and saves the labels for each datasource in an aggregated 17 | datasource. 18 | :param process_fn: theano function that gives the nn's output 19 | :param target: target computer 20 | :param agg_dataset: aggregated datasource.
21 | :param dest_dir: where to store predicted chord labels 22 | :param use_mask: if the network is an rnn 23 | :param batch_size: Batch size if each datasource is to be processed batch-wise 24 | :param extension: file extension of the resulting files 25 | :return: list of files containing the predictions 26 | """ 27 | if not os.path.exists(dest_dir): 28 | os.makedirs(dest_dir) 29 | else: 30 | if not os.path.isdir(dest_dir): 31 | print(Colors.red('Destination path exists but is not a directory!'), 32 | file=sys.stderr) 33 | return 34 | 35 | pred_files = [] 36 | 37 | for ds_idx in range(agg_dataset.n_datasources): 38 | ds = agg_dataset.datasource(ds_idx) 39 | 40 | pred = [] 41 | for data, _ in iterate_batches(ds, batch_size or ds.n_data, 42 | randomise=False, expand=False): 43 | if use_mask: 44 | data = data[np.newaxis, :] 45 | mask = np.ones(data.shape[:2], dtype=np.float32) 46 | 47 | p = process_fn(data, mask)[0] 48 | else: 49 | p = process_fn(data) 50 | 51 | pred.append(p.argmax(axis=1)) 52 | 53 | pred = np.concatenate(pred) 54 | 55 | pred_file = os.path.join(dest_dir, ds.name + extension) 56 | target.write_chord_predictions(pred_file, pred) 57 | pred_files.append(pred_file) 58 | 59 | return pred_files 60 | 61 | 62 | def compute_scores(annotation_files, prediction_files): 63 | assert len(annotation_files) == len(prediction_files) 64 | assert len(annotation_files) > 0 65 | import mir_eval 66 | 67 | scores = [] 68 | total_length = 0. 69 | 70 | for af, pf in zip(annotation_files, prediction_files): 71 | ann_int, ann_lab = mir_eval.io.load_labeled_intervals(af) 72 | pred_int, pred_lab = mir_eval.io.load_labeled_intervals(pf) 73 | 74 | # we assume that the end-time of the last annotated label is the 75 | # length of the song 76 | song_length = ann_int[-1][1] 77 | total_length += song_length 78 | 79 | scores.append( 80 | (pf, song_length, 81 | mir_eval.chord.evaluate(ann_int, ann_lab, pred_int, pred_lab)) 82 | ) 83 | 84 | return scores, total_length 85 | 86 | 87 | def average_scores(scores, total_length): 88 | # initialise the average score with all metrics and values 0. 89 | avg_score = {metric: 0. for metric in scores[0][-1]} 90 | 91 | for _, length, score in scores: 92 | weight = length / total_length 93 | for metric in score: 94 | avg_score[metric] += float(weight * score[metric]) 95 | 96 | return avg_score 97 | 98 | 99 | def compute_average_scores(annotation_files, prediction_files): 100 | # first, compute all individual scores 101 | scores, total_length = compute_scores(annotation_files, prediction_files) 102 | return average_scores(scores, total_length) 103 | 104 | 105 | def print_scores(scores): 106 | for name, val in scores.iteritems(): 107 | label = '\t{}:'.format(name).ljust(16) 108 | print(label + '{:.3f}'.format(val)) 109 | -------------------------------------------------------------------------------- /experiments/feature_cache/README: -------------------------------------------------------------------------------- 1 | This directory will contain cached features. 
2 | -------------------------------------------------------------------------------- /experiments/ismir2016/chroma.yaml: -------------------------------------------------------------------------------- 1 | augmentation: null 2 | datasource: 3 | cached: true 4 | context_size: 13 5 | datasets: [beatles, zweieck, queen, rwc, robbie_williams] 6 | preprocessors: [] 7 | test_fold: [0, 1, 2, 3, 4, 5, 6, 7] 8 | val_fold: null 9 | feature_extractor: 10 | name: ChromaCq 11 | params: {fps: 9.986413043478262, log_eta: null, win_center: null, win_width: null} 12 | model: 13 | type: 'dnn' 14 | dense: {batch_norm: false, dropout: 0.5, nonlinearity: 'rectify', 15 | num_layers: 0, num_units: 0} 16 | out_nonlinearity: 'softmax' 17 | observations: 'results/chroma_test' 18 | optimiser: 19 | name: 'adam' 20 | params: {learning_rate: 0.001} 21 | schedule: null 22 | regularisation: {l1: 0.0, l2: 0.0001} 23 | # this is the seed of the best achieved result seed: 288170960 24 | target: 25 | name: ChordsMajMin 26 | params: {} 27 | testing: {batch_size: null, test_on_val: false} 28 | training: {batch_size: 512, early_stop: 20, early_stop_acc: true, 29 | iterator: 'BatchIterator', num_epochs: 500} 30 | -------------------------------------------------------------------------------- /experiments/ismir2016/chroma_wlog.yaml: -------------------------------------------------------------------------------- 1 | augmentation: null 2 | datasource: 3 | cached: true 4 | context_size: 15 5 | datasets: [beatles, zweieck, queen, rwc, robbie_williams] 6 | preprocessors: [] 7 | test_fold: [0, 1, 2, 3, 4, 5, 6, 7] 8 | val_fold: null 9 | feature_extractor: 10 | name: ChromaCq 11 | params: {fps: 9.986413043478262, log_eta: 1000, win_center: 60, win_width: 15} 12 | model: 13 | type: 'dnn' 14 | dense: {batch_norm: false, dropout: 0.5, nonlinearity: 'rectify', 15 | num_layers: 0, num_units: 0} 16 | out_nonlinearity: 'softmax' 17 | observations: 'results/chroma_wlog_test' 18 | optimiser: 19 | name: 'adam' 20 | params: {learning_rate: 0.001} 21 | schedule: null 22 | regularisation: {l1: 0.0, l2: 0.0001} 23 | # this is the seed of the best achieved result seed: 906228973 24 | target: 25 | name: ChordsMajMin 26 | params: {} 27 | testing: {batch_size: null, test_on_val: false} 28 | training: {batch_size: 512, early_stop: 20, early_stop_acc: true, 29 | iterator: 'BatchIterator', num_epochs: 500} 30 | -------------------------------------------------------------------------------- /experiments/ismir2016/data: -------------------------------------------------------------------------------- 1 | ../data -------------------------------------------------------------------------------- /experiments/ismir2016/deep_chroma.yaml: -------------------------------------------------------------------------------- 1 | augmentation: null 2 | chroma_network: 3 | model: 4 | type: 'chroma_dnn' 5 | dense: {batch_norm: false, dropout: 0.5, nonlinearity: 'rectify', 6 | num_layers: 3, num_units: 512} 7 | out_nonlinearity: 'sigmoid' 8 | optimiser: 9 | name: 'adam' 10 | params: {learning_rate: 0.0001} 11 | schedule: null 12 | regularisation: {l1: 0.0, l2: 0.0001} 13 | training: {batch_size: 512, early_stop: 20, early_stop_acc: false, 14 | iterator: 'BatchIterator', num_epochs: 500} 15 | datasource: 16 | cached: true 17 | context_size: 7 18 | datasets: [beatles, zweieck, queen, rwc, robbie_williams] 19 | preprocessors: [] 20 | test_fold: [0, 1, 2, 3, 4, 5, 6, 7] 21 | val_fold: null 22 | feature_extractor: 23 | name: LogFiltSpec 24 | params: 25 | fmax: 5500 26 | fmin: 30 
27 | fps: 10 28 | frame_sizes: [8192] 29 | num_bands: 24 30 | unique_filters: false 31 | observations: 'results/deep_chroma_test' 32 | optimiser: 33 | name: 'adam' 34 | params: {learning_rate: 0.001} 35 | schedule: null 36 | # this is the seed of the best achieved result. seed: 13436906 37 | target: 38 | name: ChordsMajMin 39 | params: {} 40 | regularisation: {l1: 0.0, l2: 0.0001} 41 | testing: {batch_size: 512, test_on_val: false} 42 | training: {batch_size: 512, early_stop: 20, early_stop_acc: true, 43 | iterator: 'BatchIterator', num_epochs: 500} 44 | -------------------------------------------------------------------------------- /experiments/ismir2016/feature_cache: -------------------------------------------------------------------------------- 1 | ../feature_cache -------------------------------------------------------------------------------- /experiments/ismir2016/logfiltspec.yaml: -------------------------------------------------------------------------------- 1 | augmentation: null 2 | datasource: 3 | cached: true 4 | context_size: 5 5 | datasets: [beatles, zweieck, queen, rwc, robbie_williams] 6 | preprocessors: [] 7 | test_fold: [0, 1, 2, 3, 4, 5, 6, 7] 8 | val_fold: null 9 | feature_extractor: 10 | name: LogFiltSpec 11 | params: 12 | fmax: 5500 13 | fmin: 30 14 | fps: 10 15 | frame_sizes: [8192] 16 | num_bands: 24 17 | unique_filters: false 18 | model: 19 | type: 'dnn' 20 | dense: {batch_norm: false, dropout: 0.5, nonlinearity: 'rectify', 21 | num_layers: 0, num_units: 0} 22 | out_nonlinearity: 'softmax' 23 | observations: 'results/logfiltspec_test' 24 | optimiser: 25 | name: 'adam' 26 | params: {learning_rate: 0.001} 27 | schedule: null 28 | regularisation: {l1: 0.0, l2: 0.0001} 29 | # this is the seed that achieved the best results seed: 55835954 30 | target: 31 | name: ChordsMajMin 32 | params: {} 33 | testing: {batch_size: null, test_on_val: false} 34 | training: {batch_size: 512, early_stop: 20, early_stop_acc: true, 35 | iterator: 'BatchIterator', num_epochs: 500} 36 | -------------------------------------------------------------------------------- /experiments/ismir2016/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # echo on 4 | set -x 5 | 6 | # number of runs to perform of each experiment 7 | N_RUNS=10 8 | 9 | # run chord classification ... 10 | # ... with deep chroma extractor 11 | for i in `seq $N_RUNS` 12 | do 13 | python -m chordrec.chroma with deep_chroma.yaml 14 | done 15 | 16 | # ... with simple chromas 17 | for i in `seq $N_RUNS` 18 | do 19 | python -m chordrec.classify with chroma.yaml 20 | done 21 | 22 | # ... with weighted, logarithmised chromas 23 | for i in `seq $N_RUNS` 24 | do 25 | python -m chordrec.classify with chroma_wlog.yaml 26 | done 27 | 28 | # ... with logarithmic filtered spectrogram 29 | for i in `seq $N_RUNS` 30 | do 31 | python -m chordrec.classify with logfiltspec.yaml 32 | done 33 | -------------------------------------------------------------------------------- /experiments/madmom2016/README.md: -------------------------------------------------------------------------------- 1 | # Training Models for the `madmom` Audio Processing Framework 2 | 3 | The following text will guide you through the process of training 4 | chord-recognition related models of `madmom`. 5 | 6 | ## Deep Chroma Extractor 7 | 8 | To train the Deep Chroma Extractor model, simply run 9 | 10 | $ python -m chordrec.chroma with deep_chroma.yaml 11 | 12 | and note the experiment ID (``). 
After training has finished, you can 13 | convert the learned model to a madmom-compatible model by running 14 | 15 | $ ./create_madmom_deep_chroma_model.py results//artifacts/params_fold_None.pkl \ 16 | chroma_dnn.pkl 17 | 18 | This will create a file "chroma_dnn.pkl" which contains the madmom 19 | neural network model. 20 | 21 | ## Deep Chroma Chord Recogniser 22 | 23 | Before training the deep chroma chord recogniser, make sure to train the 24 | deep chroma extractor and note its experiment ID. The trained chord recogniser 25 | will work best with this chroma extractor. 26 | 27 | To train the Deep Chroma Chord Recogniser, run 28 | 29 | $ python -m chordrec.classify with crf_chord_rec.yaml \ 30 | feature_extractor.params.name='../../results//artifacts' 31 | 32 | and note the experiment ID (``). Then, convert the learned model 33 | to a madmom-compatible one using 34 | 35 | $ ./create_madmom_crf_model.py results//params_fold_None.pkl \ 36 | chords_dccrf.pkl 37 | 38 | This will create a file "chords_dccrf.pkl" which contains the madmom CRF model 39 | for chord recognition. 40 | 41 | ## ConvNet Chord Recogniser 42 | 43 | The ConvNet Chord Recogniser consists of a) the feature extraction ConvNet 44 | and b) a CRF for decoding the chord sequence. First, you need to train 45 | the ConvNet: 46 | 47 | $ python -m chordrec.classify with chord_feature_convnet.yaml 48 | 49 | Note the experiment id (``). Then, create the parameter 50 | initialisation file for the CRF, 51 | 52 | $ ./create_crf_init_params.py results//artifacts/params_fold_None.pkl \ 53 | crf_init_params.pkl 54 | 55 | and train the CRF for chord sequence decoding: 56 | 57 | $ python -m chordrec.classify with crf_chord_rec.yaml \ 58 | feature_extractor.params.name='../../results//artifacts/features_fold_None' \ 59 | training.init_file='crf_init_params.pkl' 60 | 61 | Also note the corresponding experiment id (``). Then, convert the 62 | learned models to madmom models: 63 | 64 | $ ./create_madmom_convnet_model.py results//artifacts/params_fold_None.pkl \ 65 | chords_cnnfeat.pkl 66 | $ ./create_madmom_crf_model.py results//artifacts/params_fold_None.pkl \ 67 | chords_cnncrf.pkl 68 | 69 | This will create two files (`chords_cnnfeat.pkl` and `chords_cnncrf.pkl`) which 70 | contain the CNN feature extraction model and the CRF chord recognition model 71 | respectively.
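As a quick sanity check of the converted models, the extractor and the CRF can be chained directly in Python. The following is only a sketch (it is not part of this repository): it assumes the file names produced above, a placeholder audio file `some_song.flac`, and that the pickled CRF offers madmom's usual `process` method for decoding.

    import pickle
    from madmom.audio.chroma import DeepChromaProcessor

    # deep chroma extractor using the converted model from above;
    # fmin/fmax/unique_filters mirror deep_chroma.yaml
    dcp = DeepChromaProcessor(fmin=65, fmax=2100, unique_filters=True,
                              models=['chroma_dnn.pkl'])
    with open('chords_dccrf.pkl', 'rb') as f:
        crf = pickle.load(f)

    chroma = dcp('some_song.flac')   # (num_frames, 12) deep chroma vectors
    chord_ids = crf.process(chroma)  # most likely chord class per frame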
-------------------------------------------------------------------------------- /experiments/madmom2016/chord_feature_convnet.yaml: -------------------------------------------------------------------------------- 1 | augmentation: 2 | Detuning: {bins_per_semitone: 2, max_shift: 0.4, p: 1.0} 3 | SemitoneShift: {bins_per_semitone: 2, max_shift: 4, p: 1.0} 4 | datasource: 5 | cached: false 6 | context_size: 11 7 | datasets: [beatles, queen, zweieck, robbie_williams, rwc, billboard] 8 | preprocessors: [] 9 | test_fold: null 10 | val_fold: null 11 | feature_extractor: 12 | name: LogFiltSpec 13 | params: 14 | fmax: 2600 15 | fmin: 60 16 | fps: 10 17 | frame_sizes: [8192] 18 | num_bands: 24 19 | unique_filters: true 20 | model: 21 | conv: 22 | conv1: 23 | batch_norm: true 24 | dropout: 0.5 25 | filter_size: [3, 3] 26 | num_filters: 32 27 | num_layers: 4 28 | pad: valid 29 | pool_size: [1, 2] 30 | conv2: 31 | batch_norm: true 32 | dropout: 0.5 33 | filter_size: [3, 3] 34 | num_filters: 64 35 | num_layers: 2 36 | pad: valid 37 | pool_size: [1, 2] 38 | conv3: 39 | batch_norm: true 40 | dropout: 0.5 41 | filter_size: [9, 12] 42 | num_filters: 128 43 | num_layers: 1 44 | pad: valid 45 | pool_size: null 46 | gap: {batch_norm: true, gap_nonlinearity: linear} 47 | out_nonlinearity: softmax 48 | type: avg_gap_feature 49 | observations: results 50 | optimiser: 51 | name: adam 52 | params: {learning_rate: 0.001} 53 | schedule: null 54 | regularisation: {l1: 0, l2: 1.0e-07} 55 | target: 56 | name: ChordsMajMin 57 | params: {} 58 | testing: {batch_size: 512, test_on_val: false} 59 | training: {batch_size: 512, early_stop: 5, early_stop_acc: true, num_epochs: 500} 60 | -------------------------------------------------------------------------------- /experiments/madmom2016/create_crf_init_params.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pickle 3 | from docopt import docopt 4 | 5 | USAGE = """ 6 | create_crf_init_params.py - creates initial crf parameters from a learned 7 | gap convnet. 8 | 9 | Usage: 10 | create_crf_init_params.py 11 | 12 | Arguments: 13 | pickle file containing the learned convnet parameters 14 | file where the initial crf parameters should be stored 15 | """ 16 | 17 | args = docopt(USAGE) 18 | 19 | params = pickle.load(open(args[''])) 20 | conv, beta, gamma, mean, inv_std = params[-5:] 21 | 22 | c = (beta - mean * gamma * inv_std) 23 | W = (conv.reshape(conv.shape[:2]) * gamma[:, np.newaxis] * 24 | inv_std[:, np.newaxis]).T 25 | pi = np.zeros_like(c) 26 | tau = np.zeros_like(c) 27 | A = np.zeros((len(beta), len(beta))) 28 | 29 | pickle.dump([pi.astype(np.float32), 30 | tau.astype(np.float32), 31 | c.astype(np.float32), 32 | A.astype(np.float32), 33 | W.astype(np.float32)], open(args[''], 'w')) 34 | -------------------------------------------------------------------------------- /experiments/madmom2016/create_madmom_convnet_model.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import madmom as mm 4 | import numpy as np 5 | import pickle 6 | from docopt import docopt 7 | from madmom.ml.nn.layers import (ConvolutionalLayer, 8 | BatchNormLayer, MaxPoolLayer) 9 | from madmom.ml.nn.activations import relu 10 | 11 | USAGE = """ 12 | create_madmom_convnet_model.py - creates madmom convnet models for chord rec. 
13 | 14 | Usage: 15 | create_madmom_convnet_model.py 16 | 17 | Arguments: 18 | source lasagne model file name 19 | destination madmom model file name 20 | """ 21 | 22 | args = docopt(USAGE) 23 | 24 | 25 | def conv_block(p, n_layers): 26 | layers = [] 27 | for i in range(n_layers): 28 | layers.append(ConvolutionalLayer(p[0].transpose(1, 0, 2, 3), 29 | np.array([0]))) 30 | layers.append(BatchNormLayer(*p[1:5], activation_fn=relu)) 31 | del p[:5] 32 | return layers 33 | 34 | p = pickle.load(open(args[''])) 35 | 36 | layers = [] 37 | layers += conv_block(p, 4) 38 | layers.append(MaxPoolLayer((1, 2))) 39 | layers += conv_block(p, 2) 40 | layers.append(MaxPoolLayer((1, 2))) 41 | layers += conv_block(p, 1) 42 | 43 | nn = mm.ml.nn.NeuralNetwork(layers) 44 | nn.dump(args['']) 45 | -------------------------------------------------------------------------------- /experiments/madmom2016/create_madmom_crf_model.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import madmom as mm 4 | import pickle 5 | from docopt import docopt 6 | 7 | USAGE = """ 8 | create_madmom_crf_model.py - creates madmom CRF models. 9 | 10 | Usage: 11 | create_madmom_crf_model.py 12 | 13 | Arguments: 14 | source spaghetti model file 15 | destination madmom model file 16 | """ 17 | 18 | args = docopt(USAGE) 19 | 20 | pi, tau, c, A, W = pickle.load(open(args[''])) 21 | crf = mm.ml.crf.ConditionalRandomField(pi, tau, c, A, W) 22 | pickle.dump(crf, open(args[''], 'wb')) 23 | -------------------------------------------------------------------------------- /experiments/madmom2016/create_madmom_deep_chroma_model.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import madmom as mm 4 | import pickle 5 | from docopt import docopt 6 | 7 | USAGE = """ 8 | create_madmom_deep_chroma_model.py - creates madmom models for the 9 | DeepChromaProcessor.
10 | 11 | Usage: 12 | create_madmom_deep_chroma_model.py 13 | 14 | Arguments: 15 | source lasagne model 16 | destination madmom model 17 | """ 18 | 19 | args = docopt(USAGE) 20 | 21 | p = pickle.load(open(args[''])) 22 | nn = mm.ml.nn.NeuralNetwork([ 23 | mm.ml.nn.layers.FeedForwardLayer( 24 | p[i], p[i+1], 25 | # relu layers, but last layer is sigmoid 26 | mm.ml.nn.activations.relu if i < len(p) - 4 else 27 | mm.ml.nn.activations.sigmoid 28 | ) 29 | for i in range(0, len(p) - 2, 2) 30 | ]) 31 | nn.dump(args['']) 32 | -------------------------------------------------------------------------------- /experiments/madmom2016/crf_chord_rec.yaml: -------------------------------------------------------------------------------- 1 | augmentation: null 2 | datasource: 3 | cached: false 4 | context_size: 0 5 | datasets: [beatles, queen, zweieck, robbie_williams, rwc, billboard] 6 | preprocessors: [] 7 | test_fold: null 8 | val_fold: null 9 | feature_extractor: 10 | name: PrecomputedFeature 11 | params: 12 | fps: 10 13 | name: 'substitute this on the command line according to README.md' 14 | model: {type: crf} 15 | observations: 'results' 16 | optimiser: 17 | name: adam 18 | params: {learning_rate: 0.01} 19 | schedule: null 20 | regularisation: {l1: 0.0001, l2: 0.0} 21 | target: 22 | name: ChordsMajMin 23 | params: {} 24 | testing: {batch_size: null, test_on_val: false} 25 | training: {batch_size: 32, early_stop: 20, early_stop_acc: true, max_seq_len: 1024, 26 | num_epochs: 500} -------------------------------------------------------------------------------- /experiments/madmom2016/deep_chroma.yaml: -------------------------------------------------------------------------------- 1 | augmentation: null 2 | chroma_network: 3 | model: 4 | dense: {batch_norm: false, dropout: 0.5, nonlinearity: rectify, num_layers: 3, 5 | num_units: 256} 6 | out_nonlinearity: sigmoid 7 | type: chroma_dnn 8 | optimiser: 9 | name: adam 10 | params: {learning_rate: 0.0001} 11 | schedule: null 12 | regularisation: {l1: 0.0, l2: 0.0001} 13 | training: {batch_size: 512, early_stop: 20, early_stop_acc: false, iterator: BatchIterator, 14 | num_epochs: 500} 15 | datasource: 16 | cached: true 17 | context_size: 7 18 | datasets: [beatles, zweieck, queen, rwc, robbie_williams, billboard] 19 | preprocessors: [] 20 | test_fold: null 21 | val_fold: null 22 | feature_extractor: 23 | name: LogFiltSpec 24 | params: 25 | fmax: 2100 26 | fmin: 65 27 | fps: 10 28 | frame_sizes: [8192] 29 | num_bands: 24 30 | unique_filters: true 31 | observations: results 32 | optimiser: 33 | name: adam 34 | params: {learning_rate: 0.001} 35 | schedule: null 36 | regularisation: {l1: 0.0, l2: 0.0001} 37 | target: 38 | name: ChordsMajMin 39 | params: {} 40 | testing: {batch_size: 512, test_on_val: false} 41 | training: {batch_size: 512, early_stop: 20, early_stop_acc: true, iterator: BatchIterator, 42 | num_epochs: 500} 43 | -------------------------------------------------------------------------------- /experiments/mlsp2016/README.md: -------------------------------------------------------------------------------- 1 | # Running the experiment for the MLSP 2016 paper 2 | 3 | The experiment consists of two steps. First, we train the feature extraction 4 | CNN. Second, we train the conditional random field that decodes chord 5 | sequences. 6 | 7 | ## CNN feature extractor 8 | 9 | To train the convnet, simply run 10 | 11 | $ python -m chordrec.classify with convnet.yaml 12 | 13 | and note the experiment id (``). 
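The CRF initialisation in the next step folds the ConvNet's batch-normalised linear output stage into the CRF's unary potentials. A rough sketch of the algebra (added for clarity; `W` stands for the output weights and `gamma`, `beta`, `mean`, `inv_std` for the batch-norm parameters):

    # batch norm applied to a linear map,
    #   y = gamma * (W.dot(x) - mean) * inv_std + beta,
    # is itself linear, so it folds into y = W_folded.dot(x) + c with:
    W_folded = W * (gamma * inv_std)[:, None]
    c = beta - mean * gamma * inv_std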
14 | 15 | ## CRF chord decoder 16 | 17 | First, create the CRF parameter initialisation files for each fold. We 18 | will save those into a subdirectory `crf_init_params`: 19 | 20 | $ ./create_crf_init_params.py results//artifacts crf_init_params 21 | 22 | and train the CRF for chord sequence decoding: 23 | 24 | $ python -m chordrec.classify with crf.yaml \ 25 | feature_extractor.params.name='../../results//artifacts/features_fold_{fold}' \ 26 | training.init_file='crf_init_params/crf_init_params_{}.pkl' 27 | -------------------------------------------------------------------------------- /experiments/mlsp2016/convnet.yaml: -------------------------------------------------------------------------------- 1 | augmentation: 2 | Detuning: {bins_per_semitone: 2, max_shift: 0.4, p: 1.0} 3 | SemitoneShift: {bins_per_semitone: 2, max_shift: 4, p: 1.0} 4 | datasource: 5 | cached: true 6 | context_size: 7 7 | datasets: [beatles, queen, zweieck, robbie_williams, rwc] 8 | preprocessors: [] 9 | test_fold: [0, 1, 2, 3, 4, 5, 6, 7] 10 | val_fold: null 11 | feature_extractor: 12 | name: LogFiltSpec 13 | params: 14 | fmax: 2100 15 | fmin: 65 16 | fps: 10 17 | frame_sizes: [8192] 18 | num_bands: 24 19 | unique_filters: true 20 | model: 21 | conv: 22 | conv1: 23 | batch_norm: true 24 | dropout: 0.5 25 | filter_size: [3, 3] 26 | num_filters: 32 27 | num_layers: 4 28 | pad: same 29 | pool_size: [1, 2] 30 | conv2: 31 | batch_norm: true 32 | dropout: 0.5 33 | filter_size: [3, 3] 34 | num_filters: 64 35 | num_layers: 2 36 | pad: valid 37 | pool_size: [1, 2] 38 | conv3: 39 | batch_norm: true 40 | dropout: 0.5 41 | filter_size: [9, 12] 42 | num_filters: 128 43 | num_layers: 1 44 | pad: valid 45 | pool_size: null 46 | out_nonlinearity: softmax 47 | type: dnn 48 | observations: 'results' 49 | optimiser: 50 | name: adam 51 | params: {learning_rate: 0.001} 52 | schedule: null 53 | regularisation: {l1: 0, l2: 1.0e-07} 54 | target: 55 | name: ChordsMajMin 56 | params: {} 57 | testing: {batch_size: 512, test_on_val: false} 58 | training: {batch_size: 512, early_stop: 5, early_stop_acc: true, num_epochs: 500} 59 | -------------------------------------------------------------------------------- /experiments/mlsp2016/create_crf_init_params.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pickle 3 | import os 4 | from docopt import docopt 5 | from glob import glob 6 | from os.path import join, exists 7 | 8 | 9 | USAGE = """ 10 | create_crf_init_params.py - creates initial crf parameters from a learned 11 | gap convnet. 
12 | 13 | Usage: 14 | create_crf_init_params.py <param_dir> <dst_dir> 15 | 16 | Arguments: 17 | <param_dir>  directory containing the CNN parameter files for each fold 18 | <dst_dir>  directory where to store the initial CRF parameters 19 | """ 20 | 21 | args = docopt(USAGE) 22 | param_files = sorted(glob(join(args['<param_dir>'], 'params*.pkl')))  # sort so fold indices line up with the file names 23 | 24 | if not exists(args['<dst_dir>']): 25 | os.makedirs(args['<dst_dir>']) 26 | 27 | for fold, pfile in enumerate(param_files): 28 | params = pickle.load(open(pfile)) 29 | conv, beta, gamma, mean, inv_std = params[-5:]  # final 1x1 conv weights and batch-norm parameters 30 | 31 | c = (beta - mean * gamma * inv_std)  # fold batch norm into the bias ... 32 | W = (conv.reshape(conv.shape[:2]) * gamma[:, np.newaxis] * 33 | inv_std[:, np.newaxis]).T  # ... and into the observation weights 34 | pi = np.zeros_like(c)  # initial potentials 35 | tau = np.zeros_like(c)  # final potentials 36 | A = np.zeros((len(beta), len(beta)))  # transition potentials 37 | 38 | dst_file = join(args['<dst_dir>'], 'crf_init_params_{}.pkl'.format(fold)) 39 | 40 | pickle.dump([pi.astype(np.float32), 41 | tau.astype(np.float32), 42 | c.astype(np.float32), 43 | A.astype(np.float32), 44 | W.astype(np.float32)], open(dst_file, 'w')) 45 | -------------------------------------------------------------------------------- /experiments/mlsp2016/crf.yaml: -------------------------------------------------------------------------------- 1 | augmentation: null 2 | datasource: 3 | cached: true 4 | context_size: 0 5 | datasets: [beatles, queen, zweieck, robbie_williams, rwc] 6 | preprocessors: [] 7 | test_fold: null 8 | val_fold: null 9 | feature_extractor: 10 | name: PrecomputedFeature 11 | params: 12 | fps: 10 13 | name: 'substitute this on the command line according to README.md' 14 | model: {type: crf} 15 | observations: 'results' 16 | optimiser: 17 | name: adam 18 | params: {learning_rate: 0.01} 19 | schedule: null 20 | regularisation: {l1: 0.0001, l2: 0.0} 21 | target: 22 | name: ChordsMajMin 23 | params: {} 24 | testing: {batch_size: null, test_on_val: false} 25 | training: {batch_size: 32, early_stop: 20, early_stop_acc: true, max_seq_len: 1024, 26 | num_epochs: 500} -------------------------------------------------------------------------------- /experiments/mlsp2016/feature_cache: -------------------------------------------------------------------------------- 1 | ../feature_cache -------------------------------------------------------------------------------- /experiments/mlsp2016/to_madmom_crf.py: -------------------------------------------------------------------------------- 1 | import madmom as mm 2 | import pickle 3 | from glob import glob 4 | from docopt import docopt 5 | from os.path import join 6 | 7 | USAGE = """ 8 | to_madmom_crf.py - creates a madmom CRF that predicts chords from deep chroma 9 | vectors. 10 | 11 | Usage: 12 | to_madmom_crf.py <param_dir> [<dst_fmt>] 13 | 14 | Arguments: 15 | <param_dir>  directory containing the parameter files 16 | <dst_fmt>  name format for destination files.
'{}' will be replaced 17 | with the model number [default: crf_dc_{}.pkl] 18 | """ 19 | 20 | args = docopt(USAGE) 21 | 22 | args['<dst_fmt>'] = args['<dst_fmt>'] or 'crf_dc_{}.pkl' 23 | 24 | param_files = sorted(glob(join(args['<param_dir>'], 'params*.pkl')))  # sort so model numbers are deterministic 25 | 26 | for nid, f in enumerate(param_files): 27 | p = pickle.load(open(f)) 28 | crf = mm.ml.crf.ConditionalRandomField( 29 | initial=p[0], final=p[1], bias=p[2], transition=p[3], observation=p[4] 30 | ) 31 | crf.dump(args['<dst_fmt>'].format(nid + 1)) 32 | -------------------------------------------------------------------------------- /tools/evaluate.py: -------------------------------------------------------------------------------- 1 | import os 2 | import fnmatch 3 | from docopt import docopt 4 | 5 | import dmgr 6 | 7 | from chordrec import test 8 | 9 | 10 | USAGE = """ 11 | evaluate.py 12 | 13 | Usage: 14 | evaluate.py [-i IND_RES_FILE] [-o TOT_RES_FILE] FILES... 15 | 16 | Arguments: 17 | FILES annotation or prediction files 18 | 19 | Options: 20 | -i IND_RES_FILE file where to store individual results 21 | -o TOT_RES_FILE file where to store total results 22 | """ 23 | 24 | 25 | def main(): 26 | args = docopt(USAGE) 27 | 28 | ann_files = fnmatch.filter(args['FILES'], '*.chords') 29 | 30 | pred_files = dmgr.files.match_files( 31 | ann_files, '.chords', 32 | args['FILES'], '.chords.txt' 33 | ) 34 | 35 | test.print_scores(test.compute_average_scores(ann_files, pred_files)) 36 | 37 | 38 | if __name__ == "__main__": 39 | main() 40 | -------------------------------------------------------------------------------- /tools/extract_perfect_chroma.py: -------------------------------------------------------------------------------- 1 | """ 2 | extract_perfect_chroma.py 3 | 4 | Computes "perfect" chroma vectors based on the ground truth chord 5 | annotations of a file. 6 | 7 | Usage: 8 | extract_perfect_chroma.py [options] <fps> <dirs>... 9 | 10 | Arguments: 11 | <fps>  frames per second 12 | <dirs>  directories containing ground truth and audio files. 13 | audio files are needed for song length 14 | 15 | Options: 16 | -o=<dst_dir>  where to put the resulting chromas 17 | [default: ./feature_cache] 18 | """ 19 | 20 | from os.path import splitext, basename, join 21 | import numpy as np 22 | from itertools import chain, izip 23 | from docopt import docopt 24 | import madmom as mm 25 | import mir_eval 26 | 27 | from dmgr.files import find, match_files 28 | 29 | 30 | def to_chroma(intervals, labels, num_frames, fps): 31 | roots, bitmaps, _ = mir_eval.chord.encode_many(labels) 32 | chromas = mir_eval.chord.rotate_bitmaps_to_roots(bitmaps, roots) 33 | starts = intervals[:, 0] 34 | ends = intervals[:, 1] 35 | 36 | # add dummy events 37 | starts = np.hstack(([-np.inf], starts, ends[-1])) 38 | ends = np.hstack((starts[1], ends, [np.inf])) 39 | chromas = np.vstack((np.zeros(12), chromas, np.zeros(12))) 40 | 41 | # Finally, we create the chroma vectors per frame! 42 | frame_times = np.arange(num_frames, dtype=np.float) / fps 43 | 44 | # IMPORTANT: round everything to milliseconds to prevent errors caused 45 | # by floating point hell. Ideally, we would round everything to 46 | # possible *frame times*, but it is easier this way.
47 | starts = np.round(starts, decimals=3) 48 | ends = np.round(ends, decimals=3) 49 | frame_times = np.round(frame_times, decimals=3) 50 | 51 | target_per_frame = ((starts <= frame_times[:, np.newaxis]) & 52 | (frame_times[:, np.newaxis] < ends)) 53 | 54 | # make sure each frame is assigned to only one target vector 55 | assert (target_per_frame.sum(axis=1) == 1).all() 56 | 57 | # create the one hot vectors per frame 58 | return chromas[np.nonzero(target_per_frame)[1]].astype(np.float32) 59 | 60 | 61 | def main(): 62 | args = docopt(__doc__) 63 | 64 | chord_files = list(chain.from_iterable( 65 | find(d, '*.chords') for d in args['<dirs>'])) 66 | audio_files = list(chain.from_iterable( 67 | find(d, '*.flac') for d in args['<dirs>'])) 68 | 69 | if len(chord_files) != len(audio_files): 70 | print 'ERROR: {} chord files, but {} audio files'.format( 71 | len(chord_files), len(audio_files)) 72 | 73 | audio_files = match_files(chord_files, audio_files, '.chords', '.flac') 74 | 75 | for cf, af in izip(chord_files, audio_files): 76 | sig = mm.audio.signal.FramedSignal(af, fps=float(args['<fps>'])) 77 | intervals, labels = mir_eval.io.load_labeled_intervals(cf) 78 | 79 | chromas = to_chroma(intervals, labels, sig.num_frames, 80 | float(args['<fps>'])) 81 | 82 | chroma_file = splitext(basename(cf))[0] + '.features.npy' 83 | np.save(join(args['-o'], chroma_file), chromas) 84 | 85 | 86 | if __name__ == '__main__': 87 | main() 88 | -------------------------------------------------------------------------------- /tools/post_process.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import dmgr 3 | from chordrec import test  # tools/ has no test module; import from the chordrec package 4 | import os 5 | import shutil 6 | import fnmatch 7 | import scipy.stats 8 | from chordrec.targets import ChordsMajMin 9 | from docopt import docopt 10 | from chordrec.experiment import TempDir 11 | from itertools import tee, izip 12 | 13 | USAGE = """ 14 | Post-processes chord prediction files. 15 | 16 | Usage: 17 | post_process.py [options] <files>...
18 | 19 | Options: 20 | --fps=<fps>  work with this number of frames per second [default: 10] 21 | --win_length=<win_length>  length in seconds of the post-processing filter 22 | [default: 1.0] 23 | --beats  use beat-based majority vote 24 | --out_dir=<out_dir>  where to put the post-processed results 25 | """ 26 | 27 | 28 | def pairwise(iterable): 29 | a, b = tee(iterable) 30 | next(b, None) 31 | return izip(a, b) 32 | 33 | 34 | def majority_vote(targets, win_size): 35 | context_size = (win_size - 1) / 2 36 | t_wins = dmgr.datasources.segment_axis(targets, frame_size=win_size) 37 | middle = scipy.stats.mode(t_wins, axis=1)[0][:, 0] 38 | start = np.hstack([scipy.stats.mode(targets[:i + 1])[0] 39 | for i in range(context_size)]) 40 | end = np.hstack([scipy.stats.mode(targets[i:])[0] 41 | for i in range(-context_size, 0)]) 42 | return np.hstack((start, middle, end)) 43 | 44 | 45 | def majority_vote_beats(targets, beats): 46 | if len(beats) == 0: 47 | return targets 48 | pp_targets = np.zeros_like(targets) 49 | beats = np.concatenate(([0], beats, [None])) 50 | for start, end in pairwise(beats): 51 | pp_targets[start:end] = scipy.stats.mode(targets[start:end])[0] 52 | return pp_targets 53 | 54 | 55 | def main(): 56 | args = docopt(USAGE) 57 | 58 | fps = float(args['--fps']) 59 | win_size = int(float(args['--win_length']) * fps) 60 | if win_size % 2 == 0: 61 | win_size += 1  # make sure the window is centred on a frame 62 | 63 | out_dir = args['--out_dir'] 64 | 65 | files = args['<files>'] 66 | ann_files = fnmatch.filter(files, '*.chords') 67 | pred_files = dmgr.files.match_files(ann_files, '.chords', 68 | files, '.chords.txt') 69 | 70 | if args['--beats']: 71 | beat_files = dmgr.files.match_files(ann_files, files, 72 | '.chords', '.beats') 73 | else: 74 | beat_files = None 75 | 76 | pre_filter_scores = test.compute_average_scores(ann_files, pred_files) 77 | print "Pre-Filter scores:" 78 | test.print_scores(pre_filter_scores) 79 | 80 | with TempDir() as tmpdir: 81 | target = ChordsMajMin(fps) 82 | pp_pred_files = [] 83 | for i, pf in enumerate(pred_files): 84 | name = os.path.basename(pf) 85 | targets = target(pf).argmax(axis=1) 86 | 87 | if not args['--beats']: 88 | pp_targets = majority_vote(targets, win_size) 89 | else: 90 | beats = np.round(np.loadtxt(beat_files[i], usecols=[0]) * fps).astype(int)  # beat times -> integer frame indices 91 | pp_targets = majority_vote_beats(targets, beats) 92 | 93 | target.write_chord_predictions( 94 | os.path.join(tmpdir, name), 95 | pp_targets 96 | ) 97 | pp_pred_files.append(os.path.join(tmpdir, name)) 98 | 99 | post_filter_scores = test.compute_average_scores(ann_files, 100 | pp_pred_files) 101 | print "Post-Filter scores:" 102 | test.print_scores(post_filter_scores) 103 | 104 | if out_dir is not None: 105 | if not os.path.exists(out_dir): 106 | os.makedirs(out_dir) 107 | for f in pp_pred_files: 108 | shutil.move(f, os.path.join(out_dir, os.path.basename(f))) 109 | 110 | 111 | if __name__ == '__main__': 112 | main() 113 | --------------------------------------------------------------------------------
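A closing note on `create_crf_init_params.py` above: it initialises the CRF's observation model by folding the convnet's final 1x1 convolution and its batch normalisation into a single affine map, with weights `W' = diag(gamma * inv_std) W` and bias `c = beta - mean * gamma * inv_std`. The following minimal NumPy sketch verifies that identity; all shapes and variable names are illustrative and not taken from the repository:

    import numpy as np

    rng = np.random.RandomState(0)
    num_feat, num_classes = 128, 25  # illustrative sizes only

    # folded 1x1-conv weights (one row per class) and a feature vector
    W = rng.randn(num_classes, num_feat)
    x = rng.randn(num_feat)

    # batch-norm statistics and learned parameters
    mean = rng.randn(num_classes)
    inv_std = rng.rand(num_classes) + 0.5
    gamma = rng.randn(num_classes)
    beta = rng.randn(num_classes)

    # batch normalisation applied to the conv output ...
    bn_out = gamma * (W.dot(x) - mean) * inv_std + beta

    # ... equals a single affine map, as used for the CRF observations
    W_folded = W * (gamma * inv_std)[:, np.newaxis]
    c = beta - mean * gamma * inv_std
    assert np.allclose(bn_out, W_folded.dot(x) + c)

For reference, `tools/post_process.py` and `tools/evaluate.py` both take a mixed list of annotation (`.chords`) and prediction (`.chords.txt`) files, so a typical invocation might look like the following (all paths are hypothetical):

    $ python tools/post_process.py --win_length=1.5 --out_dir=pp_results \
          data/beatles/*.chords results/*.chords.txt
    $ python tools/evaluate.py data/beatles/*.chords pp_results/*.chords.txt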