├── README.md ├── utils.py ├── experiments.cfg ├── focalloss.py ├── .gitignore ├── default_config.cfg ├── config.py ├── test_model.py ├── modules.py ├── training.py ├── dataloader.py ├── decoder.py ├── data └── language_models │ ├── lnn_tri.lm │ └── lnn_bi.lm ├── run_experiment.py ├── model.py └── tests.ipynb /README.md: -------------------------------------------------------------------------------- 1 | # AccentedSpeechRecognition 2 | Experiments on speech recognition robustness to accents and dialects. 3 | 4 | Part of the code was borrowed from https://github.com/SeanNaren/deepspeech.pytorch; please follow their README for setup. 5 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import time 4 | 5 | def tile(a, dim, n_tile): 6 | """Expands a tensor along a given dimension by repeating its components.""" 7 | init_dim = a.size(dim) 8 | repeat_idx = [1] * a.dim() 9 | repeat_idx[dim] = n_tile 10 | a = a.repeat(*(repeat_idx)) 11 | order_index = torch.LongTensor(np.concatenate([init_dim * np.arange(n_tile) + i for i in range(init_dim)])) 12 | if a.is_cuda: 13 | order_index = order_index.cuda() 14 | return torch.index_select(a, dim, order_index) 15 | 16 | def now_str(): 17 | return time.strftime("%d-%m-%Y_%Hh%Mm%S") -------------------------------------------------------------------------------- /experiments.cfg: -------------------------------------------------------------------------------- 1 | # List of experiment settings to override default_config.cfg 2 | # Use '#' for comments and '!' to separate experiments 3 | 4 | 5 | 6 | # general 7 | exp_name_prefix 'TestMulti' 8 | 9 | # hyper params 10 | nb_head_layers 4 11 | nb_speech_layers 1 12 | nb_accents_layers 1 13 | 14 | embedding_size 256 15 | 16 | # network config 17 | use_mfcc_in True 18 | use_ivectors_in False 19 | use_embeddings_in False 20 | use_transcripts_out True 21 | use_accents_out True 22 | 23 | !
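# Note: the '!' line above closes the first experiment patch; the lines that follow form a second one.
# A patch only lists the "name value" pairs that differ from default_config.cfg. Values are parsed with
# Python's eval() (see config.py), so strings need quotes and booleans are written True/False.
# Config.patch_config() expands every patch into a full configuration, roughly:
#     for conf in Config().patch_config('./experiments.cfg'):
#         run_one_experiment(conf)   # hypothetical driver loop; run_experiment.py presumably does something similar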
24 | 25 | 26 | # general 27 | exp_name_prefix 'TestMulti' 28 | 29 | # hyper params 30 | nb_head_layers 4 31 | nb_speech_layers 1 32 | nb_accents_layers 1 33 | 34 | embedding_size 256 35 | 36 | # network config 37 | use_mfcc_in True 38 | use_ivectors_in False 39 | use_embeddings_in True 40 | use_transcripts_out True -------------------------------------------------------------------------------- /focalloss.py: -------------------------------------------------------------------------------- 1 | # Code taken from https://github.com/clcarwin/focal_loss_pytorch 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | from torch.autograd import Variable 7 | 8 | class FocalLoss(nn.Module): 9 | def __init__(self, gamma=0, alpha=None, size_average=True): 10 | super(FocalLoss, self).__init__() 11 | self.gamma = gamma 12 | self.alpha = alpha 13 | if isinstance(alpha,(float,int)): self.alpha = torch.Tensor([alpha,1-alpha]) 14 | if isinstance(alpha,list): self.alpha = torch.Tensor(alpha) 15 | self.size_average = size_average 16 | 17 | def forward(self, input, target): 18 | if input.dim()>2: 19 | input = input.view(input.size(0),input.size(1),-1) # N,C,H,W => N,C,H*W 20 | input = input.transpose(1,2) # N,C,H*W => N,H*W,C 21 | input = input.contiguous().view(-1,input.size(2)) # N,H*W,C => N*H*W,C 22 | target = target.view(-1,1) 23 | 24 | logpt = F.log_softmax(input, dim=1) # log-softmax over the class dimension 25 | logpt = logpt.gather(1,target) 26 | logpt = logpt.view(-1) 27 | pt = Variable(logpt.data.exp()) 28 | 29 | if self.alpha is not None: 30 | if self.alpha.type()!=input.data.type(): 31 | self.alpha = self.alpha.type_as(input.data) 32 | at = self.alpha.gather(0,target.data.view(-1)) 33 | logpt = logpt * Variable(at) 34 | 35 | loss = -1 * (1-pt)**self.gamma * logpt 36 | if self.size_average: return loss.mean() 37 | else: return loss.sum() -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | bak/ 3 | log.txt 4 | *.wav 5 | mfccs/ 6 | embeddings*/ 7 | *_dataset/ 8 | txt/ 9 | wav/ 10 | ivectors/ 11 | saved_models/ 12 | tensorboard_runs/ 13 | 14 | # Byte-compiled / optimized / DLL files 15 | __pycache__/ 16 | *.py[cod] 17 | *$py.class 18 | 19 | # C extensions 20 | *.so 21 | 22 | # Distribution / packaging 23 | .Python 24 | build/ 25 | develop-eggs/ 26 | dist/ 27 | downloads/ 28 | eggs/ 29 | .eggs/ 30 | lib/ 31 | lib64/ 32 | parts/ 33 | sdist/ 34 | var/ 35 | wheels/ 36 | *.egg-info/ 37 | .installed.cfg 38 | *.egg 39 | MANIFEST 40 | 41 | # PyInstaller 42 | # Usually these files are written by a python script from a template 43 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
44 | *.manifest 45 | *.spec 46 | 47 | # Installer logs 48 | pip-log.txt 49 | pip-delete-this-directory.txt 50 | 51 | # Unit test / coverage reports 52 | htmlcov/ 53 | .tox/ 54 | .coverage 55 | .coverage.* 56 | .cache 57 | nosetests.xml 58 | coverage.xml 59 | *.cover 60 | .hypothesis/ 61 | .pytest_cache/ 62 | 63 | # Translations 64 | *.mo 65 | *.pot 66 | 67 | # Django stuff: 68 | *.log 69 | local_settings.py 70 | db.sqlite3 71 | 72 | # Flask stuff: 73 | instance/ 74 | .webassets-cache 75 | 76 | # Scrapy stuff: 77 | .scrapy 78 | 79 | # Sphinx documentation 80 | docs/_build/ 81 | 82 | # PyBuilder 83 | target/ 84 | 85 | # Jupyter Notebook 86 | .ipynb_checkpoints 87 | 88 | # pyenv 89 | .python-version 90 | 91 | # celery beat schedule file 92 | celerybeat-schedule 93 | 94 | # SageMath parsed files 95 | *.sage.py 96 | 97 | # Environments 98 | .env 99 | .venv 100 | env/ 101 | venv/ 102 | ENV/ 103 | env.bak/ 104 | venv.bak/ 105 | 106 | # Spyder project settings 107 | .spyderproject 108 | .spyproject 109 | 110 | # Rope project settings 111 | .ropeproject 112 | 113 | # mkdocs documentation 114 | /site 115 | 116 | # mypy 117 | .mypy_cache/ 118 | -------------------------------------------------------------------------------- /default_config.cfg: -------------------------------------------------------------------------------- 1 | # configuration, separate name and values (can be multiple) with 2 | # if multiple values exists for a field, multiple experiments will be run 3 | # (see config.py: Config.create_multi_dict()) 4 | 5 | # general 6 | exp_name_prefix '' 7 | epochs 30 8 | labels "_'ABCDEFGHIJKLMNOPQRSTUVWXYZ " 9 | batch_size 40 10 | num_workers 4 11 | cuda True 12 | 13 | # hyper params 14 | losses_mix 0.9 15 | learning_rate 3e-4 16 | mfcc_size 40 17 | ivector_size 100 18 | embedding_size 100 19 | rnn_type nn.GRU 20 | rnn_hidden_size 800 21 | nb_head_layers 3 22 | nb_speech_layers 1 23 | nb_accents_layers 1 24 | bidirectional True 25 | bottleneck_size 256 26 | accent_loss 'focal' 27 | 28 | # network config 29 | use_mfcc_in True 30 | use_ivectors_in False 31 | use_embeddings_in False 32 | use_transcripts_out True 33 | use_accents_out False 34 | 35 | # decoder 36 | decoder_alpha 0.8 37 | decoder_beta 1. 38 | decoder_cutoff_top_n 40 39 | decoder_cutoff_prob 1. 
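# Note: decoder_alpha and decoder_beta weight the language model score and the word-insertion bonus in the
# beam search, while cutoff_top_n, cutoff_prob and beam_width control how aggressively candidates are pruned.
# They are handed straight to ctcdecode's CTCBeamDecoder (see decoder.py and test_model.py).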
40 | decoder_beam_width 100 41 | 42 | # paths 43 | lm_path './data/language_models/cv.lm' 44 | train_manifest './data/CommonVoice_dataset/splits/train.csv' 45 | dev_manifest './data/CommonVoice_dataset/splits/dev.csv' 46 | test_manifest './data/CommonVoice_dataset/splits/test.csv' 47 | tensorboard_path './tensorboard_runs/' 48 | saved_models_path './saved_models/' 49 | 50 | # tests 51 | testing_manifests [('./data/CommonVoice_dataset/splits/test.csv', './data/language_models/cv.lm'), ('./data/CommonVoice_dataset/splits/dev.csv', './data/language_models/cv.lm'), ('./data/CommonVoice_dataset/splits/testnz.csv', './data/language_models/cv.lm'), ('./data/CommonVoice_dataset/splits/testin.csv', './data/language_models/cv.lm'), ('./data/Logi_dataset/splits/nonnative.csv', './data/language_models/lnn_tri.lm'), ('./data/Logi_dataset/splits/native.csv', './data/language_models/lnn_tri.lm')] 52 | #testing_manifests [('./data/Logi_dataset/splits/nonnative.csv', './data/language_models/lnn_bi.lm'), ('./data/Logi_dataset/splits/native.csv', './data/language_models/lnn_bi.lm')] 53 | 54 | -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import torch.nn as nn 3 | 4 | class Config(collections.MutableMapping): 5 | """A dictionary that applies an arbitrary key-altering 6 | function before accessing the keys""" 7 | 8 | def __init__(self, config_path='./default_config.cfg', sep=' ', *args, **kwargs): 9 | self.store = dict() 10 | self.update(dict(*args, **kwargs)) # use the free update to set keys 11 | 12 | with open(config_path, 'r') as f: 13 | confs = {} 14 | for l in f.readlines(): 15 | if (l[0] is not '#') and (l[0] is not '\n'): # remove comments and empty lines 16 | sep_idx = l.find(sep) 17 | confs[l[:sep_idx]] = eval(l[sep_idx+1:]) 18 | self.update(confs) 19 | 20 | def __getitem__(self, key): 21 | return self.store[self.__keytransform__(key)] 22 | 23 | def __setitem__(self, key, value): 24 | self.store[self.__keytransform__(key)] = value 25 | 26 | def __delitem__(self, key): 27 | del self.store[self.__keytransform__(key)] 28 | 29 | def __iter__(self): 30 | return iter(self.store) 31 | 32 | def __len__(self): 33 | return len(self.store) 34 | 35 | def __keytransform__(self, key): 36 | return key 37 | 38 | def __str__(self): 39 | return self.store.__str__() 40 | 41 | def __repr__(self): 42 | return self.store.__repr__() 43 | 44 | 45 | # def create_multi_dict(self): 46 | # """ Not recomended, please use the patch_config method instead 47 | # """ 48 | # """ Used to create as much configuration needed to run experiments with 49 | # all the possible combinations of values in the conf file.""" 50 | # prev_configs = [{}] 51 | # for key, vals in self.store.items(): 52 | # new_configs = [] 53 | # for v in vals: 54 | # for conf in prev_configs: 55 | # new_conf = {} 56 | # new_conf.update(conf) 57 | # new_configs.append(new_conf) 58 | # new_conf[key] = v 59 | # 60 | # prev_configs = new_configs 61 | # 62 | # return new_configs 63 | 64 | def patch_config(self, patch_path, patch_sep='!', sep=' '): 65 | """Takes a file with config patches separated by a line 66 | starting with the 'patch_sep' argument. 
67 | For each creates a new config based on the default one.""" 68 | 69 | new_configs = [] 70 | 71 | with open(patch_path, 'r') as f: 72 | current = {} 73 | for l in f.readlines(): 74 | if (l[0] is not '#') and (l[0] is not '\n'): 75 | if (l[0] is '!'): 76 | new_configs.append(current) 77 | current = {} 78 | else: 79 | sep_idx = l.find(sep) 80 | current[l[:sep_idx]] = eval(l[sep_idx+1:]) 81 | 82 | # Checks if last patch was added 83 | if len(current) > 0: 84 | new_configs.append(current) 85 | 86 | final_configs = [self.store.copy() for __ in range(len(new_configs))] 87 | [store.update(conf) for conf, store in zip(new_configs, final_configs)] 88 | 89 | return final_configs if len(final_configs) > 0 else self.store 90 | -------------------------------------------------------------------------------- /test_model.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from model import MultiTask 3 | from training import test 4 | from dataloader import MultiDataset, MultiDataLoader 5 | import torch.nn as nn 6 | import torch 7 | from focalloss import FocalLoss 8 | from warpctc_pytorch import CTCLoss 9 | from decoder import GreedyDecoder, BeamCTCDecoder 10 | import sys 11 | import sys 12 | from pathlib import Path 13 | 14 | PRINT_LATEX_TABLE = True 15 | 16 | manual_seed = 666 17 | torch.manual_seed(manual_seed) 18 | torch.cuda.manual_seed_all 19 | print(f'Using torch manual seed {manual_seed}.') 20 | 21 | def eprint(*args, **kwargs): 22 | print(*args, file=sys.stderr, **kwargs) 23 | 24 | 25 | def result_for_manifest(model, criterion, manifest, decoder, target_decoder, batch_size, num_workers): 26 | ### LOADER 27 | test_dataset = MultiDataset(manifest, 28 | model._meta['labels'], 29 | use_mfcc_in=model._meta['use_mfcc_in'], 30 | use_ivectors_in=model._meta['use_ivectors_in'], 31 | use_embeddings_in=model._meta['use_embeddings_in'], 32 | embedding_size=model._meta['embedding_size'], 33 | use_transcripts_out=model._meta['use_transcripts_out'], 34 | use_accents_out=model._meta['use_accents_out']) 35 | 36 | test_loader = MultiDataLoader(test_dataset, 37 | batch_size=batch_size, 38 | shuffle=True, 39 | num_workers=num_workers) 40 | 41 | ### TEST 42 | test_results = test(model, test_loader, criterion, decoder, target_decoder) 43 | test_loss, test_loss_text, test_loss_accent, test_wer, test_accent_acc = test_results 44 | 45 | results_dict = {} 46 | 47 | if test_wer != -1: 48 | results_dict['WER'] = test_wer 49 | if test_accent_acc != -1: 50 | results_dict['Accent accuracy'] = test_accent_acc 51 | 52 | return results_dict 53 | 54 | 55 | def main(model_path, confs): 56 | model, __ = MultiTask.load_model(model_path) 57 | if confs['cuda']: 58 | model = model.cuda() 59 | 60 | 61 | if not model._meta['use_transcripts_out']: # only accent classification 62 | criterion = nn.CrossEntropyLoss() 63 | elif not model._meta['use_accents_out']: # only text recognition 64 | criterion = CTCLoss() 65 | else: # both tasks 66 | criterion = (CTCLoss(), nn.CrossEntropyLoss()) 67 | 68 | 69 | # Results 70 | results = {} 71 | for manifest, lm in confs['testing_manifests']: 72 | eprint(f'\n### Testing {manifest.split("/")[-1]} for model {Path(model_path).stem.split("_")[0]}') 73 | 74 | # Decoder 75 | if model._meta['use_transcripts_out']: 76 | decoder = BeamCTCDecoder(confs['labels'], 77 | lm_path=lm, 78 | alpha=confs['decoder_alpha'], 79 | beta=confs['decoder_beta'], 80 | cutoff_top_n=confs['decoder_cutoff_top_n'], 81 | 
cutoff_prob=confs['decoder_cutoff_prob'], 82 | beam_width=confs['decoder_beam_width'], 83 | num_processes=confs['num_workers']) 84 | 85 | target_decoder = GreedyDecoder(confs['labels']) 86 | else: 87 | decoder, target_decoder = None, None 88 | 89 | # Test 90 | results[manifest.split('/')[-1]] = result_for_manifest(model, criterion, manifest, decoder, target_decoder, confs['batch_size'], confs['num_workers']) 91 | 92 | 93 | if not PRINT_LATEX_TABLE: 94 | print(f'Model: {model_path.split("/")[-1]}') 95 | for name, res in results.items(): 96 | print(f'\nResults for {name}:') 97 | print('; '.join([f'{k}: {v:.3f}' for k, v in res.items()])) 98 | else: 99 | print(' & '.join(['model']+list([k[:-4] for k in results.keys()]))) 100 | val_dict = {} 101 | for k in list(results.values())[0].keys(): 102 | val_dict[k] = [] 103 | for res in results.values(): 104 | [val_dict[k].append(f'{v:.1f}') for k, v in res.items()] 105 | for val in val_dict.values(): 106 | print(' & '.join([Path(model_path).stem.split('_')[0]]+val)+r' \\') 107 | 108 | if __name__ == '__main__': 109 | import config 110 | confs = config.Config() 111 | 112 | args = sys.argv[1:] 113 | 114 | if PRINT_LATEX_TABLE: 115 | eprint('\nLaTeX output selected, change PRINT_LATEX_TABLE in script to False for regular output.') 116 | 117 | for model_path in args: 118 | main(model_path, confs) -------------------------------------------------------------------------------- /modules.py: -------------------------------------------------------------------------------- 1 | import math 2 | from collections import OrderedDict 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | from torch.nn.parameter import Parameter 8 | from torch.autograd import Variable 9 | 10 | supported_rnns = { 11 | 'lstm': nn.LSTM, 12 | 'rnn': nn.RNN, 13 | 'gru': nn.GRU 14 | } 15 | supported_rnns_inv = dict((v, k) for k, v in supported_rnns.items()) 16 | 17 | 18 | class SequenceWise(nn.Module): 19 | def __init__(self, module): 20 | """ 21 | Collapses input of dim T*N*H to (T*N)*H, and applies to a module. 22 | Allows handling of variable sequence lengths and minibatch sizes. 23 | :param module: Module to apply input to. 24 | """ 25 | super(SequenceWise, self).__init__() 26 | self.module = module 27 | 28 | def forward(self, x): 29 | t, n = x.size(0), x.size(1) 30 | x = x.view(t * n, -1) 31 | x = self.module(x) 32 | x = x.view(t, n, -1) 33 | return x 34 | 35 | def __repr__(self): 36 | tmpstr = self.__class__.__name__ + ' (\n' 37 | tmpstr += self.module.__repr__() 38 | tmpstr += ')' 39 | return tmpstr 40 | 41 | 42 | class MaskConv(nn.Module): 43 | def __init__(self, seq_module): 44 | """ 45 | Zeroes out the padded part of the module's output based on the given lengths. This is to ensure that the 46 | results of the model do not change when batch sizes change during inference. 47 | Input needs to be in the shape of (BxCxDxT) 48 | :param seq_module: The sequential module containing the conv stack.
49 | """ 50 | super(MaskConv, self).__init__() 51 | self.seq_module = seq_module 52 | 53 | def forward(self, x, lengths): 54 | """ 55 | :param x: The input of size BxCxDxT 56 | :param lengths: The actual length of each sequence in the batch 57 | :return: Masked output from the module 58 | """ 59 | for module in self.seq_module: 60 | x = module(x) 61 | mask = torch.ByteTensor(x.size()).fill_(0) 62 | if x.is_cuda: 63 | mask = mask.cuda() 64 | for i, length in enumerate(lengths): 65 | length = length.item() 66 | if (mask[i].size(2) - length) > 0: 67 | mask[i].narrow(2, length, mask[i].size(2) - length).fill_(1) 68 | x = x.masked_fill(mask, 0) 69 | return x, lengths 70 | 71 | 72 | class InferenceBatchSoftmax(nn.Module): 73 | def forward(self, input_): 74 | if not self.training: 75 | return F.softmax(input_, dim=-1) 76 | else: 77 | return input_ 78 | 79 | 80 | class BatchRNN(nn.Module): 81 | def __init__(self, input_size, hidden_size, rnn_type=nn.LSTM, bidirectional=False, batch_norm=True): 82 | super(BatchRNN, self).__init__() 83 | self.input_size = input_size 84 | self.hidden_size = hidden_size 85 | self.bidirectional = bidirectional 86 | self.batch_norm = SequenceWise(nn.BatchNorm1d(input_size)) if batch_norm else None 87 | self.rnn = rnn_type(input_size=input_size, hidden_size=hidden_size, 88 | bidirectional=bidirectional, bias=True) 89 | self.num_directions = 2 if bidirectional else 1 90 | 91 | def flatten_parameters(self): 92 | self.rnn.flatten_parameters() 93 | 94 | def forward(self, x, output_lengths): 95 | if self.batch_norm is not None: 96 | x = self.batch_norm(x) 97 | x = nn.utils.rnn.pack_padded_sequence(x, output_lengths) 98 | x, h = self.rnn(x) 99 | x, _ = nn.utils.rnn.pad_packed_sequence(x) 100 | if self.bidirectional: 101 | x = x.view(x.size(0), x.size(1), 2, -1).sum(2).view(x.size(0), x.size(1), -1) # (TxNxH*2) -> (TxNxH) by sum 102 | return x 103 | 104 | 105 | class Lookahead(nn.Module): 106 | # Wang et al 2016 - Lookahead Convolution Layer for Unidirectional Recurrent Neural Networks 107 | # input shape - sequence, batch, feature - TxNxH 108 | # output shape - same as input 109 | def __init__(self, n_features, context): 110 | # should we handle batch_first=True? 111 | super(Lookahead, self).__init__() 112 | self.n_features = n_features 113 | self.weight = Parameter(torch.Tensor(n_features, context + 1)) 114 | assert context > 0 115 | self.context = context 116 | self.register_parameter('bias', None) 117 | self.init_parameters() 118 | 119 | def init_parameters(self): # what's a better way initialiase this layer? 120 | stdv = 1. / math.sqrt(self.weight.size(1)) 121 | self.weight.data.uniform_(-stdv, stdv) 122 | 123 | def forward(self, input): 124 | seq_len = input.size(0) 125 | # pad the 0th dimension (T/sequence) with zeroes whose number = context 126 | # Once pytorch's padding functions have settled, should move to those. 
127 | padding = torch.zeros(self.context, *(input.size()[1:])).type_as(input.data) 128 | x = torch.cat((input, Variable(padding)), 0) 129 | 130 | # add lookahead windows (with context+1 width) as a fourth dimension 131 | # for each seq-batch-feature combination 132 | x = [x[i:i + self.context + 1] for i in range(seq_len)] # TxLxNxH - sequence, context, batch, feature 133 | x = torch.stack(x) 134 | x = x.permute(0, 2, 3, 1) # TxNxHxL - sequence, batch, feature, context 135 | 136 | x = torch.mul(x, self.weight).sum(dim=3) 137 | return x 138 | 139 | def __repr__(self): 140 | return self.__class__.__name__ + '(' \ 141 | + 'n_features=' + str(self.n_features) \ 142 | + ', context=' + str(self.context) + ')' 143 | 144 | -------------------------------------------------------------------------------- /training.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | import torch 3 | import numpy as np 4 | import gc 5 | 6 | def get_mixed_loss(criterion, out_text, out_accent, out_lens, accents, transcripts, transcripts_lens, mix=0.5, corrective_coef=1000): 7 | loss, loss_text, loss_accent = None, None, None 8 | 9 | if out_text is None: 10 | loss_accent = criterion(out_accent, accents) 11 | loss = loss_accent 12 | elif out_accent is None: 13 | loss_text = criterion(out_text, transcripts, out_lens, transcripts_lens) 14 | loss = loss_text 15 | else: 16 | loss_text = criterion[0](out_text, transcripts, out_lens, transcripts_lens) 17 | loss_accent = criterion[1](out_accent, accents) 18 | 19 | if loss_accent.is_cuda: 20 | loss_text = loss_text.cuda() 21 | 22 | loss = mix * loss_text + (1 - mix) * loss_accent * corrective_coef 23 | 24 | return loss, loss_text, loss_accent 25 | 26 | 27 | ### TRAINING 28 | 29 | def train(model, train_loader, criterion, optimizer, losses_mix=0.5): 30 | epoch_losses = [] 31 | epoch_losses_text = [] 32 | epoch_losses_accent = [] 33 | 34 | model.train() 35 | 36 | for data in tqdm(train_loader, total=len(train_loader)): 37 | 38 | inputs, inputs_lens, transcripts, transcripts_lens, accents = data 39 | 40 | if next(model.parameters()).is_cuda: 41 | inputs = inputs.cuda() 42 | inputs_lens = inputs_lens.cuda() 43 | 44 | if accents is not None: 45 | accents = accents.cuda() 46 | 47 | out_text, out_accent, out_lens, __ = model(inputs, inputs_lens) 48 | 49 | loss, loss_text, loss_accent = get_mixed_loss(criterion, out_text, out_accent, 50 | out_lens, accents, transcripts, 51 | transcripts_lens, losses_mix) 52 | 53 | optimizer.zero_grad() 54 | loss.backward() 55 | optimizer.step() 56 | 57 | l = loss.clone().item() if loss is not None else None 58 | lt = loss_text.clone().item() if loss_text is not None else None 59 | la = loss_accent.clone().item() if loss_accent is not None else None 60 | epoch_losses.append(l) 61 | epoch_losses_text.append(lt) 62 | epoch_losses_accent.append(la) 63 | 64 | 65 | average_loss = lambda l: sum(l) / len(train_loader) if l[0] is not None else -1 66 | 67 | epoch_loss_i = average_loss(epoch_losses) 68 | epoch_loss_text_i = average_loss(epoch_losses_text) 69 | epoch_loss_accent_i = average_loss(epoch_losses_accent) 70 | 71 | return epoch_loss_i, epoch_loss_text_i, epoch_loss_accent_i 72 | 73 | 74 | ### TESTING 75 | 76 | def check_wer(transcripts, transcripts_lens, out, out_lens, decoder, target_decoder): 77 | split_transcripts = [] 78 | offset = 0 79 | for size in transcripts_lens: 80 | split_transcripts.append(transcripts[offset:offset + size]) 81 | offset += size 82 | 83 | decoded_output, _ = 
decoder.decode(out.data.transpose(0,1), out_lens) 84 | target_strings = target_decoder.convert_to_strings(split_transcripts) 85 | 86 | #if True: 87 | # print('targets', targets) 88 | # print('split_targets', split_targets) 89 | # print('out', out) 90 | # print('output_len', output_len) 91 | # print('decoded', decoded_output) 92 | # print('target', target_strings) 93 | 94 | wer, cer = 0, 0 95 | for x in range(len(target_strings)): 96 | transcript, reference = decoded_output[x][0], target_strings[x][0] 97 | wer += decoder.wer(transcript, reference) / float(len(reference.split())) 98 | #cer += decoder.cer(transcript, reference) / float(len(reference)) 99 | wer /= len(target_strings) 100 | return wer * 100 101 | 102 | 103 | def check_acc(accents, out): 104 | out_arg = np.argmax(out, axis=1) 105 | diff = torch.eq(out_arg, accents.cpu()) 106 | acc = torch.sum(diff) 107 | return acc.item() / len(accents) * 100 108 | 109 | 110 | def test(model, test_loader, criterion, decoder, target_decoder, losses_mix=0.5): 111 | with torch.no_grad(): 112 | model.eval() 113 | 114 | epoch_losses = [] 115 | epoch_losses_text = [] 116 | epoch_losses_accent = [] 117 | 118 | epoch_wers = [] 119 | epoch_accent_accs = [] 120 | 121 | for data in tqdm(test_loader, total=len(test_loader)): 122 | inputs, inputs_lens, transcripts, transcripts_lens, accents = data 123 | 124 | if next(model.parameters()).is_cuda: 125 | inputs = inputs.cuda() 126 | inputs_lens = inputs_lens.cuda() 127 | 128 | if accents is not None: 129 | accents = accents.cuda() 130 | 131 | out_text, out_accent, out_lens, __ = model(inputs, inputs_lens) 132 | 133 | 134 | if accents is None or len(model._meta['accents_dict']) > max(accents) + 1: # Check if we are testing a model with different accents 135 | loss, loss_text, loss_accent = get_mixed_loss(criterion, out_text, out_accent, 136 | out_lens, accents, transcripts, 137 | transcripts_lens, losses_mix) 138 | else: # in that case we do not care about the loss, section to refactor. 
139 | loss, loss_text, loss_accent = torch.tensor([-1]), torch.tensor([-1]), torch.tensor([-1]) 140 | 141 | if out_text is not None: 142 | wer = check_wer(transcripts, transcripts_lens, 143 | out_text, out_lens, decoder, target_decoder) 144 | else: 145 | wer = None 146 | 147 | if out_accent is not None: 148 | accent_acc = check_acc(accents, out_accent) 149 | else: 150 | accent_acc = None 151 | 152 | l = loss.clone().item() if loss is not None else None 153 | lt = loss_text.clone().item() if loss_text is not None else None 154 | la = loss_accent.clone().item() if loss_accent is not None else None 155 | epoch_losses.append(l) 156 | epoch_losses_text.append(lt) 157 | epoch_losses_accent.append(la) 158 | 159 | epoch_wers.append(wer) 160 | epoch_accent_accs.append(accent_acc) 161 | 162 | 163 | 164 | 165 | average_loss = lambda l: sum(l) / len(test_loader) if l[0] is not None else -1 166 | 167 | epoch_loss = average_loss(epoch_losses) 168 | epoch_loss_text = average_loss(epoch_losses_text) 169 | epoch_loss_accent = average_loss(epoch_losses_accent) 170 | 171 | epoch_wer = average_loss(epoch_wers) 172 | epoch_accent_acc = average_loss(epoch_accent_accs) 173 | 174 | return epoch_loss, epoch_loss_text, epoch_loss_accent, epoch_wer, epoch_accent_acc -------------------------------------------------------------------------------- /dataloader.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import torch 4 | import torch.nn as nn 5 | import numpy as np 6 | from utils import tile 7 | from torch.utils.data import DataLoader, Dataset 8 | 9 | 10 | ### DATASET 11 | 12 | class MultiDataset(Dataset): 13 | """Defines an iterator over the dataset. This class is intended to be used with 14 | the MultiDataLoader class.""" 15 | 16 | def __init__(self, manifest, labels, manifest_separator=',', 17 | use_mfcc_in=True, use_ivectors_in=False, use_embeddings_in=False, 18 | embedding_size=100, use_transcripts_out=True, use_accents_out=False): 19 | """ 20 | Allows to chose what will be trained on, and what are the outputs. 21 | At least on input and one output is needed. 22 | Default configuration is regular MFCCs to text. 23 | 24 | Manifest should be csv type file with following row for each sample: 25 | mfcc_path, ivector_path, embedding_path, transcripts_path, accent_label 26 | (Column can remain empty if not used, but must be present.) 27 | 28 | Scripts to create the database and manifest from audio and text in the scripts folder. 29 | """ 30 | 31 | assert(any([use_mfcc_in, use_ivectors_in, use_embeddings_in])), 'MultiDataset config needs at least one input set to True' 32 | assert(any([use_transcripts_out, use_accents_out])), 'MultiDataset config needs at least one output set to True' 33 | assert(not use_transcripts_out or use_mfcc_in), 'Can’t do speech to text without mfcc.' 
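        # Illustrative manifest row (hypothetical paths), following the five columns described in the
        # docstring above:
        #     ./mfccs/clip_0001.json,./ivectors/clip_0001.json,./embeddings/clip_0001.pt,./txt/clip_0001.txt,us
        # Unused columns may be left empty, but every comma must stay so the row still splits into five fields.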
34 | 35 | super(MultiDataset, self).__init__() 36 | 37 | self.config = {} 38 | self.config['use_mfcc_in']=use_mfcc_in 39 | self.config['use_ivectors_in']=use_ivectors_in 40 | self.config['use_embeddings_in']=use_embeddings_in 41 | self.config['embedding_size']=embedding_size 42 | self.config['use_transcripts_out']=use_transcripts_out 43 | self.config['use_accents_out']=use_accents_out 44 | 45 | self.labels_map = dict([(labels[i], i) for i in range(len(labels))]) 46 | 47 | with open(manifest) as f: 48 | self.samples = [x.strip().split(manifest_separator) for x in f.readlines()] 49 | 50 | self.accent_dict = self.make_accent_dict(self.samples) 51 | 52 | def __getitem__(self, index): 53 | """Unused features are set to None for the Dataloader. Returns torch tensors.""" 54 | mfcc_path, ivector_path, embedding_path, transcript_path, accent_label = self.samples[index] 55 | mfcc, ivector, embedding, parsed_transcript, accent = None, None, None, None, None 56 | 57 | def load_array(path): 58 | with open(path) as f: 59 | array = json.load(f) 60 | return torch.FloatTensor(array) 61 | 62 | # Inputs 63 | if self.config['use_mfcc_in']: 64 | mfcc = load_array(mfcc_path) 65 | 66 | if self.config['use_ivectors_in']: 67 | ivector = load_array(ivector_path) 68 | 69 | if self.config['use_embeddings_in']: 70 | new_embedding_path = [] 71 | for split in embedding_path.split('/'): 72 | new = split if 'embedding' not in split else ''.join([split, '_', str(self.config['embedding_size'])]) 73 | new_embedding_path.append(new) 74 | new_embedding_path = '/'.join(new_embedding_path) 75 | embedding = torch.load(new_embedding_path, map_location=lambda storage, loc: storage) 76 | # map_location and loc are there to load the embedding on the CPU 77 | 78 | # Outputs 79 | if self.config['use_transcripts_out']: 80 | parsed_transcript = self.parse_transcript(transcript_path) 81 | 82 | if self.config['use_accents_out']: 83 | accent = self.accent_dict[accent_label] 84 | accent = torch.LongTensor([accent]) 85 | 86 | return mfcc, ivector, embedding, parsed_transcript, accent 87 | 88 | 89 | def parse_transcript(self, transcript_path): 90 | """Maps a text to integers using the given labels_map.""" 91 | 92 | with open(transcript_path, 'r', encoding='utf8') as transcript_file: 93 | transcript = transcript_file.read().replace('\n', '') 94 | 95 | transcript = list(filter(None, [self.labels_map.get(x) for x in list(transcript)])) 96 | return transcript 97 | 98 | def __len__(self): 99 | return len(self.samples) 100 | 101 | @staticmethod 102 | def make_accent_dict(samples): 103 | acc_set = set() 104 | for __, __, __, __, accent in samples: 105 | acc_set.add(accent) 106 | enum = enumerate(sorted(acc_set)) # sorted set for consistant results 107 | return {acc: i for i, acc in enum} 108 | 109 | 110 | ### DATALOADER 111 | 112 | # Collate function for the MultiDataLoader 113 | def collate_fn(batch): 114 | """This function takes list of samples and assembles a batch. 
115 | It is intended to used in PyTorch DataLoader.""" 116 | 117 | mfccs, ivectors, embeddings, transcripts, accents = list(zip(*batch)) 118 | 119 | def exists(list_): 120 | """Checks if we are not getting a list of None""" 121 | return list_[0] is not None 122 | 123 | ## Lens 124 | if exists(mfccs): 125 | inputs_lens = torch.IntTensor([len(m) for m in mfccs]) 126 | elif exists(ivectors): 127 | inputs_lens = torch.IntTensor([len(i) for i in ivectors]) 128 | else: 129 | inputs_lens = torch.IntTensor([1] * len(batch)) 130 | 131 | # Sorting order (needs to be descending in lens for the padder) 132 | inputs_lens, sorted_idx = inputs_lens.sort(descending=True) 133 | 134 | if exists(transcripts): 135 | transcripts_lens = torch.IntTensor([len(t) for t in transcripts]) 136 | transcripts_lens = transcripts_lens[sorted_idx] 137 | else: 138 | transcripts_lens = None 139 | 140 | ## Inputs 141 | inputs = [] 142 | if exists(mfccs): 143 | inputs.append(nn.utils.rnn.pad_sequence(mfccs, batch_first=True)) 144 | 145 | if exists(ivectors): 146 | ivect = nn.utils.rnn.pad_sequence(ivectors, batch_first=True) 147 | if exists(mfccs): # The ivector resolution is 10 times lower than the mfccs', so we expand them. 148 | ivect = tile(ivect, 1, 10) 149 | ivect = ivect[:, :inputs[0].size(1), :] 150 | inputs.append(ivect) 151 | 152 | if exists(embeddings): 153 | emb = torch.cat(embeddings) 154 | emb = emb.view(emb.size(0), 1, emb.size(1)) 155 | if exists(mfccs) or exists(ivectors): 156 | # tile embeddings to fit either mfccs or ivectors size if they are present 157 | emb = tile(emb, 1, inputs[0].size(1)) 158 | inputs.append(emb) 159 | 160 | inputs = torch.cat(inputs, dim=2) 161 | inputs = inputs[sorted_idx] 162 | 163 | ## Outputs 164 | if exists(transcripts): 165 | if inputs.size(0) == 1: # bugfix for when only one sample 166 | transcripts = [transcripts] 167 | transcripts = np.asarray(transcripts)[sorted_idx] # dtype=object because some transcripts were loaded with wrong type (Int64). TODO fix. 168 | transcripts = torch.IntTensor([t for trs in transcripts for t in trs]) 169 | # we need text targets as one concatenated vector 170 | 171 | if exists(accents): 172 | accents = torch.cat(accents)[sorted_idx] 173 | else: 174 | accents = None 175 | 176 | return inputs, inputs_lens, transcripts, transcripts_lens, accents 177 | 178 | class MultiDataLoader(DataLoader): 179 | def __init__(self, *args, **kwargs): 180 | """ 181 | Creates a data loader for SpeechDatasets. 182 | """ 183 | super(MultiDataLoader, self).__init__(*args, **kwargs) 184 | self.collate_fn = collate_fn -------------------------------------------------------------------------------- /decoder.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # ---------------------------------------------------------------------------- 3 | # Copyright 2015-2016 Nervana Systems Inc. 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # ---------------------------------------------------------------------------- 16 | # Modified to support pytorch Tensors 17 | 18 | import Levenshtein as Lev 19 | import torch 20 | from six.moves import xrange 21 | 22 | 23 | class Decoder(object): 24 | """ 25 | Basic decoder class from which all other decoders inherit. Implements several 26 | helper functions. Subclasses should implement the decode() method. 27 | 28 | Arguments: 29 | labels (string): mapping from integers to characters. 30 | blank_index (int, optional): index for the blank '_' character. Defaults to 0. 31 | space_index (int, optional): index for the space ' ' character. Defaults to 28. 32 | """ 33 | 34 | def __init__(self, labels, blank_index=0): 35 | # e.g. labels = "_'ABCDEFGHIJKLMNOPQRSTUVWXYZ#" 36 | self.labels = labels 37 | self.int_to_char = dict([(i, c) for (i, c) in enumerate(labels)]) 38 | self.blank_index = blank_index 39 | space_index = len(labels) # To prevent errors in decode, we add an out of bounds index for the space 40 | if ' ' in labels: 41 | space_index = labels.index(' ') 42 | self.space_index = space_index 43 | 44 | def wer(self, s1, s2): 45 | """ 46 | Computes the Word Error Rate, defined as the edit distance between the 47 | two provided sentences after tokenizing to words. 48 | Arguments: 49 | s1 (string): space-separated sentence 50 | s2 (string): space-separated sentence 51 | """ 52 | 53 | # build mapping of words to integers 54 | b = set(s1.split() + s2.split()) 55 | word2char = dict(zip(b, range(len(b)))) 56 | 57 | # map the words to a char array (Levenshtein packages only accepts 58 | # strings) 59 | w1 = [chr(word2char[w]) for w in s1.split()] 60 | w2 = [chr(word2char[w]) for w in s2.split()] 61 | 62 | return Lev.distance(''.join(w1), ''.join(w2)) 63 | 64 | def cer(self, s1, s2): 65 | """ 66 | Computes the Character Error Rate, defined as the edit distance. 
67 | 68 | Arguments: 69 | s1 (string): space-separated sentence 70 | s2 (string): space-separated sentence 71 | """ 72 | s1, s2, = s1.replace(' ', ''), s2.replace(' ', '') 73 | return Lev.distance(s1, s2) 74 | 75 | def decode(self, probs, sizes=None): 76 | """ 77 | Given a matrix of character probabilities, returns the decoder's 78 | best guess of the transcription 79 | 80 | Arguments: 81 | probs: Tensor of character probabilities, where probs[c,t] 82 | is the probability of character c at time t 83 | sizes(optional): Size of each sequence in the mini-batch 84 | Returns: 85 | string: sequence of the model's best guess for the transcription 86 | """ 87 | raise NotImplementedError 88 | 89 | 90 | class BeamCTCDecoder(Decoder): 91 | def __init__(self, labels, lm_path=None, alpha=0, beta=0, cutoff_top_n=40, cutoff_prob=1.0, beam_width=100, 92 | num_processes=4, blank_index=0): 93 | super(BeamCTCDecoder, self).__init__(labels) 94 | try: 95 | from ctcdecode import CTCBeamDecoder 96 | except ImportError: 97 | raise ImportError("BeamCTCDecoder requires paddledecoder package.") 98 | 99 | #labels = labels.replace("'", "a") # TODO fix that 100 | self._decoder = CTCBeamDecoder(labels, lm_path, alpha, beta, cutoff_top_n, cutoff_prob, beam_width, 101 | num_processes, blank_index) 102 | 103 | def convert_to_strings(self, out, seq_len): 104 | results = [] 105 | for b, batch in enumerate(out): 106 | utterances = [] 107 | for p, utt in enumerate(batch): 108 | size = seq_len[b][p] 109 | if size > 0: 110 | transcript = ''.join(map(lambda x: self.int_to_char[x.item()], utt[0:size])) 111 | else: 112 | transcript = '' 113 | utterances.append(transcript) 114 | results.append(utterances) 115 | return results 116 | 117 | def convert_tensor(self, offsets, sizes): 118 | results = [] 119 | for b, batch in enumerate(offsets): 120 | utterances = [] 121 | for p, utt in enumerate(batch): 122 | size = sizes[b][p] 123 | if sizes[b][p] > 0: 124 | utterances.append(utt[0:size]) 125 | else: 126 | utterances.append(torch.tensor([], dtype=torch.int)) 127 | results.append(utterances) 128 | return results 129 | 130 | def decode(self, probs, sizes=None): 131 | """ 132 | Decodes probability output using ctcdecode package. 
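        Here probs is expected batch-first (batch x seq_length x num_labels), the same layout that
        GreedyDecoder.decode() documents below; check_wer() in training.py transposes the model's
        time-first output accordingly before calling this method.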
133 | Arguments: 134 | probs: Tensor of character probabilities, where probs[c,t] 135 | is the probability of character c at time t 136 | sizes: Size of each sequence in the mini-batch 137 | Returns: 138 | string: sequences of the model's best guess for the transcription 139 | """ 140 | probs = probs.cpu() 141 | out, scores, offsets, seq_lens = self._decoder.decode(probs, sizes) 142 | strings = self.convert_to_strings(out, seq_lens) 143 | offsets = self.convert_tensor(offsets, seq_lens) 144 | return strings, offsets 145 | 146 | 147 | class GreedyDecoder(Decoder): 148 | def __init__(self, labels, blank_index=0): 149 | super(GreedyDecoder, self).__init__(labels, blank_index) 150 | 151 | def convert_to_strings(self, sequences, sizes=None, remove_repetitions=False, return_offsets=False): 152 | """Given a list of numeric sequences, returns the corresponding strings""" 153 | strings = [] 154 | offsets = [] if return_offsets else None 155 | for x in xrange(len(sequences)): 156 | seq_len = sizes[x] if sizes is not None else len(sequences[x]) 157 | string, string_offsets = self.process_string(sequences[x], seq_len, remove_repetitions) 158 | strings.append([string]) # We only return one path 159 | if return_offsets: 160 | offsets.append([string_offsets]) 161 | if return_offsets: 162 | return strings, offsets 163 | else: 164 | return strings 165 | 166 | def process_string(self, sequence, size, remove_repetitions=False): 167 | string = '' 168 | offsets = [] 169 | for i in range(size): 170 | char = self.int_to_char[sequence[i].item()] 171 | if char != self.int_to_char[self.blank_index]: 172 | # if this char is a repetition and remove_repetitions=true, then skip 173 | if remove_repetitions and i != 0 and char == self.int_to_char[sequence[i - 1].item()]: 174 | pass 175 | elif char == self.labels[self.space_index]: 176 | string += ' ' 177 | offsets.append(i) 178 | else: 179 | string = string + char 180 | offsets.append(i) 181 | return string, torch.tensor(offsets, dtype=torch.int) 182 | 183 | def decode(self, probs, sizes=None): 184 | """ 185 | Returns the argmax decoding given the probability matrix. Removes 186 | repeated elements in the sequence, as well as blanks. 187 | 188 | Arguments: 189 | probs: Tensor of character probabilities from the network. 
Expected shape of batch x seq_length x output_dim 190 | sizes(optional): Size of each sequence in the mini-batch 191 | Returns: 192 | strings: sequences of the model's best guess for the transcription on inputs 193 | offsets: time step per character predicted 194 | """ 195 | _, max_probs = torch.max(probs, 2) 196 | strings, offsets = self.convert_to_strings(max_probs.view(max_probs.size(0), max_probs.size(1)), sizes, 197 | remove_repetitions=True, return_offsets=True) 198 | return strings, offsets 199 | -------------------------------------------------------------------------------- /data/language_models/lnn_tri.lm: -------------------------------------------------------------------------------- 1 | 2 | \data\ 3 | ngram 1=172 4 | ngram 2=289 5 | ngram 3=6 6 | 7 | \1-grams: 8 | -0.8200597 9 | -99 -0.3555717 10 | -2.2266 A -0.04052078 11 | -2.52763 AIR -0.04181526 12 | -2.52763 ALARM -0.03661407 13 | -2.52763 ALEXA 0.02815799 14 | -2.2266 AM -0.03922245 15 | -2.52763 AMAZON -0.04052079 16 | -2.2266 AND -0.02198101 17 | -2.52763 ANECHOIC -0.04181526 18 | -2.52763 AT -0.04181526 19 | -2.52763 BANGALORE 0.02815799 20 | -2.52763 BEFORE -0.03661407 21 | -2.52763 BIEBER 0.02815799 22 | -2.52763 BLACK 0.02815799 23 | -2.52763 BLAKE -0.04181526 24 | -2.52763 BLUE -0.04181526 25 | -2.52763 BOWIE 0.02815799 26 | -1.573387 BY -0.03134982 27 | -2.52763 CALENDAR -0.04052079 28 | -2.52763 CALIBRATE -0.02333186 29 | -2.52763 CALL -0.04181526 30 | -2.52763 CAME -0.04181526 31 | -2.52763 CAMERAS -0.03002374 32 | -2.52763 CARIBBEAN 0.02815799 33 | -2.52763 CHAMBER -0.03661407 34 | -2.52763 CHRIS -0.04181526 35 | -2.52763 CONDITIONING -0.04052079 36 | -2.52763 DAFT -0.04181526 37 | -2.52763 DAVID -0.04181526 38 | -2.52763 DINING -0.03792022 39 | -2.52763 DO -0.03661407 40 | -2.52763 DOOR -0.04181526 41 | -2.2266 DOWN -0.04550482 42 | -2.52763 DRAKE 0.02815799 43 | -2.52763 DRIVE -0.02869358 44 | -2.52763 EIGHT -0.04181526 45 | -2.52763 EPISODE -0.03792022 46 | -2.52763 ESPN 0.02815799 47 | -2.52763 FALL -0.03792022 48 | -2.52763 FIRST -0.04181526 49 | -2.52763 FIVE 0.02815799 50 | -2.52763 FLOOR -0.04181526 51 | -2.52763 FOLLOWING -0.04181526 52 | -1.82866 FOR -0.03267188 53 | -2.52763 FORGET -0.03134983 54 | -2.2266 FROM -0.03922245 55 | -2.52763 FRONT -0.04181526 56 | -2.52763 GARAGE -0.04181526 57 | -2.52763 GENIE 0.02815799 58 | -2.52763 GET -0.04181526 59 | -2.52763 GOING -0.02869358 60 | -2.050509 GOOGLE -0.01648962 61 | -2.52763 HELLO -0.04181526 62 | -2.52763 HERE -0.02869358 63 | -1.82866 HEY -0.03398992 64 | -2.52763 HI -0.03661407 65 | -2.52763 HOME 0.02815799 66 | -2.52763 HOURS -0.04052079 67 | -2.52763 HOW -0.04181526 68 | -1.82866 I -0.06459835 69 | -1.92557 IN -0.03530397 70 | -2.52763 INTENDED -0.02869358 71 | -1.82866 IS -0.04451953 72 | -2.050509 IT -0.02198102 73 | -2.52763 ITUNES 0.02815799 74 | -2.52763 JUSTIN -0.04181526 75 | -2.52763 KIDS 0.02815799 76 | -2.52763 LAMP 0.02815799 77 | -2.52763 LEAVE -0.02869358 78 | -2.52763 LIGHT -0.02869358 79 | -2.52763 LIGHTS 0.02815799 80 | -2.52763 LIKE -0.04052079 81 | -2.050509 LIVING -0.9322866 82 | -2.52763 LOGI 0.02815799 83 | -2.52763 LOGITECH 0.02815799 84 | -2.52763 LOGITECH'S -0.04181526 85 | -2.52763 LONG -0.04181526 86 | -2.52763 LOVE -0.04181526 87 | -2.52763 LUCKY -0.03134983 88 | -2.52763 MAKE -0.03922245 89 | -2.52763 MARS 0.02815799 90 | -2.2266 ME -0.02735934 91 | -2.52763 MEETING 0.02815799 92 | -2.52763 MINUTES 0.02815799 93 | -2.52763 MOM -0.04181526 94 | -2.2266 MUSIC -0.04550482 95 | -2.52763 MUTE 0.02815799 96 | -1.82866 MY 
-0.03267188 97 | -2.2266 NAME -0.03002373 98 | -2.52763 NEED -0.02869358 99 | -2.52763 NEW -0.04181526 100 | -2.52763 NEXT -0.04181526 101 | -2.2266 NOW 0.02967916 102 | -2.52763 NPR -0.03002374 103 | -2.52763 ODB 0.02815799 104 | -1.92557 OF -0.05589744 105 | -2.52763 OK -0.03922245 106 | -1.52763 ON -0.08432172 107 | -2.52763 ONE -0.04181526 108 | -2.52763 ONLY -0.04181526 109 | -2.52763 OPEN 0.02815799 110 | -2.52763 ORANGE -0.03661407 111 | -2.52763 OUT -0.03134983 112 | -2.52763 PANDORA 0.02815799 113 | -2.52763 PART -0.03792022 114 | -2.52763 PAUSE 0.02815799 115 | -2.52763 PHRASES 0.02815799 116 | -2.52763 PICK -0.04052079 117 | -2.52763 PILOTS 0.02815799 118 | -2.52763 PIRATES -0.03792022 119 | -2.52763 PLATTEN 0.02815799 120 | -1.448449 PLAY 0.06955066 121 | -2.52763 PM -0.04181526 122 | -2.52763 PUNK 0.02815799 123 | -2.52763 RACHEL -0.04181526 124 | -2.2266 RECOGNITION -0.02735934 125 | -2.52763 RECORDING -0.04052079 126 | -2.52763 RED 0.02815799 127 | -2.52763 REMINDER -0.03661407 128 | -2.52763 REWIND -0.04052079 129 | -2.52763 RIGHT -0.03922245 130 | -2.52763 RISE -0.04052079 131 | -1.92557 ROOM -0.03792021 132 | -2.52763 SAY -0.02333186 133 | -2.52763 SCRIPT -0.03661407 134 | -2.52763 SEARCH -0.03661407 135 | -2.52763 SECONDS 0.02815799 136 | -1.92557 SET -0.07057546 137 | -2.52763 SHELTON 0.02815799 138 | -2.52763 SIRI 0.02815799 139 | -2.52763 SIXTEEN -0.03134983 140 | -2.52763 SOMETHING -0.03134983 141 | -2.52763 SONOS 0.02815799 142 | -2.52763 SOUNDCLOUD 0.02815799 143 | -2.52763 SPIDERS -0.04052079 144 | -2.52763 SPOTIFY 0.02815799 145 | -2.52763 STAND -0.03134983 146 | -2.52763 STAPELTON 0.02815799 147 | -2.52763 STARDUST -0.04052079 148 | -2.52763 START -0.03661407 149 | -2.52763 STATE -0.04181526 150 | -2.52763 STOP 0.02815799 151 | -2.52763 STRESSED -0.04181526 152 | -2.52763 SUMMER -0.04181526 153 | -2.52763 TAKE -0.04052079 154 | -2.2266 TEN -0.04052078 155 | -2.52763 TESTING -0.04052079 156 | -1.351539 THE -0.02839587 157 | -2.2266 THIS -0.03661406 158 | -2.2266 TIME -0.03530397 159 | -2.52763 TIMER -0.03661407 160 | -1.486237 TO -0.03582891 161 | -2.2266 TODAY -0.04550482 162 | -2.52763 TRAVELER -0.03134983 163 | -1.82866 TURN -0.3301536 164 | -2.52763 TWENTY -0.04181526 165 | -2.52763 TWO -0.04181526 166 | -2.2266 UP 0.05155473 167 | -2.050509 VOICE -0.08691774 168 | -2.050509 VOLUME -0.03661407 169 | -2.52763 WANT -0.02869358 170 | -2.2266 WATCH -0.02198101 171 | -2.52763 WEATHER -0.04181526 172 | -2.2266 WHAT -0.1141836 173 | -2.2266 WHAT'S -0.009630572 174 | -2.52763 WILL -0.03922245 175 | -2.52763 WORK 0.02815799 176 | -2.52763 YOU -0.03134983 177 | -2.52763 YOUR -0.04052079 178 | -2.52763 YOURSELF -0.03134983 179 | -2.52763 ZIGGY -0.04181526 180 | 181 | \2-grams: 182 | -2.732193 CALL 183 | -2.732193 HELLO 184 | -1.0086 HEY 185 | -2.732193 HI 186 | -2.732193 HOW 187 | -2.732193 I 188 | -2.034075 IN 189 | -2.732193 IS 190 | -2.732193 MUTE 191 | -2.732193 OK 192 | -1.10551 ON 193 | -2.732193 PAUSE 194 | -0.6283889 PLAY 195 | -2.732193 REWIND 196 | -2.732193 SEARCH 197 | -1.10551 SET -0.1476935 198 | -2.732193 STOP 199 | -1.0086 TURN -0.06938255 200 | -2.034075 VOLUME 201 | -2.732193 WATCH 202 | -2.034075 WHAT -0.475604 203 | -2.034075 WHAT'S 204 | -1.325652 A REMINDER 205 | -1.325652 A TIMER 206 | -1.024622 AIR CONDITIONING 207 | -1.024622 ALARM FOR 208 | -1.024622 ALEXA 209 | -1.325652 AM NOW 210 | -1.325652 AM RECORDING 211 | -1.024622 AMAZON MUSIC 212 | -1.325652 AND FALL 213 | -1.325652 AND THE 214 | -1.024622 ANECHOIC CHAMBER 215 | -1.024622 AT HOME 216 
| -1.024622 BANGALORE 217 | -1.024622 BEFORE I 218 | -1.024622 BIEBER 219 | -1.024622 BLACK 220 | -1.024622 BLAKE SHELTON 221 | -1.024622 BLUE GENIE 222 | -1.024622 BOWIE 223 | -1.978865 BY BLAKE 224 | -1.978865 BY CHRIS 225 | -1.978865 BY DAFT 226 | -1.978865 BY DRAKE 227 | -1.978865 BY JUSTIN 228 | -1.978865 BY ODB 229 | -1.978865 BY RACHEL 230 | -1.978865 BY TWENTY 231 | -1.978865 BY YOU 232 | -1.024622 CALENDAR TODAY 233 | -1.024622 CALIBRATE THE 234 | -1.024622 CALL MOM 235 | -1.024622 CAME HERE 236 | -1.024622 CAMERAS ON 237 | -1.024622 CARIBBEAN 238 | -1.024622 CHAMBER FOR 239 | -1.024622 CHRIS STAPELTON 240 | -1.024622 CONDITIONING DOWN 241 | -1.024622 DAFT PUNK 242 | -1.024622 DAVID BOWIE 243 | -1.024622 DINING ROOM 244 | -1.024622 DO I 245 | -1.024622 DOOR OPEN 246 | -0.6275349 DOWN 247 | -1.024622 DRAKE 248 | -1.024622 DRIVE TO 249 | -1.024622 EIGHT HOURS 250 | -1.024622 EPISODE OF 251 | -1.024622 ESPN 252 | -1.024622 FALL OF 253 | -1.024622 FIRST MEETING 254 | -1.024622 FIVE 255 | -1.024622 FLOOR LAMP 256 | -1.024622 FOLLOWING PHRASES 257 | -1.723593 FOR EIGHT 258 | -1.723593 FOR PIRATES 259 | -1.723593 FOR TEN 260 | -1.723593 FOR TWO 261 | -1.723593 FOR VOICE 262 | -1.024622 FORGET BY 263 | -1.325652 FROM MARS 264 | -1.325652 FROM NOW 265 | -1.024622 FRONT RIGHT 266 | -1.024622 GARAGE DOOR 267 | -1.024622 GENIE 268 | -1.024622 GET LUCKY 269 | -1.024622 GOING TO 270 | -0.8036261 GOOGLE 271 | -1.501744 GOOGLE MUSIC 272 | -1.024622 HELLO BLUE 273 | -1.024622 HERE TO 274 | -1.723593 HEY ALEXA 275 | -1.723593 HEY GOOGLE 276 | -1.723593 HEY LOGI 277 | -1.723593 HEY LOGITECH 278 | -1.723593 HEY SIRI 279 | -1.024622 HI MY 280 | -1.024622 HOME 281 | -1.024622 HOURS FROM 282 | -1.024622 HOW LONG 283 | -1.025475 I AM 284 | -1.723593 I NEED 285 | -1.723593 I START 286 | -1.723593 I WANT 287 | -1.626683 IN BANGALORE 288 | -1.626683 IN GOOGLE 289 | -1.626683 IN ITUNES 290 | -1.626683 IN LOGITECH'S 291 | -1.024622 INTENDED TO 292 | -1.723593 IS IT 293 | -1.723593 IS ONLY 294 | -1.723593 IS STATE 295 | -1.025475 IS THE 296 | -1.501744 IT IN 297 | -1.501744 IT TAKE 298 | -1.501744 IT TO 299 | -1.024622 ITUNES 300 | -1.024622 JUSTIN BIEBER 301 | -1.024622 KIDS 302 | -1.024622 LAMP 303 | -1.024622 LEAVE TO 304 | -1.024622 LIGHT TO 305 | -1.024622 LIGHTS 306 | -1.024622 LIKE TODAY 307 | -0.05329508 LIVING ROOM 308 | -1.024622 LOGI 309 | -1.024622 LOGITECH 310 | -1.024622 LOGITECH'S ANECHOIC 311 | -1.024622 LONG WILL 312 | -1.024622 LOVE YOURSELF 313 | -1.024622 LUCKY BY 314 | -1.024622 MAKE IT 315 | -1.024622 MARS 316 | -1.325652 ME SOMETHING 317 | -1.325652 ME TO 318 | -1.024622 MEETING 319 | -1.024622 MINUTES 320 | -1.024622 MOM AT 321 | -0.6275349 MUSIC 322 | -1.024622 MUTE 323 | -1.723593 MY CALENDAR 324 | -1.723593 MY DINING 325 | -1.723593 MY FIRST 326 | -1.723593 MY NAME 327 | -1.723593 MY VOICE 328 | -1.325652 NAME I 329 | -1.325652 NAME IS 330 | -1.024622 NEED TO 331 | -1.024622 NEW BLACK 332 | -1.024622 NEXT EPISODE 333 | -1.325652 NOW 334 | -1.325652 NOW GOING 335 | -1.024622 NPR ON 336 | -1.024622 ODB 337 | -1.626683 OF ORANGE 338 | -0.9285648 OF THE 339 | -1.626683 OF ZIGGY 340 | -1.024622 OK GOOGLE 341 | -2.024622 ON 342 | -2.024622 ON AMAZON 343 | -1.326505 ON MY 344 | -2.024622 ON PANDORA 345 | -2.024622 ON SOUNDCLOUD 346 | -2.024622 ON SPOTIFY 347 | -0.5761738 ON THE -0.2839059 348 | -1.024622 ONE PILOTS 349 | -1.024622 ONLY INTENDED 350 | -1.024622 OPEN 351 | -1.024622 ORANGE IS 352 | -1.024622 OUT BY 353 | -1.024622 PANDORA 354 | -1.024622 PART OF 355 | -1.024622 PAUSE 356 | 
-1.024622 PHRASES 357 | -1.024622 PICK UP 358 | -1.024622 PILOTS 359 | -1.024622 PIRATES OF 360 | -1.024622 PLATTEN 361 | -2.103804 PLAY 362 | -2.103804 PLAY CAME 363 | -2.103804 PLAY DAVID 364 | -2.103804 PLAY GET 365 | -2.103804 PLAY LOVE 366 | -2.103804 PLAY ME 367 | -2.103804 PLAY NPR 368 | -2.103804 PLAY STAND 369 | -2.103804 PLAY STRESSED 370 | -2.103804 PLAY SUMMER 371 | -2.103804 PLAY THE 372 | -2.103804 PLAY TRAVELER 373 | -1.024622 PM PICK 374 | -1.024622 PUNK 375 | -1.024622 RACHEL PLATTEN 376 | -1.325652 RECOGNITION TESTING 377 | -1.325652 RECOGNITION TO 378 | -1.024622 RECORDING THIS 379 | -1.024622 RED 380 | -1.024622 REMINDER FOR 381 | -1.024622 REWIND TEN 382 | -1.024622 RIGHT LIVING 383 | -1.024622 RISE AND 384 | -1.626683 ROOM FLOOR 385 | -1.626683 ROOM LIGHT 386 | -1.626683 ROOM LIGHTS 387 | -1.626683 ROOM SONOS 388 | -1.024622 SAY THE 389 | -1.024622 SCRIPT IS 390 | -1.024622 SEARCH FOR 391 | -1.024622 SECONDS 392 | -0.9285648 SET A 393 | -1.626683 SET ALARM 394 | -1.626683 SET VOLUME 395 | -1.024622 SHELTON 396 | -1.024622 SIRI 397 | -1.024622 SIXTEEN BY 398 | -1.024622 SOMETHING BY 399 | -1.024622 SONOS 400 | -1.024622 SOUNDCLOUD 401 | -1.024622 SPIDERS FROM 402 | -1.024622 SPOTIFY 403 | -1.024622 STAND BY 404 | -1.024622 STAPELTON 405 | -1.024622 STARDUST AND 406 | -1.024622 START I 407 | -1.024622 STATE YOUR 408 | -1.024622 STOP 409 | -1.024622 STRESSED OUT 410 | -1.024622 SUMMER SIXTEEN 411 | -1.024622 TAKE ME 412 | -1.325652 TEN MINUTES 413 | -1.325652 TEN SECONDS 414 | -1.024622 TESTING THIS 415 | -2.200714 THE AIR 416 | -2.200714 THE CARIBBEAN 417 | -2.200714 THE FOLLOWING 418 | -2.200714 THE FRONT 419 | -2.200714 THE GARAGE 420 | -2.200714 THE KIDS 421 | -1.502596 THE LIVING 0.3450996 422 | -2.200714 THE NEW 423 | -2.200714 THE NEXT 424 | -2.200714 THE RISE 425 | -2.200714 THE SCRIPT 426 | -2.200714 THE SPIDERS 427 | -2.200714 THE VOICE 428 | -2.200714 THE WEATHER 429 | -1.325652 THIS IN 430 | -1.325652 THIS PART 431 | -1.325652 TIME DO 432 | -1.325652 TIME IS 433 | -1.024622 TIMER FOR 434 | -2.066015 TO CALIBRATE 435 | -2.066015 TO DRIVE 436 | -2.066015 TO FORGET 437 | -2.066015 TO LEAVE 438 | -2.066015 TO MAKE 439 | -1.367898 TO MY 440 | -2.066015 TO RED 441 | -2.066015 TO SAY 442 | -2.066015 TO WATCH 443 | -2.066015 TO WORK 444 | -0.6275349 TODAY 445 | -1.024622 TRAVELER BY 446 | -1.723593 TURN CAMERAS 447 | -0.2751438 TURN ON -0.4681379 448 | -1.723593 TURN THE 449 | -1.024622 TWENTY ONE 450 | -1.024622 TWO PM 451 | -1.325652 UP 452 | -1.325652 UP THE 453 | -1.501744 VOICE BEFORE 454 | -0.8036261 VOICE RECOGNITION 455 | -1.501744 VOLUME DOWN 456 | -1.501744 VOLUME FIVE 457 | -1.501744 VOLUME UP 458 | -1.024622 WANT TO 459 | -1.325652 WATCH ESPN 460 | -1.325652 WATCH THE 461 | -1.024622 WEATHER LIKE 462 | -0.6275349 WHAT TIME 463 | -1.325652 WHAT'S ON 464 | -1.325652 WHAT'S THE 465 | -1.024622 WILL IT 466 | -1.024622 WORK 467 | -1.024622 YOU BY 468 | -1.024622 YOUR NAME 469 | -1.024622 YOURSELF BY 470 | -1.024622 ZIGGY STARDUST 471 | 472 | \3-grams: 473 | -0.1282164 THE LIVING ROOM 474 | -0.1249387 TURN ON THE 475 | -0.4292465 SET A 476 | -0.3043077 ON THE LIVING 477 | -0.2218488 TURN ON 478 | -0.1282164 WHAT TIME 479 | 480 | \end\ 481 | -------------------------------------------------------------------------------- /data/language_models/lnn_bi.lm: -------------------------------------------------------------------------------- 1 | 2 | \data\ 3 | ngram 1=173 4 | ngram 2=289 5 | 6 | \1-grams: 7 | -0.7888443 8 | -99 -0.4177598 9 | -2.546674 10 | -2.462987 A 
-0.05261233 11 | -2.462987 AIR -0.05261233 12 | -2.462987 ALARM -0.05261234 13 | -2.462987 ALEXA -0.05261234 14 | -2.462987 AM -0.05261233 15 | -2.462987 AMAZON -0.05261234 16 | -2.160911 AND -0.05261233 17 | -2.462987 ANECHOIC -0.05261233 18 | -2.462987 AT -0.05261233 19 | -2.462987 BANGALORE -0.05261234 20 | -2.462987 BEFORE -0.05261234 21 | -2.462987 BIEBER -0.05261234 22 | -2.462987 BLACK -0.05261234 23 | -2.462987 BLAKE -0.05261233 24 | -2.462987 BLUE -0.05261233 25 | -2.462987 BOWIE -0.05261234 26 | -1.506887 BY -0.05261233 27 | -2.462987 CALENDAR -0.05261234 28 | -2.462987 CALIBRATE -0.05261233 29 | -2.462987 CALL -0.05261233 30 | -2.462987 CAME -0.05261233 31 | -2.462987 CAMERAS -0.05261234 32 | -2.462987 CARIBBEAN -0.05261234 33 | -2.462987 CHAMBER -0.05261234 34 | -2.462987 CHRIS -0.05261233 35 | -2.462987 CONDITIONING -0.05261234 36 | -2.462987 DAFT -0.05261233 37 | -2.462987 DAVID -0.05261233 38 | -2.462987 DINING -0.05261234 39 | -2.462987 DO -0.05261234 40 | -2.462987 DOOR -0.05261233 41 | -2.160911 DOWN -0.3536423 42 | -2.462987 DRAKE -0.05261234 43 | -2.462987 DRIVE -0.05261233 44 | -2.462987 EIGHT -0.05261233 45 | -2.462987 EPISODE -0.05261233 46 | -2.462987 ESPN -0.05261234 47 | -2.462987 FALL -0.05261233 48 | -2.462987 FIRST -0.05261233 49 | -2.462987 FIVE -0.05261234 50 | -2.462987 FLOOR -0.05261233 51 | -2.462987 FOLLOWING -0.05261233 52 | -1.762345 FOR -0.05261234 53 | -2.462987 FORGET -0.05261234 54 | -2.160911 FROM -0.05261233 55 | -2.462987 FRONT -0.05261233 56 | -2.462987 GARAGE -0.05261233 57 | -2.462987 GENIE -0.05261234 58 | -2.462987 GET -0.05261233 59 | -2.462987 GOING -0.05261233 60 | -1.984472 GOOGLE -0.2287036 61 | -2.462987 HELLO -0.05261233 62 | -2.462987 HERE -0.05261233 63 | -2.462987 HEY -0.05261234 64 | -2.462987 HI -0.05261233 65 | -2.462987 HOME -0.05261234 66 | -2.462987 HOURS -0.05261234 67 | -2.462987 HOW -0.05261233 68 | -1.762345 I -0.1495224 69 | -1.984472 IN -0.05261234 70 | -2.462987 INTENDED -0.05261233 71 | -1.762345 IS -0.1495224 72 | -1.984472 IT -0.05261235 73 | -2.462987 ITUNES -0.05261234 74 | -2.462987 JUSTIN -0.05261233 75 | -2.462987 KIDS -0.05261234 76 | -2.462987 LAMP -0.05261234 77 | -2.462987 LEAVE -0.05261233 78 | -2.462987 LIGHT -0.05261233 79 | -2.462987 LIGHTS -0.05261234 80 | -2.462987 LIKE -0.05261234 81 | -2.160911 LIVING -0.5297336 82 | -2.462987 LOGI -0.05261234 83 | -2.462987 LOGITECH -0.05261234 84 | -2.462987 LOGITECH'S -0.05261233 85 | -2.462987 LONG -0.05261233 86 | -2.462987 LOVE -0.05261233 87 | -2.462987 LUCKY -0.05261234 88 | -2.462987 MAKE -0.05261233 89 | -2.462987 MARS -0.05261234 90 | -2.160911 ME -0.05261233 91 | -2.462987 MEETING -0.05261234 92 | -2.462987 MINUTES -0.05261234 93 | -2.462987 MOM -0.05261233 94 | -2.160911 MUSIC -0.3536423 95 | -2.462987 MUTE -0.05261234 96 | -1.984472 MY -0.05261234 97 | -2.160911 NAME -0.05261233 98 | -2.462987 NEED -0.05261233 99 | -2.462987 NEW -0.05261233 100 | -2.462987 NEXT -0.05261233 101 | -2.160911 NOW -0.05261233 102 | -2.462987 NPR -0.05261234 103 | -2.462987 ODB -0.05261234 104 | -1.859359 OF -0.1775511 105 | -2.462987 OK -0.05261233 106 | -1.762345 ON -0.2075143 107 | -2.462987 ONE -0.05261233 108 | -2.462987 ONLY -0.05261233 109 | -2.462987 OPEN -0.05261234 110 | -2.462987 ORANGE -0.05261234 111 | -2.462987 OUT -0.05261234 112 | -2.462987 PANDORA -0.05261234 113 | -2.462987 PART -0.05261233 114 | -2.462987 PAUSE -0.05261234 115 | -2.462987 PHRASES -0.05261234 116 | -2.462987 PICK -0.05261234 117 | -2.462987 PILOTS -0.05261234 118 | -2.462987 PIRATES 
-0.05261233 119 | -2.462987 PLATTEN -0.05261234 120 | -2.462987 PLAY -0.05261233 121 | -2.462987 PM -0.05261233 122 | -2.462987 PUNK -0.05261234 123 | -2.462987 RACHEL -0.05261233 124 | -2.462987 RECOGNITION -0.05261233 125 | -2.462987 RECORDING -0.05261234 126 | -2.462987 RED -0.05261234 127 | -2.462987 REMINDER -0.05261234 128 | -2.462987 REWIND -0.05261234 129 | -2.462987 RIGHT -0.05261234 130 | -2.462987 RISE -0.05261234 131 | -2.160911 ROOM -0.05261234 132 | -2.462987 SAY -0.05261233 133 | -2.462987 SCRIPT -0.05261234 134 | -2.462987 SEARCH -0.05261234 135 | -2.462987 SECONDS -0.05261234 136 | -2.462987 SET -0.1775511 137 | -2.462987 SHELTON -0.05261234 138 | -2.462987 SIRI -0.05261234 139 | -2.462987 SIXTEEN -0.05261234 140 | -2.462987 SOMETHING -0.05261234 141 | -2.462987 SONOS -0.05261234 142 | -2.462987 SOUNDCLOUD -0.05261234 143 | -2.462987 SPIDERS -0.05261234 144 | -2.462987 SPOTIFY -0.05261234 145 | -2.462987 STAND -0.05261234 146 | -2.462987 STAPELTON -0.05261234 147 | -2.462987 STARDUST -0.05261234 148 | -2.462987 START -0.05261234 149 | -2.462987 STATE -0.05261233 150 | -2.462987 STOP -0.05261234 151 | -2.462987 STRESSED -0.05261233 152 | -2.462987 SUMMER -0.05261233 153 | -2.462987 TAKE -0.05261234 154 | -2.160911 TEN -0.05261233 155 | -2.462987 TESTING -0.05261234 156 | -1.419695 THE -0.08257556 157 | -2.160911 THIS -0.05261233 158 | -2.462987 TIME -0.05261233 159 | -2.462987 TIMER -0.05261234 160 | -1.419695 TO -0.09400501 161 | -2.160911 TODAY -0.3536423 162 | -2.462987 TRAVELER -0.05261234 163 | -2.462987 TURN -0.2744611 164 | -2.462987 TWENTY -0.05261233 165 | -2.462987 TWO -0.05261233 166 | -2.160911 UP -0.05261234 167 | -1.984472 VOICE -0.2287036 168 | -2.160911 VOLUME -0.05261234 169 | -2.462987 WANT -0.05261233 170 | -2.160911 WATCH -0.05261233 171 | -2.462987 WEATHER -0.05261233 172 | -2.462987 WHAT -0.3536424 173 | -2.462987 WHAT'S -0.05261233 174 | -2.462987 WILL -0.05261233 175 | -2.462987 WORK -0.05261234 176 | -2.462987 YOU -0.05261234 177 | -2.462987 YOUR -0.05261234 178 | -2.462987 YOURSELF -0.05261234 179 | -2.462987 ZIGGY -0.05261233 180 | 181 | \2-grams: 182 | -2.449389 CALL 183 | -2.449389 HELLO 184 | -1.086268 HEY 185 | -2.449389 HI 186 | -2.449389 HOW 187 | -2.053426 I 188 | -1.588285 IN 189 | -2.053426 IS 190 | -2.449389 MUTE 191 | -2.449389 OK 192 | -1.169629 ON 193 | -2.449389 PAUSE 194 | -0.6590814 PLAY 195 | -2.449389 REWIND 196 | -2.449389 SEARCH 197 | -1.204978 SET 198 | -2.449389 STOP 199 | -1.086268 TURN 200 | -1.61113 VOLUME 201 | -2.311984 WATCH 202 | -1.635243 WHAT 203 | -1.635243 WHAT'S 204 | -1.221142 A REMINDER 205 | -1.221142 A TIMER 206 | -0.9312775 AIR CONDITIONING 207 | -0.8880444 ALARM FOR 208 | -0.5881212 ALEXA 209 | -1.199537 AM NOW 210 | -1.221142 AM RECORDING 211 | -0.9200591 AMAZON MUSIC 212 | -1.221142 AND FALL 213 | -1.042144 AND THE 214 | -0.9312775 ANECHOIC CHAMBER 215 | -0.9312775 AT HOME 216 | -0.5881212 BANGALORE 217 | -0.8880444 BEFORE I 218 | -0.5881212 BIEBER 219 | -0.5881212 BLACK 220 | -0.9312775 BLAKE SHELTON 221 | -0.9312775 BLUE GENIE 222 | -0.5881212 BOWIE 223 | -1.803332 BY BLAKE 224 | -1.803332 BY CHRIS 225 | -1.803332 BY DAFT 226 | -1.803332 BY DRAKE 227 | -1.803332 BY JUSTIN 228 | -1.803332 BY ODB 229 | -1.803332 BY RACHEL 230 | -1.803332 BY TWENTY 231 | -1.803332 BY YOU 232 | -0.9200591 CALENDAR TODAY 233 | -0.8303289 CALIBRATE THE 234 | -0.9312775 CALL MOM 235 | -0.9312775 CAME HERE 236 | -0.8880444 CAMERAS ON 237 | -0.5881212 CARIBBEAN 238 | -0.8880444 CHAMBER FOR 239 | -0.9312775 CHRIS STAPELTON 240 | 
-0.9200591 CONDITIONING DOWN 241 | -0.9312775 DAFT PUNK 242 | -0.9312775 DAVID BOWIE 243 | -0.9200591 DINING ROOM 244 | -0.8880444 DO I 245 | -0.9312775 DOOR OPEN 246 | -0.2012962 DOWN 247 | -0.5881212 DRAKE 248 | -0.8303289 DRIVE TO 249 | -0.9312775 EIGHT HOURS 250 | -0.898456 EPISODE OF 251 | -0.5881212 ESPN 252 | -0.898456 FALL OF 253 | -0.9312775 FIRST MEETING 254 | -0.5881212 FIVE 255 | -0.9312775 FLOOR LAMP 256 | -0.9312775 FOLLOWING PHRASES 257 | -1.587212 FOR EIGHT 258 | -1.587212 FOR PIRATES 259 | -1.538578 FOR TEN 260 | -1.587212 FOR TWO 261 | -1.494846 FOR VOICE 262 | -0.8487282 FORGET BY 263 | -1.221142 FROM MARS 264 | -1.199537 FROM NOW 265 | -0.9312775 FRONT RIGHT 266 | -0.9312775 GARAGE DOOR 267 | -0.5881212 GENIE 268 | -0.9312775 GET LUCKY 269 | -0.8303289 GOING TO 270 | -0.330307 GOOGLE 271 | -1.375628 GOOGLE MUSIC 272 | -0.9312775 HELLO BLUE 273 | -0.8303289 HERE TO 274 | -1.587212 HEY ALEXA 275 | -1.494846 HEY GOOGLE 276 | -1.587212 HEY LOGI 277 | -1.587212 HEY LOGITECH 278 | -1.587212 HEY SIRI 279 | -0.9091232 HI MY 280 | -0.5881212 HOME 281 | -0.9200591 HOURS FROM 282 | -0.9312775 HOW LONG 283 | -0.6473172 I AM 284 | -1.597578 I NEED 285 | -1.597578 I START 286 | -1.597578 I WANT 287 | -1.500668 IN BANGALORE 288 | -1.423601 IN GOOGLE 289 | -1.500668 IN ITUNES 290 | -1.500668 IN LOGITECH'S 291 | -0.8303289 INTENDED TO 292 | -1.520511 IS IT 293 | -1.597578 IS ONLY 294 | -1.597578 IS STATE 295 | -0.6024376 IS THE 296 | -1.32594 IT IN 297 | -1.386348 IT TAKE 298 | -1.144261 IT TO 299 | -0.5881212 ITUNES 300 | -0.9312775 JUSTIN BIEBER 301 | -0.5881212 KIDS 302 | -0.5881212 LAMP 303 | -0.8303289 LEAVE TO 304 | -0.8303289 LIGHT TO 305 | -0.5881212 LIGHTS 306 | -0.9200591 LIKE TODAY 307 | -0.1507424 LIVING ROOM 308 | -0.5881212 LOGI 309 | -0.5881212 LOGITECH 310 | -0.9312775 LOGITECH'S ANECHOIC 311 | -0.9312775 LONG WILL 312 | -0.9312775 LOVE YOURSELF 313 | -0.8487282 LUCKY BY 314 | -0.9091232 MAKE IT 315 | -0.5881212 MARS 316 | -1.221142 ME SOMETHING 317 | -1.042144 ME TO 318 | -0.5881212 MEETING 319 | -0.5881212 MINUTES 320 | -0.9312775 MOM AT 321 | -0.2012962 MUSIC 322 | -0.5881212 MUTE 323 | -1.587212 MY CALENDAR 324 | -1.587212 MY DINING 325 | -1.587212 MY FIRST 326 | -1.538578 MY NAME 327 | -1.494846 MY VOICE 328 | -1.140505 NAME I 329 | -1.140505 NAME IS 330 | -0.8303289 NEED TO 331 | -0.9312775 NEW BLACK 332 | -0.9312775 NEXT EPISODE 333 | -0.6965729 NOW 334 | -1.221142 NOW GOING 335 | -0.8880444 NPR ON 336 | -0.5881212 ODB 337 | -1.511287 OF ORANGE 338 | -0.5174091 OF THE 339 | -1.511287 OF ZIGGY 340 | -0.9091232 OK GOOGLE 341 | -0.9498084 ON 342 | -1.868225 ON AMAZON 343 | -0.9287202 ON MY 344 | -1.868225 ON PANDORA 345 | -1.868225 ON SOUNDCLOUD 346 | -1.868225 ON SPOTIFY 347 | -0.6289269 ON THE 348 | -0.9312775 ONE PILOTS 349 | -0.9312775 ONLY INTENDED 350 | -0.5881212 OPEN 351 | -0.8880444 ORANGE IS 352 | -0.8487282 OUT BY 353 | -0.5881212 PANDORA 354 | -0.898456 PART OF 355 | -0.5881212 PAUSE 356 | -0.5881212 PHRASES 357 | -0.9200591 PICK UP 358 | -0.5881212 PILOTS 359 | -0.898456 PIRATES OF 360 | -0.5881212 PLATTEN 361 | -0.8136998 PLAY 362 | -1.901061 PLAY CAME 363 | -1.901061 PLAY DAVID 364 | -1.901061 PLAY GET 365 | -1.901061 PLAY LOVE 366 | -1.806209 PLAY ME 367 | -1.901061 PLAY NPR 368 | -1.901061 PLAY STAND 369 | -1.901061 PLAY STRESSED 370 | -1.901061 PLAY SUMMER 371 | -1.364388 PLAY THE 372 | -1.901061 PLAY TRAVELER 373 | -0.9312775 PM PICK 374 | -0.5881212 PUNK 375 | -0.9312775 RACHEL PLATTEN 376 | -1.221142 RECOGNITION TESTING 377 | -1.042144 
RECOGNITION TO 378 | -0.9200591 RECORDING THIS 379 | -0.5881212 RED 380 | -0.8880444 REMINDER FOR 381 | -0.9200591 REWIND TEN 382 | -0.9200591 RIGHT LIVING 383 | -0.9200591 RISE AND 384 | -1.500668 ROOM FLOOR 385 | -1.500668 ROOM LIGHT 386 | -1.500668 ROOM LIGHTS 387 | -1.500668 ROOM SONOS 388 | -0.8303289 SAY THE 389 | -0.8880444 SCRIPT IS 390 | -0.8880444 SEARCH FOR 391 | -0.5881212 SECONDS 392 | -0.5515851 SET A 393 | -1.511287 SET ALARM 394 | -1.480033 SET VOLUME 395 | -0.5881212 SHELTON 396 | -0.5881212 SIRI 397 | -0.8487282 SIXTEEN BY 398 | -0.8487282 SOMETHING BY 399 | -0.5881212 SONOS 400 | -0.5881212 SOUNDCLOUD 401 | -0.9200591 SPIDERS FROM 402 | -0.5881212 SPOTIFY 403 | -0.8487282 STAND BY 404 | -0.5881212 STAPELTON 405 | -0.9200591 STARDUST AND 406 | -0.8880444 START I 407 | -0.9312775 STATE YOUR 408 | -0.5881212 STOP 409 | -0.9312775 STRESSED OUT 410 | -0.9312775 SUMMER SIXTEEN 411 | -0.9200591 TAKE ME 412 | -1.221142 TEN MINUTES 413 | -1.221142 TEN SECONDS 414 | -0.9200591 TESTING THIS 415 | -1.980735 THE AIR 416 | -1.980735 THE CARIBBEAN 417 | -1.980735 THE FOLLOWING 418 | -1.980735 THE FRONT 419 | -1.980735 THE GARAGE 420 | -1.980735 THE KIDS 421 | -1.097011 THE LIVING 422 | -1.980735 THE NEW 423 | -1.980735 THE NEXT 424 | -1.980735 THE RISE 425 | -1.980735 THE SCRIPT 426 | -1.980735 THE SPIDERS 427 | -1.791137 THE VOICE 428 | -1.980735 THE WEATHER 429 | -1.178955 THIS IN 430 | -1.221142 THIS PART 431 | -1.221142 TIME DO 432 | -1.140505 TIME IS 433 | -0.8880444 TIMER FOR 434 | -1.881221 TO CALIBRATE 435 | -1.881221 TO DRIVE 436 | -1.881221 TO FORGET 437 | -1.881221 TO LEAVE 438 | -1.881221 TO MAKE 439 | -0.960078 TO MY 440 | -1.881221 TO RED 441 | -1.881221 TO SAY 442 | -1.797722 TO WATCH 443 | -1.881221 TO WORK 444 | -0.2012962 TODAY 445 | -0.8487282 TRAVELER BY 446 | -1.608197 TURN CAMERAS 447 | -0.36451 TURN ON 448 | -1.36611 TURN THE 449 | -0.9312775 TWENTY ONE 450 | -0.9312775 TWO PM 451 | -0.6965729 UP 452 | -1.042144 UP THE 453 | -1.397233 VOICE BEFORE 454 | -0.4278275 VOICE RECOGNITION 455 | -1.355094 VOLUME DOWN 456 | -1.386348 VOLUME FIVE 457 | -1.355094 VOLUME UP 458 | -0.8303289 WANT TO 459 | -1.221142 WATCH ESPN 460 | -1.042144 WATCH THE 461 | -0.9312775 WEATHER LIKE 462 | -0.2529206 WHAT TIME 463 | -1.140505 WHAT'S ON 464 | -1.042144 WHAT'S THE 465 | -0.9091232 WILL IT 466 | -0.5881212 WORK 467 | -0.8487282 YOU BY 468 | -0.9200591 YOUR NAME 469 | -0.8487282 YOURSELF BY 470 | -0.9312775 ZIGGY STARDUST 471 | 472 | \end\ 473 | -------------------------------------------------------------------------------- /run_experiment.py: -------------------------------------------------------------------------------- 1 | from dataloader import MultiDataset, MultiDataLoader 2 | from focalloss import FocalLoss 3 | from warpctc_pytorch import CTCLoss 4 | from model import MultiTask 5 | from decoder import GreedyDecoder, BeamCTCDecoder 6 | from training import train, test 7 | import torch 8 | import torch.nn as nn 9 | from os import makedirs 10 | from tensorboardX import SummaryWriter 11 | from pathlib import Path 12 | import math 13 | from utils import now_str 14 | import gc 15 | 16 | manual_seed = 1337 17 | torch.manual_seed(manual_seed) 18 | torch.cuda.manual_seed_all 19 | print(f'Using torch manual seed {manual_seed}.') 20 | 21 | ### Start timer 22 | min_ = 0 23 | if min_ > 0: 24 | print(f'WARNING TIMER {min_} min') 25 | import time ; from tqdm import tqdm 26 | for __ in tqdm(range(min_)): 27 | time.sleep(60) 28 | ### 29 | 30 | 31 | def run_experiment(_exp_name, 32 | _epochs, 
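                   # Descriptive note (not in the original source): the _use_*_in flags select
                   # which input features (MFCCs, i-vectors, speaker embeddings) are concatenated
                   # and fed to the shared head, while the _use_*_out flags select which task
                   # branches (CTC transcription, accent classification) are built and trained;
                   # _losses_mix balances the two losses when both outputs are enabled.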
33 | _train_manifest, 34 | _test_manifest, 35 | _labels, 36 | _use_mfcc_in, 37 | _use_ivectors_in, 38 | _use_embeddings_in, 39 | _use_transcripts_out, 40 | _use_accents_out, 41 | _batch_size, 42 | _num_workers, 43 | _mfcc_size, 44 | _ivector_size, 45 | _embedding_size, 46 | _rnn_type, 47 | _rnn_hidden_size, 48 | _nb_head_layers, 49 | _nb_speech_layers, 50 | _nb_accents_layers, 51 | _bidirectional, 52 | _losses_mix, 53 | _learning_rate, 54 | _lm_path, 55 | _decoder_alpha, 56 | _decoder_beta, 57 | _decoder_cutoff_top_n, 58 | _decoder_beam_width, 59 | _cuda, 60 | _tensorboard_path, 61 | _saved_models_path, 62 | _bottleneck_size, 63 | _accent_loss): 64 | 65 | print(f'\n##### Running experiment {_exp_name} #####') 66 | 67 | # Tools to log values 68 | results_dict = {} 69 | results_dict['train_loss'] = [] 70 | results_dict['train_loss_text'] = [] 71 | results_dict['train_loss_accent'] = [] 72 | results_dict['test_loss'] = [] 73 | results_dict['test_loss_text'] = [] 74 | results_dict['test_loss_accent'] = [] 75 | results_dict['test_wer'] = [] 76 | results_dict['test_accent_acc'] = [] 77 | 78 | tb_path = Path(_tensorboard_path) / _exp_name 79 | makedirs(tb_path, exist_ok=True) 80 | tb_writer = SummaryWriter(tb_path) 81 | 82 | ### DATA LOADING 83 | 84 | # Training set 85 | train_dataset = MultiDataset(_train_manifest, 86 | _labels, 87 | use_mfcc_in=_use_mfcc_in, 88 | use_ivectors_in=_use_ivectors_in, 89 | use_embeddings_in=_use_embeddings_in, 90 | embedding_size=_embedding_size, 91 | use_transcripts_out=_use_transcripts_out, 92 | use_accents_out=_use_accents_out) 93 | 94 | train_loader = MultiDataLoader(train_dataset, 95 | batch_size=_batch_size, 96 | shuffle=True, 97 | num_workers=_num_workers) 98 | 99 | # Testing set 100 | test_dataset = MultiDataset(_test_manifest, 101 | _labels, 102 | use_mfcc_in=_use_mfcc_in, 103 | use_ivectors_in=_use_ivectors_in, 104 | use_embeddings_in=_use_embeddings_in, 105 | embedding_size=_embedding_size, 106 | use_transcripts_out=_use_transcripts_out, 107 | use_accents_out=_use_accents_out) 108 | 109 | test_loader = MultiDataLoader(test_dataset, 110 | batch_size=_batch_size, 111 | shuffle=True, 112 | num_workers=_num_workers) 113 | 114 | 115 | ### CREATE MODEL 116 | 117 | model = MultiTask(use_mfcc_in = _use_mfcc_in, 118 | use_ivectors_in = _use_ivectors_in, 119 | use_embeddings_in = _use_embeddings_in, 120 | use_transcripts_out = _use_transcripts_out, 121 | use_accents_out = _use_accents_out, 122 | mfcc_size = _mfcc_size, 123 | ivector_size = _ivector_size, 124 | embedding_size = _embedding_size, 125 | rnn_type = _rnn_type, 126 | labels = _labels, 127 | accents_dict = train_dataset.accent_dict, 128 | rnn_hidden_size = _rnn_hidden_size, 129 | nb_head_layers = _nb_head_layers, 130 | nb_speech_layers = _nb_speech_layers, 131 | nb_accents_layers = _nb_accents_layers, 132 | bidirectional = _bidirectional, 133 | bottleneck_size = _bottleneck_size, 134 | DEBUG=False) 135 | if _cuda: 136 | model = model.cuda() 137 | 138 | print(model, '\n') 139 | print('Model parameters counts:', MultiTask.get_param_size(model), '\n') 140 | 141 | ### OPTIMIZER, CRITERION, DECODER 142 | 143 | # Optimizer 144 | optimizer = torch.optim.Adam(model.parameters(), lr=_learning_rate) 145 | 146 | # Criterion 147 | if _use_accents_out: 148 | if _accent_loss == 'focal': 149 | AccLoss = FocalLoss() 150 | elif _accent_loss == 'CE': 151 | AccLoss = nn.CrossEntropyLoss() 152 | else: 153 | raise ValueError(f'Loss {_accent_loss} for accent_loss is unknown. 
Please use either "focal" or "CE".') 154 | 155 | if not _use_transcripts_out: # only accent classification 156 | criterion = AccLoss 157 | elif not _use_accents_out: # only text recognition 158 | criterion = CTCLoss() 159 | else: # both tasks 160 | criterion = (CTCLoss(), FocalLoss()) 161 | 162 | # Decoder 163 | if _use_transcripts_out: 164 | decoder = BeamCTCDecoder(_labels, 165 | lm_path=_lm_path, 166 | alpha=_decoder_alpha, 167 | beta=_decoder_beta, 168 | cutoff_top_n=_decoder_cutoff_top_n, 169 | cutoff_prob=_decoder_cutoff_top_n, 170 | beam_width=_decoder_beam_width, 171 | num_processes=_num_workers) 172 | 173 | target_decoder = GreedyDecoder(_labels) 174 | else: 175 | decoder, target_decoder = None, None 176 | 177 | 178 | ### EPOCHS 179 | best_wer = math.inf 180 | best_acc = 0 181 | 182 | for epoch in range(1, _epochs + 1): 183 | ### TRAIN 184 | print(f'Epoch {epoch} training: {exp_name}') 185 | train_results = train(model, train_loader, criterion, optimizer, losses_mix=_losses_mix) 186 | train_loss, train_loss_text, train_loss_accent = train_results 187 | 188 | results_dict['train_loss'].append(train_loss) 189 | results_dict['train_loss_text'].append(train_loss_text) 190 | results_dict['train_loss_accent'].append(train_loss_accent) 191 | print(f'Epoch {epoch} training loss: {train_loss}') 192 | 193 | ### TEST 194 | print(f'Epoch {epoch} testing') 195 | test_results = test(model, test_loader, criterion, decoder, target_decoder, losses_mix=_losses_mix) 196 | test_loss, test_loss_text, test_loss_accent, test_wer, test_accent_acc = test_results 197 | 198 | results_dict['test_loss'].append(test_loss) 199 | results_dict['test_loss_text'].append(test_loss_text) 200 | results_dict['test_loss_accent'].append(test_loss_accent) 201 | results_dict['test_wer'].append(test_wer) 202 | results_dict['test_accent_acc'].append(test_accent_acc) 203 | print(f'Epoch {epoch} testing loss: {test_loss}') 204 | 205 | # Add values to tensorboard 206 | for key, results in results_dict.items(): 207 | tb_writer.add_scalar(key, results[-1], epoch) 208 | 209 | #Save model if it is best 210 | save_new=False 211 | if _use_transcripts_out: 212 | if test_wer < best_wer: 213 | save_new = True 214 | best_wer = test_wer 215 | else: 216 | if test_accent_acc > best_acc: 217 | save_new = True 218 | best_acc = test_accent_acc 219 | 220 | if save_new: 221 | MultiTask.serialize(model, 222 | Path(_saved_models_path) / _exp_name, 223 | save=True, 224 | exp_name=_exp_name, 225 | optimizer=optimizer, 226 | epoch=epoch, 227 | train_losses=results_dict['train_loss'], 228 | test_losses=results_dict['test_loss'], 229 | text_train_losses=results_dict['train_loss_text'], 230 | text_test_losses=results_dict['test_loss_text'], 231 | text_wers=results_dict['test_wer'], 232 | accent_train_losses=results_dict['train_loss_accent'], 233 | accent_test_losses=results_dict['test_loss_accent'], 234 | accent_accuracies=results_dict['test_accent_acc']) 235 | 236 | del model 237 | gc.collect() 238 | torch.cuda.empty_cache() 239 | ## end of run_experiment ## 240 | 241 | 242 | ### MAIN 243 | 244 | if __name__ == '__main__': 245 | import argparse 246 | 247 | parser = argparse.ArgumentParser(description='DeepSpeech model information') 248 | parser.add_argument('--train', action='store_true', help='Uses the train set instead of the dev set.') 249 | parser.add_argument('--epochs', default=None, type=int, help='Number of training epochs') 250 | parser.add_argument('--patch_path', default='experiments.cfg', type=str, help='Path to experiment list') 251 | args 
= parser.parse_args() 252 | 253 | DEV = not args.train 254 | PATCH_PATH = args.patch_path 255 | EPOCHS = args.epochs 256 | 257 | import config 258 | confs = config.Config() 259 | 260 | for conf in confs.patch_config(PATCH_PATH): 261 | exp_name = conf['exp_name_prefix'] 262 | exp_name += '_DEV' if DEV else '_TRAIN' 263 | exp_name += '__in' 264 | exp_name += '_mfcc' if conf['use_mfcc_in'] else '' 265 | exp_name += '_ivect' if conf['use_ivectors_in'] else '' 266 | exp_name += '_emb' if conf['use_embeddings_in'] else '' 267 | exp_name += '__out' 268 | exp_name += '_transcripts' if conf['use_transcripts_out'] else '' 269 | exp_name += f'_accents-mix{conf["losses_mix"]}-{conf["accent_loss"]}' if conf['use_accents_out'] else '' 270 | exp_name += f'__nblyrs-head-{conf["nb_head_layers"]}' 271 | exp_name += f'-speech-{conf["nb_speech_layers"]}' 272 | exp_name += f'-accent-{conf["nb_accents_layers"]}' 273 | exp_name += f'__bnf-{conf["bottleneck_size"]}' 274 | exp_name += f'__{now_str()}' 275 | 276 | train_manifest = conf['dev_manifest'] if DEV else conf['train_manifest'] 277 | epochs = EPOCHS if EPOCHS is not None else conf['epochs'] 278 | 279 | try: 280 | run_experiment(_exp_name = exp_name, 281 | _epochs = epochs, 282 | _train_manifest = train_manifest, 283 | _test_manifest = conf['test_manifest'], 284 | _labels = conf['labels'], 285 | _use_mfcc_in = conf['use_mfcc_in'], 286 | _use_ivectors_in = conf['use_ivectors_in'], 287 | _use_embeddings_in = conf['use_embeddings_in'], 288 | _use_transcripts_out = conf['use_transcripts_out'], 289 | _use_accents_out = conf['use_accents_out'], 290 | _batch_size = conf['batch_size'], 291 | _num_workers = conf['num_workers'], 292 | _mfcc_size = conf['mfcc_size'], 293 | _ivector_size = conf['ivector_size'], 294 | _embedding_size = conf['embedding_size'], 295 | _rnn_type = conf['rnn_type'], 296 | _rnn_hidden_size = conf['rnn_hidden_size'], 297 | _nb_head_layers = conf['nb_head_layers'], 298 | _nb_speech_layers = conf['nb_speech_layers'], 299 | _nb_accents_layers = conf['nb_accents_layers'], 300 | _bidirectional = conf['bidirectional'], 301 | _losses_mix = conf['losses_mix'], 302 | _learning_rate = conf['learning_rate'], 303 | _lm_path = conf['lm_path'], 304 | _decoder_alpha = conf['decoder_alpha'], 305 | _decoder_beta = conf['decoder_beta'], 306 | _decoder_cutoff_top_n = conf['decoder_cutoff_top_n'], 307 | _decoder_beam_width = conf['decoder_beam_width'], 308 | _cuda = conf['cuda'], 309 | _tensorboard_path = conf['tensorboard_path'], 310 | _saved_models_path = conf['saved_models_path'], 311 | _bottleneck_size = conf['bottleneck_size'], 312 | _accent_loss = conf['accent_loss']) 313 | 314 | except Exception as e: 315 | print(f'Error occured in run {exp_name}:', e) -------------------------------------------------------------------------------- /model.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import torch.nn as nn 4 | from collections import OrderedDict 5 | from modules import MaskConv, BatchRNN, InferenceBatchSoftmax, SequenceWise 6 | 7 | 8 | def rnn_block(rnn_input_size, rnn_hidden_size, rnn_type, bidirectional, nb_layers): 9 | """Creates a stack of Batch RNNs with different input_size than hidden_size.""" 10 | rnns = [] 11 | rnn = BatchRNN(input_size=rnn_input_size, hidden_size=rnn_hidden_size, rnn_type=rnn_type, 12 | bidirectional=bidirectional, batch_norm=False) 13 | rnns.append(('0', rnn)) 14 | for x in range(nb_layers - 1): 15 | rnn = BatchRNN(input_size=rnn_hidden_size, 
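                       # note: layers after the first map hidden_size -> hidden_size and keep
                       # batch norm enabled, unlike the first layer built above, which adapts
                       # the raw feature width and is created with batch_norm=False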
hidden_size=rnn_hidden_size, rnn_type=rnn_type, 16 | bidirectional=bidirectional) 17 | rnns.append(('%d' % (x + 1), rnn)) 18 | return nn.Sequential(OrderedDict(rnns)) 19 | 20 | 21 | class Head(nn.Module): 22 | """Shared part of the neural network.""" 23 | def __init__(self, 24 | rnn_type, 25 | rnn_hidden_size, 26 | nb_layers, 27 | bidirectional, 28 | feature_len, 29 | DEBUG): 30 | 31 | super(Head, self).__init__() 32 | 33 | self._DEBUG = DEBUG 34 | 35 | # CONV 36 | self.conv = MaskConv(nn.Sequential( 37 | nn.Conv2d(1, 32, kernel_size=(41, 11), stride=(2, 2), padding=(20, 5)), 38 | nn.BatchNorm2d(32), 39 | nn.Hardtanh(0, 20, inplace=True), 40 | nn.Conv2d(32, 32, kernel_size=(21, 11), stride=(2, 1), padding=(10, 5)), 41 | nn.BatchNorm2d(32), 42 | nn.Hardtanh(0, 20, inplace=True) 43 | )) 44 | 45 | # RNN 46 | rnn_input_size = feature_len * 8 47 | 48 | self.rnns = rnn_block(rnn_input_size, rnn_hidden_size, rnn_type, bidirectional, nb_layers) 49 | 50 | 51 | def forward(self, x, lengths): 52 | if self._DEBUG: 53 | print('') 54 | print('# BEGIN HEAD #') 55 | print('input', x.size()) 56 | 57 | lengths = lengths.cpu().int() 58 | output_lengths = self.get_seq_lens(lengths) 59 | 60 | x = x.view(x.size(0), 1, x.size(1), x.size(2)) 61 | x = x.transpose(2, 3) 62 | if self._DEBUG: 63 | print('after view transpose', x.size()) 64 | 65 | x, _ = self.conv(x, output_lengths) 66 | if self._DEBUG: 67 | print('after conv', x.size()) 68 | 69 | sizes = x.size() 70 | x = x.view(sizes[0], sizes[1] * sizes[2], sizes[3]) # Collapse feature dimension 71 | x = x.transpose(1, 2).transpose(0, 1).contiguous() # TxNxH 72 | if self._DEBUG: 73 | print('after view transpose', x.size()) 74 | 75 | for rnn in self.rnns: 76 | x = rnn(x, output_lengths) 77 | if self._DEBUG: 78 | print('after rnn', x.size()) 79 | 80 | self._DEBUG = False 81 | return x, output_lengths 82 | 83 | def get_seq_lens(self, input_length): 84 | """ 85 | Given a 1D Tensor or Variable containing integer sequence lengths, return a 1D tensor or variable 86 | containing the size sequences that will be output by the network. 
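        For each Conv2d layer the time dimension shrinks as
        floor((L_in + 2*padding - dilation*(kernel_size - 1) - 1) / stride + 1),
        which is what the loop below computes from each conv module's attributes
        (index [1] is the time axis).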
87 | :param input_length: 1D Tensor 88 | :return: 1D Tensor scaled by model 89 | """ 90 | seq_len = input_length 91 | for m in self.conv.modules(): 92 | if type(m) == nn.modules.conv.Conv2d: 93 | seq_len = ((seq_len + 2 * m.padding[1] - m.dilation[1] * (m.kernel_size[1] - 1) - 1) / m.stride[1] + 1) 94 | return seq_len.int() 95 | 96 | 97 | class SpeechToText(nn.Module): 98 | def __init__(self, 99 | rnn_type, 100 | rnn_hidden_size, 101 | nb_layers, 102 | bidirectional, 103 | labels, 104 | DEBUG): 105 | 106 | super(SpeechToText, self).__init__() 107 | 108 | self._DEBUG = DEBUG 109 | 110 | # RNN 111 | self.rnns = rnn_block(rnn_hidden_size, rnn_hidden_size, rnn_type, bidirectional, nb_layers) 112 | 113 | # FULLY CO 114 | num_classes = len(labels) 115 | 116 | fully_connected = nn.Sequential( 117 | nn.BatchNorm1d(rnn_hidden_size), 118 | nn.Linear(rnn_hidden_size, num_classes, bias=False) 119 | ) 120 | self.fc = nn.Sequential( 121 | SequenceWise(fully_connected), 122 | ) 123 | self.inference_softmax = InferenceBatchSoftmax() 124 | 125 | 126 | def forward(self, x, output_lengths): 127 | if self._DEBUG: 128 | print('') 129 | print('# BEGIN speech to text #') 130 | print('input', x.size()) 131 | 132 | for rnn in self.rnns: 133 | x = rnn(x, output_lengths) 134 | 135 | if self._DEBUG: 136 | print('after rnn', x.size()) 137 | 138 | x = self.fc(x) 139 | if self._DEBUG: 140 | print('after fc', x.size()) 141 | 142 | x = x.transpose(0, 1) 143 | if self._DEBUG: 144 | print('after transpose', x.size()) 145 | # identity in training mode, softmax in eval mode 146 | x = self.inference_softmax(x) 147 | if self._DEBUG: 148 | print('after softmax', x.size()) 149 | 150 | x = x.transpose(0, 1) 151 | if self._DEBUG: 152 | print('after transpose', x.size()) 153 | 154 | self._DEBUG = False 155 | return x 156 | 157 | 158 | class AccentClassifier(nn.Module): 159 | def __init__(self, 160 | rnn_type, 161 | rnn_hidden_size, 162 | nb_layers, 163 | bidirectional, 164 | accents_dict, 165 | bottleneck_size, 166 | DEBUG): 167 | 168 | super(AccentClassifier, self).__init__() 169 | 170 | self._DEBUG = DEBUG 171 | 172 | # RNN 173 | self.rnns = rnn_block(rnn_hidden_size, rnn_hidden_size, rnn_type, bidirectional, nb_layers) 174 | 175 | # FULLY CO 176 | num_classes = len(accents_dict) 177 | 178 | self.bnf = nn.Sequential( 179 | nn.BatchNorm1d(rnn_hidden_size), 180 | nn.Linear(rnn_hidden_size, 1024), 181 | nn.ReLU(), 182 | nn.BatchNorm1d(1024), 183 | nn.Linear(1024, bottleneck_size), 184 | nn.ReLU(), 185 | ) 186 | 187 | self.fc = nn.Sequential( 188 | nn.BatchNorm1d(bottleneck_size), 189 | nn.Linear(bottleneck_size, num_classes), 190 | nn.ReLU(), 191 | ) 192 | 193 | self.softmax = nn.Softmax(dim=1) 194 | 195 | def forward(self, x, output_lengths): 196 | if self._DEBUG: 197 | print('') 198 | print('# BEGIN Acc #') 199 | print('input', x.size()) 200 | 201 | for rnn in self.rnns: 202 | x = rnn(x, output_lengths) 203 | 204 | if self._DEBUG: 205 | print('after rnn', x.size()) 206 | 207 | x = x.mean(dim=0) 208 | 209 | if self._DEBUG: 210 | print('after mean', x.size()) 211 | 212 | bottleneck = self.bnf(x) 213 | 214 | if self._DEBUG: 215 | print('after bnf', bottleneck.size()) 216 | 217 | x = self.fc(bottleneck) 218 | 219 | if self._DEBUG: 220 | print('after fc', x.size()) 221 | 222 | x = self.softmax(x) 223 | 224 | if self._DEBUG: 225 | print('after softmax', x.size()) 226 | 227 | self._DEBUG = False 228 | return x, bottleneck 229 | 230 | 231 | class MultiTask(nn.Module): 232 | def __init__(self, 233 | use_mfcc_in=True, 234 | 
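                 # architecture note: a shared convolutional + recurrent Head feeds two optional
                 # branches, SpeechToText (CTC over `labels`) and AccentClassifier (softmax over
                 # `accents_dict` plus a bottleneck layer), each enabled by its use_*_out flag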
use_ivectors_in=True, 235 | use_embeddings_in=True, 236 | use_transcripts_out=True, 237 | use_accents_out=True, 238 | mfcc_size=40, 239 | ivector_size=100, 240 | embedding_size=100, 241 | rnn_type=nn.GRU, 242 | labels="abc", 243 | accents_dict={'uk', 'us'}, 244 | rnn_hidden_size=800, 245 | nb_head_layers=2, 246 | nb_speech_layers=2, 247 | nb_accents_layers=2, 248 | bidirectional=True, 249 | bottleneck_size=256, 250 | DEBUG=False): 251 | 252 | self._meta = { 253 | 'use_mfcc_in': use_mfcc_in, 254 | 'use_ivectors_in': use_ivectors_in, 255 | 'use_embeddings_in': use_embeddings_in, 256 | 'use_transcripts_out': use_transcripts_out, 257 | 'use_accents_out': use_accents_out, 258 | 'mfcc_size': mfcc_size, 259 | 'ivector_size': ivector_size, 260 | 'embedding_size': embedding_size, 261 | 'rnn_type': rnn_type, 262 | 'labels': labels, 263 | 'accents_dict': accents_dict, 264 | 'rnn_hidden_size': rnn_hidden_size, 265 | 'nb_head_layers': nb_head_layers, 266 | 'nb_speech_layers': nb_speech_layers, 267 | 'nb_accents_layers': nb_accents_layers, 268 | 'bidirectional': bidirectional, 269 | 'bottleneck_size': bottleneck_size, 270 | 'DEBUG': DEBUG, 271 | } 272 | 273 | super(MultiTask, self).__init__() 274 | 275 | self.feature_len = 0 276 | self.feature_len += mfcc_size if use_mfcc_in else 0 277 | self.feature_len += ivector_size if use_ivectors_in else 0 278 | self.feature_len += embedding_size if use_embeddings_in else 0 279 | 280 | self.Head = Head(rnn_type=rnn_type, 281 | rnn_hidden_size=rnn_hidden_size, 282 | nb_layers=nb_head_layers, 283 | bidirectional=bidirectional, 284 | feature_len=self.feature_len, 285 | DEBUG=DEBUG) 286 | 287 | if self._meta['use_transcripts_out']: 288 | self.SpeechToText = SpeechToText(rnn_type=rnn_type, 289 | rnn_hidden_size=rnn_hidden_size, 290 | nb_layers=nb_speech_layers, 291 | bidirectional=bidirectional, 292 | labels=labels, 293 | DEBUG=DEBUG) 294 | 295 | if self._meta['use_accents_out']: 296 | self.AccentClassifier = AccentClassifier(rnn_type=rnn_type, 297 | rnn_hidden_size=rnn_hidden_size, 298 | nb_layers=nb_accents_layers, 299 | bidirectional=bidirectional, 300 | accents_dict=accents_dict, 301 | bottleneck_size=bottleneck_size, 302 | DEBUG=DEBUG) 303 | 304 | def forward(self, x, lengths): 305 | x, out_len = self.Head(x, lengths) 306 | x_stt, x_acc, bnf = None, None, None 307 | 308 | if self._meta['use_transcripts_out']: 309 | x_stt = self.SpeechToText(x, out_len) 310 | 311 | if self._meta['use_accents_out']: 312 | x_acc, bnf = self.AccentClassifier(x, out_len) 313 | 314 | return x_stt, x_acc, out_len, bnf 315 | 316 | 317 | @staticmethod 318 | def get_param_size(model): 319 | params = 0 320 | for p in model.parameters(): 321 | tmp = 1 322 | for x in p.size(): 323 | tmp *= x 324 | params += tmp 325 | return params 326 | 327 | @classmethod 328 | def load_model(cls, path): 329 | package = torch.load(path, map_location=lambda storage, loc: storage) 330 | meta = package['meta'] 331 | model = cls( 332 | use_mfcc_in = meta['use_mfcc_in'], 333 | use_ivectors_in = meta['use_ivectors_in'], 334 | use_embeddings_in = meta['use_embeddings_in'], 335 | use_transcripts_out = meta['use_transcripts_out'], 336 | use_accents_out = meta['use_accents_out'], 337 | mfcc_size = meta['mfcc_size'], 338 | ivector_size = meta['ivector_size'], 339 | embedding_size = meta['embedding_size'], 340 | rnn_type = meta['rnn_type'], 341 | labels = meta['labels'], 342 | accents_dict = meta['accents_dict'], 343 | rnn_hidden_size = meta['rnn_hidden_size'], 344 | nb_head_layers = meta['nb_head_layers'], 345 | 
nb_speech_layers = meta['nb_speech_layers'], 346 | nb_accents_layers = meta['nb_accents_layers'], 347 | bidirectional = meta['bidirectional'], 348 | bottleneck_size = meta['bottleneck_size'], 349 | DEBUG = meta['DEBUG'], 350 | ) 351 | model.load_state_dict(package['state_dict']) 352 | return model, package 353 | 354 | @staticmethod 355 | def serialize(model, 356 | path='./__temp__', 357 | save=True, 358 | exp_name=None, 359 | optimizer=None, 360 | epoch=None, 361 | train_losses=None, 362 | test_losses=None, 363 | text_train_losses=None, 364 | text_test_losses=None, 365 | text_wers=None, 366 | accent_train_losses=None, 367 | accent_test_losses=None, 368 | accent_accuracies=None): 369 | 370 | """Saves the model in a packaged form. Also returns the package. 371 | Use the load_model class method to recreate a model from a package.""" 372 | 373 | package = { 374 | 'state_dict': model.state_dict(), 375 | 'meta': model._meta 376 | } 377 | 378 | if exp_name is not None: 379 | package['exp_name'] = exp_name 380 | if optimizer is not None: 381 | package['optimizer'] = optimizer 382 | if epoch is not None: 383 | package['epoch'] = epoch 384 | if train_losses is not None: 385 | package['train_losses'] = train_losses 386 | if test_losses is not None: 387 | package['test_losses'] = test_losses 388 | if text_train_losses is not None: 389 | package['text_train_losses'] = text_train_losses 390 | if text_test_losses is not None: 391 | package['text_test_losses'] = text_test_losses 392 | if text_wers is not None: 393 | package['text_wers'] = text_wers 394 | if accent_train_losses is not None: 395 | package['accent_train_losses'] = accent_train_losses 396 | if accent_test_losses is not None: 397 | package['accent_test_losses'] = accent_test_losses 398 | if accent_accuracies is not None: 399 | package['accent_accuracies'] = accent_accuracies 400 | 401 | if save: 402 | torch.save(package, str(path) + '.pth') 403 | 404 | return package -------------------------------------------------------------------------------- /tests.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "torch.Size([20, 960, 240])\n" 13 | ] 14 | } 15 | ], 16 | "source": [ 17 | "%reload_ext autoreload\n", 18 | "%autoreload 1\n", 19 | "%aimport config\n", 20 | "\n", 21 | "conf = config.Config()\n", 22 | "\n", 23 | "from model import MultiTask\n", 24 | "model = MultiTask.load_model('saved_models/SimpleDS_TRAIN__in_mfcc__out_transcripts__nblyrs-head-4-speech-1-accent-1__bnf-256__24-02-2019_23h50m00.pth')\n", 25 | "\n", 26 | "from dataloader import MultiDataset, MultiDataLoader\n", 27 | "import torch\n", 28 | "\n", 29 | "labels = \" 'ABCDEFGHIJKLMNOPQRSTUVWXYZ_\"\n", 30 | "\n", 31 | "\n", 32 | "dataset = MultiDataset('data/splits/dev.csv', labels, \n", 33 | " use_mfcc_in=model._meta['use_mfcc_in'], \n", 34 | " use_ivectors_in=True,#model._meta['use_ivectors_in'], \n", 35 | " use_embeddings_in=True,#model._meta['use_embeddings_in'],\n", 36 | " use_transcripts_out=model._meta['use_transcripts_out'], \n", 37 | " use_accents_out=model._meta['use_accents_out'])\n", 38 | "\n", 39 | "dataloader = MultiDataLoader(dataset, batch_size=20, shuffle=False)\n", 40 | "\n", 41 | "for data in dataloader:\n", 42 | " print(data[0].size())\n", 43 | " break" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 6, 49 | 
"metadata": {}, 50 | "outputs": [ 51 | { 52 | "data": { 53 | "text/plain": [ 54 | "{'use_mfcc_in': True,\n", 55 | " 'use_ivectors_in': False,\n", 56 | " 'use_embeddings_in': False,\n", 57 | " 'use_transcripts_out': True,\n", 58 | " 'use_accents_out': False,\n", 59 | " 'mfcc_size': 40,\n", 60 | " 'ivector_size': 100,\n", 61 | " 'embedding_size': 256,\n", 62 | " 'rnn_type': torch.nn.modules.rnn.GRU,\n", 63 | " 'labels': \"_'ABCDEFGHIJKLMNOPQRSTUVWXYZ \",\n", 64 | " 'accents_dict': {'australia': 0,\n", 65 | " 'canada': 1,\n", 66 | " 'england': 2,\n", 67 | " 'ireland': 3,\n", 68 | " 'scotland': 4,\n", 69 | " 'us': 5,\n", 70 | " 'wales': 6},\n", 71 | " 'rnn_hidden_size': 800,\n", 72 | " 'nb_head_layers': 4,\n", 73 | " 'nb_speech_layers': 1,\n", 74 | " 'nb_accents_layers': 1,\n", 75 | " 'bidirectional': True,\n", 76 | " 'bottleneck_size': 256,\n", 77 | " 'DEBUG': False}" 78 | ] 79 | }, 80 | "execution_count": 6, 81 | "metadata": {}, 82 | "output_type": "execute_result" 83 | } 84 | ], 85 | "source": [] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 70, 90 | "metadata": {}, 91 | "outputs": [ 92 | { 93 | "data": { 94 | "text/plain": [ 95 | "tensor([7])" 96 | ] 97 | }, 98 | "execution_count": 70, 99 | "metadata": {}, 100 | "output_type": "execute_result" 101 | } 102 | ], 103 | "source": [ 104 | "import torch\n", 105 | "sum([torch.tensor([2]), torch.tensor([5])])" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": 65, 111 | "metadata": {}, 112 | "outputs": [], 113 | "source": [ 114 | "t='this is test'\n", 115 | "i = t.find(' ')" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": 66, 121 | "metadata": {}, 122 | "outputs": [ 123 | { 124 | "data": { 125 | "text/plain": [ 126 | "'is test'" 127 | ] 128 | }, 129 | "execution_count": 66, 130 | "metadata": {}, 131 | "output_type": "execute_result" 132 | } 133 | ], 134 | "source": [ 135 | "t[i+1:]" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": 67, 141 | "metadata": {}, 142 | "outputs": [ 143 | { 144 | "data": { 145 | "text/plain": [ 146 | "[{'exp_name_prefix': 'a',\n", 147 | " 'epochs': 2,\n", 148 | " 'labels': \"_'ABCDEFGHIJKLMNOPQRSTUVWXYZ \",\n", 149 | " 'batch_size': 10,\n", 150 | " 'num_workers': 4,\n", 151 | " 'cuda': True,\n", 152 | " 'losses_mix': 0.9,\n", 153 | " 'learning_rate': 0.0003,\n", 154 | " 'mfcc_size': 40,\n", 155 | " 'ivector_size': 100,\n", 156 | " 'embedding_size': 100,\n", 157 | " 'rnn_type': torch.nn.modules.rnn.GRU,\n", 158 | " 'rnn_hidden_size': 800,\n", 159 | " 'nb_head_layers': 3,\n", 160 | " 'nb_speech_layers': 1,\n", 161 | " 'nb_accents_layers': 1,\n", 162 | " 'bidirectional': True,\n", 163 | " 'bottleneck_size': 256,\n", 164 | " 'use_mfcc_in': True,\n", 165 | " 'use_ivectors_in': True,\n", 166 | " 'use_embeddings_in': True,\n", 167 | " 'use_transcripts_out': True,\n", 168 | " 'use_accents_out': False,\n", 169 | " 'decoder_alpha': 0.8,\n", 170 | " 'decoder_beta': 1.0,\n", 171 | " 'decoder_cutoff_top_n': 40,\n", 172 | " 'decoder_cutoff_prob': 1.0,\n", 173 | " 'decoder_beam_width': 100,\n", 174 | " 'lm_path': './data/language_models/cv.lm',\n", 175 | " 'train_manifest': './data/splits/train.csv',\n", 176 | " 'dev_manifest': './data/splits/dev.csv',\n", 177 | " 'test_manifest': './data/splits/test.csv',\n", 178 | " 'tensorboard_path': './tensorboard_runs/',\n", 179 | " 'saved_models_path': './saved_models/'},\n", 180 | " {'exp_name_prefix': 'b',\n", 181 | " 'epochs': 2,\n", 182 | " 'labels': \"_'ABCDEFGHIJKLMNOPQRSTUVWXYZ \",\n", 183 
| " 'batch_size': 10,\n", 184 | " 'num_workers': 4,\n", 185 | " 'cuda': True,\n", 186 | " 'losses_mix': 0.9,\n", 187 | " 'learning_rate': 0.0003,\n", 188 | " 'mfcc_size': 40,\n", 189 | " 'ivector_size': 100,\n", 190 | " 'embedding_size': 100,\n", 191 | " 'rnn_type': torch.nn.modules.rnn.GRU,\n", 192 | " 'rnn_hidden_size': 800,\n", 193 | " 'nb_head_layers': 3,\n", 194 | " 'nb_speech_layers': 1,\n", 195 | " 'nb_accents_layers': 1,\n", 196 | " 'bidirectional': True,\n", 197 | " 'bottleneck_size': 256,\n", 198 | " 'use_mfcc_in': False,\n", 199 | " 'use_ivectors_in': True,\n", 200 | " 'use_embeddings_in': True,\n", 201 | " 'use_transcripts_out': False,\n", 202 | " 'use_accents_out': True,\n", 203 | " 'decoder_alpha': 0.8,\n", 204 | " 'decoder_beta': 1.0,\n", 205 | " 'decoder_cutoff_top_n': 40,\n", 206 | " 'decoder_cutoff_prob': 1.0,\n", 207 | " 'decoder_beam_width': 100,\n", 208 | " 'lm_path': './data/language_models/cv.lm',\n", 209 | " 'train_manifest': './data/splits/train.csv',\n", 210 | " 'dev_manifest': './data/splits/dev.csv',\n", 211 | " 'test_manifest': './data/splits/test.csv',\n", 212 | " 'tensorboard_path': './tensorboard_runs/',\n", 213 | " 'saved_models_path': './saved_models/'},\n", 214 | " {'exp_name_prefix': 'c',\n", 215 | " 'epochs': 2,\n", 216 | " 'labels': \"_'ABCDEFGHIJKLMNOPQRSTUVWXYZ \",\n", 217 | " 'batch_size': 10,\n", 218 | " 'num_workers': 4,\n", 219 | " 'cuda': True,\n", 220 | " 'losses_mix': 0.9,\n", 221 | " 'learning_rate': 0.0003,\n", 222 | " 'mfcc_size': 40,\n", 223 | " 'ivector_size': 100,\n", 224 | " 'embedding_size': 100,\n", 225 | " 'rnn_type': torch.nn.modules.rnn.GRU,\n", 226 | " 'rnn_hidden_size': 800,\n", 227 | " 'nb_head_layers': 3,\n", 228 | " 'nb_speech_layers': 1,\n", 229 | " 'nb_accents_layers': 1,\n", 230 | " 'bidirectional': True,\n", 231 | " 'bottleneck_size': 256,\n", 232 | " 'use_mfcc_in': True,\n", 233 | " 'use_ivectors_in': False,\n", 234 | " 'use_embeddings_in': False,\n", 235 | " 'use_transcripts_out': True,\n", 236 | " 'use_accents_out': True,\n", 237 | " 'decoder_alpha': 0.8,\n", 238 | " 'decoder_beta': 1.0,\n", 239 | " 'decoder_cutoff_top_n': 40,\n", 240 | " 'decoder_cutoff_prob': 1.0,\n", 241 | " 'decoder_beam_width': 100,\n", 242 | " 'lm_path': './data/language_models/cv.lm',\n", 243 | " 'train_manifest': './data/splits/train.csv',\n", 244 | " 'dev_manifest': './data/splits/dev.csv',\n", 245 | " 'test_manifest': './data/splits/test.csv',\n", 246 | " 'tensorboard_path': './tensorboard_runs/',\n", 247 | " 'saved_models_path': './saved_models/'}]" 248 | ] 249 | }, 250 | "execution_count": 67, 251 | "metadata": {}, 252 | "output_type": "execute_result" 253 | } 254 | ], 255 | "source": [ 256 | "conf.patch_config('experiments.cfg')" 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": null, 262 | "metadata": {}, 263 | "outputs": [], 264 | "source": [] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": 4, 269 | "metadata": {}, 270 | "outputs": [], 271 | "source": [ 272 | "from model import MultiTask\n", 273 | "\n", 274 | "model = MultiTask(DEBUG=False, rnn_hidden_size=800, \n", 275 | " use_mfcc_in=conf['use_mfcc_in'], \n", 276 | " use_ivectors_in=conf['use_ivectors_in'], \n", 277 | " use_embeddings_in=conf['use_embeddings_in'],\n", 278 | " use_transcripts_out=conf['use_transcripts_out'], \n", 279 | " use_accents_out=conf['use_accents_out'])" 280 | ] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "execution_count": 24, 285 | "metadata": {}, 286 | "outputs": [ 287 | { 288 | "name": "stdout", 289 | 
"output_type": "stream", 290 | "text": [ 291 | "blib \n", 292 | "\n", 293 | "test\n" 294 | ] 295 | } 296 | ], 297 | "source": [ 298 | "print('blib', '\\n')\n", 299 | "print('test')\n" 300 | ] 301 | }, 302 | { 303 | "cell_type": "code", 304 | "execution_count": 25, 305 | "metadata": {}, 306 | "outputs": [ 307 | { 308 | "data": { 309 | "text/plain": [ 310 | "{'australia': 0, 'canada': 1, 'england': 2, 'us': 3}" 311 | ] 312 | }, 313 | "execution_count": 25, 314 | "metadata": {}, 315 | "output_type": "execute_result" 316 | } 317 | ], 318 | "source": [ 319 | "dataset.accent_dict" 320 | ] 321 | }, 322 | { 323 | "cell_type": "code", 324 | "execution_count": 26, 325 | "metadata": {}, 326 | "outputs": [ 327 | { 328 | "data": { 329 | "text/plain": [ 330 | "[True]" 331 | ] 332 | }, 333 | "execution_count": 26, 334 | "metadata": {}, 335 | "output_type": "execute_result" 336 | } 337 | ], 338 | "source": [ 339 | "conf['use_embeddings_in']" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": 27, 345 | "metadata": {}, 346 | "outputs": [ 347 | { 348 | "data": { 349 | "text/plain": [ 350 | "True" 351 | ] 352 | }, 353 | "execution_count": 27, 354 | "metadata": {}, 355 | "output_type": "execute_result" 356 | } 357 | ], 358 | "source": [ 359 | "model._meta['use_embeddings_in']" 360 | ] 361 | }, 362 | { 363 | "cell_type": "code", 364 | "execution_count": 29, 365 | "metadata": {}, 366 | "outputs": [ 367 | { 368 | "data": { 369 | "application/vnd.jupyter.widget-view+json": { 370 | "model_id": "f3351a1c54734de0b6fe48058fa7e33e", 371 | "version_major": 2, 372 | "version_minor": 0 373 | }, 374 | "text/plain": [ 375 | "HBox(children=(IntProgress(value=0, max=58), HTML(value='')))" 376 | ] 377 | }, 378 | "metadata": {}, 379 | "output_type": "display_data" 380 | }, 381 | { 382 | "name": "stdout", 383 | "output_type": "stream", 384 | "text": [ 385 | "\n" 386 | ] 387 | } 388 | ], 389 | "source": [ 390 | "from tqdm import tqdm_notebook as tqdm\n", 391 | "\n", 392 | "model = model.cuda()\n", 393 | "\n", 394 | "for data in tqdm(dataloader):\n", 395 | " inputs, inputs_lens, transcripts, transcripts_lens, accents = data\n", 396 | "\n", 397 | " \n", 398 | " a, b, c, __ = model(inputs.cuda(), inputs_lens.cuda())" 399 | ] 400 | }, 401 | { 402 | "cell_type": "code", 403 | "execution_count": 10, 404 | "metadata": {}, 405 | "outputs": [], 406 | "source": [ 407 | "MultiTask.serialize(model, 'tmp')\n", 408 | "\n", 409 | "modelb = MultiTask.load_model('tmp')\n" 410 | ] 411 | }, 412 | { 413 | "cell_type": "code", 414 | "execution_count": 12, 415 | "metadata": {}, 416 | "outputs": [], 417 | "source": [ 418 | "modelb = modelb.cuda()" 419 | ] 420 | }, 421 | { 422 | "cell_type": "code", 423 | "execution_count": 13, 424 | "metadata": {}, 425 | "outputs": [ 426 | { 427 | "data": { 428 | "application/vnd.jupyter.widget-view+json": { 429 | "model_id": "8850c82f6c9d4458bc727af18e20630b", 430 | "version_major": 2, 431 | "version_minor": 0 432 | }, 433 | "text/plain": [ 434 | "HBox(children=(IntProgress(value=0, max=571), HTML(value='')))" 435 | ] 436 | }, 437 | "metadata": {}, 438 | "output_type": "display_data" 439 | }, 440 | { 441 | "name": "stdout", 442 | "output_type": "stream", 443 | "text": [ 444 | "\n" 445 | ] 446 | } 447 | ], 448 | "source": [ 449 | "for data in tqdm(dataloader):\n", 450 | " inputs, inputs_lens, transcripts, transcripts_lens, accents = data\n", 451 | "\n", 452 | " \n", 453 | " a, b, c = modelb(inputs.cuda(), inputs_lens.cuda())\n" 454 | ] 455 | }, 456 | { 457 | "cell_type": "code", 458 | 
"execution_count": null, 459 | "metadata": {}, 460 | "outputs": [], 461 | "source": [ 462 | "@classmethod\n", 463 | "def load_model(cls, path):\n", 464 | " package = torch.load(path, map_location=lambda storage, loc: storage)\n", 465 | " model = cls(rnn_hidden_size=package['hidden_size'], nb_layers=package['nb_layers'],\n", 466 | " labels=package['labels'], audio_conf=package['audio_conf'],\n", 467 | " rnn_type=supported_rnns[package['rnn_type']], bidirectional=package.get('bidirectional', True))\n", 468 | " model.load_state_dict(package['state_dict'])\n", 469 | " for x in model.rnns:\n", 470 | " x.flatten_parameters()\n", 471 | " return model\n", 472 | "\n", 473 | "@classmethod\n", 474 | "def load_model_package(cls, package):\n", 475 | " model = cls(rnn_hidden_size=package['hidden_size'], nb_layers=package['nb_layers'],\n", 476 | " labels=package['labels'], audio_conf=package['audio_conf'],\n", 477 | " rnn_type=supported_rnns[package['rnn_type']], bidirectional=package.get('bidirectional', True))\n", 478 | " model.load_state_dict(package['state_dict'])\n", 479 | " return model\n", 480 | "\n", 481 | "@staticmethod\n", 482 | "def serialize(model, optimizer=None, epoch=None, iteration=None, loss_results=None,\n", 483 | " main_loss_results=None, side_loss_results=None,\n", 484 | " cer_results=None, wer_results=None, mca_results=None, avg_loss=None, meta=None):\n", 485 | " model = model.module if DeepSpeech.is_parallel(model) else model\n", 486 | " package = {\n", 487 | " 'version': model._version,\n", 488 | " 'hidden_size': model._hidden_size,\n", 489 | " 'nb_layers': model._nb_layers,\n", 490 | " 'rnn_type': supported_rnns_inv.get(model._rnn_type, model._rnn_type.__name__.lower()),\n", 491 | " 'audio_conf': model._audio_conf,\n", 492 | " 'labels': model._labels,\n", 493 | " 'state_dict': model.state_dict(),\n", 494 | " 'bidirectional': model._bidirectional\n", 495 | " }\n", 496 | " if optimizer is not None:\n", 497 | " package['optim_dict'] = optimizer.state_dict()\n", 498 | " if avg_loss is not None:\n", 499 | " package['avg_loss'] = avg_loss\n", 500 | " if epoch is not None:\n", 501 | " package['epoch'] = epoch + 1 # increment for readability\n", 502 | " if iteration is not None:\n", 503 | " package['iteration'] = iteration\n", 504 | " if loss_results is not None:\n", 505 | " package['loss_results'] = loss_results\n", 506 | " package['main_loss_results'] = main_loss_results\n", 507 | " package['side_loss_results'] = side_loss_results\n", 508 | " package['cer_results'] = cer_results\n", 509 | " package['wer_results'] = wer_results\n", 510 | " package['mca_results'] = mca_results\n", 511 | " if meta is not None:\n", 512 | " package['meta'] = meta\n", 513 | " return package" 514 | ] 515 | } 516 | ], 517 | "metadata": { 518 | "kernelspec": { 519 | "display_name": "Python 3", 520 | "language": "python", 521 | "name": "python3" 522 | }, 523 | "language_info": { 524 | "codemirror_mode": { 525 | "name": "ipython", 526 | "version": 3 527 | }, 528 | "file_extension": ".py", 529 | "mimetype": "text/x-python", 530 | "name": "python", 531 | "nbconvert_exporter": "python", 532 | "pygments_lexer": "ipython3", 533 | "version": "3.6.8" 534 | } 535 | }, 536 | "nbformat": 4, 537 | "nbformat_minor": 2 538 | } 539 | --------------------------------------------------------------------------------