├── README.md ├── utils.py ├── experiments.cfg ├── focalloss.py ├── .gitignore ├── default_config.cfg ├── config.py ├── test_model.py ├── modules.py ├── training.py ├── dataloader.py ├── decoder.py ├── data └── language_models │ ├── lnn_tri.lm │ └── lnn_bi.lm ├── run_experiment.py ├── model.py └── tests.ipynb /README.md: -------------------------------------------------------------------------------- 1 | # AccentedSpeechRecognition 2 | Experiments on speech recognition robustness to accents and dialects. 3 | 4 | Part of the code was borrowed from https://github.com/SeanNaren/deepspeech.pytorch; please follow their README for setup. 5 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import time 4 | 5 | def tile(a, dim, n_tile): 6 | """Expands a tensor along a given dimension by repeating its components.""" 7 | init_dim = a.size(dim) 8 | repeat_idx = [1] * a.dim() 9 | repeat_idx[dim] = n_tile 10 | a = a.repeat(*(repeat_idx)) 11 | order_index = torch.LongTensor(np.concatenate([init_dim * np.arange(n_tile) + i for i in range(init_dim)])) 12 | if a.is_cuda: 13 | order_index = order_index.cuda() 14 | return torch.index_select(a, dim, order_index) 15 | 16 | def now_str(): 17 | return time.strftime("%d-%m-%Y_%Hh%Mm%S") -------------------------------------------------------------------------------- /experiments.cfg: -------------------------------------------------------------------------------- 1 | # List of experiment settings to override default_config.cfg 2 | # Use '#' for comments and '!' to separate experiments 3 | 4 | 5 | 6 | # general 7 | exp_name_prefix 'TestMulti' 8 | 9 | # hyper params 10 | nb_head_layers 4 11 | nb_speech_layers 1 12 | nb_accents_layers 1 13 | 14 | embedding_size 256 15 | 16 | # network config 17 | use_mfcc_in True 18 | use_ivectors_in False 19 | use_embeddings_in False 20 | use_transcripts_out True 21 | use_accents_out True 22 | 23 | !
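# Note: the '!' line above closes the first experiment patch; the lines that follow form a second one.
# A patch only lists the "name value" pairs that differ from default_config.cfg. Values are parsed with
# Python's eval() (see config.py), so strings need quotes and booleans are written True/False.
# Config.patch_config() expands every patch into a full configuration, roughly:
#     for conf in Config().patch_config('./experiments.cfg'):
#         run_one_experiment(conf)   # hypothetical driver loop; run_experiment.py presumably does something similar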
24 | 25 | 26 | # general 27 | exp_name_prefix 'TestMulti' 28 | 29 | # hyper params 30 | nb_head_layers 4 31 | nb_speech_layers 1 32 | nb_accents_layers 1 33 | 34 | embedding_size 256 35 | 36 | # network config 37 | use_mfcc_in True 38 | use_ivectors_in False 39 | use_embeddings_in True 40 | use_transcripts_out True -------------------------------------------------------------------------------- /focalloss.py: -------------------------------------------------------------------------------- 1 | # Code taken from https://github.com/clcarwin/focal_loss_pytorch 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | from torch.autograd import Variable 7 | 8 | class FocalLoss(nn.Module): 9 | def __init__(self, gamma=0, alpha=None, size_average=True): 10 | super(FocalLoss, self).__init__() 11 | self.gamma = gamma 12 | self.alpha = alpha 13 | if isinstance(alpha,(float,int)): self.alpha = torch.Tensor([alpha,1-alpha]) 14 | if isinstance(alpha,list): self.alpha = torch.Tensor(alpha) 15 | self.size_average = size_average 16 | 17 | def forward(self, input, target): 18 | if input.dim()>2: 19 | input = input.view(input.size(0),input.size(1),-1) # N,C,H,W => N,C,H*W 20 | input = input.transpose(1,2) # N,C,H*W => N,H*W,C 21 | input = input.contiguous().view(-1,input.size(2)) # N,H*W,C => N*H*W,C 22 | target = target.view(-1,1) 23 | 24 | logpt = F.log_softmax(input, dim=1) # log-softmax over the class dimension 25 | logpt = logpt.gather(1,target) 26 | logpt = logpt.view(-1) 27 | pt = Variable(logpt.data.exp()) 28 | 29 | if self.alpha is not None: 30 | if self.alpha.type()!=input.data.type(): 31 | self.alpha = self.alpha.type_as(input.data) 32 | at = self.alpha.gather(0,target.data.view(-1)) 33 | logpt = logpt * Variable(at) 34 | 35 | loss = -1 * (1-pt)**self.gamma * logpt 36 | if self.size_average: return loss.mean() 37 | else: return loss.sum() -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | bak/ 3 | log.txt 4 | *.wav 5 | mfccs/ 6 | embeddings*/ 7 | *_dataset/ 8 | txt/ 9 | wav/ 10 | ivectors/ 11 | saved_models/ 12 | tensorboard_runs/ 13 | 14 | # Byte-compiled / optimized / DLL files 15 | __pycache__/ 16 | *.py[cod] 17 | *$py.class 18 | 19 | # C extensions 20 | *.so 21 | 22 | # Distribution / packaging 23 | .Python 24 | build/ 25 | develop-eggs/ 26 | dist/ 27 | downloads/ 28 | eggs/ 29 | .eggs/ 30 | lib/ 31 | lib64/ 32 | parts/ 33 | sdist/ 34 | var/ 35 | wheels/ 36 | *.egg-info/ 37 | .installed.cfg 38 | *.egg 39 | MANIFEST 40 | 41 | # PyInstaller 42 | # Usually these files are written by a python script from a template 43 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
44 | *.manifest 45 | *.spec 46 | 47 | # Installer logs 48 | pip-log.txt 49 | pip-delete-this-directory.txt 50 | 51 | # Unit test / coverage reports 52 | htmlcov/ 53 | .tox/ 54 | .coverage 55 | .coverage.* 56 | .cache 57 | nosetests.xml 58 | coverage.xml 59 | *.cover 60 | .hypothesis/ 61 | .pytest_cache/ 62 | 63 | # Translations 64 | *.mo 65 | *.pot 66 | 67 | # Django stuff: 68 | *.log 69 | local_settings.py 70 | db.sqlite3 71 | 72 | # Flask stuff: 73 | instance/ 74 | .webassets-cache 75 | 76 | # Scrapy stuff: 77 | .scrapy 78 | 79 | # Sphinx documentation 80 | docs/_build/ 81 | 82 | # PyBuilder 83 | target/ 84 | 85 | # Jupyter Notebook 86 | .ipynb_checkpoints 87 | 88 | # pyenv 89 | .python-version 90 | 91 | # celery beat schedule file 92 | celerybeat-schedule 93 | 94 | # SageMath parsed files 95 | *.sage.py 96 | 97 | # Environments 98 | .env 99 | .venv 100 | env/ 101 | venv/ 102 | ENV/ 103 | env.bak/ 104 | venv.bak/ 105 | 106 | # Spyder project settings 107 | .spyderproject 108 | .spyproject 109 | 110 | # Rope project settings 111 | .ropeproject 112 | 113 | # mkdocs documentation 114 | /site 115 | 116 | # mypy 117 | .mypy_cache/ 118 | -------------------------------------------------------------------------------- /default_config.cfg: -------------------------------------------------------------------------------- 1 | # configuration, separate name and values (can be multiple) with 2 | # if multiple values exists for a field, multiple experiments will be run 3 | # (see config.py: Config.create_multi_dict()) 4 | 5 | # general 6 | exp_name_prefix '' 7 | epochs 30 8 | labels "_'ABCDEFGHIJKLMNOPQRSTUVWXYZ " 9 | batch_size 40 10 | num_workers 4 11 | cuda True 12 | 13 | # hyper params 14 | losses_mix 0.9 15 | learning_rate 3e-4 16 | mfcc_size 40 17 | ivector_size 100 18 | embedding_size 100 19 | rnn_type nn.GRU 20 | rnn_hidden_size 800 21 | nb_head_layers 3 22 | nb_speech_layers 1 23 | nb_accents_layers 1 24 | bidirectional True 25 | bottleneck_size 256 26 | accent_loss 'focal' 27 | 28 | # network config 29 | use_mfcc_in True 30 | use_ivectors_in False 31 | use_embeddings_in False 32 | use_transcripts_out True 33 | use_accents_out False 34 | 35 | # decoder 36 | decoder_alpha 0.8 37 | decoder_beta 1. 38 | decoder_cutoff_top_n 40 39 | decoder_cutoff_prob 1. 
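# Note: decoder_alpha and decoder_beta weight the language model score and the word-insertion bonus in the
# beam search, while cutoff_top_n, cutoff_prob and beam_width control how aggressively candidates are pruned.
# They are handed straight to ctcdecode's CTCBeamDecoder (see decoder.py and test_model.py).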
40 | decoder_beam_width 100 41 | 42 | # paths 43 | lm_path './data/language_models/cv.lm' 44 | train_manifest './data/CommonVoice_dataset/splits/train.csv' 45 | dev_manifest './data/CommonVoice_dataset/splits/dev.csv' 46 | test_manifest './data/CommonVoice_dataset/splits/test.csv' 47 | tensorboard_path './tensorboard_runs/' 48 | saved_models_path './saved_models/' 49 | 50 | # tests 51 | testing_manifests [('./data/CommonVoice_dataset/splits/test.csv', './data/language_models/cv.lm'), ('./data/CommonVoice_dataset/splits/dev.csv', './data/language_models/cv.lm'), ('./data/CommonVoice_dataset/splits/testnz.csv', './data/language_models/cv.lm'), ('./data/CommonVoice_dataset/splits/testin.csv', './data/language_models/cv.lm'), ('./data/Logi_dataset/splits/nonnative.csv', './data/language_models/lnn_tri.lm'), ('./data/Logi_dataset/splits/native.csv', './data/language_models/lnn_tri.lm')] 52 | #testing_manifests [('./data/Logi_dataset/splits/nonnative.csv', './data/language_models/lnn_bi.lm'), ('./data/Logi_dataset/splits/native.csv', './data/language_models/lnn_bi.lm')] 53 | 54 | -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import torch.nn as nn 3 | 4 | class Config(collections.MutableMapping): 5 | """A dictionary that applies an arbitrary key-altering 6 | function before accessing the keys""" 7 | 8 | def __init__(self, config_path='./default_config.cfg', sep=' ', *args, **kwargs): 9 | self.store = dict() 10 | self.update(dict(*args, **kwargs)) # use the free update to set keys 11 | 12 | with open(config_path, 'r') as f: 13 | confs = {} 14 | for l in f.readlines(): 15 | if (l[0] is not '#') and (l[0] is not '\n'): # remove comments and empty lines 16 | sep_idx = l.find(sep) 17 | confs[l[:sep_idx]] = eval(l[sep_idx+1:]) 18 | self.update(confs) 19 | 20 | def __getitem__(self, key): 21 | return self.store[self.__keytransform__(key)] 22 | 23 | def __setitem__(self, key, value): 24 | self.store[self.__keytransform__(key)] = value 25 | 26 | def __delitem__(self, key): 27 | del self.store[self.__keytransform__(key)] 28 | 29 | def __iter__(self): 30 | return iter(self.store) 31 | 32 | def __len__(self): 33 | return len(self.store) 34 | 35 | def __keytransform__(self, key): 36 | return key 37 | 38 | def __str__(self): 39 | return self.store.__str__() 40 | 41 | def __repr__(self): 42 | return self.store.__repr__() 43 | 44 | 45 | # def create_multi_dict(self): 46 | # """ Not recomended, please use the patch_config method instead 47 | # """ 48 | # """ Used to create as much configuration needed to run experiments with 49 | # all the possible combinations of values in the conf file.""" 50 | # prev_configs = [{}] 51 | # for key, vals in self.store.items(): 52 | # new_configs = [] 53 | # for v in vals: 54 | # for conf in prev_configs: 55 | # new_conf = {} 56 | # new_conf.update(conf) 57 | # new_configs.append(new_conf) 58 | # new_conf[key] = v 59 | # 60 | # prev_configs = new_configs 61 | # 62 | # return new_configs 63 | 64 | def patch_config(self, patch_path, patch_sep='!', sep=' '): 65 | """Takes a file with config patches separated by a line 66 | starting with the 'patch_sep' argument. 
67 | For each creates a new config based on the default one.""" 68 | 69 | new_configs = [] 70 | 71 | with open(patch_path, 'r') as f: 72 | current = {} 73 | for l in f.readlines(): 74 | if (l[0] is not '#') and (l[0] is not '\n'): 75 | if (l[0] is '!'): 76 | new_configs.append(current) 77 | current = {} 78 | else: 79 | sep_idx = l.find(sep) 80 | current[l[:sep_idx]] = eval(l[sep_idx+1:]) 81 | 82 | # Checks if last patch was added 83 | if len(current) > 0: 84 | new_configs.append(current) 85 | 86 | final_configs = [self.store.copy() for __ in range(len(new_configs))] 87 | [store.update(conf) for conf, store in zip(new_configs, final_configs)] 88 | 89 | return final_configs if len(final_configs) > 0 else self.store 90 | -------------------------------------------------------------------------------- /test_model.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from model import MultiTask 3 | from training import test 4 | from dataloader import MultiDataset, MultiDataLoader 5 | import torch.nn as nn 6 | import torch 7 | from focalloss import FocalLoss 8 | from warpctc_pytorch import CTCLoss 9 | from decoder import GreedyDecoder, BeamCTCDecoder 10 | import sys 11 | import sys 12 | from pathlib import Path 13 | 14 | PRINT_LATEX_TABLE = True 15 | 16 | manual_seed = 666 17 | torch.manual_seed(manual_seed) 18 | torch.cuda.manual_seed_all 19 | print(f'Using torch manual seed {manual_seed}.') 20 | 21 | def eprint(*args, **kwargs): 22 | print(*args, file=sys.stderr, **kwargs) 23 | 24 | 25 | def result_for_manifest(model, criterion, manifest, decoder, target_decoder, batch_size, num_workers): 26 | ### LOADER 27 | test_dataset = MultiDataset(manifest, 28 | model._meta['labels'], 29 | use_mfcc_in=model._meta['use_mfcc_in'], 30 | use_ivectors_in=model._meta['use_ivectors_in'], 31 | use_embeddings_in=model._meta['use_embeddings_in'], 32 | embedding_size=model._meta['embedding_size'], 33 | use_transcripts_out=model._meta['use_transcripts_out'], 34 | use_accents_out=model._meta['use_accents_out']) 35 | 36 | test_loader = MultiDataLoader(test_dataset, 37 | batch_size=batch_size, 38 | shuffle=True, 39 | num_workers=num_workers) 40 | 41 | ### TEST 42 | test_results = test(model, test_loader, criterion, decoder, target_decoder) 43 | test_loss, test_loss_text, test_loss_accent, test_wer, test_accent_acc = test_results 44 | 45 | results_dict = {} 46 | 47 | if test_wer != -1: 48 | results_dict['WER'] = test_wer 49 | if test_accent_acc != -1: 50 | results_dict['Accent accuracy'] = test_accent_acc 51 | 52 | return results_dict 53 | 54 | 55 | def main(model_path, confs): 56 | model, __ = MultiTask.load_model(model_path) 57 | if confs['cuda']: 58 | model = model.cuda() 59 | 60 | 61 | if not model._meta['use_transcripts_out']: # only accent classification 62 | criterion = nn.CrossEntropyLoss() 63 | elif not model._meta['use_accents_out']: # only text recognition 64 | criterion = CTCLoss() 65 | else: # both tasks 66 | criterion = (CTCLoss(), nn.CrossEntropyLoss()) 67 | 68 | 69 | # Results 70 | results = {} 71 | for manifest, lm in confs['testing_manifests']: 72 | eprint(f'\n### Testing {manifest.split("/")[-1]} for model {Path(model_path).stem.split("_")[0]}') 73 | 74 | # Decoder 75 | if model._meta['use_transcripts_out']: 76 | decoder = BeamCTCDecoder(confs['labels'], 77 | lm_path=lm, 78 | alpha=confs['decoder_alpha'], 79 | beta=confs['decoder_beta'], 80 | cutoff_top_n=confs['decoder_cutoff_top_n'], 81 | 
cutoff_prob=confs['decoder_cutoff_prob'], 82 | beam_width=confs['decoder_beam_width'], 83 | num_processes=confs['num_workers']) 84 | 85 | target_decoder = GreedyDecoder(confs['labels']) 86 | else: 87 | decoder, target_decoder = None, None 88 | 89 | # Test 90 | results[manifest.split('/')[-1]] = result_for_manifest(model, criterion, manifest, decoder, target_decoder, confs['batch_size'], confs['num_workers']) 91 | 92 | 93 | if not PRINT_LATEX_TABLE: 94 | print(f'Model: {model_path.split("/")[-1]}') 95 | for name, res in results.items(): 96 | print(f'\nResults for {name}:') 97 | print('; '.join([f'{k}: {v:.3f}' for k, v in res.items()])) 98 | else: 99 | print(' & '.join(['model']+list([k[:-4] for k in results.keys()]))) 100 | val_dict = {} 101 | for k in list(results.values())[0].keys(): 102 | val_dict[k] = [] 103 | for res in results.values(): 104 | [val_dict[k].append(f'{v:.1f}') for k, v in res.items()] 105 | for val in val_dict.values(): 106 | print(' & '.join([Path(model_path).stem.split('_')[0]]+val)+r' \\') 107 | 108 | if __name__ == '__main__': 109 | import config 110 | confs = config.Config() 111 | 112 | args = sys.argv[1:] 113 | 114 | if PRINT_LATEX_TABLE: 115 | eprint('\nLaTeX output selected, change PRINT_LATEX_TABLE in script to False for regular output.') 116 | 117 | for model_path in args: 118 | main(model_path, confs) -------------------------------------------------------------------------------- /modules.py: -------------------------------------------------------------------------------- 1 | import math 2 | from collections import OrderedDict 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | from torch.nn.parameter import Parameter 8 | from torch.autograd import Variable 9 | 10 | supported_rnns = { 11 | 'lstm': nn.LSTM, 12 | 'rnn': nn.RNN, 13 | 'gru': nn.GRU 14 | } 15 | supported_rnns_inv = dict((v, k) for k, v in supported_rnns.items()) 16 | 17 | 18 | class SequenceWise(nn.Module): 19 | def __init__(self, module): 20 | """ 21 | Collapses input of dim T*N*H to (T*N)*H, and applies to a module. 22 | Allows handling of variable sequence lengths and minibatch sizes. 23 | :param module: Module to apply input to. 24 | """ 25 | super(SequenceWise, self).__init__() 26 | self.module = module 27 | 28 | def forward(self, x): 29 | t, n = x.size(0), x.size(1) 30 | x = x.view(t * n, -1) 31 | x = self.module(x) 32 | x = x.view(t, n, -1) 33 | return x 34 | 35 | def __repr__(self): 36 | tmpstr = self.__class__.__name__ + ' (\n' 37 | tmpstr += self.module.__repr__() 38 | tmpstr += ')' 39 | return tmpstr 40 | 41 | 42 | class MaskConv(nn.Module): 43 | def __init__(self, seq_module): 44 | """ 45 | Zeroes out the padded part of the module's output based on the given lengths. This is to ensure that the 46 | results of the model do not change when batch sizes change during inference. 47 | Input needs to be in the shape of (BxCxDxT) 48 | :param seq_module: The sequential module containing the conv stack.
49 | """ 50 | super(MaskConv, self).__init__() 51 | self.seq_module = seq_module 52 | 53 | def forward(self, x, lengths): 54 | """ 55 | :param x: The input of size BxCxDxT 56 | :param lengths: The actual length of each sequence in the batch 57 | :return: Masked output from the module 58 | """ 59 | for module in self.seq_module: 60 | x = module(x) 61 | mask = torch.ByteTensor(x.size()).fill_(0) 62 | if x.is_cuda: 63 | mask = mask.cuda() 64 | for i, length in enumerate(lengths): 65 | length = length.item() 66 | if (mask[i].size(2) - length) > 0: 67 | mask[i].narrow(2, length, mask[i].size(2) - length).fill_(1) 68 | x = x.masked_fill(mask, 0) 69 | return x, lengths 70 | 71 | 72 | class InferenceBatchSoftmax(nn.Module): 73 | def forward(self, input_): 74 | if not self.training: 75 | return F.softmax(input_, dim=-1) 76 | else: 77 | return input_ 78 | 79 | 80 | class BatchRNN(nn.Module): 81 | def __init__(self, input_size, hidden_size, rnn_type=nn.LSTM, bidirectional=False, batch_norm=True): 82 | super(BatchRNN, self).__init__() 83 | self.input_size = input_size 84 | self.hidden_size = hidden_size 85 | self.bidirectional = bidirectional 86 | self.batch_norm = SequenceWise(nn.BatchNorm1d(input_size)) if batch_norm else None 87 | self.rnn = rnn_type(input_size=input_size, hidden_size=hidden_size, 88 | bidirectional=bidirectional, bias=True) 89 | self.num_directions = 2 if bidirectional else 1 90 | 91 | def flatten_parameters(self): 92 | self.rnn.flatten_parameters() 93 | 94 | def forward(self, x, output_lengths): 95 | if self.batch_norm is not None: 96 | x = self.batch_norm(x) 97 | x = nn.utils.rnn.pack_padded_sequence(x, output_lengths) 98 | x, h = self.rnn(x) 99 | x, _ = nn.utils.rnn.pad_packed_sequence(x) 100 | if self.bidirectional: 101 | x = x.view(x.size(0), x.size(1), 2, -1).sum(2).view(x.size(0), x.size(1), -1) # (TxNxH*2) -> (TxNxH) by sum 102 | return x 103 | 104 | 105 | class Lookahead(nn.Module): 106 | # Wang et al 2016 - Lookahead Convolution Layer for Unidirectional Recurrent Neural Networks 107 | # input shape - sequence, batch, feature - TxNxH 108 | # output shape - same as input 109 | def __init__(self, n_features, context): 110 | # should we handle batch_first=True? 111 | super(Lookahead, self).__init__() 112 | self.n_features = n_features 113 | self.weight = Parameter(torch.Tensor(n_features, context + 1)) 114 | assert context > 0 115 | self.context = context 116 | self.register_parameter('bias', None) 117 | self.init_parameters() 118 | 119 | def init_parameters(self): # what's a better way initialiase this layer? 120 | stdv = 1. / math.sqrt(self.weight.size(1)) 121 | self.weight.data.uniform_(-stdv, stdv) 122 | 123 | def forward(self, input): 124 | seq_len = input.size(0) 125 | # pad the 0th dimension (T/sequence) with zeroes whose number = context 126 | # Once pytorch's padding functions have settled, should move to those. 
127 | padding = torch.zeros(self.context, *(input.size()[1:])).type_as(input.data) 128 | x = torch.cat((input, Variable(padding)), 0) 129 | 130 | # add lookahead windows (with context+1 width) as a fourth dimension 131 | # for each seq-batch-feature combination 132 | x = [x[i:i + self.context + 1] for i in range(seq_len)] # TxLxNxH - sequence, context, batch, feature 133 | x = torch.stack(x) 134 | x = x.permute(0, 2, 3, 1) # TxNxHxL - sequence, batch, feature, context 135 | 136 | x = torch.mul(x, self.weight).sum(dim=3) 137 | return x 138 | 139 | def __repr__(self): 140 | return self.__class__.__name__ + '(' \ 141 | + 'n_features=' + str(self.n_features) \ 142 | + ', context=' + str(self.context) + ')' 143 | 144 | -------------------------------------------------------------------------------- /training.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | import torch 3 | import numpy as np 4 | import gc 5 | 6 | def get_mixed_loss(criterion, out_text, out_accent, out_lens, accents, transcripts, transcripts_lens, mix=0.5, corrective_coef=1000): 7 | loss, loss_text, loss_accent = None, None, None 8 | 9 | if out_text is None: 10 | loss_accent = criterion(out_accent, accents) 11 | loss = loss_accent 12 | elif out_accent is None: 13 | loss_text = criterion(out_text, transcripts, out_lens, transcripts_lens) 14 | loss = loss_text 15 | else: 16 | loss_text = criterion[0](out_text, transcripts, out_lens, transcripts_lens) 17 | loss_accent = criterion[1](out_accent, accents) 18 | 19 | if loss_accent.is_cuda: 20 | loss_text = loss_text.cuda() 21 | 22 | loss = mix * loss_text + (1 - mix) * loss_accent * corrective_coef 23 | 24 | return loss, loss_text, loss_accent 25 | 26 | 27 | ### TRAINING 28 | 29 | def train(model, train_loader, criterion, optimizer, losses_mix=0.5): 30 | epoch_losses = [] 31 | epoch_losses_text = [] 32 | epoch_losses_accent = [] 33 | 34 | model.train() 35 | 36 | for data in tqdm(train_loader, total=len(train_loader)): 37 | 38 | inputs, inputs_lens, transcripts, transcripts_lens, accents = data 39 | 40 | if next(model.parameters()).is_cuda: 41 | inputs = inputs.cuda() 42 | inputs_lens = inputs_lens.cuda() 43 | 44 | if accents is not None: 45 | accents = accents.cuda() 46 | 47 | out_text, out_accent, out_lens, __ = model(inputs, inputs_lens) 48 | 49 | loss, loss_text, loss_accent = get_mixed_loss(criterion, out_text, out_accent, 50 | out_lens, accents, transcripts, 51 | transcripts_lens, losses_mix) 52 | 53 | optimizer.zero_grad() 54 | loss.backward() 55 | optimizer.step() 56 | 57 | l = loss.clone().item() if loss is not None else None 58 | lt = loss_text.clone().item() if loss_text is not None else None 59 | la = loss_accent.clone().item() if loss_accent is not None else None 60 | epoch_losses.append(l) 61 | epoch_losses_text.append(lt) 62 | epoch_losses_accent.append(la) 63 | 64 | 65 | average_loss = lambda l: sum(l) / len(train_loader) if l[0] is not None else -1 66 | 67 | epoch_loss_i = average_loss(epoch_losses) 68 | epoch_loss_text_i = average_loss(epoch_losses_text) 69 | epoch_loss_accent_i = average_loss(epoch_losses_accent) 70 | 71 | return epoch_loss_i, epoch_loss_text_i, epoch_loss_accent_i 72 | 73 | 74 | ### TESTING 75 | 76 | def check_wer(transcripts, transcripts_lens, out, out_lens, decoder, target_decoder): 77 | split_transcripts = [] 78 | offset = 0 79 | for size in transcripts_lens: 80 | split_transcripts.append(transcripts[offset:offset + size]) 81 | offset += size 82 | 83 | decoded_output, _ = 
decoder.decode(out.data.transpose(0,1), out_lens) 84 | target_strings = target_decoder.convert_to_strings(split_transcripts) 85 | 86 | #if True: 87 | # print('targets', targets) 88 | # print('split_targets', split_targets) 89 | # print('out', out) 90 | # print('output_len', output_len) 91 | # print('decoded', decoded_output) 92 | # print('target', target_strings) 93 | 94 | wer, cer = 0, 0 95 | for x in range(len(target_strings)): 96 | transcript, reference = decoded_output[x][0], target_strings[x][0] 97 | wer += decoder.wer(transcript, reference) / float(len(reference.split())) 98 | #cer += decoder.cer(transcript, reference) / float(len(reference)) 99 | wer /= len(target_strings) 100 | return wer * 100 101 | 102 | 103 | def check_acc(accents, out): 104 | out_arg = np.argmax(out, axis=1) 105 | diff = torch.eq(out_arg, accents.cpu()) 106 | acc = torch.sum(diff) 107 | return acc.item() / len(accents) * 100 108 | 109 | 110 | def test(model, test_loader, criterion, decoder, target_decoder, losses_mix=0.5): 111 | with torch.no_grad(): 112 | model.eval() 113 | 114 | epoch_losses = [] 115 | epoch_losses_text = [] 116 | epoch_losses_accent = [] 117 | 118 | epoch_wers = [] 119 | epoch_accent_accs = [] 120 | 121 | for data in tqdm(test_loader, total=len(test_loader)): 122 | inputs, inputs_lens, transcripts, transcripts_lens, accents = data 123 | 124 | if next(model.parameters()).is_cuda: 125 | inputs = inputs.cuda() 126 | inputs_lens = inputs_lens.cuda() 127 | 128 | if accents is not None: 129 | accents = accents.cuda() 130 | 131 | out_text, out_accent, out_lens, __ = model(inputs, inputs_lens) 132 | 133 | 134 | if accents is None or len(model._meta['accents_dict']) > max(accents) + 1: # Check if we are testing a model with different accents 135 | loss, loss_text, loss_accent = get_mixed_loss(criterion, out_text, out_accent, 136 | out_lens, accents, transcripts, 137 | transcripts_lens, losses_mix) 138 | else: # in that case we do not care about the loss, section to refactor. 
139 | loss, loss_text, loss_accent = torch.tensor([-1]), torch.tensor([-1]), torch.tensor([-1]) 140 | 141 | if out_text is not None: 142 | wer = check_wer(transcripts, transcripts_lens, 143 | out_text, out_lens, decoder, target_decoder) 144 | else: 145 | wer = None 146 | 147 | if out_accent is not None: 148 | accent_acc = check_acc(accents, out_accent) 149 | else: 150 | accent_acc = None 151 | 152 | l = loss.clone().item() if loss is not None else None 153 | lt = loss_text.clone().item() if loss_text is not None else None 154 | la = loss_accent.clone().item() if loss_accent is not None else None 155 | epoch_losses.append(l) 156 | epoch_losses_text.append(lt) 157 | epoch_losses_accent.append(la) 158 | 159 | epoch_wers.append(wer) 160 | epoch_accent_accs.append(accent_acc) 161 | 162 | 163 | 164 | 165 | average_loss = lambda l: sum(l) / len(test_loader) if l[0] is not None else -1 166 | 167 | epoch_loss = average_loss(epoch_losses) 168 | epoch_loss_text = average_loss(epoch_losses_text) 169 | epoch_loss_accent = average_loss(epoch_losses_accent) 170 | 171 | epoch_wer = average_loss(epoch_wers) 172 | epoch_accent_acc = average_loss(epoch_accent_accs) 173 | 174 | return epoch_loss, epoch_loss_text, epoch_loss_accent, epoch_wer, epoch_accent_acc -------------------------------------------------------------------------------- /dataloader.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import torch 4 | import torch.nn as nn 5 | import numpy as np 6 | from utils import tile 7 | from torch.utils.data import DataLoader, Dataset 8 | 9 | 10 | ### DATASET 11 | 12 | class MultiDataset(Dataset): 13 | """Defines an iterator over the dataset. This class is intended to be used with 14 | the MultiDataLoader class.""" 15 | 16 | def __init__(self, manifest, labels, manifest_separator=',', 17 | use_mfcc_in=True, use_ivectors_in=False, use_embeddings_in=False, 18 | embedding_size=100, use_transcripts_out=True, use_accents_out=False): 19 | """ 20 | Allows to chose what will be trained on, and what are the outputs. 21 | At least on input and one output is needed. 22 | Default configuration is regular MFCCs to text. 23 | 24 | Manifest should be csv type file with following row for each sample: 25 | mfcc_path, ivector_path, embedding_path, transcripts_path, accent_label 26 | (Column can remain empty if not used, but must be present.) 27 | 28 | Scripts to create the database and manifest from audio and text in the scripts folder. 29 | """ 30 | 31 | assert(any([use_mfcc_in, use_ivectors_in, use_embeddings_in])), 'MultiDataset config needs at least one input set to True' 32 | assert(any([use_transcripts_out, use_accents_out])), 'MultiDataset config needs at least one output set to True' 33 | assert(not use_transcripts_out or use_mfcc_in), 'Can’t do speech to text without mfcc.' 
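        # Illustrative manifest row (hypothetical paths), following the five columns described in the
        # docstring above:
        #     ./mfccs/clip_0001.json,./ivectors/clip_0001.json,./embeddings/clip_0001.pt,./txt/clip_0001.txt,us
        # Unused columns may be left empty, but every comma must stay so the row still splits into five fields.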
34 | 35 | super(MultiDataset, self).__init__() 36 | 37 | self.config = {} 38 | self.config['use_mfcc_in']=use_mfcc_in 39 | self.config['use_ivectors_in']=use_ivectors_in 40 | self.config['use_embeddings_in']=use_embeddings_in 41 | self.config['embedding_size']=embedding_size 42 | self.config['use_transcripts_out']=use_transcripts_out 43 | self.config['use_accents_out']=use_accents_out 44 | 45 | self.labels_map = dict([(labels[i], i) for i in range(len(labels))]) 46 | 47 | with open(manifest) as f: 48 | self.samples = [x.strip().split(manifest_separator) for x in f.readlines()] 49 | 50 | self.accent_dict = self.make_accent_dict(self.samples) 51 | 52 | def __getitem__(self, index): 53 | """Unused features are set to None for the Dataloader. Returns torch tensors.""" 54 | mfcc_path, ivector_path, embedding_path, transcript_path, accent_label = self.samples[index] 55 | mfcc, ivector, embedding, parsed_transcript, accent = None, None, None, None, None 56 | 57 | def load_array(path): 58 | with open(path) as f: 59 | array = json.load(f) 60 | return torch.FloatTensor(array) 61 | 62 | # Inputs 63 | if self.config['use_mfcc_in']: 64 | mfcc = load_array(mfcc_path) 65 | 66 | if self.config['use_ivectors_in']: 67 | ivector = load_array(ivector_path) 68 | 69 | if self.config['use_embeddings_in']: 70 | new_embedding_path = [] 71 | for split in embedding_path.split('/'): 72 | new = split if 'embedding' not in split else ''.join([split, '_', str(self.config['embedding_size'])]) 73 | new_embedding_path.append(new) 74 | new_embedding_path = '/'.join(new_embedding_path) 75 | embedding = torch.load(new_embedding_path, map_location=lambda storage, loc: storage) 76 | # map_location and loc are there to load the embedding on the CPU 77 | 78 | # Outputs 79 | if self.config['use_transcripts_out']: 80 | parsed_transcript = self.parse_transcript(transcript_path) 81 | 82 | if self.config['use_accents_out']: 83 | accent = self.accent_dict[accent_label] 84 | accent = torch.LongTensor([accent]) 85 | 86 | return mfcc, ivector, embedding, parsed_transcript, accent 87 | 88 | 89 | def parse_transcript(self, transcript_path): 90 | """Maps a text to integers using the given labels_map.""" 91 | 92 | with open(transcript_path, 'r', encoding='utf8') as transcript_file: 93 | transcript = transcript_file.read().replace('\n', '') 94 | 95 | transcript = list(filter(None, [self.labels_map.get(x) for x in list(transcript)])) 96 | return transcript 97 | 98 | def __len__(self): 99 | return len(self.samples) 100 | 101 | @staticmethod 102 | def make_accent_dict(samples): 103 | acc_set = set() 104 | for __, __, __, __, accent in samples: 105 | acc_set.add(accent) 106 | enum = enumerate(sorted(acc_set)) # sorted set for consistant results 107 | return {acc: i for i, acc in enum} 108 | 109 | 110 | ### DATALOADER 111 | 112 | # Collate function for the MultiDataLoader 113 | def collate_fn(batch): 114 | """This function takes list of samples and assembles a batch. 
115 | It is intended to used in PyTorch DataLoader.""" 116 | 117 | mfccs, ivectors, embeddings, transcripts, accents = list(zip(*batch)) 118 | 119 | def exists(list_): 120 | """Checks if we are not getting a list of None""" 121 | return list_[0] is not None 122 | 123 | ## Lens 124 | if exists(mfccs): 125 | inputs_lens = torch.IntTensor([len(m) for m in mfccs]) 126 | elif exists(ivectors): 127 | inputs_lens = torch.IntTensor([len(i) for i in ivectors]) 128 | else: 129 | inputs_lens = torch.IntTensor([1] * len(batch)) 130 | 131 | # Sorting order (needs to be descending in lens for the padder) 132 | inputs_lens, sorted_idx = inputs_lens.sort(descending=True) 133 | 134 | if exists(transcripts): 135 | transcripts_lens = torch.IntTensor([len(t) for t in transcripts]) 136 | transcripts_lens = transcripts_lens[sorted_idx] 137 | else: 138 | transcripts_lens = None 139 | 140 | ## Inputs 141 | inputs = [] 142 | if exists(mfccs): 143 | inputs.append(nn.utils.rnn.pad_sequence(mfccs, batch_first=True)) 144 | 145 | if exists(ivectors): 146 | ivect = nn.utils.rnn.pad_sequence(ivectors, batch_first=True) 147 | if exists(mfccs): # The ivector resolution is 10 times lower than the mfccs', so we expand them. 148 | ivect = tile(ivect, 1, 10) 149 | ivect = ivect[:, :inputs[0].size(1), :] 150 | inputs.append(ivect) 151 | 152 | if exists(embeddings): 153 | emb = torch.cat(embeddings) 154 | emb = emb.view(emb.size(0), 1, emb.size(1)) 155 | if exists(mfccs) or exists(ivectors): 156 | # tile embeddings to fit either mfccs or ivectors size if they are present 157 | emb = tile(emb, 1, inputs[0].size(1)) 158 | inputs.append(emb) 159 | 160 | inputs = torch.cat(inputs, dim=2) 161 | inputs = inputs[sorted_idx] 162 | 163 | ## Outputs 164 | if exists(transcripts): 165 | if inputs.size(0) == 1: # bugfix for when only one sample 166 | transcripts = [transcripts] 167 | transcripts = np.asarray(transcripts)[sorted_idx] # dtype=object because some transcripts were loaded with wrong type (Int64). TODO fix. 168 | transcripts = torch.IntTensor([t for trs in transcripts for t in trs]) 169 | # we need text targets as one concatenated vector 170 | 171 | if exists(accents): 172 | accents = torch.cat(accents)[sorted_idx] 173 | else: 174 | accents = None 175 | 176 | return inputs, inputs_lens, transcripts, transcripts_lens, accents 177 | 178 | class MultiDataLoader(DataLoader): 179 | def __init__(self, *args, **kwargs): 180 | """ 181 | Creates a data loader for SpeechDatasets. 182 | """ 183 | super(MultiDataLoader, self).__init__(*args, **kwargs) 184 | self.collate_fn = collate_fn -------------------------------------------------------------------------------- /decoder.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # ---------------------------------------------------------------------------- 3 | # Copyright 2015-2016 Nervana Systems Inc. 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # ---------------------------------------------------------------------------- 16 | # Modified to support pytorch Tensors 17 | 18 | import Levenshtein as Lev 19 | import torch 20 | from six.moves import xrange 21 | 22 | 23 | class Decoder(object): 24 | """ 25 | Basic decoder class from which all other decoders inherit. Implements several 26 | helper functions. Subclasses should implement the decode() method. 27 | 28 | Arguments: 29 | labels (string): mapping from integers to characters. 30 | blank_index (int, optional): index for the blank '_' character. Defaults to 0. 31 | space_index (int, optional): index for the space ' ' character. Defaults to 28. 32 | """ 33 | 34 | def __init__(self, labels, blank_index=0): 35 | # e.g. labels = "_'ABCDEFGHIJKLMNOPQRSTUVWXYZ#" 36 | self.labels = labels 37 | self.int_to_char = dict([(i, c) for (i, c) in enumerate(labels)]) 38 | self.blank_index = blank_index 39 | space_index = len(labels) # To prevent errors in decode, we add an out of bounds index for the space 40 | if ' ' in labels: 41 | space_index = labels.index(' ') 42 | self.space_index = space_index 43 | 44 | def wer(self, s1, s2): 45 | """ 46 | Computes the Word Error Rate, defined as the edit distance between the 47 | two provided sentences after tokenizing to words. 48 | Arguments: 49 | s1 (string): space-separated sentence 50 | s2 (string): space-separated sentence 51 | """ 52 | 53 | # build mapping of words to integers 54 | b = set(s1.split() + s2.split()) 55 | word2char = dict(zip(b, range(len(b)))) 56 | 57 | # map the words to a char array (Levenshtein packages only accepts 58 | # strings) 59 | w1 = [chr(word2char[w]) for w in s1.split()] 60 | w2 = [chr(word2char[w]) for w in s2.split()] 61 | 62 | return Lev.distance(''.join(w1), ''.join(w2)) 63 | 64 | def cer(self, s1, s2): 65 | """ 66 | Computes the Character Error Rate, defined as the edit distance. 
67 | 68 | Arguments: 69 | s1 (string): space-separated sentence 70 | s2 (string): space-separated sentence 71 | """ 72 | s1, s2, = s1.replace(' ', ''), s2.replace(' ', '') 73 | return Lev.distance(s1, s2) 74 | 75 | def decode(self, probs, sizes=None): 76 | """ 77 | Given a matrix of character probabilities, returns the decoder's 78 | best guess of the transcription 79 | 80 | Arguments: 81 | probs: Tensor of character probabilities, where probs[c,t] 82 | is the probability of character c at time t 83 | sizes(optional): Size of each sequence in the mini-batch 84 | Returns: 85 | string: sequence of the model's best guess for the transcription 86 | """ 87 | raise NotImplementedError 88 | 89 | 90 | class BeamCTCDecoder(Decoder): 91 | def __init__(self, labels, lm_path=None, alpha=0, beta=0, cutoff_top_n=40, cutoff_prob=1.0, beam_width=100, 92 | num_processes=4, blank_index=0): 93 | super(BeamCTCDecoder, self).__init__(labels) 94 | try: 95 | from ctcdecode import CTCBeamDecoder 96 | except ImportError: 97 | raise ImportError("BeamCTCDecoder requires paddledecoder package.") 98 | 99 | #labels = labels.replace("'", "a") # TODO fix that 100 | self._decoder = CTCBeamDecoder(labels, lm_path, alpha, beta, cutoff_top_n, cutoff_prob, beam_width, 101 | num_processes, blank_index) 102 | 103 | def convert_to_strings(self, out, seq_len): 104 | results = [] 105 | for b, batch in enumerate(out): 106 | utterances = [] 107 | for p, utt in enumerate(batch): 108 | size = seq_len[b][p] 109 | if size > 0: 110 | transcript = ''.join(map(lambda x: self.int_to_char[x.item()], utt[0:size])) 111 | else: 112 | transcript = '' 113 | utterances.append(transcript) 114 | results.append(utterances) 115 | return results 116 | 117 | def convert_tensor(self, offsets, sizes): 118 | results = [] 119 | for b, batch in enumerate(offsets): 120 | utterances = [] 121 | for p, utt in enumerate(batch): 122 | size = sizes[b][p] 123 | if sizes[b][p] > 0: 124 | utterances.append(utt[0:size]) 125 | else: 126 | utterances.append(torch.tensor([], dtype=torch.int)) 127 | results.append(utterances) 128 | return results 129 | 130 | def decode(self, probs, sizes=None): 131 | """ 132 | Decodes probability output using ctcdecode package. 
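        Here probs is expected batch-first (batch x seq_length x num_labels), the same layout that
        GreedyDecoder.decode() documents below; check_wer() in training.py transposes the model's
        time-first output accordingly before calling this method.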
133 | Arguments: 134 | probs: Tensor of character probabilities, where probs[c,t] 135 | is the probability of character c at time t 136 | sizes: Size of each sequence in the mini-batch 137 | Returns: 138 | string: sequences of the model's best guess for the transcription 139 | """ 140 | probs = probs.cpu() 141 | out, scores, offsets, seq_lens = self._decoder.decode(probs, sizes) 142 | strings = self.convert_to_strings(out, seq_lens) 143 | offsets = self.convert_tensor(offsets, seq_lens) 144 | return strings, offsets 145 | 146 | 147 | class GreedyDecoder(Decoder): 148 | def __init__(self, labels, blank_index=0): 149 | super(GreedyDecoder, self).__init__(labels, blank_index) 150 | 151 | def convert_to_strings(self, sequences, sizes=None, remove_repetitions=False, return_offsets=False): 152 | """Given a list of numeric sequences, returns the corresponding strings""" 153 | strings = [] 154 | offsets = [] if return_offsets else None 155 | for x in xrange(len(sequences)): 156 | seq_len = sizes[x] if sizes is not None else len(sequences[x]) 157 | string, string_offsets = self.process_string(sequences[x], seq_len, remove_repetitions) 158 | strings.append([string]) # We only return one path 159 | if return_offsets: 160 | offsets.append([string_offsets]) 161 | if return_offsets: 162 | return strings, offsets 163 | else: 164 | return strings 165 | 166 | def process_string(self, sequence, size, remove_repetitions=False): 167 | string = '' 168 | offsets = [] 169 | for i in range(size): 170 | char = self.int_to_char[sequence[i].item()] 171 | if char != self.int_to_char[self.blank_index]: 172 | # if this char is a repetition and remove_repetitions=true, then skip 173 | if remove_repetitions and i != 0 and char == self.int_to_char[sequence[i - 1].item()]: 174 | pass 175 | elif char == self.labels[self.space_index]: 176 | string += ' ' 177 | offsets.append(i) 178 | else: 179 | string = string + char 180 | offsets.append(i) 181 | return string, torch.tensor(offsets, dtype=torch.int) 182 | 183 | def decode(self, probs, sizes=None): 184 | """ 185 | Returns the argmax decoding given the probability matrix. Removes 186 | repeated elements in the sequence, as well as blanks. 187 | 188 | Arguments: 189 | probs: Tensor of character probabilities from the network. 
Expected shape of batch x seq_length x output_dim 190 | sizes(optional): Size of each sequence in the mini-batch 191 | Returns: 192 | strings: sequences of the model's best guess for the transcription on inputs 193 | offsets: time step per character predicted 194 | """ 195 | _, max_probs = torch.max(probs, 2) 196 | strings, offsets = self.convert_to_strings(max_probs.view(max_probs.size(0), max_probs.size(1)), sizes, 197 | remove_repetitions=True, return_offsets=True) 198 | return strings, offsets 199 | -------------------------------------------------------------------------------- /data/language_models/lnn_tri.lm: -------------------------------------------------------------------------------- 1 | 2 | \data\ 3 | ngram 1=172 4 | ngram 2=289 5 | ngram 3=6 6 | 7 | \1-grams: 8 | -0.8200597 9 | -99 -0.3555717 10 | -2.2266 A -0.04052078 11 | -2.52763 AIR -0.04181526 12 | -2.52763 ALARM -0.03661407 13 | -2.52763 ALEXA 0.02815799 14 | -2.2266 AM -0.03922245 15 | -2.52763 AMAZON -0.04052079 16 | -2.2266 AND -0.02198101 17 | -2.52763 ANECHOIC -0.04181526 18 | -2.52763 AT -0.04181526 19 | -2.52763 BANGALORE 0.02815799 20 | -2.52763 BEFORE -0.03661407 21 | -2.52763 BIEBER 0.02815799 22 | -2.52763 BLACK 0.02815799 23 | -2.52763 BLAKE -0.04181526 24 | -2.52763 BLUE -0.04181526 25 | -2.52763 BOWIE 0.02815799 26 | -1.573387 BY -0.03134982 27 | -2.52763 CALENDAR -0.04052079 28 | -2.52763 CALIBRATE -0.02333186 29 | -2.52763 CALL -0.04181526 30 | -2.52763 CAME -0.04181526 31 | -2.52763 CAMERAS -0.03002374 32 | -2.52763 CARIBBEAN 0.02815799 33 | -2.52763 CHAMBER -0.03661407 34 | -2.52763 CHRIS -0.04181526 35 | -2.52763 CONDITIONING -0.04052079 36 | -2.52763 DAFT -0.04181526 37 | -2.52763 DAVID -0.04181526 38 | -2.52763 DINING -0.03792022 39 | -2.52763 DO -0.03661407 40 | -2.52763 DOOR -0.04181526 41 | -2.2266 DOWN -0.04550482 42 | -2.52763 DRAKE 0.02815799 43 | -2.52763 DRIVE -0.02869358 44 | -2.52763 EIGHT -0.04181526 45 | -2.52763 EPISODE -0.03792022 46 | -2.52763 ESPN 0.02815799 47 | -2.52763 FALL -0.03792022 48 | -2.52763 FIRST -0.04181526 49 | -2.52763 FIVE 0.02815799 50 | -2.52763 FLOOR -0.04181526 51 | -2.52763 FOLLOWING -0.04181526 52 | -1.82866 FOR -0.03267188 53 | -2.52763 FORGET -0.03134983 54 | -2.2266 FROM -0.03922245 55 | -2.52763 FRONT -0.04181526 56 | -2.52763 GARAGE -0.04181526 57 | -2.52763 GENIE 0.02815799 58 | -2.52763 GET -0.04181526 59 | -2.52763 GOING -0.02869358 60 | -2.050509 GOOGLE -0.01648962 61 | -2.52763 HELLO -0.04181526 62 | -2.52763 HERE -0.02869358 63 | -1.82866 HEY -0.03398992 64 | -2.52763 HI -0.03661407 65 | -2.52763 HOME 0.02815799 66 | -2.52763 HOURS -0.04052079 67 | -2.52763 HOW -0.04181526 68 | -1.82866 I -0.06459835 69 | -1.92557 IN -0.03530397 70 | -2.52763 INTENDED -0.02869358 71 | -1.82866 IS -0.04451953 72 | -2.050509 IT -0.02198102 73 | -2.52763 ITUNES 0.02815799 74 | -2.52763 JUSTIN -0.04181526 75 | -2.52763 KIDS 0.02815799 76 | -2.52763 LAMP 0.02815799 77 | -2.52763 LEAVE -0.02869358 78 | -2.52763 LIGHT -0.02869358 79 | -2.52763 LIGHTS 0.02815799 80 | -2.52763 LIKE -0.04052079 81 | -2.050509 LIVING -0.9322866 82 | -2.52763 LOGI 0.02815799 83 | -2.52763 LOGITECH 0.02815799 84 | -2.52763 LOGITECH'S -0.04181526 85 | -2.52763 LONG -0.04181526 86 | -2.52763 LOVE -0.04181526 87 | -2.52763 LUCKY -0.03134983 88 | -2.52763 MAKE -0.03922245 89 | -2.52763 MARS 0.02815799 90 | -2.2266 ME -0.02735934 91 | -2.52763 MEETING 0.02815799 92 | -2.52763 MINUTES 0.02815799 93 | -2.52763 MOM -0.04181526 94 | -2.2266 MUSIC -0.04550482 95 | -2.52763 MUTE 0.02815799 96 | -1.82866 MY 
-0.03267188 97 | -2.2266 NAME -0.03002373 98 | -2.52763 NEED -0.02869358 99 | -2.52763 NEW -0.04181526 100 | -2.52763 NEXT -0.04181526 101 | -2.2266 NOW 0.02967916 102 | -2.52763 NPR -0.03002374 103 | -2.52763 ODB 0.02815799 104 | -1.92557 OF -0.05589744 105 | -2.52763 OK -0.03922245 106 | -1.52763 ON -0.08432172 107 | -2.52763 ONE -0.04181526 108 | -2.52763 ONLY -0.04181526 109 | -2.52763 OPEN 0.02815799 110 | -2.52763 ORANGE -0.03661407 111 | -2.52763 OUT -0.03134983 112 | -2.52763 PANDORA 0.02815799 113 | -2.52763 PART -0.03792022 114 | -2.52763 PAUSE 0.02815799 115 | -2.52763 PHRASES 0.02815799 116 | -2.52763 PICK -0.04052079 117 | -2.52763 PILOTS 0.02815799 118 | -2.52763 PIRATES -0.03792022 119 | -2.52763 PLATTEN 0.02815799 120 | -1.448449 PLAY 0.06955066 121 | -2.52763 PM -0.04181526 122 | -2.52763 PUNK 0.02815799 123 | -2.52763 RACHEL -0.04181526 124 | -2.2266 RECOGNITION -0.02735934 125 | -2.52763 RECORDING -0.04052079 126 | -2.52763 RED 0.02815799 127 | -2.52763 REMINDER -0.03661407 128 | -2.52763 REWIND -0.04052079 129 | -2.52763 RIGHT -0.03922245 130 | -2.52763 RISE -0.04052079 131 | -1.92557 ROOM -0.03792021 132 | -2.52763 SAY -0.02333186 133 | -2.52763 SCRIPT -0.03661407 134 | -2.52763 SEARCH -0.03661407 135 | -2.52763 SECONDS 0.02815799 136 | -1.92557 SET -0.07057546 137 | -2.52763 SHELTON 0.02815799 138 | -2.52763 SIRI 0.02815799 139 | -2.52763 SIXTEEN -0.03134983 140 | -2.52763 SOMETHING -0.03134983 141 | -2.52763 SONOS 0.02815799 142 | -2.52763 SOUNDCLOUD 0.02815799 143 | -2.52763 SPIDERS -0.04052079 144 | -2.52763 SPOTIFY 0.02815799 145 | -2.52763 STAND -0.03134983 146 | -2.52763 STAPELTON 0.02815799 147 | -2.52763 STARDUST -0.04052079 148 | -2.52763 START -0.03661407 149 | -2.52763 STATE -0.04181526 150 | -2.52763 STOP 0.02815799 151 | -2.52763 STRESSED -0.04181526 152 | -2.52763 SUMMER -0.04181526 153 | -2.52763 TAKE -0.04052079 154 | -2.2266 TEN -0.04052078 155 | -2.52763 TESTING -0.04052079 156 | -1.351539 THE -0.02839587 157 | -2.2266 THIS -0.03661406 158 | -2.2266 TIME -0.03530397 159 | -2.52763 TIMER -0.03661407 160 | -1.486237 TO -0.03582891 161 | -2.2266 TODAY -0.04550482 162 | -2.52763 TRAVELER -0.03134983 163 | -1.82866 TURN -0.3301536 164 | -2.52763 TWENTY -0.04181526 165 | -2.52763 TWO -0.04181526 166 | -2.2266 UP 0.05155473 167 | -2.050509 VOICE -0.08691774 168 | -2.050509 VOLUME -0.03661407 169 | -2.52763 WANT -0.02869358 170 | -2.2266 WATCH -0.02198101 171 | -2.52763 WEATHER -0.04181526 172 | -2.2266 WHAT -0.1141836 173 | -2.2266 WHAT'S -0.009630572 174 | -2.52763 WILL -0.03922245 175 | -2.52763 WORK 0.02815799 176 | -2.52763 YOU -0.03134983 177 | -2.52763 YOUR -0.04052079 178 | -2.52763 YOURSELF -0.03134983 179 | -2.52763 ZIGGY -0.04181526 180 | 181 | \2-grams: 182 | -2.732193 CALL 183 | -2.732193 HELLO 184 | -1.0086 HEY 185 | -2.732193 HI 186 | -2.732193 HOW 187 | -2.732193 I 188 | -2.034075 IN 189 | -2.732193 IS 190 | -2.732193 MUTE 191 | -2.732193 OK 192 | -1.10551 ON 193 | -2.732193 PAUSE 194 | -0.6283889 PLAY 195 | -2.732193 REWIND 196 | -2.732193 SEARCH 197 | -1.10551 SET -0.1476935 198 | -2.732193 STOP 199 | -1.0086 TURN -0.06938255 200 | -2.034075 VOLUME 201 | -2.732193 WATCH 202 | -2.034075 WHAT -0.475604 203 | -2.034075 WHAT'S 204 | -1.325652 A REMINDER 205 | -1.325652 A TIMER 206 | -1.024622 AIR CONDITIONING 207 | -1.024622 ALARM FOR 208 | -1.024622 ALEXA 209 | -1.325652 AM NOW 210 | -1.325652 AM RECORDING 211 | -1.024622 AMAZON MUSIC 212 | -1.325652 AND FALL 213 | -1.325652 AND THE 214 | -1.024622 ANECHOIC CHAMBER 215 | -1.024622 AT HOME 216 
| -1.024622 BANGALORE 217 | -1.024622 BEFORE I 218 | -1.024622 BIEBER 219 | -1.024622 BLACK 220 | -1.024622 BLAKE SHELTON 221 | -1.024622 BLUE GENIE 222 | -1.024622 BOWIE 223 | -1.978865 BY BLAKE 224 | -1.978865 BY CHRIS 225 | -1.978865 BY DAFT 226 | -1.978865 BY DRAKE 227 | -1.978865 BY JUSTIN 228 | -1.978865 BY ODB 229 | -1.978865 BY RACHEL 230 | -1.978865 BY TWENTY 231 | -1.978865 BY YOU 232 | -1.024622 CALENDAR TODAY 233 | -1.024622 CALIBRATE THE 234 | -1.024622 CALL MOM 235 | -1.024622 CAME HERE 236 | -1.024622 CAMERAS ON 237 | -1.024622 CARIBBEAN 238 | -1.024622 CHAMBER FOR 239 | -1.024622 CHRIS STAPELTON 240 | -1.024622 CONDITIONING DOWN 241 | -1.024622 DAFT PUNK 242 | -1.024622 DAVID BOWIE 243 | -1.024622 DINING ROOM 244 | -1.024622 DO I 245 | -1.024622 DOOR OPEN 246 | -0.6275349 DOWN 247 | -1.024622 DRAKE 248 | -1.024622 DRIVE TO 249 | -1.024622 EIGHT HOURS 250 | -1.024622 EPISODE OF 251 | -1.024622 ESPN 252 | -1.024622 FALL OF 253 | -1.024622 FIRST MEETING 254 | -1.024622 FIVE 255 | -1.024622 FLOOR LAMP 256 | -1.024622 FOLLOWING PHRASES 257 | -1.723593 FOR EIGHT 258 | -1.723593 FOR PIRATES 259 | -1.723593 FOR TEN 260 | -1.723593 FOR TWO 261 | -1.723593 FOR VOICE 262 | -1.024622 FORGET BY 263 | -1.325652 FROM MARS 264 | -1.325652 FROM NOW 265 | -1.024622 FRONT RIGHT 266 | -1.024622 GARAGE DOOR 267 | -1.024622 GENIE 268 | -1.024622 GET LUCKY 269 | -1.024622 GOING TO 270 | -0.8036261 GOOGLE 271 | -1.501744 GOOGLE MUSIC 272 | -1.024622 HELLO BLUE 273 | -1.024622 HERE TO 274 | -1.723593 HEY ALEXA 275 | -1.723593 HEY GOOGLE 276 | -1.723593 HEY LOGI 277 | -1.723593 HEY LOGITECH 278 | -1.723593 HEY SIRI 279 | -1.024622 HI MY 280 | -1.024622 HOME 281 | -1.024622 HOURS FROM 282 | -1.024622 HOW LONG 283 | -1.025475 I AM 284 | -1.723593 I NEED 285 | -1.723593 I START 286 | -1.723593 I WANT 287 | -1.626683 IN BANGALORE 288 | -1.626683 IN GOOGLE 289 | -1.626683 IN ITUNES 290 | -1.626683 IN LOGITECH'S 291 | -1.024622 INTENDED TO 292 | -1.723593 IS IT 293 | -1.723593 IS ONLY 294 | -1.723593 IS STATE 295 | -1.025475 IS THE 296 | -1.501744 IT IN 297 | -1.501744 IT TAKE 298 | -1.501744 IT TO 299 | -1.024622 ITUNES 300 | -1.024622 JUSTIN BIEBER 301 | -1.024622 KIDS 302 | -1.024622 LAMP 303 | -1.024622 LEAVE TO 304 | -1.024622 LIGHT TO 305 | -1.024622 LIGHTS 306 | -1.024622 LIKE TODAY 307 | -0.05329508 LIVING ROOM 308 | -1.024622 LOGI 309 | -1.024622 LOGITECH 310 | -1.024622 LOGITECH'S ANECHOIC 311 | -1.024622 LONG WILL 312 | -1.024622 LOVE YOURSELF 313 | -1.024622 LUCKY BY 314 | -1.024622 MAKE IT 315 | -1.024622 MARS 316 | -1.325652 ME SOMETHING 317 | -1.325652 ME TO 318 | -1.024622 MEETING 319 | -1.024622 MINUTES 320 | -1.024622 MOM AT 321 | -0.6275349 MUSIC 322 | -1.024622 MUTE 323 | -1.723593 MY CALENDAR 324 | -1.723593 MY DINING 325 | -1.723593 MY FIRST 326 | -1.723593 MY NAME 327 | -1.723593 MY VOICE 328 | -1.325652 NAME I 329 | -1.325652 NAME IS 330 | -1.024622 NEED TO 331 | -1.024622 NEW BLACK 332 | -1.024622 NEXT EPISODE 333 | -1.325652 NOW 334 | -1.325652 NOW GOING 335 | -1.024622 NPR ON 336 | -1.024622 ODB 337 | -1.626683 OF ORANGE 338 | -0.9285648 OF THE 339 | -1.626683 OF ZIGGY 340 | -1.024622 OK GOOGLE 341 | -2.024622 ON 342 | -2.024622 ON AMAZON 343 | -1.326505 ON MY 344 | -2.024622 ON PANDORA 345 | -2.024622 ON SOUNDCLOUD 346 | -2.024622 ON SPOTIFY 347 | -0.5761738 ON THE -0.2839059 348 | -1.024622 ONE PILOTS 349 | -1.024622 ONLY INTENDED 350 | -1.024622 OPEN 351 | -1.024622 ORANGE IS 352 | -1.024622 OUT BY 353 | -1.024622 PANDORA 354 | -1.024622 PART OF 355 | -1.024622 PAUSE 356 | 
-1.024622 PHRASES 357 | -1.024622 PICK UP 358 | -1.024622 PILOTS 359 | -1.024622 PIRATES OF 360 | -1.024622 PLATTEN 361 | -2.103804 PLAY 362 | -2.103804 PLAY CAME 363 | -2.103804 PLAY DAVID 364 | -2.103804 PLAY GET 365 | -2.103804 PLAY LOVE 366 | -2.103804 PLAY ME 367 | -2.103804 PLAY NPR 368 | -2.103804 PLAY STAND 369 | -2.103804 PLAY STRESSED 370 | -2.103804 PLAY SUMMER 371 | -2.103804 PLAY THE 372 | -2.103804 PLAY TRAVELER 373 | -1.024622 PM PICK 374 | -1.024622 PUNK 375 | -1.024622 RACHEL PLATTEN 376 | -1.325652 RECOGNITION TESTING 377 | -1.325652 RECOGNITION TO 378 | -1.024622 RECORDING THIS 379 | -1.024622 RED 380 | -1.024622 REMINDER FOR 381 | -1.024622 REWIND TEN 382 | -1.024622 RIGHT LIVING 383 | -1.024622 RISE AND 384 | -1.626683 ROOM FLOOR 385 | -1.626683 ROOM LIGHT 386 | -1.626683 ROOM LIGHTS 387 | -1.626683 ROOM SONOS 388 | -1.024622 SAY THE 389 | -1.024622 SCRIPT IS 390 | -1.024622 SEARCH FOR 391 | -1.024622 SECONDS 392 | -0.9285648 SET A 393 | -1.626683 SET ALARM 394 | -1.626683 SET VOLUME 395 | -1.024622 SHELTON 396 | -1.024622 SIRI 397 | -1.024622 SIXTEEN BY 398 | -1.024622 SOMETHING BY 399 | -1.024622 SONOS 400 | -1.024622 SOUNDCLOUD 401 | -1.024622 SPIDERS FROM 402 | -1.024622 SPOTIFY 403 | -1.024622 STAND BY 404 | -1.024622 STAPELTON 405 | -1.024622 STARDUST AND 406 | -1.024622 START I 407 | -1.024622 STATE YOUR 408 | -1.024622 STOP 409 | -1.024622 STRESSED OUT 410 | -1.024622 SUMMER SIXTEEN 411 | -1.024622 TAKE ME 412 | -1.325652 TEN MINUTES 413 | -1.325652 TEN SECONDS 414 | -1.024622 TESTING THIS 415 | -2.200714 THE AIR 416 | -2.200714 THE CARIBBEAN 417 | -2.200714 THE FOLLOWING 418 | -2.200714 THE FRONT 419 | -2.200714 THE GARAGE 420 | -2.200714 THE KIDS 421 | -1.502596 THE LIVING 0.3450996 422 | -2.200714 THE NEW 423 | -2.200714 THE NEXT 424 | -2.200714 THE RISE 425 | -2.200714 THE SCRIPT 426 | -2.200714 THE SPIDERS 427 | -2.200714 THE VOICE 428 | -2.200714 THE WEATHER 429 | -1.325652 THIS IN 430 | -1.325652 THIS PART 431 | -1.325652 TIME DO 432 | -1.325652 TIME IS 433 | -1.024622 TIMER FOR 434 | -2.066015 TO CALIBRATE 435 | -2.066015 TO DRIVE 436 | -2.066015 TO FORGET 437 | -2.066015 TO LEAVE 438 | -2.066015 TO MAKE 439 | -1.367898 TO MY 440 | -2.066015 TO RED 441 | -2.066015 TO SAY 442 | -2.066015 TO WATCH 443 | -2.066015 TO WORK 444 | -0.6275349 TODAY 445 | -1.024622 TRAVELER BY 446 | -1.723593 TURN CAMERAS 447 | -0.2751438 TURN ON -0.4681379 448 | -1.723593 TURN THE 449 | -1.024622 TWENTY ONE 450 | -1.024622 TWO PM 451 | -1.325652 UP 452 | -1.325652 UP THE 453 | -1.501744 VOICE BEFORE 454 | -0.8036261 VOICE RECOGNITION 455 | -1.501744 VOLUME DOWN 456 | -1.501744 VOLUME FIVE 457 | -1.501744 VOLUME UP 458 | -1.024622 WANT TO 459 | -1.325652 WATCH ESPN 460 | -1.325652 WATCH THE 461 | -1.024622 WEATHER LIKE 462 | -0.6275349 WHAT TIME 463 | -1.325652 WHAT'S ON 464 | -1.325652 WHAT'S THE 465 | -1.024622 WILL IT 466 | -1.024622 WORK 467 | -1.024622 YOU BY 468 | -1.024622 YOUR NAME 469 | -1.024622 YOURSELF BY 470 | -1.024622 ZIGGY STARDUST 471 | 472 | \3-grams: 473 | -0.1282164 THE LIVING ROOM 474 | -0.1249387 TURN ON THE 475 | -0.4292465 SET A 476 | -0.3043077 ON THE LIVING 477 | -0.2218488 TURN ON 478 | -0.1282164 WHAT TIME 479 | 480 | \end\ 481 | -------------------------------------------------------------------------------- /data/language_models/lnn_bi.lm: -------------------------------------------------------------------------------- 1 | 2 | \data\ 3 | ngram 1=173 4 | ngram 2=289 5 | 6 | \1-grams: 7 | -0.7888443 8 | -99 -0.4177598 9 | -2.546674 10 | -2.462987 A 
-0.05261233 11 | -2.462987 AIR -0.05261233 12 | -2.462987 ALARM -0.05261234 13 | -2.462987 ALEXA -0.05261234 14 | -2.462987 AM -0.05261233 15 | -2.462987 AMAZON -0.05261234 16 | -2.160911 AND -0.05261233 17 | -2.462987 ANECHOIC -0.05261233 18 | -2.462987 AT -0.05261233 19 | -2.462987 BANGALORE -0.05261234 20 | -2.462987 BEFORE -0.05261234 21 | -2.462987 BIEBER -0.05261234 22 | -2.462987 BLACK -0.05261234 23 | -2.462987 BLAKE -0.05261233 24 | -2.462987 BLUE -0.05261233 25 | -2.462987 BOWIE -0.05261234 26 | -1.506887 BY -0.05261233 27 | -2.462987 CALENDAR -0.05261234 28 | -2.462987 CALIBRATE -0.05261233 29 | -2.462987 CALL -0.05261233 30 | -2.462987 CAME -0.05261233 31 | -2.462987 CAMERAS -0.05261234 32 | -2.462987 CARIBBEAN -0.05261234 33 | -2.462987 CHAMBER -0.05261234 34 | -2.462987 CHRIS -0.05261233 35 | -2.462987 CONDITIONING -0.05261234 36 | -2.462987 DAFT -0.05261233 37 | -2.462987 DAVID -0.05261233 38 | -2.462987 DINING -0.05261234 39 | -2.462987 DO -0.05261234 40 | -2.462987 DOOR -0.05261233 41 | -2.160911 DOWN -0.3536423 42 | -2.462987 DRAKE -0.05261234 43 | -2.462987 DRIVE -0.05261233 44 | -2.462987 EIGHT -0.05261233 45 | -2.462987 EPISODE -0.05261233 46 | -2.462987 ESPN -0.05261234 47 | -2.462987 FALL -0.05261233 48 | -2.462987 FIRST -0.05261233 49 | -2.462987 FIVE -0.05261234 50 | -2.462987 FLOOR -0.05261233 51 | -2.462987 FOLLOWING -0.05261233 52 | -1.762345 FOR -0.05261234 53 | -2.462987 FORGET -0.05261234 54 | -2.160911 FROM -0.05261233 55 | -2.462987 FRONT -0.05261233 56 | -2.462987 GARAGE -0.05261233 57 | -2.462987 GENIE -0.05261234 58 | -2.462987 GET -0.05261233 59 | -2.462987 GOING -0.05261233 60 | -1.984472 GOOGLE -0.2287036 61 | -2.462987 HELLO -0.05261233 62 | -2.462987 HERE -0.05261233 63 | -2.462987 HEY -0.05261234 64 | -2.462987 HI -0.05261233 65 | -2.462987 HOME -0.05261234 66 | -2.462987 HOURS -0.05261234 67 | -2.462987 HOW -0.05261233 68 | -1.762345 I -0.1495224 69 | -1.984472 IN -0.05261234 70 | -2.462987 INTENDED -0.05261233 71 | -1.762345 IS -0.1495224 72 | -1.984472 IT -0.05261235 73 | -2.462987 ITUNES -0.05261234 74 | -2.462987 JUSTIN -0.05261233 75 | -2.462987 KIDS -0.05261234 76 | -2.462987 LAMP -0.05261234 77 | -2.462987 LEAVE -0.05261233 78 | -2.462987 LIGHT -0.05261233 79 | -2.462987 LIGHTS -0.05261234 80 | -2.462987 LIKE -0.05261234 81 | -2.160911 LIVING -0.5297336 82 | -2.462987 LOGI -0.05261234 83 | -2.462987 LOGITECH -0.05261234 84 | -2.462987 LOGITECH'S -0.05261233 85 | -2.462987 LONG -0.05261233 86 | -2.462987 LOVE -0.05261233 87 | -2.462987 LUCKY -0.05261234 88 | -2.462987 MAKE -0.05261233 89 | -2.462987 MARS -0.05261234 90 | -2.160911 ME -0.05261233 91 | -2.462987 MEETING -0.05261234 92 | -2.462987 MINUTES -0.05261234 93 | -2.462987 MOM -0.05261233 94 | -2.160911 MUSIC -0.3536423 95 | -2.462987 MUTE -0.05261234 96 | -1.984472 MY -0.05261234 97 | -2.160911 NAME -0.05261233 98 | -2.462987 NEED -0.05261233 99 | -2.462987 NEW -0.05261233 100 | -2.462987 NEXT -0.05261233 101 | -2.160911 NOW -0.05261233 102 | -2.462987 NPR -0.05261234 103 | -2.462987 ODB -0.05261234 104 | -1.859359 OF -0.1775511 105 | -2.462987 OK -0.05261233 106 | -1.762345 ON -0.2075143 107 | -2.462987 ONE -0.05261233 108 | -2.462987 ONLY -0.05261233 109 | -2.462987 OPEN -0.05261234 110 | -2.462987 ORANGE -0.05261234 111 | -2.462987 OUT -0.05261234 112 | -2.462987 PANDORA -0.05261234 113 | -2.462987 PART -0.05261233 114 | -2.462987 PAUSE -0.05261234 115 | -2.462987 PHRASES -0.05261234 116 | -2.462987 PICK -0.05261234 117 | -2.462987 PILOTS -0.05261234 118 | -2.462987 PIRATES 
-0.05261233 119 | -2.462987 PLATTEN -0.05261234 120 | -2.462987 PLAY -0.05261233 121 | -2.462987 PM -0.05261233 122 | -2.462987 PUNK -0.05261234 123 | -2.462987 RACHEL -0.05261233 124 | -2.462987 RECOGNITION -0.05261233 125 | -2.462987 RECORDING -0.05261234 126 | -2.462987 RED -0.05261234 127 | -2.462987 REMINDER -0.05261234 128 | -2.462987 REWIND -0.05261234 129 | -2.462987 RIGHT -0.05261234 130 | -2.462987 RISE -0.05261234 131 | -2.160911 ROOM -0.05261234 132 | -2.462987 SAY -0.05261233 133 | -2.462987 SCRIPT -0.05261234 134 | -2.462987 SEARCH -0.05261234 135 | -2.462987 SECONDS -0.05261234 136 | -2.462987 SET -0.1775511 137 | -2.462987 SHELTON -0.05261234 138 | -2.462987 SIRI -0.05261234 139 | -2.462987 SIXTEEN -0.05261234 140 | -2.462987 SOMETHING -0.05261234 141 | -2.462987 SONOS -0.05261234 142 | -2.462987 SOUNDCLOUD -0.05261234 143 | -2.462987 SPIDERS -0.05261234 144 | -2.462987 SPOTIFY -0.05261234 145 | -2.462987 STAND -0.05261234 146 | -2.462987 STAPELTON -0.05261234 147 | -2.462987 STARDUST -0.05261234 148 | -2.462987 START -0.05261234 149 | -2.462987 STATE -0.05261233 150 | -2.462987 STOP -0.05261234 151 | -2.462987 STRESSED -0.05261233 152 | -2.462987 SUMMER -0.05261233 153 | -2.462987 TAKE -0.05261234 154 | -2.160911 TEN -0.05261233 155 | -2.462987 TESTING -0.05261234 156 | -1.419695 THE -0.08257556 157 | -2.160911 THIS -0.05261233 158 | -2.462987 TIME -0.05261233 159 | -2.462987 TIMER -0.05261234 160 | -1.419695 TO -0.09400501 161 | -2.160911 TODAY -0.3536423 162 | -2.462987 TRAVELER -0.05261234 163 | -2.462987 TURN -0.2744611 164 | -2.462987 TWENTY -0.05261233 165 | -2.462987 TWO -0.05261233 166 | -2.160911 UP -0.05261234 167 | -1.984472 VOICE -0.2287036 168 | -2.160911 VOLUME -0.05261234 169 | -2.462987 WANT -0.05261233 170 | -2.160911 WATCH -0.05261233 171 | -2.462987 WEATHER -0.05261233 172 | -2.462987 WHAT -0.3536424 173 | -2.462987 WHAT'S -0.05261233 174 | -2.462987 WILL -0.05261233 175 | -2.462987 WORK -0.05261234 176 | -2.462987 YOU -0.05261234 177 | -2.462987 YOUR -0.05261234 178 | -2.462987 YOURSELF -0.05261234 179 | -2.462987 ZIGGY -0.05261233 180 | 181 | \2-grams: 182 | -2.449389 CALL 183 | -2.449389 HELLO 184 | -1.086268 HEY 185 | -2.449389 HI 186 | -2.449389 HOW 187 | -2.053426 I 188 | -1.588285 IN 189 | -2.053426 IS 190 | -2.449389 MUTE 191 | -2.449389 OK 192 | -1.169629 ON 193 | -2.449389 PAUSE 194 | -0.6590814 PLAY 195 | -2.449389 REWIND 196 | -2.449389 SEARCH 197 | -1.204978 SET 198 | -2.449389 STOP 199 | -1.086268 TURN 200 | -1.61113 VOLUME 201 | -2.311984 WATCH 202 | -1.635243 WHAT 203 | -1.635243 WHAT'S 204 | -1.221142 A REMINDER 205 | -1.221142 A TIMER 206 | -0.9312775 AIR CONDITIONING 207 | -0.8880444 ALARM FOR 208 | -0.5881212 ALEXA 209 | -1.199537 AM NOW 210 | -1.221142 AM RECORDING 211 | -0.9200591 AMAZON MUSIC 212 | -1.221142 AND FALL 213 | -1.042144 AND THE 214 | -0.9312775 ANECHOIC CHAMBER 215 | -0.9312775 AT HOME 216 | -0.5881212 BANGALORE 217 | -0.8880444 BEFORE I 218 | -0.5881212 BIEBER 219 | -0.5881212 BLACK 220 | -0.9312775 BLAKE SHELTON 221 | -0.9312775 BLUE GENIE 222 | -0.5881212 BOWIE 223 | -1.803332 BY BLAKE 224 | -1.803332 BY CHRIS 225 | -1.803332 BY DAFT 226 | -1.803332 BY DRAKE 227 | -1.803332 BY JUSTIN 228 | -1.803332 BY ODB 229 | -1.803332 BY RACHEL 230 | -1.803332 BY TWENTY 231 | -1.803332 BY YOU 232 | -0.9200591 CALENDAR TODAY 233 | -0.8303289 CALIBRATE THE 234 | -0.9312775 CALL MOM 235 | -0.9312775 CAME HERE 236 | -0.8880444 CAMERAS ON 237 | -0.5881212 CARIBBEAN 238 | -0.8880444 CHAMBER FOR 239 | -0.9312775 CHRIS STAPELTON 240 | 
-0.9200591 CONDITIONING DOWN 241 | -0.9312775 DAFT PUNK 242 | -0.9312775 DAVID BOWIE 243 | -0.9200591 DINING ROOM 244 | -0.8880444 DO I 245 | -0.9312775 DOOR OPEN 246 | -0.2012962 DOWN 247 | -0.5881212 DRAKE 248 | -0.8303289 DRIVE TO 249 | -0.9312775 EIGHT HOURS 250 | -0.898456 EPISODE OF 251 | -0.5881212 ESPN 252 | -0.898456 FALL OF 253 | -0.9312775 FIRST MEETING 254 | -0.5881212 FIVE 255 | -0.9312775 FLOOR LAMP 256 | -0.9312775 FOLLOWING PHRASES 257 | -1.587212 FOR EIGHT 258 | -1.587212 FOR PIRATES 259 | -1.538578 FOR TEN 260 | -1.587212 FOR TWO 261 | -1.494846 FOR VOICE 262 | -0.8487282 FORGET BY 263 | -1.221142 FROM MARS 264 | -1.199537 FROM NOW 265 | -0.9312775 FRONT RIGHT 266 | -0.9312775 GARAGE DOOR 267 | -0.5881212 GENIE 268 | -0.9312775 GET LUCKY 269 | -0.8303289 GOING TO 270 | -0.330307 GOOGLE 271 | -1.375628 GOOGLE MUSIC 272 | -0.9312775 HELLO BLUE 273 | -0.8303289 HERE TO 274 | -1.587212 HEY ALEXA 275 | -1.494846 HEY GOOGLE 276 | -1.587212 HEY LOGI 277 | -1.587212 HEY LOGITECH 278 | -1.587212 HEY SIRI 279 | -0.9091232 HI MY 280 | -0.5881212 HOME 281 | -0.9200591 HOURS FROM 282 | -0.9312775 HOW LONG 283 | -0.6473172 I AM 284 | -1.597578 I NEED 285 | -1.597578 I START 286 | -1.597578 I WANT 287 | -1.500668 IN BANGALORE 288 | -1.423601 IN GOOGLE 289 | -1.500668 IN ITUNES 290 | -1.500668 IN LOGITECH'S 291 | -0.8303289 INTENDED TO 292 | -1.520511 IS IT 293 | -1.597578 IS ONLY 294 | -1.597578 IS STATE 295 | -0.6024376 IS THE 296 | -1.32594 IT IN 297 | -1.386348 IT TAKE 298 | -1.144261 IT TO 299 | -0.5881212 ITUNES 300 | -0.9312775 JUSTIN BIEBER 301 | -0.5881212 KIDS 302 | -0.5881212 LAMP 303 | -0.8303289 LEAVE TO 304 | -0.8303289 LIGHT TO 305 | -0.5881212 LIGHTS 306 | -0.9200591 LIKE TODAY 307 | -0.1507424 LIVING ROOM 308 | -0.5881212 LOGI 309 | -0.5881212 LOGITECH 310 | -0.9312775 LOGITECH'S ANECHOIC 311 | -0.9312775 LONG WILL 312 | -0.9312775 LOVE YOURSELF 313 | -0.8487282 LUCKY BY 314 | -0.9091232 MAKE IT 315 | -0.5881212 MARS 316 | -1.221142 ME SOMETHING 317 | -1.042144 ME TO 318 | -0.5881212 MEETING 319 | -0.5881212 MINUTES 320 | -0.9312775 MOM AT 321 | -0.2012962 MUSIC 322 | -0.5881212 MUTE 323 | -1.587212 MY CALENDAR 324 | -1.587212 MY DINING 325 | -1.587212 MY FIRST 326 | -1.538578 MY NAME 327 | -1.494846 MY VOICE 328 | -1.140505 NAME I 329 | -1.140505 NAME IS 330 | -0.8303289 NEED TO 331 | -0.9312775 NEW BLACK 332 | -0.9312775 NEXT EPISODE 333 | -0.6965729 NOW 334 | -1.221142 NOW GOING 335 | -0.8880444 NPR ON 336 | -0.5881212 ODB 337 | -1.511287 OF ORANGE 338 | -0.5174091 OF THE 339 | -1.511287 OF ZIGGY 340 | -0.9091232 OK GOOGLE 341 | -0.9498084 ON 342 | -1.868225 ON AMAZON 343 | -0.9287202 ON MY 344 | -1.868225 ON PANDORA 345 | -1.868225 ON SOUNDCLOUD 346 | -1.868225 ON SPOTIFY 347 | -0.6289269 ON THE 348 | -0.9312775 ONE PILOTS 349 | -0.9312775 ONLY INTENDED 350 | -0.5881212 OPEN 351 | -0.8880444 ORANGE IS 352 | -0.8487282 OUT BY 353 | -0.5881212 PANDORA 354 | -0.898456 PART OF 355 | -0.5881212 PAUSE 356 | -0.5881212 PHRASES 357 | -0.9200591 PICK UP 358 | -0.5881212 PILOTS 359 | -0.898456 PIRATES OF 360 | -0.5881212 PLATTEN 361 | -0.8136998 PLAY 362 | -1.901061 PLAY CAME 363 | -1.901061 PLAY DAVID 364 | -1.901061 PLAY GET 365 | -1.901061 PLAY LOVE 366 | -1.806209 PLAY ME 367 | -1.901061 PLAY NPR 368 | -1.901061 PLAY STAND 369 | -1.901061 PLAY STRESSED 370 | -1.901061 PLAY SUMMER 371 | -1.364388 PLAY THE 372 | -1.901061 PLAY TRAVELER 373 | -0.9312775 PM PICK 374 | -0.5881212 PUNK 375 | -0.9312775 RACHEL PLATTEN 376 | -1.221142 RECOGNITION TESTING 377 | -1.042144 
RECOGNITION TO 378 | -0.9200591 RECORDING THIS 379 | -0.5881212 RED 380 | -0.8880444 REMINDER FOR 381 | -0.9200591 REWIND TEN 382 | -0.9200591 RIGHT LIVING 383 | -0.9200591 RISE AND 384 | -1.500668 ROOM FLOOR 385 | -1.500668 ROOM LIGHT 386 | -1.500668 ROOM LIGHTS 387 | -1.500668 ROOM SONOS 388 | -0.8303289 SAY THE 389 | -0.8880444 SCRIPT IS 390 | -0.8880444 SEARCH FOR 391 | -0.5881212 SECONDS 392 | -0.5515851 SET A 393 | -1.511287 SET ALARM 394 | -1.480033 SET VOLUME 395 | -0.5881212 SHELTON 396 | -0.5881212 SIRI 397 | -0.8487282 SIXTEEN BY 398 | -0.8487282 SOMETHING BY 399 | -0.5881212 SONOS 400 | -0.5881212 SOUNDCLOUD 401 | -0.9200591 SPIDERS FROM 402 | -0.5881212 SPOTIFY 403 | -0.8487282 STAND BY 404 | -0.5881212 STAPELTON 405 | -0.9200591 STARDUST AND 406 | -0.8880444 START I 407 | -0.9312775 STATE YOUR 408 | -0.5881212 STOP 409 | -0.9312775 STRESSED OUT 410 | -0.9312775 SUMMER SIXTEEN 411 | -0.9200591 TAKE ME 412 | -1.221142 TEN MINUTES 413 | -1.221142 TEN SECONDS 414 | -0.9200591 TESTING THIS 415 | -1.980735 THE AIR 416 | -1.980735 THE CARIBBEAN 417 | -1.980735 THE FOLLOWING 418 | -1.980735 THE FRONT 419 | -1.980735 THE GARAGE 420 | -1.980735 THE KIDS 421 | -1.097011 THE LIVING 422 | -1.980735 THE NEW 423 | -1.980735 THE NEXT 424 | -1.980735 THE RISE 425 | -1.980735 THE SCRIPT 426 | -1.980735 THE SPIDERS 427 | -1.791137 THE VOICE 428 | -1.980735 THE WEATHER 429 | -1.178955 THIS IN 430 | -1.221142 THIS PART 431 | -1.221142 TIME DO 432 | -1.140505 TIME IS 433 | -0.8880444 TIMER FOR 434 | -1.881221 TO CALIBRATE 435 | -1.881221 TO DRIVE 436 | -1.881221 TO FORGET 437 | -1.881221 TO LEAVE 438 | -1.881221 TO MAKE 439 | -0.960078 TO MY 440 | -1.881221 TO RED 441 | -1.881221 TO SAY 442 | -1.797722 TO WATCH 443 | -1.881221 TO WORK 444 | -0.2012962 TODAY 445 | -0.8487282 TRAVELER BY 446 | -1.608197 TURN CAMERAS 447 | -0.36451 TURN ON 448 | -1.36611 TURN THE 449 | -0.9312775 TWENTY ONE 450 | -0.9312775 TWO PM 451 | -0.6965729 UP 452 | -1.042144 UP THE 453 | -1.397233 VOICE BEFORE 454 | -0.4278275 VOICE RECOGNITION 455 | -1.355094 VOLUME DOWN 456 | -1.386348 VOLUME FIVE 457 | -1.355094 VOLUME UP 458 | -0.8303289 WANT TO 459 | -1.221142 WATCH ESPN 460 | -1.042144 WATCH THE 461 | -0.9312775 WEATHER LIKE 462 | -0.2529206 WHAT TIME 463 | -1.140505 WHAT'S ON 464 | -1.042144 WHAT'S THE 465 | -0.9091232 WILL IT 466 | -0.5881212 WORK 467 | -0.8487282 YOU BY 468 | -0.9200591 YOUR NAME 469 | -0.8487282 YOURSELF BY 470 | -0.9312775 ZIGGY STARDUST 471 | 472 | \end\ 473 | -------------------------------------------------------------------------------- /run_experiment.py: -------------------------------------------------------------------------------- 1 | from dataloader import MultiDataset, MultiDataLoader 2 | from focalloss import FocalLoss 3 | from warpctc_pytorch import CTCLoss 4 | from model import MultiTask 5 | from decoder import GreedyDecoder, BeamCTCDecoder 6 | from training import train, test 7 | import torch 8 | import torch.nn as nn 9 | from os import makedirs 10 | from tensorboardX import SummaryWriter 11 | from pathlib import Path 12 | import math 13 | from utils import now_str 14 | import gc 15 | 16 | manual_seed = 1337 17 | torch.manual_seed(manual_seed) 18 | torch.cuda.manual_seed_all 19 | print(f'Using torch manual seed {manual_seed}.') 20 | 21 | ### Start timer 22 | min_ = 0 23 | if min_ > 0: 24 | print(f'WARNING TIMER {min_} min') 25 | import time ; from tqdm import tqdm 26 | for __ in tqdm(range(min_)): 27 | time.sleep(60) 28 | ### 29 | 30 | 31 | def run_experiment(_exp_name, 32 | _epochs, 
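                   # Descriptive note (not in the original source): the _use_*_in flags select
                   # which input features (MFCCs, i-vectors, speaker embeddings) are concatenated
                   # and fed to the shared head, while the _use_*_out flags select which task
                   # branches (CTC transcription, accent classification) are built and trained;
                   # _losses_mix balances the two losses when both outputs are enabled.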
33 | _train_manifest, 34 | _test_manifest, 35 | _labels, 36 | _use_mfcc_in, 37 | _use_ivectors_in, 38 | _use_embeddings_in, 39 | _use_transcripts_out, 40 | _use_accents_out, 41 | _batch_size, 42 | _num_workers, 43 | _mfcc_size, 44 | _ivector_size, 45 | _embedding_size, 46 | _rnn_type, 47 | _rnn_hidden_size, 48 | _nb_head_layers, 49 | _nb_speech_layers, 50 | _nb_accents_layers, 51 | _bidirectional, 52 | _losses_mix, 53 | _learning_rate, 54 | _lm_path, 55 | _decoder_alpha, 56 | _decoder_beta, 57 | _decoder_cutoff_top_n, 58 | _decoder_beam_width, 59 | _cuda, 60 | _tensorboard_path, 61 | _saved_models_path, 62 | _bottleneck_size, 63 | _accent_loss): 64 | 65 | print(f'\n##### Running experiment {_exp_name} #####') 66 | 67 | # Tools to log values 68 | results_dict = {} 69 | results_dict['train_loss'] = [] 70 | results_dict['train_loss_text'] = [] 71 | results_dict['train_loss_accent'] = [] 72 | results_dict['test_loss'] = [] 73 | results_dict['test_loss_text'] = [] 74 | results_dict['test_loss_accent'] = [] 75 | results_dict['test_wer'] = [] 76 | results_dict['test_accent_acc'] = [] 77 | 78 | tb_path = Path(_tensorboard_path) / _exp_name 79 | makedirs(tb_path, exist_ok=True) 80 | tb_writer = SummaryWriter(tb_path) 81 | 82 | ### DATA LOADING 83 | 84 | # Training set 85 | train_dataset = MultiDataset(_train_manifest, 86 | _labels, 87 | use_mfcc_in=_use_mfcc_in, 88 | use_ivectors_in=_use_ivectors_in, 89 | use_embeddings_in=_use_embeddings_in, 90 | embedding_size=_embedding_size, 91 | use_transcripts_out=_use_transcripts_out, 92 | use_accents_out=_use_accents_out) 93 | 94 | train_loader = MultiDataLoader(train_dataset, 95 | batch_size=_batch_size, 96 | shuffle=True, 97 | num_workers=_num_workers) 98 | 99 | # Testing set 100 | test_dataset = MultiDataset(_test_manifest, 101 | _labels, 102 | use_mfcc_in=_use_mfcc_in, 103 | use_ivectors_in=_use_ivectors_in, 104 | use_embeddings_in=_use_embeddings_in, 105 | embedding_size=_embedding_size, 106 | use_transcripts_out=_use_transcripts_out, 107 | use_accents_out=_use_accents_out) 108 | 109 | test_loader = MultiDataLoader(test_dataset, 110 | batch_size=_batch_size, 111 | shuffle=True, 112 | num_workers=_num_workers) 113 | 114 | 115 | ### CREATE MODEL 116 | 117 | model = MultiTask(use_mfcc_in = _use_mfcc_in, 118 | use_ivectors_in = _use_ivectors_in, 119 | use_embeddings_in = _use_embeddings_in, 120 | use_transcripts_out = _use_transcripts_out, 121 | use_accents_out = _use_accents_out, 122 | mfcc_size = _mfcc_size, 123 | ivector_size = _ivector_size, 124 | embedding_size = _embedding_size, 125 | rnn_type = _rnn_type, 126 | labels = _labels, 127 | accents_dict = train_dataset.accent_dict, 128 | rnn_hidden_size = _rnn_hidden_size, 129 | nb_head_layers = _nb_head_layers, 130 | nb_speech_layers = _nb_speech_layers, 131 | nb_accents_layers = _nb_accents_layers, 132 | bidirectional = _bidirectional, 133 | bottleneck_size = _bottleneck_size, 134 | DEBUG=False) 135 | if _cuda: 136 | model = model.cuda() 137 | 138 | print(model, '\n') 139 | print('Model parameters counts:', MultiTask.get_param_size(model), '\n') 140 | 141 | ### OPTIMIZER, CRITERION, DECODER 142 | 143 | # Optimizer 144 | optimizer = torch.optim.Adam(model.parameters(), lr=_learning_rate) 145 | 146 | # Criterion 147 | if _use_accents_out: 148 | if _accent_loss == 'focal': 149 | AccLoss = FocalLoss() 150 | elif _accent_loss == 'CE': 151 | AccLoss = nn.CrossEntropyLoss() 152 | else: 153 | raise ValueError(f'Loss {_accent_loss} for accent_loss is unknown. 
Please use either "focal" or "CE".') 154 | 155 | if not _use_transcripts_out: # only accent classification 156 | criterion = AccLoss 157 | elif not _use_accents_out: # only text recognition 158 | criterion = CTCLoss() 159 | else: # both tasks 160 | criterion = (CTCLoss(), FocalLoss()) 161 | 162 | # Decoder 163 | if _use_transcripts_out: 164 | decoder = BeamCTCDecoder(_labels, 165 | lm_path=_lm_path, 166 | alpha=_decoder_alpha, 167 | beta=_decoder_beta, 168 | cutoff_top_n=_decoder_cutoff_top_n, 169 | cutoff_prob=_decoder_cutoff_top_n, 170 | beam_width=_decoder_beam_width, 171 | num_processes=_num_workers) 172 | 173 | target_decoder = GreedyDecoder(_labels) 174 | else: 175 | decoder, target_decoder = None, None 176 | 177 | 178 | ### EPOCHS 179 | best_wer = math.inf 180 | best_acc = 0 181 | 182 | for epoch in range(1, _epochs + 1): 183 | ### TRAIN 184 | print(f'Epoch {epoch} training: {exp_name}') 185 | train_results = train(model, train_loader, criterion, optimizer, losses_mix=_losses_mix) 186 | train_loss, train_loss_text, train_loss_accent = train_results 187 | 188 | results_dict['train_loss'].append(train_loss) 189 | results_dict['train_loss_text'].append(train_loss_text) 190 | results_dict['train_loss_accent'].append(train_loss_accent) 191 | print(f'Epoch {epoch} training loss: {train_loss}') 192 | 193 | ### TEST 194 | print(f'Epoch {epoch} testing') 195 | test_results = test(model, test_loader, criterion, decoder, target_decoder, losses_mix=_losses_mix) 196 | test_loss, test_loss_text, test_loss_accent, test_wer, test_accent_acc = test_results 197 | 198 | results_dict['test_loss'].append(test_loss) 199 | results_dict['test_loss_text'].append(test_loss_text) 200 | results_dict['test_loss_accent'].append(test_loss_accent) 201 | results_dict['test_wer'].append(test_wer) 202 | results_dict['test_accent_acc'].append(test_accent_acc) 203 | print(f'Epoch {epoch} testing loss: {test_loss}') 204 | 205 | # Add values to tensorboard 206 | for key, results in results_dict.items(): 207 | tb_writer.add_scalar(key, results[-1], epoch) 208 | 209 | #Save model if it is best 210 | save_new=False 211 | if _use_transcripts_out: 212 | if test_wer < best_wer: 213 | save_new = True 214 | best_wer = test_wer 215 | else: 216 | if test_accent_acc > best_acc: 217 | save_new = True 218 | best_acc = test_accent_acc 219 | 220 | if save_new: 221 | MultiTask.serialize(model, 222 | Path(_saved_models_path) / _exp_name, 223 | save=True, 224 | exp_name=_exp_name, 225 | optimizer=optimizer, 226 | epoch=epoch, 227 | train_losses=results_dict['train_loss'], 228 | test_losses=results_dict['test_loss'], 229 | text_train_losses=results_dict['train_loss_text'], 230 | text_test_losses=results_dict['test_loss_text'], 231 | text_wers=results_dict['test_wer'], 232 | accent_train_losses=results_dict['train_loss_accent'], 233 | accent_test_losses=results_dict['test_loss_accent'], 234 | accent_accuracies=results_dict['test_accent_acc']) 235 | 236 | del model 237 | gc.collect() 238 | torch.cuda.empty_cache() 239 | ## end of run_experiment ## 240 | 241 | 242 | ### MAIN 243 | 244 | if __name__ == '__main__': 245 | import argparse 246 | 247 | parser = argparse.ArgumentParser(description='DeepSpeech model information') 248 | parser.add_argument('--train', action='store_true', help='Uses the train set instead of the dev set.') 249 | parser.add_argument('--epochs', default=None, type=int, help='Number of training epochs') 250 | parser.add_argument('--patch_path', default='experiments.cfg', type=str, help='Path to experiment list') 251 | args 
= parser.parse_args() 252 | 253 | DEV = not args.train 254 | PATCH_PATH = args.patch_path 255 | EPOCHS = args.epochs 256 | 257 | import config 258 | confs = config.Config() 259 | 260 | for conf in confs.patch_config(PATCH_PATH): 261 | exp_name = conf['exp_name_prefix'] 262 | exp_name += '_DEV' if DEV else '_TRAIN' 263 | exp_name += '__in' 264 | exp_name += '_mfcc' if conf['use_mfcc_in'] else '' 265 | exp_name += '_ivect' if conf['use_ivectors_in'] else '' 266 | exp_name += '_emb' if conf['use_embeddings_in'] else '' 267 | exp_name += '__out' 268 | exp_name += '_transcripts' if conf['use_transcripts_out'] else '' 269 | exp_name += f'_accents-mix{conf["losses_mix"]}-{conf["accent_loss"]}' if conf['use_accents_out'] else '' 270 | exp_name += f'__nblyrs-head-{conf["nb_head_layers"]}' 271 | exp_name += f'-speech-{conf["nb_speech_layers"]}' 272 | exp_name += f'-accent-{conf["nb_accents_layers"]}' 273 | exp_name += f'__bnf-{conf["bottleneck_size"]}' 274 | exp_name += f'__{now_str()}' 275 | 276 | train_manifest = conf['dev_manifest'] if DEV else conf['train_manifest'] 277 | epochs = EPOCHS if EPOCHS is not None else conf['epochs'] 278 | 279 | try: 280 | run_experiment(_exp_name = exp_name, 281 | _epochs = epochs, 282 | _train_manifest = train_manifest, 283 | _test_manifest = conf['test_manifest'], 284 | _labels = conf['labels'], 285 | _use_mfcc_in = conf['use_mfcc_in'], 286 | _use_ivectors_in = conf['use_ivectors_in'], 287 | _use_embeddings_in = conf['use_embeddings_in'], 288 | _use_transcripts_out = conf['use_transcripts_out'], 289 | _use_accents_out = conf['use_accents_out'], 290 | _batch_size = conf['batch_size'], 291 | _num_workers = conf['num_workers'], 292 | _mfcc_size = conf['mfcc_size'], 293 | _ivector_size = conf['ivector_size'], 294 | _embedding_size = conf['embedding_size'], 295 | _rnn_type = conf['rnn_type'], 296 | _rnn_hidden_size = conf['rnn_hidden_size'], 297 | _nb_head_layers = conf['nb_head_layers'], 298 | _nb_speech_layers = conf['nb_speech_layers'], 299 | _nb_accents_layers = conf['nb_accents_layers'], 300 | _bidirectional = conf['bidirectional'], 301 | _losses_mix = conf['losses_mix'], 302 | _learning_rate = conf['learning_rate'], 303 | _lm_path = conf['lm_path'], 304 | _decoder_alpha = conf['decoder_alpha'], 305 | _decoder_beta = conf['decoder_beta'], 306 | _decoder_cutoff_top_n = conf['decoder_cutoff_top_n'], 307 | _decoder_beam_width = conf['decoder_beam_width'], 308 | _cuda = conf['cuda'], 309 | _tensorboard_path = conf['tensorboard_path'], 310 | _saved_models_path = conf['saved_models_path'], 311 | _bottleneck_size = conf['bottleneck_size'], 312 | _accent_loss = conf['accent_loss']) 313 | 314 | except Exception as e: 315 | print(f'Error occured in run {exp_name}:', e) -------------------------------------------------------------------------------- /model.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import torch.nn as nn 4 | from collections import OrderedDict 5 | from modules import MaskConv, BatchRNN, InferenceBatchSoftmax, SequenceWise 6 | 7 | 8 | def rnn_block(rnn_input_size, rnn_hidden_size, rnn_type, bidirectional, nb_layers): 9 | """Creates a stack of Batch RNNs with different input_size than hidden_size.""" 10 | rnns = [] 11 | rnn = BatchRNN(input_size=rnn_input_size, hidden_size=rnn_hidden_size, rnn_type=rnn_type, 12 | bidirectional=bidirectional, batch_norm=False) 13 | rnns.append(('0', rnn)) 14 | for x in range(nb_layers - 1): 15 | rnn = BatchRNN(input_size=rnn_hidden_size, 
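                       # note: layers after the first map hidden_size -> hidden_size and keep
                       # batch norm enabled, unlike the first layer built above, which adapts
                       # the raw feature width and is created with batch_norm=False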
hidden_size=rnn_hidden_size, rnn_type=rnn_type, 16 | bidirectional=bidirectional) 17 | rnns.append(('%d' % (x + 1), rnn)) 18 | return nn.Sequential(OrderedDict(rnns)) 19 | 20 | 21 | class Head(nn.Module): 22 | """Shared part of the neural network.""" 23 | def __init__(self, 24 | rnn_type, 25 | rnn_hidden_size, 26 | nb_layers, 27 | bidirectional, 28 | feature_len, 29 | DEBUG): 30 | 31 | super(Head, self).__init__() 32 | 33 | self._DEBUG = DEBUG 34 | 35 | # CONV 36 | self.conv = MaskConv(nn.Sequential( 37 | nn.Conv2d(1, 32, kernel_size=(41, 11), stride=(2, 2), padding=(20, 5)), 38 | nn.BatchNorm2d(32), 39 | nn.Hardtanh(0, 20, inplace=True), 40 | nn.Conv2d(32, 32, kernel_size=(21, 11), stride=(2, 1), padding=(10, 5)), 41 | nn.BatchNorm2d(32), 42 | nn.Hardtanh(0, 20, inplace=True) 43 | )) 44 | 45 | # RNN 46 | rnn_input_size = feature_len * 8 47 | 48 | self.rnns = rnn_block(rnn_input_size, rnn_hidden_size, rnn_type, bidirectional, nb_layers) 49 | 50 | 51 | def forward(self, x, lengths): 52 | if self._DEBUG: 53 | print('') 54 | print('# BEGIN HEAD #') 55 | print('input', x.size()) 56 | 57 | lengths = lengths.cpu().int() 58 | output_lengths = self.get_seq_lens(lengths) 59 | 60 | x = x.view(x.size(0), 1, x.size(1), x.size(2)) 61 | x = x.transpose(2, 3) 62 | if self._DEBUG: 63 | print('after view transpose', x.size()) 64 | 65 | x, _ = self.conv(x, output_lengths) 66 | if self._DEBUG: 67 | print('after conv', x.size()) 68 | 69 | sizes = x.size() 70 | x = x.view(sizes[0], sizes[1] * sizes[2], sizes[3]) # Collapse feature dimension 71 | x = x.transpose(1, 2).transpose(0, 1).contiguous() # TxNxH 72 | if self._DEBUG: 73 | print('after view transpose', x.size()) 74 | 75 | for rnn in self.rnns: 76 | x = rnn(x, output_lengths) 77 | if self._DEBUG: 78 | print('after rnn', x.size()) 79 | 80 | self._DEBUG = False 81 | return x, output_lengths 82 | 83 | def get_seq_lens(self, input_length): 84 | """ 85 | Given a 1D Tensor or Variable containing integer sequence lengths, return a 1D tensor or variable 86 | containing the size sequences that will be output by the network. 
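        For each Conv2d layer the time dimension shrinks as
        floor((L_in + 2*padding - dilation*(kernel_size - 1) - 1) / stride + 1),
        which is what the loop below computes from each conv module's attributes
        (index [1] is the time axis).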
87 | :param input_length: 1D Tensor 88 | :return: 1D Tensor scaled by model 89 | """ 90 | seq_len = input_length 91 | for m in self.conv.modules(): 92 | if type(m) == nn.modules.conv.Conv2d: 93 | seq_len = ((seq_len + 2 * m.padding[1] - m.dilation[1] * (m.kernel_size[1] - 1) - 1) / m.stride[1] + 1) 94 | return seq_len.int() 95 | 96 | 97 | class SpeechToText(nn.Module): 98 | def __init__(self, 99 | rnn_type, 100 | rnn_hidden_size, 101 | nb_layers, 102 | bidirectional, 103 | labels, 104 | DEBUG): 105 | 106 | super(SpeechToText, self).__init__() 107 | 108 | self._DEBUG = DEBUG 109 | 110 | # RNN 111 | self.rnns = rnn_block(rnn_hidden_size, rnn_hidden_size, rnn_type, bidirectional, nb_layers) 112 | 113 | # FULLY CO 114 | num_classes = len(labels) 115 | 116 | fully_connected = nn.Sequential( 117 | nn.BatchNorm1d(rnn_hidden_size), 118 | nn.Linear(rnn_hidden_size, num_classes, bias=False) 119 | ) 120 | self.fc = nn.Sequential( 121 | SequenceWise(fully_connected), 122 | ) 123 | self.inference_softmax = InferenceBatchSoftmax() 124 | 125 | 126 | def forward(self, x, output_lengths): 127 | if self._DEBUG: 128 | print('') 129 | print('# BEGIN speech to text #') 130 | print('input', x.size()) 131 | 132 | for rnn in self.rnns: 133 | x = rnn(x, output_lengths) 134 | 135 | if self._DEBUG: 136 | print('after rnn', x.size()) 137 | 138 | x = self.fc(x) 139 | if self._DEBUG: 140 | print('after fc', x.size()) 141 | 142 | x = x.transpose(0, 1) 143 | if self._DEBUG: 144 | print('after transpose', x.size()) 145 | # identity in training mode, softmax in eval mode 146 | x = self.inference_softmax(x) 147 | if self._DEBUG: 148 | print('after softmax', x.size()) 149 | 150 | x = x.transpose(0, 1) 151 | if self._DEBUG: 152 | print('after transpose', x.size()) 153 | 154 | self._DEBUG = False 155 | return x 156 | 157 | 158 | class AccentClassifier(nn.Module): 159 | def __init__(self, 160 | rnn_type, 161 | rnn_hidden_size, 162 | nb_layers, 163 | bidirectional, 164 | accents_dict, 165 | bottleneck_size, 166 | DEBUG): 167 | 168 | super(AccentClassifier, self).__init__() 169 | 170 | self._DEBUG = DEBUG 171 | 172 | # RNN 173 | self.rnns = rnn_block(rnn_hidden_size, rnn_hidden_size, rnn_type, bidirectional, nb_layers) 174 | 175 | # FULLY CO 176 | num_classes = len(accents_dict) 177 | 178 | self.bnf = nn.Sequential( 179 | nn.BatchNorm1d(rnn_hidden_size), 180 | nn.Linear(rnn_hidden_size, 1024), 181 | nn.ReLU(), 182 | nn.BatchNorm1d(1024), 183 | nn.Linear(1024, bottleneck_size), 184 | nn.ReLU(), 185 | ) 186 | 187 | self.fc = nn.Sequential( 188 | nn.BatchNorm1d(bottleneck_size), 189 | nn.Linear(bottleneck_size, num_classes), 190 | nn.ReLU(), 191 | ) 192 | 193 | self.softmax = nn.Softmax(dim=1) 194 | 195 | def forward(self, x, output_lengths): 196 | if self._DEBUG: 197 | print('') 198 | print('# BEGIN Acc #') 199 | print('input', x.size()) 200 | 201 | for rnn in self.rnns: 202 | x = rnn(x, output_lengths) 203 | 204 | if self._DEBUG: 205 | print('after rnn', x.size()) 206 | 207 | x = x.mean(dim=0) 208 | 209 | if self._DEBUG: 210 | print('after mean', x.size()) 211 | 212 | bottleneck = self.bnf(x) 213 | 214 | if self._DEBUG: 215 | print('after bnf', bottleneck.size()) 216 | 217 | x = self.fc(bottleneck) 218 | 219 | if self._DEBUG: 220 | print('after fc', x.size()) 221 | 222 | x = self.softmax(x) 223 | 224 | if self._DEBUG: 225 | print('after softmax', x.size()) 226 | 227 | self._DEBUG = False 228 | return x, bottleneck 229 | 230 | 231 | class MultiTask(nn.Module): 232 | def __init__(self, 233 | use_mfcc_in=True, 234 | 
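                 # architecture note: a shared convolutional + recurrent Head feeds two optional
                 # branches, SpeechToText (CTC over `labels`) and AccentClassifier (softmax over
                 # `accents_dict` plus a bottleneck layer), each enabled by its use_*_out flag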
use_ivectors_in=True, 235 | use_embeddings_in=True, 236 | use_transcripts_out=True, 237 | use_accents_out=True, 238 | mfcc_size=40, 239 | ivector_size=100, 240 | embedding_size=100, 241 | rnn_type=nn.GRU, 242 | labels="abc", 243 | accents_dict={'uk', 'us'}, 244 | rnn_hidden_size=800, 245 | nb_head_layers=2, 246 | nb_speech_layers=2, 247 | nb_accents_layers=2, 248 | bidirectional=True, 249 | bottleneck_size=256, 250 | DEBUG=False): 251 | 252 | self._meta = { 253 | 'use_mfcc_in': use_mfcc_in, 254 | 'use_ivectors_in': use_ivectors_in, 255 | 'use_embeddings_in': use_embeddings_in, 256 | 'use_transcripts_out': use_transcripts_out, 257 | 'use_accents_out': use_accents_out, 258 | 'mfcc_size': mfcc_size, 259 | 'ivector_size': ivector_size, 260 | 'embedding_size': embedding_size, 261 | 'rnn_type': rnn_type, 262 | 'labels': labels, 263 | 'accents_dict': accents_dict, 264 | 'rnn_hidden_size': rnn_hidden_size, 265 | 'nb_head_layers': nb_head_layers, 266 | 'nb_speech_layers': nb_speech_layers, 267 | 'nb_accents_layers': nb_accents_layers, 268 | 'bidirectional': bidirectional, 269 | 'bottleneck_size': bottleneck_size, 270 | 'DEBUG': DEBUG, 271 | } 272 | 273 | super(MultiTask, self).__init__() 274 | 275 | self.feature_len = 0 276 | self.feature_len += mfcc_size if use_mfcc_in else 0 277 | self.feature_len += ivector_size if use_ivectors_in else 0 278 | self.feature_len += embedding_size if use_embeddings_in else 0 279 | 280 | self.Head = Head(rnn_type=rnn_type, 281 | rnn_hidden_size=rnn_hidden_size, 282 | nb_layers=nb_head_layers, 283 | bidirectional=bidirectional, 284 | feature_len=self.feature_len, 285 | DEBUG=DEBUG) 286 | 287 | if self._meta['use_transcripts_out']: 288 | self.SpeechToText = SpeechToText(rnn_type=rnn_type, 289 | rnn_hidden_size=rnn_hidden_size, 290 | nb_layers=nb_speech_layers, 291 | bidirectional=bidirectional, 292 | labels=labels, 293 | DEBUG=DEBUG) 294 | 295 | if self._meta['use_accents_out']: 296 | self.AccentClassifier = AccentClassifier(rnn_type=rnn_type, 297 | rnn_hidden_size=rnn_hidden_size, 298 | nb_layers=nb_accents_layers, 299 | bidirectional=bidirectional, 300 | accents_dict=accents_dict, 301 | bottleneck_size=bottleneck_size, 302 | DEBUG=DEBUG) 303 | 304 | def forward(self, x, lengths): 305 | x, out_len = self.Head(x, lengths) 306 | x_stt, x_acc, bnf = None, None, None 307 | 308 | if self._meta['use_transcripts_out']: 309 | x_stt = self.SpeechToText(x, out_len) 310 | 311 | if self._meta['use_accents_out']: 312 | x_acc, bnf = self.AccentClassifier(x, out_len) 313 | 314 | return x_stt, x_acc, out_len, bnf 315 | 316 | 317 | @staticmethod 318 | def get_param_size(model): 319 | params = 0 320 | for p in model.parameters(): 321 | tmp = 1 322 | for x in p.size(): 323 | tmp *= x 324 | params += tmp 325 | return params 326 | 327 | @classmethod 328 | def load_model(cls, path): 329 | package = torch.load(path, map_location=lambda storage, loc: storage) 330 | meta = package['meta'] 331 | model = cls( 332 | use_mfcc_in = meta['use_mfcc_in'], 333 | use_ivectors_in = meta['use_ivectors_in'], 334 | use_embeddings_in = meta['use_embeddings_in'], 335 | use_transcripts_out = meta['use_transcripts_out'], 336 | use_accents_out = meta['use_accents_out'], 337 | mfcc_size = meta['mfcc_size'], 338 | ivector_size = meta['ivector_size'], 339 | embedding_size = meta['embedding_size'], 340 | rnn_type = meta['rnn_type'], 341 | labels = meta['labels'], 342 | accents_dict = meta['accents_dict'], 343 | rnn_hidden_size = meta['rnn_hidden_size'], 344 | nb_head_layers = meta['nb_head_layers'], 345 | 
nb_speech_layers = meta['nb_speech_layers'], 346 | nb_accents_layers = meta['nb_accents_layers'], 347 | bidirectional = meta['bidirectional'], 348 | bottleneck_size = meta['bottleneck_size'], 349 | DEBUG = meta['DEBUG'], 350 | ) 351 | model.load_state_dict(package['state_dict']) 352 | return model, package 353 | 354 | @staticmethod 355 | def serialize(model, 356 | path='./__temp__', 357 | save=True, 358 | exp_name=None, 359 | optimizer=None, 360 | epoch=None, 361 | train_losses=None, 362 | test_losses=None, 363 | text_train_losses=None, 364 | text_test_losses=None, 365 | text_wers=None, 366 | accent_train_losses=None, 367 | accent_test_losses=None, 368 | accent_accuracies=None): 369 | 370 | """Saves the model in a packaged form. Also returns the package. 371 | Use the load_model class method to recreate a model from a package.""" 372 | 373 | package = { 374 | 'state_dict': model.state_dict(), 375 | 'meta': model._meta 376 | } 377 | 378 | if exp_name is not None: 379 | package['exp_name'] = exp_name 380 | if optimizer is not None: 381 | package['optimizer'] = optimizer 382 | if epoch is not None: 383 | package['epoch'] = epoch 384 | if train_losses is not None: 385 | package['train_losses'] = train_losses 386 | if test_losses is not None: 387 | package['test_losses'] = test_losses 388 | if text_train_losses is not None: 389 | package['text_train_losses'] = text_train_losses 390 | if text_test_losses is not None: 391 | package['text_test_losses'] = text_test_losses 392 | if text_wers is not None: 393 | package['text_wers'] = text_wers 394 | if accent_train_losses is not None: 395 | package['accent_train_losses'] = accent_train_losses 396 | if accent_test_losses is not None: 397 | package['accent_test_losses'] = accent_test_losses 398 | if accent_accuracies is not None: 399 | package['accent_accuracies'] = accent_accuracies 400 | 401 | if save: 402 | torch.save(package, str(path) + '.pth') 403 | 404 | return package -------------------------------------------------------------------------------- /tests.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "torch.Size([20, 960, 240])\n" 13 | ] 14 | } 15 | ], 16 | "source": [ 17 | "%reload_ext autoreload\n", 18 | "%autoreload 1\n", 19 | "%aimport config\n", 20 | "\n", 21 | "conf = config.Config()\n", 22 | "\n", 23 | "from model import MultiTask\n", 24 | "model = MultiTask.load_model('saved_models/SimpleDS_TRAIN__in_mfcc__out_transcripts__nblyrs-head-4-speech-1-accent-1__bnf-256__24-02-2019_23h50m00.pth')\n", 25 | "\n", 26 | "from dataloader import MultiDataset, MultiDataLoader\n", 27 | "import torch\n", 28 | "\n", 29 | "labels = \" 'ABCDEFGHIJKLMNOPQRSTUVWXYZ_\"\n", 30 | "\n", 31 | "\n", 32 | "dataset = MultiDataset('data/splits/dev.csv', labels, \n", 33 | " use_mfcc_in=model._meta['use_mfcc_in'], \n", 34 | " use_ivectors_in=True,#model._meta['use_ivectors_in'], \n", 35 | " use_embeddings_in=True,#model._meta['use_embeddings_in'],\n", 36 | " use_transcripts_out=model._meta['use_transcripts_out'], \n", 37 | " use_accents_out=model._meta['use_accents_out'])\n", 38 | "\n", 39 | "dataloader = MultiDataLoader(dataset, batch_size=20, shuffle=False)\n", 40 | "\n", 41 | "for data in dataloader:\n", 42 | " print(data[0].size())\n", 43 | " break" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 6, 49 | 
"metadata": {}, 50 | "outputs": [ 51 | { 52 | "data": { 53 | "text/plain": [ 54 | "{'use_mfcc_in': True,\n", 55 | " 'use_ivectors_in': False,\n", 56 | " 'use_embeddings_in': False,\n", 57 | " 'use_transcripts_out': True,\n", 58 | " 'use_accents_out': False,\n", 59 | " 'mfcc_size': 40,\n", 60 | " 'ivector_size': 100,\n", 61 | " 'embedding_size': 256,\n", 62 | " 'rnn_type': torch.nn.modules.rnn.GRU,\n", 63 | " 'labels': \"_'ABCDEFGHIJKLMNOPQRSTUVWXYZ \",\n", 64 | " 'accents_dict': {'australia': 0,\n", 65 | " 'canada': 1,\n", 66 | " 'england': 2,\n", 67 | " 'ireland': 3,\n", 68 | " 'scotland': 4,\n", 69 | " 'us': 5,\n", 70 | " 'wales': 6},\n", 71 | " 'rnn_hidden_size': 800,\n", 72 | " 'nb_head_layers': 4,\n", 73 | " 'nb_speech_layers': 1,\n", 74 | " 'nb_accents_layers': 1,\n", 75 | " 'bidirectional': True,\n", 76 | " 'bottleneck_size': 256,\n", 77 | " 'DEBUG': False}" 78 | ] 79 | }, 80 | "execution_count": 6, 81 | "metadata": {}, 82 | "output_type": "execute_result" 83 | } 84 | ], 85 | "source": [] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 70, 90 | "metadata": {}, 91 | "outputs": [ 92 | { 93 | "data": { 94 | "text/plain": [ 95 | "tensor([7])" 96 | ] 97 | }, 98 | "execution_count": 70, 99 | "metadata": {}, 100 | "output_type": "execute_result" 101 | } 102 | ], 103 | "source": [ 104 | "import torch\n", 105 | "sum([torch.tensor([2]), torch.tensor([5])])" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": 65, 111 | "metadata": {}, 112 | "outputs": [], 113 | "source": [ 114 | "t='this is test'\n", 115 | "i = t.find(' ')" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": 66, 121 | "metadata": {}, 122 | "outputs": [ 123 | { 124 | "data": { 125 | "text/plain": [ 126 | "'is test'" 127 | ] 128 | }, 129 | "execution_count": 66, 130 | "metadata": {}, 131 | "output_type": "execute_result" 132 | } 133 | ], 134 | "source": [ 135 | "t[i+1:]" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": 67, 141 | "metadata": {}, 142 | "outputs": [ 143 | { 144 | "data": { 145 | "text/plain": [ 146 | "[{'exp_name_prefix': 'a',\n", 147 | " 'epochs': 2,\n", 148 | " 'labels': \"_'ABCDEFGHIJKLMNOPQRSTUVWXYZ \",\n", 149 | " 'batch_size': 10,\n", 150 | " 'num_workers': 4,\n", 151 | " 'cuda': True,\n", 152 | " 'losses_mix': 0.9,\n", 153 | " 'learning_rate': 0.0003,\n", 154 | " 'mfcc_size': 40,\n", 155 | " 'ivector_size': 100,\n", 156 | " 'embedding_size': 100,\n", 157 | " 'rnn_type': torch.nn.modules.rnn.GRU,\n", 158 | " 'rnn_hidden_size': 800,\n", 159 | " 'nb_head_layers': 3,\n", 160 | " 'nb_speech_layers': 1,\n", 161 | " 'nb_accents_layers': 1,\n", 162 | " 'bidirectional': True,\n", 163 | " 'bottleneck_size': 256,\n", 164 | " 'use_mfcc_in': True,\n", 165 | " 'use_ivectors_in': True,\n", 166 | " 'use_embeddings_in': True,\n", 167 | " 'use_transcripts_out': True,\n", 168 | " 'use_accents_out': False,\n", 169 | " 'decoder_alpha': 0.8,\n", 170 | " 'decoder_beta': 1.0,\n", 171 | " 'decoder_cutoff_top_n': 40,\n", 172 | " 'decoder_cutoff_prob': 1.0,\n", 173 | " 'decoder_beam_width': 100,\n", 174 | " 'lm_path': './data/language_models/cv.lm',\n", 175 | " 'train_manifest': './data/splits/train.csv',\n", 176 | " 'dev_manifest': './data/splits/dev.csv',\n", 177 | " 'test_manifest': './data/splits/test.csv',\n", 178 | " 'tensorboard_path': './tensorboard_runs/',\n", 179 | " 'saved_models_path': './saved_models/'},\n", 180 | " {'exp_name_prefix': 'b',\n", 181 | " 'epochs': 2,\n", 182 | " 'labels': \"_'ABCDEFGHIJKLMNOPQRSTUVWXYZ \",\n", 183 
| " 'batch_size': 10,\n", 184 | " 'num_workers': 4,\n", 185 | " 'cuda': True,\n", 186 | " 'losses_mix': 0.9,\n", 187 | " 'learning_rate': 0.0003,\n", 188 | " 'mfcc_size': 40,\n", 189 | " 'ivector_size': 100,\n", 190 | " 'embedding_size': 100,\n", 191 | " 'rnn_type': torch.nn.modules.rnn.GRU,\n", 192 | " 'rnn_hidden_size': 800,\n", 193 | " 'nb_head_layers': 3,\n", 194 | " 'nb_speech_layers': 1,\n", 195 | " 'nb_accents_layers': 1,\n", 196 | " 'bidirectional': True,\n", 197 | " 'bottleneck_size': 256,\n", 198 | " 'use_mfcc_in': False,\n", 199 | " 'use_ivectors_in': True,\n", 200 | " 'use_embeddings_in': True,\n", 201 | " 'use_transcripts_out': False,\n", 202 | " 'use_accents_out': True,\n", 203 | " 'decoder_alpha': 0.8,\n", 204 | " 'decoder_beta': 1.0,\n", 205 | " 'decoder_cutoff_top_n': 40,\n", 206 | " 'decoder_cutoff_prob': 1.0,\n", 207 | " 'decoder_beam_width': 100,\n", 208 | " 'lm_path': './data/language_models/cv.lm',\n", 209 | " 'train_manifest': './data/splits/train.csv',\n", 210 | " 'dev_manifest': './data/splits/dev.csv',\n", 211 | " 'test_manifest': './data/splits/test.csv',\n", 212 | " 'tensorboard_path': './tensorboard_runs/',\n", 213 | " 'saved_models_path': './saved_models/'},\n", 214 | " {'exp_name_prefix': 'c',\n", 215 | " 'epochs': 2,\n", 216 | " 'labels': \"_'ABCDEFGHIJKLMNOPQRSTUVWXYZ \",\n", 217 | " 'batch_size': 10,\n", 218 | " 'num_workers': 4,\n", 219 | " 'cuda': True,\n", 220 | " 'losses_mix': 0.9,\n", 221 | " 'learning_rate': 0.0003,\n", 222 | " 'mfcc_size': 40,\n", 223 | " 'ivector_size': 100,\n", 224 | " 'embedding_size': 100,\n", 225 | " 'rnn_type': torch.nn.modules.rnn.GRU,\n", 226 | " 'rnn_hidden_size': 800,\n", 227 | " 'nb_head_layers': 3,\n", 228 | " 'nb_speech_layers': 1,\n", 229 | " 'nb_accents_layers': 1,\n", 230 | " 'bidirectional': True,\n", 231 | " 'bottleneck_size': 256,\n", 232 | " 'use_mfcc_in': True,\n", 233 | " 'use_ivectors_in': False,\n", 234 | " 'use_embeddings_in': False,\n", 235 | " 'use_transcripts_out': True,\n", 236 | " 'use_accents_out': True,\n", 237 | " 'decoder_alpha': 0.8,\n", 238 | " 'decoder_beta': 1.0,\n", 239 | " 'decoder_cutoff_top_n': 40,\n", 240 | " 'decoder_cutoff_prob': 1.0,\n", 241 | " 'decoder_beam_width': 100,\n", 242 | " 'lm_path': './data/language_models/cv.lm',\n", 243 | " 'train_manifest': './data/splits/train.csv',\n", 244 | " 'dev_manifest': './data/splits/dev.csv',\n", 245 | " 'test_manifest': './data/splits/test.csv',\n", 246 | " 'tensorboard_path': './tensorboard_runs/',\n", 247 | " 'saved_models_path': './saved_models/'}]" 248 | ] 249 | }, 250 | "execution_count": 67, 251 | "metadata": {}, 252 | "output_type": "execute_result" 253 | } 254 | ], 255 | "source": [ 256 | "conf.patch_config('experiments.cfg')" 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": null, 262 | "metadata": {}, 263 | "outputs": [], 264 | "source": [] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": 4, 269 | "metadata": {}, 270 | "outputs": [], 271 | "source": [ 272 | "from model import MultiTask\n", 273 | "\n", 274 | "model = MultiTask(DEBUG=False, rnn_hidden_size=800, \n", 275 | " use_mfcc_in=conf['use_mfcc_in'], \n", 276 | " use_ivectors_in=conf['use_ivectors_in'], \n", 277 | " use_embeddings_in=conf['use_embeddings_in'],\n", 278 | " use_transcripts_out=conf['use_transcripts_out'], \n", 279 | " use_accents_out=conf['use_accents_out'])" 280 | ] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "execution_count": 24, 285 | "metadata": {}, 286 | "outputs": [ 287 | { 288 | "name": "stdout", 289 | 
"output_type": "stream", 290 | "text": [ 291 | "blib \n", 292 | "\n", 293 | "test\n" 294 | ] 295 | } 296 | ], 297 | "source": [ 298 | "print('blib', '\\n')\n", 299 | "print('test')\n" 300 | ] 301 | }, 302 | { 303 | "cell_type": "code", 304 | "execution_count": 25, 305 | "metadata": {}, 306 | "outputs": [ 307 | { 308 | "data": { 309 | "text/plain": [ 310 | "{'australia': 0, 'canada': 1, 'england': 2, 'us': 3}" 311 | ] 312 | }, 313 | "execution_count": 25, 314 | "metadata": {}, 315 | "output_type": "execute_result" 316 | } 317 | ], 318 | "source": [ 319 | "dataset.accent_dict" 320 | ] 321 | }, 322 | { 323 | "cell_type": "code", 324 | "execution_count": 26, 325 | "metadata": {}, 326 | "outputs": [ 327 | { 328 | "data": { 329 | "text/plain": [ 330 | "[True]" 331 | ] 332 | }, 333 | "execution_count": 26, 334 | "metadata": {}, 335 | "output_type": "execute_result" 336 | } 337 | ], 338 | "source": [ 339 | "conf['use_embeddings_in']" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": 27, 345 | "metadata": {}, 346 | "outputs": [ 347 | { 348 | "data": { 349 | "text/plain": [ 350 | "True" 351 | ] 352 | }, 353 | "execution_count": 27, 354 | "metadata": {}, 355 | "output_type": "execute_result" 356 | } 357 | ], 358 | "source": [ 359 | "model._meta['use_embeddings_in']" 360 | ] 361 | }, 362 | { 363 | "cell_type": "code", 364 | "execution_count": 29, 365 | "metadata": {}, 366 | "outputs": [ 367 | { 368 | "data": { 369 | "application/vnd.jupyter.widget-view+json": { 370 | "model_id": "f3351a1c54734de0b6fe48058fa7e33e", 371 | "version_major": 2, 372 | "version_minor": 0 373 | }, 374 | "text/plain": [ 375 | "HBox(children=(IntProgress(value=0, max=58), HTML(value='')))" 376 | ] 377 | }, 378 | "metadata": {}, 379 | "output_type": "display_data" 380 | }, 381 | { 382 | "name": "stdout", 383 | "output_type": "stream", 384 | "text": [ 385 | "\n" 386 | ] 387 | } 388 | ], 389 | "source": [ 390 | "from tqdm import tqdm_notebook as tqdm\n", 391 | "\n", 392 | "model = model.cuda()\n", 393 | "\n", 394 | "for data in tqdm(dataloader):\n", 395 | " inputs, inputs_lens, transcripts, transcripts_lens, accents = data\n", 396 | "\n", 397 | " \n", 398 | " a, b, c, __ = model(inputs.cuda(), inputs_lens.cuda())" 399 | ] 400 | }, 401 | { 402 | "cell_type": "code", 403 | "execution_count": 10, 404 | "metadata": {}, 405 | "outputs": [], 406 | "source": [ 407 | "MultiTask.serialize(model, 'tmp')\n", 408 | "\n", 409 | "modelb = MultiTask.load_model('tmp')\n" 410 | ] 411 | }, 412 | { 413 | "cell_type": "code", 414 | "execution_count": 12, 415 | "metadata": {}, 416 | "outputs": [], 417 | "source": [ 418 | "modelb = modelb.cuda()" 419 | ] 420 | }, 421 | { 422 | "cell_type": "code", 423 | "execution_count": 13, 424 | "metadata": {}, 425 | "outputs": [ 426 | { 427 | "data": { 428 | "application/vnd.jupyter.widget-view+json": { 429 | "model_id": "8850c82f6c9d4458bc727af18e20630b", 430 | "version_major": 2, 431 | "version_minor": 0 432 | }, 433 | "text/plain": [ 434 | "HBox(children=(IntProgress(value=0, max=571), HTML(value='')))" 435 | ] 436 | }, 437 | "metadata": {}, 438 | "output_type": "display_data" 439 | }, 440 | { 441 | "name": "stdout", 442 | "output_type": "stream", 443 | "text": [ 444 | "\n" 445 | ] 446 | } 447 | ], 448 | "source": [ 449 | "for data in tqdm(dataloader):\n", 450 | " inputs, inputs_lens, transcripts, transcripts_lens, accents = data\n", 451 | "\n", 452 | " \n", 453 | " a, b, c = modelb(inputs.cuda(), inputs_lens.cuda())\n" 454 | ] 455 | }, 456 | { 457 | "cell_type": "code", 458 | 
"execution_count": null, 459 | "metadata": {}, 460 | "outputs": [], 461 | "source": [ 462 | "@classmethod\n", 463 | "def load_model(cls, path):\n", 464 | " package = torch.load(path, map_location=lambda storage, loc: storage)\n", 465 | " model = cls(rnn_hidden_size=package['hidden_size'], nb_layers=package['nb_layers'],\n", 466 | " labels=package['labels'], audio_conf=package['audio_conf'],\n", 467 | " rnn_type=supported_rnns[package['rnn_type']], bidirectional=package.get('bidirectional', True))\n", 468 | " model.load_state_dict(package['state_dict'])\n", 469 | " for x in model.rnns:\n", 470 | " x.flatten_parameters()\n", 471 | " return model\n", 472 | "\n", 473 | "@classmethod\n", 474 | "def load_model_package(cls, package):\n", 475 | " model = cls(rnn_hidden_size=package['hidden_size'], nb_layers=package['nb_layers'],\n", 476 | " labels=package['labels'], audio_conf=package['audio_conf'],\n", 477 | " rnn_type=supported_rnns[package['rnn_type']], bidirectional=package.get('bidirectional', True))\n", 478 | " model.load_state_dict(package['state_dict'])\n", 479 | " return model\n", 480 | "\n", 481 | "@staticmethod\n", 482 | "def serialize(model, optimizer=None, epoch=None, iteration=None, loss_results=None,\n", 483 | " main_loss_results=None, side_loss_results=None,\n", 484 | " cer_results=None, wer_results=None, mca_results=None, avg_loss=None, meta=None):\n", 485 | " model = model.module if DeepSpeech.is_parallel(model) else model\n", 486 | " package = {\n", 487 | " 'version': model._version,\n", 488 | " 'hidden_size': model._hidden_size,\n", 489 | " 'nb_layers': model._nb_layers,\n", 490 | " 'rnn_type': supported_rnns_inv.get(model._rnn_type, model._rnn_type.__name__.lower()),\n", 491 | " 'audio_conf': model._audio_conf,\n", 492 | " 'labels': model._labels,\n", 493 | " 'state_dict': model.state_dict(),\n", 494 | " 'bidirectional': model._bidirectional\n", 495 | " }\n", 496 | " if optimizer is not None:\n", 497 | " package['optim_dict'] = optimizer.state_dict()\n", 498 | " if avg_loss is not None:\n", 499 | " package['avg_loss'] = avg_loss\n", 500 | " if epoch is not None:\n", 501 | " package['epoch'] = epoch + 1 # increment for readability\n", 502 | " if iteration is not None:\n", 503 | " package['iteration'] = iteration\n", 504 | " if loss_results is not None:\n", 505 | " package['loss_results'] = loss_results\n", 506 | " package['main_loss_results'] = main_loss_results\n", 507 | " package['side_loss_results'] = side_loss_results\n", 508 | " package['cer_results'] = cer_results\n", 509 | " package['wer_results'] = wer_results\n", 510 | " package['mca_results'] = mca_results\n", 511 | " if meta is not None:\n", 512 | " package['meta'] = meta\n", 513 | " return package" 514 | ] 515 | } 516 | ], 517 | "metadata": { 518 | "kernelspec": { 519 | "display_name": "Python 3", 520 | "language": "python", 521 | "name": "python3" 522 | }, 523 | "language_info": { 524 | "codemirror_mode": { 525 | "name": "ipython", 526 | "version": 3 527 | }, 528 | "file_extension": ".py", 529 | "mimetype": "text/x-python", 530 | "name": "python", 531 | "nbconvert_exporter": "python", 532 | "pygments_lexer": "ipython3", 533 | "version": "3.6.8" 534 | } 535 | }, 536 | "nbformat": 4, 537 | "nbformat_minor": 2 538 | } 539 | --------------------------------------------------------------------------------