├── README.md
├── utils.py
├── experiments.cfg
├── focalloss.py
├── .gitignore
├── default_config.cfg
├── config.py
├── test_model.py
├── modules.py
├── training.py
├── dataloader.py
├── decoder.py
├── data
│   └── language_models
│       ├── lnn_tri.lm
│       └── lnn_bi.lm
├── run_experiment.py
├── model.py
└── tests.ipynb
/README.md:
--------------------------------------------------------------------------------
1 | # AccentedSpeechRecognition
2 | Experiments on speech recognition robustness to accents and dialects.
3 |
4 | Part of the code was borrowed from https://github.com/SeanNaren/deepspeech.pytorch; please follow their README for setup.
5 |
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 | import time
4 |
5 | def tile(a, dim, n_tile):
6 | """Expands a tensor amongst a given dimension, repeating its components."""
7 | init_dim = a.size(dim)
8 | repeat_idx = [1] * a.dim()
9 | repeat_idx[dim] = n_tile
10 | a = a.repeat(*(repeat_idx))
11 | order_index = torch.LongTensor(np.concatenate([init_dim * np.arange(n_tile) + i for i in range(init_dim)]))
12 | if a.is_cuda:
13 | order_index = order_index.cuda()
14 | return torch.index_select(a, dim, order_index)
15 |
16 | def now_str():
17 | return time.strftime("%d-%m-%Y_%Hh%Mm%S")
--------------------------------------------------------------------------------
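The `tile` helper above repeats each slice of a tensor along one dimension (the data loader uses it to stretch i-vectors and utterance embeddings to the MFCC frame rate). A minimal sketch of its effect, with illustrative values only:

    import torch
    from utils import tile

    x = torch.tensor([[1., 2.],
                      [3., 4.]])   # shape (2, 2)
    y = tile(x, 0, 3)              # repeat each row 3 times along dim 0
    # y == [[1,2],[1,2],[1,2],[3,4],[3,4],[3,4]], shape (6, 2)
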
/experiments.cfg:
--------------------------------------------------------------------------------
1 | # List of experiments settings to override default_config.cfg
2 | # Use '#' for comments and '!' to separate experiments
3 |
4 |
5 |
6 | # general
7 | exp_name_prefix 'TestMulti'
8 |
9 | # hyper params
10 | nb_head_layers 4
11 | nb_speech_layers 1
12 | nb_accents_layers 1
13 |
14 | embedding_size 256
15 |
16 | # network config
17 | use_mfcc_in True
18 | use_ivectors_in False
19 | use_embeddings_in False
20 | use_transcripts_out True
21 | use_accents_out True
22 |
23 | !
24 |
25 |
26 | # general
27 | exp_name_prefix 'TestMulti'
28 |
29 | # hyper params
30 | nb_head_layers 4
31 | nb_speech_layers 1
32 | nb_accents_layers 1
33 |
34 | embedding_size 256
35 |
36 | # network config
37 | use_mfcc_in True
38 | use_ivectors_in False
39 | use_embeddings_in True
40 | use_transcripts_out True
--------------------------------------------------------------------------------
/focalloss.py:
--------------------------------------------------------------------------------
1 | # Code taken from https://github.com/clcarwin/focal_loss_pytorch
2 |
3 | import torch
4 | import torch.nn as nn
5 | import torch.nn.functional as F
6 | from torch.autograd import Variable
7 |
8 | class FocalLoss(nn.Module):
9 | def __init__(self, gamma=0, alpha=None, size_average=True):
10 | super(FocalLoss, self).__init__()
11 | self.gamma = gamma
12 | self.alpha = alpha
13 | if isinstance(alpha,(float,int)): self.alpha = torch.Tensor([alpha,1-alpha])
14 | if isinstance(alpha,list): self.alpha = torch.Tensor(alpha)
15 | self.size_average = size_average
16 |
17 | def forward(self, input, target):
18 | if input.dim()>2:
19 | input = input.view(input.size(0),input.size(1),-1) # N,C,H,W => N,C,H*W
20 | input = input.transpose(1,2) # N,C,H*W => N,H*W,C
21 | input = input.contiguous().view(-1,input.size(2)) # N,H*W,C => N*H*W,C
22 | target = target.view(-1,1)
23 |
24 | logpt = F.log_softmax(input, dim=1) # log-probabilities over the class dimension
25 | logpt = logpt.gather(1,target)
26 | logpt = logpt.view(-1)
27 | pt = Variable(logpt.data.exp())
28 |
29 | if self.alpha is not None:
30 | if self.alpha.type()!=input.data.type():
31 | self.alpha = self.alpha.type_as(input.data)
32 | at = self.alpha.gather(0,target.data.view(-1))
33 | logpt = logpt * Variable(at)
34 |
35 | loss = -1 * (1-pt)**self.gamma * logpt
36 | if self.size_average: return loss.mean()
37 | else: return loss.sum()
--------------------------------------------------------------------------------
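FocalLoss above implements loss = -(1 - pt)^gamma * log(pt), which down-weights examples the model already classifies confidently; with gamma=0 it reduces to plain cross-entropy. A minimal usage sketch with illustrative shapes (5 accent classes, batch of 8):

    import torch
    from focalloss import FocalLoss

    logits = torch.randn(8, 5, requires_grad=True)   # (batch, n_classes)
    targets = torch.randint(0, 5, (8,))
    criterion = FocalLoss(gamma=2)                   # gamma=0 gives plain cross-entropy
    loss = criterion(logits, targets)
    loss.backward()
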
/.gitignore:
--------------------------------------------------------------------------------
1 |
2 | bak/
3 | log.txt
4 | *.wav
5 | mfccs/
6 | embeddings*/
7 | *_dataset/
8 | txt/
9 | wav/
10 | ivectors/
11 | saved_models/
12 | tensorboard_runs/
13 |
14 | # Byte-compiled / optimized / DLL files
15 | __pycache__/
16 | *.py[cod]
17 | *$py.class
18 |
19 | # C extensions
20 | *.so
21 |
22 | # Distribution / packaging
23 | .Python
24 | build/
25 | develop-eggs/
26 | dist/
27 | downloads/
28 | eggs/
29 | .eggs/
30 | lib/
31 | lib64/
32 | parts/
33 | sdist/
34 | var/
35 | wheels/
36 | *.egg-info/
37 | .installed.cfg
38 | *.egg
39 | MANIFEST
40 |
41 | # PyInstaller
42 | # Usually these files are written by a python script from a template
43 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
44 | *.manifest
45 | *.spec
46 |
47 | # Installer logs
48 | pip-log.txt
49 | pip-delete-this-directory.txt
50 |
51 | # Unit test / coverage reports
52 | htmlcov/
53 | .tox/
54 | .coverage
55 | .coverage.*
56 | .cache
57 | nosetests.xml
58 | coverage.xml
59 | *.cover
60 | .hypothesis/
61 | .pytest_cache/
62 |
63 | # Translations
64 | *.mo
65 | *.pot
66 |
67 | # Django stuff:
68 | *.log
69 | local_settings.py
70 | db.sqlite3
71 |
72 | # Flask stuff:
73 | instance/
74 | .webassets-cache
75 |
76 | # Scrapy stuff:
77 | .scrapy
78 |
79 | # Sphinx documentation
80 | docs/_build/
81 |
82 | # PyBuilder
83 | target/
84 |
85 | # Jupyter Notebook
86 | .ipynb_checkpoints
87 |
88 | # pyenv
89 | .python-version
90 |
91 | # celery beat schedule file
92 | celerybeat-schedule
93 |
94 | # SageMath parsed files
95 | *.sage.py
96 |
97 | # Environments
98 | .env
99 | .venv
100 | env/
101 | venv/
102 | ENV/
103 | env.bak/
104 | venv.bak/
105 |
106 | # Spyder project settings
107 | .spyderproject
108 | .spyproject
109 |
110 | # Rope project settings
111 | .ropeproject
112 |
113 | # mkdocs documentation
114 | /site
115 |
116 | # mypy
117 | .mypy_cache/
118 |
--------------------------------------------------------------------------------
/default_config.cfg:
--------------------------------------------------------------------------------
1 | # configuration: separate name and values (can be multiple) with a space
2 | # if multiple values exist for a field, multiple experiments will be run
3 | # (see config.py: Config.create_multi_dict())
4 |
5 | # general
6 | exp_name_prefix ''
7 | epochs 30
8 | labels "_'ABCDEFGHIJKLMNOPQRSTUVWXYZ "
9 | batch_size 40
10 | num_workers 4
11 | cuda True
12 |
13 | # hyper params
14 | losses_mix 0.9
15 | learning_rate 3e-4
16 | mfcc_size 40
17 | ivector_size 100
18 | embedding_size 100
19 | rnn_type nn.GRU
20 | rnn_hidden_size 800
21 | nb_head_layers 3
22 | nb_speech_layers 1
23 | nb_accents_layers 1
24 | bidirectional True
25 | bottleneck_size 256
26 | accent_loss 'focal'
27 |
28 | # network config
29 | use_mfcc_in True
30 | use_ivectors_in False
31 | use_embeddings_in False
32 | use_transcripts_out True
33 | use_accents_out False
34 |
35 | # decoder
36 | decoder_alpha 0.8
37 | decoder_beta 1.
38 | decoder_cutoff_top_n 40
39 | decoder_cutoff_prob 1.
40 | decoder_beam_width 100
41 |
42 | # paths
43 | lm_path './data/language_models/cv.lm'
44 | train_manifest './data/CommonVoice_dataset/splits/train.csv'
45 | dev_manifest './data/CommonVoice_dataset/splits/dev.csv'
46 | test_manifest './data/CommonVoice_dataset/splits/test.csv'
47 | tensorboard_path './tensorboard_runs/'
48 | saved_models_path './saved_models/'
49 |
50 | # tests
51 | testing_manifests [('./data/CommonVoice_dataset/splits/test.csv', './data/language_models/cv.lm'), ('./data/CommonVoice_dataset/splits/dev.csv', './data/language_models/cv.lm'), ('./data/CommonVoice_dataset/splits/testnz.csv', './data/language_models/cv.lm'), ('./data/CommonVoice_dataset/splits/testin.csv', './data/language_models/cv.lm'), ('./data/Logi_dataset/splits/nonnative.csv', './data/language_models/lnn_tri.lm'), ('./data/Logi_dataset/splits/native.csv', './data/language_models/lnn_tri.lm')]
52 | #testing_manifests [('./data/Logi_dataset/splits/nonnative.csv', './data/language_models/lnn_bi.lm'), ('./data/Logi_dataset/splits/native.csv', './data/language_models/lnn_bi.lm')]
53 |
54 |
--------------------------------------------------------------------------------
/config.py:
--------------------------------------------------------------------------------
1 | import collections.abc
2 | import torch.nn as nn
3 |
4 | class Config(collections.abc.MutableMapping):
5 | """A dictionary that applies an arbitrary key-altering
6 | function before accessing the keys"""
7 |
8 | def __init__(self, config_path='./default_config.cfg', sep=' ', *args, **kwargs):
9 | self.store = dict()
10 | self.update(dict(*args, **kwargs)) # use the free update to set keys
11 |
12 | with open(config_path, 'r') as f:
13 | confs = {}
14 | for l in f.readlines():
15 | if (l[0] != '#') and (l[0] != '\n'): # remove comments and empty lines
16 | sep_idx = l.find(sep)
17 | confs[l[:sep_idx]] = eval(l[sep_idx+1:])
18 | self.update(confs)
19 |
20 | def __getitem__(self, key):
21 | return self.store[self.__keytransform__(key)]
22 |
23 | def __setitem__(self, key, value):
24 | self.store[self.__keytransform__(key)] = value
25 |
26 | def __delitem__(self, key):
27 | del self.store[self.__keytransform__(key)]
28 |
29 | def __iter__(self):
30 | return iter(self.store)
31 |
32 | def __len__(self):
33 | return len(self.store)
34 |
35 | def __keytransform__(self, key):
36 | return key
37 |
38 | def __str__(self):
39 | return self.store.__str__()
40 |
41 | def __repr__(self):
42 | return self.store.__repr__()
43 |
44 |
45 | # def create_multi_dict(self):
46 | # """ Not recomended, please use the patch_config method instead
47 | # """
48 | # """ Used to create as much configuration needed to run experiments with
49 | # all the possible combinations of values in the conf file."""
50 | # prev_configs = [{}]
51 | # for key, vals in self.store.items():
52 | # new_configs = []
53 | # for v in vals:
54 | # for conf in prev_configs:
55 | # new_conf = {}
56 | # new_conf.update(conf)
57 | # new_configs.append(new_conf)
58 | # new_conf[key] = v
59 | #
60 | # prev_configs = new_configs
61 | #
62 | # return new_configs
63 |
64 | def patch_config(self, patch_path, patch_sep='!', sep=' '):
65 | """Takes a file with config patches separated by a line
66 | starting with the 'patch_sep' argument.
67 | For each creates a new config based on the default one."""
68 |
69 | new_configs = []
70 |
71 | with open(patch_path, 'r') as f:
72 | current = {}
73 | for l in f.readlines():
74 | if (l[0] != '#') and (l[0] != '\n'):
75 | if (l[0] == '!'):
76 | new_configs.append(current)
77 | current = {}
78 | else:
79 | sep_idx = l.find(sep)
80 | current[l[:sep_idx]] = eval(l[sep_idx+1:])
81 |
82 | # Checks if last patch was added
83 | if len(current) > 0:
84 | new_configs.append(current)
85 |
86 | final_configs = [self.store.copy() for __ in range(len(new_configs))]
87 | [store.update(conf) for conf, store in zip(new_configs, final_configs)]
88 |
89 | return final_configs if len(final_configs) > 0 else self.store
90 |
--------------------------------------------------------------------------------
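Config above parses default_config.cfg into a dict-like object (values are passed through eval, so strings, numbers, booleans and even nn.GRU resolve to Python objects), and patch_config expands an experiments file into one configuration per '!'-separated block. A minimal usage sketch, assuming the repository layout shown above:

    from config import Config

    confs = Config('./default_config.cfg')             # e.g. confs['batch_size'] == 40
    experiments = confs.patch_config('./experiments.cfg')
    for exp in experiments:                             # one dict per '!'-separated patch
        print(exp['exp_name_prefix'], exp['nb_head_layers'], exp['use_embeddings_in'])
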
/test_model.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function
2 | from model import MultiTask
3 | from training import test
4 | from dataloader import MultiDataset, MultiDataLoader
5 | import torch.nn as nn
6 | import torch
7 | from focalloss import FocalLoss
8 | from warpctc_pytorch import CTCLoss
9 | from decoder import GreedyDecoder, BeamCTCDecoder
10 | import sys
12 | from pathlib import Path
13 |
14 | PRINT_LATEX_TABLE = True
15 |
16 | manual_seed = 666
17 | torch.manual_seed(manual_seed)
18 | torch.cuda.manual_seed_all(manual_seed)
19 | print(f'Using torch manual seed {manual_seed}.')
20 |
21 | def eprint(*args, **kwargs):
22 | print(*args, file=sys.stderr, **kwargs)
23 |
24 |
25 | def result_for_manifest(model, criterion, manifest, decoder, target_decoder, batch_size, num_workers):
26 | ### LOADER
27 | test_dataset = MultiDataset(manifest,
28 | model._meta['labels'],
29 | use_mfcc_in=model._meta['use_mfcc_in'],
30 | use_ivectors_in=model._meta['use_ivectors_in'],
31 | use_embeddings_in=model._meta['use_embeddings_in'],
32 | embedding_size=model._meta['embedding_size'],
33 | use_transcripts_out=model._meta['use_transcripts_out'],
34 | use_accents_out=model._meta['use_accents_out'])
35 |
36 | test_loader = MultiDataLoader(test_dataset,
37 | batch_size=batch_size,
38 | shuffle=True,
39 | num_workers=num_workers)
40 |
41 | ### TEST
42 | test_results = test(model, test_loader, criterion, decoder, target_decoder)
43 | test_loss, test_loss_text, test_loss_accent, test_wer, test_accent_acc = test_results
44 |
45 | results_dict = {}
46 |
47 | if test_wer != -1:
48 | results_dict['WER'] = test_wer
49 | if test_accent_acc != -1:
50 | results_dict['Accent accuracy'] = test_accent_acc
51 |
52 | return results_dict
53 |
54 |
55 | def main(model_path, confs):
56 | model, __ = MultiTask.load_model(model_path)
57 | if confs['cuda']:
58 | model = model.cuda()
59 |
60 |
61 | if not model._meta['use_transcripts_out']: # only accent classification
62 | criterion = nn.CrossEntropyLoss()
63 | elif not model._meta['use_accents_out']: # only text recognition
64 | criterion = CTCLoss()
65 | else: # both tasks
66 | criterion = (CTCLoss(), nn.CrossEntropyLoss())
67 |
68 |
69 | # Results
70 | results = {}
71 | for manifest, lm in confs['testing_manifests']:
72 | eprint(f'\n### Testing {manifest.split("/")[-1]} for model {Path(model_path).stem.split("_")[0]}')
73 |
74 | # Decoder
75 | if model._meta['use_transcripts_out']:
76 | decoder = BeamCTCDecoder(confs['labels'],
77 | lm_path=lm,
78 | alpha=confs['decoder_alpha'],
79 | beta=confs['decoder_beta'],
80 | cutoff_top_n=confs['decoder_cutoff_top_n'],
81 | cutoff_prob=confs['decoder_cutoff_prob'],
82 | beam_width=confs['decoder_beam_width'],
83 | num_processes=confs['num_workers'])
84 |
85 | target_decoder = GreedyDecoder(confs['labels'])
86 | else:
87 | decoder, target_decoder = None, None
88 |
89 | # Test
90 | results[manifest.split('/')[-1]] = result_for_manifest(model, criterion, manifest, decoder, target_decoder, confs['batch_size'], confs['num_workers'])
91 |
92 |
93 | if not PRINT_LATEX_TABLE:
94 | print(f'Model: {model_path.split("/")[-1]}')
95 | for name, res in results.items():
96 | print(f'\nResults for {name}:')
97 | print('; '.join([f'{k}: {v:.3f}' for k, v in res.items()]))
98 | else:
99 | print(' & '.join(['model']+list([k[:-4] for k in results.keys()])))
100 | val_dict = {}
101 | for k in list(results.values())[0].keys():
102 | val_dict[k] = []
103 | for res in results.values():
104 | [val_dict[k].append(f'{v:.1f}') for k, v in res.items()]
105 | for val in val_dict.values():
106 | print(' & '.join([Path(model_path).stem.split('_')[0]]+val)+r' \\')
107 |
108 | if __name__ == '__main__':
109 | import config
110 | confs = config.Config()
111 |
112 | args = sys.argv[1:]
113 |
114 | if PRINT_LATEX_TABLE:
115 | eprint('\nLatex output selected, change PRINT_LATEX_TABLE in script to False for regular output.')
116 |
117 | for model_path in args:
118 | main(model_path, confs)
--------------------------------------------------------------------------------
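test_model.py is invoked from the command line with one or more saved model paths and evaluates each model on every manifest listed in testing_manifests; a hypothetical invocation (the checkpoint name is illustrative only):

    python test_model.py ./saved_models/TestMulti_experiment.pth

Set PRINT_LATEX_TABLE to False at the top of the script to get plain-text results instead of LaTeX table rows.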
/modules.py:
--------------------------------------------------------------------------------
1 | import math
2 | from collections import OrderedDict
3 |
4 | import torch
5 | import torch.nn as nn
6 | import torch.nn.functional as F
7 | from torch.nn.parameter import Parameter
8 | from torch.autograd import Variable
9 |
10 | supported_rnns = {
11 | 'lstm': nn.LSTM,
12 | 'rnn': nn.RNN,
13 | 'gru': nn.GRU
14 | }
15 | supported_rnns_inv = dict((v, k) for k, v in supported_rnns.items())
16 |
17 |
18 | class SequenceWise(nn.Module):
19 | def __init__(self, module):
20 | """
21 | Collapses input of dim T*N*H to (T*N)*H, and applies the given module.
22 | Allows handling of variable sequence lengths and minibatch sizes.
23 | :param module: Module to apply input to.
24 | """
25 | super(SequenceWise, self).__init__()
26 | self.module = module
27 |
28 | def forward(self, x):
29 | t, n = x.size(0), x.size(1)
30 | x = x.view(t * n, -1)
31 | x = self.module(x)
32 | x = x.view(t, n, -1)
33 | return x
34 |
35 | def __repr__(self):
36 | tmpstr = self.__class__.__name__ + ' (\n'
37 | tmpstr += self.module.__repr__()
38 | tmpstr += ')'
39 | return tmpstr
40 |
41 |
42 | class MaskConv(nn.Module):
43 | def __init__(self, seq_module):
44 | """
45 | Adds padding to the output of the module based on the given lengths. This is to ensure that the
46 | results of the model do not change when batch sizes change during inference.
47 | Input needs to be in the shape of (BxCxDxT)
48 | :param seq_module: The sequential module containing the conv stack.
49 | """
50 | super(MaskConv, self).__init__()
51 | self.seq_module = seq_module
52 |
53 | def forward(self, x, lengths):
54 | """
55 | :param x: The input of size BxCxDxT
56 | :param lengths: The actual length of each sequence in the batch
57 | :return: Masked output from the module
58 | """
59 | for module in self.seq_module:
60 | x = module(x)
61 | mask = torch.ByteTensor(x.size()).fill_(0)
62 | if x.is_cuda:
63 | mask = mask.cuda()
64 | for i, length in enumerate(lengths):
65 | length = length.item()
66 | if (mask[i].size(2) - length) > 0:
67 | mask[i].narrow(2, length, mask[i].size(2) - length).fill_(1)
68 | x = x.masked_fill(mask, 0)
69 | return x, lengths
70 |
71 |
72 | class InferenceBatchSoftmax(nn.Module):
73 | def forward(self, input_):
74 | if not self.training:
75 | return F.softmax(input_, dim=-1)
76 | else:
77 | return input_
78 |
79 |
80 | class BatchRNN(nn.Module):
81 | def __init__(self, input_size, hidden_size, rnn_type=nn.LSTM, bidirectional=False, batch_norm=True):
82 | super(BatchRNN, self).__init__()
83 | self.input_size = input_size
84 | self.hidden_size = hidden_size
85 | self.bidirectional = bidirectional
86 | self.batch_norm = SequenceWise(nn.BatchNorm1d(input_size)) if batch_norm else None
87 | self.rnn = rnn_type(input_size=input_size, hidden_size=hidden_size,
88 | bidirectional=bidirectional, bias=True)
89 | self.num_directions = 2 if bidirectional else 1
90 |
91 | def flatten_parameters(self):
92 | self.rnn.flatten_parameters()
93 |
94 | def forward(self, x, output_lengths):
95 | if self.batch_norm is not None:
96 | x = self.batch_norm(x)
97 | x = nn.utils.rnn.pack_padded_sequence(x, output_lengths)
98 | x, h = self.rnn(x)
99 | x, _ = nn.utils.rnn.pad_packed_sequence(x)
100 | if self.bidirectional:
101 | x = x.view(x.size(0), x.size(1), 2, -1).sum(2).view(x.size(0), x.size(1), -1) # (TxNxH*2) -> (TxNxH) by sum
102 | return x
103 |
104 |
105 | class Lookahead(nn.Module):
106 | # Wang et al 2016 - Lookahead Convolution Layer for Unidirectional Recurrent Neural Networks
107 | # input shape - sequence, batch, feature - TxNxH
108 | # output shape - same as input
109 | def __init__(self, n_features, context):
110 | # should we handle batch_first=True?
111 | super(Lookahead, self).__init__()
112 | self.n_features = n_features
113 | self.weight = Parameter(torch.Tensor(n_features, context + 1))
114 | assert context > 0
115 | self.context = context
116 | self.register_parameter('bias', None)
117 | self.init_parameters()
118 |
119 | def init_parameters(self): # what's a better way to initialise this layer?
120 | stdv = 1. / math.sqrt(self.weight.size(1))
121 | self.weight.data.uniform_(-stdv, stdv)
122 |
123 | def forward(self, input):
124 | seq_len = input.size(0)
125 | # pad the 0th dimension (T/sequence) with zeroes whose number = context
126 | # Once pytorch's padding functions have settled, should move to those.
127 | padding = torch.zeros(self.context, *(input.size()[1:])).type_as(input.data)
128 | x = torch.cat((input, Variable(padding)), 0)
129 |
130 | # add lookahead windows (with context+1 width) as a fourth dimension
131 | # for each seq-batch-feature combination
132 | x = [x[i:i + self.context + 1] for i in range(seq_len)] # TxLxNxH - sequence, context, batch, feature
133 | x = torch.stack(x)
134 | x = x.permute(0, 2, 3, 1) # TxNxHxL - sequence, batch, feature, context
135 |
136 | x = torch.mul(x, self.weight).sum(dim=3)
137 | return x
138 |
139 | def __repr__(self):
140 | return self.__class__.__name__ + '(' \
141 | + 'n_features=' + str(self.n_features) \
142 | + ', context=' + str(self.context) + ')'
143 |
144 |
--------------------------------------------------------------------------------
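BatchRNN above expects time-major input (T x N x H) together with the valid length of each sequence, packs it for the RNN, and sums the two directions when bidirectional. A minimal sketch with illustrative sizes:

    import torch
    import torch.nn as nn
    from modules import BatchRNN

    rnn = BatchRNN(input_size=40, hidden_size=64, rnn_type=nn.GRU, bidirectional=True)
    x = torch.randn(100, 4, 40)                    # T=100 frames, batch of 4, 40 features
    lens = torch.tensor([100, 90, 80, 70])         # descending, as produced by the collate_fn
    out = rnn(x, lens)                             # (100, 4, 64): directions summed, not concatenated
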
/training.py:
--------------------------------------------------------------------------------
1 | from tqdm import tqdm
2 | import torch
3 | import numpy as np
4 | import gc
5 |
6 | def get_mixed_loss(criterion, out_text, out_accent, out_lens, accents, transcripts, transcripts_lens, mix=0.5, corrective_coef=1000):
7 | loss, loss_text, loss_accent = None, None, None
8 |
9 | if out_text is None:
10 | loss_accent = criterion(out_accent, accents)
11 | loss = loss_accent
12 | elif out_accent is None:
13 | loss_text = criterion(out_text, transcripts, out_lens, transcripts_lens)
14 | loss = loss_text
15 | else:
16 | loss_text = criterion[0](out_text, transcripts, out_lens, transcripts_lens)
17 | loss_accent = criterion[1](out_accent, accents)
18 |
19 | if loss_accent.is_cuda:
20 | loss_text = loss_text.cuda()
21 |
22 | loss = mix * loss_text + (1 - mix) * loss_accent * corrective_coef
23 |
24 | return loss, loss_text, loss_accent
25 |
26 |
27 | ### TRAINING
28 |
29 | def train(model, train_loader, criterion, optimizer, losses_mix=0.5):
30 | epoch_losses = []
31 | epoch_losses_text = []
32 | epoch_losses_accent = []
33 |
34 | model.train()
35 |
36 | for data in tqdm(train_loader, total=len(train_loader)):
37 |
38 | inputs, inputs_lens, transcripts, transcripts_lens, accents = data
39 |
40 | if next(model.parameters()).is_cuda:
41 | inputs = inputs.cuda()
42 | inputs_lens = inputs_lens.cuda()
43 |
44 | if accents is not None:
45 | accents = accents.cuda()
46 |
47 | out_text, out_accent, out_lens, __ = model(inputs, inputs_lens)
48 |
49 | loss, loss_text, loss_accent = get_mixed_loss(criterion, out_text, out_accent,
50 | out_lens, accents, transcripts,
51 | transcripts_lens, losses_mix)
52 |
53 | optimizer.zero_grad()
54 | loss.backward()
55 | optimizer.step()
56 |
57 | l = loss.clone().item() if loss is not None else None
58 | lt = loss_text.clone().item() if loss_text is not None else None
59 | la = loss_accent.clone().item() if loss_accent is not None else None
60 | epoch_losses.append(l)
61 | epoch_losses_text.append(lt)
62 | epoch_losses_accent.append(la)
63 |
64 |
65 | average_loss = lambda l: sum(l) / len(train_loader) if l[0] is not None else -1
66 |
67 | epoch_loss_i = average_loss(epoch_losses)
68 | epoch_loss_text_i = average_loss(epoch_losses_text)
69 | epoch_loss_accent_i = average_loss(epoch_losses_accent)
70 |
71 | return epoch_loss_i, epoch_loss_text_i, epoch_loss_accent_i
72 |
73 |
74 | ### TESTING
75 |
76 | def check_wer(transcripts, transcripts_lens, out, out_lens, decoder, target_decoder):
77 | split_transcripts = []
78 | offset = 0
79 | for size in transcripts_lens:
80 | split_transcripts.append(transcripts[offset:offset + size])
81 | offset += size
82 |
83 | decoded_output, _ = decoder.decode(out.data.transpose(0,1), out_lens)
84 | target_strings = target_decoder.convert_to_strings(split_transcripts)
85 |
86 | #if True:
87 | # print('targets', targets)
88 | # print('split_targets', split_targets)
89 | # print('out', out)
90 | # print('output_len', output_len)
91 | # print('decoded', decoded_output)
92 | # print('target', target_strings)
93 |
94 | wer, cer = 0, 0
95 | for x in range(len(target_strings)):
96 | transcript, reference = decoded_output[x][0], target_strings[x][0]
97 | wer += decoder.wer(transcript, reference) / float(len(reference.split()))
98 | #cer += decoder.cer(transcript, reference) / float(len(reference))
99 | wer /= len(target_strings)
100 | return wer * 100
101 |
102 |
103 | def check_acc(accents, out):
104 | out_arg = np.argmax(out, axis=1)
105 | diff = torch.eq(out_arg, accents.cpu())
106 | acc = torch.sum(diff)
107 | return acc.item() / len(accents) * 100
108 |
109 |
110 | def test(model, test_loader, criterion, decoder, target_decoder, losses_mix=0.5):
111 | with torch.no_grad():
112 | model.eval()
113 |
114 | epoch_losses = []
115 | epoch_losses_text = []
116 | epoch_losses_accent = []
117 |
118 | epoch_wers = []
119 | epoch_accent_accs = []
120 |
121 | for data in tqdm(test_loader, total=len(test_loader)):
122 | inputs, inputs_lens, transcripts, transcripts_lens, accents = data
123 |
124 | if next(model.parameters()).is_cuda:
125 | inputs = inputs.cuda()
126 | inputs_lens = inputs_lens.cuda()
127 |
128 | if accents is not None:
129 | accents = accents.cuda()
130 |
131 | out_text, out_accent, out_lens, __ = model(inputs, inputs_lens)
132 |
133 |
134 | if accents is None or len(model._meta['accents_dict']) > max(accents) + 1: # Check if we are testing a model with different accents
135 | loss, loss_text, loss_accent = get_mixed_loss(criterion, out_text, out_accent,
136 | out_lens, accents, transcripts,
137 | transcripts_lens, losses_mix)
138 | else: # in that case we do not care about the loss, section to refactor.
139 | loss, loss_text, loss_accent = torch.tensor([-1]), torch.tensor([-1]), torch.tensor([-1])
140 |
141 | if out_text is not None:
142 | wer = check_wer(transcripts, transcripts_lens,
143 | out_text, out_lens, decoder, target_decoder)
144 | else:
145 | wer = None
146 |
147 | if out_accent is not None:
148 | accent_acc = check_acc(accents, out_accent)
149 | else:
150 | accent_acc = None
151 |
152 | l = loss.clone().item() if loss is not None else None
153 | lt = loss_text.clone().item() if loss_text is not None else None
154 | la = loss_accent.clone().item() if loss_accent is not None else None
155 | epoch_losses.append(l)
156 | epoch_losses_text.append(lt)
157 | epoch_losses_accent.append(la)
158 |
159 | epoch_wers.append(wer)
160 | epoch_accent_accs.append(accent_acc)
161 |
162 |
163 |
164 |
165 | average_loss = lambda l: sum(l) / len(test_loader) if l[0] is not None else -1
166 |
167 | epoch_loss = average_loss(epoch_losses)
168 | epoch_loss_text = average_loss(epoch_losses_text)
169 | epoch_loss_accent = average_loss(epoch_losses_accent)
170 |
171 | epoch_wer = average_loss(epoch_wers)
172 | epoch_accent_acc = average_loss(epoch_accent_accs)
173 |
174 | return epoch_loss, epoch_loss_text, epoch_loss_accent, epoch_wer, epoch_accent_acc
--------------------------------------------------------------------------------
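get_mixed_loss above combines the two task losses as mix * loss_text + (1 - mix) * loss_accent * corrective_coef, with corrective_coef defaulting to 1000 to bring the accent cross-entropy onto the same scale as the CTC loss. For example, with losses_mix = 0.9 (the value in default_config.cfg), a CTC loss of 120 and an accent loss of 0.05 give 0.9 * 120 + 0.1 * 0.05 * 1000 = 113.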
/dataloader.py:
--------------------------------------------------------------------------------
1 | import json
2 |
3 | import torch
4 | import torch.nn as nn
5 | import numpy as np
6 | from utils import tile
7 | from torch.utils.data import DataLoader, Dataset
8 |
9 |
10 | ### DATASET
11 |
12 | class MultiDataset(Dataset):
13 | """Defines an iterator over the dataset. This class is intended to be used with
14 | the MultiDataLoader class."""
15 |
16 | def __init__(self, manifest, labels, manifest_separator=',',
17 | use_mfcc_in=True, use_ivectors_in=False, use_embeddings_in=False,
18 | embedding_size=100, use_transcripts_out=True, use_accents_out=False):
19 | """
20 | Allows choosing which features are used as inputs and which outputs are produced.
21 | At least one input and one output are needed.
22 | Default configuration is regular MFCCs to text.
23 |
24 | The manifest should be a CSV file with the following row for each sample:
25 | mfcc_path, ivector_path, embedding_path, transcripts_path, accent_label
26 | (Columns can remain empty if not used, but must be present.)
27 |
28 | Scripts to create the database and manifest from audio and text are in the scripts folder.
29 | """
30 |
31 | assert(any([use_mfcc_in, use_ivectors_in, use_embeddings_in])), 'MultiDataset config needs at least one input set to True'
32 | assert(any([use_transcripts_out, use_accents_out])), 'MultiDataset config needs at least one output set to True'
33 | assert(not use_transcripts_out or use_mfcc_in), 'Can’t do speech to text without mfcc.'
34 |
35 | super(MultiDataset, self).__init__()
36 |
37 | self.config = {}
38 | self.config['use_mfcc_in']=use_mfcc_in
39 | self.config['use_ivectors_in']=use_ivectors_in
40 | self.config['use_embeddings_in']=use_embeddings_in
41 | self.config['embedding_size']=embedding_size
42 | self.config['use_transcripts_out']=use_transcripts_out
43 | self.config['use_accents_out']=use_accents_out
44 |
45 | self.labels_map = dict([(labels[i], i) for i in range(len(labels))])
46 |
47 | with open(manifest) as f:
48 | self.samples = [x.strip().split(manifest_separator) for x in f.readlines()]
49 |
50 | self.accent_dict = self.make_accent_dict(self.samples)
51 |
52 | def __getitem__(self, index):
53 | """Unused features are set to None for the Dataloader. Returns torch tensors."""
54 | mfcc_path, ivector_path, embedding_path, transcript_path, accent_label = self.samples[index]
55 | mfcc, ivector, embedding, parsed_transcript, accent = None, None, None, None, None
56 |
57 | def load_array(path):
58 | with open(path) as f:
59 | array = json.load(f)
60 | return torch.FloatTensor(array)
61 |
62 | # Inputs
63 | if self.config['use_mfcc_in']:
64 | mfcc = load_array(mfcc_path)
65 |
66 | if self.config['use_ivectors_in']:
67 | ivector = load_array(ivector_path)
68 |
69 | if self.config['use_embeddings_in']:
70 | new_embedding_path = []
71 | for split in embedding_path.split('/'):
72 | new = split if 'embedding' not in split else ''.join([split, '_', str(self.config['embedding_size'])])
73 | new_embedding_path.append(new)
74 | new_embedding_path = '/'.join(new_embedding_path)
75 | embedding = torch.load(new_embedding_path, map_location=lambda storage, loc: storage)
76 | # map_location and loc are there to load the embedding on the CPU
77 |
78 | # Outputs
79 | if self.config['use_transcripts_out']:
80 | parsed_transcript = self.parse_transcript(transcript_path)
81 |
82 | if self.config['use_accents_out']:
83 | accent = self.accent_dict[accent_label]
84 | accent = torch.LongTensor([accent])
85 |
86 | return mfcc, ivector, embedding, parsed_transcript, accent
87 |
88 |
89 | def parse_transcript(self, transcript_path):
90 | """Maps a text to integers using the given labels_map."""
91 |
92 | with open(transcript_path, 'r', encoding='utf8') as transcript_file:
93 | transcript = transcript_file.read().replace('\n', '')
94 |
95 | transcript = list(filter(None, [self.labels_map.get(x) for x in list(transcript)]))
96 | return transcript
97 |
98 | def __len__(self):
99 | return len(self.samples)
100 |
101 | @staticmethod
102 | def make_accent_dict(samples):
103 | acc_set = set()
104 | for __, __, __, __, accent in samples:
105 | acc_set.add(accent)
106 | enum = enumerate(sorted(acc_set)) # sorted set for consistent results
107 | return {acc: i for i, acc in enum}
108 |
109 |
110 | ### DATALOADER
111 |
112 | # Collate function for the MultiDataLoader
113 | def collate_fn(batch):
114 | """This function takes list of samples and assembles a batch.
115 | It is intended to used in PyTorch DataLoader."""
116 |
117 | mfccs, ivectors, embeddings, transcripts, accents = list(zip(*batch))
118 |
119 | def exists(list_):
120 | """Checks if we are not getting a list of None"""
121 | return list_[0] is not None
122 |
123 | ## Lens
124 | if exists(mfccs):
125 | inputs_lens = torch.IntTensor([len(m) for m in mfccs])
126 | elif exists(ivectors):
127 | inputs_lens = torch.IntTensor([len(i) for i in ivectors])
128 | else:
129 | inputs_lens = torch.IntTensor([1] * len(batch))
130 |
131 | # Sorting order (needs to be descending in lens for the padder)
132 | inputs_lens, sorted_idx = inputs_lens.sort(descending=True)
133 |
134 | if exists(transcripts):
135 | transcripts_lens = torch.IntTensor([len(t) for t in transcripts])
136 | transcripts_lens = transcripts_lens[sorted_idx]
137 | else:
138 | transcripts_lens = None
139 |
140 | ## Inputs
141 | inputs = []
142 | if exists(mfccs):
143 | inputs.append(nn.utils.rnn.pad_sequence(mfccs, batch_first=True))
144 |
145 | if exists(ivectors):
146 | ivect = nn.utils.rnn.pad_sequence(ivectors, batch_first=True)
147 | if exists(mfccs): # The ivector resolution is 10 times lower than the mfccs', so we expand them.
148 | ivect = tile(ivect, 1, 10)
149 | ivect = ivect[:, :inputs[0].size(1), :]
150 | inputs.append(ivect)
151 |
152 | if exists(embeddings):
153 | emb = torch.cat(embeddings)
154 | emb = emb.view(emb.size(0), 1, emb.size(1))
155 | if exists(mfccs) or exists(ivectors):
156 | # tile embeddings to fit either mfccs or ivectors size if they are present
157 | emb = tile(emb, 1, inputs[0].size(1))
158 | inputs.append(emb)
159 |
160 | inputs = torch.cat(inputs, dim=2)
161 | inputs = inputs[sorted_idx]
162 |
163 | ## Outputs
164 | if exists(transcripts):
165 | if inputs.size(0) == 1: # bugfix for when only one sample
166 | transcripts = [transcripts]
167 | transcripts = np.asarray(transcripts)[sorted_idx] # dtype=object because some transcripts were loaded with wrong type (Int64). TODO fix.
168 | transcripts = torch.IntTensor([t for trs in transcripts for t in trs])
169 | # we need text targets as one concatenated vector
170 |
171 | if exists(accents):
172 | accents = torch.cat(accents)[sorted_idx]
173 | else:
174 | accents = None
175 |
176 | return inputs, inputs_lens, transcripts, transcripts_lens, accents
177 |
178 | class MultiDataLoader(DataLoader):
179 | def __init__(self, *args, **kwargs):
180 | """
181 | Creates a data loader for SpeechDatasets.
182 | """
183 | super(MultiDataLoader, self).__init__(*args, **kwargs)
184 | self.collate_fn = collate_fn
--------------------------------------------------------------------------------
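The manifest consumed by MultiDataset is a plain CSV with one row per utterance (mfcc_path,ivector_path,embedding_path,transcript_path,accent_label), and the dataset is wrapped in a MultiDataLoader so that collate_fn above assembles the padded batches. A minimal usage sketch (manifest path and label string taken from default_config.cfg; assumes the CommonVoice data has been prepared):

    from dataloader import MultiDataset, MultiDataLoader

    dataset = MultiDataset('./data/CommonVoice_dataset/splits/train.csv',
                           labels="_'ABCDEFGHIJKLMNOPQRSTUVWXYZ ",
                           use_mfcc_in=True, use_accents_out=True)
    loader = MultiDataLoader(dataset, batch_size=40, shuffle=True, num_workers=4)
    inputs, inputs_lens, transcripts, transcripts_lens, accents = next(iter(loader))
    # inputs: (batch, max_frames, 40 MFCCs); accents: LongTensor of accent indices
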
/decoder.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # ----------------------------------------------------------------------------
3 | # Copyright 2015-2016 Nervana Systems Inc.
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | # ----------------------------------------------------------------------------
16 | # Modified to support pytorch Tensors
17 |
18 | import Levenshtein as Lev
19 | import torch
20 | from six.moves import xrange
21 |
22 |
23 | class Decoder(object):
24 | """
25 | Basic decoder class from which all other decoders inherit. Implements several
26 | helper functions. Subclasses should implement the decode() method.
27 |
28 | Arguments:
29 | labels (string): mapping from integers to characters.
30 | blank_index (int, optional): index for the blank '_' character. Defaults to 0.
31 | space_index (int, optional): index for the space ' ' character. Defaults to 28.
32 | """
33 |
34 | def __init__(self, labels, blank_index=0):
35 | # e.g. labels = "_'ABCDEFGHIJKLMNOPQRSTUVWXYZ#"
36 | self.labels = labels
37 | self.int_to_char = dict([(i, c) for (i, c) in enumerate(labels)])
38 | self.blank_index = blank_index
39 | space_index = len(labels) # To prevent errors in decode, we add an out of bounds index for the space
40 | if ' ' in labels:
41 | space_index = labels.index(' ')
42 | self.space_index = space_index
43 |
44 | def wer(self, s1, s2):
45 | """
46 | Computes the Word Error Rate, defined as the edit distance between the
47 | two provided sentences after tokenizing to words.
48 | Arguments:
49 | s1 (string): space-separated sentence
50 | s2 (string): space-separated sentence
51 | """
52 |
53 | # build mapping of words to integers
54 | b = set(s1.split() + s2.split())
55 | word2char = dict(zip(b, range(len(b))))
56 |
57 | # map the words to a char array (Levenshtein packages only accepts
58 | # strings)
59 | w1 = [chr(word2char[w]) for w in s1.split()]
60 | w2 = [chr(word2char[w]) for w in s2.split()]
61 |
62 | return Lev.distance(''.join(w1), ''.join(w2))
63 |
64 | def cer(self, s1, s2):
65 | """
66 | Computes the Character Error Rate, defined as the edit distance.
67 |
68 | Arguments:
69 | s1 (string): space-separated sentence
70 | s2 (string): space-separated sentence
71 | """
72 | s1, s2 = s1.replace(' ', ''), s2.replace(' ', '')
73 | return Lev.distance(s1, s2)
74 |
75 | def decode(self, probs, sizes=None):
76 | """
77 | Given a matrix of character probabilities, returns the decoder's
78 | best guess of the transcription
79 |
80 | Arguments:
81 | probs: Tensor of character probabilities, where probs[c,t]
82 | is the probability of character c at time t
83 | sizes(optional): Size of each sequence in the mini-batch
84 | Returns:
85 | string: sequence of the model's best guess for the transcription
86 | """
87 | raise NotImplementedError
88 |
89 |
90 | class BeamCTCDecoder(Decoder):
91 | def __init__(self, labels, lm_path=None, alpha=0, beta=0, cutoff_top_n=40, cutoff_prob=1.0, beam_width=100,
92 | num_processes=4, blank_index=0):
93 | super(BeamCTCDecoder, self).__init__(labels)
94 | try:
95 | from ctcdecode import CTCBeamDecoder
96 | except ImportError:
97 | raise ImportError("BeamCTCDecoder requires paddledecoder package.")
98 |
99 | #labels = labels.replace("'", "a") # TODO fix that
100 | self._decoder = CTCBeamDecoder(labels, lm_path, alpha, beta, cutoff_top_n, cutoff_prob, beam_width,
101 | num_processes, blank_index)
102 |
103 | def convert_to_strings(self, out, seq_len):
104 | results = []
105 | for b, batch in enumerate(out):
106 | utterances = []
107 | for p, utt in enumerate(batch):
108 | size = seq_len[b][p]
109 | if size > 0:
110 | transcript = ''.join(map(lambda x: self.int_to_char[x.item()], utt[0:size]))
111 | else:
112 | transcript = ''
113 | utterances.append(transcript)
114 | results.append(utterances)
115 | return results
116 |
117 | def convert_tensor(self, offsets, sizes):
118 | results = []
119 | for b, batch in enumerate(offsets):
120 | utterances = []
121 | for p, utt in enumerate(batch):
122 | size = sizes[b][p]
123 | if sizes[b][p] > 0:
124 | utterances.append(utt[0:size])
125 | else:
126 | utterances.append(torch.tensor([], dtype=torch.int))
127 | results.append(utterances)
128 | return results
129 |
130 | def decode(self, probs, sizes=None):
131 | """
132 | Decodes probability output using ctcdecode package.
133 | Arguments:
134 | probs: Tensor of character probabilities, where probs[c,t]
135 | is the probability of character c at time t
136 | sizes: Size of each sequence in the mini-batch
137 | Returns:
138 | string: sequences of the model's best guess for the transcription
139 | """
140 | probs = probs.cpu()
141 | out, scores, offsets, seq_lens = self._decoder.decode(probs, sizes)
142 | strings = self.convert_to_strings(out, seq_lens)
143 | offsets = self.convert_tensor(offsets, seq_lens)
144 | return strings, offsets
145 |
146 |
147 | class GreedyDecoder(Decoder):
148 | def __init__(self, labels, blank_index=0):
149 | super(GreedyDecoder, self).__init__(labels, blank_index)
150 |
151 | def convert_to_strings(self, sequences, sizes=None, remove_repetitions=False, return_offsets=False):
152 | """Given a list of numeric sequences, returns the corresponding strings"""
153 | strings = []
154 | offsets = [] if return_offsets else None
155 | for x in xrange(len(sequences)):
156 | seq_len = sizes[x] if sizes is not None else len(sequences[x])
157 | string, string_offsets = self.process_string(sequences[x], seq_len, remove_repetitions)
158 | strings.append([string]) # We only return one path
159 | if return_offsets:
160 | offsets.append([string_offsets])
161 | if return_offsets:
162 | return strings, offsets
163 | else:
164 | return strings
165 |
166 | def process_string(self, sequence, size, remove_repetitions=False):
167 | string = ''
168 | offsets = []
169 | for i in range(size):
170 | char = self.int_to_char[sequence[i].item()]
171 | if char != self.int_to_char[self.blank_index]:
172 | # if this char is a repetition and remove_repetitions=true, then skip
173 | if remove_repetitions and i != 0 and char == self.int_to_char[sequence[i - 1].item()]:
174 | pass
175 | elif char == self.labels[self.space_index]:
176 | string += ' '
177 | offsets.append(i)
178 | else:
179 | string = string + char
180 | offsets.append(i)
181 | return string, torch.tensor(offsets, dtype=torch.int)
182 |
183 | def decode(self, probs, sizes=None):
184 | """
185 | Returns the argmax decoding given the probability matrix. Removes
186 | repeated elements in the sequence, as well as blanks.
187 |
188 | Arguments:
189 | probs: Tensor of character probabilities from the network. Expected shape of batch x seq_length x output_dim
190 | sizes(optional): Size of each sequence in the mini-batch
191 | Returns:
192 | strings: sequences of the model's best guess for the transcription on inputs
193 | offsets: time step per character predicted
194 | """
195 | _, max_probs = torch.max(probs, 2)
196 | strings, offsets = self.convert_to_strings(max_probs.view(max_probs.size(0), max_probs.size(1)), sizes,
197 | remove_repetitions=True, return_offsets=True)
198 | return strings, offsets
199 |
--------------------------------------------------------------------------------
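Decoder.wer above maps every distinct word to a single character and takes the Levenshtein distance between the resulting strings, so it counts whole-word substitutions, insertions and deletions: for example wer('turn on the light', 'turn off the light') is 1. check_wer in training.py then divides this count by the number of reference words, averages over the batch, and reports the result as a percentage.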
/data/language_models/lnn_tri.lm:
--------------------------------------------------------------------------------
1 |
2 | \data\
3 | ngram 1=172
4 | ngram 2=289
5 | ngram 3=6
6 |
7 | \1-grams:
8 | -0.8200597
9 | -99 -0.3555717
10 | -2.2266 A -0.04052078
11 | -2.52763 AIR -0.04181526
12 | -2.52763 ALARM -0.03661407
13 | -2.52763 ALEXA 0.02815799
14 | -2.2266 AM -0.03922245
15 | -2.52763 AMAZON -0.04052079
16 | -2.2266 AND -0.02198101
17 | -2.52763 ANECHOIC -0.04181526
18 | -2.52763 AT -0.04181526
19 | -2.52763 BANGALORE 0.02815799
20 | -2.52763 BEFORE -0.03661407
21 | -2.52763 BIEBER 0.02815799
22 | -2.52763 BLACK 0.02815799
23 | -2.52763 BLAKE -0.04181526
24 | -2.52763 BLUE -0.04181526
25 | -2.52763 BOWIE 0.02815799
26 | -1.573387 BY -0.03134982
27 | -2.52763 CALENDAR -0.04052079
28 | -2.52763 CALIBRATE -0.02333186
29 | -2.52763 CALL -0.04181526
30 | -2.52763 CAME -0.04181526
31 | -2.52763 CAMERAS -0.03002374
32 | -2.52763 CARIBBEAN 0.02815799
33 | -2.52763 CHAMBER -0.03661407
34 | -2.52763 CHRIS -0.04181526
35 | -2.52763 CONDITIONING -0.04052079
36 | -2.52763 DAFT -0.04181526
37 | -2.52763 DAVID -0.04181526
38 | -2.52763 DINING -0.03792022
39 | -2.52763 DO -0.03661407
40 | -2.52763 DOOR -0.04181526
41 | -2.2266 DOWN -0.04550482
42 | -2.52763 DRAKE 0.02815799
43 | -2.52763 DRIVE -0.02869358
44 | -2.52763 EIGHT -0.04181526
45 | -2.52763 EPISODE -0.03792022
46 | -2.52763 ESPN 0.02815799
47 | -2.52763 FALL -0.03792022
48 | -2.52763 FIRST -0.04181526
49 | -2.52763 FIVE 0.02815799
50 | -2.52763 FLOOR -0.04181526
51 | -2.52763 FOLLOWING -0.04181526
52 | -1.82866 FOR -0.03267188
53 | -2.52763 FORGET -0.03134983
54 | -2.2266 FROM -0.03922245
55 | -2.52763 FRONT -0.04181526
56 | -2.52763 GARAGE -0.04181526
57 | -2.52763 GENIE 0.02815799
58 | -2.52763 GET -0.04181526
59 | -2.52763 GOING -0.02869358
60 | -2.050509 GOOGLE -0.01648962
61 | -2.52763 HELLO -0.04181526
62 | -2.52763 HERE -0.02869358
63 | -1.82866 HEY -0.03398992
64 | -2.52763 HI -0.03661407
65 | -2.52763 HOME 0.02815799
66 | -2.52763 HOURS -0.04052079
67 | -2.52763 HOW -0.04181526
68 | -1.82866 I -0.06459835
69 | -1.92557 IN -0.03530397
70 | -2.52763 INTENDED -0.02869358
71 | -1.82866 IS -0.04451953
72 | -2.050509 IT -0.02198102
73 | -2.52763 ITUNES 0.02815799
74 | -2.52763 JUSTIN -0.04181526
75 | -2.52763 KIDS 0.02815799
76 | -2.52763 LAMP 0.02815799
77 | -2.52763 LEAVE -0.02869358
78 | -2.52763 LIGHT -0.02869358
79 | -2.52763 LIGHTS 0.02815799
80 | -2.52763 LIKE -0.04052079
81 | -2.050509 LIVING -0.9322866
82 | -2.52763 LOGI 0.02815799
83 | -2.52763 LOGITECH 0.02815799
84 | -2.52763 LOGITECH'S -0.04181526
85 | -2.52763 LONG -0.04181526
86 | -2.52763 LOVE -0.04181526
87 | -2.52763 LUCKY -0.03134983
88 | -2.52763 MAKE -0.03922245
89 | -2.52763 MARS 0.02815799
90 | -2.2266 ME -0.02735934
91 | -2.52763 MEETING 0.02815799
92 | -2.52763 MINUTES 0.02815799
93 | -2.52763 MOM -0.04181526
94 | -2.2266 MUSIC -0.04550482
95 | -2.52763 MUTE 0.02815799
96 | -1.82866 MY -0.03267188
97 | -2.2266 NAME -0.03002373
98 | -2.52763 NEED -0.02869358
99 | -2.52763 NEW -0.04181526
100 | -2.52763 NEXT -0.04181526
101 | -2.2266 NOW 0.02967916
102 | -2.52763 NPR -0.03002374
103 | -2.52763 ODB 0.02815799
104 | -1.92557 OF -0.05589744
105 | -2.52763 OK -0.03922245
106 | -1.52763 ON -0.08432172
107 | -2.52763 ONE -0.04181526
108 | -2.52763 ONLY -0.04181526
109 | -2.52763 OPEN 0.02815799
110 | -2.52763 ORANGE -0.03661407
111 | -2.52763 OUT -0.03134983
112 | -2.52763 PANDORA 0.02815799
113 | -2.52763 PART -0.03792022
114 | -2.52763 PAUSE 0.02815799
115 | -2.52763 PHRASES 0.02815799
116 | -2.52763 PICK -0.04052079
117 | -2.52763 PILOTS 0.02815799
118 | -2.52763 PIRATES -0.03792022
119 | -2.52763 PLATTEN 0.02815799
120 | -1.448449 PLAY 0.06955066
121 | -2.52763 PM -0.04181526
122 | -2.52763 PUNK 0.02815799
123 | -2.52763 RACHEL -0.04181526
124 | -2.2266 RECOGNITION -0.02735934
125 | -2.52763 RECORDING -0.04052079
126 | -2.52763 RED 0.02815799
127 | -2.52763 REMINDER -0.03661407
128 | -2.52763 REWIND -0.04052079
129 | -2.52763 RIGHT -0.03922245
130 | -2.52763 RISE -0.04052079
131 | -1.92557 ROOM -0.03792021
132 | -2.52763 SAY -0.02333186
133 | -2.52763 SCRIPT -0.03661407
134 | -2.52763 SEARCH -0.03661407
135 | -2.52763 SECONDS 0.02815799
136 | -1.92557 SET -0.07057546
137 | -2.52763 SHELTON 0.02815799
138 | -2.52763 SIRI 0.02815799
139 | -2.52763 SIXTEEN -0.03134983
140 | -2.52763 SOMETHING -0.03134983
141 | -2.52763 SONOS 0.02815799
142 | -2.52763 SOUNDCLOUD 0.02815799
143 | -2.52763 SPIDERS -0.04052079
144 | -2.52763 SPOTIFY 0.02815799
145 | -2.52763 STAND -0.03134983
146 | -2.52763 STAPELTON 0.02815799
147 | -2.52763 STARDUST -0.04052079
148 | -2.52763 START -0.03661407
149 | -2.52763 STATE -0.04181526
150 | -2.52763 STOP 0.02815799
151 | -2.52763 STRESSED -0.04181526
152 | -2.52763 SUMMER -0.04181526
153 | -2.52763 TAKE -0.04052079
154 | -2.2266 TEN -0.04052078
155 | -2.52763 TESTING -0.04052079
156 | -1.351539 THE -0.02839587
157 | -2.2266 THIS -0.03661406
158 | -2.2266 TIME -0.03530397
159 | -2.52763 TIMER -0.03661407
160 | -1.486237 TO -0.03582891
161 | -2.2266 TODAY -0.04550482
162 | -2.52763 TRAVELER -0.03134983
163 | -1.82866 TURN -0.3301536
164 | -2.52763 TWENTY -0.04181526
165 | -2.52763 TWO -0.04181526
166 | -2.2266 UP 0.05155473
167 | -2.050509 VOICE -0.08691774
168 | -2.050509 VOLUME -0.03661407
169 | -2.52763 WANT -0.02869358
170 | -2.2266 WATCH -0.02198101
171 | -2.52763 WEATHER -0.04181526
172 | -2.2266 WHAT -0.1141836
173 | -2.2266 WHAT'S -0.009630572
174 | -2.52763 WILL -0.03922245
175 | -2.52763 WORK 0.02815799
176 | -2.52763 YOU -0.03134983
177 | -2.52763 YOUR -0.04052079
178 | -2.52763 YOURSELF -0.03134983
179 | -2.52763 ZIGGY -0.04181526
180 |
181 | \2-grams:
182 | -2.732193 CALL
183 | -2.732193 HELLO
184 | -1.0086 HEY
185 | -2.732193 HI
186 | -2.732193 HOW
187 | -2.732193 I
188 | -2.034075 IN
189 | -2.732193 IS
190 | -2.732193 MUTE
191 | -2.732193 OK
192 | -1.10551 ON
193 | -2.732193 PAUSE
194 | -0.6283889 PLAY
195 | -2.732193 REWIND
196 | -2.732193 SEARCH
197 | -1.10551 SET -0.1476935
198 | -2.732193 STOP
199 | -1.0086 TURN -0.06938255
200 | -2.034075 VOLUME
201 | -2.732193 WATCH
202 | -2.034075 WHAT -0.475604
203 | -2.034075 WHAT'S
204 | -1.325652 A REMINDER
205 | -1.325652 A TIMER
206 | -1.024622 AIR CONDITIONING
207 | -1.024622 ALARM FOR
208 | -1.024622 ALEXA
209 | -1.325652 AM NOW
210 | -1.325652 AM RECORDING
211 | -1.024622 AMAZON MUSIC
212 | -1.325652 AND FALL
213 | -1.325652 AND THE
214 | -1.024622 ANECHOIC CHAMBER
215 | -1.024622 AT HOME
216 | -1.024622 BANGALORE
217 | -1.024622 BEFORE I
218 | -1.024622 BIEBER
219 | -1.024622 BLACK
220 | -1.024622 BLAKE SHELTON
221 | -1.024622 BLUE GENIE
222 | -1.024622 BOWIE
223 | -1.978865 BY BLAKE
224 | -1.978865 BY CHRIS
225 | -1.978865 BY DAFT
226 | -1.978865 BY DRAKE
227 | -1.978865 BY JUSTIN
228 | -1.978865 BY ODB
229 | -1.978865 BY RACHEL
230 | -1.978865 BY TWENTY
231 | -1.978865 BY YOU
232 | -1.024622 CALENDAR TODAY
233 | -1.024622 CALIBRATE THE
234 | -1.024622 CALL MOM
235 | -1.024622 CAME HERE
236 | -1.024622 CAMERAS ON
237 | -1.024622 CARIBBEAN
238 | -1.024622 CHAMBER FOR
239 | -1.024622 CHRIS STAPELTON
240 | -1.024622 CONDITIONING DOWN
241 | -1.024622 DAFT PUNK
242 | -1.024622 DAVID BOWIE
243 | -1.024622 DINING ROOM
244 | -1.024622 DO I
245 | -1.024622 DOOR OPEN
246 | -0.6275349 DOWN
247 | -1.024622 DRAKE
248 | -1.024622 DRIVE TO
249 | -1.024622 EIGHT HOURS
250 | -1.024622 EPISODE OF
251 | -1.024622 ESPN
252 | -1.024622 FALL OF
253 | -1.024622 FIRST MEETING
254 | -1.024622 FIVE
255 | -1.024622 FLOOR LAMP
256 | -1.024622 FOLLOWING PHRASES
257 | -1.723593 FOR EIGHT
258 | -1.723593 FOR PIRATES
259 | -1.723593 FOR TEN
260 | -1.723593 FOR TWO
261 | -1.723593 FOR VOICE
262 | -1.024622 FORGET BY
263 | -1.325652 FROM MARS
264 | -1.325652 FROM NOW
265 | -1.024622 FRONT RIGHT
266 | -1.024622 GARAGE DOOR
267 | -1.024622 GENIE
268 | -1.024622 GET LUCKY
269 | -1.024622 GOING TO
270 | -0.8036261 GOOGLE
271 | -1.501744 GOOGLE MUSIC
272 | -1.024622 HELLO BLUE
273 | -1.024622 HERE TO
274 | -1.723593 HEY ALEXA
275 | -1.723593 HEY GOOGLE
276 | -1.723593 HEY LOGI
277 | -1.723593 HEY LOGITECH
278 | -1.723593 HEY SIRI
279 | -1.024622 HI MY
280 | -1.024622 HOME
281 | -1.024622 HOURS FROM
282 | -1.024622 HOW LONG
283 | -1.025475 I AM
284 | -1.723593 I NEED
285 | -1.723593 I START
286 | -1.723593 I WANT
287 | -1.626683 IN BANGALORE
288 | -1.626683 IN GOOGLE
289 | -1.626683 IN ITUNES
290 | -1.626683 IN LOGITECH'S
291 | -1.024622 INTENDED TO
292 | -1.723593 IS IT
293 | -1.723593 IS ONLY
294 | -1.723593 IS STATE
295 | -1.025475 IS THE
296 | -1.501744 IT IN
297 | -1.501744 IT TAKE
298 | -1.501744 IT TO
299 | -1.024622 ITUNES
300 | -1.024622 JUSTIN BIEBER
301 | -1.024622 KIDS
302 | -1.024622 LAMP
303 | -1.024622 LEAVE TO
304 | -1.024622 LIGHT TO
305 | -1.024622 LIGHTS
306 | -1.024622 LIKE TODAY
307 | -0.05329508 LIVING ROOM
308 | -1.024622 LOGI
309 | -1.024622 LOGITECH
310 | -1.024622 LOGITECH'S ANECHOIC
311 | -1.024622 LONG WILL
312 | -1.024622 LOVE YOURSELF
313 | -1.024622 LUCKY BY
314 | -1.024622 MAKE IT
315 | -1.024622 MARS
316 | -1.325652 ME SOMETHING
317 | -1.325652 ME TO
318 | -1.024622 MEETING
319 | -1.024622 MINUTES
320 | -1.024622 MOM AT
321 | -0.6275349 MUSIC
322 | -1.024622 MUTE
323 | -1.723593 MY CALENDAR
324 | -1.723593 MY DINING
325 | -1.723593 MY FIRST
326 | -1.723593 MY NAME
327 | -1.723593 MY VOICE
328 | -1.325652 NAME I
329 | -1.325652 NAME IS
330 | -1.024622 NEED TO
331 | -1.024622 NEW BLACK
332 | -1.024622 NEXT EPISODE
333 | -1.325652 NOW
334 | -1.325652 NOW GOING
335 | -1.024622 NPR ON
336 | -1.024622 ODB
337 | -1.626683 OF ORANGE
338 | -0.9285648 OF THE
339 | -1.626683 OF ZIGGY
340 | -1.024622 OK GOOGLE
341 | -2.024622 ON
342 | -2.024622 ON AMAZON
343 | -1.326505 ON MY
344 | -2.024622 ON PANDORA
345 | -2.024622 ON SOUNDCLOUD
346 | -2.024622 ON SPOTIFY
347 | -0.5761738 ON THE -0.2839059
348 | -1.024622 ONE PILOTS
349 | -1.024622 ONLY INTENDED
350 | -1.024622 OPEN
351 | -1.024622 ORANGE IS
352 | -1.024622 OUT BY
353 | -1.024622 PANDORA
354 | -1.024622 PART OF
355 | -1.024622 PAUSE
356 | -1.024622 PHRASES
357 | -1.024622 PICK UP
358 | -1.024622 PILOTS
359 | -1.024622 PIRATES OF
360 | -1.024622 PLATTEN
361 | -2.103804 PLAY
362 | -2.103804 PLAY CAME
363 | -2.103804 PLAY DAVID
364 | -2.103804 PLAY GET
365 | -2.103804 PLAY LOVE
366 | -2.103804 PLAY ME
367 | -2.103804 PLAY NPR
368 | -2.103804 PLAY STAND
369 | -2.103804 PLAY STRESSED
370 | -2.103804 PLAY SUMMER
371 | -2.103804 PLAY THE
372 | -2.103804 PLAY TRAVELER
373 | -1.024622 PM PICK
374 | -1.024622 PUNK
375 | -1.024622 RACHEL PLATTEN
376 | -1.325652 RECOGNITION TESTING
377 | -1.325652 RECOGNITION TO
378 | -1.024622 RECORDING THIS
379 | -1.024622 RED
380 | -1.024622 REMINDER FOR
381 | -1.024622 REWIND TEN
382 | -1.024622 RIGHT LIVING
383 | -1.024622 RISE AND
384 | -1.626683 ROOM FLOOR
385 | -1.626683 ROOM LIGHT
386 | -1.626683 ROOM LIGHTS
387 | -1.626683 ROOM SONOS
388 | -1.024622 SAY THE
389 | -1.024622 SCRIPT IS
390 | -1.024622 SEARCH FOR
391 | -1.024622 SECONDS
392 | -0.9285648 SET A
393 | -1.626683 SET ALARM
394 | -1.626683 SET VOLUME
395 | -1.024622 SHELTON
396 | -1.024622 SIRI
397 | -1.024622 SIXTEEN BY
398 | -1.024622 SOMETHING BY
399 | -1.024622 SONOS
400 | -1.024622 SOUNDCLOUD
401 | -1.024622 SPIDERS FROM
402 | -1.024622 SPOTIFY
403 | -1.024622 STAND BY
404 | -1.024622 STAPELTON
405 | -1.024622 STARDUST AND
406 | -1.024622 START I
407 | -1.024622 STATE YOUR
408 | -1.024622 STOP
409 | -1.024622 STRESSED OUT
410 | -1.024622 SUMMER SIXTEEN
411 | -1.024622 TAKE ME
412 | -1.325652 TEN MINUTES
413 | -1.325652 TEN SECONDS
414 | -1.024622 TESTING THIS
415 | -2.200714 THE AIR
416 | -2.200714 THE CARIBBEAN
417 | -2.200714 THE FOLLOWING
418 | -2.200714 THE FRONT
419 | -2.200714 THE GARAGE
420 | -2.200714 THE KIDS
421 | -1.502596 THE LIVING 0.3450996
422 | -2.200714 THE NEW
423 | -2.200714 THE NEXT
424 | -2.200714 THE RISE
425 | -2.200714 THE SCRIPT
426 | -2.200714 THE SPIDERS
427 | -2.200714 THE VOICE
428 | -2.200714 THE WEATHER
429 | -1.325652 THIS IN
430 | -1.325652 THIS PART
431 | -1.325652 TIME DO
432 | -1.325652 TIME IS
433 | -1.024622 TIMER FOR
434 | -2.066015 TO CALIBRATE
435 | -2.066015 TO DRIVE
436 | -2.066015 TO FORGET
437 | -2.066015 TO LEAVE
438 | -2.066015 TO MAKE
439 | -1.367898 TO MY
440 | -2.066015 TO RED
441 | -2.066015 TO SAY
442 | -2.066015 TO WATCH
443 | -2.066015 TO WORK
444 | -0.6275349 TODAY
445 | -1.024622 TRAVELER BY
446 | -1.723593 TURN CAMERAS
447 | -0.2751438 TURN ON -0.4681379
448 | -1.723593 TURN THE
449 | -1.024622 TWENTY ONE
450 | -1.024622 TWO PM
451 | -1.325652 UP
452 | -1.325652 UP THE
453 | -1.501744 VOICE BEFORE
454 | -0.8036261 VOICE RECOGNITION
455 | -1.501744 VOLUME DOWN
456 | -1.501744 VOLUME FIVE
457 | -1.501744 VOLUME UP
458 | -1.024622 WANT TO
459 | -1.325652 WATCH ESPN
460 | -1.325652 WATCH THE
461 | -1.024622 WEATHER LIKE
462 | -0.6275349 WHAT TIME
463 | -1.325652 WHAT'S ON
464 | -1.325652 WHAT'S THE
465 | -1.024622 WILL IT
466 | -1.024622 WORK
467 | -1.024622 YOU BY
468 | -1.024622 YOUR NAME
469 | -1.024622 YOURSELF BY
470 | -1.024622 ZIGGY STARDUST
471 |
472 | \3-grams:
473 | -0.1282164 THE LIVING ROOM
474 | -0.1249387 TURN ON THE
475 | -0.4292465 SET A
476 | -0.3043077 ON THE LIVING
477 | -0.2218488 TURN ON
478 | -0.1282164 WHAT TIME
479 |
480 | \end\
481 |
--------------------------------------------------------------------------------
/data/language_models/lnn_bi.lm:
--------------------------------------------------------------------------------
1 |
2 | \data\
3 | ngram 1=173
4 | ngram 2=289
5 |
6 | \1-grams:
7 | -0.7888443
8 | -99 -0.4177598
9 | -2.546674
10 | -2.462987 A -0.05261233
11 | -2.462987 AIR -0.05261233
12 | -2.462987 ALARM -0.05261234
13 | -2.462987 ALEXA -0.05261234
14 | -2.462987 AM -0.05261233
15 | -2.462987 AMAZON -0.05261234
16 | -2.160911 AND -0.05261233
17 | -2.462987 ANECHOIC -0.05261233
18 | -2.462987 AT -0.05261233
19 | -2.462987 BANGALORE -0.05261234
20 | -2.462987 BEFORE -0.05261234
21 | -2.462987 BIEBER -0.05261234
22 | -2.462987 BLACK -0.05261234
23 | -2.462987 BLAKE -0.05261233
24 | -2.462987 BLUE -0.05261233
25 | -2.462987 BOWIE -0.05261234
26 | -1.506887 BY -0.05261233
27 | -2.462987 CALENDAR -0.05261234
28 | -2.462987 CALIBRATE -0.05261233
29 | -2.462987 CALL -0.05261233
30 | -2.462987 CAME -0.05261233
31 | -2.462987 CAMERAS -0.05261234
32 | -2.462987 CARIBBEAN -0.05261234
33 | -2.462987 CHAMBER -0.05261234
34 | -2.462987 CHRIS -0.05261233
35 | -2.462987 CONDITIONING -0.05261234
36 | -2.462987 DAFT -0.05261233
37 | -2.462987 DAVID -0.05261233
38 | -2.462987 DINING -0.05261234
39 | -2.462987 DO -0.05261234
40 | -2.462987 DOOR -0.05261233
41 | -2.160911 DOWN -0.3536423
42 | -2.462987 DRAKE -0.05261234
43 | -2.462987 DRIVE -0.05261233
44 | -2.462987 EIGHT -0.05261233
45 | -2.462987 EPISODE -0.05261233
46 | -2.462987 ESPN -0.05261234
47 | -2.462987 FALL -0.05261233
48 | -2.462987 FIRST -0.05261233
49 | -2.462987 FIVE -0.05261234
50 | -2.462987 FLOOR -0.05261233
51 | -2.462987 FOLLOWING -0.05261233
52 | -1.762345 FOR -0.05261234
53 | -2.462987 FORGET -0.05261234
54 | -2.160911 FROM -0.05261233
55 | -2.462987 FRONT -0.05261233
56 | -2.462987 GARAGE -0.05261233
57 | -2.462987 GENIE -0.05261234
58 | -2.462987 GET -0.05261233
59 | -2.462987 GOING -0.05261233
60 | -1.984472 GOOGLE -0.2287036
61 | -2.462987 HELLO -0.05261233
62 | -2.462987 HERE -0.05261233
63 | -2.462987 HEY -0.05261234
64 | -2.462987 HI -0.05261233
65 | -2.462987 HOME -0.05261234
66 | -2.462987 HOURS -0.05261234
67 | -2.462987 HOW -0.05261233
68 | -1.762345 I -0.1495224
69 | -1.984472 IN -0.05261234
70 | -2.462987 INTENDED -0.05261233
71 | -1.762345 IS -0.1495224
72 | -1.984472 IT -0.05261235
73 | -2.462987 ITUNES -0.05261234
74 | -2.462987 JUSTIN -0.05261233
75 | -2.462987 KIDS -0.05261234
76 | -2.462987 LAMP -0.05261234
77 | -2.462987 LEAVE -0.05261233
78 | -2.462987 LIGHT -0.05261233
79 | -2.462987 LIGHTS -0.05261234
80 | -2.462987 LIKE -0.05261234
81 | -2.160911 LIVING -0.5297336
82 | -2.462987 LOGI -0.05261234
83 | -2.462987 LOGITECH -0.05261234
84 | -2.462987 LOGITECH'S -0.05261233
85 | -2.462987 LONG -0.05261233
86 | -2.462987 LOVE -0.05261233
87 | -2.462987 LUCKY -0.05261234
88 | -2.462987 MAKE -0.05261233
89 | -2.462987 MARS -0.05261234
90 | -2.160911 ME -0.05261233
91 | -2.462987 MEETING -0.05261234
92 | -2.462987 MINUTES -0.05261234
93 | -2.462987 MOM -0.05261233
94 | -2.160911 MUSIC -0.3536423
95 | -2.462987 MUTE -0.05261234
96 | -1.984472 MY -0.05261234
97 | -2.160911 NAME -0.05261233
98 | -2.462987 NEED -0.05261233
99 | -2.462987 NEW -0.05261233
100 | -2.462987 NEXT -0.05261233
101 | -2.160911 NOW -0.05261233
102 | -2.462987 NPR -0.05261234
103 | -2.462987 ODB -0.05261234
104 | -1.859359 OF -0.1775511
105 | -2.462987 OK -0.05261233
106 | -1.762345 ON -0.2075143
107 | -2.462987 ONE -0.05261233
108 | -2.462987 ONLY -0.05261233
109 | -2.462987 OPEN -0.05261234
110 | -2.462987 ORANGE -0.05261234
111 | -2.462987 OUT -0.05261234
112 | -2.462987 PANDORA -0.05261234
113 | -2.462987 PART -0.05261233
114 | -2.462987 PAUSE -0.05261234
115 | -2.462987 PHRASES -0.05261234
116 | -2.462987 PICK -0.05261234
117 | -2.462987 PILOTS -0.05261234
118 | -2.462987 PIRATES -0.05261233
119 | -2.462987 PLATTEN -0.05261234
120 | -2.462987 PLAY -0.05261233
121 | -2.462987 PM -0.05261233
122 | -2.462987 PUNK -0.05261234
123 | -2.462987 RACHEL -0.05261233
124 | -2.462987 RECOGNITION -0.05261233
125 | -2.462987 RECORDING -0.05261234
126 | -2.462987 RED -0.05261234
127 | -2.462987 REMINDER -0.05261234
128 | -2.462987 REWIND -0.05261234
129 | -2.462987 RIGHT -0.05261234
130 | -2.462987 RISE -0.05261234
131 | -2.160911 ROOM -0.05261234
132 | -2.462987 SAY -0.05261233
133 | -2.462987 SCRIPT -0.05261234
134 | -2.462987 SEARCH -0.05261234
135 | -2.462987 SECONDS -0.05261234
136 | -2.462987 SET -0.1775511
137 | -2.462987 SHELTON -0.05261234
138 | -2.462987 SIRI -0.05261234
139 | -2.462987 SIXTEEN -0.05261234
140 | -2.462987 SOMETHING -0.05261234
141 | -2.462987 SONOS -0.05261234
142 | -2.462987 SOUNDCLOUD -0.05261234
143 | -2.462987 SPIDERS -0.05261234
144 | -2.462987 SPOTIFY -0.05261234
145 | -2.462987 STAND -0.05261234
146 | -2.462987 STAPELTON -0.05261234
147 | -2.462987 STARDUST -0.05261234
148 | -2.462987 START -0.05261234
149 | -2.462987 STATE -0.05261233
150 | -2.462987 STOP -0.05261234
151 | -2.462987 STRESSED -0.05261233
152 | -2.462987 SUMMER -0.05261233
153 | -2.462987 TAKE -0.05261234
154 | -2.160911 TEN -0.05261233
155 | -2.462987 TESTING -0.05261234
156 | -1.419695 THE -0.08257556
157 | -2.160911 THIS -0.05261233
158 | -2.462987 TIME -0.05261233
159 | -2.462987 TIMER -0.05261234
160 | -1.419695 TO -0.09400501
161 | -2.160911 TODAY -0.3536423
162 | -2.462987 TRAVELER -0.05261234
163 | -2.462987 TURN -0.2744611
164 | -2.462987 TWENTY -0.05261233
165 | -2.462987 TWO -0.05261233
166 | -2.160911 UP -0.05261234
167 | -1.984472 VOICE -0.2287036
168 | -2.160911 VOLUME -0.05261234
169 | -2.462987 WANT -0.05261233
170 | -2.160911 WATCH -0.05261233
171 | -2.462987 WEATHER -0.05261233
172 | -2.462987 WHAT -0.3536424
173 | -2.462987 WHAT'S -0.05261233
174 | -2.462987 WILL -0.05261233
175 | -2.462987 WORK -0.05261234
176 | -2.462987 YOU -0.05261234
177 | -2.462987 YOUR -0.05261234
178 | -2.462987 YOURSELF -0.05261234
179 | -2.462987 ZIGGY -0.05261233
180 |
181 | \2-grams:
182 | -2.449389 CALL
183 | -2.449389 HELLO
184 | -1.086268 HEY
185 | -2.449389 HI
186 | -2.449389 HOW
187 | -2.053426 I
188 | -1.588285 IN
189 | -2.053426 IS
190 | -2.449389 MUTE
191 | -2.449389 OK
192 | -1.169629 ON
193 | -2.449389 PAUSE
194 | -0.6590814 PLAY
195 | -2.449389 REWIND
196 | -2.449389 SEARCH
197 | -1.204978 SET
198 | -2.449389 STOP
199 | -1.086268 TURN
200 | -1.61113 VOLUME
201 | -2.311984 WATCH
202 | -1.635243 WHAT
203 | -1.635243 WHAT'S
204 | -1.221142 A REMINDER
205 | -1.221142 A TIMER
206 | -0.9312775 AIR CONDITIONING
207 | -0.8880444 ALARM FOR
208 | -0.5881212 ALEXA
209 | -1.199537 AM NOW
210 | -1.221142 AM RECORDING
211 | -0.9200591 AMAZON MUSIC
212 | -1.221142 AND FALL
213 | -1.042144 AND THE
214 | -0.9312775 ANECHOIC CHAMBER
215 | -0.9312775 AT HOME
216 | -0.5881212 BANGALORE
217 | -0.8880444 BEFORE I
218 | -0.5881212 BIEBER
219 | -0.5881212 BLACK
220 | -0.9312775 BLAKE SHELTON
221 | -0.9312775 BLUE GENIE
222 | -0.5881212 BOWIE
223 | -1.803332 BY BLAKE
224 | -1.803332 BY CHRIS
225 | -1.803332 BY DAFT
226 | -1.803332 BY DRAKE
227 | -1.803332 BY JUSTIN
228 | -1.803332 BY ODB
229 | -1.803332 BY RACHEL
230 | -1.803332 BY TWENTY
231 | -1.803332 BY YOU
232 | -0.9200591 CALENDAR TODAY
233 | -0.8303289 CALIBRATE THE
234 | -0.9312775 CALL MOM
235 | -0.9312775 CAME HERE
236 | -0.8880444 CAMERAS ON
237 | -0.5881212 CARIBBEAN
238 | -0.8880444 CHAMBER FOR
239 | -0.9312775 CHRIS STAPELTON
240 | -0.9200591 CONDITIONING DOWN
241 | -0.9312775 DAFT PUNK
242 | -0.9312775 DAVID BOWIE
243 | -0.9200591 DINING ROOM
244 | -0.8880444 DO I
245 | -0.9312775 DOOR OPEN
246 | -0.2012962 DOWN
247 | -0.5881212 DRAKE
248 | -0.8303289 DRIVE TO
249 | -0.9312775 EIGHT HOURS
250 | -0.898456 EPISODE OF
251 | -0.5881212 ESPN
252 | -0.898456 FALL OF
253 | -0.9312775 FIRST MEETING
254 | -0.5881212 FIVE
255 | -0.9312775 FLOOR LAMP
256 | -0.9312775 FOLLOWING PHRASES
257 | -1.587212 FOR EIGHT
258 | -1.587212 FOR PIRATES
259 | -1.538578 FOR TEN
260 | -1.587212 FOR TWO
261 | -1.494846 FOR VOICE
262 | -0.8487282 FORGET BY
263 | -1.221142 FROM MARS
264 | -1.199537 FROM NOW
265 | -0.9312775 FRONT RIGHT
266 | -0.9312775 GARAGE DOOR
267 | -0.5881212 GENIE
268 | -0.9312775 GET LUCKY
269 | -0.8303289 GOING TO
270 | -0.330307 GOOGLE
271 | -1.375628 GOOGLE MUSIC
272 | -0.9312775 HELLO BLUE
273 | -0.8303289 HERE TO
274 | -1.587212 HEY ALEXA
275 | -1.494846 HEY GOOGLE
276 | -1.587212 HEY LOGI
277 | -1.587212 HEY LOGITECH
278 | -1.587212 HEY SIRI
279 | -0.9091232 HI MY
280 | -0.5881212 HOME
281 | -0.9200591 HOURS FROM
282 | -0.9312775 HOW LONG
283 | -0.6473172 I AM
284 | -1.597578 I NEED
285 | -1.597578 I START
286 | -1.597578 I WANT
287 | -1.500668 IN BANGALORE
288 | -1.423601 IN GOOGLE
289 | -1.500668 IN ITUNES
290 | -1.500668 IN LOGITECH'S
291 | -0.8303289 INTENDED TO
292 | -1.520511 IS IT
293 | -1.597578 IS ONLY
294 | -1.597578 IS STATE
295 | -0.6024376 IS THE
296 | -1.32594 IT IN
297 | -1.386348 IT TAKE
298 | -1.144261 IT TO
299 | -0.5881212 ITUNES
300 | -0.9312775 JUSTIN BIEBER
301 | -0.5881212 KIDS
302 | -0.5881212 LAMP
303 | -0.8303289 LEAVE TO
304 | -0.8303289 LIGHT TO
305 | -0.5881212 LIGHTS
306 | -0.9200591 LIKE TODAY
307 | -0.1507424 LIVING ROOM
308 | -0.5881212 LOGI
309 | -0.5881212 LOGITECH
310 | -0.9312775 LOGITECH'S ANECHOIC
311 | -0.9312775 LONG WILL
312 | -0.9312775 LOVE YOURSELF
313 | -0.8487282 LUCKY BY
314 | -0.9091232 MAKE IT
315 | -0.5881212 MARS
316 | -1.221142 ME SOMETHING
317 | -1.042144 ME TO
318 | -0.5881212 MEETING
319 | -0.5881212 MINUTES
320 | -0.9312775 MOM AT
321 | -0.2012962 MUSIC
322 | -0.5881212 MUTE
323 | -1.587212 MY CALENDAR
324 | -1.587212 MY DINING
325 | -1.587212 MY FIRST
326 | -1.538578 MY NAME
327 | -1.494846 MY VOICE
328 | -1.140505 NAME I
329 | -1.140505 NAME IS
330 | -0.8303289 NEED TO
331 | -0.9312775 NEW BLACK
332 | -0.9312775 NEXT EPISODE
333 | -0.6965729 NOW
334 | -1.221142 NOW GOING
335 | -0.8880444 NPR ON
336 | -0.5881212 ODB
337 | -1.511287 OF ORANGE
338 | -0.5174091 OF THE
339 | -1.511287 OF ZIGGY
340 | -0.9091232 OK GOOGLE
341 | -0.9498084 ON
342 | -1.868225 ON AMAZON
343 | -0.9287202 ON MY
344 | -1.868225 ON PANDORA
345 | -1.868225 ON SOUNDCLOUD
346 | -1.868225 ON SPOTIFY
347 | -0.6289269 ON THE
348 | -0.9312775 ONE PILOTS
349 | -0.9312775 ONLY INTENDED
350 | -0.5881212 OPEN
351 | -0.8880444 ORANGE IS
352 | -0.8487282 OUT BY
353 | -0.5881212 PANDORA
354 | -0.898456 PART OF
355 | -0.5881212 PAUSE
356 | -0.5881212 PHRASES
357 | -0.9200591 PICK UP
358 | -0.5881212 PILOTS
359 | -0.898456 PIRATES OF
360 | -0.5881212 PLATTEN
361 | -0.8136998 PLAY
362 | -1.901061 PLAY CAME
363 | -1.901061 PLAY DAVID
364 | -1.901061 PLAY GET
365 | -1.901061 PLAY LOVE
366 | -1.806209 PLAY ME
367 | -1.901061 PLAY NPR
368 | -1.901061 PLAY STAND
369 | -1.901061 PLAY STRESSED
370 | -1.901061 PLAY SUMMER
371 | -1.364388 PLAY THE
372 | -1.901061 PLAY TRAVELER
373 | -0.9312775 PM PICK
374 | -0.5881212 PUNK
375 | -0.9312775 RACHEL PLATTEN
376 | -1.221142 RECOGNITION TESTING
377 | -1.042144 RECOGNITION TO
378 | -0.9200591 RECORDING THIS
379 | -0.5881212 RED
380 | -0.8880444 REMINDER FOR
381 | -0.9200591 REWIND TEN
382 | -0.9200591 RIGHT LIVING
383 | -0.9200591 RISE AND
384 | -1.500668 ROOM FLOOR
385 | -1.500668 ROOM LIGHT
386 | -1.500668 ROOM LIGHTS
387 | -1.500668 ROOM SONOS
388 | -0.8303289 SAY THE
389 | -0.8880444 SCRIPT IS
390 | -0.8880444 SEARCH FOR
391 | -0.5881212 SECONDS
392 | -0.5515851 SET A
393 | -1.511287 SET ALARM
394 | -1.480033 SET VOLUME
395 | -0.5881212 SHELTON
396 | -0.5881212 SIRI
397 | -0.8487282 SIXTEEN BY
398 | -0.8487282 SOMETHING BY
399 | -0.5881212 SONOS
400 | -0.5881212 SOUNDCLOUD
401 | -0.9200591 SPIDERS FROM
402 | -0.5881212 SPOTIFY
403 | -0.8487282 STAND BY
404 | -0.5881212 STAPELTON
405 | -0.9200591 STARDUST AND
406 | -0.8880444 START I
407 | -0.9312775 STATE YOUR
408 | -0.5881212 STOP
409 | -0.9312775 STRESSED OUT
410 | -0.9312775 SUMMER SIXTEEN
411 | -0.9200591 TAKE ME
412 | -1.221142 TEN MINUTES
413 | -1.221142 TEN SECONDS
414 | -0.9200591 TESTING THIS
415 | -1.980735 THE AIR
416 | -1.980735 THE CARIBBEAN
417 | -1.980735 THE FOLLOWING
418 | -1.980735 THE FRONT
419 | -1.980735 THE GARAGE
420 | -1.980735 THE KIDS
421 | -1.097011 THE LIVING
422 | -1.980735 THE NEW
423 | -1.980735 THE NEXT
424 | -1.980735 THE RISE
425 | -1.980735 THE SCRIPT
426 | -1.980735 THE SPIDERS
427 | -1.791137 THE VOICE
428 | -1.980735 THE WEATHER
429 | -1.178955 THIS IN
430 | -1.221142 THIS PART
431 | -1.221142 TIME DO
432 | -1.140505 TIME IS
433 | -0.8880444 TIMER FOR
434 | -1.881221 TO CALIBRATE
435 | -1.881221 TO DRIVE
436 | -1.881221 TO FORGET
437 | -1.881221 TO LEAVE
438 | -1.881221 TO MAKE
439 | -0.960078 TO MY
440 | -1.881221 TO RED
441 | -1.881221 TO SAY
442 | -1.797722 TO WATCH
443 | -1.881221 TO WORK
444 | -0.2012962 TODAY
445 | -0.8487282 TRAVELER BY
446 | -1.608197 TURN CAMERAS
447 | -0.36451 TURN ON
448 | -1.36611 TURN THE
449 | -0.9312775 TWENTY ONE
450 | -0.9312775 TWO PM
451 | -0.6965729 UP
452 | -1.042144 UP THE
453 | -1.397233 VOICE BEFORE
454 | -0.4278275 VOICE RECOGNITION
455 | -1.355094 VOLUME DOWN
456 | -1.386348 VOLUME FIVE
457 | -1.355094 VOLUME UP
458 | -0.8303289 WANT TO
459 | -1.221142 WATCH ESPN
460 | -1.042144 WATCH THE
461 | -0.9312775 WEATHER LIKE
462 | -0.2529206 WHAT TIME
463 | -1.140505 WHAT'S ON
464 | -1.042144 WHAT'S THE
465 | -0.9091232 WILL IT
466 | -0.5881212 WORK
467 | -0.8487282 YOU BY
468 | -0.9200591 YOUR NAME
469 | -0.8487282 YOURSELF BY
470 | -0.9312775 ZIGGY STARDUST
471 |
472 | \end\
473 |
--------------------------------------------------------------------------------
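
The two language-model files above are plain ARPA n-gram tables: each entry is a base-10 log probability, the n-gram itself, and (for orders below the maximum) a base-10 backoff weight. A minimal sketch of what those numbers mean when scoring a bigram, with a couple of entries hand-copied from lnn_bi.lm; this is illustration only, not how the beam-search decoder itself reads the file:

    # unigrams map WORD -> (log10 prob, log10 backoff); bigrams map (W1, W2) -> log10 prob
    unigrams = {'TURN': (-2.462987, -0.2744611), 'WEATHER': (-2.462987, -0.05261233)}
    bigrams = {('TURN', 'ON'): -0.36451}

    def bigram_logprob(w1, w2):
        """log10 P(w2 | w1): use the bigram entry if present, otherwise back off."""
        if (w1, w2) in bigrams:
            return bigrams[(w1, w2)]
        return unigrams[w1][1] + unigrams[w2][0]  # backoff(w1) + unigram(w2)

    print(10 ** bigram_logprob('TURN', 'ON'))       # seen bigram, ~0.43
    print(10 ** bigram_logprob('TURN', 'WEATHER'))  # unseen, ~0.0018 after backoff
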
/run_experiment.py:
--------------------------------------------------------------------------------
1 | from dataloader import MultiDataset, MultiDataLoader
2 | from focalloss import FocalLoss
3 | from warpctc_pytorch import CTCLoss
4 | from model import MultiTask
5 | from decoder import GreedyDecoder, BeamCTCDecoder
6 | from training import train, test
7 | import torch
8 | import torch.nn as nn
9 | from os import makedirs
10 | from tensorboardX import SummaryWriter
11 | from pathlib import Path
12 | import math
13 | from utils import now_str
14 | import gc
15 |
16 | manual_seed = 1337
17 | torch.manual_seed(manual_seed)
18 | torch.cuda.manual_seed_all(manual_seed)
19 | print(f'Using torch manual seed {manual_seed}.')
20 |
21 | ### Start timer
22 | min_ = 0
23 | if min_ > 0:
24 | print(f'WARNING TIMER {min_} min')
25 | import time ; from tqdm import tqdm
26 | for __ in tqdm(range(min_)):
27 | time.sleep(60)
28 | ###
29 |
30 |
31 | def run_experiment(_exp_name,
32 | _epochs,
33 | _train_manifest,
34 | _test_manifest,
35 | _labels,
36 | _use_mfcc_in,
37 | _use_ivectors_in,
38 | _use_embeddings_in,
39 | _use_transcripts_out,
40 | _use_accents_out,
41 | _batch_size,
42 | _num_workers,
43 | _mfcc_size,
44 | _ivector_size,
45 | _embedding_size,
46 | _rnn_type,
47 | _rnn_hidden_size,
48 | _nb_head_layers,
49 | _nb_speech_layers,
50 | _nb_accents_layers,
51 | _bidirectional,
52 | _losses_mix,
53 | _learning_rate,
54 | _lm_path,
55 | _decoder_alpha,
56 | _decoder_beta,
57 | _decoder_cutoff_top_n,
58 | _decoder_beam_width,
59 | _cuda,
60 | _tensorboard_path,
61 | _saved_models_path,
62 | _bottleneck_size,
63 | _accent_loss):
64 |
65 | print(f'\n##### Running experiment {_exp_name} #####')
66 |
67 | # Tools to log values
68 | results_dict = {}
69 | results_dict['train_loss'] = []
70 | results_dict['train_loss_text'] = []
71 | results_dict['train_loss_accent'] = []
72 | results_dict['test_loss'] = []
73 | results_dict['test_loss_text'] = []
74 | results_dict['test_loss_accent'] = []
75 | results_dict['test_wer'] = []
76 | results_dict['test_accent_acc'] = []
77 |
78 | tb_path = Path(_tensorboard_path) / _exp_name
79 | makedirs(tb_path, exist_ok=True)
80 | tb_writer = SummaryWriter(tb_path)
81 |
82 | ### DATA LOADING
83 |
84 | # Training set
85 | train_dataset = MultiDataset(_train_manifest,
86 | _labels,
87 | use_mfcc_in=_use_mfcc_in,
88 | use_ivectors_in=_use_ivectors_in,
89 | use_embeddings_in=_use_embeddings_in,
90 | embedding_size=_embedding_size,
91 | use_transcripts_out=_use_transcripts_out,
92 | use_accents_out=_use_accents_out)
93 |
94 | train_loader = MultiDataLoader(train_dataset,
95 | batch_size=_batch_size,
96 | shuffle=True,
97 | num_workers=_num_workers)
98 |
99 | # Testing set
100 | test_dataset = MultiDataset(_test_manifest,
101 | _labels,
102 | use_mfcc_in=_use_mfcc_in,
103 | use_ivectors_in=_use_ivectors_in,
104 | use_embeddings_in=_use_embeddings_in,
105 | embedding_size=_embedding_size,
106 | use_transcripts_out=_use_transcripts_out,
107 | use_accents_out=_use_accents_out)
108 |
109 | test_loader = MultiDataLoader(test_dataset,
110 | batch_size=_batch_size,
111 | shuffle=True,
112 | num_workers=_num_workers)
113 |
114 |
115 | ### CREATE MODEL
116 |
117 | model = MultiTask(use_mfcc_in = _use_mfcc_in,
118 | use_ivectors_in = _use_ivectors_in,
119 | use_embeddings_in = _use_embeddings_in,
120 | use_transcripts_out = _use_transcripts_out,
121 | use_accents_out = _use_accents_out,
122 | mfcc_size = _mfcc_size,
123 | ivector_size = _ivector_size,
124 | embedding_size = _embedding_size,
125 | rnn_type = _rnn_type,
126 | labels = _labels,
127 | accents_dict = train_dataset.accent_dict,
128 | rnn_hidden_size = _rnn_hidden_size,
129 | nb_head_layers = _nb_head_layers,
130 | nb_speech_layers = _nb_speech_layers,
131 | nb_accents_layers = _nb_accents_layers,
132 | bidirectional = _bidirectional,
133 | bottleneck_size = _bottleneck_size,
134 | DEBUG=False)
135 | if _cuda:
136 | model = model.cuda()
137 |
138 | print(model, '\n')
139 |     print('Model parameter count:', MultiTask.get_param_size(model), '\n')
140 |
141 | ### OPTIMIZER, CRITERION, DECODER
142 |
143 | # Optimizer
144 | optimizer = torch.optim.Adam(model.parameters(), lr=_learning_rate)
145 |
146 | # Criterion
147 | if _use_accents_out:
148 | if _accent_loss == 'focal':
149 | AccLoss = FocalLoss()
150 | elif _accent_loss == 'CE':
151 | AccLoss = nn.CrossEntropyLoss()
152 | else:
153 | raise ValueError(f'Loss {_accent_loss} for accent_loss is unknown. Please use either "focal" or "CE".')
154 |
155 | if not _use_transcripts_out: # only accent classification
156 | criterion = AccLoss
157 | elif not _use_accents_out: # only text recognition
158 | criterion = CTCLoss()
159 | else: # both tasks
160 |         criterion = (CTCLoss(), AccLoss)
161 |
162 | # Decoder
163 | if _use_transcripts_out:
164 | decoder = BeamCTCDecoder(_labels,
165 | lm_path=_lm_path,
166 | alpha=_decoder_alpha,
167 | beta=_decoder_beta,
168 | cutoff_top_n=_decoder_cutoff_top_n,
169 |                              cutoff_prob=1.0,  # conf['decoder_cutoff_prob'] is not threaded through; keep the default of 1.0
170 | beam_width=_decoder_beam_width,
171 | num_processes=_num_workers)
172 |
173 | target_decoder = GreedyDecoder(_labels)
174 | else:
175 | decoder, target_decoder = None, None
176 |
177 |
178 | ### EPOCHS
179 | best_wer = math.inf
180 | best_acc = 0
181 |
182 | for epoch in range(1, _epochs + 1):
183 | ### TRAIN
184 |         print(f'Epoch {epoch} training: {_exp_name}')
185 | train_results = train(model, train_loader, criterion, optimizer, losses_mix=_losses_mix)
186 | train_loss, train_loss_text, train_loss_accent = train_results
187 |
188 | results_dict['train_loss'].append(train_loss)
189 | results_dict['train_loss_text'].append(train_loss_text)
190 | results_dict['train_loss_accent'].append(train_loss_accent)
191 | print(f'Epoch {epoch} training loss: {train_loss}')
192 |
193 | ### TEST
194 | print(f'Epoch {epoch} testing')
195 | test_results = test(model, test_loader, criterion, decoder, target_decoder, losses_mix=_losses_mix)
196 | test_loss, test_loss_text, test_loss_accent, test_wer, test_accent_acc = test_results
197 |
198 | results_dict['test_loss'].append(test_loss)
199 | results_dict['test_loss_text'].append(test_loss_text)
200 | results_dict['test_loss_accent'].append(test_loss_accent)
201 | results_dict['test_wer'].append(test_wer)
202 | results_dict['test_accent_acc'].append(test_accent_acc)
203 | print(f'Epoch {epoch} testing loss: {test_loss}')
204 |
205 | # Add values to tensorboard
206 | for key, results in results_dict.items():
207 | tb_writer.add_scalar(key, results[-1], epoch)
208 |
209 |         # Save model if it is best
210 |         save_new = False
211 | if _use_transcripts_out:
212 | if test_wer < best_wer:
213 | save_new = True
214 | best_wer = test_wer
215 | else:
216 | if test_accent_acc > best_acc:
217 | save_new = True
218 | best_acc = test_accent_acc
219 |
220 | if save_new:
221 | MultiTask.serialize(model,
222 | Path(_saved_models_path) / _exp_name,
223 | save=True,
224 | exp_name=_exp_name,
225 | optimizer=optimizer,
226 | epoch=epoch,
227 | train_losses=results_dict['train_loss'],
228 | test_losses=results_dict['test_loss'],
229 | text_train_losses=results_dict['train_loss_text'],
230 | text_test_losses=results_dict['test_loss_text'],
231 | text_wers=results_dict['test_wer'],
232 | accent_train_losses=results_dict['train_loss_accent'],
233 | accent_test_losses=results_dict['test_loss_accent'],
234 | accent_accuracies=results_dict['test_accent_acc'])
235 |
236 | del model
237 | gc.collect()
238 | torch.cuda.empty_cache()
239 | ## end of run_experiment ##
240 |
241 |
242 | ### MAIN
243 |
244 | if __name__ == '__main__':
245 | import argparse
246 |
247 | parser = argparse.ArgumentParser(description='DeepSpeech model information')
248 | parser.add_argument('--train', action='store_true', help='Uses the train set instead of the dev set.')
249 | parser.add_argument('--epochs', default=None, type=int, help='Number of training epochs')
250 | parser.add_argument('--patch_path', default='experiments.cfg', type=str, help='Path to experiment list')
251 | args = parser.parse_args()
252 |
253 | DEV = not args.train
254 | PATCH_PATH = args.patch_path
255 | EPOCHS = args.epochs
256 |
257 | import config
258 | confs = config.Config()
259 |
260 | for conf in confs.patch_config(PATCH_PATH):
261 | exp_name = conf['exp_name_prefix']
262 | exp_name += '_DEV' if DEV else '_TRAIN'
263 | exp_name += '__in'
264 | exp_name += '_mfcc' if conf['use_mfcc_in'] else ''
265 | exp_name += '_ivect' if conf['use_ivectors_in'] else ''
266 | exp_name += '_emb' if conf['use_embeddings_in'] else ''
267 | exp_name += '__out'
268 | exp_name += '_transcripts' if conf['use_transcripts_out'] else ''
269 | exp_name += f'_accents-mix{conf["losses_mix"]}-{conf["accent_loss"]}' if conf['use_accents_out'] else ''
270 | exp_name += f'__nblyrs-head-{conf["nb_head_layers"]}'
271 | exp_name += f'-speech-{conf["nb_speech_layers"]}'
272 | exp_name += f'-accent-{conf["nb_accents_layers"]}'
273 | exp_name += f'__bnf-{conf["bottleneck_size"]}'
274 | exp_name += f'__{now_str()}'
275 |
276 | train_manifest = conf['dev_manifest'] if DEV else conf['train_manifest']
277 | epochs = EPOCHS if EPOCHS is not None else conf['epochs']
278 |
279 | try:
280 | run_experiment(_exp_name = exp_name,
281 | _epochs = epochs,
282 | _train_manifest = train_manifest,
283 | _test_manifest = conf['test_manifest'],
284 | _labels = conf['labels'],
285 | _use_mfcc_in = conf['use_mfcc_in'],
286 | _use_ivectors_in = conf['use_ivectors_in'],
287 | _use_embeddings_in = conf['use_embeddings_in'],
288 | _use_transcripts_out = conf['use_transcripts_out'],
289 | _use_accents_out = conf['use_accents_out'],
290 | _batch_size = conf['batch_size'],
291 | _num_workers = conf['num_workers'],
292 | _mfcc_size = conf['mfcc_size'],
293 | _ivector_size = conf['ivector_size'],
294 | _embedding_size = conf['embedding_size'],
295 | _rnn_type = conf['rnn_type'],
296 | _rnn_hidden_size = conf['rnn_hidden_size'],
297 | _nb_head_layers = conf['nb_head_layers'],
298 | _nb_speech_layers = conf['nb_speech_layers'],
299 | _nb_accents_layers = conf['nb_accents_layers'],
300 | _bidirectional = conf['bidirectional'],
301 | _losses_mix = conf['losses_mix'],
302 | _learning_rate = conf['learning_rate'],
303 | _lm_path = conf['lm_path'],
304 | _decoder_alpha = conf['decoder_alpha'],
305 | _decoder_beta = conf['decoder_beta'],
306 | _decoder_cutoff_top_n = conf['decoder_cutoff_top_n'],
307 | _decoder_beam_width = conf['decoder_beam_width'],
308 | _cuda = conf['cuda'],
309 | _tensorboard_path = conf['tensorboard_path'],
310 | _saved_models_path = conf['saved_models_path'],
311 | _bottleneck_size = conf['bottleneck_size'],
312 | _accent_loss = conf['accent_loss'])
313 |
314 | except Exception as e:
315 |             print(f'Error occurred in run {exp_name}:', e)
--------------------------------------------------------------------------------
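
run_experiment.py runs one experiment per block of the patch file and encodes the active inputs, outputs and layer counts into the run name. A rough illustration of the naming scheme with a made-up config dict (every value below is hypothetical; the trailing timestamp comes from utils.now_str()):

    conf = {'exp_name_prefix': 'Demo', 'use_mfcc_in': True, 'use_ivectors_in': True,
            'use_embeddings_in': False, 'use_transcripts_out': True, 'use_accents_out': True,
            'losses_mix': 0.5, 'accent_loss': 'CE', 'nb_head_layers': 2,
            'nb_speech_layers': 1, 'nb_accents_layers': 1, 'bottleneck_size': 128}

    # With DEV=True the loop in __main__ would build a name like:
    # Demo_DEV__in_mfcc_ivect__out_transcripts_accents-mix0.5-CE__nblyrs-head-2-speech-1-accent-1__bnf-128__<timestamp>

Typical invocations are python run_experiment.py (dev manifest, epoch count taken from the config) and python run_experiment.py --train --epochs 20 --patch_path experiments.cfg.
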
/model.py:
--------------------------------------------------------------------------------
1 | import math
2 | import torch
3 | import torch.nn as nn
4 | from collections import OrderedDict
5 | from modules import MaskConv, BatchRNN, InferenceBatchSoftmax, SequenceWise
6 |
7 |
8 | def rnn_block(rnn_input_size, rnn_hidden_size, rnn_type, bidirectional, nb_layers):
9 | """Creates a stack of Batch RNNs with different input_size than hidden_size."""
10 | rnns = []
11 | rnn = BatchRNN(input_size=rnn_input_size, hidden_size=rnn_hidden_size, rnn_type=rnn_type,
12 | bidirectional=bidirectional, batch_norm=False)
13 | rnns.append(('0', rnn))
14 | for x in range(nb_layers - 1):
15 | rnn = BatchRNN(input_size=rnn_hidden_size, hidden_size=rnn_hidden_size, rnn_type=rnn_type,
16 | bidirectional=bidirectional)
17 | rnns.append(('%d' % (x + 1), rnn))
18 | return nn.Sequential(OrderedDict(rnns))
19 |
20 |
21 | class Head(nn.Module):
22 | """Shared part of the neural network."""
23 | def __init__(self,
24 | rnn_type,
25 | rnn_hidden_size,
26 | nb_layers,
27 | bidirectional,
28 | feature_len,
29 | DEBUG):
30 |
31 | super(Head, self).__init__()
32 |
33 | self._DEBUG = DEBUG
34 |
35 | # CONV
36 | self.conv = MaskConv(nn.Sequential(
37 | nn.Conv2d(1, 32, kernel_size=(41, 11), stride=(2, 2), padding=(20, 5)),
38 | nn.BatchNorm2d(32),
39 | nn.Hardtanh(0, 20, inplace=True),
40 | nn.Conv2d(32, 32, kernel_size=(21, 11), stride=(2, 1), padding=(10, 5)),
41 | nn.BatchNorm2d(32),
42 | nn.Hardtanh(0, 20, inplace=True)
43 | ))
44 |
45 | # RNN
46 | rnn_input_size = feature_len * 8
47 |
48 | self.rnns = rnn_block(rnn_input_size, rnn_hidden_size, rnn_type, bidirectional, nb_layers)
49 |
50 |
51 | def forward(self, x, lengths):
52 | if self._DEBUG:
53 | print('')
54 | print('# BEGIN HEAD #')
55 | print('input', x.size())
56 |
57 | lengths = lengths.cpu().int()
58 | output_lengths = self.get_seq_lens(lengths)
59 |
60 | x = x.view(x.size(0), 1, x.size(1), x.size(2))
61 | x = x.transpose(2, 3)
62 | if self._DEBUG:
63 | print('after view transpose', x.size())
64 |
65 | x, _ = self.conv(x, output_lengths)
66 | if self._DEBUG:
67 | print('after conv', x.size())
68 |
69 | sizes = x.size()
70 | x = x.view(sizes[0], sizes[1] * sizes[2], sizes[3]) # Collapse feature dimension
71 | x = x.transpose(1, 2).transpose(0, 1).contiguous() # TxNxH
72 | if self._DEBUG:
73 | print('after view transpose', x.size())
74 |
75 | for rnn in self.rnns:
76 | x = rnn(x, output_lengths)
77 | if self._DEBUG:
78 | print('after rnn', x.size())
79 |
80 | self._DEBUG = False
81 | return x, output_lengths
82 |
83 | def get_seq_lens(self, input_length):
84 | """
85 | Given a 1D Tensor or Variable containing integer sequence lengths, return a 1D tensor or variable
86 | containing the size sequences that will be output by the network.
87 | :param input_length: 1D Tensor
88 | :return: 1D Tensor scaled by model
89 | """
90 | seq_len = input_length
91 | for m in self.conv.modules():
92 | if type(m) == nn.modules.conv.Conv2d:
93 | seq_len = ((seq_len + 2 * m.padding[1] - m.dilation[1] * (m.kernel_size[1] - 1) - 1) / m.stride[1] + 1)
94 | return seq_len.int()
95 |
96 |
97 | class SpeechToText(nn.Module):
98 | def __init__(self,
99 | rnn_type,
100 | rnn_hidden_size,
101 | nb_layers,
102 | bidirectional,
103 | labels,
104 | DEBUG):
105 |
106 | super(SpeechToText, self).__init__()
107 |
108 | self._DEBUG = DEBUG
109 |
110 | # RNN
111 | self.rnns = rnn_block(rnn_hidden_size, rnn_hidden_size, rnn_type, bidirectional, nb_layers)
112 |
113 |         # FULLY CONNECTED
114 | num_classes = len(labels)
115 |
116 | fully_connected = nn.Sequential(
117 | nn.BatchNorm1d(rnn_hidden_size),
118 | nn.Linear(rnn_hidden_size, num_classes, bias=False)
119 | )
120 | self.fc = nn.Sequential(
121 | SequenceWise(fully_connected),
122 | )
123 | self.inference_softmax = InferenceBatchSoftmax()
124 |
125 |
126 | def forward(self, x, output_lengths):
127 | if self._DEBUG:
128 | print('')
129 | print('# BEGIN speech to text #')
130 | print('input', x.size())
131 |
132 | for rnn in self.rnns:
133 | x = rnn(x, output_lengths)
134 |
135 | if self._DEBUG:
136 | print('after rnn', x.size())
137 |
138 | x = self.fc(x)
139 | if self._DEBUG:
140 | print('after fc', x.size())
141 |
142 | x = x.transpose(0, 1)
143 | if self._DEBUG:
144 | print('after transpose', x.size())
145 | # identity in training mode, softmax in eval mode
146 | x = self.inference_softmax(x)
147 | if self._DEBUG:
148 | print('after softmax', x.size())
149 |
150 | x = x.transpose(0, 1)
151 | if self._DEBUG:
152 | print('after transpose', x.size())
153 |
154 | self._DEBUG = False
155 | return x
156 |
157 |
158 | class AccentClassifier(nn.Module):
159 | def __init__(self,
160 | rnn_type,
161 | rnn_hidden_size,
162 | nb_layers,
163 | bidirectional,
164 | accents_dict,
165 | bottleneck_size,
166 | DEBUG):
167 |
168 | super(AccentClassifier, self).__init__()
169 |
170 | self._DEBUG = DEBUG
171 |
172 | # RNN
173 | self.rnns = rnn_block(rnn_hidden_size, rnn_hidden_size, rnn_type, bidirectional, nb_layers)
174 |
175 |         # FULLY CONNECTED
176 | num_classes = len(accents_dict)
177 |
178 | self.bnf = nn.Sequential(
179 | nn.BatchNorm1d(rnn_hidden_size),
180 | nn.Linear(rnn_hidden_size, 1024),
181 | nn.ReLU(),
182 | nn.BatchNorm1d(1024),
183 | nn.Linear(1024, bottleneck_size),
184 | nn.ReLU(),
185 | )
186 |
187 | self.fc = nn.Sequential(
188 | nn.BatchNorm1d(bottleneck_size),
189 | nn.Linear(bottleneck_size, num_classes),
190 | nn.ReLU(),
191 | )
192 |
193 | self.softmax = nn.Softmax(dim=1)
194 |
195 | def forward(self, x, output_lengths):
196 | if self._DEBUG:
197 | print('')
198 | print('# BEGIN Acc #')
199 | print('input', x.size())
200 |
201 | for rnn in self.rnns:
202 | x = rnn(x, output_lengths)
203 |
204 | if self._DEBUG:
205 | print('after rnn', x.size())
206 |
207 | x = x.mean(dim=0)
208 |
209 | if self._DEBUG:
210 | print('after mean', x.size())
211 |
212 | bottleneck = self.bnf(x)
213 |
214 | if self._DEBUG:
215 | print('after bnf', bottleneck.size())
216 |
217 | x = self.fc(bottleneck)
218 |
219 | if self._DEBUG:
220 | print('after fc', x.size())
221 |
222 | x = self.softmax(x)
223 |
224 | if self._DEBUG:
225 | print('after softmax', x.size())
226 |
227 | self._DEBUG = False
228 | return x, bottleneck
229 |
230 |
231 | class MultiTask(nn.Module):
232 | def __init__(self,
233 | use_mfcc_in=True,
234 | use_ivectors_in=True,
235 | use_embeddings_in=True,
236 | use_transcripts_out=True,
237 | use_accents_out=True,
238 | mfcc_size=40,
239 | ivector_size=100,
240 | embedding_size=100,
241 | rnn_type=nn.GRU,
242 | labels="abc",
243 | accents_dict={'uk', 'us'},
244 | rnn_hidden_size=800,
245 | nb_head_layers=2,
246 | nb_speech_layers=2,
247 | nb_accents_layers=2,
248 | bidirectional=True,
249 | bottleneck_size=256,
250 | DEBUG=False):
251 |
252 | self._meta = {
253 | 'use_mfcc_in': use_mfcc_in,
254 | 'use_ivectors_in': use_ivectors_in,
255 | 'use_embeddings_in': use_embeddings_in,
256 | 'use_transcripts_out': use_transcripts_out,
257 | 'use_accents_out': use_accents_out,
258 | 'mfcc_size': mfcc_size,
259 | 'ivector_size': ivector_size,
260 | 'embedding_size': embedding_size,
261 | 'rnn_type': rnn_type,
262 | 'labels': labels,
263 | 'accents_dict': accents_dict,
264 | 'rnn_hidden_size': rnn_hidden_size,
265 | 'nb_head_layers': nb_head_layers,
266 | 'nb_speech_layers': nb_speech_layers,
267 | 'nb_accents_layers': nb_accents_layers,
268 | 'bidirectional': bidirectional,
269 | 'bottleneck_size': bottleneck_size,
270 | 'DEBUG': DEBUG,
271 | }
272 |
273 | super(MultiTask, self).__init__()
274 |
275 | self.feature_len = 0
276 | self.feature_len += mfcc_size if use_mfcc_in else 0
277 | self.feature_len += ivector_size if use_ivectors_in else 0
278 | self.feature_len += embedding_size if use_embeddings_in else 0
279 |
280 | self.Head = Head(rnn_type=rnn_type,
281 | rnn_hidden_size=rnn_hidden_size,
282 | nb_layers=nb_head_layers,
283 | bidirectional=bidirectional,
284 | feature_len=self.feature_len,
285 | DEBUG=DEBUG)
286 |
287 | if self._meta['use_transcripts_out']:
288 | self.SpeechToText = SpeechToText(rnn_type=rnn_type,
289 | rnn_hidden_size=rnn_hidden_size,
290 | nb_layers=nb_speech_layers,
291 | bidirectional=bidirectional,
292 | labels=labels,
293 | DEBUG=DEBUG)
294 |
295 | if self._meta['use_accents_out']:
296 | self.AccentClassifier = AccentClassifier(rnn_type=rnn_type,
297 | rnn_hidden_size=rnn_hidden_size,
298 | nb_layers=nb_accents_layers,
299 | bidirectional=bidirectional,
300 | accents_dict=accents_dict,
301 | bottleneck_size=bottleneck_size,
302 | DEBUG=DEBUG)
303 |
304 | def forward(self, x, lengths):
305 | x, out_len = self.Head(x, lengths)
306 | x_stt, x_acc, bnf = None, None, None
307 |
308 | if self._meta['use_transcripts_out']:
309 | x_stt = self.SpeechToText(x, out_len)
310 |
311 | if self._meta['use_accents_out']:
312 | x_acc, bnf = self.AccentClassifier(x, out_len)
313 |
314 | return x_stt, x_acc, out_len, bnf
315 |
316 |
317 | @staticmethod
318 | def get_param_size(model):
319 | params = 0
320 | for p in model.parameters():
321 | tmp = 1
322 | for x in p.size():
323 | tmp *= x
324 | params += tmp
325 | return params
326 |
327 | @classmethod
328 | def load_model(cls, path):
329 | package = torch.load(path, map_location=lambda storage, loc: storage)
330 | meta = package['meta']
331 | model = cls(
332 | use_mfcc_in = meta['use_mfcc_in'],
333 | use_ivectors_in = meta['use_ivectors_in'],
334 | use_embeddings_in = meta['use_embeddings_in'],
335 | use_transcripts_out = meta['use_transcripts_out'],
336 | use_accents_out = meta['use_accents_out'],
337 | mfcc_size = meta['mfcc_size'],
338 | ivector_size = meta['ivector_size'],
339 | embedding_size = meta['embedding_size'],
340 | rnn_type = meta['rnn_type'],
341 | labels = meta['labels'],
342 | accents_dict = meta['accents_dict'],
343 | rnn_hidden_size = meta['rnn_hidden_size'],
344 | nb_head_layers = meta['nb_head_layers'],
345 | nb_speech_layers = meta['nb_speech_layers'],
346 | nb_accents_layers = meta['nb_accents_layers'],
347 | bidirectional = meta['bidirectional'],
348 | bottleneck_size = meta['bottleneck_size'],
349 | DEBUG = meta['DEBUG'],
350 | )
351 | model.load_state_dict(package['state_dict'])
352 | return model, package
353 |
354 | @staticmethod
355 | def serialize(model,
356 | path='./__temp__',
357 | save=True,
358 | exp_name=None,
359 | optimizer=None,
360 | epoch=None,
361 | train_losses=None,
362 | test_losses=None,
363 | text_train_losses=None,
364 | text_test_losses=None,
365 | text_wers=None,
366 | accent_train_losses=None,
367 | accent_test_losses=None,
368 | accent_accuracies=None):
369 |
370 | """Saves the model in a packaged form. Also returns the package.
371 | Use the load_model class method to recreate a model from a package."""
372 |
373 | package = {
374 | 'state_dict': model.state_dict(),
375 | 'meta': model._meta
376 | }
377 |
378 | if exp_name is not None:
379 | package['exp_name'] = exp_name
380 | if optimizer is not None:
381 | package['optimizer'] = optimizer
382 | if epoch is not None:
383 | package['epoch'] = epoch
384 | if train_losses is not None:
385 | package['train_losses'] = train_losses
386 | if test_losses is not None:
387 | package['test_losses'] = test_losses
388 | if text_train_losses is not None:
389 | package['text_train_losses'] = text_train_losses
390 | if text_test_losses is not None:
391 | package['text_test_losses'] = text_test_losses
392 | if text_wers is not None:
393 | package['text_wers'] = text_wers
394 | if accent_train_losses is not None:
395 | package['accent_train_losses'] = accent_train_losses
396 | if accent_test_losses is not None:
397 | package['accent_test_losses'] = accent_test_losses
398 | if accent_accuracies is not None:
399 | package['accent_accuracies'] = accent_accuracies
400 |
401 | if save:
402 | torch.save(package, str(path) + '.pth')
403 |
404 | return package
--------------------------------------------------------------------------------
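
model.py defines the shared Head (two strided 2-D convolutions followed by a BatchRNN stack) and the two task branches. A rough sketch of exercising MultiTask end to end, assuming modules.py provides the imported building blocks; the sizes are illustrative and the (batch, frames, features) layout matches what MultiDataLoader produces:

    import torch
    from model import MultiTask

    model = MultiTask(use_mfcc_in=True, use_ivectors_in=False, use_embeddings_in=False,
                      use_transcripts_out=True, use_accents_out=True,
                      mfcc_size=40, rnn_hidden_size=64,
                      nb_head_layers=1, nb_speech_layers=1, nb_accents_layers=1,
                      labels=" 'ABCDEFGHIJKLMNOPQRSTUVWXYZ_",
                      accents_dict={'england': 0, 'us': 1},
                      bottleneck_size=32)

    x = torch.randn(2, 100, 40)        # (batch, frames, feature_len); feature_len = mfcc_size here
    lengths = torch.tensor([100, 80])  # valid frame count per utterance
    text_out, accent_out, out_lens, bottleneck = model(x, lengths)
    # text_out: roughly (T', batch, len(labels)); accent_out: (batch, len(accents_dict));
    # out_lens: frame counts after the strided convolutions; bottleneck: (batch, 32)

    # Round trip through the packaging helpers:
    package = MultiTask.serialize(model, path='./__temp__', save=True)
    model2, package2 = MultiTask.load_model('./__temp__.pth')
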
/tests.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 2,
6 | "metadata": {},
7 | "outputs": [
8 | {
9 | "name": "stdout",
10 | "output_type": "stream",
11 | "text": [
12 | "torch.Size([20, 960, 240])\n"
13 | ]
14 | }
15 | ],
16 | "source": [
17 | "%reload_ext autoreload\n",
18 | "%autoreload 1\n",
19 | "%aimport config\n",
20 | "\n",
21 | "conf = config.Config()\n",
22 | "\n",
23 | "from model import MultiTask\n",
24 | "model = MultiTask.load_model('saved_models/SimpleDS_TRAIN__in_mfcc__out_transcripts__nblyrs-head-4-speech-1-accent-1__bnf-256__24-02-2019_23h50m00.pth')\n",
25 | "\n",
26 | "from dataloader import MultiDataset, MultiDataLoader\n",
27 | "import torch\n",
28 | "\n",
29 | "labels = \" 'ABCDEFGHIJKLMNOPQRSTUVWXYZ_\"\n",
30 | "\n",
31 | "\n",
32 | "dataset = MultiDataset('data/splits/dev.csv', labels, \n",
33 | " use_mfcc_in=model._meta['use_mfcc_in'], \n",
34 | " use_ivectors_in=True,#model._meta['use_ivectors_in'], \n",
35 | " use_embeddings_in=True,#model._meta['use_embeddings_in'],\n",
36 | " use_transcripts_out=model._meta['use_transcripts_out'], \n",
37 | " use_accents_out=model._meta['use_accents_out'])\n",
38 | "\n",
39 | "dataloader = MultiDataLoader(dataset, batch_size=20, shuffle=False)\n",
40 | "\n",
41 | "for data in dataloader:\n",
42 | " print(data[0].size())\n",
43 | " break"
44 | ]
45 | },
46 | {
47 | "cell_type": "code",
48 | "execution_count": 6,
49 | "metadata": {},
50 | "outputs": [
51 | {
52 | "data": {
53 | "text/plain": [
54 | "{'use_mfcc_in': True,\n",
55 | " 'use_ivectors_in': False,\n",
56 | " 'use_embeddings_in': False,\n",
57 | " 'use_transcripts_out': True,\n",
58 | " 'use_accents_out': False,\n",
59 | " 'mfcc_size': 40,\n",
60 | " 'ivector_size': 100,\n",
61 | " 'embedding_size': 256,\n",
62 | " 'rnn_type': torch.nn.modules.rnn.GRU,\n",
63 | " 'labels': \"_'ABCDEFGHIJKLMNOPQRSTUVWXYZ \",\n",
64 | " 'accents_dict': {'australia': 0,\n",
65 | " 'canada': 1,\n",
66 | " 'england': 2,\n",
67 | " 'ireland': 3,\n",
68 | " 'scotland': 4,\n",
69 | " 'us': 5,\n",
70 | " 'wales': 6},\n",
71 | " 'rnn_hidden_size': 800,\n",
72 | " 'nb_head_layers': 4,\n",
73 | " 'nb_speech_layers': 1,\n",
74 | " 'nb_accents_layers': 1,\n",
75 | " 'bidirectional': True,\n",
76 | " 'bottleneck_size': 256,\n",
77 | " 'DEBUG': False}"
78 | ]
79 | },
80 | "execution_count": 6,
81 | "metadata": {},
82 | "output_type": "execute_result"
83 | }
84 | ],
85 | "source": []
86 | },
87 | {
88 | "cell_type": "code",
89 | "execution_count": 70,
90 | "metadata": {},
91 | "outputs": [
92 | {
93 | "data": {
94 | "text/plain": [
95 | "tensor([7])"
96 | ]
97 | },
98 | "execution_count": 70,
99 | "metadata": {},
100 | "output_type": "execute_result"
101 | }
102 | ],
103 | "source": [
104 | "import torch\n",
105 | "sum([torch.tensor([2]), torch.tensor([5])])"
106 | ]
107 | },
108 | {
109 | "cell_type": "code",
110 | "execution_count": 65,
111 | "metadata": {},
112 | "outputs": [],
113 | "source": [
114 | "t='this is test'\n",
115 | "i = t.find(' ')"
116 | ]
117 | },
118 | {
119 | "cell_type": "code",
120 | "execution_count": 66,
121 | "metadata": {},
122 | "outputs": [
123 | {
124 | "data": {
125 | "text/plain": [
126 | "'is test'"
127 | ]
128 | },
129 | "execution_count": 66,
130 | "metadata": {},
131 | "output_type": "execute_result"
132 | }
133 | ],
134 | "source": [
135 | "t[i+1:]"
136 | ]
137 | },
138 | {
139 | "cell_type": "code",
140 | "execution_count": 67,
141 | "metadata": {},
142 | "outputs": [
143 | {
144 | "data": {
145 | "text/plain": [
146 | "[{'exp_name_prefix': 'a',\n",
147 | " 'epochs': 2,\n",
148 | " 'labels': \"_'ABCDEFGHIJKLMNOPQRSTUVWXYZ \",\n",
149 | " 'batch_size': 10,\n",
150 | " 'num_workers': 4,\n",
151 | " 'cuda': True,\n",
152 | " 'losses_mix': 0.9,\n",
153 | " 'learning_rate': 0.0003,\n",
154 | " 'mfcc_size': 40,\n",
155 | " 'ivector_size': 100,\n",
156 | " 'embedding_size': 100,\n",
157 | " 'rnn_type': torch.nn.modules.rnn.GRU,\n",
158 | " 'rnn_hidden_size': 800,\n",
159 | " 'nb_head_layers': 3,\n",
160 | " 'nb_speech_layers': 1,\n",
161 | " 'nb_accents_layers': 1,\n",
162 | " 'bidirectional': True,\n",
163 | " 'bottleneck_size': 256,\n",
164 | " 'use_mfcc_in': True,\n",
165 | " 'use_ivectors_in': True,\n",
166 | " 'use_embeddings_in': True,\n",
167 | " 'use_transcripts_out': True,\n",
168 | " 'use_accents_out': False,\n",
169 | " 'decoder_alpha': 0.8,\n",
170 | " 'decoder_beta': 1.0,\n",
171 | " 'decoder_cutoff_top_n': 40,\n",
172 | " 'decoder_cutoff_prob': 1.0,\n",
173 | " 'decoder_beam_width': 100,\n",
174 | " 'lm_path': './data/language_models/cv.lm',\n",
175 | " 'train_manifest': './data/splits/train.csv',\n",
176 | " 'dev_manifest': './data/splits/dev.csv',\n",
177 | " 'test_manifest': './data/splits/test.csv',\n",
178 | " 'tensorboard_path': './tensorboard_runs/',\n",
179 | " 'saved_models_path': './saved_models/'},\n",
180 | " {'exp_name_prefix': 'b',\n",
181 | " 'epochs': 2,\n",
182 | " 'labels': \"_'ABCDEFGHIJKLMNOPQRSTUVWXYZ \",\n",
183 | " 'batch_size': 10,\n",
184 | " 'num_workers': 4,\n",
185 | " 'cuda': True,\n",
186 | " 'losses_mix': 0.9,\n",
187 | " 'learning_rate': 0.0003,\n",
188 | " 'mfcc_size': 40,\n",
189 | " 'ivector_size': 100,\n",
190 | " 'embedding_size': 100,\n",
191 | " 'rnn_type': torch.nn.modules.rnn.GRU,\n",
192 | " 'rnn_hidden_size': 800,\n",
193 | " 'nb_head_layers': 3,\n",
194 | " 'nb_speech_layers': 1,\n",
195 | " 'nb_accents_layers': 1,\n",
196 | " 'bidirectional': True,\n",
197 | " 'bottleneck_size': 256,\n",
198 | " 'use_mfcc_in': False,\n",
199 | " 'use_ivectors_in': True,\n",
200 | " 'use_embeddings_in': True,\n",
201 | " 'use_transcripts_out': False,\n",
202 | " 'use_accents_out': True,\n",
203 | " 'decoder_alpha': 0.8,\n",
204 | " 'decoder_beta': 1.0,\n",
205 | " 'decoder_cutoff_top_n': 40,\n",
206 | " 'decoder_cutoff_prob': 1.0,\n",
207 | " 'decoder_beam_width': 100,\n",
208 | " 'lm_path': './data/language_models/cv.lm',\n",
209 | " 'train_manifest': './data/splits/train.csv',\n",
210 | " 'dev_manifest': './data/splits/dev.csv',\n",
211 | " 'test_manifest': './data/splits/test.csv',\n",
212 | " 'tensorboard_path': './tensorboard_runs/',\n",
213 | " 'saved_models_path': './saved_models/'},\n",
214 | " {'exp_name_prefix': 'c',\n",
215 | " 'epochs': 2,\n",
216 | " 'labels': \"_'ABCDEFGHIJKLMNOPQRSTUVWXYZ \",\n",
217 | " 'batch_size': 10,\n",
218 | " 'num_workers': 4,\n",
219 | " 'cuda': True,\n",
220 | " 'losses_mix': 0.9,\n",
221 | " 'learning_rate': 0.0003,\n",
222 | " 'mfcc_size': 40,\n",
223 | " 'ivector_size': 100,\n",
224 | " 'embedding_size': 100,\n",
225 | " 'rnn_type': torch.nn.modules.rnn.GRU,\n",
226 | " 'rnn_hidden_size': 800,\n",
227 | " 'nb_head_layers': 3,\n",
228 | " 'nb_speech_layers': 1,\n",
229 | " 'nb_accents_layers': 1,\n",
230 | " 'bidirectional': True,\n",
231 | " 'bottleneck_size': 256,\n",
232 | " 'use_mfcc_in': True,\n",
233 | " 'use_ivectors_in': False,\n",
234 | " 'use_embeddings_in': False,\n",
235 | " 'use_transcripts_out': True,\n",
236 | " 'use_accents_out': True,\n",
237 | " 'decoder_alpha': 0.8,\n",
238 | " 'decoder_beta': 1.0,\n",
239 | " 'decoder_cutoff_top_n': 40,\n",
240 | " 'decoder_cutoff_prob': 1.0,\n",
241 | " 'decoder_beam_width': 100,\n",
242 | " 'lm_path': './data/language_models/cv.lm',\n",
243 | " 'train_manifest': './data/splits/train.csv',\n",
244 | " 'dev_manifest': './data/splits/dev.csv',\n",
245 | " 'test_manifest': './data/splits/test.csv',\n",
246 | " 'tensorboard_path': './tensorboard_runs/',\n",
247 | " 'saved_models_path': './saved_models/'}]"
248 | ]
249 | },
250 | "execution_count": 67,
251 | "metadata": {},
252 | "output_type": "execute_result"
253 | }
254 | ],
255 | "source": [
256 | "conf.patch_config('experiments.cfg')"
257 | ]
258 | },
259 | {
260 | "cell_type": "code",
261 | "execution_count": null,
262 | "metadata": {},
263 | "outputs": [],
264 | "source": []
265 | },
266 | {
267 | "cell_type": "code",
268 | "execution_count": 4,
269 | "metadata": {},
270 | "outputs": [],
271 | "source": [
272 | "from model import MultiTask\n",
273 | "\n",
274 | "model = MultiTask(DEBUG=False, rnn_hidden_size=800, \n",
275 | " use_mfcc_in=conf['use_mfcc_in'], \n",
276 | " use_ivectors_in=conf['use_ivectors_in'], \n",
277 | " use_embeddings_in=conf['use_embeddings_in'],\n",
278 | " use_transcripts_out=conf['use_transcripts_out'], \n",
279 | " use_accents_out=conf['use_accents_out'])"
280 | ]
281 | },
282 | {
283 | "cell_type": "code",
284 | "execution_count": 24,
285 | "metadata": {},
286 | "outputs": [
287 | {
288 | "name": "stdout",
289 | "output_type": "stream",
290 | "text": [
291 | "blib \n",
292 | "\n",
293 | "test\n"
294 | ]
295 | }
296 | ],
297 | "source": [
298 | "print('blib', '\\n')\n",
299 | "print('test')\n"
300 | ]
301 | },
302 | {
303 | "cell_type": "code",
304 | "execution_count": 25,
305 | "metadata": {},
306 | "outputs": [
307 | {
308 | "data": {
309 | "text/plain": [
310 | "{'australia': 0, 'canada': 1, 'england': 2, 'us': 3}"
311 | ]
312 | },
313 | "execution_count": 25,
314 | "metadata": {},
315 | "output_type": "execute_result"
316 | }
317 | ],
318 | "source": [
319 | "dataset.accent_dict"
320 | ]
321 | },
322 | {
323 | "cell_type": "code",
324 | "execution_count": 26,
325 | "metadata": {},
326 | "outputs": [
327 | {
328 | "data": {
329 | "text/plain": [
330 | "[True]"
331 | ]
332 | },
333 | "execution_count": 26,
334 | "metadata": {},
335 | "output_type": "execute_result"
336 | }
337 | ],
338 | "source": [
339 | "conf['use_embeddings_in']"
340 | ]
341 | },
342 | {
343 | "cell_type": "code",
344 | "execution_count": 27,
345 | "metadata": {},
346 | "outputs": [
347 | {
348 | "data": {
349 | "text/plain": [
350 | "True"
351 | ]
352 | },
353 | "execution_count": 27,
354 | "metadata": {},
355 | "output_type": "execute_result"
356 | }
357 | ],
358 | "source": [
359 | "model._meta['use_embeddings_in']"
360 | ]
361 | },
362 | {
363 | "cell_type": "code",
364 | "execution_count": 29,
365 | "metadata": {},
366 | "outputs": [
367 | {
368 | "data": {
369 | "application/vnd.jupyter.widget-view+json": {
370 | "model_id": "f3351a1c54734de0b6fe48058fa7e33e",
371 | "version_major": 2,
372 | "version_minor": 0
373 | },
374 | "text/plain": [
375 | "HBox(children=(IntProgress(value=0, max=58), HTML(value='')))"
376 | ]
377 | },
378 | "metadata": {},
379 | "output_type": "display_data"
380 | },
381 | {
382 | "name": "stdout",
383 | "output_type": "stream",
384 | "text": [
385 | "\n"
386 | ]
387 | }
388 | ],
389 | "source": [
390 | "from tqdm import tqdm_notebook as tqdm\n",
391 | "\n",
392 | "model = model.cuda()\n",
393 | "\n",
394 | "for data in tqdm(dataloader):\n",
395 | " inputs, inputs_lens, transcripts, transcripts_lens, accents = data\n",
396 | "\n",
397 | " \n",
398 | " a, b, c, __ = model(inputs.cuda(), inputs_lens.cuda())"
399 | ]
400 | },
401 | {
402 | "cell_type": "code",
403 | "execution_count": 10,
404 | "metadata": {},
405 | "outputs": [],
406 | "source": [
407 | "MultiTask.serialize(model, 'tmp')\n",
408 | "\n",
409 | "modelb = MultiTask.load_model('tmp')\n"
410 | ]
411 | },
412 | {
413 | "cell_type": "code",
414 | "execution_count": 12,
415 | "metadata": {},
416 | "outputs": [],
417 | "source": [
418 | "modelb = modelb.cuda()"
419 | ]
420 | },
421 | {
422 | "cell_type": "code",
423 | "execution_count": 13,
424 | "metadata": {},
425 | "outputs": [
426 | {
427 | "data": {
428 | "application/vnd.jupyter.widget-view+json": {
429 | "model_id": "8850c82f6c9d4458bc727af18e20630b",
430 | "version_major": 2,
431 | "version_minor": 0
432 | },
433 | "text/plain": [
434 | "HBox(children=(IntProgress(value=0, max=571), HTML(value='')))"
435 | ]
436 | },
437 | "metadata": {},
438 | "output_type": "display_data"
439 | },
440 | {
441 | "name": "stdout",
442 | "output_type": "stream",
443 | "text": [
444 | "\n"
445 | ]
446 | }
447 | ],
448 | "source": [
449 | "for data in tqdm(dataloader):\n",
450 | " inputs, inputs_lens, transcripts, transcripts_lens, accents = data\n",
451 | "\n",
452 | " \n",
453 | " a, b, c = modelb(inputs.cuda(), inputs_lens.cuda())\n"
454 | ]
455 | },
456 | {
457 | "cell_type": "code",
458 | "execution_count": null,
459 | "metadata": {},
460 | "outputs": [],
461 | "source": [
462 | "@classmethod\n",
463 | "def load_model(cls, path):\n",
464 | " package = torch.load(path, map_location=lambda storage, loc: storage)\n",
465 | " model = cls(rnn_hidden_size=package['hidden_size'], nb_layers=package['nb_layers'],\n",
466 | " labels=package['labels'], audio_conf=package['audio_conf'],\n",
467 | " rnn_type=supported_rnns[package['rnn_type']], bidirectional=package.get('bidirectional', True))\n",
468 | " model.load_state_dict(package['state_dict'])\n",
469 | " for x in model.rnns:\n",
470 | " x.flatten_parameters()\n",
471 | " return model\n",
472 | "\n",
473 | "@classmethod\n",
474 | "def load_model_package(cls, package):\n",
475 | " model = cls(rnn_hidden_size=package['hidden_size'], nb_layers=package['nb_layers'],\n",
476 | " labels=package['labels'], audio_conf=package['audio_conf'],\n",
477 | " rnn_type=supported_rnns[package['rnn_type']], bidirectional=package.get('bidirectional', True))\n",
478 | " model.load_state_dict(package['state_dict'])\n",
479 | " return model\n",
480 | "\n",
481 | "@staticmethod\n",
482 | "def serialize(model, optimizer=None, epoch=None, iteration=None, loss_results=None,\n",
483 | " main_loss_results=None, side_loss_results=None,\n",
484 | " cer_results=None, wer_results=None, mca_results=None, avg_loss=None, meta=None):\n",
485 | " model = model.module if DeepSpeech.is_parallel(model) else model\n",
486 | " package = {\n",
487 | " 'version': model._version,\n",
488 | " 'hidden_size': model._hidden_size,\n",
489 | " 'nb_layers': model._nb_layers,\n",
490 | " 'rnn_type': supported_rnns_inv.get(model._rnn_type, model._rnn_type.__name__.lower()),\n",
491 | " 'audio_conf': model._audio_conf,\n",
492 | " 'labels': model._labels,\n",
493 | " 'state_dict': model.state_dict(),\n",
494 | " 'bidirectional': model._bidirectional\n",
495 | " }\n",
496 | " if optimizer is not None:\n",
497 | " package['optim_dict'] = optimizer.state_dict()\n",
498 | " if avg_loss is not None:\n",
499 | " package['avg_loss'] = avg_loss\n",
500 | " if epoch is not None:\n",
501 | " package['epoch'] = epoch + 1 # increment for readability\n",
502 | " if iteration is not None:\n",
503 | " package['iteration'] = iteration\n",
504 | " if loss_results is not None:\n",
505 | " package['loss_results'] = loss_results\n",
506 | " package['main_loss_results'] = main_loss_results\n",
507 | " package['side_loss_results'] = side_loss_results\n",
508 | " package['cer_results'] = cer_results\n",
509 | " package['wer_results'] = wer_results\n",
510 | " package['mca_results'] = mca_results\n",
511 | " if meta is not None:\n",
512 | " package['meta'] = meta\n",
513 | " return package"
514 | ]
515 | }
516 | ],
517 | "metadata": {
518 | "kernelspec": {
519 | "display_name": "Python 3",
520 | "language": "python",
521 | "name": "python3"
522 | },
523 | "language_info": {
524 | "codemirror_mode": {
525 | "name": "ipython",
526 | "version": 3
527 | },
528 | "file_extension": ".py",
529 | "mimetype": "text/x-python",
530 | "name": "python",
531 | "nbconvert_exporter": "python",
532 | "pygments_lexer": "ipython3",
533 | "version": "3.6.8"
534 | }
535 | },
536 | "nbformat": 4,
537 | "nbformat_minor": 2
538 | }
539 |
--------------------------------------------------------------------------------