├── ukro_g2p
│   ├── models
│   │   ├── __init__.py
│   │   ├── modules.py
│   │   └── g2p_model.py
│   ├── utils
│   │   ├── __init__.py
│   │   ├── lexicon_util.py
│   │   ├── infolog.py
│   │   └── util.py
│   ├── datasets
│   │   ├── __init__.py
│   │   └── lexicon_datasets.py
│   ├── __init__.py
│   ├── configs
│   │   └── ukro_base_uncased.config
│   ├── tokenization.py
│   ├── train.py
│   ├── predict.py
│   ├── test.py
│   └── trainer.py
├── .gitignore
├── .gitattributes
├── MANIFEST.in
├── setup.py
└── README.md
/ukro_g2p/models/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/ukro_g2p/utils/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/ukro_g2p/datasets/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/ukro_g2p/__init__.py:
--------------------------------------------------------------------------------
1 | __version__ = "0.1.5"
2 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | /data/
2 | /exp/
3 | /trained_models/
4 |
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | *.th filter=lfs diff=lfs merge=lfs -text
2 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include ukro_g2p/configs/ukro_base_uncased.config
2 |
--------------------------------------------------------------------------------
/ukro_g2p/utils/lexicon_util.py:
--------------------------------------------------------------------------------
1 | from ukro_g2p.utils.util import load_list
2 |
3 |
4 | def read_lexicon_dataset(lexicon_dataset_path):
5 | """
6 |     Read a lexicon file into a dict; each line is formatted as "id word pronunciation"
7 | :param lexicon_dataset_path: path to the lexicon dataset text file
8 | :return: lexicon dictionary
9 | """
10 |
11 | lexicon = {x.split()[0]: ' '.join(x.split()[1:]) for x in load_list(lexicon_dataset_path)}
12 |
13 | return lexicon
14 |
--------------------------------------------------------------------------------
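
For reference, a minimal sketch of how `read_lexicon_dataset` is meant to be called; the path and the two lexicon lines below are illustrative assumptions, not taken from the real dataset:

```python
from ukro_g2p.utils.lexicon_util import read_lexicon_dataset

# hypothetical data.txt contents (id, word, space-separated phonemes):
#   w0001 кит K IH1 T
#   w0002 кінь K2 IY1 NJ
lexicon = read_lexicon_dataset('data/data.txt')  # hypothetical path
print(lexicon['w0001'])  # -> 'кит K IH1 T'
```
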
/ukro_g2p/utils/infolog.py:
--------------------------------------------------------------------------------
1 | import atexit
2 | from datetime import datetime
3 |
4 |
5 | _format = '%Y-%m-%d %H:%M:%S.%f'
6 | _file = None
7 | _run_name = None
8 |
9 |
10 | def init(filename, run_name):
11 | global _file, _run_name
12 | _close_logfile()
13 | _file = open(filename, 'a')
14 | _file.write('\n-----------------------------------------------------------------\n')
15 | _file.write('Starting new training run\n')
16 | _file.write('-----------------------------------------------------------------\n')
17 | _run_name = run_name
18 |
19 |
20 | def log(msg):
21 | print(msg)
22 | if _file is not None:
23 | _file.write(f'[{datetime.now().strftime(_format)[:-3]}] {msg}\n')
24 |
25 |
26 | def _close_logfile():
27 | global _file
28 | if _file is not None:
29 | _file.close()
30 | _file = None
31 |
32 |
33 | atexit.register(_close_logfile)
34 |
--------------------------------------------------------------------------------
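
A minimal usage sketch of this logger; the path and run name are hypothetical, and `init` opens the file for appending, so the parent directory must already exist:

```python
from ukro_g2p.utils import infolog

infolog.init('exp/demo/log.txt', 'demo')  # hypothetical log path and run name
infolog.log('starting epoch 0')           # prints and writes a timestamped line
```
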
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup, find_packages
2 | from os import path
3 |
4 |
5 | this_directory = path.abspath(path.dirname(__file__))
6 | with open(path.join(this_directory, 'README.md'), encoding='utf-8') as f:
7 | long_description = f.read()
8 |
9 |
10 | setup(
11 | name="ukro_g2p",
12 | version="0.1.5",
13 | author="Kostiantyn Pylypenko",
14 | author_email="k.pylypenko@hotmail.com",
15 | description="NN based grapheme to phoneme model for Ukrainian language",
16 | license="MIT",
17 | keywords="Ukrainian grapheme to phoneme",
18 | url="https://github.com/kosti4ka/ukro_g2p",
19 | packages=find_packages(),
20 | long_description=long_description,
21 | long_description_content_type='text/markdown',
22 | include_package_data=True,
23 | classifiers=[
24 | "License :: OSI Approved :: MIT License",
25 | "Intended Audience :: Developers",
26 | "Intended Audience :: Science/Research",
27 | "Operating System :: OS Independent",
28 | "Programming Language :: Python",
29 | "Programming Language :: Python :: 3",
30 | "Topic :: Software Development :: Libraries",
31 | "Topic :: Software Development :: Libraries :: Python Modules",
32 | ],
33 | )
34 |
--------------------------------------------------------------------------------
/ukro_g2p/utils/util.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 | import json
3 | from collections import OrderedDict
4 | import Levenshtein
5 |
6 |
7 | def read_json(fname):
8 | fname = Path(fname)
9 | with fname.open('rt') as handle:
10 | return json.load(handle, object_hook=OrderedDict)
11 |
12 |
13 | def load_list(fname):
14 | fname = Path(fname)
15 |     return [x for x in fname.read_text(encoding='utf-8').split('\n') if x]
16 |
17 |
18 | def phoneme_error_rate(p_seq1, p_seq2):
19 | p_vocab = set(p_seq1 + p_seq2)
20 | p2c = dict(zip(p_vocab, range(len(p_vocab))))
21 | c_seq1 = [chr(p2c[p]) for p in p_seq1]
22 | c_seq2 = [chr(p2c[p]) for p in p_seq2]
23 | return Levenshtein.distance(''.join(c_seq1), ''.join(c_seq2)) / len(c_seq2)
24 |
25 |
26 | def dump(iterable, file_name, append=False):
27 |
28 | # init paths and make sure the dir to write in exists
29 | file_name = Path(file_name)
30 | out_dir = file_name.parent
31 | out_dir.mkdir(parents=True, exist_ok=True)
32 |
33 |     with open(file_name, 'a' if append else 'w', encoding='utf-8') as f:
34 |         # a single string is written as one line; any other iterable is written
35 |         # line by line, joining nested iterables with tabs
36 |         lines = [iterable] if isinstance(iterable, str) else iterable
37 |         for l in lines:
38 |             if not isinstance(l, str) and hasattr(l, '__iter__'):
39 |                 l = '\t'.join(str(ll) for ll in l)
40 |             f.write(str(l).rstrip('\r\n') + '\n')
41 |
--------------------------------------------------------------------------------
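
A worked example of `phoneme_error_rate`: each distinct phoneme is mapped to a single character so that `Levenshtein.distance` counts whole-phoneme edits, normalized by the length of the second (reference) sequence:

```python
from ukro_g2p.utils.util import phoneme_error_rate

hyp = ['F', 'AO', 'N']  # hypothesis
ref = ['F', 'AA', 'N']  # reference
print(phoneme_error_rate(hyp, ref))  # 1 substitution / 3 phonemes ~= 0.3333
```
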
/ukro_g2p/configs/ukro_base_uncased.config:
--------------------------------------------------------------------------------
1 | [VocabConfig]
2 | graphemes: _ ' - а б в г д е ж з и й к л м н о п р с т у ф х ц ч ш щ ъ ь ю я є і ї ґ pad
3 | phonemes: AA AA1 AO AO1 AOU B B2 B23 B3 CH CH2 CH23 CH3 D D3 DJ DJ3 DZ DZJ EH EH1 EIH EIY F F2 F23 F3 G G2 H H2 H3 IH IH1 IHE IY IY1 JH JH2 K K2 K23 K3 L L3 LJ LJ3 M M2 M23 M3 N N3 NJ NJ3 P P2 P23 P3 R R3 RJ RJ3 S S3 SH SH2 SH23 SH3 SJ SJ3 T T3 TJ TJ3 TS TS3 TSJ TSJ3 UH UH1 V V2 V23 WH WH2 X X2 X3 Y Y3 Z Z3 ZH ZH2 ZH23 ZH3 ZJ ZJ3 pad <bos> <eos>
4 | human_phonemes: а а́ о о́ оу б б` б`: б: ч ч` ч`: ч: д д: д’ д’: дз дз’ е е́ еи еі ф ф` ф`: ф: ґ ґ` г г` г: и и́ ие і і́ дж дж` к к` к`: к: л л: л’ л’: м м` м`: м: н н: н’ н’: п п` п`: п: р р: р’ р’: с с: ш ш` ш`: ш: с’ с’: т т: т’ т’: ц ц: ц’ ц’: у у́ в в` в`: ў ў` х х` х: й й` з з: ж ж` ж`: ж: з’ з’: pad <bos> <eos>
5 | padding: pad
6 | phoneme_bos: <bos>
7 | phoneme_eos: <eos>
8 |
9 | [EncoderConfig]
10 | encoder_d_embed: 256
11 | encoder_d_hidden: 256
12 | encoder_n_layers: 1
13 | encoder_bidirectional: False
14 |
15 | [DecoderConfig]
16 | decoder_d_embed: 256
17 | decoder_d_hidden: 256
18 | decoder_n_layers: 1
19 | attention: False
20 |
21 | [GeneratorConfig]
22 | beam_size: 3
23 | max_generate_len: 22
24 |
25 | [OptimizerConfig]
26 | lr: 1e-3
27 | weight_decay: 1e-5
28 |
--------------------------------------------------------------------------------
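
This file is consumed by `G2PConfig` (defined in `ukro_g2p/models/g2p_model.py` below), which flattens the sections into plain attributes. A minimal sketch, assuming it is run from the repo root:

```python
from ukro_g2p.models.g2p_model import G2PConfig

config = G2PConfig('ukro_g2p/configs/ukro_base_uncased.config')
print(config.encoder_d_embed)  # 256
print(config.beam_size)        # 3
print(config.decoder_padding_idx == config.phonemes.index('pad'))  # True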
/ukro_g2p/tokenization.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 | from ukro_g2p.models.g2p_model import G2PConfig
3 | from collections import namedtuple
4 |
5 | tokenizer_obj = namedtuple("tokenizer_obj", ["config_path"])
6 |
7 | pretrained_tokenizers = {
8 | "ukro-base-uncased": tokenizer_obj(
9 | config_path=Path(__file__).parent / "configs/ukro_base_uncased.config",
10 | )
11 | }
12 |
13 |
14 | class G2PTokenizer(object):
15 | def __init__(self, config):
16 |
17 | self.g2idx = {g: idx for idx, g in enumerate(config.graphemes)}
18 | self.idx2p = {idx: p for idx, p in enumerate(config.phonemes)}
19 | self.idx2h = {idx: h for idx, h in enumerate(config.human_phonemes)}
20 |
21 | def tokenize_graphemes(self, word):
22 | return list(word)
23 |
24 | def convert_graphemes_to_ids(self, graphemes):
25 | return [self.g2idx[g] for g in graphemes]
26 |
27 | def convert_ids_to_phonemes(self, ids):
28 |         return [self.idx2p[i] for i in ids if self.idx2p[i] not in ['<bos>', '<eos>']]
29 |
30 | def convert_ids_to_human_phonemes(self, ids):
31 |         return [self.idx2h[i] for i in ids if self.idx2h[i] not in ['<bos>', '<eos>']]
32 |
33 | @classmethod
34 | def from_pretrained(cls, tokenizer_name):
35 |
36 | if tokenizer_name not in pretrained_tokenizers:
37 |             raise ValueError(f'unknown tokenizer name: {tokenizer_name}')
38 |
39 | # load config
40 |         config = G2PConfig(pretrained_tokenizers[tokenizer_name].config_path)  # TODO add method from_file
41 | tokenizer = cls(config)
42 |
43 | return tokenizer
44 |
--------------------------------------------------------------------------------
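
A short round-trip sketch of the tokenizer API above:

```python
from ukro_g2p.tokenization import G2PTokenizer

tokenizer = G2PTokenizer.from_pretrained('ukro-base-uncased')
graphemes = tokenizer.tokenize_graphemes('фонетика')  # ['ф', 'о', 'н', ...]
ids = tokenizer.convert_graphemes_to_ids(graphemes)   # indices into the grapheme vocab
# phoneme ids produced by the model go back through
# tokenizer.convert_ids_to_phonemes(...) or convert_ids_to_human_phonemes(...)
```
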
/ukro_g2p/train.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 | from ukro_g2p.datasets.lexicon_datasets import LexiconDataset
4 | from ukro_g2p.trainer import Trainer
5 | from torch.optim import Adam
6 | import torch.nn as nn
7 |
8 | from ukro_g2p.models.g2p_model import G2PConfig, G2PModel
9 |
10 | if __name__ == '__main__':
11 | parser = argparse.ArgumentParser()
12 | parser.add_argument('-c', '--config', default=None, type=str,
13 | help='config file path (default: None)')
14 | parser.add_argument('-d', '--dataset', default=None, type=str,
15 | help='path to the dataset dir (default: None)')
16 | parser.add_argument('-n', '--experiment_name', default=None, type=str,
17 |                         help='experiment name (default: None)')
18 | parser.add_argument('-r', '--restore_epoch', default=-1, type=int,
19 | help='restore epoch number (default: -1)')
20 |
21 | args = parser.parse_args()
22 |
23 | config = G2PConfig(args.config)
24 | model = G2PModel(config)
25 | optimizer = Adam(model.parameters(), lr=config.lr, weight_decay=config.weight_decay)
26 | loss = nn.NLLLoss(ignore_index=config.decoder_padding_idx)
27 | datasets = {'train': LexiconDataset(args.dataset, split='train'),
28 | 'dev': LexiconDataset(args.dataset, split='dev')}
29 |
30 | trainer = Trainer(model, datasets, optimizer, loss, epochs=100, batch_size=256,
31 | experiment_name=args.experiment_name,
32 | logging_freq=10, restore_epoch=args.restore_epoch)
33 | trainer.train_and_validate()
34 |
--------------------------------------------------------------------------------
/ukro_g2p/predict.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import torch
3 | from torch.autograd import Variable
4 |
5 | from ukro_g2p.models.g2p_model import G2PModel, pretrained_models
6 | from ukro_g2p.tokenization import G2PTokenizer
7 |
8 |
9 | class G2P(object):
10 | def __init__(self, model_name):
11 | super().__init__()
12 |
13 | self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
14 |
15 | self.model = G2PModel.from_pretrained(model_name).to(self.device)
16 | self.tokenizer = G2PTokenizer.from_pretrained(model_name)
17 | self.model.eval()
18 |
19 | def __call__(self, word, human_readable=False):
20 |
21 | # map word's chars into graphemes idx
22 | graphemes = self.tokenizer.tokenize_graphemes(word)
23 | graphemes = self.tokenizer.convert_graphemes_to_ids(graphemes)
24 | g_length = [len(graphemes)]
25 |
26 | graphemes = Variable(torch.LongTensor(graphemes).unsqueeze(0)).to(self.device)
27 | g_length = Variable(torch.LongTensor(g_length)).to(self.device)
28 |
29 | phonemes = self.model(graphemes, g_length).tolist()[0]
30 |
31 | return self.tokenizer.convert_ids_to_phonemes(phonemes) if not human_readable else self.tokenizer.convert_ids_to_human_phonemes(phonemes)
32 |
33 |
34 | if __name__ == '__main__':
35 | # argument parser
36 | parser = argparse.ArgumentParser()
37 |
38 | # required
39 | parser.add_argument('word', type=str, help='Word to generate pronunciation for')
40 | parser.add_argument('-m', '--model_name', required=False, type=str, default='ukro-base-uncased',
41 | choices=pretrained_models.keys(),
42 | help='Trained model name')
43 | parser.add_argument('-hr', '--human_readable', required=False, action='store_true',
44 | help='Output human readable set of phonemes')
45 |
46 | # parse
47 | script_args = parser.parse_args()
48 |
49 |     g2p = G2P(script_args.model_name)
50 |     pron = g2p(script_args.word, human_readable=script_args.human_readable)
51 |
52 |     print(' '.join(pron))
53 |
--------------------------------------------------------------------------------
/ukro_g2p/test.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 | import torch
4 | from torch.utils.data import DataLoader
5 | from tqdm import tqdm
6 |
7 | from ukro_g2p.datasets.lexicon_datasets import LexiconDataset, _collate_fn
8 | from ukro_g2p.models.g2p_model import G2PConfig, G2PModel
9 | from ukro_g2p.utils.util import phoneme_error_rate
10 |
11 | from pathlib import Path
12 | from ukro_g2p.utils.util import dump
13 |
14 |
15 | def main(model, dataset, resume, out_dir):
16 |
17 | # init paths
18 | out_dir = Path(out_dir)
19 | ref_path = out_dir / 'ref'
20 | hyp_path = out_dir / 'hyp'
21 | scores_path = out_dir / 'scores'
22 |
23 | # make dir and files
24 | out_dir.mkdir(parents=True, exist_ok=True)
25 | try:
26 | ref_path.unlink()
27 | hyp_path.unlink()
28 | scores_path.unlink()
29 |     except FileNotFoundError:
30 | pass
31 |
32 | # setup data_loader instances
33 | data_loader = DataLoader(dataset, batch_size=1, shuffle=False, collate_fn=_collate_fn, num_workers=1)
34 |
35 | # load model weights
36 | checkpoint = torch.load(resume, map_location=lambda storage, loc: storage)
37 | model.load_state_dict(checkpoint)
38 |
39 | # prepare model for testing
40 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
41 | model = model.to(device)
42 | model.eval()
43 |
44 |     # init metric values
45 | avr_per = 0
46 | total_phonemes_length = 0
47 | num_batches = len(data_loader)
48 |
49 | with torch.no_grad():
50 | for batch_idx, batch in tqdm(enumerate(data_loader), total=num_batches):
51 | graphemes, graphemes_length, phonemes, _ = batch
52 |
53 | graphemes = graphemes.to(device)
54 | graphemes_length = graphemes_length.to(device)
55 | phonemes = phonemes.to(device)
56 |
57 | phonemes_predictions = model(graphemes, graphemes_length).tolist()
58 | phonemes_targets = phonemes[:, 1:].contiguous().tolist()
59 |
60 | for predictions, targets, input_graphemes in zip(phonemes_predictions, phonemes_targets, graphemes):
61 | targets_length = len(targets)
62 | per = phoneme_error_rate(predictions, targets)
63 | avr_per += per * targets_length
64 | total_phonemes_length += targets_length
65 |
66 | # saving hyp, ref and scores
67 |                 graphemes_str = ''.join([g for g in dataset.idx2graphemes(input_graphemes.tolist()) if g not in ['pad', '<bos>', '<eos>']])
68 |                 predictions_str = ' '.join([p for p in dataset.idx2phonemes(predictions) if p not in ['<bos>', '<eos>']])
69 |                 targets_str = ' '.join([p for p in dataset.idx2phonemes(targets) if p not in ['<bos>', '<eos>']])
70 | dump(f'{graphemes_str}\t{predictions_str}', hyp_path, append=True)
71 | dump(f'{graphemes_str}\t{targets_str}', ref_path, append=True)
72 | dump(f'{round(per, 4)}\t{targets_str}\t{predictions_str}', scores_path, append=True)
73 |
74 | avr_per /= total_phonemes_length
75 | log_str = f'Phoneme Error Rate: {round(avr_per, 4)}'
76 | dump(log_str, scores_path, append=True)
77 | print(log_str)
78 |
79 |
80 | if __name__ == '__main__':
81 | parser = argparse.ArgumentParser()
82 | parser.add_argument('-c', '--config', default=None, type=str,
83 | help='config file path (default: None)')
84 | parser.add_argument('-d', '--dataset', default=None, type=str,
85 | help='path to the dataset dir (default: None)')
86 | parser.add_argument('-r', '--resume', default=None, type=str,
87 | help='path to latest checkpoint (default: None)')
88 | parser.add_argument('-o', '--out_dir', default=None, type=str,
89 | help='path to the output dir (default: None)')
90 |
91 | args = parser.parse_args()
92 |
93 | config = G2PConfig(args.config)
94 | model = G2PModel(config)
95 | dataset = LexiconDataset(args.dataset, split='test')
96 |
97 | main(model, dataset, args.resume, args.out_dir)
98 |
--------------------------------------------------------------------------------
/ukro_g2p/datasets/lexicon_datasets.py:
--------------------------------------------------------------------------------
1 | from torch.utils.data import Dataset
2 | from pathlib import Path
3 | from ukro_g2p.utils.lexicon_util import read_lexicon_dataset
4 | from ukro_g2p.utils.util import load_list
5 | import torch
6 | import numpy as np
7 |
8 | # TODO put this into config
9 | GRAPHEMES_PADDING = 37
10 | PHONEMES_PADDING = 99
11 |
12 |
13 | class LexiconDataset(Dataset):
14 | def __init__(self, data_root, split='train'):
15 | self.data_root = Path(data_root)
16 | self.lexicon_dataset_path = self.data_root / 'data.txt'
17 | if split in ['train', 'dev', 'test']:
18 | self.data_keys_path = self.data_root / f'{split}_set'
19 | else:
20 |             raise KeyError(f'unknown split: {split}')
21 |         # TODO take it from config file
22 | self.phonemes_path = self.data_root / 'phones'
23 | self.letters_path = self.data_root / 'letters'
24 |
25 | self.lexicon = read_lexicon_dataset(self.lexicon_dataset_path)
26 | self.data_keys = load_list(self.data_keys_path)
27 | self.phoneme2idx = {p.split()[1]: int(p.split()[0]) for p in load_list(self.phonemes_path)}
28 | self.phoneme2idx['pad'] = len(self.phoneme2idx)
29 |         self.phoneme2idx['<bos>'] = len(self.phoneme2idx)
30 |         self.phoneme2idx['<eos>'] = len(self.phoneme2idx)
31 | self.idx2phoneme = dict((v, k) for k, v in self.phoneme2idx.items())
32 | self.grapheme2idx = {p.split()[1]: int(p.split()[0]) for p in load_list(self.letters_path)}
33 | self.grapheme2idx['pad'] = len(self.grapheme2idx)
34 |         self.grapheme2idx['<bos>'] = len(self.grapheme2idx)
35 |         self.grapheme2idx['<eos>'] = len(self.grapheme2idx)
36 | self.idx2grapheme = dict((v, k) for k, v in self.grapheme2idx.items())
37 |
38 | def __getitem__(self, idx):
39 | key = self.data_keys[idx]
40 | datapoint = self.lexicon[key]
41 |
42 | graphemes = datapoint.split()[0]
43 | phonemes = datapoint.split()[1:]
44 |
45 | graphemes_idx = self.graphemes2idx(graphemes)
46 | phonemes_idx = self.phonemes2idx(phonemes)
47 | return graphemes_idx, phonemes_idx
48 |
49 | def __len__(self):
50 | return len(self.data_keys)
51 |
52 | def graphemes2idx(self, graphemes):
53 |         # graphemes_idx = [self.grapheme2idx['<bos>']]
54 | graphemes_idx = [self.grapheme2idx[g] for g in graphemes]
55 | return graphemes_idx
56 |
57 | def phonemes2idx(self, phonemes):
58 |         phonemes_idx = [self.phoneme2idx['<bos>']]
59 |         phonemes_idx.extend([self.phoneme2idx[p] for p in phonemes])
60 |         phonemes_idx.append(self.phoneme2idx['<eos>'])
61 | return phonemes_idx
62 |
63 | def idx2graphemes(self, idx):
64 | return [self.idx2grapheme[g] for g in idx]
65 |
66 | def idx2phonemes(self, idx):
67 | return [self.idx2phoneme[p] for p in idx]
68 |
69 |
70 | def _collate_fn(batch):
71 | """
72 | Merges list of samples to form a mini-batch.
73 | Pads input sequences to longest inputs sequence in the batch
74 | Pads all target sequences to longest sequence in mini-batch with constraint
75 |
76 | Args:
77 | batch:
78 |
79 | Returns:
80 |
81 | """
82 |
83 | graphemes_lengths = list(map(lambda x: len(x[0]), batch))
84 | phonemes_lengths = list(map(lambda x: len(x[1]), batch))
85 | max_word_length = max(graphemes_lengths)
86 | max_phonemes_length = max(phonemes_lengths)
87 | batch_size = len(batch)
88 |
89 | graphemes_inputs = torch.LongTensor(batch_size, max_word_length).zero_() + GRAPHEMES_PADDING
90 | phonemes_targets = torch.LongTensor(batch_size, max_phonemes_length).zero_() + PHONEMES_PADDING
91 | graphemes_length = torch.LongTensor(batch_size).zero_()
92 | phonemes_length = torch.LongTensor(batch_size).zero_()
93 |
94 | for x in range(batch_size):
95 | sample = batch[x]
96 | graphemes, phonemes = sample
97 |
98 | graphemes = torch.from_numpy(np.asarray(graphemes)).long()
99 |
100 | phonemes = torch.from_numpy(np.asarray(phonemes)).long()
101 |
102 | graphemes_seq_length = graphemes.size(0)
103 | phonemes_seq_length = phonemes.size(0)
104 |
105 | graphemes_inputs[x].narrow(0, 0, graphemes_seq_length).copy_(graphemes)
106 | graphemes_length[x] = graphemes_seq_length
107 | phonemes_length[x] = phonemes_seq_length
108 |
109 | phonemes_targets[x].narrow(0, 0, phonemes_seq_length).copy_(phonemes)
110 |
111 | return graphemes_inputs, graphemes_length, phonemes_targets, phonemes_length
112 |
--------------------------------------------------------------------------------
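
A small sketch of what `_collate_fn` returns; the index values are toy numbers, not real entries from the `letters`/`phones` files:

```python
from ukro_g2p.datasets.lexicon_datasets import _collate_fn

# two (graphemes_idx, phonemes_idx) samples of unequal length
batch = [([4, 5, 6], [100, 10, 12, 101]),
         ([7, 8], [100, 20, 101])]
graphemes, g_len, phonemes, p_len = _collate_fn(batch)
print(graphemes)  # 2x3 LongTensor, second row padded with GRAPHEMES_PADDING (37)
print(p_len)      # tensor([4, 3])
```
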
/ukro_g2p/models/modules.py:
--------------------------------------------------------------------------------
1 | import torch.nn as nn
2 | import torch.nn.functional as F
3 | import torch
4 |
5 | # TODO remove config, pass parameters explicitly
6 |
7 |
8 | class Encoder(nn.Module):
9 |
10 | def __init__(self, config):
11 | super(Encoder, self).__init__()
12 |
13 | self.vocab_size = config.encoder_vocab_size
14 | self.padding_idx = config.encoder_padding_idx
15 |
16 | self.d_embed = config.encoder_d_embed
17 | self.d_hidden = config.encoder_d_hidden
18 | self.num_layers = config.encoder_n_layers
19 | self.bidirectional = config.encoder_bidirectional
20 |
21 | self.embedding = nn.Embedding(self.vocab_size, self.d_embed, padding_idx=self.padding_idx)
22 | self.lstm = nn.LSTM(self.d_embed, self.d_hidden // 2 if self.bidirectional else self.d_hidden, self.num_layers,
23 | batch_first=True, bidirectional=self.bidirectional)
24 |
25 | def forward(self, x, x_length):
26 |
27 | x = self.embedding(x) # B x T x D
28 | x = nn.utils.rnn.pack_padded_sequence(x, x_length, batch_first=True, enforce_sorted=False)
29 |
30 | out, hc = self.lstm(x)
31 |
32 | out, _ = nn.utils.rnn.pad_packed_sequence(out, batch_first=True)
33 |
34 | if self.bidirectional:
35 | # stacking hidden and cell from forward and backward layers
36 | hc = tuple(torch.cat((hc_[0::2, :, :], hc_[1::2, :, :]), 2) for hc_ in hc)
37 |
38 | return out, hc
39 |
40 |
41 | class Decoder(nn.Module):
42 |
43 | def __init__(self, config):
44 | super(Decoder, self).__init__()
45 |
46 | self.vocab_size = config.decoder_vocab_size
47 | self.padding_idx = config.decoder_padding_idx
48 |
49 | self.d_embed = config.decoder_d_embed
50 | self.d_hidden = config.decoder_d_hidden
51 | self.num_layers = config.decoder_n_layers
52 |
53 | self.embedding = nn.Embedding(self.vocab_size, self.d_embed, padding_idx=self.padding_idx)
54 | self.lstm = nn.LSTM(self.d_embed, self.d_hidden, self.num_layers, batch_first=True)
55 | if config.attention:
56 | self.attn = Attention(self.d_hidden)
57 | else:
58 | self.attn = None
59 |
60 | self.linear = nn.Linear(self.d_hidden, self.vocab_size)
61 |
62 | def forward(self, y, y_length, hc, context=None):
63 |
64 | batch_size, seq_len = y.size()
65 |
66 | y = self.embedding(y) # B x T x D
67 | y = nn.utils.rnn.pack_padded_sequence(y, y_length, batch_first=True, enforce_sorted=False)
68 |
69 | out, hc = self.lstm(y, hc)
70 |
71 | out, _ = nn.utils.rnn.pad_packed_sequence(out, batch_first=True)
72 |
73 | if self.attn:
74 | out = self.attn(out, context)
75 |
76 | out = out.contiguous()
77 |
78 | out = self.linear(out.view(-1, out.size(2)))
79 |
80 | return F.log_softmax(out, dim=1).view(batch_size, -1, out.size(1)), hc
81 |
82 |
83 | class Attention(nn.Module):
84 | """Dot global attention from https://arxiv.org/abs/1508.04025"""
85 |
86 | def __init__(self, dim):
87 | super(Attention, self).__init__()
88 | self.linear = nn.Linear(dim * 2, dim, bias=False)
89 |
90 | def forward(self, x, context):
91 |
92 | batch_size, seq_len, _ = x.size()
93 |
94 | attn = F.softmax(x.bmm(context.transpose(1, 2)), dim=2)
95 | weighted_context = attn.bmm(context)
96 |
97 | o = self.linear(torch.cat((x, weighted_context), 2).view(batch_size * seq_len, -1))
98 | return torch.tanh(o).view(batch_size, seq_len, -1)
99 |
100 |
101 | class Beam(object):
102 | """Ordered beam of candidate outputs"""
103 |
104 | def __init__(self, config):
105 | """Initialize params"""
106 | self.size = config.beam_size
107 | self.done = False
108 | self.pad = config.decoder_padding_idx
109 | self.bos = config.decoder_bos_idx
110 | self.eos = config.decoder_eos_idx
111 | self.tt = torch.cuda if config.use_cuda else torch
112 |
113 | # The score for each translation on the beam.
114 | self.scores = self.tt.FloatTensor(self.size).zero_()
115 |
116 | # The backpointers at each time-step.
117 | self.prevKs = []
118 |
119 | # The outputs at each time-step.
120 | self.nextYs = [self.tt.LongTensor(self.size).fill_(self.pad)]
121 | self.nextYs[0][0] = self.bos
122 |
123 | # Get the outputs for the current timestep.
124 | def get_current_state(self):
125 | """Get state of beam."""
126 | return self.nextYs[-1]
127 |
128 | # Get the backpointers for the current timestep.
129 | def get_current_origin(self):
130 | """Get the backpointer to the beam at this step."""
131 | return self.prevKs[-1]
132 |
133 |     def advance(self, word_lk):
134 |         """Advance the beam."""
135 |         num_words = word_lk.size(1)
136 |
137 |         # Sum the previous scores.
138 |         if len(self.prevKs) > 0:
139 |             beam_lk = word_lk + self.scores.unsqueeze(1).expand_as(word_lk)
140 |         else:
141 |             beam_lk = word_lk[0]
142 |
143 |         flat_beam_lk = beam_lk.view(-1)
144 |
145 | best_scores, best_scores_id = flat_beam_lk.topk(self.size, 0, True, True)
146 | self.scores = best_scores
147 |
148 | # best_scores_id is flattened beam x word array, so calculate which
149 | # word and beam each score came from
150 | prev_k = best_scores_id // num_words
151 | self.prevKs.append(prev_k)
152 | self.nextYs.append(best_scores_id - prev_k * num_words)
153 | # End condition is when n-best are EOS.
154 | if self.nextYs[-1][0] == self.eos:
155 | self.done = True
156 | return self.done
157 |
158 | def get_hyp(self, k):
159 | """Get hypotheses."""
160 | hyp = []
161 | for j in range(len(self.prevKs) - 1, -1, -1):
162 | hyp.append(self.nextYs[j + 1][k])
163 | k = self.prevKs[j][k]
164 | return hyp[::-1]
165 |
--------------------------------------------------------------------------------
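
The `Attention` module above can be exercised standalone. A minimal shape check with arbitrary dimensions:

```python
import torch
from ukro_g2p.models.modules import Attention

attn = Attention(8)
x = torch.randn(2, 5, 8)        # decoder outputs: batch x T_dec x dim
context = torch.randn(2, 7, 8)  # encoder outputs: batch x T_enc x dim
print(attn(x, context).shape)   # torch.Size([2, 5, 8])
```
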
/ukro_g2p/models/g2p_model.py:
--------------------------------------------------------------------------------
1 | import torch.nn as nn
2 | import torch
3 | from torch.autograd import Variable
4 | import numpy as np
5 | from ukro_g2p.models.modules import Encoder, Decoder
6 | from ukro_g2p.models.modules import Beam
7 | from pathlib import Path
8 | import configparser
9 | from collections import namedtuple
10 | from torch.utils import model_zoo
11 |
12 | model_obj = namedtuple("model_obj", ["url", "config_path"])
13 |
14 | pretrained_models = {
15 | "ukro-base-uncased": model_obj(
16 | url="https://github.com/kosti4ka/ukro_g2p/releases/download/ukro_base_uncased_v.0.1/ukro_base_uncased-epoch-99-d545c0d.th",
17 | config_path=Path(__file__).parent / "../configs/ukro_base_uncased.config",
18 | )
19 | }
20 |
21 |
22 | class G2PConfig(dict):
23 |
24 | def __init__(self, model_config_file):
25 | super(G2PConfig, self).__init__()
26 |
27 | self.use_cuda = False
28 | self.model_path = None
29 |
30 | # reading config file
31 | config_file = configparser.ConfigParser()
32 | config_file.read(model_config_file, encoding='utf8')
33 |
34 | self.padding = config_file['VocabConfig']['padding']
35 | # TODO simplify this part - use same bos, eos symbol for both phonemes and graphemes
36 | self.decoder_bos = config_file['VocabConfig']['phoneme_bos']
37 | self.decoder_eos = config_file['VocabConfig']['phoneme_eos']
38 | # reading graphemes
39 | self.graphemes = config_file['VocabConfig']['graphemes'].split()
40 | self.encoder_vocab_size = len(self.graphemes)
41 | self.encoder_padding_idx = self.graphemes.index(self.padding)
42 | # reading phonemes
43 | self.phonemes = config_file['VocabConfig']['phonemes'].split()
44 | self.decoder_vocab_size = len(self.phonemes)
45 | self.decoder_padding_idx = self.phonemes.index(self.padding)
46 | self.decoder_bos_idx = self.phonemes.index(self.decoder_bos)
47 | self.decoder_eos_idx = self.phonemes.index(self.decoder_eos)
48 | # reading human phonemes
49 | self.human_phonemes = config_file['VocabConfig']['human_phonemes'].split()
50 |
51 | # encoder config
52 | self.encoder_d_embed = int(config_file['EncoderConfig']['encoder_d_embed'])
53 | self.encoder_d_hidden = int(config_file['EncoderConfig']['encoder_d_hidden'])
54 | self.encoder_n_layers = int(config_file['EncoderConfig']['encoder_n_layers'])
55 |         self.encoder_bidirectional = config_file['EncoderConfig']['encoder_bidirectional'].lower() == 'true'
56 |
57 | # decoder config
58 | self.decoder_d_embed = int(config_file['DecoderConfig']['decoder_d_embed'])
59 | self.decoder_d_hidden = int(config_file['DecoderConfig']['decoder_d_hidden'])
60 | self.decoder_n_layers = int(config_file['DecoderConfig']['decoder_n_layers'])
61 |         self.attention = config_file['DecoderConfig']['attention'].lower() == 'true'
62 |
63 | # generator
64 | self.beam_size = int(config_file['GeneratorConfig']['beam_size'])
65 | self.max_generate_len = int(config_file['GeneratorConfig']['max_generate_len'])
66 |
67 | # optimizer
68 | self.lr = float(config_file['OptimizerConfig']['lr'])
69 | self.weight_decay = float(config_file['OptimizerConfig']['weight_decay'])
70 |
71 |
72 | class PreTrainedG2PModel(nn.Module):
73 | def __init__(self, config):
74 | super(PreTrainedG2PModel, self).__init__()
75 | self.config = config
76 |
77 | @classmethod
78 | def from_pretrained(cls, model_name):
79 |
80 | if model_name not in pretrained_models:
81 |             raise ValueError(f'unknown model name: {model_name}')
82 |
83 | # load config
84 |         config = G2PConfig(pretrained_models[model_name].config_path)  # TODO add method from_file
85 |
86 | # instantiate model
87 | model = cls(config)
88 |
89 | # loading weights
90 | state_dict = model_zoo.load_url(pretrained_models[model_name].url,
91 | progress=True, map_location=lambda storage, loc: storage)
92 | model.load_state_dict(state_dict)
93 |
94 | return model
95 |
96 |
97 | class G2PModel(PreTrainedG2PModel):
98 |
99 | def __init__(self, config):
100 | super(G2PModel, self).__init__(config)
101 |
102 | # init
103 | self.config = config
104 | self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
105 |
106 | # encoder
107 | self.encoder = Encoder(config)
108 |
109 | # decoder
110 | self.decoder = Decoder(config)
111 | self.attention = config.attention
112 |
113 | # generator
114 | self.beam_size = config.beam_size
115 | self.max_generate_len = config.max_generate_len
116 |
117 | def forward(self, x, x_length, y=None, p_length=None, n_best=1):
118 |         # TODO rewrite description
119 |
120 | encoder_out, encoder_hc = self.encoder(x, x_length)
121 |
122 | if y is not None:
123 | out = self.decoder(y, p_length, encoder_hc, context=encoder_out if self.attention else None)
124 | else:
125 | out = self._generate(encoder_hc, context=encoder_out if self.attention else None)
126 |
127 | return out
128 |
129 | def _generate(self, hc, context=None):
130 | beam = Beam(self.config)
131 | h, c = hc
132 | # Make a beam_size batch.
133 | h = h.expand(h.size(1), beam.size, h.size(2))
134 | c = c.expand(c.size(1), beam.size, c.size(2))
135 | if context is not None:
136 | context = context.expand(beam.size, context.size(1), context.size(2))
137 | p_length = Variable(torch.from_numpy(np.array([1])))
138 | p_length = p_length.expand(beam.size).to(self.device)
139 |
140 | for i in range(self.max_generate_len):
141 | x = beam.get_current_state()
142 | o, hc = self.decoder(Variable(x.unsqueeze(1)).to(self.device), p_length, (h, c), context=context)
143 | if beam.advance(o.data.squeeze(1)):
144 | break
145 | h, c = hc
146 | h.data.copy_(h.data.index_select(1, beam.get_current_origin()))
147 | c.data.copy_(c.data.index_select(1, beam.get_current_origin()))
148 | return torch.LongTensor(beam.get_hyp(0)).unsqueeze(0)
149 |
150 |
--------------------------------------------------------------------------------
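
A minimal sketch of loading the published checkpoint through `from_pretrained`; on first use this downloads the `.th` weights from the GitHub release URL above:

```python
from ukro_g2p.models.g2p_model import G2PModel

model = G2PModel.from_pretrained('ukro-base-uncased')
model.eval()
# with y=None, forward() runs beam-search generation; see ukro_g2p/predict.py
# for the full word -> grapheme ids -> phoneme ids pipeline
```
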
/README.md:
--------------------------------------------------------------------------------
1 | # Ukrainian G2P model
2 |
3 | ## Installation
4 |
5 | `pip install -U ukro-g2p`
6 |
7 | ## Example inference
8 |
9 | ```python
10 | from ukro_g2p.predict import G2P
11 |
12 | g2p = G2P('ukro-base-uncased')
13 |
14 | # ARPABET-like format
15 | g2p('фонетика')
16 |
17 | # human-readable format
18 | g2p('фонетика', human_readable=True)
19 | ```
20 |
21 | Jupyter notebook with the example: [Open in Colab](https://colab.research.google.com/drive/1bptBFKWtuBVVbAA_e_WB5tL-N4tJ4xyq#scrollTo=JGG5NcltvXTx?usp=sharing)
22 |
23 | ## Web app
24 | https://ukro-g2p.herokuapp.com
25 |
26 | Code for the web app: https://github.com/kosti4ka/ukro_g2p_demo
27 |
28 | ## Ukrainian phonology symbols
29 |
30 | ### Vowels (Голосні)
31 |
32 | | Ukrainian | ARPABET-like |
33 | |-----------|--------------|
34 | | [і] | IY |
35 | | [и] | IH |
36 | | [е] | EH |
37 | | [у] | UH |
38 | | [о] | AO |
39 | | [а] | AA |
40 |
41 | **Approximations (Наближення)**
42 |
43 | | Ukrainian | ARPABET-like |
44 | |-----------|--------------|
45 | | [еи] | EIH |
46 | | [еі] | EIY |
47 | | [ие] | IHE |
48 | | [оу] | AOU |
49 |
50 | **Stress (Наголос)**
51 |
52 | | Ukrainian | ARPABET-like |
53 | |-----------|--------------|
54 | | [ ́ ] | 1 |
55 |
56 | ### Consonants (Приголосні)
57 |
58 | | Ukrainian | ARPABET-like |
59 | |-----------|--------------|
60 | | [б] | B |
61 | | [в] | V |
62 | | [г] | H |
63 | | [д] | D |
64 | | [дж] | JH |
65 | | [дз] | DZ |
66 | | [ж] | ZH |
67 | | [з] | Z |
68 | | [й] | Y |
69 | | [к] | K |
70 | | [л] | L |
71 | | [м] | M |
72 | | [н] | N |
73 | | [п] | P |
74 | | [р] | R |
75 | | [с] | S |
76 | | [т] | T |
77 | | [ф] | F |
78 | | [х] | X |
79 | | [ц] | TS |
80 | | [ч] | CH |
81 | | [ш] | SH |
82 | | [ґ] | G |
83 | | [ў] | WH |
84 |
85 | **Palatalized (М'які)**
86 |
87 | | Ukrainian | ARPABET-like |
88 | |-----------|--------------|
89 | | [д’] | DJ |
90 | | [дз’] | DZJ |
91 | | [з’] | ZJ |
92 | | [л’] | LJ |
93 | | [н’] | NJ |
94 | | [р’] | RJ |
95 | | [с’] | SJ |
96 | | [т’] | TJ |
97 | | [ц’] | TSJ |
98 |
99 | **Semi-palatalization (Пом'якшення)**
100 |
101 | | Ukrainian | ARPABET-like |
102 | |-----------|--------------|
103 | | [`] | 2 |
104 |
105 | **Lengthening (Подовження)**
106 |
107 | | Ukrainian | ARPABET-like |
108 | |-----------|--------------|
109 | | [:] | 3 |
110 |
--------------------------------------------------------------------------------
/ukro_g2p/trainer.py:
--------------------------------------------------------------------------------
1 | import time
2 | from collections import OrderedDict
3 |
4 | from pathlib import Path
5 |
6 | import torch
7 | from torch.autograd import Variable
8 | from torch.utils.data import DataLoader
9 | from ukro_g2p.datasets.lexicon_datasets import _collate_fn
10 | from torch.optim.lr_scheduler import ReduceLROnPlateau
11 |
12 | # from text import text_to_sequence
13 | from ukro_g2p.utils import infolog
14 | from tensorboardX import SummaryWriter
15 | from tqdm import tqdm
16 |
17 | log = infolog.log
18 |
19 |
20 | class Trainer(object):
21 | def __init__(self, model, datasets, optimizer, loss, epochs, batch_size, experiment_name, logging_freq,
22 | restore_epoch=-1):
23 |
24 | self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
25 | self.epochs = epochs
26 | self.experiment_name = experiment_name
27 | self.logging_freq = logging_freq
28 | self.model = model.to(self.device)
29 | self.restore_epoch = restore_epoch
30 |
31 | self.optimizer = optimizer
32 | self.loss = loss
33 | # TODO remove scheduler from trainer object
34 | self.scheduler = ReduceLROnPlateau(self.optimizer, mode='min', factor=0.1, patience=5, verbose=True)
35 |
36 | self.global_batch_index = 0
37 | self.global_batch_index_dev = 0
38 | self.datasets = datasets
39 | self.dataloaders = OrderedDict()
40 | for data_name in datasets:
41 | self.dataloaders[data_name] = DataLoader(datasets[data_name], batch_size=batch_size,
42 |                                                      shuffle=(data_name == 'train'),
43 | collate_fn=_collate_fn,
44 | num_workers=8)
45 |
46 |
47 | self.experiment_dir = Path('./exp') / self.experiment_name
48 | self.log_dir = self.experiment_dir / 'logs'
49 | self.log_path = self.log_dir / 'log.txt'
50 | self.checkpoint_dir = self.experiment_dir / 'checkpoints'
51 |
52 | self.experiment_dir.mkdir(parents=True, exist_ok=True)
53 | self.log_dir.mkdir(parents=True, exist_ok=True)
54 | self.checkpoint_dir.mkdir(parents=True, exist_ok=True)
55 |
56 | # initialize logger
57 | infolog.init(self.log_path, self.experiment_name)
58 | # initialize TensorBoard writer
59 | self.writer = SummaryWriter(log_dir=self.log_dir)
60 |
61 | if self.restore_epoch != -1:
62 | self.load_from_checkpoint(epoch=self.restore_epoch)
63 |
64 | def train_and_validate(self):
65 | for e in range(self.restore_epoch + 1, self.epochs):
66 | for ds, dataloader in self.dataloaders.items():
67 | if ds != 'train':
68 | self.model.eval() # turn off batchnorm /dropout
69 | self.run_epoch(dataloader, dataset_name=ds, epoch=e)
70 | self.model.train() # turn on batchnorm /dropout
71 | self.save_to_checkpoint(epoch=e)
72 |
73 | def run_epoch(self, dataloader, dataset_name, epoch):
74 | begin_time = time.time()
75 | train = dataset_name == 'train'
76 | num_batches = len(dataloader)
77 | avg_loss = 0
78 |
79 | progress_bar = tqdm(total=num_batches)
80 | for batch_idx, batch in enumerate(dataloader):
81 | word_inputs, input_length, pron_targets, p_length = batch
82 |
83 | word_inputs = Variable(word_inputs).to(self.device)
84 | input_length = Variable(input_length).to(self.device)
85 | pron_targets = Variable(pron_targets).to(self.device)
86 | p_length = Variable(p_length).to(self.device)
87 |
88 | p_preds, _ = self.model(word_inputs, input_length, pron_targets[:, :-1], p_length - 1)
89 | targets = pron_targets[:, 1:].contiguous()
90 |
91 | loss = self.loss(p_preds.view(p_preds.size(0) * p_preds.size(1), -1),
92 | targets.view(targets.size(0) * targets.size(1)))
93 |
94 | loss_floats = loss.data.cpu().item()
95 | avg_loss += loss_floats
96 |
97 | progress_bar.set_description(f'{dataset_name}: {epoch}/{self.epochs}, current loss: {round(loss_floats,4)}')
98 | progress_bar.refresh()
99 | progress_bar.update()
100 |
101 | if train:
102 | self.global_batch_index += 1
103 | self.writer.add_scalar(dataset_name, loss_floats, self.global_batch_index)
104 | self.optimizer.zero_grad()
105 | loss.backward()
106 | self.optimizer.step()
107 | # if (batch_idx + 1) % self.logging_freq == 0:
108 | # log_str = self.construct_logging_str(loss_floats, epoch, num_batches, batch_idx + 1)
109 | # log(log_str)
110 | else:
111 |                 # TODO this is an ugly part of the code; should add a global step explicitly
112 | self.global_batch_index_dev += 1
113 | self.writer.add_scalar(dataset_name, loss_floats, self.global_batch_index_dev)
114 |
115 | progress_bar.close()
116 |
117 | # add generated text to tensor board
118 | if not train:
119 | graphemes, graphemes_length, phonemes, _ = next(iter(dataloader))
120 |
121 | graphemes = graphemes.to(self.device)
122 | graphemes_length = graphemes_length.to(self.device)
123 | phonemes = phonemes.to(self.device)
124 |
125 | # phonemes_predictions = self.model(graphemes, graphemes_length).tolist()
126 | phonemes_targets = phonemes[:, 1:].contiguous().tolist()
127 |
128 | text = ''
129 | for idx in range(0, 5):
130 | phonemes_predictions = self.model(graphemes[idx].unsqueeze(0), graphemes_length[idx].unsqueeze(0)).tolist()[0]
131 |
132 |                 graphemes_str = ''.join(
133 |                     [g for g in self.datasets[dataset_name].idx2graphemes(graphemes[idx].tolist()) if g not in ['pad', '<bos>', '<eos>']])
134 |                 predictions_str = ' '.join(
135 |                     [p for p in self.datasets[dataset_name].idx2phonemes(phonemes_predictions) if p not in ['<bos>', '<eos>']])
136 |                 targets_str = ' '.join(
137 |                     [p for p in self.datasets[dataset_name].idx2phonemes(phonemes_targets[idx]) if p not in ['<bos>', '<eos>', 'pad']])
138 |
139 | text = text + graphemes_str + ' \n' + targets_str + ' \n' + predictions_str + ' \n'
140 |
141 | self.writer.add_text('Text', text, epoch)
142 |
143 | avg_loss /= num_batches
144 | end_time = time.time()
145 | if train:
146 | self.scheduler.step(avg_loss)
147 | log_str = 'Epoch: {}, {} loss: {:.5f} time: {:.2f} sec'
148 | log_str = log_str.format(epoch, dataset_name, avg_loss, end_time - begin_time)
149 | log(log_str)
150 |
151 | def load_from_checkpoint(self, epoch):
152 | checkpoint_file = f'{self.experiment_name}-epoch-{epoch}.th'
153 | checkpoint_path = self.checkpoint_dir / checkpoint_file
154 | self.model.load_state_dict(torch.load(checkpoint_path, map_location=lambda storage, loc: storage))
155 | print('model loaded from checkpoint: {}'.format(checkpoint_file))
156 |
157 | def save_to_checkpoint(self, epoch):
158 | checkpoint_file = f'{self.experiment_name}-epoch-{epoch}.th'
159 | checkpoint_path = self.checkpoint_dir / checkpoint_file
160 | if self.checkpoint_dir.exists():
161 | torch.save(self.model.state_dict(), checkpoint_path)
162 | print(f'model saved to checkpoint: {checkpoint_path}')
163 | else:
164 |             raise FileNotFoundError(f'checkpoint dir does not exist: {self.checkpoint_dir}')
165 |
166 | @staticmethod
167 | def construct_logging_str(loss, epoch, total_batches, idx):
168 | tmpstr = 'Epoch:{:2} Batch:[{:3}/{:3}] Loss: {:.4f}'
169 | tmpstr = tmpstr.format(epoch, idx, total_batches, loss)
170 | return tmpstr
171 |
--------------------------------------------------------------------------------