├── ukro_g2p
│   ├── models
│   │   ├── __init__.py
│   │   ├── modules.py
│   │   └── g2p_model.py
│   ├── utils
│   │   ├── __init__.py
│   │   ├── lexicon_util.py
│   │   ├── infolog.py
│   │   └── util.py
│   ├── datasets
│   │   ├── __init__.py
│   │   └── lexicon_datasets.py
│   ├── __init__.py
│   ├── configs
│   │   └── ukro_base_uncased.config
│   ├── tokenization.py
│   ├── train.py
│   ├── predict.py
│   ├── test.py
│   └── trainer.py
├── .gitignore
├── .gitattributes
├── MANIFEST.in
├── setup.py
└── README.md

--------------------------------------------------------------------------------
/ukro_g2p/models/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/ukro_g2p/utils/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/ukro_g2p/datasets/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/ukro_g2p/__init__.py:
--------------------------------------------------------------------------------
__version__ = "0.1.5"

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
/data/
/exp/
/trained_models/

--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
*.th filter=lfs diff=lfs merge=lfs -text

--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
include ukro_g2p/configs/ukro_base_uncased.config

--------------------------------------------------------------------------------
/ukro_g2p/utils/lexicon_util.py:
--------------------------------------------------------------------------------
from ukro_g2p.utils.util import load_list


def read_lexicon_dataset(lexicon_dataset_path):
    """
    Read a lexicon file into a dict. Each line is expected in the format
    "id word pronunciation"; the id becomes the key and the rest the value.
    :param lexicon_dataset_path: path to the lexicon dataset text file
    :return: lexicon dictionary
    """

    lexicon = {x.split()[0]: ' '.join(x.split()[1:]) for x in load_list(lexicon_dataset_path)}

    return lexicon

--------------------------------------------------------------------------------
/ukro_g2p/utils/infolog.py:
--------------------------------------------------------------------------------
import atexit
from datetime import datetime


_format = '%Y-%m-%d %H:%M:%S.%f'
_file = None
_run_name = None


def init(filename, run_name):
    global _file, _run_name
    _close_logfile()
    _file = open(filename, 'a')
    _file.write('\n-----------------------------------------------------------------\n')
    _file.write('Starting new training run\n')
    _file.write('-----------------------------------------------------------------\n')
    _run_name = run_name


def log(msg):
    print(msg)
    if _file is not None:
        _file.write(f'[{datetime.now().strftime(_format)[:-3]}] {msg}\n')


def _close_logfile():
    global _file
    if _file is not None:
        _file.close()
        _file = None


atexit.register(_close_logfile)
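
# Minimal usage sketch (illustrative — this mirrors how trainer.py drives the
# logger; the path and run name here are made-up examples):
#
#   from ukro_g2p.utils import infolog
#
#   infolog.init('exp/my_run/logs/log.txt', 'my_run')
#   infolog.log('starting epoch 0')  # printed to stdout and appended to the log file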
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
from setuptools import setup, find_packages
from os import path


this_directory = path.abspath(path.dirname(__file__))
with open(path.join(this_directory, 'README.md'), encoding='utf-8') as f:
    long_description = f.read()


setup(
    name="ukro_g2p",
    version="0.1.5",
    author="Kostiantyn Pylypenko",
    author_email="k.pylypenko@hotmail.com",
    description="NN-based grapheme-to-phoneme model for the Ukrainian language",
    license="MIT",
    keywords="Ukrainian grapheme to phoneme",
    url="https://github.com/kosti4ka/ukro_g2p",
    packages=find_packages(),
    long_description=long_description,
    long_description_content_type='text/markdown',
    include_package_data=True,
    classifiers=[
        "License :: OSI Approved :: MIT License",
        "Intended Audience :: Developers",
        "Intended Audience :: Science/Research",
        "Operating System :: OS Independent",
        "Programming Language :: Python",
        "Programming Language :: Python :: 3",
        "Topic :: Software Development :: Libraries",
        "Topic :: Software Development :: Libraries :: Python Modules",
    ],
)

--------------------------------------------------------------------------------
/ukro_g2p/utils/util.py:
--------------------------------------------------------------------------------
from pathlib import Path
import json
from collections import OrderedDict
import Levenshtein


def read_json(fname):
    fname = Path(fname)
    with fname.open('rt') as handle:
        return json.load(handle, object_hook=OrderedDict)


def load_list(fname):
    fname = Path(fname)
    with open(fname, 'r', encoding='utf-8') as f:
        return [x for x in f.read().split('\n') if x]


def phoneme_error_rate(p_seq1, p_seq2):
    """Levenshtein distance between two phoneme sequences, normalized by the
    length of p_seq2 (the reference). Each distinct phoneme is mapped onto a
    single character so that string Levenshtein distance can be used."""
    p_vocab = set(p_seq1 + p_seq2)
    p2c = dict(zip(p_vocab, range(len(p_vocab))))
    c_seq1 = [chr(p2c[p]) for p in p_seq1]
    c_seq2 = [chr(p2c[p]) for p in p_seq2]
    return Levenshtein.distance(''.join(c_seq1), ''.join(c_seq2)) / len(c_seq2)


def dump(iterable, file_name, append=False):
    """Write a string or an iterable to file_name, one item per line;
    non-string items that are themselves iterable are tab-joined."""

    # init paths and make sure the dir to write in exists
    file_name = Path(file_name)
    out_dir = file_name.parent
    out_dir.mkdir(parents=True, exist_ok=True)

    lines = [iterable] if isinstance(iterable, str) else iterable
    with open(file_name, 'a' if append else 'w', encoding='utf-8') as f:
        for line in lines:
            if not isinstance(line, str) and hasattr(line, '__iter__'):
                line = '\t'.join(str(item) for item in line)
            f.write(str(line).rstrip('\r\n') + '\n')

--------------------------------------------------------------------------------
/ukro_g2p/configs/ukro_base_uncased.config:
--------------------------------------------------------------------------------
[VocabConfig]
graphemes: _ ' - а б в г д е ж з и й к л м н о п р с т у ф х ц ч ш щ ъ ь ю я є і ї ґ pad
phonemes: AA AA1 AO AO1 AOU B B2 B23 B3 CH CH2 CH23 CH3 D D3 DJ DJ3 DZ DZJ EH EH1 EIH EIY F F2 F23 F3 G G2 H H2 H3 IH IH1 IHE IY IY1 JH JH2 K K2 K23 K3 L L3 LJ LJ3 M M2 M23 M3 N N3 NJ NJ3 P P2 P23 P3 R R3 RJ RJ3 S S3 SH SH2 SH23 SH3 SJ SJ3 T T3 TJ TJ3 TS TS3 TSJ TSJ3 UH UH1 V V2 V23 WH WH2 X X2 X3 Y Y3 Z Z3 ZH ZH2 ZH23 ZH3 ZJ ZJ3 pad <s> </s>
human_phonemes: а а́ о о́ оу б б` б`: б: ч ч` ч`: ч: д д: д’ д’: дз дз’ е е́ еи еі ф ф` ф`: ф: ґ ґ` г г` г: и и́ ие і і́ дж дж` к к` к`: к: л л: л’ л’: м м` м`: м: н н: н’ н’: п п` п`: п: р р: р’ р’: с с: ш ш` ш`: ш: с’ с’: т т: т’ т’: ц ц: ц’ ц’: у у́ в в` в`: ў ў` х х` х: й й` з з: ж ж` ж`: ж: з’ з’: pad <s> </s>
padding: pad
phoneme_bos: <s>
phoneme_eos: </s>

[EncoderConfig]
encoder_d_embed: 256
encoder_d_hidden: 256
encoder_n_layers: 1
encoder_bidirectional: False

[DecoderConfig]
decoder_d_embed: 256
decoder_d_hidden: 256
decoder_n_layers: 1
attention: False

[GeneratorConfig]
beam_size: 3
max_generate_len: 22

[OptimizerConfig]
lr: 1e-3
weight_decay: 1e-5

--------------------------------------------------------------------------------
/ukro_g2p/tokenization.py:
--------------------------------------------------------------------------------
from pathlib import Path
from ukro_g2p.models.g2p_model import G2PConfig
from collections import namedtuple

tokenizer_obj = namedtuple("tokenizer_obj", ["config_path"])

pretrained_tokenizers = {
    "ukro-base-uncased": tokenizer_obj(
        config_path=Path(__file__).parent / "configs/ukro_base_uncased.config",
    )
}


class G2PTokenizer(object):
    def __init__(self, config):

        self.g2idx = {g: idx for idx, g in enumerate(config.graphemes)}
        self.idx2p = {idx: p for idx, p in enumerate(config.phonemes)}
        self.idx2h = {idx: h for idx, h in enumerate(config.human_phonemes)}

    def tokenize_graphemes(self, word):
        return list(word)

    def convert_graphemes_to_ids(self, graphemes):
        return [self.g2idx[g] for g in graphemes]

    def convert_ids_to_phonemes(self, ids):
        return [self.idx2p[i] for i in ids if self.idx2p[i] not in ['<s>', '</s>']]

    def convert_ids_to_human_phonemes(self, ids):
        return [self.idx2h[i] for i in ids if self.idx2h[i] not in ['<s>', '</s>']]

    @classmethod
    def from_pretrained(cls, tokenizer_name):

        if tokenizer_name not in pretrained_tokenizers:
            raise ValueError(f'Unknown tokenizer name: {tokenizer_name}')

        # load config
        config = G2PConfig(pretrained_tokenizers[tokenizer_name].config_path)  # TODO add method from_file
        tokenizer = cls(config)

        return tokenizer

--------------------------------------------------------------------------------
/ukro_g2p/train.py:
--------------------------------------------------------------------------------
import argparse

from ukro_g2p.datasets.lexicon_datasets import LexiconDataset
from ukro_g2p.trainer import Trainer
from torch.optim import Adam
import torch.nn as nn

from ukro_g2p.models.g2p_model import G2PConfig, G2PModel

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--config', default=None, type=str,
                        help='config file path (default: None)')
    parser.add_argument('-d', '--dataset', default=None, type=str,
                        help='path to the dataset dir (default: None)')
    parser.add_argument('-n', '--experiment_name', default=None, type=str,
                        help='path to the output dir (default: None)')
    parser.add_argument('-r', '--restore_epoch', default=-1, type=int,
                        help='restore epoch number (default: -1)')

    args = parser.parse_args()

    config = G2PConfig(args.config)
    model = G2PModel(config)
    optimizer = Adam(model.parameters(), lr=config.lr, weight_decay=config.weight_decay)
    loss = nn.NLLLoss(ignore_index=config.decoder_padding_idx)
    datasets = {'train': LexiconDataset(args.dataset, split='train'),
                'dev': LexiconDataset(args.dataset, split='dev')}

    trainer = Trainer(model, datasets, optimizer, loss, epochs=100, batch_size=256,
                      experiment_name=args.experiment_name,
                      logging_freq=10, restore_epoch=args.restore_epoch)
    trainer.train_and_validate()

--------------------------------------------------------------------------------
/ukro_g2p/predict.py:
--------------------------------------------------------------------------------
import argparse
import torch
from torch.autograd import Variable

from ukro_g2p.models.g2p_model import G2PModel, pretrained_models
from ukro_g2p.tokenization import G2PTokenizer


class G2P(object):
    def __init__(self, model_name):
        super().__init__()

        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        self.model = G2PModel.from_pretrained(model_name).to(self.device)
        self.tokenizer = G2PTokenizer.from_pretrained(model_name)
        self.model.eval()

    def __call__(self, word, human_readable=False):

        # map the word's characters onto grapheme ids
        graphemes = self.tokenizer.tokenize_graphemes(word)
        graphemes = self.tokenizer.convert_graphemes_to_ids(graphemes)
        g_length = [len(graphemes)]

        graphemes = Variable(torch.LongTensor(graphemes).unsqueeze(0)).to(self.device)
        g_length = Variable(torch.LongTensor(g_length)).to(self.device)

        phonemes = self.model(graphemes, g_length).tolist()[0]

        return self.tokenizer.convert_ids_to_human_phonemes(phonemes) if human_readable else self.tokenizer.convert_ids_to_phonemes(phonemes)


if __name__ == '__main__':
    # argument parser
    parser = argparse.ArgumentParser()

    # required
    parser.add_argument('word', type=str, help='Word to generate pronunciation for')
    parser.add_argument('-m', '--model_name', required=False, type=str, default='ukro-base-uncased',
                        choices=pretrained_models.keys(),
                        help='Trained model name')
    parser.add_argument('-hr', '--human_readable', required=False, action='store_true',
                        help='Output human readable set of phonemes')

    # parse
    script_args = parser.parse_args()

    # use the model chosen on the command line (previously the -m flag was parsed but ignored)
    g2p = G2P(script_args.model_name)
    pron = g2p(script_args.word, human_readable=script_args.human_readable)

    print(' '.join(pron))

--------------------------------------------------------------------------------
/ukro_g2p/test.py:
--------------------------------------------------------------------------------
import argparse

import torch
from torch.utils.data import DataLoader
from tqdm import tqdm

from ukro_g2p.datasets.lexicon_datasets import LexiconDataset, _collate_fn
from ukro_g2p.models.g2p_model import G2PConfig, G2PModel
from ukro_g2p.utils.util import phoneme_error_rate

from pathlib import Path
from ukro_g2p.utils.util import dump


def main(model, dataset, resume, out_dir):

    # init paths
    out_dir = Path(out_dir)
    ref_path = out_dir / 'ref'
    hyp_path = out_dir / 'hyp'
    scores_path = out_dir / 'scores'

    # make the dir and remove stale output files from previous runs
    out_dir.mkdir(parents=True, exist_ok=True)
    for path in (ref_path, hyp_path, scores_path):
        try:
            path.unlink()
        except FileNotFoundError:
            pass

    # setup data_loader instances
    data_loader = DataLoader(dataset, batch_size=1, shuffle=False, collate_fn=_collate_fn, num_workers=1)

    # load model weights
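    # (map_location forces all checkpoint tensors onto the CPU at load time;
    # the model is moved to the available device a few lines below)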
    checkpoint = torch.load(resume, map_location=lambda storage, loc: storage)
    model.load_state_dict(checkpoint)

    # prepare model for testing
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    model.eval()

    # init metric values
    avr_per = 0
    total_phonemes_length = 0
    num_batches = len(data_loader)

    with torch.no_grad():
        for batch_idx, batch in tqdm(enumerate(data_loader), total=num_batches):
            graphemes, graphemes_length, phonemes, _ = batch

            graphemes = graphemes.to(device)
            graphemes_length = graphemes_length.to(device)
            phonemes = phonemes.to(device)

            phonemes_predictions = model(graphemes, graphemes_length).tolist()
            phonemes_targets = phonemes[:, 1:].contiguous().tolist()

            for predictions, targets, input_graphemes in zip(phonemes_predictions, phonemes_targets, graphemes):
                targets_length = len(targets)
                per = phoneme_error_rate(predictions, targets)
                avr_per += per * targets_length
                total_phonemes_length += targets_length

                # saving hyp, ref and scores
                graphemes_str = ''.join([g for g in dataset.idx2graphemes(input_graphemes.tolist()) if g != '<s>'])
                predictions_str = ' '.join([p for p in dataset.idx2phonemes(predictions) if p not in ['<s>', '</s>']])
                targets_str = ' '.join([p for p in dataset.idx2phonemes(targets) if p not in ['<s>', '</s>']])
                dump(f'{graphemes_str}\t{predictions_str}', hyp_path, append=True)
                dump(f'{graphemes_str}\t{targets_str}', ref_path, append=True)
                dump(f'{round(per, 4)}\t{targets_str}\t{predictions_str}', scores_path, append=True)

    avr_per /= total_phonemes_length
    log_str = f'Phoneme Error Rate: {round(avr_per, 4)}'
    dump(log_str, scores_path, append=True)
    print(log_str)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--config', default=None, type=str,
                        help='config file path (default: None)')
    parser.add_argument('-d', '--dataset', default=None, type=str,
                        help='path to the dataset dir (default: None)')
    parser.add_argument('-r', '--resume', default=None, type=str,
                        help='path to latest checkpoint (default: None)')
    parser.add_argument('-o', '--out_dir', default=None, type=str,
                        help='path to the output dir (default: None)')

    args = parser.parse_args()

    config = G2PConfig(args.config)
    model = G2PModel(config)
    dataset = LexiconDataset(args.dataset, split='test')

    main(model, dataset, args.resume, args.out_dir)

--------------------------------------------------------------------------------
/ukro_g2p/datasets/lexicon_datasets.py:
--------------------------------------------------------------------------------
from torch.utils.data import Dataset
from pathlib import Path
from ukro_g2p.utils.lexicon_util import read_lexicon_dataset
from ukro_g2p.utils.util import load_list
import torch
import numpy as np

# TODO put this into config
# indices of the 'pad' symbol in the grapheme and phoneme vocabularies
GRAPHEMES_PADDING = 37
PHONEMES_PADDING = 99


class LexiconDataset(Dataset):
    def __init__(self, data_root, split='train'):
        self.data_root = Path(data_root)
        self.lexicon_dataset_path = self.data_root / 'data.txt'
        if split in ['train', 'dev', 'test']:
            self.data_keys_path = self.data_root / f'{split}_set'
        else:
            raise KeyError(f'Unknown split: {split}')
        # TODO take it from config file
        self.phonemes_path = self.data_root / 'phones'
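        # 'phones' and 'letters' are assumed to be plain-text vocab files with
        # one "index symbol" pair per line (see the split()-based parsing below)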
        self.letters_path = self.data_root / 'letters'

        self.lexicon = read_lexicon_dataset(self.lexicon_dataset_path)
        self.data_keys = load_list(self.data_keys_path)
        self.phoneme2idx = {p.split()[1]: int(p.split()[0]) for p in load_list(self.phonemes_path)}
        self.phoneme2idx['pad'] = len(self.phoneme2idx)
        self.phoneme2idx['<s>'] = len(self.phoneme2idx)
        self.phoneme2idx['</s>'] = len(self.phoneme2idx)
        self.idx2phoneme = dict((v, k) for k, v in self.phoneme2idx.items())
        self.grapheme2idx = {p.split()[1]: int(p.split()[0]) for p in load_list(self.letters_path)}
        self.grapheme2idx['pad'] = len(self.grapheme2idx)
        self.grapheme2idx['<s>'] = len(self.grapheme2idx)
        self.grapheme2idx['</s>'] = len(self.grapheme2idx)
        self.idx2grapheme = dict((v, k) for k, v in self.grapheme2idx.items())

    def __getitem__(self, idx):
        key = self.data_keys[idx]
        datapoint = self.lexicon[key]

        graphemes = datapoint.split()[0]
        phonemes = datapoint.split()[1:]

        graphemes_idx = self.graphemes2idx(graphemes)
        phonemes_idx = self.phonemes2idx(phonemes)
        return graphemes_idx, phonemes_idx

    def __len__(self):
        return len(self.data_keys)

    def graphemes2idx(self, graphemes):
        # graphemes_idx = [self.grapheme2idx['<s>']]
        graphemes_idx = [self.grapheme2idx[g] for g in graphemes]
        return graphemes_idx

    def phonemes2idx(self, phonemes):
        phonemes_idx = [self.phoneme2idx['<s>']]
        phonemes_idx.extend([self.phoneme2idx[p] for p in phonemes])
        phonemes_idx.append(self.phoneme2idx['</s>'])
        return phonemes_idx

    def idx2graphemes(self, idx):
        return [self.idx2grapheme[g] for g in idx]

    def idx2phonemes(self, idx):
        return [self.idx2phoneme[p] for p in idx]


def _collate_fn(batch):
    """
    Merges a list of samples into a mini-batch.
    Input sequences are padded to the longest input sequence in the batch,
    and target sequences to the longest target sequence in the batch.

    Args:
        batch: list of (graphemes_idx, phonemes_idx) pairs

    Returns:
        (graphemes_inputs, graphemes_length, phonemes_targets, phonemes_length)
    """

    graphemes_lengths = list(map(lambda x: len(x[0]), batch))
    phonemes_lengths = list(map(lambda x: len(x[1]), batch))
    max_word_length = max(graphemes_lengths)
    max_phonemes_length = max(phonemes_lengths)
    batch_size = len(batch)

    graphemes_inputs = torch.LongTensor(batch_size, max_word_length).zero_() + GRAPHEMES_PADDING
    phonemes_targets = torch.LongTensor(batch_size, max_phonemes_length).zero_() + PHONEMES_PADDING
    graphemes_length = torch.LongTensor(batch_size).zero_()
    phonemes_length = torch.LongTensor(batch_size).zero_()

    for x in range(batch_size):
        sample = batch[x]
        graphemes, phonemes = sample

        graphemes = torch.from_numpy(np.asarray(graphemes)).long()

        phonemes = torch.from_numpy(np.asarray(phonemes)).long()

        graphemes_seq_length = graphemes.size(0)
        phonemes_seq_length = phonemes.size(0)

        graphemes_inputs[x].narrow(0, 0, graphemes_seq_length).copy_(graphemes)
        graphemes_length[x] = graphemes_seq_length
        phonemes_length[x] = phonemes_seq_length

        phonemes_targets[x].narrow(0, 0, phonemes_seq_length).copy_(phonemes)

    return graphemes_inputs, graphemes_length, phonemes_targets, phonemes_length

--------------------------------------------------------------------------------
/ukro_g2p/models/modules.py:
--------------------------------------------------------------------------------
import torch.nn as nn
import torch.nn.functional as F
import torch

# TODO remove config, send parameters explicitly


class Encoder(nn.Module):

    def __init__(self, config):
        super(Encoder, self).__init__()

        self.vocab_size = config.encoder_vocab_size
        self.padding_idx = config.encoder_padding_idx

        self.d_embed = config.encoder_d_embed
        self.d_hidden = config.encoder_d_hidden
        self.num_layers = config.encoder_n_layers
        self.bidirectional = config.encoder_bidirectional

        self.embedding = nn.Embedding(self.vocab_size, self.d_embed, padding_idx=self.padding_idx)
        self.lstm = nn.LSTM(self.d_embed, self.d_hidden // 2 if self.bidirectional else self.d_hidden, self.num_layers,
                            batch_first=True, bidirectional=self.bidirectional)

    def forward(self, x, x_length):

        x = self.embedding(x)  # B x T x D
        x = nn.utils.rnn.pack_padded_sequence(x, x_length, batch_first=True, enforce_sorted=False)

        out, hc = self.lstm(x)

        out, _ = nn.utils.rnn.pad_packed_sequence(out, batch_first=True)

        if self.bidirectional:
            # stacking hidden and cell from forward and backward layers
            hc = tuple(torch.cat((hc_[0::2, :, :], hc_[1::2, :, :]), 2) for hc_ in hc)

        return out, hc


class Decoder(nn.Module):

    def __init__(self, config):
        super(Decoder, self).__init__()

        self.vocab_size = config.decoder_vocab_size
        self.padding_idx = config.decoder_padding_idx

        self.d_embed = config.decoder_d_embed
        self.d_hidden = config.decoder_d_hidden
        self.num_layers = config.decoder_n_layers

        self.embedding = nn.Embedding(self.vocab_size, self.d_embed, padding_idx=self.padding_idx)
        self.lstm = nn.LSTM(self.d_embed, self.d_hidden, self.num_layers, batch_first=True)
        if config.attention:
            self.attn = Attention(self.d_hidden)
        else:
            self.attn = None

        self.linear = nn.Linear(self.d_hidden, self.vocab_size)

    def forward(self, y, y_length, hc, context=None):

        batch_size, seq_len = y.size()

        y = self.embedding(y)  # B x T x D
        y = nn.utils.rnn.pack_padded_sequence(y, y_length, batch_first=True, enforce_sorted=False)

        out, hc = self.lstm(y, hc)

        out, _ = nn.utils.rnn.pad_packed_sequence(out, batch_first=True)

        if self.attn:
            out = self.attn(out, context)

        out = out.contiguous()

        out = self.linear(out.view(-1, out.size(2)))

        return F.log_softmax(out, dim=1).view(batch_size, -1, out.size(1)), hc


class Attention(nn.Module):
    """Dot global attention from https://arxiv.org/abs/1508.04025"""

    def __init__(self, dim):
        super(Attention, self).__init__()
        self.linear = nn.Linear(dim * 2, dim, bias=False)

    def forward(self, x, context):

        batch_size, seq_len, _ = x.size()

        attn = F.softmax(x.bmm(context.transpose(1, 2)), dim=2)
        weighted_context = attn.bmm(context)

        o = self.linear(torch.cat((x, weighted_context), 2).view(batch_size * seq_len, -1))
        return torch.tanh(o).view(batch_size, seq_len, -1)


class Beam(object):
    """Ordered beam of candidate outputs"""

    def __init__(self, config):
        """Initialize params"""
        self.size = config.beam_size
        self.done = False
        self.pad = config.decoder_padding_idx
        self.bos = config.decoder_bos_idx
        self.eos = config.decoder_eos_idx
        self.tt = torch.cuda if config.use_cuda else torch

        # The score for each translation on the beam.
        self.scores = self.tt.FloatTensor(self.size).zero_()

        # The backpointers at each time-step.
        self.prevKs = []

        # The outputs at each time-step.
        self.nextYs = [self.tt.LongTensor(self.size).fill_(self.pad)]
        self.nextYs[0][0] = self.bos

    # Get the outputs for the current timestep.
    def get_current_state(self):
        """Get state of beam."""
        return self.nextYs[-1]

    # Get the backpointers for the current timestep.
    def get_current_origin(self):
        """Get the backpointer to the beam at this step."""
        return self.prevKs[-1]

    def advance(self, word_lk):
        """Advance the beam given word_lk, the (beam_size x num_words)
        log-likelihoods of the next symbol for each hypothesis."""
        num_words = word_lk.size(1)

        # Sum the previous scores.
        if len(self.prevKs) > 0:
            beam_lk = word_lk + self.scores.unsqueeze(1).expand_as(word_lk)
        else:
            beam_lk = word_lk[0]

        flat_beam_lk = beam_lk.view(-1)

        best_scores, best_scores_id = flat_beam_lk.topk(self.size, 0, True, True)
        self.scores = best_scores

        # best_scores_id is a flattened beam x word array, so calculate which
        # word and beam each score came from
        prev_k = best_scores_id // num_words
        self.prevKs.append(prev_k)
        self.nextYs.append(best_scores_id - prev_k * num_words)
        # End condition: the best-scoring hypothesis on the beam ends with EOS.
        if self.nextYs[-1][0] == self.eos:
            self.done = True
        return self.done

    def get_hyp(self, k):
        """Get hypotheses."""
        hyp = []
        for j in range(len(self.prevKs) - 1, -1, -1):
            hyp.append(self.nextYs[j + 1][k])
            k = self.prevKs[j][k]
        return hyp[::-1]

--------------------------------------------------------------------------------
/ukro_g2p/models/g2p_model.py:
--------------------------------------------------------------------------------
import torch.nn as nn
import torch
from torch.autograd import Variable
import numpy as np
from ukro_g2p.models.modules import Encoder, Decoder
from ukro_g2p.models.modules import Beam
from pathlib import Path
import configparser
from collections import namedtuple
from torch.utils import model_zoo

model_obj = namedtuple("model_obj", ["url", "config_path"])

pretrained_models = {
    "ukro-base-uncased": model_obj(
        url="https://github.com/kosti4ka/ukro_g2p/releases/download/ukro_base_uncased_v.0.1/ukro_base_uncased-epoch-99-d545c0d.th",
        config_path=Path(__file__).parent / "../configs/ukro_base_uncased.config",
    )
}


class G2PConfig(dict):

    def __init__(self, model_config_file):
        super(G2PConfig, self).__init__()

        self.use_cuda = False
        self.model_path = None

        # reading config file
        config_file = configparser.ConfigParser()
        config_file.read(model_config_file, encoding='utf8')

        self.padding = config_file['VocabConfig']['padding']
        # TODO simplify this part - use the same bos/eos symbols for both phonemes and graphemes
        self.decoder_bos = config_file['VocabConfig']['phoneme_bos']
        self.decoder_eos = config_file['VocabConfig']['phoneme_eos']
        # reading graphemes
        self.graphemes = config_file['VocabConfig']['graphemes'].split()
        self.encoder_vocab_size = len(self.graphemes)
        self.encoder_padding_idx = self.graphemes.index(self.padding)
        # reading phonemes
        self.phonemes = config_file['VocabConfig']['phonemes'].split()
        self.decoder_vocab_size = len(self.phonemes)
        self.decoder_padding_idx = self.phonemes.index(self.padding)
        self.decoder_bos_idx = self.phonemes.index(self.decoder_bos)
        self.decoder_eos_idx = self.phonemes.index(self.decoder_eos)
        # reading human phonemes
        self.human_phonemes = config_file['VocabConfig']['human_phonemes'].split()

        # encoder config
        self.encoder_d_embed = int(config_file['EncoderConfig']['encoder_d_embed'])
        self.encoder_d_hidden = int(config_file['EncoderConfig']['encoder_d_hidden'])
        self.encoder_n_layers = int(config_file['EncoderConfig']['encoder_n_layers'])
        self.encoder_bidirectional = config_file['EncoderConfig']['encoder_bidirectional'].lower() == 'true'

        # decoder config
        self.decoder_d_embed = int(config_file['DecoderConfig']['decoder_d_embed'])
        self.decoder_d_hidden = int(config_file['DecoderConfig']['decoder_d_hidden'])
        self.decoder_n_layers = int(config_file['DecoderConfig']['decoder_n_layers'])
        self.attention = config_file['DecoderConfig']['attention'].lower() == 'true'

        # generator
        self.beam_size = int(config_file['GeneratorConfig']['beam_size'])
        self.max_generate_len = int(config_file['GeneratorConfig']['max_generate_len'])

        # optimizer
        self.lr = float(config_file['OptimizerConfig']['lr'])
        self.weight_decay = float(config_file['OptimizerConfig']['weight_decay'])


class PreTrainedG2PModel(nn.Module):
    def __init__(self, config):
        super(PreTrainedG2PModel, self).__init__()
        self.config = config

    @classmethod
    def from_pretrained(cls, model_name):

        if model_name not in pretrained_models:
            raise ValueError(f'Unknown model name: {model_name}')

        # load config
        config = G2PConfig(pretrained_models[model_name].config_path)  # TODO add method from_file

        # instantiate model
        model = cls(config)

        # loading weights
        state_dict = model_zoo.load_url(pretrained_models[model_name].url,
                                        progress=True, map_location=lambda storage, loc: storage)
        model.load_state_dict(state_dict)

        return model


class G2PModel(PreTrainedG2PModel):

    def __init__(self, config):
        super(G2PModel, self).__init__(config)

        # init
        self.config = config
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        # encoder
        self.encoder = Encoder(config)

        # decoder
        self.decoder = Decoder(config)
        self.attention = config.attention

        # generator
        self.beam_size = config.beam_size
        self.max_generate_len = config.max_generate_len

    def forward(self, x, x_length, y=None, p_length=None, n_best=1):
        # TODO rewrite description

        encoder_out, encoder_hc = self.encoder(x, x_length)

        if y is not None:
            out = self.decoder(y, p_length, encoder_hc, context=encoder_out if self.attention else None)
        else:
            out = self._generate(encoder_hc, context=encoder_out if self.attention else None)

        return out

    def _generate(self, hc, context=None):
        beam = Beam(self.config)
        h, c = hc
        # Make a beam_size batch.
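        # (h and c are (num_layers, 1, d_hidden) for the single input word; the
        # expands below replicate that one batch entry beam_size times so all
        # beam hypotheses are decoded in a single batched LSTM call)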
        h = h.expand(h.size(0), beam.size, h.size(2))
        c = c.expand(c.size(0), beam.size, c.size(2))
        if context is not None:
            context = context.expand(beam.size, context.size(1), context.size(2))
        p_length = Variable(torch.from_numpy(np.array([1])))
        p_length = p_length.expand(beam.size).to(self.device)

        for i in range(self.max_generate_len):
            x = beam.get_current_state()
            o, hc = self.decoder(Variable(x.unsqueeze(1)).to(self.device), p_length, (h, c), context=context)
            if beam.advance(o.data.squeeze(1)):
                break
            h, c = hc
            h.data.copy_(h.data.index_select(1, beam.get_current_origin()))
            c.data.copy_(c.data.index_select(1, beam.get_current_origin()))
        return torch.LongTensor(beam.get_hyp(0)).unsqueeze(0)

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Ukrainian G2P model

## Installation

`pip install -U ukro-g2p`

## Example inference

```python
from ukro_g2p.predict import G2P

g2p = G2P('ukro-base-uncased')

# ARPABET-like format
g2p('фонетика')

# human-readable format
g2p('фонетика', human_readable=True)
```

Jupyter notebook with the example: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1bptBFKWtuBVVbAA_e_WB5tL-N4tJ4xyq#scrollTo=JGG5NcltvXTx?usp=sharing)

## Web app

https://ukro-g2p.herokuapp.com

Code for the web app: https://github.com/kosti4ka/ukro_g2p_demo

## Ukrainian phonology symbols

### Vowels (голосні)

| Ukrainian | ARPABET-like |
| --- | --- |
| [і] | IY |
| [и] | IH |
| [е] | EH |
| [у] | UH |
| [о] | AO |
| [а] | AA |
| **Approximation (наближення)** | |
| [е<sup>и</sup>] | EIH |
| [е<sup>і</sup>] | EIY |
| [и<sup>е</sup>] | IHE |
| [о<sup>у</sup>] | AOU |
| **Stress (наголос)** | |
| [ ́ ] | 1 |
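
Reading the table: a trailing `1` marks the stressed vowel, so, per the vocab
lists in `ukro_base_uncased.config`, `AA1` is the stressed [а́] and `IH1` the
stressed [и́] (illustrative pairs taken directly from that config).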

### Consonants (приголосні)

| Ukrainian | ARPABET-like |
| --- | --- |
| [б] | B |
| [в] | V |
| [г] | H |
| [д] | D |
| [дж] | JH |
| [дз] | DZ |
| [ж] | ZH |
| [з] | Z |
| [й] | Y |
| [к] | K |
| [л] | L |
| [м] | M |
| [н] | N |
| [п] | P |
| [р] | R |
| [с] | S |
| [т] | T |
| [ф] | F |
| [х] | X |
| [ц] | TS |
| [ч] | CH |
| [ш] | SH |
| [ґ] | G |
| [ў] | WH |
| **Palatalized (м'які)** | |
| [д’] | DJ |
| [дз’] | DZJ |
| [з’] | ZJ |
| [л’] | LJ |
| [н’] | NJ |
| [р’] | RJ |
| [с’] | SJ |
| [т’] | TJ |
| [ц’] | TSJ |
| **Softening (пом'якшення)** | |
| [`] | 2 |
| **Lengthening (подовження)** | |
| [:] | 3 |
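
The digit suffixes compose with the consonant symbols: `2` marks softening and
`3` lengthening, so, again per the config vocab lists, `NJ3` is [н’:] and `ZH2`
is [ж`]. A quick way to inspect the full symbol inventory (a small sketch that
relies only on fields defined in `tokenization.py`; the two vocab lists share
indices, which is what makes the human-readable output work):

```python
from ukro_g2p.tokenization import G2PTokenizer

tokenizer = G2PTokenizer.from_pretrained('ukro-base-uncased')

# machine-readable and human-readable symbols are aligned by index
for idx in sorted(tokenizer.idx2p):
    print(idx, tokenizer.idx2p[idx], tokenizer.idx2h[idx])
```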
--------------------------------------------------------------------------------
/ukro_g2p/trainer.py:
--------------------------------------------------------------------------------
import time
from collections import OrderedDict

from pathlib import Path

import torch
from torch.autograd import Variable
from torch.utils.data import DataLoader
from ukro_g2p.datasets.lexicon_datasets import _collate_fn
from torch.optim.lr_scheduler import ReduceLROnPlateau

from ukro_g2p.utils import infolog
from tensorboardX import SummaryWriter
from tqdm import tqdm

log = infolog.log


class Trainer(object):
    def __init__(self, model, datasets, optimizer, loss, epochs, batch_size, experiment_name, logging_freq,
                 restore_epoch=-1):

        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.epochs = epochs
        self.experiment_name = experiment_name
        self.logging_freq = logging_freq
        self.model = model.to(self.device)
        self.restore_epoch = restore_epoch

        self.optimizer = optimizer
        self.loss = loss
        # TODO remove scheduler from trainer object
        self.scheduler = ReduceLROnPlateau(self.optimizer, mode='min', factor=0.1, patience=5, verbose=True)

        self.global_batch_index = 0
        self.global_batch_index_dev = 0
        self.datasets = datasets
        self.dataloaders = OrderedDict()
        for data_name in datasets:
            self.dataloaders[data_name] = DataLoader(datasets[data_name], batch_size=batch_size,
                                                     shuffle=True if data_name == 'train' else False,
                                                     collate_fn=_collate_fn,
                                                     num_workers=8)

        self.experiment_dir = Path('./exp') / self.experiment_name
        self.log_dir = self.experiment_dir / 'logs'
        self.log_path = self.log_dir / 'log.txt'
        self.checkpoint_dir = self.experiment_dir / 'checkpoints'

        self.experiment_dir.mkdir(parents=True, exist_ok=True)
        self.log_dir.mkdir(parents=True, exist_ok=True)
        self.checkpoint_dir.mkdir(parents=True, exist_ok=True)

        # initialize logger
        infolog.init(self.log_path, self.experiment_name)
        # initialize TensorBoard writer
        self.writer = SummaryWriter(log_dir=self.log_dir)

        if self.restore_epoch != -1:
            self.load_from_checkpoint(epoch=self.restore_epoch)

    def train_and_validate(self):
        for e in range(self.restore_epoch + 1, self.epochs):
            for ds, dataloader in self.dataloaders.items():
                if ds != 'train':
                    self.model.eval()  # turn off batchnorm/dropout
                self.run_epoch(dataloader, dataset_name=ds, epoch=e)
                self.model.train()  # turn on batchnorm/dropout
            self.save_to_checkpoint(epoch=e)

    def run_epoch(self, dataloader, dataset_name, epoch):
        begin_time = time.time()
        train = dataset_name == 'train'
        num_batches = len(dataloader)
        avg_loss = 0

        progress_bar = tqdm(total=num_batches)
        for batch_idx, batch in enumerate(dataloader):
            word_inputs, input_length, pron_targets, p_length = batch

            word_inputs = Variable(word_inputs).to(self.device)
            input_length = Variable(input_length).to(self.device)
            pron_targets = Variable(pron_targets).to(self.device)
            p_length = Variable(p_length).to(self.device)

            p_preds, _ = self.model(word_inputs, input_length, pron_targets[:, :-1], p_length - 1)
            targets = pron_targets[:, 1:].contiguous()

            loss = self.loss(p_preds.view(p_preds.size(0) * p_preds.size(1), -1),
                             targets.view(targets.size(0) * targets.size(1)))

            loss_floats = loss.data.cpu().item()
            avg_loss += loss_floats

            progress_bar.set_description(f'{dataset_name}: {epoch}/{self.epochs}, current loss: {round(loss_floats, 4)}')
            progress_bar.refresh()
            progress_bar.update()

            if train:
                self.global_batch_index += 1
                self.writer.add_scalar(dataset_name, loss_floats, self.global_batch_index)
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()
                # if (batch_idx + 1) % self.logging_freq == 0:
                #     log_str = self.construct_logging_str(loss_floats, epoch, num_batches, batch_idx + 1)
                #     log(log_str)
            else:
                # TODO this is an ugly part of the code; a global step should be passed explicitly
                self.global_batch_index_dev += 1
                self.writer.add_scalar(dataset_name, loss_floats, self.global_batch_index_dev)

        progress_bar.close()

        # add generated text to TensorBoard
        if not train:
            graphemes, graphemes_length, phonemes, _ = next(iter(dataloader))

            graphemes = graphemes.to(self.device)
            graphemes_length = graphemes_length.to(self.device)
            phonemes = phonemes.to(self.device)

            # phonemes_predictions = self.model(graphemes, graphemes_length).tolist()
            phonemes_targets = phonemes[:, 1:].contiguous().tolist()

            text = ''
            for idx in range(0, 5):
                phonemes_predictions = self.model(graphemes[idx].unsqueeze(0), graphemes_length[idx].unsqueeze(0)).tolist()[0]

                graphemes_str = ''.join(
                    [g for g in self.datasets[dataset_name].idx2graphemes(graphemes[idx].tolist()) if g not in ['<s>', 'pad']])
                predictions_str = ' '.join(
                    [p for p in self.datasets[dataset_name].idx2phonemes(phonemes_predictions) if p not in ['<s>', '</s>']])
                targets_str = ' '.join(
                    [p for p in self.datasets[dataset_name].idx2phonemes(phonemes_targets[idx]) if p not in ['<s>', '</s>', 'pad']])

                text = text + graphemes_str + ' \n' + targets_str + ' \n' + predictions_str + ' \n'

            self.writer.add_text('Text', text, epoch)

        avg_loss /= num_batches
        end_time = time.time()
        if train:
            self.scheduler.step(avg_loss)
        log_str = 'Epoch: {}, {} loss: {:.5f} time: {:.2f} sec'
        log_str = log_str.format(epoch, dataset_name, avg_loss, end_time - begin_time)
        log(log_str)

    def load_from_checkpoint(self, epoch):
        checkpoint_file = f'{self.experiment_name}-epoch-{epoch}.th'
        checkpoint_path = self.checkpoint_dir / checkpoint_file
        self.model.load_state_dict(torch.load(checkpoint_path, map_location=lambda storage, loc: storage))
        print('model loaded from checkpoint: {}'.format(checkpoint_file))

    def save_to_checkpoint(self, epoch):
        checkpoint_file = f'{self.experiment_name}-epoch-{epoch}.th'
        checkpoint_path = self.checkpoint_dir / checkpoint_file
        if self.checkpoint_dir.exists():
            torch.save(self.model.state_dict(), checkpoint_path)
            print(f'model saved to checkpoint: {checkpoint_path}')
        else:
            raise FileNotFoundError(f'checkpoint dir does not exist: {self.checkpoint_dir}')

    @staticmethod
    def construct_logging_str(loss, epoch, total_batches, idx):
        tmpstr = 'Epoch:{:2} Batch:[{:3}/{:3}] Loss: {:.4f}'
        tmpstr = tmpstr.format(epoch, idx, total_batches, loss)
        return tmpstr
--------------------------------------------------------------------------------
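
# Typical end-to-end training run (illustrative only — flags per
# ukro_g2p/train.py; the dataset dir layout expected by LexiconDataset is
# data.txt plus the train_set/dev_set/test_set key lists and the
# phones/letters vocab files):
#
#   python -m ukro_g2p.train -c ukro_g2p/configs/ukro_base_uncased.config -d data/lexicon -n my_experiment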