├── .gitignore
├── LICENSE
├── README.md
├── backend
│   ├── __init__.py
│   ├── config
│   │   └── default.yaml
│   ├── functional.py
│   ├── hparams.py
│   ├── manager.py
│   ├── models.py
│   ├── mytts.py
│   ├── pretrained
│   │   ├── ljspeech-melgan-epoch3200.pth
│   │   └── ljspeech-parallel-epoch0100.pth
│   ├── synthesizer.py
│   └── transform.py
├── client.py
├── client2.py
├── frontend
│   ├── .editorconfig
│   ├── .gitignore
│   ├── README.md
│   ├── babel.config.js
│   ├── package-lock.json
│   ├── package.json
│   ├── public
│   │   ├── demo.wav
│   │   ├── favicon.ico
│   │   └── index.html
│   ├── src
│   │   ├── App.vue
│   │   ├── assets
│   │   │   ├── logo.png
│   │   │   └── logo.svg
│   │   ├── components
│   │   │   ├── HelloWorld.vue
│   │   │   └── MyParaTTS.vue
│   │   ├── main.js
│   │   ├── plugins
│   │   │   └── vuetify.js
│   │   ├── router
│   │   │   └── index.js
│   │   └── views
│   │       ├── About.vue
│   │       └── Home.vue
│   └── vue.config.js
├── requirements.txt
└── server.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Projects
2 | .vscode/
3 | dist/
4 | outputs/
5 | 000-*
6 | 
7 | # Logs
8 | logs
9 | *.log
10 | npm-debug.log*
11 | yarn-debug.log*
12 | yarn-error.log*
13 | lerna-debug.log*
14 | 
15 | # Diagnostic reports (https://nodejs.org/api/report.html)
16 | report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json
17 | 
18 | # Runtime data
19 | pids
20 | *.pid
21 | *.seed
22 | *.pid.lock
23 | 
24 | # Directory for instrumented libs generated by jscoverage/JSCover
25 | lib-cov
26 | 
27 | # Coverage directory used by tools like istanbul
28 | coverage
29 | *.lcov
30 | 
31 | # nyc test coverage
32 | .nyc_output
33 | 
34 | # Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
35 | .grunt
36 | 
37 | # Bower dependency directory (https://bower.io/)
38 | bower_components
39 | 
40 | # node-waf configuration
41 | .lock-wscript
42 | 
43 | # Compiled binary addons (https://nodejs.org/api/addons.html)
44 | build/Release
45 | 
46 | # Dependency directories
47 | node_modules/
48 | jspm_packages/
49 | 
50 | # TypeScript v1 declaration files
51 | typings/
52 | 
53 | # TypeScript cache
54 | *.tsbuildinfo
55 | 
56 | # Optional npm cache directory
57 | .npm
58 | 
59 | # Optional eslint cache
60 | .eslintcache
61 | 
62 | # Microbundle cache
63 | .rpt2_cache/
64 | .rts2_cache_cjs/
65 | .rts2_cache_es/
66 | .rts2_cache_umd/
67 | 
68 | # Optional REPL history
69 | .node_repl_history
70 | 
71 | # Output of 'npm pack'
72 | *.tgz
73 | 
74 | # Yarn Integrity file
75 | .yarn-integrity
76 | 
77 | # dotenv environment variables file
78 | .env
79 | .env.test
80 | 
81 | # parcel-bundler cache (https://parceljs.org/)
82 | .cache
83 | 
84 | # Next.js build output
85 | .next
86 | 
87 | # Nuxt.js build / generate output
88 | .nuxt
89 | dist
90 | 
91 | # Gatsby files
92 | .cache/
93 | # Comment in the public line if your project uses Gatsby and *not* Next.js
94 | # https://nextjs.org/blog/next-9-1#public-directory-support
95 | # public
96 | 
97 | # vuepress build output
98 | .vuepress/dist
99 | 
100 | # Serverless directories
101 | .serverless/
102 | 
103 | # FuseBox cache
104 | .fusebox/
105 | 
106 | # DynamoDB Local files
107 | .dynamodb/
108 | 
109 | # TernJS port file
110 | .tern-port
111 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2021 Atomicoo
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # PTTS Web Demo
2 | 
3 | [TOC]
4 | 
5 | A single-page speech-synthesis demo built with [Flask](https://github.com/pallets/flask) + [Vue](https://github.com/vuejs/vue) (UI framework: [Vuetify](https://github.com/vuetifyjs/vuetify)). The TTS backend is based on my other project, [atomicoo/ParallelTTS](https://github.com/atomicoo/ParallelTTS).
6 | 
7 | ## Directory Structure
8 | 
9 | ```
10 | .
11 | |--- backend/
12 |      |--- pretrained/       # pretrained models
13 |      |--- mytts.py          # TTS wrapper class
14 |      |--- ...
15 | |--- dist/                  # compiled output of the frontend
16 | |--- frontend/
17 |      |--- public/
18 |      |--- src/
19 |           |--- components/
20 |                |--- MyParaTTS.vue   # speech-synthesis page
21 |           |--- ...
22 |      |--- ...
23 | |--- client.py              # API test script
24 | |--- LICENSE
25 | |--- README.md              # this document
26 | |--- requirements.txt       # Python dependencies
27 | |--- server.py              # server startup script
28 | ```
29 | 
30 | ## Quick Start
31 | 
32 | ```shell
33 | $ git clone https://github.com/atomicoo/PTTS-WebAPP.git
34 | $ cd PTTS-WebAPP/frontend/
35 | $ npm install --save
36 | $ npm run build
37 | $ cd ..
38 | $ pip install -r requirements.txt
39 | $ python server.py
40 | $ python client.py
41 | ```
42 | 
43 | After running `npm run build`, the compiled output of the frontend code should have been generated under `./dist/` in the project root.
44 | 
45 | After running `python server.py`, the server is up; try `python client.py` first to check that the speech-synthesis API works correctly.
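46 | 
47 | You can also exercise the HTTP API by hand. The sketch below mirrors what `client.py` sends; it assumes the server is running locally on the default port, and uses the server-side defaults of 4 for `speed`/`volume`/`tone` (see `server.py`):
48 | 
49 | ```python
50 | import requests
51 | 
52 | # POST /api/mytts returns a WAV file; GET with the same query params also works.
53 | resp = requests.post(
54 |     "http://127.0.0.1:5000/api/mytts",
55 |     json={"text": "Hello world.", "speed": 4, "volume": 4, "tone": 4},
56 | )
57 | with open("demo.wav", "wb") as f:
58 |     f.write(resp.content)
59 | ```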
60 | 
61 | If everything has worked so far, just open http://localhost:5000/.
62 | 
63 | ![image-20210412175503742](https://cdn.jsdelivr.net/gh/atomicoo/picture-bed@latest/2021/04/image-20210412175503742.png)
64 | 
65 | ## Known Issues
66 | 
67 | - The TTS backend is based on my own project [atomicoo/ParallelTTS](https://github.com/atomicoo/ParallelTTS), with the code structure refactored for simplicity. To switch to another language, in theory you only need to replace the config file under `./config/` and the model files under `./pretrained/`; this has not been fully tested, however, so problems may still occur.
68 | - ~~Only the speaking speed can be adjusted at the moment; volume and pitch controls will be added later — it would be great if someone could help out [doge].~~ (Done)
69 | - Pitch is adjusted via a time-scale modification (TSM) + resampling scheme (sketched below); volume is currently adjusted in a rather crude way, which will be replaced later.
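70 | 
71 | For the curious: `backend/mytts.py` first synthesizes the utterance `rate` times slower (by scaling the duration predictor's `alpha`), then resamples by `1/rate`, which restores the duration while scaling the pitch. A minimal sketch of the resampling half, assuming `librosa` is installed (the sine tone and the `rate` value are purely illustrative):
72 | 
73 | ```python
74 | import numpy as np
75 | import librosa
76 | 
77 | sr, rate = 22050, 1.25              # raise pitch by ~25%
78 | t = np.arange(sr) / sr
79 | wave = np.sin(2 * np.pi * 220 * t)  # 1 s of 220 Hz; stands in for audio synthesized `rate`x slower
80 | # Pretend the wave was sampled at sr*rate and resample it to sr: the result is
81 | # 1/rate times as long, and its pitch is scaled by `rate` on playback at sr.
82 | shifted = librosa.resample(wave, orig_sr=int(sr * rate), target_sr=sr)
83 | ```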
84 | 
85 | ## References
86 | 
87 | - [Flask: a Python web microframework](https://flask.palletsprojects.com/en/1.1.x/)
88 | - [Vuetify: a Material Design framework](https://vuetifyjs.com/zh-Hans/)
89 | - [A survey of speed-change-without-pitch-change (TSM) methods – Zhihu](https://zhuanlan.zhihu.com/p/337193578)
--------------------------------------------------------------------------------
/backend/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/atomicoo/PTTS-WebAPP/ff76ebd7e44542e79dcdb9c3bc23e91199de3a54/backend/__init__.py
--------------------------------------------------------------------------------
/backend/config/default.yaml:
--------------------------------------------------------------------------------
1 | text:
2 |   graphemes: &gs !!python/object/apply:eval ['list("abcdefghijklmnopqrstuvwxyz")']
3 |   phonemes: &ps !!python/object/apply:eval ['["AA0","AA1","AA2","AE0","AE1","AE2","AH0","AH1","AH2","AO0","AO1","AO2","AW0","AW1","AW2","AY0","AY1","AY2","B","CH","D","DH","EH0","EH1","EH2","ER0","ER1","ER2","EY0","EY1","EY2","F","G","HH","IH0","IH1","IH2","IY0","IY1","IY2","JH","K","L","M","N","NG","OW0","OW1","OW2","OY0","OY1","OY2","P","R","S","SH","T","TH","UH0","UH1","UH2","UW","UW0","UW1","UW2","V","W","Y","Z","ZH"]']
4 |   specials: &sp !!python/object/apply:eval ['["", ""]']
5 |   punctuations: &pt !!python/object/apply:eval ['[".", ",", "?", "!", " ", "-"]']
6 |   units_list: &ul !!python/object/apply:eval ['us+sp+pt', {'us': *ps, 'sp': *sp, 'pt': *pt}]
7 |   use_phonemes: &up true
8 | audio:
9 |   n_mel_channels: &nm 80
10 |   filter_length: 1024
11 |   hop_length: 256 # WARNING: this can't be changed.
12 |   win_length: 1024
13 |   sampling_rate: &sr 22050
14 |   segment_length: *sr
15 |   pad_short: 2000
16 |   mel_fmin: 80.0
17 |   mel_fmax: 7600.0
18 |   # Precomputed statistics of log-mel spectrograms for the speech dataset
19 |   spec_mean: -5.522 # for LJSpeech dataset
20 |   spec_std: 2.063 # for LJSpeech dataset
21 |   spec_min: -11.5129 # for LJSpeech dataset
22 |   spec_max: 2.0584 # for LJSpeech dataset
23 |   # Others
24 |   force_frame_rate: true # force match sampling rate
25 |   normalize:
26 |   match_volume: false
27 |   trim_silence: false
28 |   reduction_rate: 4
29 | parallel:
30 |   ground_truth: false
31 |   out_channels: *nm # equal to ${audio.n_mel_channels}
32 |   alphabet_size: !!python/object/apply:eval ['len(ul)', {'ul': *ul}]
33 |   channels: 128
34 |   enc_kernel_size: 4
35 |   dec_kernel_size: 4
36 |   enc_dilations: !!python/object/apply:eval ['4 * [1,2,4] + [1]'] # receptive field is max 15
37 |   dec_dilations: !!python/object/apply:eval ['4 * [1,2,4,8] + [1]'] # receptive field is max 32
38 |   normalize: FreqNorm # 'freq', 'layer', 'batch'
39 |   activation: torch.nn.ReLU # 'relu', 'linear', 'sigmoid'
40 |   final_activation: torch.nn.Identity
41 |   pos_mode: 'duration' # 'standard', 'duration'
42 |   interpolate: false # true
43 |   separate_duration_grad: true
44 |   checkpoint: 'ljspeech-parallel-epoch0100.pth'
45 | vocoder:
46 |   checkpoint: 'ljspeech-melgan-epoch3200.pth'
--------------------------------------------------------------------------------
/backend/functional.py:
--------------------------------------------------------------------------------
1 | import torch
2 | 
3 | 
4 | def mask(shape, lengths, dim=-1):
5 | 
6 |     assert dim != 0, 'Masking not available for batch dimension'
7 |     assert len(lengths) == shape[0], 'Lengths must contain as many elements as there are items in the batch'
8 | 
9 |     lengths = torch.as_tensor(lengths)
10 | 
11 |     to_expand = [1] * (len(shape)-1)+[-1]
12 |     mask = torch.arange(shape[dim]).expand(to_expand).transpose(dim, -1).expand(shape).to(lengths.device)
13 |     mask = mask < lengths.expand(to_expand).transpose(0, -1)
14 |     return mask
15 | 
16 | 
17 | def positional_encoding(channels, length, w=1):
18 |     """The positional encoding from the `Attention is all you need` paper
19 | 
20 |     :param channels: How many channels to use
21 |     :param length: How many positions to encode
22 |     :param w: Scaling factor
23 |     :return: (length, channels) tensor of interleaved sine/cosine encodings
24 |     """
25 |     enc = torch.FloatTensor(length, channels)
26 |     rows = torch.arange(length, out=torch.FloatTensor())[:, None]
27 |     cols = 2 * torch.arange(channels//2, out=torch.FloatTensor())
28 | 
29 |     enc[:, 0::2] = torch.sin(w * rows / (10.0**4 ** (cols / channels)))
30 |     enc[:, 1::2] = torch.cos(w * rows / (10.0**4 ** (cols / channels)))
31 |     return enc
32 | 
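33 | 
34 | if __name__ == '__main__':
35 |     # Illustrative usage sketch (not in the original module): build a boolean
36 |     # mask for a padded batch, and a small positional-encoding table.
37 |     m = mask((2, 5), lengths=[3, 5], dim=-1)
38 |     print(m)           # row 0: True, True, True, False, False
39 |     pe = positional_encoding(channels=8, length=5)
40 |     print(pe.shape)    # torch.Size([5, 8])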
--------------------------------------------------------------------------------
/backend/hparams.py:
--------------------------------------------------------------------------------
1 | # modified from https://github.com/HarryVolek/PyTorch_Speaker_Verification
2 | 
3 | import os
4 | import yaml
5 | 
6 | def load_hparam_str(hp_str):
7 |     path = 'temp-restore.yaml'
8 |     with open(path, 'w') as f:
9 |         f.write(hp_str)
10 |     ret = HParam(path)
11 |     os.remove(path)
12 |     return ret
13 | 
14 | 
15 | def load_hparam(filename):
16 |     stream = open(filename, 'r', encoding='utf-8')
17 |     docs = yaml.load_all(stream, Loader=yaml.Loader)
18 |     hparam_dict = dict()
19 |     for doc in docs:
20 |         for k, v in doc.items():
21 |             hparam_dict[k] = v
22 |     return hparam_dict
23 | 
24 | 
25 | def merge_dict(user, default):
26 |     if isinstance(user, dict) and isinstance(default, dict):
27 |         for k, v in default.items():
28 |             if k not in user:
29 |                 user[k] = v
30 |             else:
31 |                 user[k] = merge_dict(user[k], v)
32 |     return user
33 | 
34 | 
35 | class Dotdict(dict):
36 |     """
37 |     A dictionary that supports dot notation
38 |     as well as dictionary access notation.
39 |     Usage: d = Dotdict() or d = Dotdict({'val1': 'first'})
40 |     Set attributes: d.val2 = 'second' or d['val2'] = 'second'
41 |     Get attributes: d.val2 or d['val2']
42 |     """
43 |     __getattr__ = dict.__getitem__
44 |     __setattr__ = dict.__setitem__
45 |     __delattr__ = dict.__delitem__
46 | 
47 |     def __init__(self, dct=None):
48 |         dct = dict() if not dct else dct
49 |         for key, value in dct.items():
50 |             if hasattr(value, 'keys'):
51 |                 value = Dotdict(value)
52 |             self[key] = value
53 | 
54 | 
55 | class HParam(Dotdict):
56 | 
57 |     def __init__(self, file):
58 |         super(Dotdict, self).__init__()
59 |         hp_dict = load_hparam(file)
60 |         hp_dotdict = Dotdict(hp_dict)
61 |         for k, v in hp_dotdict.items():
62 |             setattr(self, k, v)
63 | 
64 |     __getattr__ = Dotdict.__getitem__
65 |     __setattr__ = Dotdict.__setitem__
66 |     __delattr__ = Dotdict.__delitem__
67 | 
--------------------------------------------------------------------------------
/backend/manager.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Tue Aug 22 19:41:55 2017
4 | @author: Quantum Liu
5 | """
6 | '''
7 | Example:
8 | gm = GPUManager()
9 | with torch.cuda.device(gm.auto_choice()):
10 |     blabla
11 | Or:
12 | gm = GPUManager()
13 | torch.cuda.set_device(gm.auto_choice())
14 | '''
15 | 
16 | import os
17 | import torch
18 | 
19 | 
20 | def check_gpus():
21 |     '''
22 |     GPU availability check
23 |     http://pytorch-cn.readthedocs.io/zh/latest/package_references/torch-cuda/
24 |     '''
25 |     if not torch.cuda.is_available():
26 |         print('This script can only manage NVIDIA GPUs, but no GPU was found on this device!')
27 |         return False
28 |     elif not 'NVIDIA System Management' in os.popen('nvidia-smi -h').read():
29 |         print("The 'nvidia-smi' tool was not found.")
30 |         return False
31 |     return True
32 | 
33 | 
34 | if check_gpus():
35 |     def parse(line, qargs):
36 |         '''
37 |         line:
38 |             a line of text
39 |         qargs:
40 |             query arguments
41 |         return:
42 |             a dict of gpu infos
43 |         Parse one line of the CSV-format text returned by nvidia-smi.
44 |         '''
45 |         numeric_args = ['memory.free', 'memory.total', 'power.draw', 'power.limit']  # numeric fields
46 |         power_manage_enable = lambda v: (not 'Not Support' in v)  # whether the GPU supports power management (laptop GPUs may not)
47 |         to_numeric = lambda v: float(v.upper().strip().replace('MIB', '').replace('W', ''))  # strip the unit from a value string
48 |         process = lambda k, v: ((int(to_numeric(v)) if power_manage_enable(v) else 1) if k in numeric_args else v.strip())
49 |         return {k: process(k, v) for k, v in zip(qargs, line.strip().split(','))}
50 | 
51 |     def query_gpu(qargs=[]):
52 |         '''
53 |         qargs:
54 |             query arguments
55 |         return:
56 |             a list of dicts
57 |         Query GPU info via nvidia-smi.
58 |         '''
59 |         qargs = ['index', 'gpu_name', 'memory.free', 'memory.total', 'power.draw', 'power.limit'] + qargs
60 |         cmd = 'nvidia-smi --query-gpu={} --format=csv,noheader'.format(','.join(qargs))
61 |         results = os.popen(cmd).readlines()
62 |         return [parse(line, qargs) for line in results]
63 | 
64 |     def by_power(d):
65 |         '''
66 |         Helper function for sorting GPUs by power usage.
67 |         '''
68 |         power_infos = (d['power.draw'], d['power.limit'])
69 |         if any(v == 1 for v in power_infos):
70 |             print('Power management unavailable for GPU {}'.format(d['index']))
71 |             return 1
72 |         return float(d['power.draw']) / d['power.limit']
73 | 
74 |     class GPUManager:
75 |         '''
76 |         qargs:
77 |             query arguments
78 |         A manager that lists all available GPU devices, sorts them, and picks
79 |         the most idle one. Within one GPUManager instance, each GPU is marked
80 |         once it has been chosen, and unmarked GPUs are preferred.
81 |         '''
82 |         def __init__(self, qargs=[]):
83 |             self.qargs = qargs
84 |             self.gpus = query_gpu(qargs)
85 |             for gpu in self.gpus:
86 |                 gpu['specified'] = False
87 |             self.gpu_num = len(self.gpus)
88 | 
89 |         def _sort_by_memory(self, gpus, by_size=False):
90 |             if by_size:
91 |                 print('Sorted by free memory size')
92 |                 return sorted(gpus, key=lambda d: d['memory.free'], reverse=True)
93 |             else:
94 |                 print('Sorted by free memory rate')
95 |                 return sorted(gpus, key=lambda d: float(d['memory.free']) / d['memory.total'], reverse=True)
96 | 
97 |         def _sort_by_power(self, gpus):
98 |             return sorted(gpus, key=by_power)
99 | 
100 |         def _sort_by_custom(self, gpus, key, reverse=False, qargs=[]):
101 |             if isinstance(key, str) and (key in qargs):
102 |                 return sorted(gpus, key=lambda d: d[key], reverse=reverse)
103 |             if isinstance(key, type(lambda a: a)):
104 |                 return sorted(gpus, key=key, reverse=reverse)
105 |             raise ValueError("The argument 'key' must be a function or a key in query args; please read the documentation of nvidia-smi")
106 | 
107 |         def auto_choice(self, mode=0):
108 |             '''
109 |             mode:
110 |                 0: (default) sorted by free memory size
111 |             return:
112 |                 the index of the chosen device
113 |             Automatically choose the most idle GPU, preferring devices that
114 |             have not been chosen before.
115 |             '''
116 |             for old_infos, new_infos in zip(self.gpus, query_gpu(self.qargs)):
117 |                 old_infos.update(new_infos)
118 |             unspecified_gpus = [gpu for gpu in self.gpus if not gpu['specified']] or self.gpus
119 | 
120 |             if mode == 0:
121 |                 print('Choosing the GPU device with largest free memory...')
122 |                 chosen_gpu = self._sort_by_memory(unspecified_gpus, True)[0]
123 |             elif mode == 1:
124 |                 print('Choosing the GPU device with highest free memory rate...')
125 |                 chosen_gpu = self._sort_by_memory(unspecified_gpus, False)[0]
126 |             elif mode == 2:
127 |                 print('Choosing the GPU device by power usage...')
128 |                 chosen_gpu = self._sort_by_power(unspecified_gpus)[0]
129 |             else:
130 |                 print('Given an unavailable mode; choosing by free memory instead')
131 |                 chosen_gpu = self._sort_by_memory(unspecified_gpus)[0]
132 |             chosen_gpu['specified'] = True
133 |             index = chosen_gpu['index']
134 |             print('Using GPU {i}, its info:\n\t{info}'.format(i=index, info='\n\t'.join([str(k) + ': ' + str(v) for k, v in chosen_gpu.items()])))
135 |             return int(index)
136 | else:
137 |     raise ImportError('GPU available check failed!')
138 | 
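139 | 
140 | if __name__ == '__main__':
141 |     # Illustrative usage sketch (not in the original file): requires an NVIDIA
142 |     # GPU and nvidia-smi on PATH, otherwise the check above raises ImportError.
143 |     print('Auto-chosen GPU index:', GPUManager().auto_choice(mode=0))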
--------------------------------------------------------------------------------
/backend/models.py:
--------------------------------------------------------------------------------
1 | import itertools
2 | 
3 | import torch
4 | import torch.nn as nn
5 | import torch.nn.functional as F
6 | 
7 | from .functional import mask, positional_encoding
8 | from .transform import Pad
9 | 
10 | 
11 | # ===============================================
12 | # Parallel Text2Mel
13 | # ===============================================
14 | 
15 | def expand_encodings(encodings, durations):
16 |     """Expand phoneme encodings according to corresponding estimated durations
17 | 
18 |     Durations should be 0-masked, to prevent expanding of padded characters
19 |     :param encodings: (batch, time, channels) phoneme encodings
20 |     :param durations: (batch, time)
21 |     :return: list of expanded encodings, one (expanded_time, channels) tensor per utterance
22 |     """
23 |     encodings = [torch.repeat_interleave(e, d, dim=0)
24 |                  for e, d in zip(encodings, durations.long())]
25 | 
26 |     return encodings
27 | 
28 | 
29 | def expand_positional_encodings(durations, channels, repeat=False):
30 |     """Expand positional encoding to align with phoneme durations
31 | 
32 |     Example:
33 |         If repeat:
34 |         phonemes a, b, c have durations 3, 5, 4
35 |         The expanded encoding is
36 |            a   a   a   b   b   b   b   b   c   c   c   c
37 |         [e1, e2, e3, e1, e2, e3, e4, e5, e1, e2, e3, e4]
38 | 
39 |     Use Pad from transforms to get batched tensor.
40 | 
41 |     :param durations: (batch, time), 0-masked tensor
42 |     :return: positional encodings as a list of tensors, (batch, time)
43 |     """
44 | 
45 |     durations = durations.long()
46 |     def rng(l): return list(range(l))
47 | 
48 |     if repeat:
49 |         max_len = torch.max(durations)
50 |         pe = positional_encoding(channels, max_len)
51 |         idx = []
52 |         for d in durations:
53 |             idx.append(list(itertools.chain.from_iterable([rng(dd) for dd in d])))
54 |         return [pe[i] for i in idx]
55 |     else:
56 |         max_len = torch.max(durations.sum(dim=-1))
57 |         pe = positional_encoding(channels, max_len)
58 |         return [pe[:s] for s in durations.sum(dim=-1)]
59 | 
60 | 
61 | def round_and_mask(pred_durations, plen):
62 |     pred_durations[pred_durations < 1] = 1  # we do not care about gradient outside training
63 |     pred_durations = mask_durations(pred_durations, plen)  # the durations now expand only phonemes and not padded values
64 |     pred_durations = torch.round(pred_durations)
65 |     return pred_durations
66 | 
67 | 
68 | def mask_durations(durations, plen):
69 |     m = mask(durations.shape, plen, dim=-1).to(durations.device).float()
70 |     return durations * m
71 | 
72 | 
73 | def expand_enc(encodings, durations, mode=None):
74 |     """Copy each phoneme encoding as many times as the duration predictor predicts"""
75 |     encodings = Pad(0)(expand_encodings(encodings, durations))
76 |     if mode:
77 |         if mode == 'duration':
78 |             encodings += Pad(0)(expand_positional_encodings(durations, encodings.shape[-1])).to(encodings.device)
79 |         elif mode == 'standard':
80 |             encodings += positional_encoding(encodings.shape[-1], encodings.shape[1]).to(encodings.device)
81 |     return encodings
82 | 
83 | 
84 | class ZeroTemporalPad(nn.ZeroPad2d):
85 |     """Pad sequences to equal length in the temporal dimension"""
86 |     def __init__(self, kernel_size, dilation, causal=False):
87 |         total_pad = (dilation * (kernel_size - 1))
88 | 
89 |         if causal:
90 |             super(ZeroTemporalPad, self).__init__((0, 0, total_pad, 0))
91 |         else:
92 |             begin = total_pad // 2
93 |             end = total_pad - begin
94 |             super(ZeroTemporalPad, self).__init__((0, 0, begin, end))
95 | 
96 | 
97 | class Conv1d(nn.Conv1d):
98 |     """A wrapper around nn.Conv1d that works on (batch, time, channels)"""
99 | 
100 |     def __init__(self, in_channels, out_channels, kernel_size=1, stride=1, dilation=1, groups=1, bias=True, padding=0):
101 |         super(Conv1d, self).__init__(in_channels=in_channels, out_channels=out_channels,
102 |                                      kernel_size=kernel_size, stride=stride, dilation=dilation,
103 |                                      groups=groups, bias=bias, padding=padding)
104 | 
105 |     def forward(self, x):
106 |         return super().forward(x.transpose(2, 1)).transpose(2, 1)
107 | 
108 | 
109 | class FreqNorm(nn.BatchNorm1d):
110 |     """Normalize each frequency channel of the spectrogram separately, over time and batch.
111 | 
112 | 
113 |     Examples:
114 |         t = torch.arange(2*10*5).reshape(2, 10, 5).float()
115 |         b1 = nn.BatchNorm1d(10, affine=False, momentum=None)
116 |         b2 = (t - t.mean([0,2], keepdim=True))/torch.sqrt(t.var([0,2], unbiased=False, keepdim=True)+1e-05)
117 |         -> b1 and b2 give the same results
118 |         -> BatchNorm1d by default normalizes over channels and batch - not useful for different-length sequences
119 |         If we transpose the last two dims, we get normalization across batch and time
120 |         -> normalization for each frequency channel over time and batch
121 | 
122 |     # compare to layer norm:
123 |     Layer_norm: (t - t.mean(-1, keepdim=True))/torch.sqrt(t.var(-1, unbiased=False, keepdim=True)+1e-05)
124 |     -> layer norm normalizes across all frequencies for each timestep, independently of batch
125 | 
126 |     => LayerNorm: Normalize each freq. bin wrt other freq bins in the same timestep -> time independent, batch independent, freq dependent
127 |     => FreqNorm: Normalize each freq. bin wrt the same freq bin across time and batch -> time dependent, other freq independent
128 |     """
129 |     def __init__(self, channels, affine=True, track_running_stats=True, momentum=0.1):
130 |         super(FreqNorm, self).__init__(channels, affine=affine, track_running_stats=track_running_stats, momentum=momentum)
131 | 
132 |     def forward(self, x):
133 |         return super().forward(x.transpose(2,1)).transpose(2,1)
134 | 
135 | 
136 | class ResidualBlock(nn.Module):
137 |     """Implements conv -> activation -> norm, n times, with a residual connection"""
138 | 
139 |     def __init__(self, channels, kernel_size, dilation, n=2, causal=False, norm=FreqNorm, activation=nn.ReLU):
140 |         super(ResidualBlock, self).__init__()
141 | 
142 |         self.blocks = [
143 |             nn.Sequential(
144 |                 Conv1d(channels, channels, kernel_size, dilation=dilation),
145 |                 ZeroTemporalPad(kernel_size, dilation, causal=causal),
146 |                 activation(),
147 |                 norm(channels),  # Normalize after activation. if we used ReLU, half of our neurons would be dead!
148 |             )
149 |             for i in range(n)
150 |         ]
151 | 
152 |         self.blocks = nn.Sequential(*self.blocks)
153 | 
154 |     def forward(self, x):
155 |         return x + self.blocks(x)
156 | 
157 | 
158 | class TextEncoder(nn.Module):
159 |     """Encodes input phonemes for the duration predictor and the decoder"""
160 |     def __init__(self, hp):
161 |         super(TextEncoder, self).__init__()
162 |         self.kernel_size = hp.enc_kernel_size
163 |         self.dilations = hp.enc_dilations
164 | 
165 |         self.prenet = nn.Sequential(
166 |             nn.Embedding(hp.alphabet_size, hp.channels, padding_idx=0),
167 |             Conv1d(hp.channels, hp.channels),
168 |             eval(hp.activation)(),
169 |         )
170 | 
171 |         self.res_blocks = nn.Sequential(*[
172 |             ResidualBlock(hp.channels, self.kernel_size, d, n=2, norm=eval(hp.normalize), activation=eval(hp.activation))
173 |             for d in self.dilations
174 |         ])
175 | 
176 |         self.post_net1 = nn.Sequential(
177 |             Conv1d(hp.channels, hp.channels),
178 |         )
179 | 
180 |         self.post_net2 = nn.Sequential(
181 |             eval(hp.activation)(),
182 |             eval(hp.normalize)(hp.channels),
183 |             Conv1d(hp.channels, hp.channels)
184 |         )
185 | 
186 |     def forward(self, x):
187 |         embedding = self.prenet(x)
188 |         x = self.res_blocks(embedding)
189 |         x = self.post_net1(x) + embedding
190 |         return self.post_net2(x)
191 | 
192 | 
193 | class SpecDecoder(nn.Module):
194 |     """Decodes the expanded phoneme encoding into spectrograms"""
195 |     def __init__(self, hp):
196 |         super(SpecDecoder, self).__init__()
197 |         self.kernel_size = hp.dec_kernel_size
198 |         self.dilations = hp.dec_dilations
199 | 
200 |         self.res_blocks = nn.Sequential(
201 |             *[ResidualBlock(hp.channels, self.kernel_size, d, n=2, norm=eval(hp.normalize), activation=eval(hp.activation))
202 |               for d in self.dilations],
203 |         )
204 | 
205 |         self.post_net1 = nn.Sequential(
206 |             Conv1d(hp.channels, hp.channels),
207 |         )
208 | 
209 |         self.post_net2 = nn.Sequential(
210 |             ResidualBlock(hp.channels, self.kernel_size, 1, n=2),
211 |             Conv1d(hp.channels, hp.out_channels),
212 |             eval(hp.final_activation)()
213 |         )
214 | 
215 |     def forward(self, x):
216 |         xx = self.res_blocks(x)
217 |         x = self.post_net1(xx) + x
218 |         return self.post_net2(x)
219 | 
220 | 
221 | class DurationPredictor(nn.Module):
222 |     """Predicts phoneme log durations based on the encoder outputs"""
223 |     def __init__(self, hp):
224 |         super(DurationPredictor, self).__init__()
225 | 
226 |         self.layers = nn.Sequential(
227 |             ResidualBlock(hp.channels, 4, 1, n=1, norm=eval(hp.normalize), activation=nn.ReLU),
228 |             ResidualBlock(hp.channels, 3, 1, n=1, norm=eval(hp.normalize), activation=nn.ReLU),
229 |             ResidualBlock(hp.channels, 1, 1, n=1, norm=eval(hp.normalize), activation=nn.ReLU),
230 |             Conv1d(hp.channels, 1))
231 | 
232 |     def forward(self, x):
233 |         """Outputs are interpreted as log(durations).
234 |         To get actual durations, apply the exp transformation.
235 |         :param x:
236 |         :return:
237 |         """
238 |         return self.layers(x)
239 | 
240 | 
241 | class VoiceEncoder(nn.Module):
242 |     """Reference audio encoder"""
243 |     def __init__(self, hp):
244 |         super(VoiceEncoder, self).__init__()
245 | 
246 |         # Define the network
247 |         self.lstm = nn.LSTM(hp.n_mel_channels, hp.channels, 3, batch_first=True)
248 |         self.linear = nn.Linear(hp.channels, hp.speaker_dim)
249 |         self.relu = nn.ReLU()
250 | 
251 |     def forward(self, mels):
252 |         # Pass the input through the LSTM layers and retrieve the final hidden state of the last
253 |         # layer. Apply a cutoff to 0 for negative values and L2 normalize the embeddings.
254 |         _, (hidden, _) = self.lstm(mels)
255 |         # Take only the hidden state of the last layer
256 |         embeds_raw = self.relu(self.linear(hidden[-1]))
257 |         # L2-normalize it
258 |         embeds = embeds_raw / (torch.norm(embeds_raw, dim=1, keepdim=True) + 1e-5)
259 |         return embeds
260 | 
261 | 
262 | class Interpolate(nn.Module):
263 |     """Use multihead attention to increase variability in expanded phoneme encodings
264 | 
265 |     Not used in the final model, but used in reported experiments.
266 |     """
267 |     def __init__(self, hp):
268 |         super(Interpolate, self).__init__()
269 | 
270 |         ch = hp.channels
271 |         self.att = nn.MultiheadAttention(ch, num_heads=4)
272 |         self.norm = FreqNorm(ch)
273 |         self.conv = Conv1d(ch, ch, kernel_size=1)
274 | 
275 |     def forward(self, x):
276 |         xx = x.permute(1, 0, 2)  # (batch, time, channels) -> (time, batch, channels)
277 |         xx = self.att(xx, xx, xx)[0].permute(1, 0, 2)  # (batch, time, channels)
278 |         xx = self.conv(xx)
279 |         return self.norm(xx) + x
280 | 
281 | 
282 | class ParallelText2Mel(nn.Module):
283 |     def __init__(self, hp):
284 |         """Text to melspectrogram network.
285 |         Args:
286 |             hp: hyper parameters
287 |         Input:
288 |             L: (B, N) text inputs
289 |         Outputs:
290 |             Y: (B, T, f) predicted melspectrograms
291 |         """
292 |         super(ParallelText2Mel, self).__init__()
293 |         self.hparams = hp
294 |         self.encoder = TextEncoder(hp)
295 |         self.decoder = SpecDecoder(hp)
296 |         self.duration_predictor = DurationPredictor(hp)
297 | 
298 |     def forward(self, inputs):
299 |         texts, tlens, durations, alpha = inputs
300 |         alpha = alpha or 1.0
301 | 
302 |         encodings = self.encoder(texts)  # batch, time, channels
303 |         prd_durans = self.duration_predictor(encodings.detach() if self.hparams.separate_duration_grad
304 |                                              else encodings)[..., 0]  # batch, time
305 | 
306 |         # use exp(log(durations)) = durations
307 |         if durations is None:
308 |             prd_durans = (round_and_mask(torch.exp(prd_durans), tlens) * alpha).long()
309 |             encodings = expand_enc(encodings, prd_durans, mode='duration')
310 |         else:
311 |             encodings = expand_enc(encodings, durations, mode='duration')
312 | 
313 |         melspecs = self.decoder(encodings)
314 |         return melspecs, prd_durans
315 | 
316 | 
317 | # ===============================================
318 | # MelGAN Vocoder
319 | # ===============================================
320 | 
321 | MAX_WAV_VALUE = 32768.0
322 | 
323 | 
324 | class ResStack(nn.Module):
325 |     def __init__(self, channel):
326 |         super(ResStack, self).__init__()
327 | 
328 |         self.blocks = nn.ModuleList([
329 |             nn.Sequential(
330 |                 nn.LeakyReLU(0.2),
331 |                 nn.ReflectionPad1d(3**i),
332 |                 nn.utils.weight_norm(nn.Conv1d(channel, channel, kernel_size=3, dilation=3**i)),
333 |                 nn.LeakyReLU(0.2),
334 |                 nn.utils.weight_norm(nn.Conv1d(channel, channel, kernel_size=1)),
335 |             )
336 |             for i in range(3)
337 |         ])
338 | 
339 |         self.shortcuts = nn.ModuleList([
340 |             nn.utils.weight_norm(nn.Conv1d(channel, channel, kernel_size=1))
341 |             for i in range(3)
342 |         ])
343 | 
344 |     def forward(self, x):
345 |         for block, shortcut in zip(self.blocks, self.shortcuts):
346 |             x = shortcut(x) + block(x)
347 |         return x
348 | 
349 |     def remove_weight_norm(self):
350 |         for block, shortcut in zip(self.blocks, self.shortcuts):
351 |             nn.utils.remove_weight_norm(block[2])
352 |             nn.utils.remove_weight_norm(block[4])
353 |             nn.utils.remove_weight_norm(shortcut)
354 | 
355 | 
356 | class MelGenerator(nn.Module):
357 |     def __init__(self, mel_channel):
358 |         super(MelGenerator, self).__init__()
359 |         self.mel_channel = mel_channel
360 | 
361 |         self.generator = nn.Sequential(
362 |             nn.ReflectionPad1d(3),
363 |             nn.utils.weight_norm(nn.Conv1d(mel_channel, 512, kernel_size=7, stride=1)),
364 | 
365 |             nn.LeakyReLU(0.2),
366 |             nn.utils.weight_norm(nn.ConvTranspose1d(512, 256, kernel_size=16, stride=8, padding=4)),
367 | 
368 |             ResStack(256),
369 | 
370 |             nn.LeakyReLU(0.2),
371 |             nn.utils.weight_norm(nn.ConvTranspose1d(256, 128, kernel_size=16, stride=8, padding=4)),
372 | 
373 |             ResStack(128),
374 | 
375 |             nn.LeakyReLU(0.2),
376 |             nn.utils.weight_norm(nn.ConvTranspose1d(128, 64, kernel_size=4, stride=2, padding=1)),
377 | 
378 |             ResStack(64),
379 | 
380 |             nn.LeakyReLU(0.2),
381 |             nn.utils.weight_norm(nn.ConvTranspose1d(64, 32, kernel_size=4, stride=2, padding=1)),
382 | 
383 |             ResStack(32),
384 | 
385 |             nn.LeakyReLU(0.2),
386 |             nn.ReflectionPad1d(3),
387 |             nn.utils.weight_norm(nn.Conv1d(32, 1, kernel_size=7, stride=1)),
388 |             nn.Tanh(),
389 |         )
390 | 
391 |     def forward(self, mel):
392 |         mel = (mel + 5.0) / 5.0  # roughly normalize spectrogram
393 |         return self.generator(mel)
394 | 
395 |     def eval(self, inference=False):
396 |         super(MelGenerator, self).eval()
397 | 
398 |         # don't remove weight norm while validating inside the training loop
399 |         if inference:
400 |             self.remove_weight_norm()
401 | 
402 |     def remove_weight_norm(self):
403 |         for idx, layer in enumerate(self.generator):
404 |             if len(layer.state_dict()) != 0:
405 |                 try:
406 |                     nn.utils.remove_weight_norm(layer)
407 |                 except ValueError:
408 |                     # submodules like ResStack manage their own weight norm
409 |                     layer.remove_weight_norm()
410 | 
411 |     def inference(self, mel):
412 |         hop_length = 256
413 |         # pad input mel with zeros to cut artifact
414 |         # see https://github.com/seungwonpark/melgan/issues/8
415 |         zero = torch.full((1, self.mel_channel, 10), -11.5129).to(mel.device)
416 |         mel = torch.cat((mel, zero), dim=2)
417 | 
418 |         audio = self.forward(mel)
419 |         audio = audio.squeeze()  # collapse all dimensions except the time axis
420 |         audio = audio[:-(hop_length*10)]
421 |         audio = MAX_WAV_VALUE * audio
422 |         audio = audio.clamp(min=-MAX_WAV_VALUE, max=MAX_WAV_VALUE-1)
423 |         audio = audio.short()
424 | 
425 |         return audio
426 | 
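427 | 
428 | if __name__ == '__main__':
429 |     # Illustrative shape check (not in the original file): the generator
430 |     # upsamples by 8 * 8 * 2 * 2 = 256 samples per mel frame (= hop_length).
431 |     model = MelGenerator(mel_channel=80)
432 |     dummy_mel = torch.randn(1, 80, 10)   # (batch, mel_channels, frames)
433 |     audio = model(dummy_mel)
434 |     print(audio.shape)                   # torch.Size([1, 1, 2560])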
--------------------------------------------------------------------------------
/backend/mytts.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import os.path as osp
3 | import librosa
4 | 
5 | import torch
6 | from .hparams import HParam
7 | from .transform import StandardNorm, TextProcessor
8 | from .models import MelGenerator, ParallelText2Mel
9 | from .synthesizer import Synthesizer
10 | 
11 | try:
12 |     from .manager import GPUManager
13 | except ImportError as err:
14 |     print(err); gm = None
15 | else:
16 |     gm = GPUManager()
17 | 
18 | 
19 | def select_device(device):
20 |     cpu_request = device.lower() == 'cpu'
21 |     # if device requested other than 'cpu'
22 |     if device and not cpu_request:
23 |         c = 1024 ** 2  # bytes to MB
24 |         x = torch.cuda.get_device_properties(int(device))
25 |         s = f'Using torch {torch.__version__} '
26 |         print("%sCUDA:%s (%s, %dMB)" % (s, device, x.name, x.total_memory / c))
27 |         return torch.device(f'cuda:{device}')
28 |     else:
29 |         print(f'Using torch {torch.__version__} CPU')
30 |         return torch.device('cpu')
31 | 
32 | 
33 | class MyTTS:
34 |     def __init__(self, config=None, device=None):
35 |         if torch.cuda.is_available():
36 |             index = device if device else str(0 if gm is None else gm.auto_choice())
37 |         else:
38 |             index = 'cpu'
39 |         self.device = device = select_device(index)
40 | 
41 |         self.hparams = hparams = HParam(config) \
42 |             if config else HParam(osp.join(osp.dirname(osp.abspath(__file__)), "config", "default.yaml"))
43 | 
44 |         checkpoint = osp.join(osp.dirname(osp.abspath(__file__)), "pretrained", hparams.parallel.checkpoint)
45 |         vocoder_checkpoint = osp.join(osp.dirname(osp.abspath(__file__)), "pretrained", hparams.vocoder.checkpoint)
46 | 
47 |         normalizer = StandardNorm(hparams.audio.spec_mean, hparams.audio.spec_std)
48 |         processor = TextProcessor(hparams.text)
49 |         text2mel = ParallelText2Mel(hparams.parallel)
50 |         text2mel.eval()
51 |         vocoder = MelGenerator(hparams.audio.n_mel_channels).to(device)
52 |         vocoder.eval(inference=True)
53 | 
54 |         self.synthesizer = Synthesizer(
55 |             model=text2mel,
56 |             checkpoint=checkpoint,
57 |             vocoder=vocoder,
58 |             vocoder_checkpoint=vocoder_checkpoint,
59 |             processor=processor,
60 |             normalizer=normalizer,
61 |             device=device
62 |         )
63 | 
64 |     def __call__(self, texts, speed, volume, tone):
65 |         rate = int(tone) / 3
66 |         alpha = (4 / int(speed)) * rate
67 |         beta = int(volume) / 3
68 |         wave = self.synthesizer.inference(texts, alpha=alpha, beta=beta)
69 |         wave = wave.cpu().detach().numpy()
70 |         sr = self.hparams.audio.sampling_rate
71 |         # use TSM + resampling to change the tone (pitch)
72 |         wave = librosa.core.resample(wave, int(sr*rate), sr)
73 |         return wave, sr
74 | 
--------------------------------------------------------------------------------
/backend/pretrained/ljspeech-melgan-epoch3200.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/atomicoo/PTTS-WebAPP/ff76ebd7e44542e79dcdb9c3bc23e91199de3a54/backend/pretrained/ljspeech-melgan-epoch3200.pth
--------------------------------------------------------------------------------
/backend/pretrained/ljspeech-parallel-epoch0100.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/atomicoo/PTTS-WebAPP/ff76ebd7e44542e79dcdb9c3bc23e91199de3a54/backend/pretrained/ljspeech-parallel-epoch0100.pth
--------------------------------------------------------------------------------
/backend/synthesizer.py:
--------------------------------------------------------------------------------
1 | import time
2 | 
3 | import torch
4 | import torch.nn as nn
5 | 
6 | from .functional import mask
7 | 
8 | 
9 | class Synthesizer:
10 |     def __init__(self,
11 |                  model=None, checkpoint=None,
12 |                  vocoder=None, vocoder_checkpoint=None,
13 |                  processor=None, normalizer=None,
14 |                  device='cuda'):
15 |         # model
16 |         self.model = model
17 |         self.vocoder = vocoder
18 |         self.processor = processor
19 |         self.normalizer = normalizer
20 | 
21 |         # device
22 |         self.device = device
23 |         self.model.to(self.device)
24 |         print(f'Model sent to {self.device}')
25 | 
26 |         # helper vars
27 |         self.checkpoint = None
28 |         self.epoch, self.step = 0, 0
29 |         if checkpoint is not None:
30 |             self.checkpoint = checkpoint
31 |             self.load_checkpoint(checkpoint)
32 | 
33 |         self.vocoder_checkpoint = None
34 |         if vocoder_checkpoint is not None:
35 |             self.vocoder_checkpoint = vocoder_checkpoint
36 |             self.load_voc_checkpoint(vocoder_checkpoint)
37 | 
38 |     def to_device(self, device):
39 |         print(f'Sending network to {device}')
40 |         self.device = device
41 |         self.model.to(device)
42 |         self.vocoder.to(device)
43 |         return self
44 | 
45 |     def load_checkpoint(self, checkpoint):
46 |         checkpoint = torch.load(checkpoint, map_location=self.device)
47 |         self.epoch = checkpoint['epoch']
48 |         self.step = checkpoint['step']
49 |         self.model.load_state_dict(checkpoint['state_dict'])
50 |         print("Finished loading checkpoint (epoch=%d, step=%d)" % (self.epoch, self.step))
51 | 
52 |         self.checkpoint = None  # prevent overriding old checkpoint
53 |         return self
54 | 
55 |     def load_voc_checkpoint(self, checkpoint):
56 |         checkpoint = torch.load(checkpoint, map_location=self.device)
57 |         self.vocoder.load_state_dict(checkpoint)
58 |         print("Finished loading MelGAN checkpoint")
59 | 
60 |     def inference(self, texts, alpha=1.0, beta=1.0):
61 |         print('Synthesizing...')
62 |         since = time.time()
63 |         texts, tlens = self.processor(texts)
64 |         texts = torch.from_numpy(texts).long().to(self.device)
65 |         texts = torch.cat((texts, torch.zeros(len(texts), 7).long().to(self.device)), dim=-1)
66 |         tlens = torch.Tensor(tlens).to(self.device)
67 |         with torch.no_grad():
68 |             melspecs, prd_durans = self.model((texts, tlens, None, alpha))
69 |             melspecs = self.normalizer.inverse(melspecs * beta)
70 |             msk = mask(melspecs.shape, prd_durans.sum(dim=-1).long(), dim=1).to(self.device)
71 |             melspecs = melspecs.masked_fill(~msk, -11.5129).permute(0, 2, 1)
72 |             melspecs = torch.cat((melspecs, -11.5129*torch.ones(len(melspecs), melspecs.size(1), 3).to(self.device)), dim=-1)
73 |             print(f"Inferred {len(texts)} spectrograms, total elapsed {time.time()-since:.3f}s. Done.")
74 |             waves = self.vocoder(melspecs).squeeze(1)
75 |             print(f"Generated {len(texts)} audio clips, total elapsed {time.time()-since:.3f}s. Done.")
76 |         return waves
77 | 
--------------------------------------------------------------------------------
/backend/transform.py:
--------------------------------------------------------------------------------
1 | import re
2 | import numpy as np
3 | from g2p_en import G2p
4 | 
5 | import torch.nn as nn
6 | import torch.nn.functional as F
7 | from torch import as_tensor, stack
8 | 
9 | 
10 | class Pad:
11 |     """Pad all tensors in the first (length) dimension"""
12 | 
13 |     def __init__(self, pad_value=0, get_lens=False):
14 |         self.pad_value = pad_value
15 |         self.get_lens = get_lens
16 | 
17 |     def __call__(self, x):
18 |         """Pad each tensor in x to the same length
19 | 
20 |         Pad tensors in the first dimension and stack them to form a batch
21 | 
22 |         :param x: list of tensors/lists/arrays
23 |         :returns batch: (len_x, max_len_x, ...)
24 |         """
25 | 
26 |         if self.get_lens:
27 |             return self.pad_batch(x, self.pad_value), [len(xx) for xx in x]
28 | 
29 |         return self.pad_batch(x, self.pad_value)
30 | 
31 |     @staticmethod
32 |     def pad_batch(items, pad_value=0):
33 |         max_len = len(max(items, key=lambda x: len(x)))
34 |         zeros = (2*as_tensor(items[0]).ndim - 1) * [pad_value]
35 |         return stack([F.pad(as_tensor(x), pad=zeros + [max_len - len(x)], value=pad_value)
36 |                       for x in items])
37 | 
38 | 
39 | class StandardNorm(nn.Module):
40 |     def __init__(self, mean, std):
41 |         super(StandardNorm, self).__init__()
42 |         self.mean = mean
43 |         self.std = std
44 | 
45 |     def forward(self, x):
46 |         return (x - self.mean)/self.std
47 | 
48 |     def inverse(self, x):
49 |         return x * self.std + self.mean
50 | 
51 | 
52 | 
53 | class TextProcessor:
54 | 
55 |     g2p = G2p()
56 | 
57 |     def __init__(self, hparams):
58 |         self.units = self.graphemes = hparams.graphemes
59 |         self.phonemes = hparams.phonemes
60 |         self.phonemize = hparams.use_phonemes
61 |         if self.phonemize:
62 |             self.units = self.phonemes
63 |         self.specials = hparams.specials
64 |         self.punctuations = hparams.punctuations
65 |         self.units = self.specials + self.units + self.punctuations
66 |         self.txt2idx = {txt: idx for idx, txt in enumerate(self.units)}
67 |         self.idx2txt = {idx: txt for idx, txt in enumerate(self.units)}
68 | 
69 |     def normalize(self, text):
70 |         text = text.lower()
71 |         text = re.sub("[ ]+", " ", text)
72 |         # keep_re = "[^" + str(self.graphemes+self.punctuations) +"]"
73 |         # text = re.sub(keep_re, " ", text)  # remove
74 |         text = [ch if ch in self.graphemes+self.punctuations else ' ' for ch in text]
75 |         text = list(text)
76 |         if self.phonemize:
77 |             text = self.g2p(''.join(text))
78 |         return text
79 | 
80 |     def __call__(self, texts, max_n=None):
81 |         if not isinstance(texts, (str, list)):
82 |             raise TypeError("Inputs must be str or list(str)")
83 |         if isinstance(texts, str):
84 |             texts = [texts]
85 |         normalized_texts = [self.normalize(line) for line in texts]  # text normalization
86 |         tlens = [len(l) for l in normalized_texts]
87 |         max_n = max_n or max(tlens)
88 |         texts = np.zeros((len(normalized_texts), max_n), np.int64)
89 |         for i, text in enumerate(normalized_texts):
90 |             texts[i, :len(text)] = [self.txt2idx.get(ch, 1) for ch in text]
91 |         return texts, tlens
92 | 
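93 | 
94 | 
95 | if __name__ == '__main__':
96 |     # Illustrative usage sketch (not in the original file): Pad stacks
97 |     # variable-length sequences into one batch tensor.
98 |     from torch import tensor
99 |     batch = Pad(pad_value=0)([tensor([1, 2, 3]), tensor([4, 5])])
100 |     print(batch)  # tensor([[1, 2, 3], [4, 5, 0]])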
--------------------------------------------------------------------------------
/client.py:
--------------------------------------------------------------------------------
1 | import os
2 | import os.path as osp
3 | import requests
4 | from urllib.parse import urlencode
5 | import json, time, uuid
6 | 
7 | 
8 | url = "http://127.0.0.1:5000"
9 | 
10 | payload = {
11 |     "speed": 4,
12 |     "volume": 4,
13 |     "tone": 4,
14 |     "text": "To install precompiled package of eSpeak NG on Linux, use standard package manager of your distribution.",
15 | }
16 | headers = {
17 |     'content-type': "application/json"
18 | }
19 | 
20 | outputs_dir = "outputs"
21 | os.makedirs(outputs_dir, exist_ok=True)
22 | 
23 | 
24 | print("="*12 + " POST TEST " + "="*12)
25 | data = json.dumps(payload)
26 | response = requests.request("POST", url+"/api/mytts", data=data, headers=headers)
27 | if response.status_code == 200:
28 |     filename = f"{time.strftime('%Y-%m-%d')}_{uuid.uuid4()}.wav"
29 |     with open(osp.join(outputs_dir, filename), "wb") as fw:
30 |         fw.write(response.content)
31 |     # print(f"Audios saved to {outputs_dir}. Done.")
32 |     print("POST TEST SUCCEEDED!")
33 | else:
34 |     print("POST TEST FAILED!")
35 | 
36 | 
37 | print("="*12 + " GET TEST " + "="*12)
38 | data = urlencode(payload)
39 | response = requests.request("GET", url+"/api/mytts?"+data, headers=headers)
40 | if response.status_code == 200:
41 |     filename = f"{time.strftime('%Y-%m-%d')}_{uuid.uuid4()}.wav"
42 |     with open(osp.join(outputs_dir, filename), "wb") as fw:
43 |         fw.write(response.content)
44 |     # print(f"Audios saved to {outputs_dir}. Done.")
45 |     print("GET TEST SUCCEEDED!")
46 | else:
47 |     print("GET TEST FAILED!")
48 | 
--------------------------------------------------------------------------------
/client2.py:
--------------------------------------------------------------------------------
1 | import os
2 | import os.path as osp
3 | import requests
4 | from urllib.parse import urlencode
5 | import json, time, uuid
6 | import numpy as np
7 | from scipy.io.wavfile import write
8 | 
9 | 
10 | url = "http://127.0.0.1:5000"
11 | 
12 | payload = {
13 |     "speed": 4,
14 |     "volume": 4,
15 |     "tone": 4,
16 |     "text": "To install precompiled package of eSpeak NG on Linux, use standard package manager of your distribution.",
17 | }
18 | headers = {
19 |     'content-type': "application/json"
20 | }
21 | 
22 | outputs_dir = "outputs"
23 | os.makedirs(outputs_dir, exist_ok=True)
24 | 
25 | 
26 | print("="*12 + " POST TEST " + "="*12)
27 | data = json.dumps(payload)
28 | response = requests.request("POST", url+"/api/mytts", data=data, headers=headers)
29 | if response.status_code == 200:
30 |     content = response.content.decode('utf-8')
31 |     content = json.loads(content)
32 |     wave, sr = content['wave'], content['sr']
33 |     print('Saving audio...')
34 |     filename = osp.join(outputs_dir, f"{time.strftime('%Y-%m-%d')}_{uuid.uuid4()}.wav")
35 |     write(filename, sr, np.array(wave, dtype=np.float32))
36 |     print(f"Audios saved to {outputs_dir}. Done.")
37 |     print("POST TEST SUCCEEDED!")
38 | else:
39 |     print("POST TEST FAILED!")
40 | 
41 | 
42 | print("="*12 + " GET TEST " + "="*12)
43 | data = urlencode(payload)
44 | response = requests.request("GET", url+"/api/mytts?"+data, headers=headers)
45 | if response.status_code == 200:
46 |     content = response.content.decode('utf-8')
47 |     content = json.loads(content)
48 |     wave, sr = content['wave'], content['sr']
49 |     print('Saving audio...')
50 |     filename = osp.join(outputs_dir, f"{time.strftime('%Y-%m-%d')}_{uuid.uuid4()}.wav")
51 |     write(filename, sr, np.array(wave, dtype=np.float32))
52 |     print(f"Audios saved to {outputs_dir}. Done.")
53 |     print("GET TEST SUCCEEDED!")
54 | else:
55 |     print("GET TEST FAILED!")
56 | 
--------------------------------------------------------------------------------
/frontend/.editorconfig:
--------------------------------------------------------------------------------
1 | [*.{js,jsx,ts,tsx,vue}]
2 | indent_style = space
3 | indent_size = 2
4 | trim_trailing_whitespace = true
5 | insert_final_newline = true
6 | 
--------------------------------------------------------------------------------
/frontend/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | node_modules
3 | /dist
4 | 
5 | 
6 | # local env files
7 | .env.local
8 | .env.*.local
9 | 
10 | # Log files
11 | npm-debug.log*
12 | yarn-debug.log*
13 | yarn-error.log*
14 | pnpm-debug.log*
15 | 
16 | # Editor directories and files
17 | .idea
18 | .vscode
19 | *.suo
20 | *.ntvs*
21 | *.njsproj
22 | *.sln
23 | *.sw?
24 | 
--------------------------------------------------------------------------------
/frontend/README.md:
--------------------------------------------------------------------------------
1 | # frontend
2 | 
3 | ## Project setup
4 | ```
5 | npm install
6 | ```
7 | 
8 | ### Compiles and hot-reloads for development
9 | ```
10 | npm run serve
11 | ```
12 | 
13 | ### Compiles and minifies for production
14 | ```
15 | npm run build
16 | ```
17 | 
18 | ### Lints and fixes files
19 | ```
20 | npm run lint
21 | ```
22 | 
23 | ### Customize configuration
24 | See [Configuration Reference](https://cli.vuejs.org/config/).
25 | 
--------------------------------------------------------------------------------
/frontend/babel.config.js:
--------------------------------------------------------------------------------
1 | module.exports = {
2 |   presets: [
3 |     '@vue/cli-plugin-babel/preset'
4 |   ]
5 | }
6 | 
--------------------------------------------------------------------------------
/frontend/package.json:
--------------------------------------------------------------------------------
1 | {
2 |   "name": "frontend",
3 |   "version": "0.1.0",
4 |   "private": true,
5 |   "scripts": {
6 |     "serve": "vue-cli-service serve",
7 |     "build": "vue-cli-service build",
8 |     "lint": "vue-cli-service lint"
9 |   },
10 |   "dependencies": {
11 |     "axios": "^0.21.1",
12 |     "core-js": "^3.10.1",
13 |     "vue": "^2.6.12",
14 |     "vue-audio-native": "^0.1.41",
15 |     "vue-router": "^3.5.1",
16 |     "vuetify": "^2.4.9"
17 |   },
18 |   "devDependencies": {
19 |     "@vue/cli-plugin-babel": "^4.5.12",
20 |     "@vue/cli-plugin-eslint": "^4.5.12",
21 |     "@vue/cli-plugin-router": "^4.5.12",
22 |     "@vue/cli-service": "^4.5.12",
23 |     "@vue/eslint-config-standard": "^5.1.2",
24 |     "babel-eslint": "^10.1.0",
25 |     "eslint": "^6.8.0",
26 |     "eslint-plugin-import": "^2.22.1",
27 |     "eslint-plugin-node": "^11.1.0",
28 |     "eslint-plugin-promise": "^4.3.1",
29 |     "eslint-plugin-standard": "^4.1.0",
30 |     "eslint-plugin-vue": "^6.2.2",
31 |     "sass": "^1.32.8",
32 |     "sass-loader": "^10.1.1",
33 |     "vue-cli-plugin-vuetify": "^2.3.1",
34 |     "vue-template-compiler": "^2.6.12",
35 |     "vuetify-loader": "^1.7.2"
36 |   },
37 |   "eslintConfig": {
38 |     "root": true,
39 |     "env": {
40 |       "node": true
41 |     },
42 |     "extends": [
43 |       "plugin:vue/essential",
44 |       "@vue/standard"
45 |     ],
46 |     "parserOptions": {
47 |       "parser": "babel-eslint"
48 |     },
49 |     "rules": {}
50 |   },
51 |   "browserslist": [
52 |     "> 1%",
53 |     "last 2 versions",
54 |     "not dead"
55 |   ]
56 | }
57 | 
--------------------------------------------------------------------------------
/frontend/public/demo.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/atomicoo/PTTS-WebAPP/ff76ebd7e44542e79dcdb9c3bc23e91199de3a54/frontend/public/demo.wav
--------------------------------------------------------------------------------
/frontend/public/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/atomicoo/PTTS-WebAPP/ff76ebd7e44542e79dcdb9c3bc23e91199de3a54/frontend/public/favicon.ico
--------------------------------------------------------------------------------
/frontend/public/index.html:
--------------------------------------------------------------------------------
1 | 
2 | 
3 | 
4 | 
5 | 
6 | 
7 | 
8 |     <%= htmlWebpackPlugin.options.title %>
9 | 
10 | 
11 | 
12 | 
13 | 
16 | 
17 | 
18 | 
19 | 
20 | 
--------------------------------------------------------------------------------
/frontend/src/App.vue:
--------------------------------------------------------------------------------
1 | 
12 | 
13 | 
28 | 
--------------------------------------------------------------------------------
/frontend/src/assets/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/atomicoo/PTTS-WebAPP/ff76ebd7e44542e79dcdb9c3bc23e91199de3a54/frontend/src/assets/logo.png
--------------------------------------------------------------------------------
/frontend/src/assets/logo.svg:
--------------------------------------------------------------------------------
1 | Artboard 46
2 | 
--------------------------------------------------------------------------------
/frontend/src/components/HelloWorld.vue:
--------------------------------------------------------------------------------
1 | 
93 | 
94 | 
152 | 
--------------------------------------------------------------------------------
/frontend/src/components/MyParaTTS.vue:
--------------------------------------------------------------------------------
1 | 
70 | 
71 | 
112 | 
--------------------------------------------------------------------------------
/frontend/src/main.js:
--------------------------------------------------------------------------------
1 | import Vue from 'vue'
2 | import App from './App.vue'
3 | import router from './router'
4 | import vuetify from './plugins/vuetify'
5 | import vueAudioNative from 'vue-audio-native'
6 | Vue.use(vueAudioNative)
7 | 
8 | Vue.config.productionTip = false
9 | 
10 | new Vue({
11 |   router,
12 |   vuetify,
13 |   render: h => h(App)
14 | }).$mount('#app')
15 | 
--------------------------------------------------------------------------------
/frontend/src/plugins/vuetify.js:
--------------------------------------------------------------------------------
1 | import Vue from 'vue'
2 | import Vuetify from 'vuetify/lib/framework'
3 | 
4 | Vue.use(Vuetify)
5 | 
6 | export default new Vuetify({
7 | })
8 | 
--------------------------------------------------------------------------------
/frontend/src/router/index.js:
--------------------------------------------------------------------------------
1 | import Vue from 'vue'
2 | import VueRouter from 'vue-router'
3 | import Home from '../views/Home.vue'
4 | 
5 | Vue.use(VueRouter)
6 | 
7 | const routes = [
8 |   {
9 |     path: '/',
10 |     name: 'Home',
11 |     component: Home
12 |   },
13 |   {
14 |     path: '/about',
15 |     name: 'About',
16 |     // route level code-splitting
17 |     // this generates a separate chunk (about.[hash].js) for this route
18 |     // which is lazy-loaded when the route is visited.
19 |     component: () => import(/* webpackChunkName: "about" */ '../views/About.vue')
20 |   }
21 | ]
22 | 
23 | const router = new VueRouter({
24 |   mode: 'history',
25 |   base: process.env.BASE_URL,
26 |   routes
27 | })
28 | 
29 | export default router
30 | 
--------------------------------------------------------------------------------
/frontend/src/views/About.vue:
--------------------------------------------------------------------------------
1 | 
6 | 
--------------------------------------------------------------------------------
/frontend/src/views/Home.vue:
--------------------------------------------------------------------------------
1 | 
7 | 
8 | 
19 | 
--------------------------------------------------------------------------------
/frontend/vue.config.js:
--------------------------------------------------------------------------------
1 | module.exports = {
2 |   transpileDependencies: [
3 |     'vuetify'
4 |   ],
5 | 
6 |   outputDir: '../dist',
7 |   assetsDir: 'static'
8 | }
9 | 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | requests==2.25.0
2 | Flask==1.1.2
3 | Flask_Cors==3.0.9
4 | scipy==1.5.4
5 | numpy==1.19.2
6 | numba==0.48.0
7 | g2p_en==2.1.0
8 | torch==1.5.0
9 | PyYAML==5.4.1
10 | librosa<0.10  # required by backend/mytts.py; <0.10 keeps the positional resample call working
--------------------------------------------------------------------------------
/server.py:
--------------------------------------------------------------------------------
1 | import os.path as osp
2 | import requests
3 | 
4 | from flask import Flask, request, render_template, jsonify, send_file
5 | from flask_cors import CORS
6 | 
7 | from random import randint
8 | from backend.mytts import MyTTS
9 | from scipy.io.wavfile import write
10 | 
11 | tts = MyTTS(device='cpu')
12 | 
13 | app = Flask(__name__,
14 |             static_folder = "./dist/static",
15 |             template_folder = "./dist")
16 | cors = CORS(app, resources={r"/api/*": {"origins": "*"}})
17 | 
18 | 
19 | @app.route('/', defaults={'path': ''})
20 | @app.route('/<path:path>')
21 | def catch_all(path):
22 |     if app.debug:
23 |         return requests.get('http://127.0.0.1:8080/{}'.format(path)).text
24 |     return render_template("index.html")
25 | 
26 | 
27 | @app.route('/api/random')
28 | def api_random():
29 |     response = {
30 |         'randomNumber': randint(1, 100)
31 |     }
32 |     return jsonify(response)
33 | 
34 | @app.route('/api/mytts', methods=['GET', 'POST'])
35 | def api_mytts():
36 |     req = request.json if request.method == 'POST' else request.args
37 |     print(req)
38 |     text, speed, volume, tone = \
39 |         req.get('text'), req.get('speed', 4), req.get('volume', 4), req.get('tone', 4)
40 |     waves, sr = tts([text], int(speed), int(volume), int(tone))
41 |     filepath = osp.join('dist', 'demo.wav')
42 |     write(filepath, sr, waves[0])
43 |     return send_file(filepath)
44 |     # return jsonify({'wave': waves[0].tolist(), 'sr': sr})
45 | 
46 | 
47 | if __name__ == '__main__':
48 |     app.run(host='0.0.0.0', port=5000, debug=False)
49 | 
--------------------------------------------------------------------------------