├── julius4seg ├── __init__.py ├── sp_remover.py └── sp_inserter.py ├── sample ├── sample_kan.txt ├── result.png ├── sample_kana.txt ├── sample_voice.wav ├── run.sh ├── run_remover.py ├── README.md └── run_segment.py ├── .gitignore ├── LICENSE ├── Dockerfile └── README.md /julius4seg/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /sample/sample_kan.txt: -------------------------------------------------------------------------------- 1 | また 東寺のように 五大明王と 呼ばれる 主要な 明王の 中央に 配されることも多い 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.dfa 2 | *.dict 3 | .mypy_cache 4 | __pycache__ 5 | sample/*.txt 6 | -------------------------------------------------------------------------------- /sample/result.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yamachu/julius4seg/HEAD/sample/result.png -------------------------------------------------------------------------------- /sample/sample_kana.txt: -------------------------------------------------------------------------------- 1 | マタ トージノヨーニ ゴダイミョウオート ヨバレル シュヨーナ ミョーオーノ チューオーニ ハイサレルコトモオーイ 2 | -------------------------------------------------------------------------------- /sample/sample_voice.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yamachu/julius4seg/HEAD/sample/sample_voice.wav -------------------------------------------------------------------------------- /sample/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | if [ $# -lt 1 ]; then 5 | echo 'usage: args...' 6 | echo 'see: https://github.com/yamachu/julius4seg' 7 | exit 1 8 | fi 9 | 10 | case "$1" in 11 | "sp-remove" ) python3 run_remover.py ${@:2:($#-1)} ;; 12 | "sp-segment" ) python3 run_segment.py ${@:2:($#-1)} ;; 13 | * ) echo "sp-remove or sp-segment only" ;; 14 | esac 15 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Yusuke Yamada 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:18.04 2 | 3 | # For supporting Japanese in Python3 4 | ENV LC_CTYPE=C.UTF-8 5 | 6 | # Install git, git-lfs and Julius dependencies 7 | RUN apt-get update && \ 8 | apt-get install -y curl git python3 libgomp1 pulseaudio && \ 9 | apt-get clean && \ 10 | curl -sLO https://github.com/git-lfs/git-lfs/releases/download/v2.10.0/git-lfs-linux-amd64-v2.10.0.tar.gz && \ 11 | tar -zxf git-lfs-linux-amd64-v2.10.0.tar.gz git-lfs && \ 12 | mv git-lfs /usr/bin/ && \ 13 | rm git-lfs-linux-amd64-v2.10.0.tar.gz && \ 14 | git lfs install --skip-smudge 15 | 16 | ARG DICTATION_KIT_HASH="1ceb4dec245ef482918ca33c55c71d383dce145e" 17 | RUN git clone https://github.com/julius-speech/dictation-kit.git /opt/dictation-kit && \ 18 | cd /opt/dictation-kit && \ 19 | git checkout ${DICTATION_KIT_HASH} && \ 20 | git lfs fetch origin --recent -I "model/phone_m/jnas-mono-16mix-gid*" && \ 21 | git lfs checkout origin "model/phone_m/jnas-mono-16mix-gid*" 22 | 23 | ARG JULIUS4SEG_HASH="d83a954489d4d8ba605982355f6d95724a8121df" 24 | RUN git clone https://github.com/yamachu/julius4seg.git /opt/julius4seg && \ 25 | cd /opt/julius4seg && \ 26 | git checkout ${JULIUS4SEG_HASH} && \ 27 | cd /opt/julius4seg/sample && \ 28 | chmod +x ./run.sh 29 | 30 | WORKDIR /opt/julius4seg/sample 31 | 32 | ENTRYPOINT ["./run.sh"] 33 | -------------------------------------------------------------------------------- /sample/run_remover.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append('..') 3 | 4 | from julius4seg import sp_remover 5 | import array 6 | import argparse 7 | import wave 8 | 9 | 10 | def main(args: dict): 11 | sp_remover.MARGIN = int(args.margin) 12 | with open(args.input_seg_file) as f: 13 | label = [s.strip() for s in f] 14 | 15 | sp_label = sp_remover.get_sp_segment(label) 16 | 17 | tmp = sp_remover.get_wav_sp_removed(args.wav_file, sp_label, args.edge_only, int(args.start), int(args.end)) 18 | 19 | with wave.open(args.output_wav_file, 'wb') as f: 20 | f.setnchannels(1) 21 | f.setsampwidth(2) 22 | f.setframerate(16000) 23 | f.writeframes(array.array('h' , tmp).tostring()) 24 | 25 | 26 | if __name__ == '__main__': 27 | parser = argparse.ArgumentParser('sp remove demo') 28 | 29 | parser.add_argument('wav_file') 30 | parser.add_argument('input_seg_file') 31 | parser.add_argument('output_wav_file') 32 | parser.add_argument('-s', '--start', default=0, help='発話始点の残す無音区間の量[msec]: -Eの時のみ有効') 33 | parser.add_argument('-e', '--end', default=0, help='発話終端の残す無音区間の量[msec]: -Eの時のみ有効') 34 | parser.add_argument('-E', '--edge-only', action='store_true', help='発話の前後の無音区間のみを行う') 35 | parser.add_argument('-m', '--margin', default=5, help='有声区間と無声区間のマージン[msec]') 36 | 37 | args = parser.parse_args() 38 | 39 | main(args) 40 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # julius4seg 2 | 3 | [Julius Japanese Dictation-kit](https://github.com/julius-speech/dictation-kit)をPythonから叩いている風にするためのスクリプト. 4 | 5 | grammer-kitとsegmentation-kitを足して2で割ったような立ち位置. 6 | 7 | ## Usage 8 | 9 | ### Cloneして使う方 10 | 11 | see: https://github.com/yamachu/julius4seg/blob/master/sample/README.md 12 | 13 | ### Dockerで使う方 14 | 15 | see: https://hub.docker.com/r/yamachu/julius4seg 16 | 17 | ```sh 18 | $ docker pull yamachu/julius4seg:latest 19 | ``` 20 | 21 | #### segmentationしたい方 22 | 23 | コマンド例 24 | 25 | ```sh 26 | $ docker run --rm -v `pwd`/sample:/tmp yamachu/julius4seg sp-segment /tmp/sample_voice.wav /tmp/sample_kana.txt /tmp/seg.txt 27 | ``` 28 | 29 | 第一引数に `sp-segment` を入れて、その後に続く引数は[sample](https://github.com/yamachu/julius4seg/blob/master/sample/README.md)の `run_segment.py` と同様 30 | 31 | ファイル入力前提で作られているので、ローカルのディレクトリをマウントして、そのファイルを指定するようにして下さい。 32 | 33 | #### silenceを除去したい方 34 | 35 | コマンド例 36 | 37 | ```sh 38 | $ docker run --rm -v `pwd`/sample:/tmp yamachu/julius4seg sp-remove /tmp/sample_voice.wav /tmp/seg.txt /tmp/out.wav 39 | ``` 40 | 41 | 第一引数に `sp-remove` を入れて、その後に続く引数は[sample](https://github.com/yamachu/julius4seg/blob/master/sample/README.md)の `run_remover.py` と同様 42 | 43 | ## 注意事項 44 | 45 | このスクリプトを実行するのに依存しているDictation-kitはgit lfsがインストールされていないと音響モデルも一緒にクローンできないため注意. 46 | 47 | Juliusの標準的なサポートフォーマットである16kHz, 16bit, monoの音声を対象としている. 48 | 49 | macOSX, Python3.6で動作を確認. 50 | -------------------------------------------------------------------------------- /julius4seg/sp_remover.py: -------------------------------------------------------------------------------- 1 | import array 2 | import struct 3 | import wave 4 | 5 | 6 | # 有声音素を削らないためのマージン 7 | MARGIN = 5 8 | 9 | 10 | def get_sp_segment(time_list: [str]) -> [[int]]: 11 | '''音素セグメントリストから無音区間の部分のみを抽出 12 | args: 13 | time_list ([str]): 音素セグメントリスト 14 | returns: 15 | [[int]]: 無音区間の初めと終わりのフレームのリスト 16 | ''' 17 | sps = [list(map(int, s.split()[:2])) for s in time_list if 'silB' in s or 'silE' in s or 'sp' in s] 18 | return sps 19 | 20 | 21 | def get_wav_sp_removed(wav_file_name: str, sp_segment: [[int]], only_edge: bool = False, start_margin: int = MARGIN, end_margin: int = MARGIN) -> [int]: 22 | with wave.open(wav_file_name) as f: 23 | n = f.getnframes() 24 | data = struct.unpack('h'*n, f.readframes(n)) 25 | 26 | removed = [] 27 | 28 | seg_start = 0 29 | 30 | if only_edge: 31 | tmp = sp_segment[0][1] * 10 - start_margin 32 | seg_start = tmp if tmp > 0 else sp_segment[0][0] * 10 33 | 34 | tmp = sp_segment[-1][0] * 10 + end_margin 35 | seg_end = tmp if tmp < sp_segment[-1][1] * 10 else sp_segment[-1][1] * 10 36 | 37 | removed.extend(data[int(seg_start / 1000 * 16000):int(seg_end / 1000 * 16000)]) 38 | else: 39 | for i, seg in enumerate(sp_segment): 40 | if i == 0: 41 | seg_start = seg[1] * 10 - MARGIN # ms 42 | continue 43 | 44 | seg_end = seg[0] * 10 + MARGIN 45 | 46 | removed.extend(data[int(seg_start / 1000 * 16000):int(seg_end / 1000 * 16000)]) 47 | 48 | if i != len(sp_segment) - 1: 49 | seg_start = seg[1] * 10 - MARGIN 50 | 51 | return removed 52 | -------------------------------------------------------------------------------- /sample/README.md: -------------------------------------------------------------------------------- 1 | # サンプル 2 | 3 | ## セグメンテーションツール 4 | 5 | ### 実行例 6 | 7 | spを挿入したテキストが必要な場合(spを考慮した言語モデルの作成などの用途) 8 | 9 | ``` 10 | python3 run_segment.py sample_voice.wav -it sample_kan.txt -ot sp_kan.txt sample_kana.txt seg.txt 11 | ``` 12 | 13 | 音素のセグメントのみが必要な場合(主に合成などで無音区間を除去したい場合など) 14 | 15 | ``` 16 | python3 run_segment.py sample_voice.wav sample_kana.txt seg.txt 17 | ``` 18 | 19 | ### 注意事項 20 | 21 | _run_segment.py_ 内の 22 | ``` 23 | sp_inserter.JULIUS_ROOT = PurePath('/Users/yamachu/tmp/dictation-kit') 24 | ``` 25 | のPATHを自分の環境に合わせること. 26 | 27 | またJuliusの制約上,長い音声をセグメンテーションしようとした場合,失敗することがあります. 28 | 29 |
30 | セグメント結果の例 31 | 32 | ``` 33 | 0 71 silB 34 | 72 74 m 35 | 75 83 a 36 | 84 91 t 37 | 92 108 a 38 | 109 137 sp 39 | 138 144 t 40 | 145 164 o: 41 | 165 170 j 42 | 171 173 i 43 | 174 179 n 44 | 180 185 o 45 | 186 194 y 46 | 195 211 o: 47 | 212 214 n 48 | 215 230 i 49 | 231 286 sp 50 | 287 291 g 51 | 292 298 o 52 | 299 304 d 53 | 305 313 a 54 | 314 320 i 55 | 321 337 my 56 | 338 342 o 57 | 343 345 u 58 | 346 356 o: 59 | 357 362 t 60 | 363 365 o 61 | 366 372 y 62 | 373 375 o 63 | 376 382 b 64 | 383 386 a 65 | 387 389 r 66 | 390 397 e 67 | 398 402 r 68 | 403 420 u 69 | 421 453 sp 70 | 454 472 sh 71 | 473 475 u 72 | 476 484 y 73 | 485 500 o: 74 | 501 503 n 75 | 504 512 a 76 | 513 525 my 77 | 526 532 o: 78 | 533 552 o: 79 | 553 557 n 80 | 558 573 o 81 | 574 589 sp 82 | 590 605 ch 83 | 606 619 u: 84 | 620 635 o: 85 | 636 640 n 86 | 641 645 i 87 | 646 654 h 88 | 655 662 a 89 | 663 666 i 90 | 667 674 s 91 | 675 679 a 92 | 680 682 r 93 | 683 690 e 94 | 691 693 r 95 | 694 696 u 96 | 697 706 k 97 | 707 710 o 98 | 711 715 t 99 | 716 720 o 100 | 721 729 m 101 | 730 735 o 102 | 736 762 o: 103 | 763 772 i 104 | 773 872 silE 105 | ``` 106 |
107 | 108 | ## 無音除去ツール 109 | 110 | セグメンテーションツールより得られたセグメンテーションファイルを元にファイルの無音区間を除去する. 111 | 112 | ### 実行例 113 | 114 | 全ての無音データを削除する場合 115 | 116 | ``` 117 | python3 run_remover.py sample_voice.wav seg.txt out.wav 118 | ``` 119 | 120 | 音声の先頭と終端の無音区間のトリミングを行う場合(例では500msecに揃える) 121 | 122 | ``` 123 | python3 run_remover.py sample_voice.wav seg.txt out.wav -s 500 -e 500 -E 124 | ``` 125 | 126 | また無音区間と判定された最初のフレームの初めから、また最後のフレームの終わりからn[msec]を削除できる `m` オプションを使用できる. 127 | 128 | ![実行結果](https://github.com/yamachu/julius4seg/raw/master/sample/result.png "サンプル") 129 | 130 | 131 | ## その他 132 | 133 | サンプルのテキストは[日本声優統計学会](http://voice-statistics.github.io/)より,[声優統計コーパス 音素バランス文](https://github.com/voice-statistics/voice-statistics.github.com/blob/master/assets/doc/balance_sentences.txt)の001をお借りいたしました. 134 | 135 | またsample音声は本サンプルの実行以外での使用を禁じます. 136 | 137 | -------------------------------------------------------------------------------- /sample/run_segment.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append('..') 3 | 4 | from julius4seg import sp_inserter 5 | from pathlib import PurePath 6 | import argparse 7 | from logging import DEBUG, FileHandler 8 | 9 | # MUST CHANGE YOUR JULIUS DICTATION-KIT PATH 10 | sp_inserter.JULIUS_ROOT = PurePath('/opt/dictation-kit') 11 | 12 | # If you want to handle error, uncomment-out 13 | # fhandler = FileHandler(logname + '.log') 14 | # fhandler.setLevel(DEBUG) 15 | # sp_inserter.logger.addHandler(fhandler) 16 | 17 | 18 | def main(args: dict): 19 | utt_id = PurePath(args.wav_file).name.split('.')[0] 20 | 21 | with open(args.input_kana_file) as f: 22 | base_kata_text = f.readline().strip() 23 | 24 | if args.input_text_file: 25 | with open(args.input_text_file) as f: 26 | base_kan_text = f.readline().strip().split() 27 | else: 28 | base_kan_text = ['sym_{}'.format(i) for i in range(len(base_kata_text.split()))] 29 | 30 | assert len(base_kan_text) == len(base_kata_text.split()) 31 | 32 | julius_phones = [sp_inserter.conv2julius(hira) for hira in [sp_inserter.kata2hira(kata) for kata in base_kata_text.split()]] 33 | 34 | dict_1st = sp_inserter.gen_julius_dict_1st(base_kan_text, julius_phones) 35 | dfa_1st = sp_inserter.gen_julius_dfa(dict_1st.count('\n')) 36 | 37 | with open('first_pass.dict', 'w') as f: 38 | f.write(dict_1st) 39 | 40 | with open('first_pass.dfa', 'w') as f: 41 | f.write(dfa_1st) 42 | 43 | raw_first_output = sp_inserter.julius_sp_insert(args.wav_file, 'first_pass', args.hmm_model) 44 | 45 | forced_text_with_sp = [] 46 | forced_phones_with_sp = [] 47 | 48 | try: 49 | _, sp_position = sp_inserter.get_sp_inserted_text(raw_first_output, utt_id) 50 | 51 | for j, zipped in enumerate(zip(base_kan_text, julius_phones)): 52 | forced_text_with_sp.append(zipped[0]) 53 | forced_phones_with_sp.append(zipped[1]) 54 | if j in sp_position: 55 | forced_text_with_sp.append('') 56 | forced_phones_with_sp.append('sp') 57 | 58 | forced_text_with_sp = ' '.join(forced_text_with_sp) 59 | forced_phones_with_sp = ' '.join(forced_phones_with_sp) 60 | except: 61 | pass 62 | 63 | phones_with_sp = sp_inserter.get_sp_inserterd_phone_seqence(raw_first_output, utt_id) 64 | 65 | if len(forced_phones_with_sp) < 2: 66 | forced_phones_with_sp = phones_with_sp 67 | 68 | dict_2nd = sp_inserter.gen_julius_dict_2nd(forced_phones_with_sp) 69 | dfa_2nd = sp_inserter.gen_julius_aliment_dfa() 70 | 71 | with open('second_pass.dict', 'w') as f: 72 | f.write(dict_2nd) 73 | 74 | with open('second_pass.dfa', 'w') as f: 75 | f.write(dfa_2nd) 76 | 77 | raw_second_output = sp_inserter.julius_phone_alignment(args.wav_file, 'second_pass', args.hmm_model) 78 | 79 | time_alimented_list = sp_inserter.get_time_alimented_list(raw_second_output) 80 | 81 | if args.output_text_file: 82 | with open(args.output_text_file, 'w') as f: 83 | f.write(forced_text_with_sp + '\n') 84 | 85 | with open(args.output_seg_file, 'w') as f: 86 | for ss in time_alimented_list: 87 | f.write(' '.join(list(ss)) + '\n') 88 | 89 | 90 | if __name__ == '__main__': 91 | parser = argparse.ArgumentParser('sp insert demo by Julius') 92 | 93 | parser.add_argument('wav_file', help='入力音声') 94 | parser.add_argument('input_kana_file', help='スペース区切りのカナ読みファイル') 95 | parser.add_argument('output_seg_file', help='時間情報付き音素セグメントファイル') 96 | 97 | parser.add_argument('-it','--input_text_file', help='漢字仮名交じり文') 98 | parser.add_argument('-ot', '--output_text_file', help='漢字仮名交じり文にspを挿入したもの') 99 | 100 | parser.add_argument('--hmm_model', help='support mono-phone model only') 101 | 102 | args = parser.parse_args() 103 | 104 | main(args) 105 | -------------------------------------------------------------------------------- /julius4seg/sp_inserter.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import sys 4 | import subprocess 5 | from itertools import chain 6 | from pathlib import Path, PurePath 7 | 8 | from logging import getLogger, DEBUG, NullHandler 9 | logger = getLogger(__name__) 10 | logger.addHandler(NullHandler()) 11 | logger.setLevel(DEBUG) 12 | logger.propagate = False 13 | 14 | 15 | # MUST CHANGE 16 | JULIUS_ROOT = PurePath('.') 17 | 18 | 19 | def get_os_dependent_directory() -> str: 20 | '''Juluis Segmentaion-Kitのディレクトリ名をOSの種類から取得 21 | returns: 22 | (str): OS依存のパスの一部 23 | ''' 24 | if sys.platform.startswith('win') or sys.platform.startswith('cygwin'): 25 | return 'windows' 26 | elif sys.platform.startswith('darwin'): 27 | return 'osx' 28 | elif sys.platform.startswith('linux'): 29 | return 'linux' 30 | 31 | 32 | def get_os_dependent_exec() -> str: 33 | '''Juliusの実行ファイル名を取得 34 | returns: 35 | (str): Juliusの実行ファイル名 36 | ''' 37 | if sys.platform.startswith('win') or sys.platform.startswith('cygwin'): 38 | return 'julius.exe' 39 | else: 40 | return 'julius' 41 | 42 | 43 | def kata2hira(kana: str) -> str: 44 | '''ヴ,ヵ,ヶ以外のカタカナをひらがなに変換 45 | args: 46 | kana(str): カタカナ文字列 47 | "ヤキニク" 48 | returns: 49 | (str): ひらがな文字列 50 | "やきにく" 51 | ''' 52 | return ''.join([chr(ord(c) + ord('あ') - ord('ア')) if c != 'ー' else 'ー' for c in kana]) 53 | 54 | 55 | def conv2julius(s: str) -> str: 56 | '''入力の単語の読み(ひらがな)をJuliusの音素列に変換 57 | args: 58 | kana(str): カタカナ文字列 59 | "やきにく" 60 | returns: 61 | (str): ひらがな文字列 62 | " y a k i n i k u" 63 | ''' 64 | s = s.replace('あぁ',' a a') 65 | s = s.replace('いぃ',' i i') 66 | s = s.replace('いぇ',' i e') 67 | s = s.replace('いゃ',' y a') 68 | s = s.replace('うぅ',' u:') 69 | s = s.replace('えぇ',' e e') 70 | s = s.replace('おぉ',' o:') 71 | s = s.replace('かぁ',' k a:') 72 | s = s.replace('きぃ',' k i:') 73 | s = s.replace('くぅ',' k u:') 74 | s = s.replace('くゃ',' ky a') 75 | s = s.replace('くゅ',' ky u') 76 | s = s.replace('くょ',' ky o') 77 | s = s.replace('けぇ',' k e:') 78 | s = s.replace('こぉ',' k o:') 79 | s = s.replace('がぁ',' g a:') 80 | s = s.replace('ぎぃ',' g i:') 81 | s = s.replace('ぐぅ',' g u:') 82 | s = s.replace('ぐゃ',' gy a') 83 | s = s.replace('ぐゅ',' gy u') 84 | s = s.replace('ぐょ',' gy o') 85 | s = s.replace('げぇ',' g e:') 86 | s = s.replace('ごぉ',' g o:') 87 | s = s.replace('さぁ',' s a:') 88 | s = s.replace('しぃ',' sh i:') 89 | s = s.replace('すぅ',' s u:') 90 | s = s.replace('すゃ',' sh a') 91 | s = s.replace('すゅ',' sh u') 92 | s = s.replace('すょ',' sh o') 93 | s = s.replace('せぇ',' s e:') 94 | s = s.replace('そぉ',' s o:') 95 | s = s.replace('ざぁ',' z a:') 96 | s = s.replace('じぃ',' j i:') 97 | s = s.replace('ずぅ',' z u:') 98 | s = s.replace('ずゃ',' zy a') 99 | s = s.replace('ずゅ',' zy u') 100 | s = s.replace('ずょ',' zy o') 101 | s = s.replace('ぜぇ',' z e:') 102 | s = s.replace('ぞぉ',' z o:') 103 | s = s.replace('たぁ',' t a:') 104 | s = s.replace('ちぃ',' ch i:') 105 | s = s.replace('つぁ',' ts a') 106 | s = s.replace('つぃ',' ts i') 107 | s = s.replace('つぅ',' ts u:') 108 | s = s.replace('つゃ',' ch a') 109 | s = s.replace('つゅ',' ch u') 110 | s = s.replace('つょ',' ch o') 111 | s = s.replace('つぇ',' ts e') 112 | s = s.replace('つぉ',' ts o') 113 | s = s.replace('てぇ',' t e:') 114 | s = s.replace('とぉ',' t o:') 115 | s = s.replace('だぁ',' d a:') 116 | s = s.replace('ぢぃ',' j i:') 117 | s = s.replace('づぅ',' d u:') 118 | s = s.replace('づゃ',' zy a') 119 | s = s.replace('づゅ',' zy u') 120 | s = s.replace('づょ',' zy o') 121 | s = s.replace('でぇ',' d e:') 122 | s = s.replace('どぉ',' d o:') 123 | s = s.replace('なぁ',' n a:') 124 | s = s.replace('にぃ',' n i:') 125 | s = s.replace('ぬぅ',' n u:') 126 | s = s.replace('ぬゃ',' ny a') 127 | s = s.replace('ぬゅ',' ny u') 128 | s = s.replace('ぬょ',' ny o') 129 | s = s.replace('ねぇ',' n e:') 130 | s = s.replace('のぉ',' n o:') 131 | s = s.replace('はぁ',' h a:') 132 | s = s.replace('ひぃ',' h i:') 133 | s = s.replace('ふぅ',' f u:') 134 | s = s.replace('ふゃ',' hy a') 135 | s = s.replace('ふゅ',' hy u') 136 | s = s.replace('ふょ',' hy o') 137 | s = s.replace('へぇ',' h e:') 138 | s = s.replace('ほぉ',' h o:') 139 | s = s.replace('ばぁ',' b a:') 140 | s = s.replace('びぃ',' b i:') 141 | s = s.replace('ぶぅ',' b u:') 142 | s = s.replace('ふゃ',' hy a') 143 | s = s.replace('ぶゅ',' by u') 144 | s = s.replace('ふょ',' hy o') 145 | s = s.replace('べぇ',' b e:') 146 | s = s.replace('ぼぉ',' b o:') 147 | s = s.replace('ぱぁ',' p a:') 148 | s = s.replace('ぴぃ',' p i:') 149 | s = s.replace('ぷぅ',' p u:') 150 | s = s.replace('ぷゃ',' py a') 151 | s = s.replace('ぷゅ',' py u') 152 | s = s.replace('ぷょ',' py o') 153 | s = s.replace('ぺぇ',' p e:') 154 | s = s.replace('ぽぉ',' p o:') 155 | s = s.replace('まぁ',' m a:') 156 | s = s.replace('みぃ',' m i:') 157 | s = s.replace('むぅ',' m u:') 158 | s = s.replace('むゃ',' my a') 159 | s = s.replace('むゅ',' my u') 160 | s = s.replace('むょ',' my o') 161 | s = s.replace('めぇ',' m e:') 162 | s = s.replace('もぉ',' m o:') 163 | s = s.replace('やぁ',' y a:') 164 | s = s.replace('ゆぅ',' y u:') 165 | s = s.replace('ゆゃ',' y a:') 166 | s = s.replace('ゆゅ',' y u:') 167 | s = s.replace('ゆょ',' y o:') 168 | s = s.replace('よぉ',' y o:') 169 | s = s.replace('らぁ',' r a:') 170 | s = s.replace('りぃ',' r i:') 171 | s = s.replace('るぅ',' r u:') 172 | s = s.replace('るゃ',' ry a') 173 | s = s.replace('るゅ',' ry u') 174 | s = s.replace('るょ',' ry o') 175 | s = s.replace('れぇ',' r e:') 176 | s = s.replace('ろぉ',' r o:') 177 | s = s.replace('わぁ',' w a:') 178 | s = s.replace('をぉ',' o:') 179 | 180 | s = s.replace('ゔ',' b u') 181 | s = s.replace('でぃ',' d i') 182 | s = s.replace('でぇ',' d e:') 183 | s = s.replace('でゃ',' dy a') 184 | s = s.replace('でゅ',' dy u') 185 | s = s.replace('でょ',' dy o') 186 | s = s.replace('てぃ',' t i') 187 | s = s.replace('てぇ',' t e:') 188 | s = s.replace('てゃ',' ty a') 189 | s = s.replace('てゅ',' ty u') 190 | s = s.replace('てょ',' ty o') 191 | s = s.replace('すぃ',' s i') 192 | s = s.replace('ずぁ',' z u a') 193 | s = s.replace('ずぃ',' z i') 194 | s = s.replace('ずぅ',' z u') 195 | s = s.replace('ずゃ',' zy a') 196 | s = s.replace('ずゅ',' zy u') 197 | s = s.replace('ずょ',' zy o') 198 | s = s.replace('ずぇ',' z e') 199 | s = s.replace('ずぉ',' z o') 200 | s = s.replace('きゃ',' ky a') 201 | s = s.replace('きゅ',' ky u') 202 | s = s.replace('きょ',' ky o') 203 | s = s.replace('しゃ',' sh a') 204 | s = s.replace('しゅ',' sh u') 205 | s = s.replace('しぇ',' sh e') 206 | s = s.replace('しょ',' sh o') 207 | s = s.replace('ちゃ',' ch a') 208 | s = s.replace('ちゅ',' ch u') 209 | s = s.replace('ちぇ',' ch e') 210 | s = s.replace('ちょ',' ch o') 211 | s = s.replace('とぅ',' t u') 212 | s = s.replace('とゃ',' ty a') 213 | s = s.replace('とゅ',' ty u') 214 | s = s.replace('とょ',' ty o') 215 | s = s.replace('どぁ',' d o a') 216 | s = s.replace('どぅ',' d u') 217 | s = s.replace('どゃ',' dy a') 218 | s = s.replace('どゅ',' dy u') 219 | s = s.replace('どょ',' dy o') 220 | s = s.replace('どぉ',' d o:') 221 | s = s.replace('にゃ',' ny a') 222 | s = s.replace('にゅ',' ny u') 223 | s = s.replace('にょ',' ny o') 224 | s = s.replace('ひゃ',' hy a') 225 | s = s.replace('ひゅ',' hy u') 226 | s = s.replace('ひょ',' hy o') 227 | s = s.replace('みゃ',' my a') 228 | s = s.replace('みゅ',' my u') 229 | s = s.replace('みょ',' my o') 230 | s = s.replace('りゃ',' ry a') 231 | s = s.replace('りゅ',' ry u') 232 | s = s.replace('りょ',' ry o') 233 | s = s.replace('ぎゃ',' gy a') 234 | s = s.replace('ぎゅ',' gy u') 235 | s = s.replace('ぎょ',' gy o') 236 | s = s.replace('ぢぇ',' j e') 237 | s = s.replace('ぢゃ',' j a') 238 | s = s.replace('ぢゅ',' j u') 239 | s = s.replace('ぢょ',' j o') 240 | s = s.replace('じぇ',' j e') 241 | s = s.replace('じゃ',' j a') 242 | s = s.replace('じゅ',' j u') 243 | s = s.replace('じょ',' j o') 244 | s = s.replace('びゃ',' by a') 245 | s = s.replace('びゅ',' by u') 246 | s = s.replace('びょ',' by o') 247 | s = s.replace('ぴゃ',' py a') 248 | s = s.replace('ぴゅ',' py u') 249 | s = s.replace('ぴょ',' py o') 250 | s = s.replace('うぁ',' u a') 251 | s = s.replace('うぃ',' w i') 252 | s = s.replace('うぇ',' w e') 253 | s = s.replace('うぉ',' w o') 254 | s = s.replace('ふぁ',' f a') 255 | s = s.replace('ふぃ',' f i') 256 | s = s.replace('ふぅ',' f u') 257 | s = s.replace('ふゃ',' hy a') 258 | s = s.replace('ふゅ',' hy u') 259 | s = s.replace('ふょ',' hy o') 260 | s = s.replace('ふぇ',' f e') 261 | s = s.replace('ふぉ',' f o') 262 | 263 | # 1音からなる変換規則 264 | s = s.replace('あ',' a') 265 | s = s.replace('い',' i') 266 | s = s.replace('う',' u') 267 | s = s.replace('え',' e') 268 | s = s.replace('お',' o') 269 | s = s.replace('か',' k a') 270 | s = s.replace('き',' k i') 271 | s = s.replace('く',' k u') 272 | s = s.replace('け',' k e') 273 | s = s.replace('こ',' k o') 274 | s = s.replace('さ',' s a') 275 | s = s.replace('し',' sh i') 276 | s = s.replace('す',' s u') 277 | s = s.replace('せ',' s e') 278 | s = s.replace('そ',' s o') 279 | s = s.replace('た',' t a') 280 | s = s.replace('ち',' ch i') 281 | s = s.replace('つ',' ts u') 282 | s = s.replace('て',' t e') 283 | s = s.replace('と',' t o') 284 | s = s.replace('な',' n a') 285 | s = s.replace('に',' n i') 286 | s = s.replace('ぬ',' n u') 287 | s = s.replace('ね',' n e') 288 | s = s.replace('の',' n o') 289 | s = s.replace('は',' h a') 290 | s = s.replace('ひ',' h i') 291 | s = s.replace('ふ',' f u') 292 | s = s.replace('へ',' h e') 293 | s = s.replace('ほ',' h o') 294 | s = s.replace('ま',' m a') 295 | s = s.replace('み',' m i') 296 | s = s.replace('む',' m u') 297 | s = s.replace('め',' m e') 298 | s = s.replace('も',' m o') 299 | s = s.replace('ら',' r a') 300 | s = s.replace('り',' r i') 301 | s = s.replace('る',' r u') 302 | s = s.replace('れ',' r e') 303 | s = s.replace('ろ',' r o') 304 | s = s.replace('が',' g a') 305 | s = s.replace('ぎ',' g i') 306 | s = s.replace('ぐ',' g u') 307 | s = s.replace('げ',' g e') 308 | s = s.replace('ご',' g o') 309 | s = s.replace('ざ',' z a') 310 | s = s.replace('じ',' j i') 311 | s = s.replace('ず',' z u') 312 | s = s.replace('ぜ',' z e') 313 | s = s.replace('ぞ',' z o') 314 | s = s.replace('だ',' d a') 315 | s = s.replace('ぢ',' j i') 316 | s = s.replace('づ',' z u') 317 | s = s.replace('で',' d e') 318 | s = s.replace('ど',' d o') 319 | s = s.replace('ば',' b a') 320 | s = s.replace('び',' b i') 321 | s = s.replace('ぶ',' b u') 322 | s = s.replace('べ',' b e') 323 | s = s.replace('ぼ',' b o') 324 | s = s.replace('ぱ',' p a') 325 | s = s.replace('ぴ',' p i') 326 | s = s.replace('ぷ',' p u') 327 | s = s.replace('ぺ',' p e') 328 | s = s.replace('ぽ',' p o') 329 | s = s.replace('や',' y a') 330 | s = s.replace('ゆ',' y u') 331 | s = s.replace('よ',' y o') 332 | s = s.replace('わ',' w a') 333 | s = s.replace('を',' o') 334 | s = s.replace('ん',' N') 335 | s = s.replace('っ',' q') 336 | s = s.replace('ー',':') 337 | 338 | s = s.replace('ぁ',' a') 339 | s = s.replace('ぃ',' i') 340 | s = s.replace('ぅ',' u') 341 | s = s.replace('ぇ',' e') 342 | s = s.replace('ぉ',' o') 343 | s = s.replace('ゎ',' w a') 344 | 345 | s = s[1:] 346 | 347 | s = re.sub(r':+', ':', s) 348 | 349 | return s 350 | 351 | 352 | def gen_julius_dict_1st(text_symbols: [str], word_phones: [str]) -> str: 353 | '''テキストのシンボルと読みの音素のJulius dictファイルの中身を生成 354 | args: 355 | text_symbols ([str]): 単語のシンボル 356 | ['今回', 'は'] 357 | word_phones ([str]): 単語の音素系列 358 | ['k o N k a i', 'w a'] 359 | returns: 360 | (str): Juliusのdictファイルの中身 361 | ''' 362 | tmp = [] 363 | finit = len(text_symbols) 364 | 365 | for i, zipped in enumerate(zip(text_symbols, word_phones)): 366 | tmp.append('{}\t[{}]\t{}'.format(i*2, *zipped)) 367 | if i + 1 != finit: 368 | tmp.append('{}\t[{}]\t{}'.format(i*2+1, 'sp_{}'.format(i), 'sp')) 369 | 370 | # append sp and Start, End symbol 371 | tmp.append('{}\t[{}]\t{}'.format(i*2+1, '', 'silB')) 372 | tmp.append('{}\t[{}]\t{}'.format((i+1)*2, '', 'silE')) 373 | 374 | return '\n'.join(tmp) + '\n' 375 | 376 | 377 | def gen_julius_dfa(number_of_words: int) -> str: 378 | '''単語数から遷移のためのJuliusのdfaファイルの中身を生成 379 | args: 380 | number_of_words (int): 遷移する単語の単語数 381 | returns: 382 | (str): Juliusのdfaファイルの中身 383 | ''' 384 | i = 0 385 | current_word = number_of_words - 3 386 | isLast = False 387 | tmp = [] 388 | while True: 389 | if i == 0: 390 | tmp.append('{} {} {} {} {}'.format(i, number_of_words - 1, i + 1, 0, 1)) 391 | i += 1 392 | elif i > 0 and not isLast: 393 | tmp.append('{} {} {} {} {}'.format(i, current_word, i + 1, 0, 0)) 394 | current_word -= 1 395 | isLast = current_word == -1 396 | i += 1 397 | elif i > 0 and isLast: 398 | tmp.append('{} {} {} {} {}'.format(i, i - 1, i + 1, 0, 0)) 399 | tmp.append('{} {} {} {} {}'.format(i + 1, -1, -1, 1, 0)) 400 | break 401 | 402 | return '\n'.join(tmp) + '\n' 403 | 404 | 405 | def gen_julius_dict_2nd(phone_seqence: str) -> str: 406 | '''音素系列から強制アライメントのためのdictファイルの中身を生成 407 | args: 408 | phone_seqence (str): 409 | 'k o N k a i w a ' 410 | returns: 411 | (str): Juliusのdictファイルの中身 412 | ''' 413 | return '\n'.join([ 414 | '0\t[w_0]\tsilB', 415 | '1\t[w_1]\t{}'.format(phone_seqence), 416 | '2\t[w_2]\tsilE', 417 | ]) + '\n' 418 | 419 | 420 | def gen_julius_aliment_dfa() -> str: 421 | '''強制アライメント用のdfaファイルの中身を生成 422 | returns: 423 | (str): Juliusのdfaファイルの中身 424 | ''' 425 | return '\n'.join([ 426 | '0 2 1 0 1', 427 | '1 1 2 0 0', 428 | '2 0 3 0 0', 429 | '3 -1 -1 1 0' 430 | ]) + '\n' 431 | 432 | 433 | def julius_sp_insert(target_wav_file: str, aliment_file_signiture: str, model_path: str = None) -> [str]: 434 | julius_args = { 435 | '-h': str( 436 | JULIUS_ROOT / 'model' / 'phone_m' / 'jnas-mono-16mix-gid.binhmm' 437 | ) if model_path is None else model_path, 438 | '-input': 'file', 439 | '-debug':'', 440 | '-gram': aliment_file_signiture, 441 | } 442 | 443 | file_echo_p = subprocess.Popen(['echo', target_wav_file], stdout=subprocess.PIPE, stderr=subprocess.DEVNULL) 444 | julius_p = subprocess.Popen(' '.join([str(JULIUS_ROOT / 'bin' / get_os_dependent_directory() / get_os_dependent_exec()), 445 | *list(chain.from_iterable([[k, v] for k, v in julius_args.items()]))]).split(), stdin=file_echo_p.stdout, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL) 446 | file_echo_p.stdout.close() 447 | return julius_p.communicate()[0].decode('utf-8').split('\n') 448 | 449 | 450 | def get_sp_inserted_text(raw_output: str, debug_symbol='') -> (str, [int]): 451 | '''デコード結果からsp挿入後のテキストとspのインデックスを取得する 452 | args: 453 | raw_output: `julius_sp_insert`の出力 454 | returns: 455 | Tuple(str, [int]): デコード結果とspのindex 456 | ''' 457 | r = re.compile(' (.*) ') 458 | pass1_best = next(s for s in raw_output if s.startswith('pass1_best')) 459 | matched = r.search(pass1_best) 460 | if matched is None: 461 | logger.warning('Failed Decoding Text [{}]'.format(debug_symbol)) 462 | raise Exception("Decode Failed") 463 | 464 | return (re.sub('sp_[\d+]', '', matched.group(1)), [int(s.split('_')[1]) for s in matched.group().split() if 'sp_' in s]) 465 | 466 | 467 | def get_sp_inserterd_phone_seqence(raw_output: str, debug_symbol='') -> str: 468 | pass1_best_phonemeseq = next(s for s in raw_output if s.startswith('pass1_best_phonemeseq')) 469 | 470 | complete_re = re.compile('silB \| (.*) \| silE') 471 | failed_re_1 = re.compile('silE \| (.*) \| silB') 472 | failed_re_2 = re.compile('silE \| (.*)') 473 | 474 | if complete_re.search(pass1_best_phonemeseq) is not None: 475 | matched = complete_re.search(pass1_best_phonemeseq) 476 | elif failed_re_1.search(pass1_best_phonemeseq) is not None: 477 | logger.info('Use not correct re to generate Phoneseq [{}]'.format(debug_symbol)) 478 | matched = failed_re_1.search(pass1_best_phonemeseq) 479 | elif failed_re_2.search(pass1_best_phonemeseq) is not None: 480 | logger.info('Use not correct re to generate Phoneseq [{}]'.format(debug_symbol)) 481 | matched = failed_re_2.search(pass1_best_phonemeseq) 482 | else: 483 | logger.warning('Failed Generate Phoneseq [{}]'.format(debug_symbol)) 484 | raise Exception("Decode Failed") 485 | 486 | tmp = matched.group(1) 487 | return ' '.join([s.strip() for s in tmp.split('|')]) 488 | 489 | 490 | def julius_phone_alignment(target_wav_file: str, aliment_file_signiture: str, model_path: str = None) -> [str]: 491 | julius_args = { 492 | '-h': str( 493 | JULIUS_ROOT / 'model' / 'phone_m' / 'jnas-mono-16mix-gid.binhmm' 494 | ) if model_path is None else model_path, 495 | '-palign':'', 496 | '-input': 'file', 497 | '-gram': aliment_file_signiture, 498 | } 499 | 500 | file_echo_p = subprocess.Popen(['echo', target_wav_file], stdout=subprocess.PIPE, stderr=subprocess.DEVNULL) 501 | julius_p = subprocess.Popen(' '.join([str(JULIUS_ROOT / 'bin'/ get_os_dependent_directory() / get_os_dependent_exec()), 502 | *list(chain.from_iterable([[k, v] for k, v in julius_args.items()]))]).split(), stdin=file_echo_p.stdout, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL) 503 | file_echo_p.stdout.close() 504 | return julius_p.communicate()[0].decode('utf-8').split('\n') 505 | 506 | 507 | def get_time_alimented_list(raw_output: str) -> [str]: 508 | r = re.compile('\[\s*(\d+)\s+(\d+)\s*\]\s*[\-]*[\d,\.]+\s*([\w,\:]+)$') 509 | 510 | return [ 511 | (s.group(1), s.group(2), s.group(3)) 512 | for s in map(lambda x: r.search(x), raw_output) if s is not None 513 | ] 514 | --------------------------------------------------------------------------------