├── .cache
    └── .gitignore
├── .dockerignore
├── .editorconfig
├── .gitignore
├── .gitmodules
├── .vscode
    └── settings.json
├── 01-Sources
    └── .gitignore
├── 02-PreparedSources
    └── .gitignore
├── 03-Segments
    └── .gitignore
├── 04-Datasets
    └── .gitignore
├── 05-Models
├── Aivis-Docker.sh
├── Aivis.sh
├── Aivis
    ├── __init__.py
    ├── __main__.py
    ├── constants.py
    ├── demucs.py
    ├── prepare.py
    └── utils.py
├── Dockerfile
├── License.txt
├── Readme.md
├── poetry.lock
├── poetry.toml
└── pyproject.toml


/.cache/.gitignore:
--------------------------------------------------------------------------------
1 | *
2 | !.gitignore
3 | 


--------------------------------------------------------------------------------
/.dockerignore:
--------------------------------------------------------------------------------
 1 | _/
 2 | .cache/
 3 | .venv/
 4 | 01-Sources/
 5 | 02-PreparedSources/
 6 | 03-Segments/
 7 | 04-Datasets/
 8 | 05-Models/
 9 | Bert-VITS2/
10 | 


--------------------------------------------------------------------------------
/.editorconfig:
--------------------------------------------------------------------------------
 1 | 
 2 | root = true
 3 | 
 4 | [*]
 5 | charset = utf-8
 6 | end_of_line = lf
 7 | insert_final_newline = true
 8 | indent_size = 4
 9 | indent_style = space
10 | trim_trailing_whitespace = true
11 | 
12 | [*.md]
13 | trim_trailing_whitespace = false
14 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | share/python-wheels/
 24 | *.egg-info/
 25 | .installed.cfg
 26 | *.egg
 27 | MANIFEST
 28 | 
 29 | # PyInstaller
 30 | #  Usually these files are written by a python script from a template
 31 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 32 | *.manifest
 33 | *.spec
 34 | 
 35 | # Installer logs
 36 | pip-log.txt
 37 | pip-delete-this-directory.txt
 38 | 
 39 | # Unit test / coverage reports
 40 | htmlcov/
 41 | .tox/
 42 | .nox/
 43 | .coverage
 44 | .coverage.*
 45 | .cache
 46 | nosetests.xml
 47 | coverage.xml
 48 | *.cover
 49 | *.py,cover
 50 | .hypothesis/
 51 | .pytest_cache/
 52 | cover/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | db.sqlite3-journal
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | .pybuilder/
 76 | target/
 77 | 
 78 | # Jupyter Notebook
 79 | .ipynb_checkpoints
 80 | 
 81 | # IPython
 82 | profile_default/
 83 | ipython_config.py
 84 | 
 85 | # pyenv
 86 | #   For a library or package, you might want to ignore these files since the code is
 87 | #   intended to run in multiple environments; otherwise, check them in:
 88 | # .python-version
 89 | 
 90 | # pipenv
 91 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 92 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 93 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 94 | #   install all needed dependencies.
 95 | #Pipfile.lock
 96 | 
 97 | # poetry
 98 | #   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
 99 | #   This is especially recommended for binary packages to ensure reproducibility, and is more
100 | #   commonly ignored for libraries.
101 | #   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 | 
104 | # pdm
105 | #   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | #   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | #   in version control.
109 | #   https://pdm.fming.dev/#use-with-ide
110 | .pdm.toml
111 | 
112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113 | __pypackages__/
114 | 
115 | # Celery stuff
116 | celerybeat-schedule
117 | celerybeat.pid
118 | 
119 | # SageMath parsed files
120 | *.sage.py
121 | 
122 | # Environments
123 | .env
124 | .venv
125 | env/
126 | venv/
127 | ENV/
128 | env.bak/
129 | venv.bak/
130 | 
131 | # Spyder project settings
132 | .spyderproject
133 | .spyproject
134 | 
135 | # Rope project settings
136 | .ropeproject
137 | 
138 | # mkdocs documentation
139 | /site
140 | 
141 | # mypy
142 | .mypy_cache/
143 | .dmypy.json
144 | dmypy.json
145 | 
146 | # Pyre type checker
147 | .pyre/
148 | 
149 | # pytype static type analyzer
150 | .pytype/
151 | 
152 | # Cython debug symbols
153 | cython_debug/
154 | 
155 | # PyCharm
156 | #  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157 | #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158 | #  and can be added to the global gitignore or merged into this file.  For a more nuclear
159 | #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
160 | #.idea/
161 | 
162 | # macOS
163 | .DS_Store
164 | ._*
165 | 
166 | # Aivis data
167 | _/
168 | .venv/
169 | *.mp3
170 | *.mp4
171 | *.m4a
172 | *.wav
173 | 


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "Bert-VITS2"]
2 | 	path = Bert-VITS2
3 | 	url = https://github.com/tsukumijima/Bert-VITS2.git
4 | 


--------------------------------------------------------------------------------
/.vscode/settings.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     // Pylance の Type Checking を有効化
 3 |     "python.languageServer": "Pylance",
 4 |     "python.analysis.typeCheckingMode": "strict",
 5 |     // Pylance の Type Checking のうち、いくつかのエラー報告を抑制する
 6 |     "python.analysis.diagnosticSeverityOverrides": {
 7 |         "reportConstantRedefinition": "none",
 8 |         "reportMissingTypeStubs": "none",
 9 |         "reportPrivateImportUsage": "none",
10 |         "reportShadowedImports": "none",
11 |         "reportUnnecessaryComparison": "none",
12 |         "reportUnknownArgumentType": "none",
13 |         "reportUnknownMemberType": "none",
14 |         "reportUnknownVariableType": "none",
15 |         "reportUnusedFunction": "none",
16 |     },
17 | }
18 | 


--------------------------------------------------------------------------------
/01-Sources/.gitignore:
--------------------------------------------------------------------------------
1 | *
2 | !.gitignore
3 | 


--------------------------------------------------------------------------------
/02-PreparedSources/.gitignore:
--------------------------------------------------------------------------------
1 | *
2 | !.gitignore
3 | 


--------------------------------------------------------------------------------
/03-Segments/.gitignore:
--------------------------------------------------------------------------------
1 | *
2 | !.gitignore
3 | 


--------------------------------------------------------------------------------
/04-Datasets/.gitignore:
--------------------------------------------------------------------------------
1 | *
2 | !.gitignore
3 | 


--------------------------------------------------------------------------------
/05-Models:
--------------------------------------------------------------------------------
1 | Bert-VITS2/Data/


--------------------------------------------------------------------------------
/Aivis-Docker.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # このファイルがあるディレクトリを取得
 4 | BASE_DIR=$(cd $(dirname $0); pwd)
 5 | 
 6 | # 第一引数に build が指定されている場合は Docker イメージを作成して終了する
 7 | if [ "$1" = "build" ]; then
 8 |     docker build -t aivis .
 9 |     exit 0
10 | fi
11 | 
12 | # TODO: Docker イメージが安定しないうちは毎回ビルドする
13 | docker build -t aivis .
14 | 
15 | # まだ Docker イメージがビルドされていない場合はビルドする
16 | if [ ! "$(docker images -q aivis:latest 2> /dev/null)" ]; then
17 |     docker build -t aivis .
18 | fi
19 | 
20 | # Docker コンテナを起動する
21 | ## --gpus all で NVIDIA GPU をコンテナ内で使えるようにする
22 | ## データフォルダをコンテナ内にマウントする
23 | ## /code/.cache をマウントし、毎回学習済みモデルがダウンロードされるのを防ぐ
24 | ## --shm-size を指定しないと DataLoader でエラーが発生する
25 | ## ref: https://qiita.com/gorogoroyasu/items/e71dd3c076af145c9b44
26 | docker run --gpus all -it --rm --shm-size=256m \
27 |     -p 7860:7860 \
28 |     -v ${BASE_DIR}/.cache:/code/.cache \
29 |     -v ${BASE_DIR}/01-Sources:/code/01-Sources \
30 |     -v ${BASE_DIR}/02-PreparedSources:/code/02-PreparedSources \
31 |     -v ${BASE_DIR}/03-Segments:/code/03-Segments \
32 |     -v ${BASE_DIR}/04-Datasets:/code/04-Datasets \
33 |     -v ${BASE_DIR}/Bert-VITS2:/code/Bert-VITS2 \
34 |     aivis "$@"
35 | 


--------------------------------------------------------------------------------
/Aivis.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # faster-whisper tries to dynamically load libcudnn_ops_infer.so.8 from the system and fails,
4 | # so LD_LIBRARY_PATH is pointed at the NVIDIA libraries under .venv/ before Aivis is started
5 | # ref: https://github.com/SYSTRAN/faster-whisper/issues/153#issuecomment-1510218906
6 | NVIDIA_LIBRARY_DIRS=$(poetry run python -c 'import os; import nvidia.cublas.lib; import nvidia.cudnn.lib; print(os.path.dirname(nvidia.cublas.lib.__file__) + ":" + os.path.dirname(nvidia.cudnn.lib.__file__))')
7 | LD_LIBRARY_PATH="${NVIDIA_LIBRARY_DIRS}" poetry run python -m Aivis "$@"
8 | 


--------------------------------------------------------------------------------
/Aivis/__init__.py:
--------------------------------------------------------------------------------
1 | 
2 | # Version information
3 | __version__ = '1.0.0'
4 | 


--------------------------------------------------------------------------------
/Aivis/__main__.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | 
  3 | # Suppress FutureWarning / RuntimeWarning / UserWarning (installed before the imports below, presumably so that warnings raised while importing are silenced too)
  4 | import warnings
  5 | warnings.simplefilter(action='ignore', category=FutureWarning)
  6 | warnings.simplefilter(action='ignore', category=RuntimeWarning)
  7 | warnings.simplefilter(action='ignore', category=UserWarning)
  8 | 
  9 | import functools
 10 | import json
 11 | import math
 12 | import re
 13 | import shutil
 14 | import subprocess
 15 | import sys
 16 | import typer
 17 | from pathlib import Path
 18 | from typing import Annotated, Any, cast, Optional, Union
 19 | 
 20 | from Aivis import __version__
 21 | from Aivis import constants
 22 | from Aivis import demucs
 23 | from Aivis import prepare
 24 | from Aivis import utils
 25 | 
 26 | 
 27 | app = typer.Typer(help='Aivis: AI Voice Imitation System')  # Typer application; the subcommands below are registered on it via @app.command
 28 | 
 29 | @app.command(help='Create audio segments from audio sources.')
 30 | def create_segments(
 31 |     use_demucs: Annotated[bool, typer.Option(help='Use Demucs to extract voices from audio files.')] = True,
 32 |     whisper_model: Annotated[constants.ModelNameType, typer.Option(help='Whisper model name.')] = constants.ModelNameType.large_v3,
 33 |     force_transcribe: Annotated[bool, typer.Option(help='Force Whisper to transcribe audio files.')] = False,
 34 |     trim_silence: Annotated[bool, typer.Option(help='Trim silence (start and end only) from audio files.')] = True,
 35 | ):
 36 |     # Subcommand body: prepares the sources, transcribes them and writes one audio file per sentence. Modules used only by this subcommand that are relatively heavy to import are imported here
 37 |     import faster_whisper
 38 |     import stable_whisper
 39 | 
 40 |     # Collect the media files under the 01-Sources directory
 41 |     ## The extensions of the media files to process are defined in constants.SOURCE_FILE_EXTENSIONS
 42 |     ## Sorted alphabetically
 43 |     source_files = sorted(list(constants.SOURCES_DIR.glob('**/*.*')))
 44 |     source_files = [i for i in source_files if i.suffix in constants.SOURCE_FILE_EXTENSIONS]
 45 | 
 46 |     # Run AI source separation with Demucs V4 (htdemucs_ft) and extract only the voices from the audio files
 47 |     ## It is originally an AI for separating songs into vocals, drums, bass and other, but applied here it can remove most BGM, sound effects, noise, etc.
 48 |     ## Files with only the vocals (= voices) extracted by Demucs are written to 02-PreparedSources/(audio file name).wav
 49 |     ## If an already-extracted file exists, source separation is not run again and the existing file is used
 50 |     ## When Demucs source separation is not used, the audio files are converted to wav and written to 02-PreparedSources/(audio file name).wav
 51 |     if use_demucs is True:
 52 |         voices_files = demucs.ExtractVoices(source_files, constants.PREPARE_SOURCES_DIR)
 53 |     else:
 54 |         voices_files = demucs.ConvertToWave(source_files, constants.PREPARE_SOURCES_DIR)
 55 | 
 56 |     model: faster_whisper.WhisperModel | None = None
 57 | 
 58 |     # From here on, loop over each audio file
 59 |     for voices_file in voices_files:
 60 |         typer.echo('=' * utils.GetTerminalColumnSize())
 61 | 
 62 |         # Create the output directory
 63 |         ## If it already exists the segments have been generated, so skip (but do not skip when the directory is empty)
 64 |         ## To generate the segments again, delete the directory
 65 |         folder = constants.SEGMENTS_DIR / voices_file.name.split('.')[0]  # NOTE(review): a name containing extra dots is cut at its first dot, so such files can share one folder - confirm this is intended
 66 |         if folder.exists() and len(list(folder.glob('*.*'))) > 0:
 67 |             typer.echo(f'Directory {folder} already exists. Skip.')
 68 |             continue
 69 |         folder.mkdir(parents=True, exist_ok=True)
 70 |         typer.echo(f'Directory {folder} created.')
 71 | 
 72 |         transcribe_result: stable_whisper.WhisperResult
 73 |         results_json_file = constants.PREPARE_SOURCES_DIR / f'{voices_file.name.split(".")[0]}.json'
 74 | 
 75 |         # If speech recognition result data (JSON) has already been saved, use that data and do not transcribe again
 76 |         ## However, when the --force-transcribe option is given, transcription runs regardless of whether the JSON file exists
 77 |         if results_json_file.exists() and force_transcribe is False:
 78 |             typer.echo(f'File {voices_file} already transcribed.')
 79 |             transcribe_result = stable_whisper.WhisperResult(str(results_json_file))
 80 | 
 81 |         # Run speech recognition with Whisper
 82 |         else:
 83 | 
 84 |             typer.echo('-' * utils.GetTerminalColumnSize())
 85 |             typer.echo(f'File {voices_file} transcribing...')
 86 |             typer.echo('-' * utils.GetTerminalColumnSize())
 87 | 
 88 |             # Load the pretrained Whisper model (only once)
 89 |             if model is None:
 90 |                 typer.echo(f'Whisper model loading... (Model: {whisper_model.value})')
 91 |                 model = stable_whisper.load_faster_whisper(
 92 |                     whisper_model.value,
 93 |                     device = 'cuda',
 94 |                     compute_type = 'auto',
 95 |                 )
 96 |                 typer.echo('Whisper model loaded.')
 97 |                 typer.echo('-' * utils.GetTerminalColumnSize())
 98 | 
 99 |             # Initial prompt ("magic spell") fed to Whisper
100 |             ## Whisper reportedly transcribes with the preceding context in mind, so supplying something that looks like a transcribed conversation
101 |             ## seems to nudge it into adding punctuation to the transcript...
102 |             initial_prompt = (
103 |                 'そうだ。今日はピクニックしない…？天気もいいし、絶好のピクニック日和だと思う！ 良いですね！行きましょう…！'
104 |                 'じゃあ早速、荷物の準備しておきますね。 そうだね！どこに行く？ そうですね…。桜の見える公園なんかどうでしょう…？'
105 |                 'おー！今の時期は桜が綺麗だしね。じゃあそれで決まりっ！ 分かりました。調べたところ、電車だと550円掛かるみたいです。'
106 |                 '少し時間が掛かりますが、歩いた方が健康的かもしれません。 え〜！歩くのはきついよぉ…。'
107 |             )
108 | 
109 |             # Run speech recognition and obtain the result with adjusted timestamps etc.
110 |             # ref: https://qiita.com/reriiasu/items/5ad8e1a7dbc425de7bb0
111 |             # ref: https://zenn.dev/tsuzukia/articles/1381e6c9a88577
112 |             # ref: https://note.com/asahi_ictrad/n/nf3ca329f17df
113 |             transcribe_result: stable_whisper.WhisperResult = cast(Any, model).transcribe_stable(
114 |                 # Input audio file
115 |                 str(voices_file),
116 |                 # Output per-word timestamps
117 |                 word_timestamps = True,
118 |                 # Print logs to the console
119 |                 verbose = True,
120 |                 # Do not regroup the word segments
121 |                 ## This is done separately after speech recognition has finished
122 |                 regroup = False,
123 |                 # Source separation has already been done with Demucs, so it is not done here
124 |                 ## Reusing an already loaded model is faster than loading the model for every audio file
125 |                 demucs = False,
126 |                 # Enable timestamp adjustment based on detected silence
127 |                 suppress_silence = True,
128 |                 # Adjust word timestamps based on detected silence
129 |                 suppress_word_ts = True,
130 |                 # Use Silero VAD to generate the timestamp suppression mask
131 |                 vad = True,
132 |                 # Parameters for faster-whisper itself
133 |                 # Japanese
134 |                 language = 'ja',
135 |                 # beam_size (set to 1 to lower CER)
136 |                 beam_size = 1,
137 |                 # Mysterious parameter (setting it to 10 reportedly suppresses the repetition that increases when temperature is lowered?)
138 |                 no_repeat_ngram_size = 10,
139 |                 # temperature (set to 0.0 to lower CER)
140 |                 temperature = 0.0,
141 |                 # Do not use the previous audio chunk's output as the prompt for the next window
142 |                 condition_on_previous_text = False,
143 |                 # Initial prompt
144 |                 initial_prompt = initial_prompt,
145 |                 # Filter out silence with VAD on the faster-whisper side
146 |                 vad_filter = True,
147 |             )
148 |             typer.echo('-' * utils.GetTerminalColumnSize())
149 |             typer.echo(f'File {voices_file} transcribed.')
150 | 
151 |             # Regroup the speech recognition result
152 |             ## There are many regrouping algorithms; here a tuned variant of the default settings is used
153 |             ## ref: https://github.com/jianfch/stable-ts#regrouping-words
154 |             (transcribe_result.clamp_max()
155 |                 .split_by_punctuation([('.', ' '), '。', '?', '？', (',', ' '), '，'])  # type: ignore
156 |                 .split_by_gap(0.75)
157 |                 .merge_by_gap(0.3, max_words=3)
158 |                 .split_by_punctuation([('.', ' '), '。', '?', '？']))  # type: ignore
159 | 
160 |             # Write the speech recognition result to a file
161 |             with open(results_json_file, mode='w', encoding='utf-8') as f:
162 |                 json.dump(transcribe_result.to_dict(), f, indent=4, ensure_ascii=False, allow_nan=True)
163 | 
164 |         # Output audio files cut out sentence by sentence (the file name contains the transcript)
165 |         count = 1
166 |         for index, segment in enumerate(transcribe_result.segments):
167 |             typer.echo('-' * utils.GetTerminalColumnSize())
168 | 
169 |             # Pre-process the transcript into a form better suited for a dataset
170 |             transcript = prepare.PrepareText(segment.text)
171 |             typer.echo(f'Transcript: {transcript}')
172 | 
173 |             # When there are silent stretches, Whisper produces odd hallucinations such as "Thank you for watching" or "Please subscribe to the channel",
174 |             # so transcripts of that kind are skipped
175 |             if transcript in constants.SKIP_TRANSCRIPTS:
176 |                 typer.echo(f'Transcript skipped. (Transcript is in SKIP_TRANSCRIPTS)')
177 |                 continue
178 | 
179 |             # If the transcript is shorter than 4 characters (punctuation included), it is too short for a dataset, so skip it
180 |             ## e.g. そう。/ まじ？ / あ。
181 |             if len(transcript) < 4:
182 |                 typer.echo(f'Transcript skipped. (Transcript length < 4 characters)')
183 |                 continue
184 | 
185 |             # Get the segment's start and end times
186 |             segment_start = segment.start
187 |             segment_end = segment.end
188 | 
189 |             # If the first word of the current segment lasts 0.425 s or longer, trim 0.25 s off the start
190 |             ## Workaround for the vowel of the previous segment's last sound being included
191 |             ## Relies on Japanese words basically being 1 or 2 characters, so their utterance time is mostly 0.425 s or less
192 |             if segment.words[0].duration >= 0.425:  # NOTE(review): raises IndexError if a segment has no words - confirm that cannot happen
193 |                 segment_start += 0.25
194 | 
195 |                 # Furthermore, if the first word of the current segment lasts 1 s or longer,
196 |                 # trim a further (that length - 1 s) (leaving at least 0.75 s)
197 |                 ## e.g. for a 3.6 s word, the first 0.25 s + 2.6 s = 2.85 s is trimmed and the remaining 0.75 s is output
198 |                 ## Uttering a single word almost never takes 1 s or more, so it is assumed to contain silence
199 |                 if segment.words[0].duration >= 1.0:
200 |                     segment_start += segment.words[0].duration - 1.0
201 | 
202 |             # If the first word of the next segment lasts 0.425 s or longer, extend the end by 0.25 s
203 |             ## Workaround for the vowel of the last sound being cut off
204 |             if index + 1 < len(transcribe_result.segments) and transcribe_result.segments[index + 1].words[0].duration >= 0.425:
205 |                 segment_end += 0.25
206 | 
207 |                 # Furthermore, if the first word of the next segment lasts 1 s or longer,
208 |                 # extend by a further (that length - 1 s) (by at most 1.0 s)
209 |                 if transcribe_result.segments[index + 1].words[0].duration >= 1.0:
210 |                     segment_end += min(transcribe_result.segments[index + 1].words[0].duration - 1.0, 1.0)
211 | 
212 |             # If the next segment starts after the end of the current segment,
213 |             # move the current segment's end to the next segment's start so the tail is not cut off (extended by at most 3.0 s)
214 |             if index + 1 < len(transcribe_result.segments) and segment_end < transcribe_result.segments[index + 1].start:
215 |                 segment_end = min(transcribe_result.segments[index + 1].start, segment_end + 3.0)
216 | 
217 |             # If the current segment is the last segment of the speech recognition result,
218 |             # set its end to the length of the audio so the tail is not cut off
219 |             if index + 1 == len(transcribe_result.segments):
220 |                 segment_end = prepare.GetAudioFileDuration(voices_file)
221 | 
222 |             typer.echo(f'Segment Range: {utils.SecondToTimeCode(segment_start)} - {utils.SecondToTimeCode(segment_end)}')
223 | 
224 |             # If the start and end times are equal, the timestamps were not obtained correctly, so skip
225 |             if segment_start == segment_end:
226 |                 typer.echo(f'Transcript skipped. (Start time == End time)')
227 |                 continue
228 | 
229 |             # If the output audio would be shorter than 1 second, it is too short for a dataset, so skip
230 |             if segment_end - segment_start < 1:
231 |                 typer.echo(f'Transcript skipped. (Duration < 1 sec)')
232 |                 continue
233 | 
234 |             # Path of the output audio file
235 |             # e.g. 0001_こんにちは.wav
236 |             output_audio_file = folder / f'{count:04d}_{transcript}.wav'
237 | 
238 |             # Write the audio file cut out (segmented) for this sentence
239 |             real_output_audio_file = prepare.SliceAudioFile(voices_file, output_audio_file, segment_start, segment_end, trim_silence)
240 | 
241 |             typer.echo(f'File {real_output_audio_file} saved.')
242 |             count += 1
243 | 
244 |     typer.echo('=' * utils.GetTerminalColumnSize())
245 |     typer.echo('All files segmentation done.')
246 |     typer.echo('=' * utils.GetTerminalColumnSize())
247 | 
248 | 
249 | @app.command(help='Create datasets from audio segments.')
250 | def create_datasets(
251 |     segments_dir_name: Annotated[str, typer.Argument(help='Segments directory name. Glob pattern (wildcard) is available.')],
252 |     speaker_names: Annotated[str, typer.Argument(help='Speaker name. (Comma separated)')],
253 |     accept_all: Annotated[bool, typer.Option(help='Accept all segments and transcriptions. (Skip UI)')] = False,
254 | ):
255 |     # このサブコマンドでしか利用せず、かつ比較的インポートが重いモジュールはここでインポートする
256 |     import gradio
257 |     from gradio import WaveformOptions
258 | 
259 |     typer.echo('=' * utils.GetTerminalColumnSize())
260 | 
261 |     # バリデーション
262 |     if speaker_names == '':
263 |         typer.echo(f'Error: Speaker names is empty.')
264 |         typer.echo('=' * utils.GetTerminalColumnSize())
265 |         sys.exit(1)
266 | 
267 |     # 出力後のデータセットの出力先ディレクトリがなければ作成
268 |     speaker_name_list = speaker_names.split(',')
269 |     for speaker in speaker_name_list:
270 |         output_dir = constants.DATASETS_DIR / speaker
271 |         if not output_dir.exists():
272 |             output_dir.mkdir(parents=True, exist_ok=True)
273 |             typer.echo(f'Speaker: {speaker} / Directory: {output_dir} created.')
274 |         else:
275 |             typer.echo(f'Speaker: {speaker} / Directory: {output_dir} already created.')
276 |     typer.echo('=' * utils.GetTerminalColumnSize())
277 | 
278 |     # 03-Segments/(指定されたディレクトリ名の Glob パターン)/ 以下のセグメント化された音声ファイルを取得
279 |     ## 拡張子は .wav
280 |     ## glob() の結果は順序がバラバラなのでアルファベット順にソートする
281 |     segment_audio_paths = sorted(list((constants.SEGMENTS_DIR).glob(f'{segments_dir_name}/*.wav')))
282 |     if len(segment_audio_paths) == 0:
283 |         typer.echo(f'Error: {segments_dir_name}/*.wav glob pattern matched no files.')
284 |         typer.echo('=' * utils.GetTerminalColumnSize())
285 |         sys.exit(1)
286 |     for segment_audio_path in segment_audio_paths:
287 |         segments_dir_name = segment_audio_path.parent.name
288 |         typer.echo(f'Segment File: {segments_dir_name}/{segment_audio_path.name}')
289 |     typer.echo('=' * utils.GetTerminalColumnSize())
290 | 
291 |     # 音声ファイル名から書き起こし文を取得
292 |     ## 例: 0001_こんにちは.wav -> こんにちは
293 |     segment_audio_transcripts: list[str] = []
294 |     for segment_audio_path in segment_audio_paths:
295 | 
296 |         # 拡張子なしファイル名から _ より後の部分を取得
297 |         segment_audio_transcript = segment_audio_path.stem.split('_')[1]
298 | 
299 |         # 1文が長すぎてファイル名が最大文字数を超えてしまっている場合、別途同じファイル名で .txt ファイルに全体の書き起こし文が保存されているので、
300 |         # それを読み込んで使う
301 |         segment_audio_transcript_txt = segment_audio_path.with_suffix('.txt')
302 |         if segment_audio_transcript_txt.exists():
303 |             with open(segment_audio_transcript_txt, mode='r', encoding='utf-8') as f:
304 |                 segment_audio_transcript = f.read()
305 | 
306 |         # 書き起こし文をリストに追加
307 |         segment_audio_transcripts.append(segment_audio_transcript)
308 | 
309 |     # 現在処理中の音声ファイルのインデックスと音声ファイルのパスと書き起こし文
310 |     current_index = 0
311 | 
312 |     # セレクトボックスの選択肢
313 |     choices = speaker_name_list
314 | 
315 |     # 出力ファイルの連番
316 |     output_audio_count: dict[str, int] = {}
317 |     for speaker in speaker_name_list:
318 |         # 既にそのディレクトリに存在するファイルの中で連番が一番大きいものを取得し、それに 1 を足したものを初期値とする
319 |         output_audio_count[speaker] = max([
320 |             int(re.sub(r'\D', '', i.stem)) for i in (constants.DATASETS_DIR / speaker / 'audios').glob('*.wav')
321 |         ], default=0) + 1
322 | 
323 |     # --accept-all を指定して UI を表示せずにすべての音声ファイルを一括処理する場合
324 |     if accept_all is True:
325 | 
326 |         # --accept-all を指定した場合、話者名は必ず1つだけでなければならない
327 |         ## 当然ながら、--accept-all を使う際は処理対象に指定したすべてのセグメントが同一話者のものでなければならない
328 |         if len(speaker_name_list) != 1:
329 |             typer.echo(f'Error: Speaker names must be one if --accept-all option is specified.')
330 |             typer.echo('=' * utils.GetTerminalColumnSize())
331 |             sys.exit(1)
332 |         speaker_name = speaker_name_list[0]
333 | 
334 |         # 現在処理中の音声ファイルのインデックスが音声ファイルの総数に達するまでループ
335 |         while current_index < len(segment_audio_paths):
336 | 
337 |             segment_audio_path = segment_audio_paths[current_index]
338 |             transcript = segment_audio_transcripts[current_index]
339 |             typer.echo(f'Segment File : {segment_audio_path.name}')
340 |             typer.echo(f'Speaker Name : {speaker_name}')
341 |             typer.echo(f'Transcript   : {transcript}')
342 | 
343 |             # データセットに音声ファイルを保存 (書き起こし文はファイル名が長くなるので含まず、別途書き起こしファイルに保存する)
344 |             audio_output_dir = constants.DATASETS_DIR / speaker_name / 'audios'
345 |             audio_output_dir.mkdir(parents=True, exist_ok=True)
346 |             output_path = audio_output_dir / f'{output_audio_count[speaker_name]:04}.wav'
347 |             output_audio_count[speaker_name] += 1  # 連番をインクリメント
348 |             shutil.copyfile(segment_audio_path, output_path)
349 |             typer.echo(f'File {output_path} saved.')
350 | 
351 |             # 音声ファイルのパスと書き起こし文のパスのペアを transcripts.list に順次追記
352 |             text_list_path = constants.DATASETS_DIR / speaker_name / 'transcripts.list'
353 |             if not text_list_path.exists():  # ファイルがなければ空のファイルを作成
354 |                 text_list_path.parent.mkdir(parents=True, exist_ok=True)
355 |                 text_list_path.touch()
356 |             with open(text_list_path, mode='a', encoding='utf-8') as f:
357 |                 f.write(f'{output_path.name}|{speaker_name}|JP|{transcript}\n')
358 |             typer.echo(f'File {text_list_path} updated.')
359 |             typer.echo('-' * utils.GetTerminalColumnSize())
360 | 
361 |             # 次の処理対象のファイルのインデックスに進める
362 |             current_index += 1
363 | 
364 |         # すべての音声ファイルを処理したら終了
365 |         typer.echo('=' * utils.GetTerminalColumnSize())
366 |         typer.echo('All files processed.')
367 |         typer.echo('=' * utils.GetTerminalColumnSize())
368 |         return
369 | 
370 |     def OnClick(
371 |         segment_audio_path_str: str,
372 |         speaker_name: str,
373 |         transcript: str,
374 |         is_skip: bool = False,
375 |     ) -> tuple[gradio.Audio, gradio.Dropdown, gradio.Textbox]:
376 |         """ 確定ボタンが押されたときの処理 """
377 | 
378 |         nonlocal current_index, segment_audio_paths, segment_audio_transcripts, choices, output_audio_count
379 | 
380 |         # 話者名が空の場合は初期画面から「確定」を押して実行されたイベントなので、保存処理は実行しない
381 |         speaker_name = speaker_name.strip()
382 |         if speaker_name != '' and speaker_name != '選別完了':
383 | 
384 |             segment_audio_path = Path(segment_audio_path_str)
385 |             typer.echo(f'Segment File : {segment_audio_path.name}')
386 |             typer.echo(f'Speaker Name : {speaker_name if is_skip is False else "(Skipped)"}')
387 |             typer.echo(f'Transcript   : {transcript}')
388 | 
389 |             # 確定ボタンの代わりにスキップボタンが押された場合は何もしない
390 |             if is_skip is True:
391 |                 typer.echo('Segment file skipped.')
392 |                 typer.echo('-' * utils.GetTerminalColumnSize())
393 |             else:
394 |                 # データセットに編集後の音声ファイルを保存 (書き起こし文はファイル名が長くなるので含まず、別途書き起こしファイルに保存する)
395 |                 ## Gradio の謎機能で、GUI でトリムした編集後の一次ファイルが segment_audio_path_str として渡されてくる
396 |                 audio_output_dir = constants.DATASETS_DIR / speaker_name / 'audios'
397 |                 audio_output_dir.mkdir(parents=True, exist_ok=True)
398 |                 output_path = audio_output_dir / f'{output_audio_count[speaker_name]:04}.wav'
399 |                 output_audio_count[speaker_name] += 1  # 連番をインクリメント
400 |                 shutil.copyfile(segment_audio_path, output_path)
401 |                 typer.echo(f'File {output_path} saved.')
402 | 
403 |                 # 音声ファイルのパスと書き起こし文のパスのペアを transcripts.list に順次追記
404 |                 text_list_path = constants.DATASETS_DIR / speaker_name / 'transcripts.list'
405 |                 if not text_list_path.exists():  # ファイルがなければ空のファイルを作成
406 |                     text_list_path.parent.mkdir(parents=True, exist_ok=True)
407 |                     text_list_path.touch()
408 |                 with open(text_list_path, mode='a', encoding='utf-8') as f:
409 |                     f.write(f'{output_path.name}|{speaker_name}|JP|{transcript}\n')
410 |                 typer.echo(f'File {text_list_path} updated.')
411 |                 typer.echo('-' * utils.GetTerminalColumnSize())
412 | 
413 |             # 次の処理対象のファイルのインデックスに進める
414 |             current_index += 1
415 | 
416 |         elif current_index < len(segment_audio_paths):
417 |             # 初期画面から「確定」を押して実行されたイベントなので、ログに確定を出力
418 |             ## 次の処理対象のファイルがない場合は実行されない
419 |             typer.echo('=' * utils.GetTerminalColumnSize())
420 |             typer.echo('Selection of segment files has started.')
421 |             typer.echo('=' * utils.GetTerminalColumnSize())
422 | 
423 |         # 次の処理対象のファイルがない場合は終了
424 |         if current_index >= len(segment_audio_paths):
425 |             typer.echo('=' * utils.GetTerminalColumnSize())
426 |             typer.echo('All files processed.')
427 |             typer.echo('=' * utils.GetTerminalColumnSize())
428 |             return (
429 |                 gradio.Audio(
430 |                     sources = [],
431 |                     type = 'filepath',
432 |                     interactive = True,
433 |                     autoplay = True,
434 |                     waveform_options = WaveformOptions(sample_rate=44100),  # UI 上でトリミングした音声ファイルのサンプリングレート
435 |                 ),
436 |                 gradio.Dropdown(choices=['選別完了'], value='選別完了', label='音声セグメントの話者名'),  # type: ignore
437 |                 gradio.Textbox(value='すべてのセグメントの選別を完了しました。Aivis のプロセスを終了してください。', label='音声セグメントの書き起こし文'),
438 |             )
439 | 
440 |         # UI を更新
441 |         return (
442 |             gradio.Audio(
443 |                 value = segment_audio_paths[current_index],
444 |                 sources = [],
445 |                 type = 'filepath',
446 |                 label = segment_audio_paths[current_index].name,
447 |                 interactive = True,
448 |                 autoplay = True,
449 |                 waveform_options = WaveformOptions(sample_rate=44100),  # UI 上でトリミングした音声ファイルのサンプリングレート
450 |             ),
451 |             gradio.Dropdown(choices=choices, value=choices[0], label='音声セグメントの話者名'),  # type: ignore
452 |             gradio.Textbox(value=segment_audio_transcripts[current_index], label='音声セグメントの書き起こし文'),
453 |         )
454 | 
455 |     def OnReset(speaker_name: str) -> tuple[gradio.Audio, gradio.Textbox]:
456 |         """ リセットボタンが押されたときの処理 """
457 | 
458 |         nonlocal current_index, segment_audio_paths, segment_audio_transcripts, choices
459 | 
460 |         # 話者名が空の場合は初期画面から「確定」を押して実行されたイベントなので、デフォルトのフォームを返す
461 |         if speaker_name == '':
462 |             return (
463 |                 gradio.Audio(
464 |                     sources = [],
465 |                     type = 'filepath',
466 |                     interactive = True,
467 |                     autoplay = True,
468 |                     waveform_options = WaveformOptions(sample_rate=44100),  # UI 上でトリミングした音声ファイルのサンプリングレート
469 |                 ),
470 |                 gradio.Textbox(value='確定ボタンを押して、データセット作成を開始してください。', label='音声セグメントの書き起こし文'),
471 |             )
472 | 
473 |         # 現在の current_index に応じて音声と書き起こし文をリセット
474 |         return (
475 |             gradio.Audio(
476 |                 value = segment_audio_paths[current_index],
477 |                 sources = [],
478 |                 type = 'filepath',
479 |                 label = segment_audio_paths[current_index].name,
480 |                 interactive = True,
481 |                 autoplay = True,
482 |                 waveform_options = WaveformOptions(sample_rate=44100),  # UI 上でトリミングした音声ファイルのサンプリングレート
483 |             ),
484 |             gradio.Textbox(value=segment_audio_transcripts[current_index], label='音声セグメントの書き起こし文'),
485 |         )
486 | 
487 |     # Gradio UI の定義と起動
488 |     with gradio.Blocks(css='.gradio-container { max-width: 768px !important; }') as gui:
489 |         with gradio.Column():
490 |             gradio.Markdown("""
491 |                 # Aivis - Create Datasets
492 |                 Tab キー / Shift + Tab キー を押すと、フォームやボタン間で素早くフォーカスを移動できます。
493 |             """)
494 |             audio_player = gradio.Audio(
495 |                 sources = [],
496 |                 type = 'filepath',
497 |                 interactive = True,
498 |                 autoplay = True,
499 |                 waveform_options = WaveformOptions(sample_rate=44100),  # UI 上でトリミングした音声ファイルのサンプリングレート
500 |             )
501 |             speaker_choice = gradio.Dropdown(choices=[], value='', label='音声セグメントの話者名')  # type: ignore
502 |             transcript_box = gradio.Textbox(value='確定ボタンを押して、データセット作成を開始してください。', label='音声セグメントの書き起こし文')
503 |             with gradio.Row():
504 |                 confirm_button = gradio.Button('確定', variant='primary')
505 |                 skip_button = gradio.Button('このデータを除外')
506 |             confirm_button.click(
507 |                 fn = OnClick,
508 |                 inputs = [
509 |                     audio_player,
510 |                     speaker_choice,
511 |                     transcript_box,
512 |                 ],
513 |                 outputs = [
514 |                     audio_player,
515 |                     speaker_choice,
516 |                     transcript_box,
517 |                 ],
518 |             )
519 |             skip_button.click(
520 |                 # functools.partial() を使って OnClick() に is_skip=True を渡す
521 |                 fn = functools.partial(OnClick, is_skip=True),
522 |                 inputs = [
523 |                     audio_player,
524 |                     speaker_choice,
525 |                     transcript_box,
526 |                 ],
527 |                 outputs = [
528 |                     audio_player,
529 |                     speaker_choice,
530 |                     transcript_box,
531 |                 ],
532 |             )
533 |             reset_button = gradio.Button('音声と書き起こし文の変更をリセット')
534 |             reset_button.click(
535 |                 fn = OnReset,
536 |                 inputs = [
537 |                     speaker_choice,
538 |                 ],
539 |                 outputs = [
540 |                     audio_player,
541 |                     transcript_box,
542 |                 ],
543 |             )
544 | 
545 |         # 0.0.0.0:7860 で Gradio UI を起動
546 |         gui.launch(server_name='0.0.0.0', server_port=7860)
547 | 
548 | 
@app.command(help='Check dataset files and calculate total duration.')
def check_dataset(
    speaker_name: Annotated[str, typer.Argument(help='Speaker name.')],
):
    """
    List every entry of a speaker's transcripts.list with its audio duration and transcript,
    then print the number of entries and the total duration.

    Args:
        speaker_name (str): Name of the speaker directory under constants.DATASETS_DIR.

    Exits with status 1 when the speaker directory or its transcripts.list does not exist.
    """

    typer.echo('=' * utils.GetTerminalColumnSize())

    # Validation
    dataset_dir = constants.DATASETS_DIR / speaker_name
    if not dataset_dir.exists():
        typer.echo(f'Error: Speaker {speaker_name} not found.')
        typer.echo('=' * utils.GetTerminalColumnSize())
        sys.exit(1)
    transcripts_list_path = dataset_dir / 'transcripts.list'
    if not transcripts_list_path.is_file():
        typer.echo(f'Error: Transcripts file {transcripts_list_path} not found.')
        typer.echo('=' * utils.GetTerminalColumnSize())
        sys.exit(1)

    # Parse transcripts.list into (audio file name, speaker, language, transcript) fields
    ## e.g. 0001.wav|SpeakerName|JP|こんにちは
    ## Blank lines are ignored, and maxsplit=3 keeps any '|' inside the transcript itself intact
    with open(transcripts_list_path, mode='r', encoding='utf-8') as f:
        dataset_files = [line.split('|', 3) for line in f.read().splitlines() if line.strip() != '']

    typer.echo(f'Speaker: {speaker_name} / Directory: {dataset_dir}')
    typer.echo('=' * utils.GetTerminalColumnSize())

    # Loop over every dataset entry
    total_audio_duration = 0.0
    for index, dataset_file in enumerate(dataset_files):
        if index > 0:
            typer.echo('-' * utils.GetTerminalColumnSize())
        dataset_file_path = dataset_dir / 'audios' / dataset_file[0]
        typer.echo(f'Dataset File : {dataset_file_path}')
        if len(dataset_file) < 4:
            # Report lines that lack one of the four fields instead of crashing with IndexError
            typer.echo(f'Error: Malformed line in {transcripts_list_path.name}: {"|".join(dataset_file)}')
        elif not dataset_file_path.is_file():
            # is_file() also rejects an empty file name, which would resolve to the audios/ directory itself
            typer.echo(f'Error: Dataset file {dataset_file_path} not found.')
        else:
            audio_duration = prepare.GetAudioFileDuration(dataset_file_path)
            total_audio_duration += audio_duration
            typer.echo(f'Duration     : {utils.SecondToTimeCode(audio_duration)}')
            typer.echo(f'Transcript   : {dataset_file[3]}')

    typer.echo('=' * utils.GetTerminalColumnSize())
    typer.echo(f'Total Files    : {len(dataset_files)}')
    typer.echo(f'Total Duration : {utils.SecondToTimeCode(total_audio_duration)}')
    typer.echo('=' * utils.GetTerminalColumnSize())
590 | 
591 | 
@app.command(help='Train model.')
def train(
    speaker_name: Annotated[str, typer.Argument(help='Speaker name.')],
    batch_size: Annotated[int, typer.Option(help='Training batch size.')] = 4,
    epochs: Annotated[Union[int, None], typer.Option(help='Training epochs. (Cannot be used with --steps)')] = None,
    steps: Annotated[Union[int, None], typer.Option(help='Training steps. (Cannot be used with --epochs)')] = None,
):
    """
    Stage a speaker's dataset into the Bert-VITS2 working directory, run the Bert-VITS2
    preprocessing scripts and then start training (train_ms.py).

    Args:
        speaker_name (str): Name of the speaker directory under constants.DATASETS_DIR.
        batch_size (int): Training batch size written to the model config.json.
        epochs (int | None): Number of epochs. Mutually exclusive with steps.
        steps (int | None): Number of steps. Mutually exclusive with epochs.

    Exits with status 1 on validation errors or when the training process fails.
    """

    typer.echo('=' * utils.GetTerminalColumnSize())

    # Validation
    dataset_dir = constants.DATASETS_DIR / speaker_name
    if not dataset_dir.exists():
        typer.echo(f'Error: Speaker {speaker_name} not found.')
        typer.echo('=' * utils.GetTerminalColumnSize())
        sys.exit(1)
    if epochs is not None and steps is not None:
        typer.echo('Error: --epochs and --steps cannot be used together.')
        typer.echo('=' * utils.GetTerminalColumnSize())
        sys.exit(1)
    if epochs is None and steps is None:
        typer.echo('Error: --epochs or --steps must be specified.')
        typer.echo('=' * utils.GetTerminalColumnSize())
        sys.exit(1)
    if batch_size < 1:
        # A batch size of 0 would cause a division by zero in the epoch calculation below
        typer.echo('Error: --batch-size must be 1 or greater.')
        typer.echo('=' * utils.GetTerminalColumnSize())
        sys.exit(1)
    if not (dataset_dir / 'transcripts.list').is_file():
        typer.echo(f'Error: Transcripts file {dataset_dir / "transcripts.list"} not found.')
        typer.echo('=' * utils.GetTerminalColumnSize())
        sys.exit(1)

    # Parse transcripts.list to get the number of audio files in the dataset (blank lines are not entries)
    with open(dataset_dir / 'transcripts.list', mode='r', encoding='utf-8') as f:
        dataset_files_count = sum(1 for line in f.read().splitlines() if line.strip() != '')
    if dataset_files_count == 0:
        # An empty dataset cannot be trained on and would cause a division by zero below
        typer.echo(f'Error: Dataset of speaker {speaker_name} is empty.')
        typer.echo('=' * utils.GetTerminalColumnSize())
        sys.exit(1)

    steps_per_epoch = dataset_files_count / batch_size
    if epochs is not None:
        # --epochs was given: derive the step count from the batch size and the dataset size
        steps = math.ceil(steps_per_epoch * epochs)
    elif steps is not None:
        # --steps was given: derive the epoch count from the batch size and the dataset size
        ## +1 so that the requested step (and thus the model save) is reached for sure
        ## Only applied here; a user-specified --epochs is used as-is
        epochs = math.ceil(steps / steps_per_epoch) + 1

    typer.echo(f'Speaker: {speaker_name} / Directory: {dataset_dir} (Total {dataset_files_count} files)')
    typer.echo(f'Batch Size: {batch_size} / Epochs: {epochs} / Steps: {steps}')
    typer.echo('=' * utils.GetTerminalColumnSize())

    # Create the Bert-VITS2 dataset directory
    bert_vits2_dataset_dir = constants.BERT_VITS2_DIR / 'Data'
    bert_vits2_dataset_dir.mkdir(parents=True, exist_ok=True)
    speaker_data_dir = bert_vits2_dataset_dir / speaker_name

    # Download the pretrained models unless they are already cached
    ## Interrupting a download leaves a partial, unloadable model file behind;
    ## in that case the partial file has to be deleted manually before running again
    download_base_url = 'https://huggingface.co/Stardust-minus/Bert-VITS2-Japanese-Extra/resolve/main/'
    pretrained_model_names = ('D_0.pth', 'G_0.pth', 'WD_0.pth')
    for pretrained_model_name in pretrained_model_names:
        if not (constants.CACHE_DIR / pretrained_model_name).exists():
            typer.echo(f'Downloading pretrained model ({pretrained_model_name}) ...')
            utils.DownloadFile(download_base_url + pretrained_model_name, constants.CACHE_DIR / pretrained_model_name)

    # Remove Bert-VITS2/Data/(speaker)/audios/ and filelists/ if they exist, then recreate them empty
    ## When training again on the same dataset, this forces the Bert-related and transcript
    ## intermediate files to be regenerated
    for intermediate_dir_name in ('audios', 'filelists'):
        if (speaker_data_dir / intermediate_dir_name).exists():
            shutil.rmtree(speaker_data_dir / intermediate_dir_name)
        (speaker_data_dir / intermediate_dir_name).mkdir(parents=True, exist_ok=True)

    # Copy the dataset into the Bert-VITS2 dataset directory
    ## ex: 04-Datasets/(speaker)/audios/ -> Bert-VITS2/Data/(speaker)/audios/wavs/
    ## ex: 04-Datasets/(speaker)/transcripts.list -> Bert-VITS2/Data/(speaker)/filelists/transcripts.list
    typer.echo('Copying dataset files...')
    shutil.copytree(dataset_dir / 'audios', speaker_data_dir / 'audios' / 'wavs')
    shutil.copyfile(dataset_dir / 'transcripts.list', speaker_data_dir / 'filelists' / 'transcripts.list')

    # Rewrite the audio file names in the copied transcripts file to paths under Data/(speaker)/audios/wavs/
    ## e.g. 0001.wav|SpeakerName|JP|こんにちは -> Data/SpeakerName/audios/wavs/0001.wav|SpeakerName|JP|こんにちは
    ## A function is used as the replacement so that backslashes in the speaker name
    ## are not interpreted as regex template escapes
    with open(speaker_data_dir / 'filelists' / 'transcripts.list', 'r', encoding='utf-8') as f:
        transcripts_list = f.read()
    with open(speaker_data_dir / 'filelists' / 'transcripts.list', 'w', encoding='utf-8') as f:
        f.write(re.sub(r'(.*\.wav)', lambda match: f'Data/{speaker_name}/audios/wavs/{match.group(1)}', transcripts_list))

    # Copy the downloaded pretrained models to Bert-VITS2/Data/(speaker)/models/
    ## These files are overwritten during training, so they are copied rather than symlinked
    if not (speaker_data_dir / 'models').exists():
        typer.echo('Copying pretrained model files...')
        (speaker_data_dir / 'models').mkdir(parents=True, exist_ok=True)
        ## ex: .cache/G_0.pth -> Bert-VITS2/Data/(speaker)/models/G_0.pth
        for pretrained_model_name in pretrained_model_names:
            if not (speaker_data_dir / 'models' / pretrained_model_name).exists():
                shutil.copyfile(constants.CACHE_DIR / pretrained_model_name, speaker_data_dir / 'models' / pretrained_model_name)

    # Copy Bert-VITS2/configs/config.json to Bert-VITS2/Data/(speaker)/config.json
    ## The file is overwritten during training, so it is copied rather than symlinked
    if not (speaker_data_dir / 'config.json').exists():
        typer.echo('Copying model config file...')
        shutil.copyfile(constants.BERT_VITS2_DIR / 'configs' / 'config.json', speaker_data_dir / 'config.json')

    # Set epochs and batch_size in the copied config.json to the requested values
    with open(speaker_data_dir / 'config.json', mode='r', encoding='utf-8') as f:
        config = json.load(f)
    config['train']['epochs'] = epochs
    config['train']['batch_size'] = batch_size
    with open(speaker_data_dir / 'config.json', mode='w', encoding='utf-8') as f:
        json.dump(config, f, indent=2, ensure_ascii=False)

    # Copy Bert-VITS2/default_config.yml to Bert-VITS2/config.yml
    ## The target dataset name has to be changed, so an existing config.yml is overwritten as well
    typer.echo('Copying default_config.yml to config.yml...')
    shutil.copyfile(constants.BERT_VITS2_DIR / 'default_config.yml', constants.BERT_VITS2_DIR / 'config.yml')

    # Change dataset_path: "Data/MySpeaker" in config.yml to dataset_path: "Data/(speaker)"
    with open(constants.BERT_VITS2_DIR / 'config.yml', mode='r', encoding='utf-8') as f:
        config_yml = f.read()
    config_yml = re.sub(r'dataset_path: "Data/.*"', lambda _: f'dataset_path: "Data/{speaker_name}"', config_yml)
    with open(constants.BERT_VITS2_DIR / 'config.yml', mode='w', encoding='utf-8') as f:
        f.write(config_yml)
    typer.echo('=' * utils.GetTerminalColumnSize())

    # Run the Bert-VITS2 preprocessing scripts in order
    ## A failing script raises subprocess.CalledProcessError (check=True)
    for script_name in ('preprocess_text.py', 'bert_gen.py', 'clap_gen.py'):
        typer.echo(f'Running {script_name}...')
        typer.echo('-' * utils.GetTerminalColumnSize())
        subprocess.run(
            ['python', constants.BERT_VITS2_DIR / script_name],
            cwd = constants.BERT_VITS2_DIR,  # the scripts only work when run from inside Bert-VITS2/
            check = True,
        )
        typer.echo('=' * utils.GetTerminalColumnSize())

    # Start training (run Bert-VITS2/train_ms.py)
    typer.echo('Training started.')
    typer.echo('-' * utils.GetTerminalColumnSize())
    try:
        subprocess.run(
            ['python', constants.BERT_VITS2_DIR / 'train_ms.py'],
            cwd = constants.BERT_VITS2_DIR,  # the script only works when run from inside Bert-VITS2/
            check = True,
        )
    except subprocess.CalledProcessError as ex:
        typer.echo('-' * utils.GetTerminalColumnSize())
        typer.echo(f'Training failed. (Process exited with code {ex.returncode})')
        typer.echo('=' * utils.GetTerminalColumnSize())
        sys.exit(1)
    typer.echo('-' * utils.GetTerminalColumnSize())
    typer.echo('Training finished.')
    typer.echo('=' * utils.GetTerminalColumnSize())
768 | 
769 | 
@app.command(help='Infer model.')
def infer(
    speaker_name: Annotated[str, typer.Argument(help='Speaker name.')],
    model_step: Annotated[Optional[int], typer.Option(help='Model step. (Default: Largest step)')] = None,
):
    """
    Point Bert-VITS2's config.yml at a trained model of the given speaker and run the inference Web UI (webui.py).

    Args:
        speaker_name (str): Name of the speaker directory under Bert-VITS2/Data/.
        model_step (int | None): Step of the generator checkpoint (G_<step>.pth) to load.
            When omitted, the checkpoint with the largest step is used.

    Exits with status 1 when the speaker directory or the model file does not exist.
    """

    typer.echo('=' * utils.GetTerminalColumnSize())

    # Validation
    model_dir = constants.BERT_VITS2_DIR / 'Data' / speaker_name
    if not model_dir.exists():
        typer.echo(f'Error: Speaker {speaker_name} not found.')
        typer.echo('=' * utils.GetTerminalColumnSize())
        sys.exit(1)

    # No step given: pick the largest step among the saved generator checkpoints
    ## Checkpoints are named G_(step).pth, e.g. G_0.pth / G_1000.pth / G_2000.pth / G_3000.pth
    ## Files that do not match G_<digits>.pth exactly (e.g. G_latest.pth) are ignored
    ## instead of crashing int() with an empty string
    if model_step is None:
        available_steps: list[int] = []
        for candidate_file in (model_dir / 'models').glob('G_*.pth'):
            step_match = re.fullmatch(r'G_(\d+)', candidate_file.stem)
            if step_match is not None:
                available_steps.append(int(step_match.group(1)))
        model_step = max(available_steps, default=0)

    # Make sure the checkpoint for the chosen (or given) step exists
    model_file = model_dir / 'models' / f'G_{model_step}.pth'
    if not model_file.exists():
        typer.echo(f'Error: Model file {model_file} not found.')
        typer.echo('=' * utils.GetTerminalColumnSize())
        sys.exit(1)

    typer.echo(f'Speaker: {speaker_name} / Model Directory: {model_dir}')
    typer.echo(f'Model File: {model_file}')
    typer.echo('=' * utils.GetTerminalColumnSize())

    # Rewrite config.yml with regular expressions
    ## dataset_path: ".*" -> dataset_path: "Data/(speaker)"
    ## model: "models/.*" -> model: "models/G_(step).pth"
    ## A function is used as the replacement so that backslashes in the speaker name
    ## are not interpreted as regex template escapes
    with open(constants.BERT_VITS2_DIR / 'config.yml', mode='r', encoding='utf-8') as f:
        config_yml = f.read()
    config_yml = re.sub(r'dataset_path: ".*"', lambda _: f'dataset_path: "Data/{speaker_name}"', config_yml)
    config_yml = re.sub(r'model: "models/.*"', lambda _: f'model: "models/G_{model_step}.pth"', config_yml)
    with open(constants.BERT_VITS2_DIR / 'config.yml', mode='w', encoding='utf-8') as f:
        f.write(config_yml)

    # Run Bert-VITS2/webui.py
    typer.echo('Running Infer Web UI...')
    typer.echo('-' * utils.GetTerminalColumnSize())
    subprocess.run(
        ['python', constants.BERT_VITS2_DIR / 'webui.py'],
        cwd = constants.BERT_VITS2_DIR,  # the script only works when run from inside Bert-VITS2/
        check = True,
    )
    typer.echo('=' * utils.GetTerminalColumnSize())
832 | 
833 | 
@app.command(help='Show version.')
def version():
    """ Print the Aivis version string to the terminal. """

    version_message = f'Aivis version {__version__}'
    typer.echo(version_message)
837 | 
838 | 
# Run the CLI application (`app`) when this module is executed directly
if __name__ == '__main__':
    app()
841 | 


--------------------------------------------------------------------------------
/Aivis/constants.py:
--------------------------------------------------------------------------------
 1 | 
 2 | from enum import Enum
 3 | from pathlib import Path
 4 | 
 5 | 
# Directories where each kind of data is stored
## BASE_DIR is two levels above this file (the parent of this module's directory)
BASE_DIR = Path(__file__).resolve().parent.parent
CACHE_DIR = BASE_DIR / '.cache'
SOURCES_DIR = BASE_DIR / '01-Sources'
PREPARE_SOURCES_DIR = BASE_DIR / '02-PreparedSources'
SEGMENTS_DIR = BASE_DIR / '03-Segments'
DATASETS_DIR = BASE_DIR / '04-Datasets'
BERT_VITS2_DIR = BASE_DIR / 'Bert-VITS2'

# File extensions that are read as data sources
## Should cover the majority of video / audio file formats
SOURCE_FILE_EXTENSIONS = [
    '.wav',
    '.flac',
    '.opus',
    '.ogg',
    '.vorbis',
    '.mp3',
    '.m4a',
    '.mp4',
    '.mov',
    '.mkv',
    '.webm',
    '.wmv',
    '.ts',
    '.mts',
    '.m2ts',
    '.mpg',
    '.mpeg',
]

# Phrases to skip in order to avoid Whisper hallucinations
## NOTE(review): how these are matched against a transcript is decided by the code that uses this list
SKIP_TRANSCRIPTS = [
    '視聴ありがとう',
    '視聴頂き',
    '視聴いただき',
    '視聴下さ',
    '視聴くださ',
    'チャンネル登録',
]
46 | 
class ModelNameType(str, Enum):
    """
    Selectable model names as a str-valued Enum.

    Member names use underscores while the values use hyphens (e.g. large_v3 -> 'large-v3').
    NOTE(review): presumably these are Whisper model sizes — confirm against the CLI option that uses this type.
    """
    small = 'small'
    medium = 'medium'
    large = 'large'
    large_v1 = 'large-v1'
    large_v2 = 'large-v2'
    large_v3 = 'large-v3'
54 | 


--------------------------------------------------------------------------------
/Aivis/demucs.py:
--------------------------------------------------------------------------------
  1 | 
  2 | from __future__ import annotations
  3 | 
  4 | import ffmpeg
  5 | import numpy as np
  6 | import typer
  7 | from concurrent.futures import ProcessPoolExecutor
  8 | from pathlib import Path
  9 | from pydub import AudioSegment
 10 | from typing import Any, TYPE_CHECKING
 11 | 
 12 | # ロード時間短縮のため型チェック時のみインポートする
 13 | if TYPE_CHECKING:
 14 |     from torch import Tensor
 15 |     from demucs.htdemucs import HTDemucs
 16 | 
 17 | from Aivis import utils
 18 | 
 19 | 
 20 | def ConvertToWave(file_paths: list[Path], output_dir: Path) -> list[Path]:
 21 |     """
 22 |     音声ファイルを WAV に変換して出力する
 23 | 
 24 |     Args:
 25 |         file_paths (list[Path]): ファイルパスのリスト
 26 |         output_dir (Path): 出力先のフォルダ
 27 | 
 28 |     Returns:
 29 |         list[Path]: 出力されたファイルパスのリスト
 30 |     """
 31 | 
 32 |     # 出力されたファイルパスのリスト (すでに変換済みのファイルも含む)
 33 |     output_file_paths: list[Path] = []
 34 | 
 35 |     for file_path in file_paths:
 36 |         typer.echo('=' * utils.GetTerminalColumnSize())
 37 | 
 38 |         # 出力先のファイルがすでに存在する場合
 39 |         # すでに変換済みなのでスキップ
 40 |         output_file_path = output_dir / f'{file_path.name.split(".")[0]}.wav'
 41 |         if output_file_path.exists():
 42 |             typer.echo(f'File {file_path} is already converted.')
 43 |             output_file_paths.append(output_file_path)
 44 |             continue
 45 | 
 46 |         typer.echo(f'File {file_path} converting...')
 47 |         typer.echo('-' * utils.GetTerminalColumnSize())
 48 | 
 49 |         # 音声ファイルを読み込む
 50 |         audio = AudioSegment.from_file(file_path)
 51 | 
 52 |         # 音声ファイルを WAV に変換する
 53 |         audio.export(output_file_path, format='wav')
 54 |         typer.echo('-' * utils.GetTerminalColumnSize())
 55 |         typer.echo(f'File {file_path} converted.')
 56 | 
 57 |         output_file_paths.append(output_file_path)
 58 |         typer.echo(f'File saved: {output_file_path}')
 59 | 
 60 |     return output_file_paths
 61 | 
 62 | 
 63 | def ExtractVoices(file_paths: list[Path], output_dir: Path) -> list[Path]:
 64 |     """
 65 |     音声ファイルからボイスのみを抽出 (BGM などは除去) して出力する
 66 | 
 67 |     Args:
 68 |         file_paths (list[Path]): ファイルパスのリスト
 69 |         output_dir (Path): 出力先のフォルダ
 70 | 
 71 |     Returns:
 72 |         list[Path]: 出力されたファイルパスのリスト
 73 |     """
 74 | 
 75 |     # Demucs での推論終了時に確実に VRAM を解放するため、マルチプロセスで実行する
 76 |     ## 確実に VRAM を解放できないと VRAM 容量次第では後続の Whisper での書き起こし処理に支障するため
 77 |     ## 並列処理は行っていない (リソース的に厳しい) ため本来はマルチプロセスにする意味はないが、
 78 |     ## マルチプロセスで起動させれば、マルチプロセス終了時に確実に VRAM を解放することができる
 79 |     ## del model でもある程度解放できるが、完全に解放されるわけではないみたい…
 80 |     ## 並列処理を行うためにマルチプロセスにしているわけではないため max_workers は 1 に設定している
 81 |     with ProcessPoolExecutor(max_workers=1) as executor:
 82 |         return executor.submit(__ExtractVoicesMultiProcess, file_paths, output_dir).result()
 83 | 
 84 | 
 85 | def __ExtractVoicesMultiProcess(file_paths: list[Path], output_dir: Path) -> list[Path]:
 86 |     """
 87 |     ProcessPoolExecutor で実行される ExtractVoices() の実処理
 88 | 
 89 |     Args:
 90 |         file_paths (list[Path]): ファイルパスのリスト
 91 |         output_dir (Path): 出力先のフォルダ
 92 | 
 93 |     Returns:
 94 |         list[Path]: 出力されたファイルパスのリスト
 95 |     """
 96 | 
 97 |     import torch
 98 |     from demucs.pretrained import get_model_from_args
 99 | 
100 |     demucs_model = None
101 | 
102 |     # 出力されたファイルパスのリスト (すでに抽出済みのファイルも含む)
103 |     output_file_paths: list[Path] = []
104 | 
105 |     for file_path in file_paths:
106 | 
107 |         typer.echo('=' * utils.GetTerminalColumnSize())
108 | 
109 |         # 出力先のファイルがすでに存在する場合
110 |         # すでに抽出済みなのでスキップ
111 |         output_file_path = output_dir / f'{file_path.name.split(".")[0]}.wav'
112 |         if output_file_path.exists():
113 |             typer.echo(f'File {file_path} is already separated.')
114 |             output_file_paths.append(output_file_path)
115 |             continue
116 | 
117 |         typer.echo(f'File {file_path} separating...')
118 |         typer.echo('-' * utils.GetTerminalColumnSize())
119 | 
120 |         # 学習済みモデルを読み込む (初回のみ)
121 |         if demucs_model is None:
122 |             typer.echo('Demucs model loading...')
123 |             demucs_model = get_model_from_args(type('args', (object,), dict(name='htdemucs_ft', repo=None))).cpu().eval()
124 |             typer.echo('Demucs model loaded.')
125 |             typer.echo('-' * utils.GetTerminalColumnSize())
126 | 
127 |         # 音源分離を実行する
128 |         RunDemucs(
129 |             demucs_model,
130 |             str(file_path),
131 |             save_path = str(output_file_path),
132 |             device = 'cuda',
133 |             verbose = True,
134 |         )
135 |         typer.echo('-' * utils.GetTerminalColumnSize())
136 |         typer.echo(f'File {file_path} separated.')
137 | 
138 |         output_file_paths.append(output_file_path)
139 |         typer.echo(f'File saved: {output_file_path}')
140 | 
141 |     # GPU の VRAM を解放する
142 |     del demucs_model
143 |     torch.cuda.empty_cache()
144 | 
145 |     return output_file_paths
146 | 
147 | 
def RunDemucs(
    model: HTDemucs,
    audio: Tensor | str,
    input_sr: int | None = None,
    output_sr: int | None = None,
    device: str | None = None,
    verbose: bool = True,
    track_name: str | None = None,
    save_path: str | None = None,
    **demucs_options: Any,
) -> Tensor:
    """
    Run source separation with Demucs and return the isolated vocals.

    This is demucs_audio() as of stable-ts v2.14.4, ported with slight modifications.
    Since stable-ts v2.15.0 the source separation code was changed substantially to process only a single
    track out of Demucs' 4 tracks, but as a result the noise removal quality dropped badly (a low humming
    noise ends up in the output audio), so the older implementation is deliberately ported and used here.
    It is slower, but noise that disturbs training would defeat the purpose.
    ref: https://github.com/jianfch/stable-ts/blob/f6d61c228d5a00f89637422537d36cd358e5b90d/stable_whisper/audio.py

    Args:
        model (HTDemucs): Demucs model; its `samplerate` and `sources` attributes are read here.
        audio (Tensor | str): Audio tensor, or a file path (str/bytes) that is decoded with ffmpeg.
        input_sr (int | None): Sample rate of `audio` when it is a tensor. Required in that case
            unless it equals model.samplerate.
        output_sr (int | None): When given and different from model.samplerate, the vocals are resampled to it.
        device (str | None): Device passed to apply_model(). Defaults to "cuda" when available, else "cpu".
        verbose (bool): When truthy, prints which track is being processed.
        track_name (str | None): Name used in the log message.
        save_path (str | None): When given, the vocals are saved there ('.wav' is appended if missing).
        **demucs_options (Any): Extra keyword arguments merged into the apply_model() call.
            A 'mix' entry replaces the prepared audio tensor.

    Returns:
        Tensor: The isolated vocals.

    Raises:
        RuntimeError: When ffmpeg fails to decode the input file.
        ValueError: When `audio` is a tensor and `input_sr` is None.
    """

    import torch
    import torchaudio
    from demucs.apply import apply_model

    def load_audio(file: str | bytes, sr: int = 44100):
        # Decode `file` (a path, or raw bytes piped through stdin) into mono float32 samples at `sr` Hz
        if isinstance(file, bytes):
            inp, file = file, 'pipe:'
        else:
            inp = None
        try:
            # This launches a subprocess to decode audio while down-mixing and resampling as necessary.
            # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed.
            out, _ = (
                ffmpeg.input(file, threads=0)
                .output("-", format="s16le", acodec="pcm_s16le", ac=1, ar=sr)
                .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True, input=inp)
            )
        except ffmpeg.Error as e:
            raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e

        # Signed 16-bit PCM scaled by 1/32768 into float32
        return np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0

    # Quote the track name for the log message
    if track_name:
        track_name = f'"{track_name}"'

    if isinstance(audio, (str, bytes)):
        # File input: decode it at the model's sample rate (the path doubles as the track name)
        if not track_name:
            track_name = f'"{audio}"'
        audio = torch.from_numpy(load_audio(audio, model.samplerate))
    elif input_sr != model.samplerate:
        # Tensor input at a different sample rate: resample it to the model's sample rate
        if input_sr is None:
            raise ValueError('No [input_sr] specified for audio tensor.')
        audio = torchaudio.functional.resample(audio,
                                               orig_freq=input_sr,
                                               new_freq=model.samplerate)
    if not track_name:
        track_name = 'audio track'
    audio_dims = audio.dim()
    if audio_dims == 1:
        # 1-D input: add two leading dimensions and duplicate along the second-to-last one -> shape (1, 2, samples)
        audio = audio[None, None].repeat_interleave(2, -2)
    else:
        # Size 1 on the second-to-last dimension is duplicated to 2 (presumably mono -> stereo channels)
        if audio.shape[-2] == 1:
            audio = audio.repeat_interleave(2, -2)
        # Fewer than 3 dimensions: add a leading dimension
        if audio_dims < 3:
            audio = audio[None]

    # A caller-supplied 'mix' option replaces the audio prepared above
    if 'mix' in demucs_options:
        audio = demucs_options.pop('mix')

    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"

    vocals_idx = model.sources.index('vocals')
    if verbose:
        print(f'Isolating vocals from {track_name}')
    apply_kwarg = dict(
        model=model,
        mix=audio,
        device=device,
        split=True,
        overlap=.25,
        # NOTE(review): this is True for both verbose=True and verbose=False; only verbose=None turns it off
        progress=verbose is not None,
    )
    apply_kwarg.update(demucs_options)
    # Take the first item of the result, select the 'vocals' source and average over its first
    # remaining dimension (presumably the channels — TODO confirm)
    vocals = apply_model(**apply_kwarg)[0, vocals_idx].mean(0)  # type: ignore

    if device != 'cpu':
        torch.cuda.empty_cache()

    if output_sr is not None and model.samplerate != output_sr:
        vocals = torchaudio.functional.resample(vocals,
                                                orig_freq=model.samplerate,
                                                new_freq=output_sr)

    if save_path is not None:
        if not save_path.lower().endswith('.wav'):
            save_path += '.wav'
        # vocals[None] adds a leading dimension before saving
        torchaudio.save(save_path, vocals[None], output_sr or model.samplerate)  # type: ignore
        print(f'Saved: {save_path}')

    return vocals
249 | 


--------------------------------------------------------------------------------
/Aivis/prepare.py:
--------------------------------------------------------------------------------
  1 | 
  2 | import errno
  3 | import librosa
  4 | import pyloudnorm
  5 | import re
  6 | import regex
  7 | import shutil
  8 | import soundfile
  9 | import subprocess
 10 | import sys
 11 | import tempfile
 12 | import typer
 13 | from pathlib import Path
 14 | from pydub import AudioSegment
 15 | 
 16 | 
 17 | def GetAudioFileDuration(file_path: Path) -> float:
 18 |     """
 19 |     音声ファイルの長さを取得する
 20 | 
 21 |     Args:
 22 |         file_path (Path): 音声ファイルのパス
 23 | 
 24 |     Returns:
 25 |         float: 音声ファイルの長さ (秒)
 26 |     """
 27 | 
 28 |     # 音声ファイルを読み込む
 29 |     audio = AudioSegment.from_file(file_path)
 30 | 
 31 |     # 音声ファイルの長さを取得する
 32 |     return audio.duration_seconds
 33 | 
 34 | 
 35 | def SliceAudioFile(src_file_path: Path, dst_file_path: Path, start: float, end: float, trim_silence: bool) -> Path:
 36 |     """
 37 |     音声ファイルの一部を切り出して出力する
 38 |     trim_silence=True のときは追加で切り出した音声ファイルの前後の無音区間が削除される
 39 | 
 40 |     Args:
 41 |         src_file_path (Path): 切り出し元の音声ファイルのパス
 42 |         dst_file_path (Path): 切り出し先の音声ファイルのパス
 43 |         start (float): 切り出し開始時間 (秒)
 44 |         end (float): 切り出し終了時間 (秒)
 45 |         trim_silence (bool): 前後の無音区間を削除するかどうか
 46 |     """
 47 | 
 48 |     # 一時保存先のテンポラリファイル
 49 |     ## Windows だと /tmp/ が使えないので NamedTemporaryFile を使う
 50 |     ## src_file_path --切り出し--> temp1 --モノラル化--> temp2 --ノーマライズ--> temp3 --無音区間削除--> temp4 --リネーム--> dst_file_path
 51 |     dst_file_path_temp1 = Path(tempfile.NamedTemporaryFile(suffix='.wav').name)
 52 |     dst_file_path_temp2 = Path(tempfile.NamedTemporaryFile(suffix='.wav').name)
 53 |     dst_file_path_temp3 = Path(tempfile.NamedTemporaryFile(suffix='.wav').name)
 54 |     dst_file_path_temp4 = Path(tempfile.NamedTemporaryFile(suffix='.wav').name)
 55 | 
 56 |     # 開始時刻ちょうどから切り出すと子音が切れてしまうことがあるため、開始時刻の 0.1 秒前から切り出す
 57 |     start = max(0, start - 0.1)
 58 | 
 59 |     # 音声ファイルを読み込む
 60 |     audio = AudioSegment.from_file(src_file_path)
 61 | 
 62 |     # 音声ファイルを切り出す
 63 |     sliced_audio = audio[start * 1000:end * 1000]
 64 |     sliced_audio.export(dst_file_path_temp1, format='wav')
 65 | 
 66 |     # FFmpeg で 44.1kHz 16bit モノラルの wav 形式に変換する
 67 |     ## 基本この時点で 44.1kHz 16bit にはなっているはずだが、音声チャンネルはステレオのままなので、ここでモノラルにダウンミックスする
 68 |     subprocess.run([
 69 |         'ffmpeg',
 70 |         '-y',
 71 |         '-i', str(dst_file_path_temp1),
 72 |         '-ac', '1',
 73 |         '-ar', '44100',
 74 |         '-acodec', 'pcm_s16le',
 75 |         str(dst_file_path_temp2),
 76 |     ], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
 77 | 
 78 |     # pyloudnorm で音声ファイルをノーマライズ（ラウドネス正規化）する
 79 |     ## FFmpeg でのモノラルへのダウンミックスで音量が変わる可能性も無くはないため、念のためダウンミックス後にノーマライズを行うように実装している
 80 |     LoudnessNorm(dst_file_path_temp2, dst_file_path_temp3, loudness=-23.0)  # -23LUFS にノーマライズする
 81 | 
 82 |     if trim_silence is True:
 83 |         # 最後に前後の無音区間を librosa を使って削除する
 84 |         ## sr=None を指定して、音声ファイルのサンプリングレートをそのまま維持して読み込む (指定しないと 22050Hz になる…)
 85 |         ## 無音区間は dB 基準なのでノーマライズ後に実行した方が望ましい
 86 |         y, sr = librosa.load(dst_file_path_temp3, sr=None)  # type: ignore
 87 |         y, _ = librosa.effects.trim(y, top_db=30)
 88 |         soundfile.write(dst_file_path_temp4, y, sr)
 89 |     else:
 90 |         # 無音区間は削除せずにそのままコピーする
 91 |         shutil.copyfile(dst_file_path_temp3, dst_file_path_temp4)
 92 | 
 93 |     # 最後にファイルを dst_file_path にコピーする
 94 |     try:
 95 |         shutil.copyfile(dst_file_path_temp4, dst_file_path)
 96 |     except OSError as ex:
 97 |         # 万が一ファイル名が最大文字数を超える場合は、ファイル名を短くする
 98 |         ## 87文字は、Linux のファイル名の最大バイト数 (255B) から、拡張子 (.wav) を引いた 251B に入る UTF-8 の最大文字数
 99 |         ## NTFS のファイル名の最大文字数は 255 文字なので (バイト単位ではない) 、Windows でも問題ないはず
100 |         if ex.errno == errno.ENAMETOOLONG:
101 |             # ファイル名を短くした上でコピーする
102 |             dst_file_path_new = dst_file_path.with_name(dst_file_path.stem[:87] + dst_file_path.suffix)
103 |             shutil.copyfile(dst_file_path_temp4, dst_file_path_new)
104 |             typer.echo('Warning: File name is too long. Truncated.')
105 |             # フルの書き起こし文にアクセスできるように、別途テキストファイルに書き起こし文を保存する
106 |             with open(dst_file_path_new.with_suffix('.txt'), mode='w', encoding='utf-8') as f:
107 |                 transcript = re.sub(r'^\d+_', '', dst_file_path.stem)
108 |                 f.write(transcript)
109 |             # ファイル名からの書き起こし文の取得が終わったので、dst_file_path を上書きする
110 |             dst_file_path = dst_file_path_new
111 |         # Windows でファイル名に使用できない文字が含まれている場合は、ファイル名から使用できない文字を置換する
112 |         ## Windows はファイル名に使用できない文字が多い
113 |         elif ex.errno == errno.EINVAL and sys.platform == 'win32':
114 |             # ファイル名に使用できない文字を置換する
115 |             dst_file_path_new = dst_file_path.with_name(re.sub(r'[\\/:*?"<>|]', '_', dst_file_path.stem) + dst_file_path.suffix)
116 |             shutil.copyfile(dst_file_path_temp4, dst_file_path_new)
117 |             typer.echo('Warning: File name contains invalid characters. Replaced.')
118 |             # フルの書き起こし文にアクセスできるように、別途テキストファイルに書き起こし文を保存する
119 |             with open(dst_file_path_new.with_suffix('.txt'), mode='w', encoding='utf-8') as f:
120 |                 transcript = re.sub(r'^\d+_', '', dst_file_path.stem)
121 |                 f.write(transcript)
122 |             # ファイル名からの書き起こし文の取得が終わったので、dst_file_path を上書きする
123 |             dst_file_path = dst_file_path_new
124 |         else:
125 |             raise ex
126 | 
127 |     # 一時ファイルを削除
128 |     dst_file_path_temp1.unlink()
129 |     dst_file_path_temp2.unlink()
130 |     dst_file_path_temp3.unlink()
131 |     dst_file_path_temp4.unlink()
132 | 
133 |     return dst_file_path
134 | 
135 | 
def LoudnessNorm(input: Path, output: Path, peak: float = -1.0, loudness: float = -23.0, block_size : float = 0.400) -> None:
    """
    Peak-normalize and then loudness-normalize (ITU-R BS.1770-4) an audio file.
    ref: https://github.com/fishaudio/audio-preprocess/blob/main/fish_audio_preprocess/utils/loudness_norm.py#L9-L33

    Audio that cannot be measured is handled on a best-effort basis instead of producing NaN/inf samples:
    - empty or all-zero audio is written out unchanged
    - audio whose loudness cannot be measured (ValueError from pyloudnorm, or a non-finite measurement)
      is only peak-normalized

    Args:
        input: Input audio file
        output: Output audio file
        peak: Peak-normalize the audio to N dB. Defaults to -1.0.
        loudness: Loudness-normalize the audio to N dB LUFS. Defaults to -23.0.
        block_size: Block size used for the loudness measurement. Defaults to 0.400. (400 ms)

    Returns:
        None (the normalized audio is written to `output`)
    """

    # Load the audio file
    audio, rate = soundfile.read(str(input))

    # Peak normalization divides by the largest absolute sample, so digital silence would become NaN
    if audio.size == 0 or not audio.any():
        soundfile.write(str(output), audio, rate)
        return

    # Run the normalization
    audio = pyloudnorm.normalize.peak(audio, peak)
    meter = pyloudnorm.Meter(rate, block_size=block_size)  # create BS.1770 meter
    try:
        measured_loudness = meter.integrated_loudness(audio)
        # A -inf (or NaN) measurement would translate into an infinite (or NaN) gain, so skip it
        if float('-inf') < measured_loudness < float('inf'):
            audio = pyloudnorm.normalize.loudness(audio, measured_loudness, loudness)
    except ValueError:
        # Deliberate best effort: pyloudnorm rejects audio it cannot measure (e.g. shorter than one block),
        # in which case the audio is kept peak-normalized only
        pass

    # Write the audio file
    soundfile.write(str(output), audio, rate)
166 | 
167 | 
def PrepareText(text: str) -> str:
    """
    Preprocess text transcribed by Whisper into a more suitable form.
    (Whisper output is inconsistent about punctuation, so this normalization is required.)

    Args:
        text (str): Text transcribed by Whisper

    Returns:
        str: Preprocessed text (an empty string when the input is empty or whitespace only)
    """

    # Remove leading and trailing whitespace
    text = text.strip()

    # Nothing to normalize; this also keeps the text[-1]-style checks below from failing on empty input
    if not text:
        return ''

    # Treat the text as Japanese only when it contains at least one hiragana, katakana or kanji character
    # ref: https://note.nkmk.me/python-re-regex-character-type/
    hiragana_katakana_kanji_pattern = regex.compile(r'\p{Hiragana}|\p{Katakana}|\p{Han}')
    is_japanese = hiragana_katakana_kanji_pattern.search(text) is not None

    if is_japanese:
        # Replace half-width ､｡!? with full-width 、。！？
        for half_width, full_width in (('､', '、'), ('｡', '。'), ('!', '！'), ('?', '？')):
            text = text.replace(half_width, full_width)
        # Remove a half-width space that directly follows full-width 、。！？
        for mark in ('、', '。', '！', '？'):
            text = text.replace(mark + ' ', mark)

    # Append a period when the text does not end with a punctuation mark
    if is_japanese:
        if not text.endswith(('、', '。', '！', '？')):
            text = text + '。'
    else:
        if not text.endswith(('.', '!', '?')):
            text = text + '.'

    # Remove punctuation marks at the very beginning
    if is_japanese:
        text = re.sub(r'^[、。！？]+', '', text)
    else:
        text = re.sub(r'^[,.!?]+', '', text)

    # Collapse runs of 4 or more identical characters (e.g. ～～～～～～～～！！) down to 2 (e.g. ～～！！)
    text = re.sub(r'(.)\1{3,}', r'\1\1', text)

    # Replace whitespace inside the text (both half-width and full-width) with 、
    if is_japanese:
        text = re.sub(r'[ 　]', '、', text)

    # Remove strings enclosed in （）, 【】 or 「」
    text = re.sub(r'（.*?）', '', text)
    text = re.sub(r'【.*?】', '', text)
    text = re.sub(r'「.*?」', '', text)

    # Strip leading and trailing whitespace once more to be safe
    text = text.strip()

    # Collapse consecutive identical punctuation marks into one
    if is_japanese:
        text = re.sub(r'([、。！？])\1+', r'\1', text)
    else:
        text = re.sub(r'([,\.!\?])\1+', r'\1', text)

    return text
240 | 


--------------------------------------------------------------------------------
/Aivis/utils.py:
--------------------------------------------------------------------------------
 1 | 
 2 | import os
 3 | import requests
 4 | from pathlib import Path
 5 | 
 6 | 
 7 | def DownloadFile(url: str, path: Path) -> None:
 8 |     """
 9 |     ファイルをダウンロードする
10 | 
11 |     Args:
12 |         url (str): ダウンロードするファイルの URL
13 |         path (str): ダウンロードしたファイルの保存先
14 |     """
15 | 
16 |     with requests.get(url, stream=True) as r:
17 |         r.raise_for_status()
18 |         with open(path, mode='wb') as f:
19 |             for chunk in r.iter_content(chunk_size=8192):
20 |                 f.write(chunk)
21 | 
22 | 
def GetTerminalColumnSize() -> int:
    """
    Return the number of columns of the terminal.

    Returns:
        int: Number of columns of the terminal (80 when the size cannot be queried)
    """

    try:
        terminal_size = os.get_terminal_size()
    except OSError:
        # The size cannot be queried (os.get_terminal_size() raised OSError), so fall back to 80 columns
        return 80
    return terminal_size.columns
36 | 
37 | 
def SecondToTimeCode(second: float) -> str:
    """
    Convert a number of seconds into a time code (HH:MM:SS.mmm).

    The value is rounded to the nearest millisecond before it is split into fields. Truncating the
    fractional part instead loses a millisecond to floating-point error (e.g. 1.001 -> "00:00:01.000").

    Args:
        second (float): Number of seconds

    Returns:
        str: Time code
    """

    # Work in integer milliseconds so that every field is exact
    total_milliseconds = int(round(second * 1000))
    total_seconds, milliseconds = divmod(total_milliseconds, 1000)
    total_minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f'{hours:02d}:{minutes:02d}:{seconds:02d}.{milliseconds:03d}'
52 | 


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | 
# Use CUDA 12.1.1 (cuDNN 8 runtime, Ubuntu 20.04) as the base image
FROM nvcr.io/nvidia/cuda:12.1.1-cudnn8-runtime-ubuntu20.04

# Set the time zone to Tokyo
ENV TZ=Asia/Tokyo

# Keep apt-get from asking for interactive confirmation
ENV DEBIAN_FRONTEND=noninteractive

# Install Python 3.11 and the other software required to run
RUN apt-get update && apt-get install -y --no-install-recommends ca-certificates curl git software-properties-common tzdata && \
    add-apt-repository ppa:deadsnakes/ppa && \
    apt-get update && apt-get upgrade -y && \
    apt-get install -y --no-install-recommends \
        python3.11 \
        python3.11-dev \
        python3.11-distutils \
        python3.11-venv \
        build-essential \
        cmake && \
    apt-get -y autoremove && \
    apt-get -y clean && \
    rm -rf /var/lib/apt/lists/* && \
    rm -rf /tmp/*

# Install FFmpeg 6.0
RUN curl -LO \
    https://github.com/BtbN/FFmpeg-Builds/releases/download/autobuild-2023-11-30-12-55/ffmpeg-n6.0.1-linux64-gpl-shared-6.0.tar.xz && \
    tar -xvf ffmpeg-n6.0.1-linux64-gpl-shared-6.0.tar.xz && \
    cp -ar ffmpeg-n6.0.1-linux64-gpl-shared-6.0/bin/* /usr/bin/ && \
    cp -ar ffmpeg-n6.0.1-linux64-gpl-shared-6.0/lib/* /usr/lib/ && \
    rm -rf ffmpeg-n6.0.1-linux64-gpl-shared-6.0 && \
    rm -rf ffmpeg-n6.0.1-linux64-gpl-shared-6.0.tar.xz

# Set the working directory inside the container
WORKDIR /code/

# Install pip
## python3-pip would install a pip meant for an older Python, so install it with get-pip.py instead
RUN curl https://bootstrap.pypa.io/get-pip.py | python3.11

# Install Poetry
RUN pip install poetry

# Copy only Poetry's dependency manifests
COPY ./pyproject.toml ./poetry.lock ./poetry.toml /code/

# Install the dependencies with Poetry
RUN poetry env use 3.11 && \
    poetry install --only main --no-root

# Download the OpenJTalk dictionary ahead of time
## This normally happens dynamically; doing it here avoids downloading it again every time the Docker image is rebuilt
RUN cd /code/.venv/lib/python3.11/site-packages/pyopenjtalk/ && \
    curl -LO https://github.com/r9y9/open_jtalk/releases/download/v1.11.1/open_jtalk_dic_utf_8-1.11.tar.gz && \
    tar -xvf open_jtalk_dic_utf_8-1.11.tar.gz && \
    rm -rf open_jtalk_dic_utf_8-1.11.tar.gz

# Make /root/.cache a symbolic link to /code/.cache/
RUN cd /root/ && ln -s /code/.cache/ .cache

# Make /root/.keras/ a symbolic link to /code/.cache/
RUN cd /root/ && ln -s /code/.cache/ .keras

# Make /root/nltk_data/ a symbolic link to /code/.cache/
RUN cd /root/ && ln -s /code/.cache/ nltk_data

# Copy the source code
COPY ./ /code/

# Use ./Aivis.sh as the entry point
ENTRYPOINT ["./Aivis.sh"]
74 | 


--------------------------------------------------------------------------------
/License.txt:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2023-2024 tsukumi
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
 6 | 
 7 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
 8 | 
 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
10 | 


--------------------------------------------------------------------------------
/Readme.md:
--------------------------------------------------------------------------------
  1 | 
  2 | # Aivis
  3 | 
  4 | 💠 **Aivis:** **AI** **V**oice **I**mitation **S**ystem
  5 | 
  6 | <img width="100%" alt="image" src="https://github.com/tsukumijima/Aivis/assets/39271166/c5b2a9cd-74ec-4b4b-8981-4ed81d0f4345">
  7 | 
  8 | ## Overview
  9 | 
 10 | **Aivis は、高音質で感情豊かな音声を生成できる [Bert-VITS2](https://github.com/fishaudio/Bert-VITS2) 用のデータセットの作成・学習・推論を、オールインワンで行えるツールです。**
 11 | 
 12 | 通常、専用に作成された音声コーパス以外の音源から学習用データセットを作成するには、膨大な手間と時間がかかります。  
 13 | Aivis では、**一般的な音源からデータセットを作成するための工程を AI で可能な限り自動化し、さらに最終的な人力でのアノテーション作業を簡単に行える Web UI を通して、データセット作成の手間と時間を大幅に削減します。**  
 14 | さらに Bert-VITS2 でのモデルの学習や推論 (Web UI の起動) も、簡単なコマンドひとつで実行できます。
 15 | 
 16 | https://github.com/tsukumijima/Aivis/assets/39271166/dfae6b5d-da73-477b-b316-dc077b06e6ef
 17 | 
 18 | 大元の音源の量・質・話し方にもよりますが、上のサンプル音声の通り、専用に作成された音声コーパスを使い学習させたモデルと比べても遜色ないクオリティの音声を生成できます。  
 19 | Bert-VITS2 の事前学習モデル自体の性能が極めて高いようで、私の環境では Aivis で作成したわずか7分弱のデータセットを学習させたモデルでも、かなり近い声質の明瞭かつ感情豊かな音声を生成できています。
 20 | 
 21 | > [!NOTE]  
 22 | > Aivis では、実用途に合わせて細部を調整した [フォーク版の Bert-VITS2](https://github.com/tsukumijima/Bert-VITS2) を利用しています。  
 23 | > 今のところ学習/推論アルゴリズムは変更していません。Web UI を日本語化したことや学習時に必要なモデルを自動でダウンロードできることなど以外は、オリジナルの Bert-VITS2 ([Japanese-Extra ブランチ](https://github.com/fishaudio/Bert-VITS2/tree/Japanese-Extra)) と同等です。
 24 | 
 25 | ## Installation
 26 | 
 27 | Linux (Ubuntu 20.04 LTS) x64 でのみ検証しています。  
 28 | CUDA / cuDNN 関連ライブラリ (.so) は基本 `poetry install` 時に pip wheels としてインストールされるため、別途 CUDA / cuDNN をインストールする必要はないと思われます。
 29 | 
 30 | Windows でもある程度動くように調整しているつもりですが、動作確認は取れていないためお勧めしません。Windows では WSL2 を使ってください。
 31 | 
 32 | > [!NOTE]  
 33 | > 手元に環境がないため WSL2 での動作検証はできていませんが、動作報告はいただいています。  
 34 | > WSL2 で動かす場合、NVIDIA GPU と CUDA のサポートが追加されている Windows 11 または Windows 10 (21H2 以降) が必要です。  
 35 | > なお、NVIDIA GPU ドライバは Windows 側にのみインストールする必要があります。WSL2 側にはインストールしないでください。
 36 | 
 37 | 当然ですが、Aivis の実行には NVIDIA GPU が必要です。  
GeForce GTX 1080 (VRAM 8GB) での動作を確認しています。VRAM はおそらく最低 8GB は必要です (VRAM 12GB のグラボが欲しい…) 。
 39 | 
 40 | ### Non-Docker
 41 | 
 42 | Docker を使わない場合、事前に Git・Python 3.11・Poetry・FFmpeg がインストールされている必要があります。
 43 | 
 44 | ```bash
 45 | # サブモジュールが含まれているため --recurse を付ける
 46 | git clone --recurse https://github.com/tsukumijima/Aivis.git
 47 | 
 48 | # 依存関係のインストール
 49 | cd Aivis
 50 | poetry env use 3.11
 51 | poetry install --no-root
 52 | 
 53 | # ヘルプを表示
 54 | ./Aivis.sh --help
 55 | ```
 56 | 
 57 | 以前インストールした環境を最新版に更新する場合は、以下のコマンドを実行してください。
 58 | 
 59 | ```bash
 60 | git pull
 61 | git submodule update --init --recursive
 62 | poetry install --no-root
 63 | ```
 64 | 
 65 | ### Docker
 66 | 
 67 | Docker を使う場合、事前に Git・Docker がインストールされている必要があります。  
 68 | Docker を使わない場合と比べてあまり検証できていないため、うまく動かないことがあるかもしれません。
 69 | 
 70 | 
 71 | ```bash
 72 | # サブモジュールが含まれているため --recurse を付ける
 73 | git clone --recurse https://github.com/tsukumijima/Aivis.git
 74 | 
 75 | # 依存関係のインストール
 76 | cd Aivis
 77 | ./Aivis-Docker.sh build
 78 | 
 79 | # ヘルプを表示
 80 | ./Aivis-Docker.sh --help
 81 | ```
 82 | 
 83 | 以前インストールした環境を最新版に更新する場合は、以下のコマンドを実行してください。
 84 | 
 85 | ```bash
 86 | git pull
 87 | git submodule update --init --recursive
 88 | ./Aivis-Docker.sh build
 89 | ```
 90 | 
 91 | ## Dataset Directory Structure
 92 | 
 93 | Aivis のデータセットディレクトリは、5段階に分けて構成されています。
 94 | 
 95 | - **01-Sources:** データセットにする音声をそのまま入れるディレクトリ
 96 |   - データセットの素材にする音声ファイルをそのまま入れてください。
 97 |     - 基本どの音声フォーマットでも大丈夫です。`create-segments` での下処理にて、自動的に wav に変換されます。
 98 |     - 背景 BGM の除去などの下処理を行う必要はありません。`create-segments` での下処理にて、自動的に BGM や雑音の除去が行われます。
 99 |     - 数十分〜数時間ある音声ファイルの場合は `create-segments` での書き起こしの精度が悪くなることがあるため、事前に10分前後に分割することをおすすめします。
100 |   - `create-segments` サブコマンドを実行すると、BGM や雑音の除去・書き起こし・一文ごとのセグメントへの分割・セグメント化した音声の音量/フォーマット調整が、すべて自動的に行われます。
101 | - **02-PreparedSources:** `create-segments` サブコマンドで下処理が行われた音声ファイルと、その書き起こしテキストが入るディレクトリ
102 |   - `create-segments` サブコマンドを実行すると、`01-Sources/` にある音声ファイルの BGM や雑音が除去され、このディレクトリに書き起こしテキストとともに保存されます。
103 |   - `create-segments` の実行時、このディレクトリに当該音声の下処理済みの音声ファイルや書き起こしテキストが存在する場合は、そのファイルが再利用されます。
  - 下処理済みの音声ファイル名は `02-PreparedSources/(01-Sourcesでのファイル名).wav` となります。
  - 書き起こしテキストのファイル名は `02-PreparedSources/(01-Sourcesでのファイル名).json` となります。
106 |     - 書き起こしの精度がよくない (Whisper の音声認識ガチャで外れを引いた) 場合は、書き起こしテキストの JSON ファイルを削除してから `create-segments` を実行すると、再度書き起こし処理が行われます。
107 | - **03-Segments:** `create-segments` サブコマンドでセグメント化された音声ファイルが入るディレクトリ
108 |   - `create-segments` サブコマンドを実行すると、`02-PreparedSources/` 以下にある音声ファイルが書き起こし文や無音区間などをもとに一文ごとにセグメント化され、このディレクトリに保存されます。
  - セグメントデータのファイル名は `03-Segments/(01-Sourcesでのファイル名)/(4桁の連番)_(書き起こし文).wav` となります。
    - 基本発生しませんが、万が一書き起こし文がファイル名の最大長を超える場合は、ファイル名が切り詰められ、代わりにフルの書き起こし文が `03-Segments/(01-Sourcesでのファイル名)/(4桁の連番)_(切り詰められた書き起こし文).txt` に保存されます。
  - なんらかの理由でもう一度セグメント化を行いたい場合は、`03-Segments/(01-Sourcesでのファイル名)/` を削除してから `create-segments` を実行すると、再度セグメント化が行われます。
112 | - **04-Datasets:** `create-datasets` サブコマンドで手動で作成されたデータセットが入るディレクトリ
113 |   - `create-datasets` サブコマンドを実行すると Gradio の Web UI が起動し、`03-Segments/` 以下にある一文ごとにセグメント化された音声と書き起こし文をもとにアノテーションを行い、手動でデータセットを作成できます。
114 |   - `03-Segments/` までの処理は AI 技術を使い完全に自動化されています。
115 |     - 調整を重ねそれなりに高い精度で自動生成できるようになった一方で、他の人と声が被っていたり発音がはっきりしないなど、データセットにするにはふさわしくない音声が含まれていることもあります。
116 |     - また、書き起こし文が微妙に誤っていたり、句読点がなかったりすることもあります。
117 |     - さらに元の音声に複数の話者の声が含まれている場合、必要な話者の音声だけを抽出する必要もあります。
118 |   - `create-datasets` サブコマンドで起動する Web UI は、どうしても最後は人力で行う必要があるアノテーション作業を、簡単に手早く行えるようにするためのものです。
119 |     - 話者の選別 (データセットから除外することも可能)・音声の再生・音声のトリミング (切り出し)・書き起こし文の修正を一つの画面で行えます。
120 |     - 確定ボタンを押すと、そのセグメントが指定された話者のデータセットに追加されます ([このデータを除外] ボタンが押された場合はデータセットへの追加がスキップされる) 。
121 |     - `create-datasets` サブコマンドによって、`03-Segments/` 以下のセグメント化された音声ファイルが変更されることはありません。
122 |   - データセットは音声ファイルが `04-Datasets/(話者名)/audio/wavs/(連番).wav` に、書き起こし文が `04-Datasets/(話者名)/transcripts.list` にそれぞれ保存されます。
123 |     - このディレクトリ構造は Bert-VITS2 のデータセット構造に概ね準拠したものですが、`config.json` など一部のファイルやディレクトリは存在しません。
124 |     - Bert-VITS2 の学習処理によって、`04-Datasets/` 以下のデータセットが変更されることはありません。
125 | - **05-Models:** `train` サブコマンドで生成された、Bert-VITS2 の学習済みモデルが入るディレクトリ
126 |   - 実体は `Bert-VITS2/Data/` へのシンボリックリンクです。
127 |   - `train` サブコマンドを実行すると、`04-Datasets/` 以下の指定された話者のデータセットディレクトリが Bert-VITS2 側にコピーされ、Bert-VITS2 の学習処理が開始されます。
128 |     - 生成された学習済みモデルは、`05-Models/(話者名)/models/` 以下に保存されます。
129 |     - 再度学習を行う場合は、`05-Models/(話者名)/` ディレクトリを削除してから再度 `train` サブコマンドを実行してください。
130 |   - `infer` サブコマンドを実行すると、Bert-VITS2 の推論用 Web UI が起動されます。
131 |     - Bert-VITS2 の推論用 Web UI によって、`05-Models/` 以下の学習済みモデルが変更されることはありません。
132 | 
133 | ## Usage
134 | 
135 | 概ね前述した通りですが、念のためにここでも説明します。  
136 | ここでは、学習するモデルの名前を「MySpeaker1」「MySpeaker2」とします。
137 | 
138 | ### 1. データセットの準備
139 | 
140 | `01-Sources/` 以下に、データセットにする音声ファイルをそのまま入れます。
141 | 
142 | 基本どの音声フォーマットでも大丈夫です。`create-segments` での下処理にて、自動的に wav に変換されます。  
143 | また、背景 BGM の除去などの下処理を行う必要はありません。`create-segments` での下処理にて、自動的に BGM や雑音の除去が行われます。
144 | 
145 | なお、数十分〜数時間ある音声ファイルの場合は `create-segments` での書き起こしの精度が悪くなることがあるため、事前に10分前後に分割することをおすすめします。
146 | 
147 | ### 2. データセットの下処理とセグメント化
148 | 
149 | ```bash
150 | # Non-Docker
151 | ./Aivis.sh create-segments
152 | 
153 | # Docker
154 | ./Aivis-Docker.sh create-segments
155 | ```
156 | 
157 | 実行すると、音源抽出 AI の [Demucs (htdemucs_ft)](https://github.com/facebookresearch/demucs) により `01-Sources/` 以下にある音声ファイルの BGM や雑音が除去され、書き起こしテキストとともに `02-PreparedSources/` 以下に保存されます。  
158 | 書き起こしテキストは音声認識 AI の [faster-whisper (large-v3)](https://github.com/SYSTRAN/faster-whisper) によって生成され、[stable-ts](https://github.com/jianfch/stable-ts) によってアライメントされます。  
159 | すでに `02-PreparedSources/` 以下に当該音声の下処理済みの音声ファイルや書き起こしテキストが存在する場合は、そのファイルが再利用されます。
160 | 
161 | 上記の処理が完了すると、`02-PreparedSources/` 以下にある音声ファイルが書き起こし文や無音区間などをもとに一文ごとにセグメント化され、`03-Segments/` 以下に保存されます。  
162 | すでに `03-Segments/` 以下に当該音声のセグメント化された音声ファイルが存在する場合は、当該音声のセグメント化はスキップされます。
163 | 
164 | > [!NOTE]  
165 | > `create-segments` サブコマンドにはいくつかオプションがあります。  
166 | >   
167 | > `--force-transcribe` オプションを指定すると、既に書き起こしテキストが存在する音声ファイルでも、再度書き起こし処理が行われます。  
168 | > Whisper の書き起こし結果にはガチャ (ランダム性) があり、稀にハルシネーションと無限ループだらけの使い物にならない書き起こし結果が出てくることもあります。  
169 | > 随時ログに出力される書き起こし結果が微妙だと感じた際は、このオプションを指定して再度書き起こし処理を行うことをおすすめします。  
170 | >   
171 | > `--whisper-model` オプションを指定すると、Whisper の書き起こしに使うモデルを指定できます。  
172 | > デフォルトは `large-v3` です。モデルの変更は GPU 性能の関係で `large-v3` が動かない場合のみ行ってください。  
173 | >   
174 | > `--no-use-demucs` オプションを指定すると、Demucs による BGM や雑音の除去を行わず、そのままの音声ファイルを使って書き起こし処理を行います。  
175 | > デフォルトでは、すべての音声ファイルに対して Demucs による BGM や雑音の除去が行われます。
176 | >   
177 | > `--no-trim-silence` オプションを指定すると、先頭と末尾の無音区間のトリミングを行わずにセグメント化された音声ファイルを保存します。  
178 | > デフォルトでは、セグメント化された音声ファイルを保存する際に、先頭と末尾の無音区間のトリミングが行われます。
179 | 
180 | ### 3. データセットの作成 (アノテーション)
181 | 
182 | <img width="100%" alt="image" src="https://github.com/tsukumijima/Aivis/assets/39271166/d2c2009d-5195-49b2-8771-980a3d317fa6"><br>
183 | 
184 | ```bash
185 | # Non-Docker
186 | ./Aivis.sh create-datasets '*' 'MySpeaker1,MySpeaker2'
187 | 
188 | # Docker
189 | ./Aivis-Docker.sh create-datasets '*' 'MySpeaker1,MySpeaker2'
190 | ```
191 | 
192 | Aivis でのデータセット作成工程は、大半が `create-segments` サブコマンドで自動化されています。  
193 | しかし、話者やセグメント自体の選別・書き起こし文の修正・うまく切り出せていない音声のトリミングなどの仕上げのアノテーション作業は、どうしても人力で行う必要があります。
194 | 
195 | `create-datasets` サブコマンドを実行すると、Gradio の Web UI が起動します。  
196 | この Web UI から `03-Segments/` 以下の一文ごとにセグメント化された音声と書き起こし文をもとにアノテーションを行い、人力でアノテーションを行った最終的なデータセットを作成できます。
197 | 
198 | `create-datasets` サブコマンドの第一引数には、`03-Segments/` 以下に自動生成されている、セグメント化された音声ファイルのディレクトリ名を指定します。通常は `01-Sources/` 以下の音声ファイル名と同じです。  
199 | 内部では Glob が使われているため、ワイルドカード (`*`) を活用し、複数のディレクトリのアノテーション作業を一括で実行できます。
200 | 
201 | `create-datasets` サブコマンドの第二引数には、データセットを作成する話者の名前をカンマ区切りで指定します。  
202 | ここで指定した話者のデータセットが、`04-Datasets/` 以下に作成されます。  
203 | Web UI 上では、セグメント化された音声ファイルごとにどの話者に割り当てるか、あるいはデータセットから除外するかを選択できます。
204 | 
205 | Web UI 上で確定ボタンを押すと、次のセグメントのアノテーション作業に移ります。  
206 | 実装上、一度確定したセグメントのアノテーションをやり直すことはできません。間違いがないかよく確認してください。  
207 | 作成中のデータセットの進捗ログは、`create-datasets` サブコマンドの標準出力に表示されます。
208 | 
209 | > [!TIP]
210 | > `--accept-all` オプションを指定すると、UI を表示せずにすべての音声ファイルを一括処理できます。  
211 | > あらかじめ第一引数で指定したディレクトリパターン内の音声が第二引数で指定した単一話者だけなことが分かっていて、さらに書き起こし文を調整する必要がないときは、このオプションを使うと大幅に作業時間を短縮できます。  
212 | > セグメントがどの話者に対応するかは自動判定できないため、`--accept-all` 指定時に複数の話者を指定することはできません。
213 | 
214 | > [!NOTE]  
215 | > すでにデータセットが途中まで作成されている状態で再度 `create-datasets` サブコマンドを実行すると、途中まで作成されているデータセットの次の連番から、データセット作成が再開されます。  
216 | > 最初からアノテーション作業をやり直したい場合は、`04-Datasets/` 以下の話者ごとのデータセットディレクトリを削除してから、再度 `create-datasets` サブコマンドを実行してください。
217 | 
218 | ```bash
219 | # Non-Docker
220 | ./Aivis.sh check-dataset 'MySpeaker1'
221 | 
222 | # Docker
223 | ./Aivis-Docker.sh check-dataset 'MySpeaker1'
224 | ```
225 | 
226 | `check-dataset` サブコマンドを実行すると、指定された話者のデータセットディレクトリにある音声ファイルと書き起こし文、音声ファイルの総時間 (秒) を確認できます。  
227 | 
228 | `check-dataset` サブコマンドの第一引数には、データセットを確認したい話者の名前 (`04-Datasets/` 以下のディレクトリ名と一致する) を指定します。ワイルドカードは使えないため注意してください。
229 | 
230 | ### 4. 学習の実行
231 | 
232 | <img width="100%" alt="image" src="https://github.com/tsukumijima/Aivis/assets/39271166/6d67a57b-d53e-465c-a454-d94d981278ad"><br>
233 | 
234 | ```bash
235 | # Non-Docker
236 | ./Aivis.sh train 'MySpeaker1' --steps 8000 --batch-size 4
237 | 
238 | # Docker
239 | ./Aivis-Docker.sh train 'MySpeaker1' --steps 8000 --batch-size 4
240 | ```
241 | 
242 | `train` サブコマンドを実行すると、指定された話者のデータセットディレクトリのコピー、`config.json` などの学習時に必要なファイルの生成、引数に応じた `Bert-VITS2/config.yml` の自動書き換えといった下処理の後、Bert-VITS2 の学習処理が開始されます。
243 | 
244 | [Bert-VITS2 の事前学習モデル](https://huggingface.co/Stardust-minus/Bert-VITS2-Japanese-Extra/tree/main) がまだダウンロードされていない場合は、実行時に `.cache/` 以下に自動的にダウンロードされます。
245 | 
246 | 学習時には、`--epochs` (エポック数) と `--steps` (ステップ数) のどちらかを指定する必要があります。  
247 | `--batch-size` はバッチサイズを指定するオプションで、指定しなかった場合は `4` に設定されます。
248 | 
249 | NVIDIA GPU のスペックにもよりますが、学習の完了には相応の時間がかかります。  
GeForce GTX 1080 (バッチサイズ: 2) で 8000 ステップ学習させたときは3時間程度かかりました。
251 | 
252 | 一般的に、2000 ステップ 〜 4000 ステップで十分似た声質になるようですが、データセットの数や質にも依存します。  
253 | 最大でも 8000 ステップも学習させれば、かなり自然な音声が生成できるようになります。
254 | 
255 | > [!IMPORTANT]  
256 | > ステップ数は、`(データセットの総数 ÷ バッチサイズ) × エポック数` で求められます。  
257 | > 逆に必要なエポック数は、`ステップ数 ÷ (データセットの総数 ÷ バッチサイズ)` で求められます。  
> データセットの総数が少ない場合は、エポック数を増やして目標ステップ数を超えるように調整してください。  
259 | > `--epochs` の代わりに `--steps` オプションを使うと、バッチサイズ・データセットの総数から自動的にエポック数を計算して学習を行います。
260 | 
261 | > [!TIP]  
262 | > VRAM 不足で実行途中に CUDA Out Of Memory エラーが出る場合は、`--batch-size` で学習時のバッチサイズを小さくしてください。  
> GeForce GTX 1080 ではバッチサイズ 2 〜 3 でギリギリな印象です。
264 | 
265 | 学習中は、標準出力に学習の進捗ログが表示されます。
266 | 
267 | 学習したモデルは `05-Models/(話者名)/models/` 以下に保存されます。  
268 | `05-Models/(話者名)/` ディレクトリを別 PC の Aivis の `05-Models/` 以下にコピーすることで、学習済みモデルを別の環境に移行できます。
269 | 
270 | > [!NOTE]  
271 | > 学習済みモデルは 1000 ステップごとに異なるファイル名で保存されます。  
272 | > もしモデルディレクトリに `G_7000.pth` が存在するなら、7000 ステップまで学習させたモデルです。
273 | 
274 | ### 5. 学習済みモデルの推論
275 | 
276 | <img width="100%" alt="image" src="https://github.com/tsukumijima/Aivis/assets/39271166/b38ce1e9-abae-4e0f-b9d4-33f896ba4408"><br>
277 | 
278 | ```bash
279 | # Non-Docker
280 | ./Aivis.sh infer 'MySpeaker1' --model-step 5000
281 | 
282 | # Docker
283 | ./Aivis-Docker.sh infer 'MySpeaker1' --model-step 5000
284 | ```
285 | 
286 | `infer` サブコマンドを実行すると、引数で指定された話者の学習済みモデルを使って音声を生成できる、推論用 Web UI を起動できます。
287 | 
288 | > [!TIP]
289 | > `--model-step` はオプションで、指定しなかった場合は一番最後に保存されたステップ数のモデルが使われます。  
290 | > もし一番最後に保存されたステップ数のモデルでの性能が芳しくない場合は、過学習気味かもしれません。  
291 | > より前の低いステップ数のモデルの方が発声やイントネーションが安定した音声を生成できる場合もあります。
292 | 
293 | この Web UI はオリジナルの Bert-VITS2 の Web UI を日本語化し、より使いやすくなるよう画面構成を整理し説明文を変更したものです。  
294 | コマンド実行時に、引数に合わせて自動的に `Bert-VITS2/config.yml` が書き換えられます。
295 | 
296 | Web UI にはいくつかボタンがありますが、基本は喋らせたい音声を入力して、「音声を行ごとに分割して生成 (おすすめ)」をクリックするだけです。
297 | 
298 | > [!NOTE]
299 | > Bert-VITS2 は、読み上げテキストが示す感情に合わせて、自動的に抑揚や感情表現を調整します。  
300 | > 例えば「ありがとうございます！」なら前向きに明るい声で、「とても残念です…。」なら残念そうな声で読み上げできます。  
301 | > 句読点の有無や、！？… などの文末記号の使い方次第で、抑揚や感情表現が大きく変わります。  
302 | > 意図した表現にならないときは、読み上げテキストを工夫してみてください。
303 | 
304 | > [!TIP]
305 | > Bert-VITS2 は行の最初の方の文から読み取れる感情表現を後まで引き継ぐ傾向があるため、まとまった文ごとに改行で区切り、[音声を行ごとに分割して生成] ボタンを押すとより自然な音声を生成できます。  
306 | > ただし、「とても嬉しいです。しかし残念です。」のように真逆の感情を含む文では、同じ行に含めた方がより自然な繋がりになることもあります。
307 | 
308 | > [!TIP]
309 | > 音声合成時に抑揚の強さ (SDP Ratio) を指定できます。基本 0.2 〜 0.6 の範囲がおすすめです。  
310 | > 0.0 に近いほど抑揚が弱く読み上げが遅くなり、1.0 に近いほど抑揚が強く読み上げが早くなります。  
311 | > 0.0 では棒読みに、0.6 ではより抑揚の強い感情のこもった音声になります。  
312 | > 0.2 では抑揚が比較的少ない抑制的なトーンで読み上げます。  
313 | > 0.6 以上にしても抑揚はあまり変わらない印象です。むしろ発声が不安定になることもあります。  
314 | 
315 | ## License
316 | 
317 | [MIT License](License.txt)
318 | 


--------------------------------------------------------------------------------
/poetry.toml:
--------------------------------------------------------------------------------
1 | [virtualenvs]
2 | in-project = true
3 | 
4 | [virtualenvs.options]
5 | always-copy = false
6 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [tool.poetry]
 2 | name = "Aivis"
 3 | version = "1.0.0"
 4 | description = "Aivis: AI Voice Imitation System"
 5 | package-mode = false
 6 | 
 7 | [tool.poetry.dependencies]
 8 | python = ">=3.11,<3.12"
 9 | demucs = "^4.0.1"
10 | faster-whisper = "0.10.1"
11 | gradio = "^4.16.0"
12 | librosa = "0.9.2"  # Bert-VITS2 は librosa 0.10.x に対応していない
13 | nvidia-cublas-cu11 = "^11.11.3.6"
14 | pydub = "^0.25.1"
15 | pyloudnorm = "^0.1.1"
16 | regex = "^2023.10.3"
17 | soundfile = "^0.12.1"
18 | stable-ts = "^2.15.2"
19 | torch = "^2.1.1"
20 | torchaudio = "^2.1.1"
21 | torchvision = "^0.16.1"
22 | typer = {extras = ["all"], version = "^0.9.0"}
23 | # Bert-VITS2 dependencies
24 | amfm-decompy = "^1.0.11"
25 | av = "==10.*"
26 | cmudict = "^1.0.16"
27 | cn2an = "^0.5.22"
28 | ffmpeg-python = "^0.2.0"
29 | fugashi = "^1.3.0"
30 | g2p-en = "^2.1.0"
31 | gputil = "^1.4.0"
32 | jaconv = "^0.3.4"
33 | jieba = "^0.42.1"
34 | langid = "^1.1.6"
35 | loguru = "^0.7.2"
36 | matplotlib = "^3.8.2"
37 | mecab-python3 = "^1.0.8"
38 | num2words = "^0.5.13"
39 | numba = "^0.58.1"
40 | numpy = "^1.26.2"
41 | phonemizer = "^3.2.1"
42 | psutil = "^5.9.7"
43 | pykakasi = "^2.2.1"
44 | pyopenjtalk-prebuilt = "^0.3.0"
45 | pypinyin = "^0.50.0"
46 | pyyaml = "^6.0.1"
47 | requests = "^2.31.0"
48 | scipy = "^1.11.4"
49 | sentencepiece = "^0.1.99"
50 | tensorboard = "^2.15.1"
51 | transformers = "^4.36.2"
52 | unidecode = "^1.3.7"
53 | unidic-lite = "^1.0.8"
54 | vector-quantize-pytorch = "^1.12.4"
55 | 
56 | [build-system]
57 | requires = ["poetry-core"]
58 | build-backend = "poetry.core.masonry.api"
59 | 


--------------------------------------------------------------------------------