├── .github └── workflows │ ├── genlocale.yml │ ├── pull_format.yml │ ├── push_format.yml │ └── unitest.yml ├── .gitignore ├── README.md ├── RVC-GUI.bat ├── config.py ├── docs ├── GUI.JPG └── GUI20230508.JPG ├── infer ├── infer-pm-index256.py ├── train-index.py └── trans_weights.py ├── infer_pack ├── attentions.py ├── commons.py ├── models.py ├── models_onnx.py ├── models_onnx_moess.py ├── modelsv2.py ├── modules.py └── transforms.py ├── my_utils.py ├── requirements.txt ├── rvcgui.py ├── setup.bat ├── trainset_preprocess_pipeline_print.py └── vc_infer_pipeline.py /.github/workflows/genlocale.yml: -------------------------------------------------------------------------------- 1 | name: genlocale 2 | on: 3 | push: 4 | branches: 5 | - main 6 | jobs: 7 | golangci: 8 | name: genlocale 9 | runs-on: ubuntu-latest 10 | steps: 11 | - name: Check out 12 | uses: actions/checkout@master 13 | 14 | - name: Run locale generation 15 | run: | 16 | python3 extract_locale.py 17 | cd i18n && python3 locale_diff.py 18 | 19 | - name: Commit back 20 | if: ${{ !github.head_ref }} 21 | continue-on-error: true 22 | run: | 23 | git config --local user.name 'github-actions[bot]' 24 | git config --local user.email '41898282+github-actions[bot]@users.noreply.github.com' 25 | git add --all 26 | git commit -m "🎨 同步 locale" 27 | 28 | - name: Create Pull Request 29 | if: ${{ !github.head_ref }} 30 | continue-on-error: true 31 | uses: peter-evans/create-pull-request@v4 32 | 33 | -------------------------------------------------------------------------------- /.github/workflows/pull_format.yml: -------------------------------------------------------------------------------- 1 | name: pull format 2 | 3 | on: [pull_request] 4 | 5 | permissions: 6 | contents: write 7 | jobs: 8 | pull_format: 9 | runs-on: ubuntu-latest 10 | continue-on-error: true 11 | steps: 12 | - name: checkout 13 | continue-on-error: true 14 | uses: actions/checkout@v3 15 | with: 16 | ref: ${{ github.head_ref }} 17 | fetch-depth: 0 18 | 19 | 20 | - name: Set up Python ${{ matrix.python-version }} 21 | uses: actions/setup-python@v4 22 | with: 23 | python-version: ${{ matrix.python-version }} 24 | 25 | - name: Install Black 26 | run: pip install black 27 | 28 | - name: Run Black 29 | # run: black $(git ls-files '*.py') 30 | run: black . 31 | 32 | - name: Commit Back 33 | uses: stefanzweifel/git-auto-commit-action@v4 34 | with: 35 | commit_message: Apply Code Formatter Change 36 | -------------------------------------------------------------------------------- /.github/workflows/push_format.yml: -------------------------------------------------------------------------------- 1 | name: push format 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | 8 | permissions: 9 | contents: write 10 | pull-requests: write 11 | jobs: 12 | push_format: 13 | runs-on: ubuntu-latest 14 | steps: 15 | - uses: actions/checkout@v3 16 | with: 17 | ref: ${{github.ref_name}} 18 | 19 | - name: Set up Python ${{ matrix.python-version }} 20 | uses: actions/setup-python@v4 21 | with: 22 | python-version: ${{ matrix.python-version }} 23 | 24 | - name: Install Black 25 | run: pip install black 26 | 27 | - name: Run Black 28 | # run: black $(git ls-files '*.py') 29 | run: black . 
30 | 31 | - name: Commit Back 32 | continue-on-error: true 33 | id: commitback 34 | run: | 35 | git config --local user.email "github-actions[bot]@users.noreply.github.com" 36 | git config --local user.name "github-actions[bot]" 37 | git add --all 38 | git commit -m "Format code" 39 | 40 | - name: Create Pull Request 41 | if: steps.commitback.outcome == 'success' 42 | continue-on-error: true 43 | uses: peter-evans/create-pull-request@v4 44 | with: 45 | body: Apply Code Formatter Change 46 | commit-message: Automatic code format 47 | -------------------------------------------------------------------------------- /.github/workflows/unitest.yml: -------------------------------------------------------------------------------- 1 | name: unitest 2 | on: [ push, pull_request ] 3 | jobs: 4 | build: 5 | runs-on: ${{ matrix.os }} 6 | strategy: 7 | matrix: 8 | python-version: ["3.8", "3.9", "3.10"] 9 | os: [ubuntu-latest] 10 | fail-fast: false 11 | 12 | steps: 13 | - uses: actions/checkout@master 14 | - name: Set up Python ${{ matrix.python-version }} 15 | uses: actions/setup-python@v4 16 | with: 17 | python-version: ${{ matrix.python-version }} 18 | - name: Install dependencies 19 | run: | 20 | sudo apt update 21 | sudo apt -y install ffmpeg 22 | sudo apt -y install -qq aria2 23 | aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/hubert_base.pt -d ./ -o hubert_base.pt 24 | python -m pip install --upgrade pip 25 | python -m pip install --upgrade setuptools 26 | python -m pip install --upgrade wheel 27 | pip install torch torchvision torchaudio 28 | pip install -r requirements.txt 29 | - name: Test step 1 & 2 30 | run: | 31 | mkdir -p logs/mi-test 32 | touch logs/mi-test/preprocess.log 33 | python trainset_preprocess_pipeline_print.py logs/mute/0_gt_wavs 48000 8 logs/mi-test True 34 | touch logs/mi-test/extract_f0_feature.log 35 | python extract_f0_print.py logs/mi-test $(nproc) pm 36 | python extract_feature_print.py cpu 1 0 0 logs/mi-test 37 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | __pycache__ 3 | /TEMP 4 | *.pyd 5 | hubert_base.pt 6 | /logs 7 | models/ 8 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 | 
3 | # RVC GUI
4 | 
5 | For audio file inference only
6 | 
7 | 
8 | 
9 | 
10 | 
11 | 
12 | 13 | 14 | 15 | 16 | 17 | 18 | ## GUI 19 | 20 | ![GUI](https://github.com/Tiger14n/RVC-GUI/raw/main/docs/GUI.JPG) 21 |

22 | 
23 | ## Direct setup for Windows users
24 | Download the prebuilt package: [Windows-pkg](https://github.com/Tiger14n/RVC-GUI/releases/tag/Windows-pkg)
25 | 
26 | 

27 | ## Preparing the environment
28 | 
29 | 
30 | * Install Python 3.8 or later if you have not already.
31 | 
32 | * Execute the commands for your platform:
33 | 
34 | Windows with Nvidia cards
35 | ```bash
36 | python -m pip install -U pip setuptools wheel
37 | pip install -U torch torchaudio --index-url https://download.pytorch.org/whl/cu118
38 | pip install -r requirements.txt
39 | ```
40 | Other platforms
41 | ```bash
42 | python -m pip install -U pip setuptools wheel
43 | pip install -U torch torchaudio
44 | pip install -r requirements.txt
45 | ```
46 | 
47 | Fix for Apple silicon Macs
48 | ```bash
49 | pip install --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/cpu
50 | 
51 | export PYTORCH_ENABLE_MPS_FALLBACK=1
52 | ```
53 | 
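A quick sanity check that PyTorch sees the expected backend (a minimal sketch mirroring the device detection in `config.py`; not part of the repo):

```python
import torch

# Report which backend will be used, in the same priority order as config.py:
# CUDA first, then Apple silicon MPS, then plain CPU.
if torch.cuda.is_available():
    print("CUDA device:", torch.cuda.get_device_name(0))
elif torch.backends.mps.is_available():
    print("Apple silicon MPS backend available")
    print("Remember to set PYTORCH_ENABLE_MPS_FALLBACK=1")
else:
    print("No supported GPU found, inference will run on CPU")
```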
54 | 
55 | * Download [hubert_base.pt](https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/hubert_base.pt/) and place it in the root folder
56 | 
57 | 
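If you would rather fetch it from a script, here is a minimal sketch (assumes the `requests` package is installed; the URL matches the one used in `unitest.yml`):

```python
import requests

# Stream hubert_base.pt into the repository root in 1 MiB chunks.
url = "https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/hubert_base.pt"
with requests.get(url, stream=True) as r:
    r.raise_for_status()
    with open("hubert_base.pt", "wb") as f:
        for chunk in r.iter_content(chunk_size=1 << 20):
            f.write(chunk)
```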
58 | 
59 | * Then use this command to start RVC GUI:
60 | ```bash
61 | python rvcgui.py
62 | ```
63 | Or run this file on Windows:
64 | ```
65 | RVC-GUI.bat
66 | ```
67 | 
68 | ## Loading models
69 | Use the import button to import a model from a zip file:
70 | * The .zip must contain the ".pth" weight file.
71 | * The .zip should also contain the feature retrieval file ".index" (recommended).
72 | 
73 | Or place the model manually in root/models:
74 | ```
75 | models
76 | ├───Person1
77 | │   ├───xxxx.pth
78 | │   ├───xxxx.index
79 | │   └───xxxx.npy
80 | └───Person2
81 |     ├───xxxx.pth
82 |     ├───...
83 |     └───...
84 | ```
85 | 
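For reference, a minimal sketch of how a layout like this can be enumerated (a hypothetical helper, not part of `rvcgui.py`; the `xxxx` names above are placeholders):

```python
import glob
import os

def list_models(models_dir="models"):
    """Yield (name, weight, index) for each voice folder under models/."""
    for person in sorted(os.listdir(models_dir)):
        folder = os.path.join(models_dir, person)
        if not os.path.isdir(folder):
            continue
        # One .pth weight file is required; the .index retrieval file is optional.
        pth = glob.glob(os.path.join(folder, "*.pth"))
        index = glob.glob(os.path.join(folder, "*.index"))
        if pth:
            yield person, pth[0], index[0] if index else None

for name, weight, index in list_models():
    print(name, weight, index)
```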
86 | 87 | 88 |
89 | 
90 | ### How to get models?
91 | * Join the [AI Hub](https://discord.gg/aihub) Discord
92 | * [Community Models on HuggingFace](https://huggingface.co/QuickWick/Music-AI-Voices/tree/main) by Wicked aka QuickWick
93 | 
94 | 
95 | 96 | K7#4523 97 | 98 | 99 | -------------------------------------------------------------------------------- /RVC-GUI.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | 3 | REM Get the path of the script's directory 4 | set "scriptDir=%~dp0" 5 | 6 | REM Set the path to the Python runtime folder 7 | set "runtimeFolder=%scriptDir%runtime" 8 | 9 | REM Check if the runtime folder exists 10 | 11 | REM Check if the runtime folder exists 12 | if exist "%runtimeFolder%\python.exe" ( 13 | REM Runtime folder exists, so run the file using the runtime Python 14 | echo Running with the runtime Python. 15 | "runtime/python.exe" rvcgui.py --pycmd "runtime/python.exe" 16 | pause 17 | ) else ( 18 | REM Runtime folder does not exist, so run the file using the system Python 19 | echo Running with the system Python. 20 | python.exe rvcgui.py --pycmd python.exe 21 | pause 22 | ) -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import glob 3 | import sys 4 | import torch 5 | from multiprocessing import cpu_count 6 | 7 | 8 | class Config: 9 | def __init__(self): 10 | self.device = "cuda:0" 11 | self.is_half = True 12 | self.n_cpu = 0 13 | self.gpu_name = None 14 | self.gpu_mem = None 15 | ( 16 | self.python_cmd, 17 | self.listen_port, 18 | self.iscolab, 19 | self.noparallel, 20 | self.noautoopen, 21 | self.use_gfloat, 22 | self.paperspace, 23 | ) = self.arg_parse() 24 | 25 | if self.use_gfloat: 26 | print("Using g_float instead of g_half") 27 | self.is_half = False 28 | self.x_pad, self.x_query, self.x_center, self.x_max = self.device_config() 29 | 30 | def arg_parse(self) -> tuple: 31 | parser = argparse.ArgumentParser() 32 | parser.add_argument("--port", type=int, default=7865, help="Listen port") 33 | parser.add_argument( 34 | "--pycmd", type=str, default="python", help="Python command" 35 | ) 36 | parser.add_argument("--colab", action="store_true", help="Launch in colab") 37 | parser.add_argument( 38 | "--noparallel", action="store_true", help="Disable parallel processing" 39 | ) 40 | parser.add_argument( 41 | "--noautoopen", 42 | action="store_true", 43 | help="Do not open in browser automatically", 44 | ) 45 | parser.add_argument( # this argument (if set to false) allows windows users to avoid the "slow_conv2d_cpu not implemented for 'Half'" exception 46 | "--use_gfloat", action="store_true", help="Will use g_float instead of g_half during voice conversion." 47 | ) 48 | parser.add_argument( # Fork Feature. Paperspace integration for web UI 49 | "--paperspace", action="store_true", help="Note that this argument just shares a gradio link for the web UI. Thus can be used on other non-local CLI systems." 
50 | ) 51 | cmd_opts = parser.parse_args() 52 | 53 | cmd_opts.port = cmd_opts.port if 0 <= cmd_opts.port <= 65535 else 7865 54 | 55 | return ( 56 | cmd_opts.pycmd, 57 | cmd_opts.port, 58 | cmd_opts.colab, 59 | cmd_opts.noparallel, 60 | cmd_opts.noautoopen, 61 | cmd_opts.use_gfloat, 62 | cmd_opts.paperspace, 63 | ) 64 | 65 | def device_config(self) -> tuple: 66 | if torch.cuda.is_available(): 67 | i_device = int(self.device.split(":")[-1]) 68 | self.gpu_name = torch.cuda.get_device_name(i_device) 69 | if ( 70 | ("16" in self.gpu_name and "V100" not in self.gpu_name.upper()) 71 | or "P40" in self.gpu_name.upper() 72 | or "1060" in self.gpu_name 73 | or "1070" in self.gpu_name 74 | or "1080" in self.gpu_name 75 | ): 76 | print("16系/10系显卡和P40强制单精度") 77 | self.is_half = False 78 | with open("trainset_preprocess_pipeline_print.py", "r") as f: 79 | strr = f.read().replace("3.7", "3.0") 80 | with open("trainset_preprocess_pipeline_print.py", "w") as f: 81 | f.write(strr) 82 | else: 83 | self.gpu_name = None 84 | self.gpu_mem = int( 85 | torch.cuda.get_device_properties(i_device).total_memory 86 | / 1024 87 | / 1024 88 | / 1024 89 | + 0.4 90 | ) 91 | if self.gpu_mem <= 4: 92 | with open("trainset_preprocess_pipeline_print.py", "r") as f: 93 | strr = f.read().replace("3.7", "3.0") 94 | with open("trainset_preprocess_pipeline_print.py", "w") as f: 95 | f.write(strr) 96 | elif torch.backends.mps.is_available(): 97 | print("No supported Nvidia cards found, using MPS for inference ") 98 | self.device = "mps" 99 | else: 100 | print("No supported Nvidia cards found, using CPU for inference") 101 | self.device = "cpu" 102 | if not self.use_gfloat: # Fork Feature: Force g_float (is_half = False) if --use_gfloat arg is used. 103 | self.is_half = False 104 | 105 | if self.n_cpu == 0: 106 | self.n_cpu = cpu_count() 107 | 108 | if self.is_half: 109 | # 6G显存配置 110 | x_pad = 3 111 | x_query = 10 112 | x_center = 60 113 | x_max = 65 114 | else: 115 | # 5G显存配置 116 | x_pad = 1 117 | x_query = 6 118 | x_center = 38 119 | x_max = 41 120 | 121 | if self.gpu_mem != None and self.gpu_mem <= 4: 122 | x_pad = 1 123 | x_query = 5 124 | x_center = 30 125 | x_max = 32 126 | 127 | return x_pad, x_query, x_center, x_max 128 | -------------------------------------------------------------------------------- /docs/GUI.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tiger14n/RVC-GUI/0c2e2b158e0fdff0ed91a53d9fea2b0b3dc4752b/docs/GUI.JPG -------------------------------------------------------------------------------- /docs/GUI20230508.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tiger14n/RVC-GUI/0c2e2b158e0fdff0ed91a53d9fea2b0b3dc4752b/docs/GUI20230508.JPG -------------------------------------------------------------------------------- /infer/infer-pm-index256.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | 对源特征进行检索 4 | """ 5 | import torch, pdb, os, parselmouth 6 | 7 | os.environ["CUDA_VISIBLE_DEVICES"] = "0" 8 | import numpy as np 9 | import soundfile as sf 10 | 11 | # from models import SynthesizerTrn256#hifigan_nonsf 12 | # from infer_pack.models import SynthesizerTrn256NSF as SynthesizerTrn256#hifigan_nsf 13 | from infer_pack.models import ( 14 | SynthesizerTrnMs256NSFsid as SynthesizerTrn256, 15 | ) # hifigan_nsf 16 | 17 | # from infer_pack.models import SynthesizerTrnMs256NSFsid_sim as SynthesizerTrn256#hifigan_nsf 18 | # from 
models import SynthesizerTrn256NSFsim as SynthesizerTrn256#hifigan_nsf 19 | # from models import SynthesizerTrn256NSFsimFlow as SynthesizerTrn256#hifigan_nsf 20 | 21 | 22 | from scipy.io import wavfile 23 | from fairseq import checkpoint_utils 24 | 25 | # import pyworld 26 | import librosa 27 | import torch.nn.functional as F 28 | import scipy.signal as signal 29 | 30 | # import torchcrepe 31 | from time import time as ttime 32 | 33 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 34 | model_path = r"E:\codes\py39\vits_vc_gpu_train\hubert_base.pt" # 35 | print("load model(s) from {}".format(model_path)) 36 | models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task( 37 | [model_path], 38 | suffix="", 39 | ) 40 | model = models[0] 41 | model = model.to(device) 42 | model = model.half() 43 | model.eval() 44 | 45 | # net_g = SynthesizerTrn256(1025,32,192,192,768,2,6,3,0.1,"1", [3,7,11],[[1,3,5], [1,3,5], [1,3,5]],[10,10,2,2],512,[16,16,4,4],183,256,is_half=True)#hifigan#512#256 46 | # net_g = SynthesizerTrn256(1025,32,192,192,768,2,6,3,0.1,"1", [3,7,11],[[1,3,5], [1,3,5], [1,3,5]],[10,10,2,2],512,[16,16,4,4],109,256,is_half=True)#hifigan#512#256 47 | net_g = SynthesizerTrn256( 48 | 1025, 49 | 32, 50 | 192, 51 | 192, 52 | 768, 53 | 2, 54 | 6, 55 | 3, 56 | 0, 57 | "1", 58 | [3, 7, 11], 59 | [[1, 3, 5], [1, 3, 5], [1, 3, 5]], 60 | [10, 10, 2, 2], 61 | 512, 62 | [16, 16, 4, 4], 63 | 183, 64 | 256, 65 | is_half=True, 66 | ) # hifigan#512#256#no_dropout 67 | # net_g = SynthesizerTrn256(1025,32,192,192,768,2,3,3,0.1,"1", [3,7,11],[[1,3,5], [1,3,5], [1,3,5]],[10,10,2,2],512,[16,16,4,4],0)#ts3 68 | # net_g = SynthesizerTrn256(1025,32,192,192,768,2,6,3,0.1,"1", [3,7,11],[[1,3,5], [1,3,5], [1,3,5]],[10,10,2],512,[16,16,4],0)#hifigan-ps-sr 69 | # 70 | # net_g = SynthesizerTrn(1025, 32, 192, 192, 768, 2, 6, 3, 0.1, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [5,5], 512, [15,15], 0)#ms 71 | # net_g = SynthesizerTrn(1025, 32, 192, 192, 768, 2, 6, 3, 0.1, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10,10], 512, [16,16], 0)#idwt2 72 | 73 | # weights=torch.load("infer/ft-mi_1k-noD.pt") 74 | # weights=torch.load("infer/ft-mi-freeze-vocoder-flow-enc_q_1k.pt") 75 | # weights=torch.load("infer/ft-mi-freeze-vocoder_true_1k.pt") 76 | # weights=torch.load("infer/ft-mi-sim1k.pt") 77 | weights = torch.load("infer/ft-mi-no_opt-no_dropout.pt") 78 | print(net_g.load_state_dict(weights, strict=True)) 79 | 80 | net_g.eval().to(device) 81 | net_g.half() 82 | 83 | 84 | def get_f0(x, p_len, f0_up_key=0): 85 | time_step = 160 / 16000 * 1000 86 | f0_min = 50 87 | f0_max = 1100 88 | f0_mel_min = 1127 * np.log(1 + f0_min / 700) 89 | f0_mel_max = 1127 * np.log(1 + f0_max / 700) 90 | 91 | f0 = ( 92 | parselmouth.Sound(x, 16000) 93 | .to_pitch_ac( 94 | time_step=time_step / 1000, 95 | voicing_threshold=0.6, 96 | pitch_floor=f0_min, 97 | pitch_ceiling=f0_max, 98 | ) 99 | .selected_array["frequency"] 100 | ) 101 | 102 | pad_size = (p_len - len(f0) + 1) // 2 103 | if pad_size > 0 or p_len - len(f0) - pad_size > 0: 104 | f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant") 105 | f0 *= pow(2, f0_up_key / 12) 106 | f0bak = f0.copy() 107 | 108 | f0_mel = 1127 * np.log(1 + f0 / 700) 109 | f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / ( 110 | f0_mel_max - f0_mel_min 111 | ) + 1 112 | f0_mel[f0_mel <= 1] = 1 113 | f0_mel[f0_mel > 255] = 255 114 | # f0_mel[f0_mel > 188] = 188 115 | f0_coarse = np.rint(f0_mel).astype(np.int) 116 | return f0_coarse, f0bak 
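# --- Hypothetical usage note (not part of the original script) ---
# get_f0 maps the Praat pitch track onto the 1-255 coarse mel scale expected by
# the model's pitch embedding, and returns the raw Hz curve alongside it, e.g.:
#     coarse, f0_hz = get_f0(audio, p_len=audio.shape[0] // 160, f0_up_key=-2)
# f0_up_key transposes in semitones via f0 *= 2 ** (f0_up_key / 12).
# Note: np.rint(f0_mel).astype(np.int) above relies on the deprecated np.int
# alias (removed in NumPy >= 1.24); np.int64 is the safe replacement.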
117 | 118 | 119 | import faiss 120 | 121 | index = faiss.read_index("infer/added_IVF512_Flat_mi_baseline_src_feat.index") 122 | big_npy = np.load("infer/big_src_feature_mi.npy") 123 | ta0 = ta1 = ta2 = 0 124 | for idx, name in enumerate( 125 | [ 126 | "冬之花clip1.wav", 127 | ] 128 | ): ## 129 | wav_path = "todo-songs/%s" % name # 130 | f0_up_key = -2 # 131 | audio, sampling_rate = sf.read(wav_path) 132 | if len(audio.shape) > 1: 133 | audio = librosa.to_mono(audio.transpose(1, 0)) 134 | if sampling_rate != 16000: 135 | audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000) 136 | 137 | feats = torch.from_numpy(audio).float() 138 | if feats.dim() == 2: # double channels 139 | feats = feats.mean(-1) 140 | assert feats.dim() == 1, feats.dim() 141 | feats = feats.view(1, -1) 142 | padding_mask = torch.BoolTensor(feats.shape).fill_(False) 143 | inputs = { 144 | "source": feats.half().to(device), 145 | "padding_mask": padding_mask.to(device), 146 | "output_layer": 9, # layer 9 147 | } 148 | if torch.cuda.is_available(): 149 | torch.cuda.synchronize() 150 | t0 = ttime() 151 | with torch.no_grad(): 152 | logits = model.extract_features(**inputs) 153 | feats = model.final_proj(logits[0]) 154 | 155 | ####索引优化 156 | npy = feats[0].cpu().numpy().astype("float32") 157 | D, I = index.search(npy, 1) 158 | feats = ( 159 | torch.from_numpy(big_npy[I.squeeze()].astype("float16")).unsqueeze(0).to(device) 160 | ) 161 | 162 | feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1) 163 | if torch.cuda.is_available(): 164 | torch.cuda.synchronize() 165 | t1 = ttime() 166 | # p_len = min(feats.shape[1],10000,pitch.shape[0])#太大了爆显存 167 | p_len = min(feats.shape[1], 10000) # 168 | pitch, pitchf = get_f0(audio, p_len, f0_up_key) 169 | p_len = min(feats.shape[1], 10000, pitch.shape[0]) # 太大了爆显存 170 | if torch.cuda.is_available(): 171 | torch.cuda.synchronize() 172 | t2 = ttime() 173 | feats = feats[:, :p_len, :] 174 | pitch = pitch[:p_len] 175 | pitchf = pitchf[:p_len] 176 | p_len = torch.LongTensor([p_len]).to(device) 177 | pitch = torch.LongTensor(pitch).unsqueeze(0).to(device) 178 | sid = torch.LongTensor([0]).to(device) 179 | pitchf = torch.FloatTensor(pitchf).unsqueeze(0).to(device) 180 | with torch.no_grad(): 181 | audio = ( 182 | net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0] 183 | .data.cpu() 184 | .float() 185 | .numpy() 186 | ) # nsf 187 | if torch.cuda.is_available(): 188 | torch.cuda.synchronize() 189 | t3 = ttime() 190 | ta0 += t1 - t0 191 | ta1 += t2 - t1 192 | ta2 += t3 - t2 193 | # wavfile.write("ft-mi_1k-index256-noD-%s.wav"%name, 40000, audio)## 194 | # wavfile.write("ft-mi-freeze-vocoder-flow-enc_q_1k-%s.wav"%name, 40000, audio)## 195 | # wavfile.write("ft-mi-sim1k-%s.wav"%name, 40000, audio)## 196 | wavfile.write("ft-mi-no_opt-no_dropout-%s.wav" % name, 40000, audio) ## 197 | 198 | 199 | print(ta0, ta1, ta2) # 200 | -------------------------------------------------------------------------------- /infer/train-index.py: -------------------------------------------------------------------------------- 1 | """ 2 | 格式:直接cid为自带的index位;aid放不下了,通过字典来查,反正就5w个 3 | """ 4 | import faiss, numpy as np, os 5 | 6 | # ###########如果是原始特征要先写save 7 | inp_root = r"E:\codes\py39\dataset\mi\2-co256" 8 | npys = [] 9 | for name in sorted(list(os.listdir(inp_root))): 10 | phone = np.load("%s/%s" % (inp_root, name)) 11 | npys.append(phone) 12 | big_npy = np.concatenate(npys, 0) 13 | print(big_npy.shape) # (6196072, 192)#fp32#4.43G 14 | np.save("infer/big_src_feature_mi.npy", 
big_npy) 15 | 16 | ##################train+add 17 | # big_npy=np.load("/bili-coeus/jupyter/jupyterhub-liujing04/vits_ch/inference_f0/big_src_feature_mi.npy") 18 | print(big_npy.shape) 19 | index = faiss.index_factory(256, "IVF512,Flat") # mi 20 | print("training") 21 | index_ivf = faiss.extract_index_ivf(index) # 22 | index_ivf.nprobe = 9 23 | index.train(big_npy) 24 | faiss.write_index(index, "infer/trained_IVF512_Flat_mi_baseline_src_feat.index") 25 | print("adding") 26 | index.add(big_npy) 27 | faiss.write_index(index, "infer/added_IVF512_Flat_mi_baseline_src_feat.index") 28 | """ 29 | 大小(都是FP32) 30 | big_src_feature 2.95G 31 | (3098036, 256) 32 | big_emb 4.43G 33 | (6196072, 192) 34 | big_emb双倍是因为求特征要repeat后再加pitch 35 | 36 | """ 37 | -------------------------------------------------------------------------------- /infer/trans_weights.py: -------------------------------------------------------------------------------- 1 | import torch, pdb 2 | 3 | # a=torch.load(r"E:\codes\py39\vits_vc_gpu_train\logs\ft-mi-suc\G_1000.pth")["model"]#sim_nsf# 4 | # a=torch.load(r"E:\codes\py39\vits_vc_gpu_train\logs\ft-mi-freeze-vocoder-flow-enc_q\G_1000.pth")["model"]#sim_nsf# 5 | # a=torch.load(r"E:\codes\py39\vits_vc_gpu_train\logs\ft-mi-freeze-vocoder\G_1000.pth")["model"]#sim_nsf# 6 | # a=torch.load(r"E:\codes\py39\vits_vc_gpu_train\logs\ft-mi-test\G_1000.pth")["model"]#sim_nsf# 7 | a = torch.load( 8 | r"E:\codes\py39\vits_vc_gpu_train\logs\ft-mi-no_opt-no_dropout\G_1000.pth" 9 | )[ 10 | "model" 11 | ] # sim_nsf# 12 | for key in a.keys(): 13 | a[key] = a[key].half() 14 | # torch.save(a,"ft-mi-freeze-vocoder_true_1k.pt")# 15 | # torch.save(a,"ft-mi-sim1k.pt")# 16 | torch.save(a, "ft-mi-no_opt-no_dropout.pt") # 17 | -------------------------------------------------------------------------------- /infer_pack/attentions.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import math 3 | import numpy as np 4 | import torch 5 | from torch import nn 6 | from torch.nn import functional as F 7 | 8 | from infer_pack import commons 9 | from infer_pack import modules 10 | from infer_pack.modules import LayerNorm 11 | 12 | 13 | class Encoder(nn.Module): 14 | def __init__( 15 | self, 16 | hidden_channels, 17 | filter_channels, 18 | n_heads, 19 | n_layers, 20 | kernel_size=1, 21 | p_dropout=0.0, 22 | window_size=10, 23 | **kwargs 24 | ): 25 | super().__init__() 26 | self.hidden_channels = hidden_channels 27 | self.filter_channels = filter_channels 28 | self.n_heads = n_heads 29 | self.n_layers = n_layers 30 | self.kernel_size = kernel_size 31 | self.p_dropout = p_dropout 32 | self.window_size = window_size 33 | 34 | self.drop = nn.Dropout(p_dropout) 35 | self.attn_layers = nn.ModuleList() 36 | self.norm_layers_1 = nn.ModuleList() 37 | self.ffn_layers = nn.ModuleList() 38 | self.norm_layers_2 = nn.ModuleList() 39 | for i in range(self.n_layers): 40 | self.attn_layers.append( 41 | MultiHeadAttention( 42 | hidden_channels, 43 | hidden_channels, 44 | n_heads, 45 | p_dropout=p_dropout, 46 | window_size=window_size, 47 | ) 48 | ) 49 | self.norm_layers_1.append(LayerNorm(hidden_channels)) 50 | self.ffn_layers.append( 51 | FFN( 52 | hidden_channels, 53 | hidden_channels, 54 | filter_channels, 55 | kernel_size, 56 | p_dropout=p_dropout, 57 | ) 58 | ) 59 | self.norm_layers_2.append(LayerNorm(hidden_channels)) 60 | 61 | def forward(self, x, x_mask): 62 | attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1) 63 | x = x * x_mask 64 | for i in range(self.n_layers): 65 | y = 
self.attn_layers[i](x, x, attn_mask) 66 | y = self.drop(y) 67 | x = self.norm_layers_1[i](x + y) 68 | 69 | y = self.ffn_layers[i](x, x_mask) 70 | y = self.drop(y) 71 | x = self.norm_layers_2[i](x + y) 72 | x = x * x_mask 73 | return x 74 | 75 | 76 | class Decoder(nn.Module): 77 | def __init__( 78 | self, 79 | hidden_channels, 80 | filter_channels, 81 | n_heads, 82 | n_layers, 83 | kernel_size=1, 84 | p_dropout=0.0, 85 | proximal_bias=False, 86 | proximal_init=True, 87 | **kwargs 88 | ): 89 | super().__init__() 90 | self.hidden_channels = hidden_channels 91 | self.filter_channels = filter_channels 92 | self.n_heads = n_heads 93 | self.n_layers = n_layers 94 | self.kernel_size = kernel_size 95 | self.p_dropout = p_dropout 96 | self.proximal_bias = proximal_bias 97 | self.proximal_init = proximal_init 98 | 99 | self.drop = nn.Dropout(p_dropout) 100 | self.self_attn_layers = nn.ModuleList() 101 | self.norm_layers_0 = nn.ModuleList() 102 | self.encdec_attn_layers = nn.ModuleList() 103 | self.norm_layers_1 = nn.ModuleList() 104 | self.ffn_layers = nn.ModuleList() 105 | self.norm_layers_2 = nn.ModuleList() 106 | for i in range(self.n_layers): 107 | self.self_attn_layers.append( 108 | MultiHeadAttention( 109 | hidden_channels, 110 | hidden_channels, 111 | n_heads, 112 | p_dropout=p_dropout, 113 | proximal_bias=proximal_bias, 114 | proximal_init=proximal_init, 115 | ) 116 | ) 117 | self.norm_layers_0.append(LayerNorm(hidden_channels)) 118 | self.encdec_attn_layers.append( 119 | MultiHeadAttention( 120 | hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout 121 | ) 122 | ) 123 | self.norm_layers_1.append(LayerNorm(hidden_channels)) 124 | self.ffn_layers.append( 125 | FFN( 126 | hidden_channels, 127 | hidden_channels, 128 | filter_channels, 129 | kernel_size, 130 | p_dropout=p_dropout, 131 | causal=True, 132 | ) 133 | ) 134 | self.norm_layers_2.append(LayerNorm(hidden_channels)) 135 | 136 | def forward(self, x, x_mask, h, h_mask): 137 | """ 138 | x: decoder input 139 | h: encoder output 140 | """ 141 | self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to( 142 | device=x.device, dtype=x.dtype 143 | ) 144 | encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1) 145 | x = x * x_mask 146 | for i in range(self.n_layers): 147 | y = self.self_attn_layers[i](x, x, self_attn_mask) 148 | y = self.drop(y) 149 | x = self.norm_layers_0[i](x + y) 150 | 151 | y = self.encdec_attn_layers[i](x, h, encdec_attn_mask) 152 | y = self.drop(y) 153 | x = self.norm_layers_1[i](x + y) 154 | 155 | y = self.ffn_layers[i](x, x_mask) 156 | y = self.drop(y) 157 | x = self.norm_layers_2[i](x + y) 158 | x = x * x_mask 159 | return x 160 | 161 | 162 | class MultiHeadAttention(nn.Module): 163 | def __init__( 164 | self, 165 | channels, 166 | out_channels, 167 | n_heads, 168 | p_dropout=0.0, 169 | window_size=None, 170 | heads_share=True, 171 | block_length=None, 172 | proximal_bias=False, 173 | proximal_init=False, 174 | ): 175 | super().__init__() 176 | assert channels % n_heads == 0 177 | 178 | self.channels = channels 179 | self.out_channels = out_channels 180 | self.n_heads = n_heads 181 | self.p_dropout = p_dropout 182 | self.window_size = window_size 183 | self.heads_share = heads_share 184 | self.block_length = block_length 185 | self.proximal_bias = proximal_bias 186 | self.proximal_init = proximal_init 187 | self.attn = None 188 | 189 | self.k_channels = channels // n_heads 190 | self.conv_q = nn.Conv1d(channels, channels, 1) 191 | self.conv_k = nn.Conv1d(channels, channels, 1) 192 | self.conv_v = 
nn.Conv1d(channels, channels, 1) 193 | self.conv_o = nn.Conv1d(channels, out_channels, 1) 194 | self.drop = nn.Dropout(p_dropout) 195 | 196 | if window_size is not None: 197 | n_heads_rel = 1 if heads_share else n_heads 198 | rel_stddev = self.k_channels**-0.5 199 | self.emb_rel_k = nn.Parameter( 200 | torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) 201 | * rel_stddev 202 | ) 203 | self.emb_rel_v = nn.Parameter( 204 | torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) 205 | * rel_stddev 206 | ) 207 | 208 | nn.init.xavier_uniform_(self.conv_q.weight) 209 | nn.init.xavier_uniform_(self.conv_k.weight) 210 | nn.init.xavier_uniform_(self.conv_v.weight) 211 | if proximal_init: 212 | with torch.no_grad(): 213 | self.conv_k.weight.copy_(self.conv_q.weight) 214 | self.conv_k.bias.copy_(self.conv_q.bias) 215 | 216 | def forward(self, x, c, attn_mask=None): 217 | q = self.conv_q(x) 218 | k = self.conv_k(c) 219 | v = self.conv_v(c) 220 | 221 | x, self.attn = self.attention(q, k, v, mask=attn_mask) 222 | 223 | x = self.conv_o(x) 224 | return x 225 | 226 | def attention(self, query, key, value, mask=None): 227 | # reshape [b, d, t] -> [b, n_h, t, d_k] 228 | b, d, t_s, t_t = (*key.size(), query.size(2)) 229 | query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3) 230 | key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) 231 | value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) 232 | 233 | scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1)) 234 | if self.window_size is not None: 235 | assert ( 236 | t_s == t_t 237 | ), "Relative attention is only available for self-attention." 238 | key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s) 239 | rel_logits = self._matmul_with_relative_keys( 240 | query / math.sqrt(self.k_channels), key_relative_embeddings 241 | ) 242 | scores_local = self._relative_position_to_absolute_position(rel_logits) 243 | scores = scores + scores_local 244 | if self.proximal_bias: 245 | assert t_s == t_t, "Proximal bias is only available for self-attention." 246 | scores = scores + self._attention_bias_proximal(t_s).to( 247 | device=scores.device, dtype=scores.dtype 248 | ) 249 | if mask is not None: 250 | scores = scores.masked_fill(mask == 0, -1e4) 251 | if self.block_length is not None: 252 | assert ( 253 | t_s == t_t 254 | ), "Local attention is only available for self-attention." 
255 | block_mask = ( 256 | torch.ones_like(scores) 257 | .triu(-self.block_length) 258 | .tril(self.block_length) 259 | ) 260 | scores = scores.masked_fill(block_mask == 0, -1e4) 261 | p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s] 262 | p_attn = self.drop(p_attn) 263 | output = torch.matmul(p_attn, value) 264 | if self.window_size is not None: 265 | relative_weights = self._absolute_position_to_relative_position(p_attn) 266 | value_relative_embeddings = self._get_relative_embeddings( 267 | self.emb_rel_v, t_s 268 | ) 269 | output = output + self._matmul_with_relative_values( 270 | relative_weights, value_relative_embeddings 271 | ) 272 | output = ( 273 | output.transpose(2, 3).contiguous().view(b, d, t_t) 274 | ) # [b, n_h, t_t, d_k] -> [b, d, t_t] 275 | return output, p_attn 276 | 277 | def _matmul_with_relative_values(self, x, y): 278 | """ 279 | x: [b, h, l, m] 280 | y: [h or 1, m, d] 281 | ret: [b, h, l, d] 282 | """ 283 | ret = torch.matmul(x, y.unsqueeze(0)) 284 | return ret 285 | 286 | def _matmul_with_relative_keys(self, x, y): 287 | """ 288 | x: [b, h, l, d] 289 | y: [h or 1, m, d] 290 | ret: [b, h, l, m] 291 | """ 292 | ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1)) 293 | return ret 294 | 295 | def _get_relative_embeddings(self, relative_embeddings, length): 296 | max_relative_position = 2 * self.window_size + 1 297 | # Pad first before slice to avoid using cond ops. 298 | pad_length = max(length - (self.window_size + 1), 0) 299 | slice_start_position = max((self.window_size + 1) - length, 0) 300 | slice_end_position = slice_start_position + 2 * length - 1 301 | if pad_length > 0: 302 | padded_relative_embeddings = F.pad( 303 | relative_embeddings, 304 | commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]), 305 | ) 306 | else: 307 | padded_relative_embeddings = relative_embeddings 308 | used_relative_embeddings = padded_relative_embeddings[ 309 | :, slice_start_position:slice_end_position 310 | ] 311 | return used_relative_embeddings 312 | 313 | def _relative_position_to_absolute_position(self, x): 314 | """ 315 | x: [b, h, l, 2*l-1] 316 | ret: [b, h, l, l] 317 | """ 318 | batch, heads, length, _ = x.size() 319 | # Concat columns of pad to shift from relative to absolute indexing. 320 | x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]])) 321 | 322 | # Concat extra elements so to add up to shape (len+1, 2*len-1). 323 | x_flat = x.view([batch, heads, length * 2 * length]) 324 | x_flat = F.pad( 325 | x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]]) 326 | ) 327 | 328 | # Reshape and slice out the padded elements. 329 | x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[ 330 | :, :, :length, length - 1 : 331 | ] 332 | return x_final 333 | 334 | def _absolute_position_to_relative_position(self, x): 335 | """ 336 | x: [b, h, l, l] 337 | ret: [b, h, l, 2*l-1] 338 | """ 339 | batch, heads, length, _ = x.size() 340 | # padd along column 341 | x = F.pad( 342 | x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]]) 343 | ) 344 | x_flat = x.view([batch, heads, length**2 + length * (length - 1)]) 345 | # add 0's in the beginning that will skew the elements after reshape 346 | x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]])) 347 | x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:] 348 | return x_final 349 | 350 | def _attention_bias_proximal(self, length): 351 | """Bias for self-attention to encourage attention to close positions. 
352 | Args: 353 | length: an integer scalar. 354 | Returns: 355 | a Tensor with shape [1, 1, length, length] 356 | """ 357 | r = torch.arange(length, dtype=torch.float32) 358 | diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1) 359 | return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0) 360 | 361 | 362 | class FFN(nn.Module): 363 | def __init__( 364 | self, 365 | in_channels, 366 | out_channels, 367 | filter_channels, 368 | kernel_size, 369 | p_dropout=0.0, 370 | activation=None, 371 | causal=False, 372 | ): 373 | super().__init__() 374 | self.in_channels = in_channels 375 | self.out_channels = out_channels 376 | self.filter_channels = filter_channels 377 | self.kernel_size = kernel_size 378 | self.p_dropout = p_dropout 379 | self.activation = activation 380 | self.causal = causal 381 | 382 | if causal: 383 | self.padding = self._causal_padding 384 | else: 385 | self.padding = self._same_padding 386 | 387 | self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size) 388 | self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size) 389 | self.drop = nn.Dropout(p_dropout) 390 | 391 | def forward(self, x, x_mask): 392 | x = self.conv_1(self.padding(x * x_mask)) 393 | if self.activation == "gelu": 394 | x = x * torch.sigmoid(1.702 * x) 395 | else: 396 | x = torch.relu(x) 397 | x = self.drop(x) 398 | x = self.conv_2(self.padding(x * x_mask)) 399 | return x * x_mask 400 | 401 | def _causal_padding(self, x): 402 | if self.kernel_size == 1: 403 | return x 404 | pad_l = self.kernel_size - 1 405 | pad_r = 0 406 | padding = [[0, 0], [0, 0], [pad_l, pad_r]] 407 | x = F.pad(x, commons.convert_pad_shape(padding)) 408 | return x 409 | 410 | def _same_padding(self, x): 411 | if self.kernel_size == 1: 412 | return x 413 | pad_l = (self.kernel_size - 1) // 2 414 | pad_r = self.kernel_size // 2 415 | padding = [[0, 0], [0, 0], [pad_l, pad_r]] 416 | x = F.pad(x, commons.convert_pad_shape(padding)) 417 | return x 418 | -------------------------------------------------------------------------------- /infer_pack/commons.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | import torch 4 | from torch import nn 5 | from torch.nn import functional as F 6 | 7 | 8 | def init_weights(m, mean=0.0, std=0.01): 9 | classname = m.__class__.__name__ 10 | if classname.find("Conv") != -1: 11 | m.weight.data.normal_(mean, std) 12 | 13 | 14 | def get_padding(kernel_size, dilation=1): 15 | return int((kernel_size * dilation - dilation) / 2) 16 | 17 | 18 | def convert_pad_shape(pad_shape): 19 | l = pad_shape[::-1] 20 | pad_shape = [item for sublist in l for item in sublist] 21 | return pad_shape 22 | 23 | 24 | def kl_divergence(m_p, logs_p, m_q, logs_q): 25 | """KL(P||Q)""" 26 | kl = (logs_q - logs_p) - 0.5 27 | kl += ( 28 | 0.5 * (torch.exp(2.0 * logs_p) + ((m_p - m_q) ** 2)) * torch.exp(-2.0 * logs_q) 29 | ) 30 | return kl 31 | 32 | 33 | def rand_gumbel(shape): 34 | """Sample from the Gumbel distribution, protect from overflows.""" 35 | uniform_samples = torch.rand(shape) * 0.99998 + 0.00001 36 | return -torch.log(-torch.log(uniform_samples)) 37 | 38 | 39 | def rand_gumbel_like(x): 40 | g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device) 41 | return g 42 | 43 | 44 | def slice_segments(x, ids_str, segment_size=4): 45 | ret = torch.zeros_like(x[:, :, :segment_size]) 46 | for i in range(x.size(0)): 47 | idx_str = ids_str[i] 48 | idx_end = idx_str + segment_size 49 | ret[i] = x[i, :, idx_str:idx_end] 50 | 
return ret 51 | 52 | 53 | def slice_segments2(x, ids_str, segment_size=4): 54 | ret = torch.zeros_like(x[:, :segment_size]) 55 | for i in range(x.size(0)): 56 | idx_str = ids_str[i] 57 | idx_end = idx_str + segment_size 58 | ret[i] = x[i, idx_str:idx_end] 59 | return ret 60 | 61 | 62 | def rand_slice_segments(x, x_lengths=None, segment_size=4): 63 | b, d, t = x.size() 64 | if x_lengths is None: 65 | x_lengths = t 66 | ids_str_max = x_lengths - segment_size + 1 67 | ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long) 68 | ret = slice_segments(x, ids_str, segment_size) 69 | return ret, ids_str 70 | 71 | 72 | def get_timing_signal_1d(length, channels, min_timescale=1.0, max_timescale=1.0e4): 73 | position = torch.arange(length, dtype=torch.float) 74 | num_timescales = channels // 2 75 | log_timescale_increment = math.log(float(max_timescale) / float(min_timescale)) / ( 76 | num_timescales - 1 77 | ) 78 | inv_timescales = min_timescale * torch.exp( 79 | torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment 80 | ) 81 | scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1) 82 | signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0) 83 | signal = F.pad(signal, [0, 0, 0, channels % 2]) 84 | signal = signal.view(1, channels, length) 85 | return signal 86 | 87 | 88 | def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4): 89 | b, channels, length = x.size() 90 | signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale) 91 | return x + signal.to(dtype=x.dtype, device=x.device) 92 | 93 | 94 | def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1): 95 | b, channels, length = x.size() 96 | signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale) 97 | return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis) 98 | 99 | 100 | def subsequent_mask(length): 101 | mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0) 102 | return mask 103 | 104 | 105 | @torch.jit.script 106 | def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): 107 | n_channels_int = n_channels[0] 108 | in_act = input_a + input_b 109 | t_act = torch.tanh(in_act[:, :n_channels_int, :]) 110 | s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) 111 | acts = t_act * s_act 112 | return acts 113 | 114 | 115 | def convert_pad_shape(pad_shape): 116 | l = pad_shape[::-1] 117 | pad_shape = [item for sublist in l for item in sublist] 118 | return pad_shape 119 | 120 | 121 | def shift_1d(x): 122 | x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1] 123 | return x 124 | 125 | 126 | def sequence_mask(length, max_length=None): 127 | if max_length is None: 128 | max_length = length.max() 129 | x = torch.arange(max_length, dtype=length.dtype, device=length.device) 130 | return x.unsqueeze(0) < length.unsqueeze(1) 131 | 132 | 133 | def generate_path(duration, mask): 134 | """ 135 | duration: [b, 1, t_x] 136 | mask: [b, 1, t_y, t_x] 137 | """ 138 | device = duration.device 139 | 140 | b, _, t_y, t_x = mask.shape 141 | cum_duration = torch.cumsum(duration, -1) 142 | 143 | cum_duration_flat = cum_duration.view(b * t_x) 144 | path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype) 145 | path = path.view(b, t_x, t_y) 146 | path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1] 147 | path = path.unsqueeze(1).transpose(2, 3) * mask 148 | return path 149 | 150 | 151 | def clip_grad_value_(parameters, clip_value, norm_type=2): 
152 | if isinstance(parameters, torch.Tensor): 153 | parameters = [parameters] 154 | parameters = list(filter(lambda p: p.grad is not None, parameters)) 155 | norm_type = float(norm_type) 156 | if clip_value is not None: 157 | clip_value = float(clip_value) 158 | 159 | total_norm = 0 160 | for p in parameters: 161 | param_norm = p.grad.data.norm(norm_type) 162 | total_norm += param_norm.item() ** norm_type 163 | if clip_value is not None: 164 | p.grad.data.clamp_(min=-clip_value, max=clip_value) 165 | total_norm = total_norm ** (1.0 / norm_type) 166 | return total_norm 167 | -------------------------------------------------------------------------------- /infer_pack/models_onnx.py: -------------------------------------------------------------------------------- 1 | import math, pdb, os 2 | from time import time as ttime 3 | import torch 4 | from torch import nn 5 | from torch.nn import functional as F 6 | from infer_pack import modules 7 | from infer_pack import attentions 8 | from infer_pack import commons 9 | from infer_pack.commons import init_weights, get_padding 10 | from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d 11 | from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm 12 | from infer_pack.commons import init_weights 13 | import numpy as np 14 | from infer_pack import commons 15 | 16 | 17 | class TextEncoder256(nn.Module): 18 | def __init__( 19 | self, 20 | out_channels, 21 | hidden_channels, 22 | filter_channels, 23 | n_heads, 24 | n_layers, 25 | kernel_size, 26 | p_dropout, 27 | f0=True, 28 | ): 29 | super().__init__() 30 | self.out_channels = out_channels 31 | self.hidden_channels = hidden_channels 32 | self.filter_channels = filter_channels 33 | self.n_heads = n_heads 34 | self.n_layers = n_layers 35 | self.kernel_size = kernel_size 36 | self.p_dropout = p_dropout 37 | self.emb_phone = nn.Linear(256, hidden_channels) 38 | self.lrelu = nn.LeakyReLU(0.1, inplace=True) 39 | if f0 == True: 40 | self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256 41 | self.encoder = attentions.Encoder( 42 | hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout 43 | ) 44 | self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) 45 | 46 | def forward(self, phone, pitch, lengths): 47 | if pitch == None: 48 | x = self.emb_phone(phone) 49 | else: 50 | x = self.emb_phone(phone) + self.emb_pitch(pitch) 51 | x = x * math.sqrt(self.hidden_channels) # [b, t, h] 52 | x = self.lrelu(x) 53 | x = torch.transpose(x, 1, -1) # [b, h, t] 54 | x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to( 55 | x.dtype 56 | ) 57 | x = self.encoder(x * x_mask, x_mask) 58 | stats = self.proj(x) * x_mask 59 | 60 | m, logs = torch.split(stats, self.out_channels, dim=1) 61 | return m, logs, x_mask 62 | 63 | 64 | class TextEncoder768(nn.Module): 65 | def __init__( 66 | self, 67 | out_channels, 68 | hidden_channels, 69 | filter_channels, 70 | n_heads, 71 | n_layers, 72 | kernel_size, 73 | p_dropout, 74 | f0=True, 75 | ): 76 | super().__init__() 77 | self.out_channels = out_channels 78 | self.hidden_channels = hidden_channels 79 | self.filter_channels = filter_channels 80 | self.n_heads = n_heads 81 | self.n_layers = n_layers 82 | self.kernel_size = kernel_size 83 | self.p_dropout = p_dropout 84 | self.emb_phone = nn.Linear(768, hidden_channels) 85 | self.lrelu = nn.LeakyReLU(0.1, inplace=True) 86 | if f0 == True: 87 | self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256 88 | self.encoder = attentions.Encoder( 89 | hidden_channels, 
filter_channels, n_heads, n_layers, kernel_size, p_dropout 90 | ) 91 | self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) 92 | 93 | def forward(self, phone, pitch, lengths): 94 | if pitch == None: 95 | x = self.emb_phone(phone) 96 | else: 97 | x = self.emb_phone(phone) + self.emb_pitch(pitch) 98 | x = x * math.sqrt(self.hidden_channels) # [b, t, h] 99 | x = self.lrelu(x) 100 | x = torch.transpose(x, 1, -1) # [b, h, t] 101 | x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to( 102 | x.dtype 103 | ) 104 | x = self.encoder(x * x_mask, x_mask) 105 | stats = self.proj(x) * x_mask 106 | 107 | m, logs = torch.split(stats, self.out_channels, dim=1) 108 | return m, logs, x_mask 109 | 110 | 111 | class ResidualCouplingBlock(nn.Module): 112 | def __init__( 113 | self, 114 | channels, 115 | hidden_channels, 116 | kernel_size, 117 | dilation_rate, 118 | n_layers, 119 | n_flows=4, 120 | gin_channels=0, 121 | ): 122 | super().__init__() 123 | self.channels = channels 124 | self.hidden_channels = hidden_channels 125 | self.kernel_size = kernel_size 126 | self.dilation_rate = dilation_rate 127 | self.n_layers = n_layers 128 | self.n_flows = n_flows 129 | self.gin_channels = gin_channels 130 | 131 | self.flows = nn.ModuleList() 132 | for i in range(n_flows): 133 | self.flows.append( 134 | modules.ResidualCouplingLayer( 135 | channels, 136 | hidden_channels, 137 | kernel_size, 138 | dilation_rate, 139 | n_layers, 140 | gin_channels=gin_channels, 141 | mean_only=True, 142 | ) 143 | ) 144 | self.flows.append(modules.Flip()) 145 | 146 | def forward(self, x, x_mask, g=None, reverse=False): 147 | if not reverse: 148 | for flow in self.flows: 149 | x, _ = flow(x, x_mask, g=g, reverse=reverse) 150 | else: 151 | for flow in reversed(self.flows): 152 | x = flow(x, x_mask, g=g, reverse=reverse) 153 | return x 154 | 155 | def remove_weight_norm(self): 156 | for i in range(self.n_flows): 157 | self.flows[i * 2].remove_weight_norm() 158 | 159 | 160 | class PosteriorEncoder(nn.Module): 161 | def __init__( 162 | self, 163 | in_channels, 164 | out_channels, 165 | hidden_channels, 166 | kernel_size, 167 | dilation_rate, 168 | n_layers, 169 | gin_channels=0, 170 | ): 171 | super().__init__() 172 | self.in_channels = in_channels 173 | self.out_channels = out_channels 174 | self.hidden_channels = hidden_channels 175 | self.kernel_size = kernel_size 176 | self.dilation_rate = dilation_rate 177 | self.n_layers = n_layers 178 | self.gin_channels = gin_channels 179 | 180 | self.pre = nn.Conv1d(in_channels, hidden_channels, 1) 181 | self.enc = modules.WN( 182 | hidden_channels, 183 | kernel_size, 184 | dilation_rate, 185 | n_layers, 186 | gin_channels=gin_channels, 187 | ) 188 | self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) 189 | 190 | def forward(self, x, x_lengths, g=None): 191 | x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to( 192 | x.dtype 193 | ) 194 | x = self.pre(x) * x_mask 195 | x = self.enc(x, x_mask, g=g) 196 | stats = self.proj(x) * x_mask 197 | m, logs = torch.split(stats, self.out_channels, dim=1) 198 | z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask 199 | return z, m, logs, x_mask 200 | 201 | def remove_weight_norm(self): 202 | self.enc.remove_weight_norm() 203 | 204 | 205 | class Generator(torch.nn.Module): 206 | def __init__( 207 | self, 208 | initial_channel, 209 | resblock, 210 | resblock_kernel_sizes, 211 | resblock_dilation_sizes, 212 | upsample_rates, 213 | upsample_initial_channel, 214 | upsample_kernel_sizes, 215 | 
gin_channels=0, 216 | ): 217 | super(Generator, self).__init__() 218 | self.num_kernels = len(resblock_kernel_sizes) 219 | self.num_upsamples = len(upsample_rates) 220 | self.conv_pre = Conv1d( 221 | initial_channel, upsample_initial_channel, 7, 1, padding=3 222 | ) 223 | resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2 224 | 225 | self.ups = nn.ModuleList() 226 | for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): 227 | self.ups.append( 228 | weight_norm( 229 | ConvTranspose1d( 230 | upsample_initial_channel // (2**i), 231 | upsample_initial_channel // (2 ** (i + 1)), 232 | k, 233 | u, 234 | padding=(k - u) // 2, 235 | ) 236 | ) 237 | ) 238 | 239 | self.resblocks = nn.ModuleList() 240 | for i in range(len(self.ups)): 241 | ch = upsample_initial_channel // (2 ** (i + 1)) 242 | for j, (k, d) in enumerate( 243 | zip(resblock_kernel_sizes, resblock_dilation_sizes) 244 | ): 245 | self.resblocks.append(resblock(ch, k, d)) 246 | 247 | self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) 248 | self.ups.apply(init_weights) 249 | 250 | if gin_channels != 0: 251 | self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) 252 | 253 | def forward(self, x, g=None): 254 | x = self.conv_pre(x) 255 | if g is not None: 256 | x = x + self.cond(g) 257 | 258 | for i in range(self.num_upsamples): 259 | x = F.leaky_relu(x, modules.LRELU_SLOPE) 260 | x = self.ups[i](x) 261 | xs = None 262 | for j in range(self.num_kernels): 263 | if xs is None: 264 | xs = self.resblocks[i * self.num_kernels + j](x) 265 | else: 266 | xs += self.resblocks[i * self.num_kernels + j](x) 267 | x = xs / self.num_kernels 268 | x = F.leaky_relu(x) 269 | x = self.conv_post(x) 270 | x = torch.tanh(x) 271 | 272 | return x 273 | 274 | def remove_weight_norm(self): 275 | for l in self.ups: 276 | remove_weight_norm(l) 277 | for l in self.resblocks: 278 | l.remove_weight_norm() 279 | 280 | 281 | class SineGen(torch.nn.Module): 282 | """Definition of sine generator 283 | SineGen(samp_rate, harmonic_num = 0, 284 | sine_amp = 0.1, noise_std = 0.003, 285 | voiced_threshold = 0, 286 | flag_for_pulse=False) 287 | samp_rate: sampling rate in Hz 288 | harmonic_num: number of harmonic overtones (default 0) 289 | sine_amp: amplitude of sine-wavefrom (default 0.1) 290 | noise_std: std of Gaussian noise (default 0.003) 291 | voiced_thoreshold: F0 threshold for U/V classification (default 0) 292 | flag_for_pulse: this SinGen is used inside PulseGen (default False) 293 | Note: when flag_for_pulse is True, the first time step of a voiced 294 | segment is always sin(np.pi) or cos(0) 295 | """ 296 | 297 | def __init__( 298 | self, 299 | samp_rate, 300 | harmonic_num=0, 301 | sine_amp=0.1, 302 | noise_std=0.003, 303 | voiced_threshold=0, 304 | flag_for_pulse=False, 305 | ): 306 | super(SineGen, self).__init__() 307 | self.sine_amp = sine_amp 308 | self.noise_std = noise_std 309 | self.harmonic_num = harmonic_num 310 | self.dim = self.harmonic_num + 1 311 | self.sampling_rate = samp_rate 312 | self.voiced_threshold = voiced_threshold 313 | 314 | def _f02uv(self, f0): 315 | # generate uv signal 316 | uv = torch.ones_like(f0) 317 | uv = uv * (f0 > self.voiced_threshold) 318 | return uv 319 | 320 | def forward(self, f0, upp): 321 | """sine_tensor, uv = forward(f0) 322 | input F0: tensor(batchsize=1, length, dim=1) 323 | f0 for unvoiced steps should be 0 324 | output sine_tensor: tensor(batchsize=1, length, dim) 325 | output uv: tensor(batchsize=1, length, 1) 326 | """ 327 | with torch.no_grad(): 328 | f0 = 
f0[:, None].transpose(1, 2) 329 | f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device) 330 | # fundamental component 331 | f0_buf[:, :, 0] = f0[:, :, 0] 332 | for idx in np.arange(self.harmonic_num): 333 | f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * ( 334 | idx + 2 335 | ) # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic 336 | rad_values = (f0_buf / self.sampling_rate) % 1 ###%1意味着n_har的乘积无法后处理优化 337 | rand_ini = torch.rand( 338 | f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device 339 | ) 340 | rand_ini[:, 0] = 0 341 | rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini 342 | tmp_over_one = torch.cumsum(rad_values, 1) # % 1 #####%1意味着后面的cumsum无法再优化 343 | tmp_over_one *= upp 344 | tmp_over_one = F.interpolate( 345 | tmp_over_one.transpose(2, 1), 346 | scale_factor=upp, 347 | mode="linear", 348 | align_corners=True, 349 | ).transpose(2, 1) 350 | rad_values = F.interpolate( 351 | rad_values.transpose(2, 1), scale_factor=upp, mode="nearest" 352 | ).transpose( 353 | 2, 1 354 | ) ####### 355 | tmp_over_one %= 1 356 | tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0 357 | cumsum_shift = torch.zeros_like(rad_values) 358 | cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0 359 | sine_waves = torch.sin( 360 | torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi 361 | ) 362 | sine_waves = sine_waves * self.sine_amp 363 | uv = self._f02uv(f0) 364 | uv = F.interpolate( 365 | uv.transpose(2, 1), scale_factor=upp, mode="nearest" 366 | ).transpose(2, 1) 367 | noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3 368 | noise = noise_amp * torch.randn_like(sine_waves) 369 | sine_waves = sine_waves * uv + noise 370 | return sine_waves, uv, noise 371 | 372 | 373 | class SourceModuleHnNSF(torch.nn.Module): 374 | """SourceModule for hn-nsf 375 | SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1, 376 | add_noise_std=0.003, voiced_threshod=0) 377 | sampling_rate: sampling_rate in Hz 378 | harmonic_num: number of harmonic above F0 (default: 0) 379 | sine_amp: amplitude of sine source signal (default: 0.1) 380 | add_noise_std: std of additive Gaussian noise (default: 0.003) 381 | note that amplitude of noise in unvoiced is decided 382 | by sine_amp 383 | voiced_threshold: threhold to set U/V given F0 (default: 0) 384 | Sine_source, noise_source = SourceModuleHnNSF(F0_sampled) 385 | F0_sampled (batchsize, length, 1) 386 | Sine_source (batchsize, length, 1) 387 | noise_source (batchsize, length 1) 388 | uv (batchsize, length, 1) 389 | """ 390 | 391 | def __init__( 392 | self, 393 | sampling_rate, 394 | harmonic_num=0, 395 | sine_amp=0.1, 396 | add_noise_std=0.003, 397 | voiced_threshod=0, 398 | is_half=True, 399 | ): 400 | super(SourceModuleHnNSF, self).__init__() 401 | 402 | self.sine_amp = sine_amp 403 | self.noise_std = add_noise_std 404 | self.is_half = is_half 405 | # to produce sine waveforms 406 | self.l_sin_gen = SineGen( 407 | sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod 408 | ) 409 | 410 | # to merge source harmonics into a single excitation 411 | self.l_linear = torch.nn.Linear(harmonic_num + 1, 1) 412 | self.l_tanh = torch.nn.Tanh() 413 | 414 | def forward(self, x, upp=None): 415 | sine_wavs, uv, _ = self.l_sin_gen(x, upp) 416 | if self.is_half: 417 | sine_wavs = sine_wavs.half() 418 | sine_merge = self.l_tanh(self.l_linear(sine_wavs)) 419 | return sine_merge, None, None # noise, uv 420 | 421 | 422 | class GeneratorNSF(torch.nn.Module): 423 | def __init__( 424 | self, 425 | initial_channel, 426 | 
resblock, 427 | resblock_kernel_sizes, 428 | resblock_dilation_sizes, 429 | upsample_rates, 430 | upsample_initial_channel, 431 | upsample_kernel_sizes, 432 | gin_channels, 433 | sr, 434 | is_half=False, 435 | ): 436 | super(GeneratorNSF, self).__init__() 437 | self.num_kernels = len(resblock_kernel_sizes) 438 | self.num_upsamples = len(upsample_rates) 439 | 440 | self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates)) 441 | self.m_source = SourceModuleHnNSF( 442 | sampling_rate=sr, harmonic_num=0, is_half=is_half 443 | ) 444 | self.noise_convs = nn.ModuleList() 445 | self.conv_pre = Conv1d( 446 | initial_channel, upsample_initial_channel, 7, 1, padding=3 447 | ) 448 | resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2 449 | 450 | self.ups = nn.ModuleList() 451 | for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): 452 | c_cur = upsample_initial_channel // (2 ** (i + 1)) 453 | self.ups.append( 454 | weight_norm( 455 | ConvTranspose1d( 456 | upsample_initial_channel // (2**i), 457 | upsample_initial_channel // (2 ** (i + 1)), 458 | k, 459 | u, 460 | padding=(k - u) // 2, 461 | ) 462 | ) 463 | ) 464 | if i + 1 < len(upsample_rates): 465 | stride_f0 = np.prod(upsample_rates[i + 1 :]) 466 | self.noise_convs.append( 467 | Conv1d( 468 | 1, 469 | c_cur, 470 | kernel_size=stride_f0 * 2, 471 | stride=stride_f0, 472 | padding=stride_f0 // 2, 473 | ) 474 | ) 475 | else: 476 | self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1)) 477 | 478 | self.resblocks = nn.ModuleList() 479 | for i in range(len(self.ups)): 480 | ch = upsample_initial_channel // (2 ** (i + 1)) 481 | for j, (k, d) in enumerate( 482 | zip(resblock_kernel_sizes, resblock_dilation_sizes) 483 | ): 484 | self.resblocks.append(resblock(ch, k, d)) 485 | 486 | self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) 487 | self.ups.apply(init_weights) 488 | 489 | if gin_channels != 0: 490 | self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) 491 | 492 | self.upp = np.prod(upsample_rates) 493 | 494 | def forward(self, x, f0, g=None): 495 | har_source, noi_source, uv = self.m_source(f0, self.upp) 496 | har_source = har_source.transpose(1, 2) 497 | x = self.conv_pre(x) 498 | if g is not None: 499 | x = x + self.cond(g) 500 | 501 | for i in range(self.num_upsamples): 502 | x = F.leaky_relu(x, modules.LRELU_SLOPE) 503 | x = self.ups[i](x) 504 | x_source = self.noise_convs[i](har_source) 505 | x = x + x_source 506 | xs = None 507 | for j in range(self.num_kernels): 508 | if xs is None: 509 | xs = self.resblocks[i * self.num_kernels + j](x) 510 | else: 511 | xs += self.resblocks[i * self.num_kernels + j](x) 512 | x = xs / self.num_kernels 513 | x = F.leaky_relu(x) 514 | x = self.conv_post(x) 515 | x = torch.tanh(x) 516 | return x 517 | 518 | def remove_weight_norm(self): 519 | for l in self.ups: 520 | remove_weight_norm(l) 521 | for l in self.resblocks: 522 | l.remove_weight_norm() 523 | 524 | 525 | sr2sr = { 526 | "32k": 32000, 527 | "40k": 40000, 528 | "48k": 48000, 529 | } 530 | 531 | 532 | class SynthesizerTrnMsNSFsidM(nn.Module): 533 | def __init__( 534 | self, 535 | spec_channels, 536 | segment_size, 537 | inter_channels, 538 | hidden_channels, 539 | filter_channels, 540 | n_heads, 541 | n_layers, 542 | kernel_size, 543 | p_dropout, 544 | resblock, 545 | resblock_kernel_sizes, 546 | resblock_dilation_sizes, 547 | upsample_rates, 548 | upsample_initial_channel, 549 | upsample_kernel_sizes, 550 | spk_embed_dim, 551 | gin_channels, 552 | sr, 553 | **kwargs 554 | ): 555 | 
super().__init__() 556 | if type(sr) == type("strr"): 557 | sr = sr2sr[sr] 558 | self.spec_channels = spec_channels 559 | self.inter_channels = inter_channels 560 | self.hidden_channels = hidden_channels 561 | self.filter_channels = filter_channels 562 | self.n_heads = n_heads 563 | self.n_layers = n_layers 564 | self.kernel_size = kernel_size 565 | self.p_dropout = p_dropout 566 | self.resblock = resblock 567 | self.resblock_kernel_sizes = resblock_kernel_sizes 568 | self.resblock_dilation_sizes = resblock_dilation_sizes 569 | self.upsample_rates = upsample_rates 570 | self.upsample_initial_channel = upsample_initial_channel 571 | self.upsample_kernel_sizes = upsample_kernel_sizes 572 | self.segment_size = segment_size 573 | self.gin_channels = gin_channels 574 | # self.hop_length = hop_length# 575 | self.spk_embed_dim = spk_embed_dim 576 | if self.gin_channels == 256: 577 | self.enc_p = TextEncoder256( 578 | inter_channels, 579 | hidden_channels, 580 | filter_channels, 581 | n_heads, 582 | n_layers, 583 | kernel_size, 584 | p_dropout, 585 | ) 586 | else: 587 | self.enc_p = TextEncoder768( 588 | inter_channels, 589 | hidden_channels, 590 | filter_channels, 591 | n_heads, 592 | n_layers, 593 | kernel_size, 594 | p_dropout, 595 | ) 596 | self.dec = GeneratorNSF( 597 | inter_channels, 598 | resblock, 599 | resblock_kernel_sizes, 600 | resblock_dilation_sizes, 601 | upsample_rates, 602 | upsample_initial_channel, 603 | upsample_kernel_sizes, 604 | gin_channels=gin_channels, 605 | sr=sr, 606 | is_half=kwargs["is_half"], 607 | ) 608 | self.enc_q = PosteriorEncoder( 609 | spec_channels, 610 | inter_channels, 611 | hidden_channels, 612 | 5, 613 | 1, 614 | 16, 615 | gin_channels=gin_channels, 616 | ) 617 | self.flow = ResidualCouplingBlock( 618 | inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels 619 | ) 620 | self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) 621 | self.speaker_map = None 622 | print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim) 623 | 624 | def remove_weight_norm(self): 625 | self.dec.remove_weight_norm() 626 | self.flow.remove_weight_norm() 627 | self.enc_q.remove_weight_norm() 628 | 629 | def construct_spkmixmap(self, n_speaker): 630 | self.speaker_map = torch.zeros((n_speaker, 1, 1, self.gin_channels)) 631 | for i in range(n_speaker): 632 | self.speaker_map[i] = self.emb_g(torch.LongTensor([[i]])) 633 | self.speaker_map = self.speaker_map.unsqueeze(0) 634 | 635 | def forward(self, phone, phone_lengths, pitch, nsff0, g, rnd, max_len=None): 636 | if self.speaker_map is not None: # [N, S] * [S, B, 1, H] 637 | g = g.reshape((g.shape[0], g.shape[1], 1, 1, 1)) # [N, S, B, 1, 1] 638 | g = g * self.speaker_map # [N, S, B, 1, H] 639 | g = torch.sum(g, dim=1) # [N, 1, B, 1, H] 640 | g = g.transpose(0, -1).transpose(0, -2).squeeze(0) # [B, H, N] 641 | else: 642 | g = g.unsqueeze(0) 643 | g = self.emb_g(g).transpose(1, 2) 644 | 645 | m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) 646 | z_p = (m_p + torch.exp(logs_p) * rnd) * x_mask 647 | z = self.flow(z_p, x_mask, g=g, reverse=True) 648 | o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g) 649 | return o 650 | 651 | 652 | class MultiPeriodDiscriminator(torch.nn.Module): 653 | def __init__(self, use_spectral_norm=False): 654 | super(MultiPeriodDiscriminator, self).__init__() 655 | periods = [2, 3, 5, 7, 11, 17] 656 | # periods = [3, 5, 7, 11, 17, 23, 37] 657 | 658 | discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)] 659 | discs = discs + [ 660 | 
DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods 661 | ] 662 | self.discriminators = nn.ModuleList(discs) 663 | 664 | def forward(self, y, y_hat): 665 | y_d_rs = [] # 666 | y_d_gs = [] 667 | fmap_rs = [] 668 | fmap_gs = [] 669 | for i, d in enumerate(self.discriminators): 670 | y_d_r, fmap_r = d(y) 671 | y_d_g, fmap_g = d(y_hat) 672 | # for j in range(len(fmap_r)): 673 | # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape) 674 | y_d_rs.append(y_d_r) 675 | y_d_gs.append(y_d_g) 676 | fmap_rs.append(fmap_r) 677 | fmap_gs.append(fmap_g) 678 | 679 | return y_d_rs, y_d_gs, fmap_rs, fmap_gs 680 | 681 | 682 | class MultiPeriodDiscriminatorV2(torch.nn.Module): 683 | def __init__(self, use_spectral_norm=False): 684 | super(MultiPeriodDiscriminatorV2, self).__init__() 685 | # periods = [2, 3, 5, 7, 11, 17] 686 | periods = [2, 3, 5, 7, 11, 17, 23, 37] 687 | 688 | discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)] 689 | discs = discs + [ 690 | DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods 691 | ] 692 | self.discriminators = nn.ModuleList(discs) 693 | 694 | def forward(self, y, y_hat): 695 | y_d_rs = [] # 696 | y_d_gs = [] 697 | fmap_rs = [] 698 | fmap_gs = [] 699 | for i, d in enumerate(self.discriminators): 700 | y_d_r, fmap_r = d(y) 701 | y_d_g, fmap_g = d(y_hat) 702 | # for j in range(len(fmap_r)): 703 | # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape) 704 | y_d_rs.append(y_d_r) 705 | y_d_gs.append(y_d_g) 706 | fmap_rs.append(fmap_r) 707 | fmap_gs.append(fmap_g) 708 | 709 | return y_d_rs, y_d_gs, fmap_rs, fmap_gs 710 | 711 | 712 | class DiscriminatorS(torch.nn.Module): 713 | def __init__(self, use_spectral_norm=False): 714 | super(DiscriminatorS, self).__init__() 715 | norm_f = weight_norm if use_spectral_norm == False else spectral_norm 716 | self.convs = nn.ModuleList( 717 | [ 718 | norm_f(Conv1d(1, 16, 15, 1, padding=7)), 719 | norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)), 720 | norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)), 721 | norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)), 722 | norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)), 723 | norm_f(Conv1d(1024, 1024, 5, 1, padding=2)), 724 | ] 725 | ) 726 | self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1)) 727 | 728 | def forward(self, x): 729 | fmap = [] 730 | 731 | for l in self.convs: 732 | x = l(x) 733 | x = F.leaky_relu(x, modules.LRELU_SLOPE) 734 | fmap.append(x) 735 | x = self.conv_post(x) 736 | fmap.append(x) 737 | x = torch.flatten(x, 1, -1) 738 | 739 | return x, fmap 740 | 741 | 742 | class DiscriminatorP(torch.nn.Module): 743 | def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False): 744 | super(DiscriminatorP, self).__init__() 745 | self.period = period 746 | self.use_spectral_norm = use_spectral_norm 747 | norm_f = weight_norm if use_spectral_norm == False else spectral_norm 748 | self.convs = nn.ModuleList( 749 | [ 750 | norm_f( 751 | Conv2d( 752 | 1, 753 | 32, 754 | (kernel_size, 1), 755 | (stride, 1), 756 | padding=(get_padding(kernel_size, 1), 0), 757 | ) 758 | ), 759 | norm_f( 760 | Conv2d( 761 | 32, 762 | 128, 763 | (kernel_size, 1), 764 | (stride, 1), 765 | padding=(get_padding(kernel_size, 1), 0), 766 | ) 767 | ), 768 | norm_f( 769 | Conv2d( 770 | 128, 771 | 512, 772 | (kernel_size, 1), 773 | (stride, 1), 774 | padding=(get_padding(kernel_size, 1), 0), 775 | ) 776 | ), 777 | norm_f( 778 | Conv2d( 779 | 512, 780 | 1024, 781 | (kernel_size, 1), 782 | (stride, 1), 783 | 
padding=(get_padding(kernel_size, 1), 0), 784 | ) 785 | ), 786 | norm_f( 787 | Conv2d( 788 | 1024, 789 | 1024, 790 | (kernel_size, 1), 791 | 1, 792 | padding=(get_padding(kernel_size, 1), 0), 793 | ) 794 | ), 795 | ] 796 | ) 797 | self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) 798 | 799 | def forward(self, x): 800 | fmap = [] 801 | 802 | # 1d to 2d 803 | b, c, t = x.shape 804 | if t % self.period != 0: # pad first 805 | n_pad = self.period - (t % self.period) 806 | x = F.pad(x, (0, n_pad), "reflect") 807 | t = t + n_pad 808 | x = x.view(b, c, t // self.period, self.period) 809 | 810 | for l in self.convs: 811 | x = l(x) 812 | x = F.leaky_relu(x, modules.LRELU_SLOPE) 813 | fmap.append(x) 814 | x = self.conv_post(x) 815 | fmap.append(x) 816 | x = torch.flatten(x, 1, -1) 817 | 818 | return x, fmap 819 | -------------------------------------------------------------------------------- /infer_pack/models_onnx_moess.py: -------------------------------------------------------------------------------- 1 | import math, pdb, os 2 | from time import time as ttime 3 | import torch 4 | from torch import nn 5 | from torch.nn import functional as F 6 | from infer_pack import modules 7 | from infer_pack import attentions 8 | from infer_pack import commons 9 | from infer_pack.commons import init_weights, get_padding 10 | from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d 11 | from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm 12 | from infer_pack.commons import init_weights 13 | import numpy as np 14 | from infer_pack import commons 15 | 16 | 17 | class TextEncoder256(nn.Module): 18 | def __init__( 19 | self, 20 | out_channels, 21 | hidden_channels, 22 | filter_channels, 23 | n_heads, 24 | n_layers, 25 | kernel_size, 26 | p_dropout, 27 | f0=True, 28 | ): 29 | super().__init__() 30 | self.out_channels = out_channels 31 | self.hidden_channels = hidden_channels 32 | self.filter_channels = filter_channels 33 | self.n_heads = n_heads 34 | self.n_layers = n_layers 35 | self.kernel_size = kernel_size 36 | self.p_dropout = p_dropout 37 | self.emb_phone = nn.Linear(256, hidden_channels) 38 | self.lrelu = nn.LeakyReLU(0.1, inplace=True) 39 | if f0 == True: 40 | self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256 41 | self.encoder = attentions.Encoder( 42 | hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout 43 | ) 44 | self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) 45 | 46 | def forward(self, phone, pitch, lengths): 47 | if pitch == None: 48 | x = self.emb_phone(phone) 49 | else: 50 | x = self.emb_phone(phone) + self.emb_pitch(pitch) 51 | x = x * math.sqrt(self.hidden_channels) # [b, t, h] 52 | x = self.lrelu(x) 53 | x = torch.transpose(x, 1, -1) # [b, h, t] 54 | x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to( 55 | x.dtype 56 | ) 57 | x = self.encoder(x * x_mask, x_mask) 58 | stats = self.proj(x) * x_mask 59 | 60 | m, logs = torch.split(stats, self.out_channels, dim=1) 61 | return m, logs, x_mask 62 | 63 | 64 | class TextEncoder256Sim(nn.Module): 65 | def __init__( 66 | self, 67 | out_channels, 68 | hidden_channels, 69 | filter_channels, 70 | n_heads, 71 | n_layers, 72 | kernel_size, 73 | p_dropout, 74 | f0=True, 75 | ): 76 | super().__init__() 77 | self.out_channels = out_channels 78 | self.hidden_channels = hidden_channels 79 | self.filter_channels = filter_channels 80 | self.n_heads = n_heads 81 | self.n_layers = n_layers 82 | self.kernel_size = kernel_size 83 | self.p_dropout = 
p_dropout 84 | self.emb_phone = nn.Linear(256, hidden_channels) 85 | self.lrelu = nn.LeakyReLU(0.1, inplace=True) 86 | if f0 == True: 87 | self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256 88 | self.encoder = attentions.Encoder( 89 | hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout 90 | ) 91 | self.proj = nn.Conv1d(hidden_channels, out_channels, 1) 92 | 93 | def forward(self, phone, pitch, lengths): 94 | if pitch == None: 95 | x = self.emb_phone(phone) 96 | else: 97 | x = self.emb_phone(phone) + self.emb_pitch(pitch) 98 | x = x * math.sqrt(self.hidden_channels) # [b, t, h] 99 | x = self.lrelu(x) 100 | x = torch.transpose(x, 1, -1) # [b, h, t] 101 | x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to( 102 | x.dtype 103 | ) 104 | x = self.encoder(x * x_mask, x_mask) 105 | x = self.proj(x) * x_mask 106 | return x, x_mask 107 | 108 | 109 | class ResidualCouplingBlock(nn.Module): 110 | def __init__( 111 | self, 112 | channels, 113 | hidden_channels, 114 | kernel_size, 115 | dilation_rate, 116 | n_layers, 117 | n_flows=4, 118 | gin_channels=0, 119 | ): 120 | super().__init__() 121 | self.channels = channels 122 | self.hidden_channels = hidden_channels 123 | self.kernel_size = kernel_size 124 | self.dilation_rate = dilation_rate 125 | self.n_layers = n_layers 126 | self.n_flows = n_flows 127 | self.gin_channels = gin_channels 128 | 129 | self.flows = nn.ModuleList() 130 | for i in range(n_flows): 131 | self.flows.append( 132 | modules.ResidualCouplingLayer( 133 | channels, 134 | hidden_channels, 135 | kernel_size, 136 | dilation_rate, 137 | n_layers, 138 | gin_channels=gin_channels, 139 | mean_only=True, 140 | ) 141 | ) 142 | self.flows.append(modules.Flip()) 143 | 144 | def forward(self, x, x_mask, g=None, reverse=False): 145 | if not reverse: 146 | for flow in self.flows: 147 | x, _ = flow(x, x_mask, g=g, reverse=reverse) 148 | else: 149 | for flow in reversed(self.flows): 150 | x = flow(x, x_mask, g=g, reverse=reverse) 151 | return x 152 | 153 | def remove_weight_norm(self): 154 | for i in range(self.n_flows): 155 | self.flows[i * 2].remove_weight_norm() 156 | 157 | 158 | class PosteriorEncoder(nn.Module): 159 | def __init__( 160 | self, 161 | in_channels, 162 | out_channels, 163 | hidden_channels, 164 | kernel_size, 165 | dilation_rate, 166 | n_layers, 167 | gin_channels=0, 168 | ): 169 | super().__init__() 170 | self.in_channels = in_channels 171 | self.out_channels = out_channels 172 | self.hidden_channels = hidden_channels 173 | self.kernel_size = kernel_size 174 | self.dilation_rate = dilation_rate 175 | self.n_layers = n_layers 176 | self.gin_channels = gin_channels 177 | 178 | self.pre = nn.Conv1d(in_channels, hidden_channels, 1) 179 | self.enc = modules.WN( 180 | hidden_channels, 181 | kernel_size, 182 | dilation_rate, 183 | n_layers, 184 | gin_channels=gin_channels, 185 | ) 186 | self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) 187 | 188 | def forward(self, x, x_lengths, g=None): 189 | x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to( 190 | x.dtype 191 | ) 192 | x = self.pre(x) * x_mask 193 | x = self.enc(x, x_mask, g=g) 194 | stats = self.proj(x) * x_mask 195 | m, logs = torch.split(stats, self.out_channels, dim=1) 196 | z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask 197 | return z, m, logs, x_mask 198 | 199 | def remove_weight_norm(self): 200 | self.enc.remove_weight_norm() 201 | 202 | 203 | class Generator(torch.nn.Module): 204 | def __init__( 205 | self, 206 | 
initial_channel, 207 | resblock, 208 | resblock_kernel_sizes, 209 | resblock_dilation_sizes, 210 | upsample_rates, 211 | upsample_initial_channel, 212 | upsample_kernel_sizes, 213 | gin_channels=0, 214 | ): 215 | super(Generator, self).__init__() 216 | self.num_kernels = len(resblock_kernel_sizes) 217 | self.num_upsamples = len(upsample_rates) 218 | self.conv_pre = Conv1d( 219 | initial_channel, upsample_initial_channel, 7, 1, padding=3 220 | ) 221 | resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2 222 | 223 | self.ups = nn.ModuleList() 224 | for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): 225 | self.ups.append( 226 | weight_norm( 227 | ConvTranspose1d( 228 | upsample_initial_channel // (2**i), 229 | upsample_initial_channel // (2 ** (i + 1)), 230 | k, 231 | u, 232 | padding=(k - u) // 2, 233 | ) 234 | ) 235 | ) 236 | 237 | self.resblocks = nn.ModuleList() 238 | for i in range(len(self.ups)): 239 | ch = upsample_initial_channel // (2 ** (i + 1)) 240 | for j, (k, d) in enumerate( 241 | zip(resblock_kernel_sizes, resblock_dilation_sizes) 242 | ): 243 | self.resblocks.append(resblock(ch, k, d)) 244 | 245 | self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) 246 | self.ups.apply(init_weights) 247 | 248 | if gin_channels != 0: 249 | self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) 250 | 251 | def forward(self, x, g=None): 252 | x = self.conv_pre(x) 253 | if g is not None: 254 | x = x + self.cond(g) 255 | 256 | for i in range(self.num_upsamples): 257 | x = F.leaky_relu(x, modules.LRELU_SLOPE) 258 | x = self.ups[i](x) 259 | xs = None 260 | for j in range(self.num_kernels): 261 | if xs is None: 262 | xs = self.resblocks[i * self.num_kernels + j](x) 263 | else: 264 | xs += self.resblocks[i * self.num_kernels + j](x) 265 | x = xs / self.num_kernels 266 | x = F.leaky_relu(x) 267 | x = self.conv_post(x) 268 | x = torch.tanh(x) 269 | 270 | return x 271 | 272 | def remove_weight_norm(self): 273 | for l in self.ups: 274 | remove_weight_norm(l) 275 | for l in self.resblocks: 276 | l.remove_weight_norm() 277 | 278 | 279 | class SineGen(torch.nn.Module): 280 | """Definition of sine generator 281 | SineGen(samp_rate, harmonic_num = 0, 282 | sine_amp = 0.1, noise_std = 0.003, 283 | voiced_threshold = 0, 284 | flag_for_pulse=False) 285 | samp_rate: sampling rate in Hz 286 | harmonic_num: number of harmonic overtones (default 0) 287 | sine_amp: amplitude of sine-wavefrom (default 0.1) 288 | noise_std: std of Gaussian noise (default 0.003) 289 | voiced_thoreshold: F0 threshold for U/V classification (default 0) 290 | flag_for_pulse: this SinGen is used inside PulseGen (default False) 291 | Note: when flag_for_pulse is True, the first time step of a voiced 292 | segment is always sin(np.pi) or cos(0) 293 | """ 294 | 295 | def __init__( 296 | self, 297 | samp_rate, 298 | harmonic_num=0, 299 | sine_amp=0.1, 300 | noise_std=0.003, 301 | voiced_threshold=0, 302 | flag_for_pulse=False, 303 | ): 304 | super(SineGen, self).__init__() 305 | self.sine_amp = sine_amp 306 | self.noise_std = noise_std 307 | self.harmonic_num = harmonic_num 308 | self.dim = self.harmonic_num + 1 309 | self.sampling_rate = samp_rate 310 | self.voiced_threshold = voiced_threshold 311 | 312 | def _f02uv(self, f0): 313 | # generate uv signal 314 | uv = torch.ones_like(f0) 315 | uv = uv * (f0 > self.voiced_threshold) 316 | return uv 317 | 318 | def forward(self, f0, upp): 319 | """sine_tensor, uv = forward(f0) 320 | input F0: tensor(batchsize=1, length, dim=1) 321 | f0 for 
unvoiced steps should be 0 322 | output sine_tensor: tensor(batchsize=1, length, dim) 323 | output uv: tensor(batchsize=1, length, 1) 324 | """ 325 | with torch.no_grad(): 326 | f0 = f0[:, None].transpose(1, 2) 327 | f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device) 328 | # fundamental component 329 | f0_buf[:, :, 0] = f0[:, :, 0] 330 | for idx in np.arange(self.harmonic_num): 331 | f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * ( 332 | idx + 2 333 | ) # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic 334 | rad_values = (f0_buf / self.sampling_rate) % 1 ###%1意味着n_har的乘积无法后处理优化 335 | rand_ini = torch.rand( 336 | f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device 337 | ) 338 | rand_ini[:, 0] = 0 339 | rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini 340 | tmp_over_one = torch.cumsum(rad_values, 1) # % 1 #####%1意味着后面的cumsum无法再优化 341 | tmp_over_one *= upp 342 | tmp_over_one = F.interpolate( 343 | tmp_over_one.transpose(2, 1), 344 | scale_factor=upp, 345 | mode="linear", 346 | align_corners=True, 347 | ).transpose(2, 1) 348 | rad_values = F.interpolate( 349 | rad_values.transpose(2, 1), scale_factor=upp, mode="nearest" 350 | ).transpose( 351 | 2, 1 352 | ) ####### 353 | tmp_over_one %= 1 354 | tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0 355 | cumsum_shift = torch.zeros_like(rad_values) 356 | cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0 357 | sine_waves = torch.sin( 358 | torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi 359 | ) 360 | sine_waves = sine_waves * self.sine_amp 361 | uv = self._f02uv(f0) 362 | uv = F.interpolate( 363 | uv.transpose(2, 1), scale_factor=upp, mode="nearest" 364 | ).transpose(2, 1) 365 | noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3 366 | noise = noise_amp * torch.randn_like(sine_waves) 367 | sine_waves = sine_waves * uv + noise 368 | return sine_waves, uv, noise 369 | 370 | 371 | class SourceModuleHnNSF(torch.nn.Module): 372 | """SourceModule for hn-nsf 373 | SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1, 374 | add_noise_std=0.003, voiced_threshod=0) 375 | sampling_rate: sampling_rate in Hz 376 | harmonic_num: number of harmonic above F0 (default: 0) 377 | sine_amp: amplitude of sine source signal (default: 0.1) 378 | add_noise_std: std of additive Gaussian noise (default: 0.003) 379 | note that amplitude of noise in unvoiced is decided 380 | by sine_amp 381 | voiced_threshold: threhold to set U/V given F0 (default: 0) 382 | Sine_source, noise_source = SourceModuleHnNSF(F0_sampled) 383 | F0_sampled (batchsize, length, 1) 384 | Sine_source (batchsize, length, 1) 385 | noise_source (batchsize, length 1) 386 | uv (batchsize, length, 1) 387 | """ 388 | 389 | def __init__( 390 | self, 391 | sampling_rate, 392 | harmonic_num=0, 393 | sine_amp=0.1, 394 | add_noise_std=0.003, 395 | voiced_threshod=0, 396 | is_half=True, 397 | ): 398 | super(SourceModuleHnNSF, self).__init__() 399 | 400 | self.sine_amp = sine_amp 401 | self.noise_std = add_noise_std 402 | self.is_half = is_half 403 | # to produce sine waveforms 404 | self.l_sin_gen = SineGen( 405 | sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod 406 | ) 407 | 408 | # to merge source harmonics into a single excitation 409 | self.l_linear = torch.nn.Linear(harmonic_num + 1, 1) 410 | self.l_tanh = torch.nn.Tanh() 411 | 412 | def forward(self, x, upp=None): 413 | sine_wavs, uv, _ = self.l_sin_gen(x, upp) 414 | if self.is_half: 415 | sine_wavs = sine_wavs.half() 416 | sine_merge = 
self.l_tanh(self.l_linear(sine_wavs)) 417 | return sine_merge, None, None # noise, uv 418 | 419 | 420 | class GeneratorNSF(torch.nn.Module): 421 | def __init__( 422 | self, 423 | initial_channel, 424 | resblock, 425 | resblock_kernel_sizes, 426 | resblock_dilation_sizes, 427 | upsample_rates, 428 | upsample_initial_channel, 429 | upsample_kernel_sizes, 430 | gin_channels, 431 | sr, 432 | is_half=False, 433 | ): 434 | super(GeneratorNSF, self).__init__() 435 | self.num_kernels = len(resblock_kernel_sizes) 436 | self.num_upsamples = len(upsample_rates) 437 | 438 | self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates)) 439 | self.m_source = SourceModuleHnNSF( 440 | sampling_rate=sr, harmonic_num=0, is_half=is_half 441 | ) 442 | self.noise_convs = nn.ModuleList() 443 | self.conv_pre = Conv1d( 444 | initial_channel, upsample_initial_channel, 7, 1, padding=3 445 | ) 446 | resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2 447 | 448 | self.ups = nn.ModuleList() 449 | for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): 450 | c_cur = upsample_initial_channel // (2 ** (i + 1)) 451 | self.ups.append( 452 | weight_norm( 453 | ConvTranspose1d( 454 | upsample_initial_channel // (2**i), 455 | upsample_initial_channel // (2 ** (i + 1)), 456 | k, 457 | u, 458 | padding=(k - u) // 2, 459 | ) 460 | ) 461 | ) 462 | if i + 1 < len(upsample_rates): 463 | stride_f0 = np.prod(upsample_rates[i + 1 :]) 464 | self.noise_convs.append( 465 | Conv1d( 466 | 1, 467 | c_cur, 468 | kernel_size=stride_f0 * 2, 469 | stride=stride_f0, 470 | padding=stride_f0 // 2, 471 | ) 472 | ) 473 | else: 474 | self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1)) 475 | 476 | self.resblocks = nn.ModuleList() 477 | for i in range(len(self.ups)): 478 | ch = upsample_initial_channel // (2 ** (i + 1)) 479 | for j, (k, d) in enumerate( 480 | zip(resblock_kernel_sizes, resblock_dilation_sizes) 481 | ): 482 | self.resblocks.append(resblock(ch, k, d)) 483 | 484 | self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) 485 | self.ups.apply(init_weights) 486 | 487 | if gin_channels != 0: 488 | self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) 489 | 490 | self.upp = np.prod(upsample_rates) 491 | 492 | def forward(self, x, f0, g=None): 493 | har_source, noi_source, uv = self.m_source(f0, self.upp) 494 | har_source = har_source.transpose(1, 2) 495 | x = self.conv_pre(x) 496 | if g is not None: 497 | x = x + self.cond(g) 498 | 499 | for i in range(self.num_upsamples): 500 | x = F.leaky_relu(x, modules.LRELU_SLOPE) 501 | x = self.ups[i](x) 502 | x_source = self.noise_convs[i](har_source) 503 | x = x + x_source 504 | xs = None 505 | for j in range(self.num_kernels): 506 | if xs is None: 507 | xs = self.resblocks[i * self.num_kernels + j](x) 508 | else: 509 | xs += self.resblocks[i * self.num_kernels + j](x) 510 | x = xs / self.num_kernels 511 | x = F.leaky_relu(x) 512 | x = self.conv_post(x) 513 | x = torch.tanh(x) 514 | return x 515 | 516 | def remove_weight_norm(self): 517 | for l in self.ups: 518 | remove_weight_norm(l) 519 | for l in self.resblocks: 520 | l.remove_weight_norm() 521 | 522 | 523 | sr2sr = { 524 | "32k": 32000, 525 | "40k": 40000, 526 | "48k": 48000, 527 | } 528 | 529 | 530 | class SynthesizerTrnMs256NSFsidM(nn.Module): 531 | def __init__( 532 | self, 533 | spec_channels, 534 | segment_size, 535 | inter_channels, 536 | hidden_channels, 537 | filter_channels, 538 | n_heads, 539 | n_layers, 540 | kernel_size, 541 | p_dropout, 542 | resblock, 543 | 
resblock_kernel_sizes, 544 | resblock_dilation_sizes, 545 | upsample_rates, 546 | upsample_initial_channel, 547 | upsample_kernel_sizes, 548 | spk_embed_dim, 549 | gin_channels, 550 | sr, 551 | **kwargs 552 | ): 553 | super().__init__() 554 | if type(sr) == type("strr"): 555 | sr = sr2sr[sr] 556 | self.spec_channels = spec_channels 557 | self.inter_channels = inter_channels 558 | self.hidden_channels = hidden_channels 559 | self.filter_channels = filter_channels 560 | self.n_heads = n_heads 561 | self.n_layers = n_layers 562 | self.kernel_size = kernel_size 563 | self.p_dropout = p_dropout 564 | self.resblock = resblock 565 | self.resblock_kernel_sizes = resblock_kernel_sizes 566 | self.resblock_dilation_sizes = resblock_dilation_sizes 567 | self.upsample_rates = upsample_rates 568 | self.upsample_initial_channel = upsample_initial_channel 569 | self.upsample_kernel_sizes = upsample_kernel_sizes 570 | self.segment_size = segment_size 571 | self.gin_channels = gin_channels 572 | # self.hop_length = hop_length# 573 | self.spk_embed_dim = spk_embed_dim 574 | self.enc_p = TextEncoder256( 575 | inter_channels, 576 | hidden_channels, 577 | filter_channels, 578 | n_heads, 579 | n_layers, 580 | kernel_size, 581 | p_dropout, 582 | ) 583 | self.dec = GeneratorNSF( 584 | inter_channels, 585 | resblock, 586 | resblock_kernel_sizes, 587 | resblock_dilation_sizes, 588 | upsample_rates, 589 | upsample_initial_channel, 590 | upsample_kernel_sizes, 591 | gin_channels=gin_channels, 592 | sr=sr, 593 | is_half=kwargs["is_half"], 594 | ) 595 | self.enc_q = PosteriorEncoder( 596 | spec_channels, 597 | inter_channels, 598 | hidden_channels, 599 | 5, 600 | 1, 601 | 16, 602 | gin_channels=gin_channels, 603 | ) 604 | self.flow = ResidualCouplingBlock( 605 | inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels 606 | ) 607 | self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) 608 | print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim) 609 | 610 | def remove_weight_norm(self): 611 | self.dec.remove_weight_norm() 612 | self.flow.remove_weight_norm() 613 | self.enc_q.remove_weight_norm() 614 | 615 | def forward(self, phone, phone_lengths, pitch, nsff0, sid, rnd, max_len=None): 616 | g = self.emb_g(sid).unsqueeze(-1) 617 | m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) 618 | z_p = (m_p + torch.exp(logs_p) * rnd) * x_mask 619 | z = self.flow(z_p, x_mask, g=g, reverse=True) 620 | o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g) 621 | return o 622 | 623 | 624 | class SynthesizerTrnMs256NSFsid_sim(nn.Module): 625 | """ 626 | Synthesizer for Training 627 | """ 628 | 629 | def __init__( 630 | self, 631 | spec_channels, 632 | segment_size, 633 | inter_channels, 634 | hidden_channels, 635 | filter_channels, 636 | n_heads, 637 | n_layers, 638 | kernel_size, 639 | p_dropout, 640 | resblock, 641 | resblock_kernel_sizes, 642 | resblock_dilation_sizes, 643 | upsample_rates, 644 | upsample_initial_channel, 645 | upsample_kernel_sizes, 646 | spk_embed_dim, 647 | # hop_length, 648 | gin_channels=0, 649 | use_sdp=True, 650 | **kwargs 651 | ): 652 | super().__init__() 653 | self.spec_channels = spec_channels 654 | self.inter_channels = inter_channels 655 | self.hidden_channels = hidden_channels 656 | self.filter_channels = filter_channels 657 | self.n_heads = n_heads 658 | self.n_layers = n_layers 659 | self.kernel_size = kernel_size 660 | self.p_dropout = p_dropout 661 | self.resblock = resblock 662 | self.resblock_kernel_sizes = resblock_kernel_sizes 663 | 
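# ---- annotation (editor's note, not part of the original file) ------------
# This "_sim" class is the simplified export variant: TextEncoder256Sim
# projects straight to features (there is no (m, logs) prior to sample
# from), so forward() below is fully deterministic: encoder output ->
# reverse flow -> NSF decoder. Two defects in the lines that follow are
# flagged there in comments: the GeneratorNSF(...) call omits the `sr`
# argument that GeneratorNSF requires, and remove_weight_norm() referenced
# an enc_q that this class never constructs.
# ----------------------------------------------------------------------------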
self.resblock_dilation_sizes = resblock_dilation_sizes
664 |         self.upsample_rates = upsample_rates
665 |         self.upsample_initial_channel = upsample_initial_channel
666 |         self.upsample_kernel_sizes = upsample_kernel_sizes
667 |         self.segment_size = segment_size
668 |         self.gin_channels = gin_channels
669 |         # self.hop_length = hop_length#
670 |         self.spk_embed_dim = spk_embed_dim
671 |         self.enc_p = TextEncoder256Sim(
672 |             inter_channels,
673 |             hidden_channels,
674 |             filter_channels,
675 |             n_heads,
676 |             n_layers,
677 |             kernel_size,
678 |             p_dropout,
679 |         )
680 |         self.dec = GeneratorNSF(
681 |             inter_channels,
682 |             resblock,
683 |             resblock_kernel_sizes,
684 |             resblock_dilation_sizes,
685 |             upsample_rates,
686 |             upsample_initial_channel,
687 |             upsample_kernel_sizes,
688 |             gin_channels=gin_channels,
689 |             is_half=kwargs["is_half"],  # caution: GeneratorNSF also requires an `sr` argument, which is not passed here
690 |         )
691 | 
692 |         self.flow = ResidualCouplingBlock(
693 |             inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
694 |         )
695 |         self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
696 |         print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
697 | 
698 |     def remove_weight_norm(self):
699 |         self.dec.remove_weight_norm()
700 |         self.flow.remove_weight_norm()
701 |         # this "_sim" variant builds no enc_q, so there is nothing further to strip here
702 | 
703 |     def forward(
704 |         self, phone, phone_lengths, pitch, pitchf, ds, max_len=None
705 |     ):  # y (the spectrogram) is no longer needed here
706 |         g = self.emb_g(ds.unsqueeze(0)).unsqueeze(-1)  # [b, 256, 1]; the trailing 1 is the time axis, broadcast over t
707 |         x, x_mask = self.enc_p(phone, pitch, phone_lengths)
708 |         x = self.flow(x, x_mask, g=g, reverse=True)
709 |         o = self.dec((x * x_mask)[:, :, :max_len], pitchf, g=g)
710 |         return o
711 | 
712 | 
713 | class MultiPeriodDiscriminator(torch.nn.Module):
714 |     def __init__(self, use_spectral_norm=False):
715 |         super(MultiPeriodDiscriminator, self).__init__()
716 |         periods = [2, 3, 5, 7, 11, 17]
717 |         # periods = [3, 5, 7, 11, 17, 23, 37]
718 | 
719 |         discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
720 |         discs = discs + [
721 |             DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods
722 |         ]
723 |         self.discriminators = nn.ModuleList(discs)
724 | 
725 |     def forward(self, y, y_hat):
726 |         y_d_rs = []
727 |         y_d_gs = []
728 |         fmap_rs = []
729 |         fmap_gs = []
730 |         for i, d in enumerate(self.discriminators):
731 |             y_d_r, fmap_r = d(y)
732 |             y_d_g, fmap_g = d(y_hat)
733 |             # for j in range(len(fmap_r)):
734 |             #     print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape)
735 |             y_d_rs.append(y_d_r)
736 |             y_d_gs.append(y_d_g)
737 |             fmap_rs.append(fmap_r)
738 |             fmap_gs.append(fmap_g)
739 | 
740 |         return y_d_rs, y_d_gs, fmap_rs, fmap_gs
741 | 
742 | 
743 | class DiscriminatorS(torch.nn.Module):
744 |     def __init__(self, use_spectral_norm=False):
745 |         super(DiscriminatorS, self).__init__()
746 |         norm_f = spectral_norm if use_spectral_norm else weight_norm
747 |         self.convs = nn.ModuleList(
748 |             [
749 |                 norm_f(Conv1d(1, 16, 15, 1, padding=7)),
750 |                 norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),
751 |                 norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)),
752 |                 norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
753 |                 norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
754 |                 norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
755 |             ]
756 |         )
757 |         self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
758 | 
759 |     def forward(self, x):
760 |         fmap = []
761 | 
762 |         for l in self.convs:
763 |             x = l(x)
764 |             x = F.leaky_relu(x, modules.LRELU_SLOPE)
765 |             fmap.append(x)
766 |         x = self.conv_post(x)
767 |         fmap.append(x)
768 |         x
= torch.flatten(x, 1, -1) 769 | 770 | return x, fmap 771 | 772 | 773 | class DiscriminatorP(torch.nn.Module): 774 | def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False): 775 | super(DiscriminatorP, self).__init__() 776 | self.period = period 777 | self.use_spectral_norm = use_spectral_norm 778 | norm_f = weight_norm if use_spectral_norm == False else spectral_norm 779 | self.convs = nn.ModuleList( 780 | [ 781 | norm_f( 782 | Conv2d( 783 | 1, 784 | 32, 785 | (kernel_size, 1), 786 | (stride, 1), 787 | padding=(get_padding(kernel_size, 1), 0), 788 | ) 789 | ), 790 | norm_f( 791 | Conv2d( 792 | 32, 793 | 128, 794 | (kernel_size, 1), 795 | (stride, 1), 796 | padding=(get_padding(kernel_size, 1), 0), 797 | ) 798 | ), 799 | norm_f( 800 | Conv2d( 801 | 128, 802 | 512, 803 | (kernel_size, 1), 804 | (stride, 1), 805 | padding=(get_padding(kernel_size, 1), 0), 806 | ) 807 | ), 808 | norm_f( 809 | Conv2d( 810 | 512, 811 | 1024, 812 | (kernel_size, 1), 813 | (stride, 1), 814 | padding=(get_padding(kernel_size, 1), 0), 815 | ) 816 | ), 817 | norm_f( 818 | Conv2d( 819 | 1024, 820 | 1024, 821 | (kernel_size, 1), 822 | 1, 823 | padding=(get_padding(kernel_size, 1), 0), 824 | ) 825 | ), 826 | ] 827 | ) 828 | self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) 829 | 830 | def forward(self, x): 831 | fmap = [] 832 | 833 | # 1d to 2d 834 | b, c, t = x.shape 835 | if t % self.period != 0: # pad first 836 | n_pad = self.period - (t % self.period) 837 | x = F.pad(x, (0, n_pad), "reflect") 838 | t = t + n_pad 839 | x = x.view(b, c, t // self.period, self.period) 840 | 841 | for l in self.convs: 842 | x = l(x) 843 | x = F.leaky_relu(x, modules.LRELU_SLOPE) 844 | fmap.append(x) 845 | x = self.conv_post(x) 846 | fmap.append(x) 847 | x = torch.flatten(x, 1, -1) 848 | 849 | return x, fmap 850 | -------------------------------------------------------------------------------- /infer_pack/modelsv2.py: -------------------------------------------------------------------------------- 1 | import math, pdb, os 2 | from time import time as ttime 3 | import torch 4 | from torch import nn 5 | from torch.nn import functional as F 6 | from infer_pack import modules 7 | from infer_pack import attentions 8 | from infer_pack import commons 9 | from infer_pack.commons import init_weights, get_padding 10 | from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d 11 | from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm 12 | from infer_pack.commons import init_weights 13 | import numpy as np 14 | from infer_pack import commons 15 | 16 | 17 | class TextEncoder256(nn.Module): 18 | def __init__( 19 | self, 20 | out_channels, 21 | hidden_channels, 22 | filter_channels, 23 | n_heads, 24 | n_layers, 25 | kernel_size, 26 | p_dropout, 27 | f0=True, 28 | ): 29 | super().__init__() 30 | self.out_channels = out_channels 31 | self.hidden_channels = hidden_channels 32 | self.filter_channels = filter_channels 33 | self.n_heads = n_heads 34 | self.n_layers = n_layers 35 | self.kernel_size = kernel_size 36 | self.p_dropout = p_dropout 37 | self.emb_phone = nn.Linear(256, hidden_channels) 38 | self.lrelu = nn.LeakyReLU(0.1, inplace=True) 39 | if f0 == True: 40 | self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256 41 | self.encoder = attentions.Encoder( 42 | hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout 43 | ) 44 | self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) 45 | 46 | def forward(self, phone, pitch, lengths): 47 | if pitch == None: 
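# [annotation] Shape walk-through, under the usual conventions of this file:
# phone is [b, t, 256] HuBERT-style features, pitch is [b, t] integer bins
# (or None when f0=False), lengths is [b]. Phone and pitch embeddings are
# summed, scaled by sqrt(hidden_channels), run through the attention
# encoder, then projected to 2 * out_channels and split into the prior
# mean m and log-std logs, each [b, out_channels, t]. (`pitch is None`
# would be the idiomatic test below.)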
48 | x = self.emb_phone(phone) 49 | else: 50 | x = self.emb_phone(phone) + self.emb_pitch(pitch) 51 | x = x * math.sqrt(self.hidden_channels) # [b, t, h] 52 | x = self.lrelu(x) 53 | x = torch.transpose(x, 1, -1) # [b, h, t] 54 | x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to( 55 | x.dtype 56 | ) 57 | x = self.encoder(x * x_mask, x_mask) 58 | stats = self.proj(x) * x_mask 59 | 60 | m, logs = torch.split(stats, self.out_channels, dim=1) 61 | return m, logs, x_mask 62 | class TextEncoder768(nn.Module): 63 | def __init__( 64 | self, 65 | out_channels, 66 | hidden_channels, 67 | filter_channels, 68 | n_heads, 69 | n_layers, 70 | kernel_size, 71 | p_dropout, 72 | f0=True, 73 | ): 74 | super().__init__() 75 | self.out_channels = out_channels 76 | self.hidden_channels = hidden_channels 77 | self.filter_channels = filter_channels 78 | self.n_heads = n_heads 79 | self.n_layers = n_layers 80 | self.kernel_size = kernel_size 81 | self.p_dropout = p_dropout 82 | self.emb_phone = nn.Linear(768, hidden_channels) 83 | self.lrelu = nn.LeakyReLU(0.1, inplace=True) 84 | if f0 == True: 85 | self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256 86 | self.encoder = attentions.Encoder( 87 | hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout 88 | ) 89 | self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) 90 | 91 | def forward(self, phone, pitch, lengths): 92 | if pitch == None: 93 | x = self.emb_phone(phone) 94 | else: 95 | x = self.emb_phone(phone) + self.emb_pitch(pitch) 96 | x = x * math.sqrt(self.hidden_channels) # [b, t, h] 97 | x = self.lrelu(x) 98 | x = torch.transpose(x, 1, -1) # [b, h, t] 99 | x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to( 100 | x.dtype 101 | ) 102 | x = self.encoder(x * x_mask, x_mask) 103 | stats = self.proj(x) * x_mask 104 | 105 | m, logs = torch.split(stats, self.out_channels, dim=1) 106 | return m, logs, x_mask 107 | 108 | class ResidualCouplingBlock(nn.Module): 109 | def __init__( 110 | self, 111 | channels, 112 | hidden_channels, 113 | kernel_size, 114 | dilation_rate, 115 | n_layers, 116 | n_flows=4, 117 | gin_channels=0, 118 | ): 119 | super().__init__() 120 | self.channels = channels 121 | self.hidden_channels = hidden_channels 122 | self.kernel_size = kernel_size 123 | self.dilation_rate = dilation_rate 124 | self.n_layers = n_layers 125 | self.n_flows = n_flows 126 | self.gin_channels = gin_channels 127 | 128 | self.flows = nn.ModuleList() 129 | for i in range(n_flows): 130 | self.flows.append( 131 | modules.ResidualCouplingLayer( 132 | channels, 133 | hidden_channels, 134 | kernel_size, 135 | dilation_rate, 136 | n_layers, 137 | gin_channels=gin_channels, 138 | mean_only=True, 139 | ) 140 | ) 141 | self.flows.append(modules.Flip()) 142 | 143 | def forward(self, x, x_mask, g=None, reverse=False): 144 | if not reverse: 145 | for flow in self.flows: 146 | x, _ = flow(x, x_mask, g=g, reverse=reverse) 147 | else: 148 | for flow in reversed(self.flows): 149 | x = flow(x, x_mask, g=g, reverse=reverse) 150 | return x 151 | 152 | def remove_weight_norm(self): 153 | for i in range(self.n_flows): 154 | self.flows[i * 2].remove_weight_norm() 155 | 156 | 157 | class PosteriorEncoder(nn.Module): 158 | def __init__( 159 | self, 160 | in_channels, 161 | out_channels, 162 | hidden_channels, 163 | kernel_size, 164 | dilation_rate, 165 | n_layers, 166 | gin_channels=0, 167 | ): 168 | super().__init__() 169 | self.in_channels = in_channels 170 | self.out_channels = out_channels 171 | 
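# [annotation] PosteriorEncoder implements q(z|y): a 1x1 pre-projection,
# a WaveNet stack (modules.WN, optionally conditioned on g), and a 1x1
# projection to 2 * out_channels split into (m, logs); forward() returns
# the reparameterized sample z = (m + randn_like(m) * exp(logs)) * mask.
# Hedged usage sketch (names assumed):
#   z, m_q, logs_q, y_mask = enc_q(spec, spec_lengths, g=g)
# with spec shaped [b, in_channels, t_spec].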
self.hidden_channels = hidden_channels 172 | self.kernel_size = kernel_size 173 | self.dilation_rate = dilation_rate 174 | self.n_layers = n_layers 175 | self.gin_channels = gin_channels 176 | 177 | self.pre = nn.Conv1d(in_channels, hidden_channels, 1) 178 | self.enc = modules.WN( 179 | hidden_channels, 180 | kernel_size, 181 | dilation_rate, 182 | n_layers, 183 | gin_channels=gin_channels, 184 | ) 185 | self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) 186 | 187 | def forward(self, x, x_lengths, g=None): 188 | x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to( 189 | x.dtype 190 | ) 191 | x = self.pre(x) * x_mask 192 | x = self.enc(x, x_mask, g=g) 193 | stats = self.proj(x) * x_mask 194 | m, logs = torch.split(stats, self.out_channels, dim=1) 195 | z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask 196 | return z, m, logs, x_mask 197 | 198 | def remove_weight_norm(self): 199 | self.enc.remove_weight_norm() 200 | 201 | 202 | class Generator(torch.nn.Module): 203 | def __init__( 204 | self, 205 | initial_channel, 206 | resblock, 207 | resblock_kernel_sizes, 208 | resblock_dilation_sizes, 209 | upsample_rates, 210 | upsample_initial_channel, 211 | upsample_kernel_sizes, 212 | gin_channels=0, 213 | ): 214 | super(Generator, self).__init__() 215 | self.num_kernels = len(resblock_kernel_sizes) 216 | self.num_upsamples = len(upsample_rates) 217 | self.conv_pre = Conv1d( 218 | initial_channel, upsample_initial_channel, 7, 1, padding=3 219 | ) 220 | resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2 221 | 222 | self.ups = nn.ModuleList() 223 | for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): 224 | self.ups.append( 225 | weight_norm( 226 | ConvTranspose1d( 227 | upsample_initial_channel // (2**i), 228 | upsample_initial_channel // (2 ** (i + 1)), 229 | k, 230 | u, 231 | padding=(k - u) // 2, 232 | ) 233 | ) 234 | ) 235 | 236 | self.resblocks = nn.ModuleList() 237 | for i in range(len(self.ups)): 238 | ch = upsample_initial_channel // (2 ** (i + 1)) 239 | for j, (k, d) in enumerate( 240 | zip(resblock_kernel_sizes, resblock_dilation_sizes) 241 | ): 242 | self.resblocks.append(resblock(ch, k, d)) 243 | 244 | self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) 245 | self.ups.apply(init_weights) 246 | 247 | if gin_channels != 0: 248 | self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) 249 | 250 | def forward(self, x, g=None): 251 | x = self.conv_pre(x) 252 | if g is not None: 253 | x = x + self.cond(g) 254 | 255 | for i in range(self.num_upsamples): 256 | x = F.leaky_relu(x, modules.LRELU_SLOPE) 257 | x = self.ups[i](x) 258 | xs = None 259 | for j in range(self.num_kernels): 260 | if xs is None: 261 | xs = self.resblocks[i * self.num_kernels + j](x) 262 | else: 263 | xs += self.resblocks[i * self.num_kernels + j](x) 264 | x = xs / self.num_kernels 265 | x = F.leaky_relu(x) 266 | x = self.conv_post(x) 267 | x = torch.tanh(x) 268 | 269 | return x 270 | 271 | def remove_weight_norm(self): 272 | for l in self.ups: 273 | remove_weight_norm(l) 274 | for l in self.resblocks: 275 | l.remove_weight_norm() 276 | 277 | 278 | class SineGen(torch.nn.Module): 279 | """Definition of sine generator 280 | SineGen(samp_rate, harmonic_num = 0, 281 | sine_amp = 0.1, noise_std = 0.003, 282 | voiced_threshold = 0, 283 | flag_for_pulse=False) 284 | samp_rate: sampling rate in Hz 285 | harmonic_num: number of harmonic overtones (default 0) 286 | sine_amp: amplitude of sine-wavefrom (default 0.1) 287 | noise_std: std of 
Gaussian noise (default 0.003) 288 | voiced_thoreshold: F0 threshold for U/V classification (default 0) 289 | flag_for_pulse: this SinGen is used inside PulseGen (default False) 290 | Note: when flag_for_pulse is True, the first time step of a voiced 291 | segment is always sin(np.pi) or cos(0) 292 | """ 293 | 294 | def __init__( 295 | self, 296 | samp_rate, 297 | harmonic_num=0, 298 | sine_amp=0.1, 299 | noise_std=0.003, 300 | voiced_threshold=0, 301 | flag_for_pulse=False, 302 | ): 303 | super(SineGen, self).__init__() 304 | self.sine_amp = sine_amp 305 | self.noise_std = noise_std 306 | self.harmonic_num = harmonic_num 307 | self.dim = self.harmonic_num + 1 308 | self.sampling_rate = samp_rate 309 | self.voiced_threshold = voiced_threshold 310 | 311 | def _f02uv(self, f0): 312 | # generate uv signal 313 | uv = torch.ones_like(f0) 314 | uv = uv * (f0 > self.voiced_threshold) 315 | return uv 316 | 317 | def forward(self, f0, upp): 318 | """sine_tensor, uv = forward(f0) 319 | input F0: tensor(batchsize=1, length, dim=1) 320 | f0 for unvoiced steps should be 0 321 | output sine_tensor: tensor(batchsize=1, length, dim) 322 | output uv: tensor(batchsize=1, length, 1) 323 | """ 324 | with torch.no_grad(): 325 | f0 = f0[:, None].transpose(1, 2) 326 | f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device) 327 | # fundamental component 328 | f0_buf[:, :, 0] = f0[:, :, 0] 329 | for idx in np.arange(self.harmonic_num): 330 | f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * ( 331 | idx + 2 332 | ) # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic 333 | rad_values = (f0_buf / self.sampling_rate) % 1 ###%1意味着n_har的乘积无法后处理优化 334 | rand_ini = torch.rand( 335 | f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device 336 | ) 337 | rand_ini[:, 0] = 0 338 | rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini 339 | tmp_over_one = torch.cumsum(rad_values, 1) # % 1 #####%1意味着后面的cumsum无法再优化 340 | tmp_over_one *= upp 341 | tmp_over_one = F.interpolate( 342 | tmp_over_one.transpose(2, 1), 343 | scale_factor=upp, 344 | mode="linear", 345 | align_corners=True, 346 | ).transpose(2, 1) 347 | rad_values = F.interpolate( 348 | rad_values.transpose(2, 1), scale_factor=upp, mode="nearest" 349 | ).transpose( 350 | 2, 1 351 | ) ####### 352 | tmp_over_one %= 1 353 | tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0 354 | cumsum_shift = torch.zeros_like(rad_values) 355 | cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0 356 | sine_waves = torch.sin( 357 | torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi 358 | ) 359 | sine_waves = sine_waves * self.sine_amp 360 | uv = self._f02uv(f0) 361 | uv = F.interpolate( 362 | uv.transpose(2, 1), scale_factor=upp, mode="nearest" 363 | ).transpose(2, 1) 364 | noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3 365 | noise = noise_amp * torch.randn_like(sine_waves) 366 | sine_waves = sine_waves * uv + noise 367 | return sine_waves, uv, noise 368 | 369 | 370 | class SourceModuleHnNSF(torch.nn.Module): 371 | """SourceModule for hn-nsf 372 | SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1, 373 | add_noise_std=0.003, voiced_threshod=0) 374 | sampling_rate: sampling_rate in Hz 375 | harmonic_num: number of harmonic above F0 (default: 0) 376 | sine_amp: amplitude of sine source signal (default: 0.1) 377 | add_noise_std: std of additive Gaussian noise (default: 0.003) 378 | note that amplitude of noise in unvoiced is decided 379 | by sine_amp 380 | voiced_threshold: threhold to set U/V given F0 (default: 0) 381 | Sine_source, 
noise_source = SourceModuleHnNSF(F0_sampled) 382 | F0_sampled (batchsize, length, 1) 383 | Sine_source (batchsize, length, 1) 384 | noise_source (batchsize, length 1) 385 | uv (batchsize, length, 1) 386 | """ 387 | 388 | def __init__( 389 | self, 390 | sampling_rate, 391 | harmonic_num=0, 392 | sine_amp=0.1, 393 | add_noise_std=0.003, 394 | voiced_threshod=0, 395 | is_half=True, 396 | ): 397 | super(SourceModuleHnNSF, self).__init__() 398 | 399 | self.sine_amp = sine_amp 400 | self.noise_std = add_noise_std 401 | self.is_half = is_half 402 | # to produce sine waveforms 403 | self.l_sin_gen = SineGen( 404 | sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod 405 | ) 406 | 407 | # to merge source harmonics into a single excitation 408 | self.l_linear = torch.nn.Linear(harmonic_num + 1, 1) 409 | self.l_tanh = torch.nn.Tanh() 410 | 411 | def forward(self, x, upp=None): 412 | sine_wavs, uv, _ = self.l_sin_gen(x, upp) 413 | if self.is_half: 414 | sine_wavs = sine_wavs.half() 415 | sine_merge = self.l_tanh(self.l_linear(sine_wavs)) 416 | return sine_merge, None, None # noise, uv 417 | 418 | 419 | class GeneratorNSF(torch.nn.Module): 420 | def __init__( 421 | self, 422 | initial_channel, 423 | resblock, 424 | resblock_kernel_sizes, 425 | resblock_dilation_sizes, 426 | upsample_rates, 427 | upsample_initial_channel, 428 | upsample_kernel_sizes, 429 | gin_channels, 430 | sr, 431 | is_half=False, 432 | ): 433 | super(GeneratorNSF, self).__init__() 434 | self.num_kernels = len(resblock_kernel_sizes) 435 | self.num_upsamples = len(upsample_rates) 436 | 437 | self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates)) 438 | self.m_source = SourceModuleHnNSF( 439 | sampling_rate=sr, harmonic_num=0, is_half=is_half 440 | ) 441 | self.noise_convs = nn.ModuleList() 442 | self.conv_pre = Conv1d( 443 | initial_channel, upsample_initial_channel, 7, 1, padding=3 444 | ) 445 | resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2 446 | 447 | self.ups = nn.ModuleList() 448 | for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): 449 | c_cur = upsample_initial_channel // (2 ** (i + 1)) 450 | self.ups.append( 451 | weight_norm( 452 | ConvTranspose1d( 453 | upsample_initial_channel // (2**i), 454 | upsample_initial_channel // (2 ** (i + 1)), 455 | k, 456 | u, 457 | padding=(k - u) // 2, 458 | ) 459 | ) 460 | ) 461 | if i + 1 < len(upsample_rates): 462 | stride_f0 = np.prod(upsample_rates[i + 1 :]) 463 | self.noise_convs.append( 464 | Conv1d( 465 | 1, 466 | c_cur, 467 | kernel_size=stride_f0 * 2, 468 | stride=stride_f0, 469 | padding=stride_f0 // 2, 470 | ) 471 | ) 472 | else: 473 | self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1)) 474 | 475 | self.resblocks = nn.ModuleList() 476 | for i in range(len(self.ups)): 477 | ch = upsample_initial_channel // (2 ** (i + 1)) 478 | for j, (k, d) in enumerate( 479 | zip(resblock_kernel_sizes, resblock_dilation_sizes) 480 | ): 481 | self.resblocks.append(resblock(ch, k, d)) 482 | 483 | self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) 484 | self.ups.apply(init_weights) 485 | 486 | if gin_channels != 0: 487 | self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) 488 | 489 | self.upp = np.prod(upsample_rates) 490 | 491 | def forward(self, x, f0, g=None): 492 | har_source, noi_source, uv = self.m_source(f0, self.upp) 493 | har_source = har_source.transpose(1, 2) 494 | x = self.conv_pre(x) 495 | if g is not None: 496 | x = x + self.cond(g) 497 | 498 | for i in range(self.num_upsamples): 499 | 
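# [annotation] Per upsampling stage i, the channel width is
# upsample_initial_channel // 2**(i + 1). har_source was synthesized at the
# final audio rate (upp = prod(upsample_rates)); noise_convs[i] has stride
# prod(upsample_rates[i + 1:]) (kernel_size 1 on the last stage), which
# downsamples the harmonic excitation to exactly this stage's resolution
# before it is added in. Each stage then averages its num_kernels ResBlocks.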
x = F.leaky_relu(x, modules.LRELU_SLOPE) 500 | x = self.ups[i](x) 501 | x_source = self.noise_convs[i](har_source) 502 | x = x + x_source 503 | xs = None 504 | for j in range(self.num_kernels): 505 | if xs is None: 506 | xs = self.resblocks[i * self.num_kernels + j](x) 507 | else: 508 | xs += self.resblocks[i * self.num_kernels + j](x) 509 | x = xs / self.num_kernels 510 | x = F.leaky_relu(x) 511 | x = self.conv_post(x) 512 | x = torch.tanh(x) 513 | return x 514 | 515 | def remove_weight_norm(self): 516 | for l in self.ups: 517 | remove_weight_norm(l) 518 | for l in self.resblocks: 519 | l.remove_weight_norm() 520 | 521 | 522 | sr2sr = { 523 | "32k": 32000, 524 | "40k": 40000, 525 | "48k": 48000, 526 | } 527 | 528 | 529 | class SynthesizerTrnMs256NSFsid(nn.Module): 530 | def __init__( 531 | self, 532 | spec_channels, 533 | segment_size, 534 | inter_channels, 535 | hidden_channels, 536 | filter_channels, 537 | n_heads, 538 | n_layers, 539 | kernel_size, 540 | p_dropout, 541 | resblock, 542 | resblock_kernel_sizes, 543 | resblock_dilation_sizes, 544 | upsample_rates, 545 | upsample_initial_channel, 546 | upsample_kernel_sizes, 547 | spk_embed_dim, 548 | gin_channels, 549 | sr, 550 | **kwargs 551 | ): 552 | super().__init__() 553 | if type(sr) == type("strr"): 554 | sr = sr2sr[sr] 555 | self.spec_channels = spec_channels 556 | self.inter_channels = inter_channels 557 | self.hidden_channels = hidden_channels 558 | self.filter_channels = filter_channels 559 | self.n_heads = n_heads 560 | self.n_layers = n_layers 561 | self.kernel_size = kernel_size 562 | self.p_dropout = p_dropout 563 | self.resblock = resblock 564 | self.resblock_kernel_sizes = resblock_kernel_sizes 565 | self.resblock_dilation_sizes = resblock_dilation_sizes 566 | self.upsample_rates = upsample_rates 567 | self.upsample_initial_channel = upsample_initial_channel 568 | self.upsample_kernel_sizes = upsample_kernel_sizes 569 | self.segment_size = segment_size 570 | self.gin_channels = gin_channels 571 | # self.hop_length = hop_length# 572 | self.spk_embed_dim = spk_embed_dim 573 | self.enc_p = TextEncoder256( 574 | inter_channels, 575 | hidden_channels, 576 | filter_channels, 577 | n_heads, 578 | n_layers, 579 | kernel_size, 580 | p_dropout, 581 | ) 582 | self.dec = GeneratorNSF( 583 | inter_channels, 584 | resblock, 585 | resblock_kernel_sizes, 586 | resblock_dilation_sizes, 587 | upsample_rates, 588 | upsample_initial_channel, 589 | upsample_kernel_sizes, 590 | gin_channels=gin_channels, 591 | sr=sr, 592 | is_half=kwargs["is_half"], 593 | ) 594 | self.enc_q = PosteriorEncoder( 595 | spec_channels, 596 | inter_channels, 597 | hidden_channels, 598 | 5, 599 | 1, 600 | 16, 601 | gin_channels=gin_channels, 602 | ) 603 | self.flow = ResidualCouplingBlock( 604 | inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels 605 | ) 606 | self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) 607 | print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim) 608 | 609 | def remove_weight_norm(self): 610 | self.dec.remove_weight_norm() 611 | self.flow.remove_weight_norm() 612 | self.enc_q.remove_weight_norm() 613 | 614 | def forward( 615 | self, phone, phone_lengths, pitch, pitchf, y, y_lengths, ds 616 | ): # 这里ds是id,[bs,1] 617 | # print(1,pitch.shape)#[bs,t] 618 | g = self.emb_g(ds).unsqueeze(-1) # [b, 256, 1]##1是t,广播的 619 | m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) 620 | z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g) 621 | z_p = self.flow(z, y_mask, g=g) 622 | z_slice, 
ids_slice = commons.rand_slice_segments(
623 |             z, y_lengths, self.segment_size
624 |         )
625 |         # print(-1,pitchf.shape,ids_slice,self.segment_size,self.hop_length,self.segment_size//self.hop_length)
626 |         pitchf = commons.slice_segments2(pitchf, ids_slice, self.segment_size)
627 |         # print(-2,pitchf.shape,z_slice.shape)
628 |         o = self.dec(z_slice, pitchf, g=g)
629 |         return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
630 | 
631 |     def infer(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None):
632 |         g = self.emb_g(sid).unsqueeze(-1)
633 |         m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
634 |         z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
635 |         z = self.flow(z_p, x_mask, g=g, reverse=True)
636 |         o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g)
637 |         return o, x_mask, (z, z_p, m_p, logs_p)
638 | class SynthesizerTrnMs768NSFsid(nn.Module):
639 |     def __init__(
640 |         self,
641 |         spec_channels,
642 |         segment_size,
643 |         inter_channels,
644 |         hidden_channels,
645 |         filter_channels,
646 |         n_heads,
647 |         n_layers,
648 |         kernel_size,
649 |         p_dropout,
650 |         resblock,
651 |         resblock_kernel_sizes,
652 |         resblock_dilation_sizes,
653 |         upsample_rates,
654 |         upsample_initial_channel,
655 |         upsample_kernel_sizes,
656 |         spk_embed_dim,
657 |         gin_channels,
658 |         sr,
659 |         **kwargs
660 |     ):
661 |         super().__init__()
662 |         if isinstance(sr, str):
663 |             sr = sr2sr[sr]
664 |         self.spec_channels = spec_channels
665 |         self.inter_channels = inter_channels
666 |         self.hidden_channels = hidden_channels
667 |         self.filter_channels = filter_channels
668 |         self.n_heads = n_heads
669 |         self.n_layers = n_layers
670 |         self.kernel_size = kernel_size
671 |         self.p_dropout = p_dropout
672 |         self.resblock = resblock
673 |         self.resblock_kernel_sizes = resblock_kernel_sizes
674 |         self.resblock_dilation_sizes = resblock_dilation_sizes
675 |         self.upsample_rates = upsample_rates
676 |         self.upsample_initial_channel = upsample_initial_channel
677 |         self.upsample_kernel_sizes = upsample_kernel_sizes
678 |         self.segment_size = segment_size
679 |         self.gin_channels = gin_channels
680 |         # self.hop_length = hop_length#
681 |         self.spk_embed_dim = spk_embed_dim
682 |         self.enc_p = TextEncoder768(
683 |             inter_channels,
684 |             hidden_channels,
685 |             filter_channels,
686 |             n_heads,
687 |             n_layers,
688 |             kernel_size,
689 |             p_dropout,
690 |         )
691 |         self.dec = GeneratorNSF(
692 |             inter_channels,
693 |             resblock,
694 |             resblock_kernel_sizes,
695 |             resblock_dilation_sizes,
696 |             upsample_rates,
697 |             upsample_initial_channel,
698 |             upsample_kernel_sizes,
699 |             gin_channels=gin_channels,
700 |             sr=sr,
701 |             is_half=kwargs["is_half"],
702 |         )
703 |         self.enc_q = PosteriorEncoder(
704 |             spec_channels,
705 |             inter_channels,
706 |             hidden_channels,
707 |             5,
708 |             1,
709 |             16,
710 |             gin_channels=gin_channels,
711 |         )
712 |         self.flow = ResidualCouplingBlock(
713 |             inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
714 |         )
715 |         self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
716 |         print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
717 | 
718 |     def remove_weight_norm(self):
719 |         self.dec.remove_weight_norm()
720 |         self.flow.remove_weight_norm()
721 |         self.enc_q.remove_weight_norm()
722 | 
723 |     def forward(
724 |         self, phone, phone_lengths, pitch, pitchf, y, y_lengths, ds
725 |     ):  # ds is the speaker id, shape [bs, 1]
726 |         # print(1,pitch.shape)#[bs,t]
727 |         g = self.emb_g(ds).unsqueeze(-1)  # [b, 256, 1]; the trailing 1 is the time axis, broadcast over t
728 |         m_p, logs_p, x_mask = self.enc_p(phone,
pitch, phone_lengths) 729 | z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g) 730 | z_p = self.flow(z, y_mask, g=g) 731 | z_slice, ids_slice = commons.rand_slice_segments( 732 | z, y_lengths, self.segment_size 733 | ) 734 | # print(-1,pitchf.shape,ids_slice,self.segment_size,self.hop_length,self.segment_size//self.hop_length) 735 | pitchf = commons.slice_segments2(pitchf, ids_slice, self.segment_size) 736 | # print(-2,pitchf.shape,z_slice.shape) 737 | o = self.dec(z_slice, pitchf, g=g) 738 | return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q) 739 | 740 | def infer(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None): 741 | g = self.emb_g(sid).unsqueeze(-1) 742 | m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) 743 | z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask 744 | z = self.flow(z_p, x_mask, g=g, reverse=True) 745 | o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g) 746 | return o, x_mask, (z, z_p, m_p, logs_p) 747 | 748 | 749 | class SynthesizerTrnMs256NSFsid_nono(nn.Module): 750 | def __init__( 751 | self, 752 | spec_channels, 753 | segment_size, 754 | inter_channels, 755 | hidden_channels, 756 | filter_channels, 757 | n_heads, 758 | n_layers, 759 | kernel_size, 760 | p_dropout, 761 | resblock, 762 | resblock_kernel_sizes, 763 | resblock_dilation_sizes, 764 | upsample_rates, 765 | upsample_initial_channel, 766 | upsample_kernel_sizes, 767 | spk_embed_dim, 768 | gin_channels, 769 | sr=None, 770 | **kwargs 771 | ): 772 | super().__init__() 773 | self.spec_channels = spec_channels 774 | self.inter_channels = inter_channels 775 | self.hidden_channels = hidden_channels 776 | self.filter_channels = filter_channels 777 | self.n_heads = n_heads 778 | self.n_layers = n_layers 779 | self.kernel_size = kernel_size 780 | self.p_dropout = p_dropout 781 | self.resblock = resblock 782 | self.resblock_kernel_sizes = resblock_kernel_sizes 783 | self.resblock_dilation_sizes = resblock_dilation_sizes 784 | self.upsample_rates = upsample_rates 785 | self.upsample_initial_channel = upsample_initial_channel 786 | self.upsample_kernel_sizes = upsample_kernel_sizes 787 | self.segment_size = segment_size 788 | self.gin_channels = gin_channels 789 | # self.hop_length = hop_length# 790 | self.spk_embed_dim = spk_embed_dim 791 | self.enc_p = TextEncoder256( 792 | inter_channels, 793 | hidden_channels, 794 | filter_channels, 795 | n_heads, 796 | n_layers, 797 | kernel_size, 798 | p_dropout, 799 | f0=False, 800 | ) 801 | self.dec = Generator( 802 | inter_channels, 803 | resblock, 804 | resblock_kernel_sizes, 805 | resblock_dilation_sizes, 806 | upsample_rates, 807 | upsample_initial_channel, 808 | upsample_kernel_sizes, 809 | gin_channels=gin_channels, 810 | ) 811 | self.enc_q = PosteriorEncoder( 812 | spec_channels, 813 | inter_channels, 814 | hidden_channels, 815 | 5, 816 | 1, 817 | 16, 818 | gin_channels=gin_channels, 819 | ) 820 | self.flow = ResidualCouplingBlock( 821 | inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels 822 | ) 823 | self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) 824 | print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim) 825 | 826 | def remove_weight_norm(self): 827 | self.dec.remove_weight_norm() 828 | self.flow.remove_weight_norm() 829 | self.enc_q.remove_weight_norm() 830 | 831 | def forward(self, phone, phone_lengths, y, y_lengths, ds): # 这里ds是id,[bs,1] 832 | g = self.emb_g(ds).unsqueeze(-1) # [b, 256, 1]##1是t,广播的 833 | m_p, logs_p, x_mask = 
self.enc_p(phone, None, phone_lengths) 834 | z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g) 835 | z_p = self.flow(z, y_mask, g=g) 836 | z_slice, ids_slice = commons.rand_slice_segments( 837 | z, y_lengths, self.segment_size 838 | ) 839 | o = self.dec(z_slice, g=g) 840 | return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q) 841 | 842 | def infer(self, phone, phone_lengths, sid, max_len=None): 843 | g = self.emb_g(sid).unsqueeze(-1) 844 | m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths) 845 | z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask 846 | z = self.flow(z_p, x_mask, g=g, reverse=True) 847 | o = self.dec((z * x_mask)[:, :, :max_len], g=g) 848 | return o, x_mask, (z, z_p, m_p, logs_p) 849 | class SynthesizerTrnMs768NSFsid_nono(nn.Module): 850 | def __init__( 851 | self, 852 | spec_channels, 853 | segment_size, 854 | inter_channels, 855 | hidden_channels, 856 | filter_channels, 857 | n_heads, 858 | n_layers, 859 | kernel_size, 860 | p_dropout, 861 | resblock, 862 | resblock_kernel_sizes, 863 | resblock_dilation_sizes, 864 | upsample_rates, 865 | upsample_initial_channel, 866 | upsample_kernel_sizes, 867 | spk_embed_dim, 868 | gin_channels, 869 | sr=None, 870 | **kwargs 871 | ): 872 | super().__init__() 873 | self.spec_channels = spec_channels 874 | self.inter_channels = inter_channels 875 | self.hidden_channels = hidden_channels 876 | self.filter_channels = filter_channels 877 | self.n_heads = n_heads 878 | self.n_layers = n_layers 879 | self.kernel_size = kernel_size 880 | self.p_dropout = p_dropout 881 | self.resblock = resblock 882 | self.resblock_kernel_sizes = resblock_kernel_sizes 883 | self.resblock_dilation_sizes = resblock_dilation_sizes 884 | self.upsample_rates = upsample_rates 885 | self.upsample_initial_channel = upsample_initial_channel 886 | self.upsample_kernel_sizes = upsample_kernel_sizes 887 | self.segment_size = segment_size 888 | self.gin_channels = gin_channels 889 | # self.hop_length = hop_length# 890 | self.spk_embed_dim = spk_embed_dim 891 | self.enc_p = TextEncoder768( 892 | inter_channels, 893 | hidden_channels, 894 | filter_channels, 895 | n_heads, 896 | n_layers, 897 | kernel_size, 898 | p_dropout, 899 | f0=False, 900 | ) 901 | self.dec = Generator( 902 | inter_channels, 903 | resblock, 904 | resblock_kernel_sizes, 905 | resblock_dilation_sizes, 906 | upsample_rates, 907 | upsample_initial_channel, 908 | upsample_kernel_sizes, 909 | gin_channels=gin_channels, 910 | ) 911 | self.enc_q = PosteriorEncoder( 912 | spec_channels, 913 | inter_channels, 914 | hidden_channels, 915 | 5, 916 | 1, 917 | 16, 918 | gin_channels=gin_channels, 919 | ) 920 | self.flow = ResidualCouplingBlock( 921 | inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels 922 | ) 923 | self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) 924 | print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim) 925 | 926 | def remove_weight_norm(self): 927 | self.dec.remove_weight_norm() 928 | self.flow.remove_weight_norm() 929 | self.enc_q.remove_weight_norm() 930 | 931 | def forward(self, phone, phone_lengths, y, y_lengths, ds): # 这里ds是id,[bs,1] 932 | g = self.emb_g(ds).unsqueeze(-1) # [b, 256, 1]##1是t,广播的 933 | m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths) 934 | z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g) 935 | z_p = self.flow(z, y_mask, g=g) 936 | z_slice, ids_slice = commons.rand_slice_segments( 937 | z, y_lengths, self.segment_size 938 | ) 939 | o = self.dec(z_slice, 
g=g) 940 | return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q) 941 | 942 | def infer(self, phone, phone_lengths, sid, max_len=None): 943 | g = self.emb_g(sid).unsqueeze(-1) 944 | m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths) 945 | z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask 946 | z = self.flow(z_p, x_mask, g=g, reverse=True) 947 | o = self.dec((z * x_mask)[:, :, :max_len], g=g) 948 | return o, x_mask, (z, z_p, m_p, logs_p) 949 | 950 | 951 | class MultiPeriodDiscriminator(torch.nn.Module): 952 | def __init__(self, use_spectral_norm=False): 953 | super(MultiPeriodDiscriminator, self).__init__() 954 | periods = [2, 3, 5, 7, 11, 17] 955 | # periods = [3, 5, 7, 11, 17, 23, 37] 956 | 957 | discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)] 958 | discs = discs + [ 959 | DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods 960 | ] 961 | self.discriminators = nn.ModuleList(discs) 962 | 963 | def forward(self, y, y_hat): 964 | y_d_rs = [] # 965 | y_d_gs = [] 966 | fmap_rs = [] 967 | fmap_gs = [] 968 | for i, d in enumerate(self.discriminators): 969 | y_d_r, fmap_r = d(y) 970 | y_d_g, fmap_g = d(y_hat) 971 | # for j in range(len(fmap_r)): 972 | # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape) 973 | y_d_rs.append(y_d_r) 974 | y_d_gs.append(y_d_g) 975 | fmap_rs.append(fmap_r) 976 | fmap_gs.append(fmap_g) 977 | 978 | return y_d_rs, y_d_gs, fmap_rs, fmap_gs 979 | 980 | class MultiPeriodDiscriminatorV2(torch.nn.Module): 981 | def __init__(self, use_spectral_norm=False): 982 | super(MultiPeriodDiscriminatorV2, self).__init__() 983 | # periods = [2, 3, 5, 7, 11, 17] 984 | periods = [2,3, 5, 7, 11, 17, 23, 37] 985 | 986 | discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)] 987 | discs = discs + [ 988 | DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods 989 | ] 990 | self.discriminators = nn.ModuleList(discs) 991 | 992 | def forward(self, y, y_hat): 993 | y_d_rs = [] # 994 | y_d_gs = [] 995 | fmap_rs = [] 996 | fmap_gs = [] 997 | for i, d in enumerate(self.discriminators): 998 | y_d_r, fmap_r = d(y) 999 | y_d_g, fmap_g = d(y_hat) 1000 | # for j in range(len(fmap_r)): 1001 | # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape) 1002 | y_d_rs.append(y_d_r) 1003 | y_d_gs.append(y_d_g) 1004 | fmap_rs.append(fmap_r) 1005 | fmap_gs.append(fmap_g) 1006 | 1007 | return y_d_rs, y_d_gs, fmap_rs, fmap_gs 1008 | 1009 | 1010 | class DiscriminatorS(torch.nn.Module): 1011 | def __init__(self, use_spectral_norm=False): 1012 | super(DiscriminatorS, self).__init__() 1013 | norm_f = weight_norm if use_spectral_norm == False else spectral_norm 1014 | self.convs = nn.ModuleList( 1015 | [ 1016 | norm_f(Conv1d(1, 16, 15, 1, padding=7)), 1017 | norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)), 1018 | norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)), 1019 | norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)), 1020 | norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)), 1021 | norm_f(Conv1d(1024, 1024, 5, 1, padding=2)), 1022 | ] 1023 | ) 1024 | self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1)) 1025 | 1026 | def forward(self, x): 1027 | fmap = [] 1028 | 1029 | for l in self.convs: 1030 | x = l(x) 1031 | x = F.leaky_relu(x, modules.LRELU_SLOPE) 1032 | fmap.append(x) 1033 | x = self.conv_post(x) 1034 | fmap.append(x) 1035 | x = torch.flatten(x, 1, -1) 1036 | 1037 | return x, fmap 1038 | 1039 | 1040 | class DiscriminatorP(torch.nn.Module): 1041 | def 
__init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False): 1042 | super(DiscriminatorP, self).__init__() 1043 | self.period = period 1044 | self.use_spectral_norm = use_spectral_norm 1045 | norm_f = weight_norm if use_spectral_norm == False else spectral_norm 1046 | self.convs = nn.ModuleList( 1047 | [ 1048 | norm_f( 1049 | Conv2d( 1050 | 1, 1051 | 32, 1052 | (kernel_size, 1), 1053 | (stride, 1), 1054 | padding=(get_padding(kernel_size, 1), 0), 1055 | ) 1056 | ), 1057 | norm_f( 1058 | Conv2d( 1059 | 32, 1060 | 128, 1061 | (kernel_size, 1), 1062 | (stride, 1), 1063 | padding=(get_padding(kernel_size, 1), 0), 1064 | ) 1065 | ), 1066 | norm_f( 1067 | Conv2d( 1068 | 128, 1069 | 512, 1070 | (kernel_size, 1), 1071 | (stride, 1), 1072 | padding=(get_padding(kernel_size, 1), 0), 1073 | ) 1074 | ), 1075 | norm_f( 1076 | Conv2d( 1077 | 512, 1078 | 1024, 1079 | (kernel_size, 1), 1080 | (stride, 1), 1081 | padding=(get_padding(kernel_size, 1), 0), 1082 | ) 1083 | ), 1084 | norm_f( 1085 | Conv2d( 1086 | 1024, 1087 | 1024, 1088 | (kernel_size, 1), 1089 | 1, 1090 | padding=(get_padding(kernel_size, 1), 0), 1091 | ) 1092 | ), 1093 | ] 1094 | ) 1095 | self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) 1096 | 1097 | def forward(self, x): 1098 | fmap = [] 1099 | 1100 | # 1d to 2d 1101 | b, c, t = x.shape 1102 | if t % self.period != 0: # pad first 1103 | n_pad = self.period - (t % self.period) 1104 | x = F.pad(x, (0, n_pad), "reflect") 1105 | t = t + n_pad 1106 | x = x.view(b, c, t // self.period, self.period) 1107 | 1108 | for l in self.convs: 1109 | x = l(x) 1110 | x = F.leaky_relu(x, modules.LRELU_SLOPE) 1111 | fmap.append(x) 1112 | x = self.conv_post(x) 1113 | fmap.append(x) 1114 | x = torch.flatten(x, 1, -1) 1115 | 1116 | return x, fmap 1117 | -------------------------------------------------------------------------------- /infer_pack/modules.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import math 3 | import numpy as np 4 | import scipy 5 | import torch 6 | from torch import nn 7 | from torch.nn import functional as F 8 | 9 | from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d 10 | from torch.nn.utils import weight_norm, remove_weight_norm 11 | 12 | from infer_pack import commons 13 | from infer_pack.commons import init_weights, get_padding 14 | from infer_pack.transforms import piecewise_rational_quadratic_transform 15 | 16 | 17 | LRELU_SLOPE = 0.1 18 | 19 | 20 | class LayerNorm(nn.Module): 21 | def __init__(self, channels, eps=1e-5): 22 | super().__init__() 23 | self.channels = channels 24 | self.eps = eps 25 | 26 | self.gamma = nn.Parameter(torch.ones(channels)) 27 | self.beta = nn.Parameter(torch.zeros(channels)) 28 | 29 | def forward(self, x): 30 | x = x.transpose(1, -1) 31 | x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps) 32 | return x.transpose(1, -1) 33 | 34 | 35 | class ConvReluNorm(nn.Module): 36 | def __init__( 37 | self, 38 | in_channels, 39 | hidden_channels, 40 | out_channels, 41 | kernel_size, 42 | n_layers, 43 | p_dropout, 44 | ): 45 | super().__init__() 46 | self.in_channels = in_channels 47 | self.hidden_channels = hidden_channels 48 | self.out_channels = out_channels 49 | self.kernel_size = kernel_size 50 | self.n_layers = n_layers 51 | self.p_dropout = p_dropout 52 | assert n_layers > 1, "Number of layers should be larger than 0." 
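# The stack built below is: one conv mapping in_channels -> hidden_channels, then
# n_layers - 1 convs at hidden_channels (hence the assert above), each followed by
# LayerNorm, ReLU and dropout; a zero-initialized 1x1 projection feeds the residual
# connection in forward(). (Descriptive note added for readability.)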
53 | 54 | self.conv_layers = nn.ModuleList() 55 | self.norm_layers = nn.ModuleList() 56 | self.conv_layers.append( 57 | nn.Conv1d( 58 | in_channels, hidden_channels, kernel_size, padding=kernel_size // 2 59 | ) 60 | ) 61 | self.norm_layers.append(LayerNorm(hidden_channels)) 62 | self.relu_drop = nn.Sequential(nn.ReLU(), nn.Dropout(p_dropout)) 63 | for _ in range(n_layers - 1): 64 | self.conv_layers.append( 65 | nn.Conv1d( 66 | hidden_channels, 67 | hidden_channels, 68 | kernel_size, 69 | padding=kernel_size // 2, 70 | ) 71 | ) 72 | self.norm_layers.append(LayerNorm(hidden_channels)) 73 | self.proj = nn.Conv1d(hidden_channels, out_channels, 1) 74 | self.proj.weight.data.zero_() 75 | self.proj.bias.data.zero_() 76 | 77 | def forward(self, x, x_mask): 78 | x_org = x 79 | for i in range(self.n_layers): 80 | x = self.conv_layers[i](x * x_mask) 81 | x = self.norm_layers[i](x) 82 | x = self.relu_drop(x) 83 | x = x_org + self.proj(x) 84 | return x * x_mask 85 | 86 | 87 | class DDSConv(nn.Module): 88 | """ 89 | Dialted and Depth-Separable Convolution 90 | """ 91 | 92 | def __init__(self, channels, kernel_size, n_layers, p_dropout=0.0): 93 | super().__init__() 94 | self.channels = channels 95 | self.kernel_size = kernel_size 96 | self.n_layers = n_layers 97 | self.p_dropout = p_dropout 98 | 99 | self.drop = nn.Dropout(p_dropout) 100 | self.convs_sep = nn.ModuleList() 101 | self.convs_1x1 = nn.ModuleList() 102 | self.norms_1 = nn.ModuleList() 103 | self.norms_2 = nn.ModuleList() 104 | for i in range(n_layers): 105 | dilation = kernel_size**i 106 | padding = (kernel_size * dilation - dilation) // 2 107 | self.convs_sep.append( 108 | nn.Conv1d( 109 | channels, 110 | channels, 111 | kernel_size, 112 | groups=channels, 113 | dilation=dilation, 114 | padding=padding, 115 | ) 116 | ) 117 | self.convs_1x1.append(nn.Conv1d(channels, channels, 1)) 118 | self.norms_1.append(LayerNorm(channels)) 119 | self.norms_2.append(LayerNorm(channels)) 120 | 121 | def forward(self, x, x_mask, g=None): 122 | if g is not None: 123 | x = x + g 124 | for i in range(self.n_layers): 125 | y = self.convs_sep[i](x * x_mask) 126 | y = self.norms_1[i](y) 127 | y = F.gelu(y) 128 | y = self.convs_1x1[i](y) 129 | y = self.norms_2[i](y) 130 | y = F.gelu(y) 131 | y = self.drop(y) 132 | x = x + y 133 | return x * x_mask 134 | 135 | 136 | class WN(torch.nn.Module): 137 | def __init__( 138 | self, 139 | hidden_channels, 140 | kernel_size, 141 | dilation_rate, 142 | n_layers, 143 | gin_channels=0, 144 | p_dropout=0, 145 | ): 146 | super(WN, self).__init__() 147 | assert kernel_size % 2 == 1 148 | self.hidden_channels = hidden_channels 149 | self.kernel_size = (kernel_size,) 150 | self.dilation_rate = dilation_rate 151 | self.n_layers = n_layers 152 | self.gin_channels = gin_channels 153 | self.p_dropout = p_dropout 154 | 155 | self.in_layers = torch.nn.ModuleList() 156 | self.res_skip_layers = torch.nn.ModuleList() 157 | self.drop = nn.Dropout(p_dropout) 158 | 159 | if gin_channels != 0: 160 | cond_layer = torch.nn.Conv1d( 161 | gin_channels, 2 * hidden_channels * n_layers, 1 162 | ) 163 | self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name="weight") 164 | 165 | for i in range(n_layers): 166 | dilation = dilation_rate**i 167 | padding = int((kernel_size * dilation - dilation) / 2) 168 | in_layer = torch.nn.Conv1d( 169 | hidden_channels, 170 | 2 * hidden_channels, 171 | kernel_size, 172 | dilation=dilation, 173 | padding=padding, 174 | ) 175 | in_layer = torch.nn.utils.weight_norm(in_layer, name="weight") 176 | 
self.in_layers.append(in_layer) 177 | 178 | # last one is not necessary 179 | if i < n_layers - 1: 180 | res_skip_channels = 2 * hidden_channels 181 | else: 182 | res_skip_channels = hidden_channels 183 | 184 | res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1) 185 | res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name="weight") 186 | self.res_skip_layers.append(res_skip_layer) 187 | 188 | def forward(self, x, x_mask, g=None, **kwargs): 189 | output = torch.zeros_like(x) 190 | n_channels_tensor = torch.IntTensor([self.hidden_channels]) 191 | 192 | if g is not None: 193 | g = self.cond_layer(g) 194 | 195 | for i in range(self.n_layers): 196 | x_in = self.in_layers[i](x) 197 | if g is not None: 198 | cond_offset = i * 2 * self.hidden_channels 199 | g_l = g[:, cond_offset : cond_offset + 2 * self.hidden_channels, :] 200 | else: 201 | g_l = torch.zeros_like(x_in) 202 | 203 | acts = commons.fused_add_tanh_sigmoid_multiply(x_in, g_l, n_channels_tensor) 204 | acts = self.drop(acts) 205 | 206 | res_skip_acts = self.res_skip_layers[i](acts) 207 | if i < self.n_layers - 1: 208 | res_acts = res_skip_acts[:, : self.hidden_channels, :] 209 | x = (x + res_acts) * x_mask 210 | output = output + res_skip_acts[:, self.hidden_channels :, :] 211 | else: 212 | output = output + res_skip_acts 213 | return output * x_mask 214 | 215 | def remove_weight_norm(self): 216 | if self.gin_channels != 0: 217 | torch.nn.utils.remove_weight_norm(self.cond_layer) 218 | for l in self.in_layers: 219 | torch.nn.utils.remove_weight_norm(l) 220 | for l in self.res_skip_layers: 221 | torch.nn.utils.remove_weight_norm(l) 222 | 223 | 224 | class ResBlock1(torch.nn.Module): 225 | def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)): 226 | super(ResBlock1, self).__init__() 227 | self.convs1 = nn.ModuleList( 228 | [ 229 | weight_norm( 230 | Conv1d( 231 | channels, 232 | channels, 233 | kernel_size, 234 | 1, 235 | dilation=dilation[0], 236 | padding=get_padding(kernel_size, dilation[0]), 237 | ) 238 | ), 239 | weight_norm( 240 | Conv1d( 241 | channels, 242 | channels, 243 | kernel_size, 244 | 1, 245 | dilation=dilation[1], 246 | padding=get_padding(kernel_size, dilation[1]), 247 | ) 248 | ), 249 | weight_norm( 250 | Conv1d( 251 | channels, 252 | channels, 253 | kernel_size, 254 | 1, 255 | dilation=dilation[2], 256 | padding=get_padding(kernel_size, dilation[2]), 257 | ) 258 | ), 259 | ] 260 | ) 261 | self.convs1.apply(init_weights) 262 | 263 | self.convs2 = nn.ModuleList( 264 | [ 265 | weight_norm( 266 | Conv1d( 267 | channels, 268 | channels, 269 | kernel_size, 270 | 1, 271 | dilation=1, 272 | padding=get_padding(kernel_size, 1), 273 | ) 274 | ), 275 | weight_norm( 276 | Conv1d( 277 | channels, 278 | channels, 279 | kernel_size, 280 | 1, 281 | dilation=1, 282 | padding=get_padding(kernel_size, 1), 283 | ) 284 | ), 285 | weight_norm( 286 | Conv1d( 287 | channels, 288 | channels, 289 | kernel_size, 290 | 1, 291 | dilation=1, 292 | padding=get_padding(kernel_size, 1), 293 | ) 294 | ), 295 | ] 296 | ) 297 | self.convs2.apply(init_weights) 298 | 299 | def forward(self, x, x_mask=None): 300 | for c1, c2 in zip(self.convs1, self.convs2): 301 | xt = F.leaky_relu(x, LRELU_SLOPE) 302 | if x_mask is not None: 303 | xt = xt * x_mask 304 | xt = c1(xt) 305 | xt = F.leaky_relu(xt, LRELU_SLOPE) 306 | if x_mask is not None: 307 | xt = xt * x_mask 308 | xt = c2(xt) 309 | x = xt + x 310 | if x_mask is not None: 311 | x = x * x_mask 312 | return x 313 | 314 | def remove_weight_norm(self): 315 | for l in 
self.convs1: 316 | remove_weight_norm(l) 317 | for l in self.convs2: 318 | remove_weight_norm(l) 319 | 320 | 321 | class ResBlock2(torch.nn.Module): 322 | def __init__(self, channels, kernel_size=3, dilation=(1, 3)): 323 | super(ResBlock2, self).__init__() 324 | self.convs = nn.ModuleList( 325 | [ 326 | weight_norm( 327 | Conv1d( 328 | channels, 329 | channels, 330 | kernel_size, 331 | 1, 332 | dilation=dilation[0], 333 | padding=get_padding(kernel_size, dilation[0]), 334 | ) 335 | ), 336 | weight_norm( 337 | Conv1d( 338 | channels, 339 | channels, 340 | kernel_size, 341 | 1, 342 | dilation=dilation[1], 343 | padding=get_padding(kernel_size, dilation[1]), 344 | ) 345 | ), 346 | ] 347 | ) 348 | self.convs.apply(init_weights) 349 | 350 | def forward(self, x, x_mask=None): 351 | for c in self.convs: 352 | xt = F.leaky_relu(x, LRELU_SLOPE) 353 | if x_mask is not None: 354 | xt = xt * x_mask 355 | xt = c(xt) 356 | x = xt + x 357 | if x_mask is not None: 358 | x = x * x_mask 359 | return x 360 | 361 | def remove_weight_norm(self): 362 | for l in self.convs: 363 | remove_weight_norm(l) 364 | 365 | 366 | class Log(nn.Module): 367 | def forward(self, x, x_mask, reverse=False, **kwargs): 368 | if not reverse: 369 | y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask 370 | logdet = torch.sum(-y, [1, 2]) 371 | return y, logdet 372 | else: 373 | x = torch.exp(x) * x_mask 374 | return x 375 | 376 | 377 | class Flip(nn.Module): 378 | def forward(self, x, *args, reverse=False, **kwargs): 379 | x = torch.flip(x, [1]) 380 | if not reverse: 381 | logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device) 382 | return x, logdet 383 | else: 384 | return x 385 | 386 | 387 | class ElementwiseAffine(nn.Module): 388 | def __init__(self, channels): 389 | super().__init__() 390 | self.channels = channels 391 | self.m = nn.Parameter(torch.zeros(channels, 1)) 392 | self.logs = nn.Parameter(torch.zeros(channels, 1)) 393 | 394 | def forward(self, x, x_mask, reverse=False, **kwargs): 395 | if not reverse: 396 | y = self.m + torch.exp(self.logs) * x 397 | y = y * x_mask 398 | logdet = torch.sum(self.logs * x_mask, [1, 2]) 399 | return y, logdet 400 | else: 401 | x = (x - self.m) * torch.exp(-self.logs) * x_mask 402 | return x 403 | 404 | 405 | class ResidualCouplingLayer(nn.Module): 406 | def __init__( 407 | self, 408 | channels, 409 | hidden_channels, 410 | kernel_size, 411 | dilation_rate, 412 | n_layers, 413 | p_dropout=0, 414 | gin_channels=0, 415 | mean_only=False, 416 | ): 417 | assert channels % 2 == 0, "channels should be divisible by 2" 418 | super().__init__() 419 | self.channels = channels 420 | self.hidden_channels = hidden_channels 421 | self.kernel_size = kernel_size 422 | self.dilation_rate = dilation_rate 423 | self.n_layers = n_layers 424 | self.half_channels = channels // 2 425 | self.mean_only = mean_only 426 | 427 | self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1) 428 | self.enc = WN( 429 | hidden_channels, 430 | kernel_size, 431 | dilation_rate, 432 | n_layers, 433 | p_dropout=p_dropout, 434 | gin_channels=gin_channels, 435 | ) 436 | self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1) 437 | self.post.weight.data.zero_() 438 | self.post.bias.data.zero_() 439 | 440 | def forward(self, x, x_mask, g=None, reverse=False): 441 | x0, x1 = torch.split(x, [self.half_channels] * 2, 1) 442 | h = self.pre(x0) * x_mask 443 | h = self.enc(h, x_mask, g=g) 444 | stats = self.post(h) * x_mask 445 | if not self.mean_only: 446 | m, logs = torch.split(stats, 
[self.half_channels] * 2, 1) 447 | else: 448 | m = stats 449 | logs = torch.zeros_like(m) 450 | 451 | if not reverse: 452 | x1 = m + x1 * torch.exp(logs) * x_mask 453 | x = torch.cat([x0, x1], 1) 454 | logdet = torch.sum(logs, [1, 2]) 455 | return x, logdet 456 | else: 457 | x1 = (x1 - m) * torch.exp(-logs) * x_mask 458 | x = torch.cat([x0, x1], 1) 459 | return x 460 | 461 | def remove_weight_norm(self): 462 | self.enc.remove_weight_norm() 463 | 464 | 465 | class ConvFlow(nn.Module): 466 | def __init__( 467 | self, 468 | in_channels, 469 | filter_channels, 470 | kernel_size, 471 | n_layers, 472 | num_bins=10, 473 | tail_bound=5.0, 474 | ): 475 | super().__init__() 476 | self.in_channels = in_channels 477 | self.filter_channels = filter_channels 478 | self.kernel_size = kernel_size 479 | self.n_layers = n_layers 480 | self.num_bins = num_bins 481 | self.tail_bound = tail_bound 482 | self.half_channels = in_channels // 2 483 | 484 | self.pre = nn.Conv1d(self.half_channels, filter_channels, 1) 485 | self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.0) 486 | self.proj = nn.Conv1d( 487 | filter_channels, self.half_channels * (num_bins * 3 - 1), 1 488 | ) 489 | self.proj.weight.data.zero_() 490 | self.proj.bias.data.zero_() 491 | 492 | def forward(self, x, x_mask, g=None, reverse=False): 493 | x0, x1 = torch.split(x, [self.half_channels] * 2, 1) 494 | h = self.pre(x0) 495 | h = self.convs(h, x_mask, g=g) 496 | h = self.proj(h) * x_mask 497 | 498 | b, c, t = x0.shape 499 | h = h.reshape(b, c, -1, t).permute(0, 1, 3, 2) # [b, cx?, t] -> [b, c, t, ?] 500 | 501 | unnormalized_widths = h[..., : self.num_bins] / math.sqrt(self.filter_channels) 502 | unnormalized_heights = h[..., self.num_bins : 2 * self.num_bins] / math.sqrt( 503 | self.filter_channels 504 | ) 505 | unnormalized_derivatives = h[..., 2 * self.num_bins :] 506 | 507 | x1, logabsdet = piecewise_rational_quadratic_transform( 508 | x1, 509 | unnormalized_widths, 510 | unnormalized_heights, 511 | unnormalized_derivatives, 512 | inverse=reverse, 513 | tails="linear", 514 | tail_bound=self.tail_bound, 515 | ) 516 | 517 | x = torch.cat([x0, x1], 1) * x_mask 518 | logdet = torch.sum(logabsdet * x_mask, [1, 2]) 519 | if not reverse: 520 | return x, logdet 521 | else: 522 | return x 523 | -------------------------------------------------------------------------------- /infer_pack/transforms.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.nn import functional as F 3 | 4 | import numpy as np 5 | 6 | 7 | DEFAULT_MIN_BIN_WIDTH = 1e-3 8 | DEFAULT_MIN_BIN_HEIGHT = 1e-3 9 | DEFAULT_MIN_DERIVATIVE = 1e-3 10 | 11 | 12 | def piecewise_rational_quadratic_transform( 13 | inputs, 14 | unnormalized_widths, 15 | unnormalized_heights, 16 | unnormalized_derivatives, 17 | inverse=False, 18 | tails=None, 19 | tail_bound=1.0, 20 | min_bin_width=DEFAULT_MIN_BIN_WIDTH, 21 | min_bin_height=DEFAULT_MIN_BIN_HEIGHT, 22 | min_derivative=DEFAULT_MIN_DERIVATIVE, 23 | ): 24 | if tails is None: 25 | spline_fn = rational_quadratic_spline 26 | spline_kwargs = {} 27 | else: 28 | spline_fn = unconstrained_rational_quadratic_spline 29 | spline_kwargs = {"tails": tails, "tail_bound": tail_bound} 30 | 31 | outputs, logabsdet = spline_fn( 32 | inputs=inputs, 33 | unnormalized_widths=unnormalized_widths, 34 | unnormalized_heights=unnormalized_heights, 35 | unnormalized_derivatives=unnormalized_derivatives, 36 | inverse=inverse, 37 | min_bin_width=min_bin_width, 38 | min_bin_height=min_bin_height, 
39 | min_derivative=min_derivative, 40 | **spline_kwargs 41 | ) 42 | return outputs, logabsdet 43 | 44 | 45 | def searchsorted(bin_locations, inputs, eps=1e-6): 46 | bin_locations[..., -1] += eps 47 | return torch.sum(inputs[..., None] >= bin_locations, dim=-1) - 1 48 | 49 | 50 | def unconstrained_rational_quadratic_spline( 51 | inputs, 52 | unnormalized_widths, 53 | unnormalized_heights, 54 | unnormalized_derivatives, 55 | inverse=False, 56 | tails="linear", 57 | tail_bound=1.0, 58 | min_bin_width=DEFAULT_MIN_BIN_WIDTH, 59 | min_bin_height=DEFAULT_MIN_BIN_HEIGHT, 60 | min_derivative=DEFAULT_MIN_DERIVATIVE, 61 | ): 62 | inside_interval_mask = (inputs >= -tail_bound) & (inputs <= tail_bound) 63 | outside_interval_mask = ~inside_interval_mask 64 | 65 | outputs = torch.zeros_like(inputs) 66 | logabsdet = torch.zeros_like(inputs) 67 | 68 | if tails == "linear": 69 | unnormalized_derivatives = F.pad(unnormalized_derivatives, pad=(1, 1)) 70 | constant = np.log(np.exp(1 - min_derivative) - 1) 71 | unnormalized_derivatives[..., 0] = constant 72 | unnormalized_derivatives[..., -1] = constant 73 | 74 | outputs[outside_interval_mask] = inputs[outside_interval_mask] 75 | logabsdet[outside_interval_mask] = 0 76 | else: 77 | raise RuntimeError("{} tails are not implemented.".format(tails)) 78 | 79 | ( 80 | outputs[inside_interval_mask], 81 | logabsdet[inside_interval_mask], 82 | ) = rational_quadratic_spline( 83 | inputs=inputs[inside_interval_mask], 84 | unnormalized_widths=unnormalized_widths[inside_interval_mask, :], 85 | unnormalized_heights=unnormalized_heights[inside_interval_mask, :], 86 | unnormalized_derivatives=unnormalized_derivatives[inside_interval_mask, :], 87 | inverse=inverse, 88 | left=-tail_bound, 89 | right=tail_bound, 90 | bottom=-tail_bound, 91 | top=tail_bound, 92 | min_bin_width=min_bin_width, 93 | min_bin_height=min_bin_height, 94 | min_derivative=min_derivative, 95 | ) 96 | 97 | return outputs, logabsdet 98 | 99 | 100 | def rational_quadratic_spline( 101 | inputs, 102 | unnormalized_widths, 103 | unnormalized_heights, 104 | unnormalized_derivatives, 105 | inverse=False, 106 | left=0.0, 107 | right=1.0, 108 | bottom=0.0, 109 | top=1.0, 110 | min_bin_width=DEFAULT_MIN_BIN_WIDTH, 111 | min_bin_height=DEFAULT_MIN_BIN_HEIGHT, 112 | min_derivative=DEFAULT_MIN_DERIVATIVE, 113 | ): 114 | if torch.min(inputs) < left or torch.max(inputs) > right: 115 | raise ValueError("Input to a transform is not within its domain") 116 | 117 | num_bins = unnormalized_widths.shape[-1] 118 | 119 | if min_bin_width * num_bins > 1.0: 120 | raise ValueError("Minimal bin width too large for the number of bins") 121 | if min_bin_height * num_bins > 1.0: 122 | raise ValueError("Minimal bin height too large for the number of bins") 123 | 124 | widths = F.softmax(unnormalized_widths, dim=-1) 125 | widths = min_bin_width + (1 - min_bin_width * num_bins) * widths 126 | cumwidths = torch.cumsum(widths, dim=-1) 127 | cumwidths = F.pad(cumwidths, pad=(1, 0), mode="constant", value=0.0) 128 | cumwidths = (right - left) * cumwidths + left 129 | cumwidths[..., 0] = left 130 | cumwidths[..., -1] = right 131 | widths = cumwidths[..., 1:] - cumwidths[..., :-1] 132 | 133 | derivatives = min_derivative + F.softplus(unnormalized_derivatives) 134 | 135 | heights = F.softmax(unnormalized_heights, dim=-1) 136 | heights = min_bin_height + (1 - min_bin_height * num_bins) * heights 137 | cumheights = torch.cumsum(heights, dim=-1) 138 | cumheights = F.pad(cumheights, pad=(1, 0), mode="constant", value=0.0) 139 | cumheights = (top - 
bottom) * cumheights + bottom 140 | cumheights[..., 0] = bottom 141 | cumheights[..., -1] = top 142 | heights = cumheights[..., 1:] - cumheights[..., :-1] 143 | 144 | if inverse: 145 | bin_idx = searchsorted(cumheights, inputs)[..., None] 146 | else: 147 | bin_idx = searchsorted(cumwidths, inputs)[..., None] 148 | 149 | input_cumwidths = cumwidths.gather(-1, bin_idx)[..., 0] 150 | input_bin_widths = widths.gather(-1, bin_idx)[..., 0] 151 | 152 | input_cumheights = cumheights.gather(-1, bin_idx)[..., 0] 153 | delta = heights / widths 154 | input_delta = delta.gather(-1, bin_idx)[..., 0] 155 | 156 | input_derivatives = derivatives.gather(-1, bin_idx)[..., 0] 157 | input_derivatives_plus_one = derivatives[..., 1:].gather(-1, bin_idx)[..., 0] 158 | 159 | input_heights = heights.gather(-1, bin_idx)[..., 0] 160 | 161 | if inverse: 162 | a = (inputs - input_cumheights) * ( 163 | input_derivatives + input_derivatives_plus_one - 2 * input_delta 164 | ) + input_heights * (input_delta - input_derivatives) 165 | b = input_heights * input_derivatives - (inputs - input_cumheights) * ( 166 | input_derivatives + input_derivatives_plus_one - 2 * input_delta 167 | ) 168 | c = -input_delta * (inputs - input_cumheights) 169 | 170 | discriminant = b.pow(2) - 4 * a * c 171 | assert (discriminant >= 0).all() 172 | 173 | root = (2 * c) / (-b - torch.sqrt(discriminant)) 174 | outputs = root * input_bin_widths + input_cumwidths 175 | 176 | theta_one_minus_theta = root * (1 - root) 177 | denominator = input_delta + ( 178 | (input_derivatives + input_derivatives_plus_one - 2 * input_delta) 179 | * theta_one_minus_theta 180 | ) 181 | derivative_numerator = input_delta.pow(2) * ( 182 | input_derivatives_plus_one * root.pow(2) 183 | + 2 * input_delta * theta_one_minus_theta 184 | + input_derivatives * (1 - root).pow(2) 185 | ) 186 | logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator) 187 | 188 | return outputs, -logabsdet 189 | else: 190 | theta = (inputs - input_cumwidths) / input_bin_widths 191 | theta_one_minus_theta = theta * (1 - theta) 192 | 193 | numerator = input_heights * ( 194 | input_delta * theta.pow(2) + input_derivatives * theta_one_minus_theta 195 | ) 196 | denominator = input_delta + ( 197 | (input_derivatives + input_derivatives_plus_one - 2 * input_delta) 198 | * theta_one_minus_theta 199 | ) 200 | outputs = input_cumheights + numerator / denominator 201 | 202 | derivative_numerator = input_delta.pow(2) * ( 203 | input_derivatives_plus_one * theta.pow(2) 204 | + 2 * input_delta * theta_one_minus_theta 205 | + input_derivatives * (1 - theta).pow(2) 206 | ) 207 | logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator) 208 | 209 | return outputs, logabsdet 210 | -------------------------------------------------------------------------------- /my_utils.py: -------------------------------------------------------------------------------- 1 | import ffmpeg 2 | import numpy as np 3 | 4 | 5 | def load_audio(file, sr): 6 | try: 7 | # https://github.com/openai/whisper/blob/main/whisper/audio.py#L26 8 | # This launches a subprocess to decode audio while down-mixing and resampling as necessary. 9 | # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed. 
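# Illustrative usage (hypothetical file name): load_audio("clip.wav", 16000) returns
# a mono float32 NumPy waveform resampled to 16 kHz, ready for feature extraction.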
10 | file = ( 11 | file.strip(" ").strip('"').strip("\n").strip('"').strip(" ") 12 | ) # 防止小白拷路径头尾带了空格和"和回车 13 | out, _ = ( 14 | ffmpeg.input(file, threads=0) 15 | .output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr) 16 | .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True) 17 | ) 18 | except Exception as e: 19 | raise RuntimeError(f"Failed to load audio: {e}") 20 | 21 | return np.frombuffer(out, np.float32).flatten() -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numba==0.56.4 2 | numpy==1.23.5 3 | scipy==1.9.3 4 | librosa==0.9.2 5 | llvmlite==0.39.0 6 | fairseq==0.12.2 7 | faiss-cpu==1.7.0; sys_platform == "darwin" 8 | faiss-cpu==1.7.2; sys_platform != "darwin" 9 | gradio 10 | Cython 11 | future>=0.18.3 12 | pydub>=0.25.1 13 | soundfile>=0.12.1 14 | ffmpeg-python>=0.2.0 15 | tensorboardX 16 | functorch>=2.0.0 17 | Jinja2>=3.1.2 18 | json5>=0.9.11 19 | Markdown 20 | matplotlib>=3.7.1 21 | matplotlib-inline>=0.1.6 22 | praat-parselmouth>=0.4.3 23 | Pillow>=9.1.1 24 | pyworld>=0.3.2 25 | resampy>=0.4.2 26 | scikit-learn>=1.2.2 27 | starlette>=0.26.1 28 | tensorboard 29 | tensorboard-data-server 30 | tensorboard-plugin-wit 31 | torchgen>=0.0.1 32 | tqdm>=4.65.0 33 | tornado>=6.2 34 | Werkzeug>=2.2.3 35 | uc-micro-py>=1.0.1 36 | sympy>=1.11.1 37 | tabulate>=0.9.0 38 | PyYAML>=6.0 39 | pyasn1>=0.4.8 40 | pyasn1-modules>=0.2.8 41 | fsspec>=2023.3.0 42 | absl-py>=1.4.0 43 | audioread 44 | uvicorn>=0.21.1 45 | colorama>=0.4.6 46 | customtkinter 47 | torchcrepe -------------------------------------------------------------------------------- /rvcgui.py: -------------------------------------------------------------------------------- 1 | import random 2 | import string 3 | from tkinter import filedialog 4 | import soundfile as sf 5 | import tkinter as tk 6 | import customtkinter as ctk 7 | 8 | import os 9 | import sys 10 | import torch 11 | import warnings 12 | import customtkinter as ctk 13 | 14 | now_dir = os.getcwd() 15 | sys.path.append(now_dir) 16 | tmp = os.path.join(now_dir, "TEMP") 17 | os.makedirs(os.path.join(now_dir, "models"), exist_ok=True) 18 | os.makedirs(os.path.join(now_dir, "output"), exist_ok=True) 19 | os.environ["TEMP"] = tmp 20 | warnings.filterwarnings("ignore") 21 | torch.manual_seed(114514) 22 | 23 | from vc_infer_pipeline import VC 24 | from fairseq import checkpoint_utils 25 | from scipy.io import wavfile 26 | from my_utils import load_audio 27 | from infer_pack.models import SynthesizerTrnMs256NSFsid, SynthesizerTrnMs256NSFsid_nono 28 | from infer_pack.modelsv2 import SynthesizerTrnMs768NSFsid_nono, SynthesizerTrnMs768NSFsid 29 | from multiprocessing import cpu_count 30 | import threading 31 | from time import sleep 32 | from time import sleep 33 | import traceback 34 | import numpy as np 35 | import subprocess 36 | import zipfile 37 | from config import Config 38 | 39 | config = Config() 40 | 41 | 42 | 43 | def extract_model_from_zip(zip_path, output_dir): 44 | # Extract the folder name from the zip file path 45 | folder_name = os.path.splitext(os.path.basename(zip_path))[0] 46 | 47 | # Create a folder with the same name as the zip file inside the output directory 48 | output_folder = os.path.join(output_dir, folder_name) 49 | os.makedirs(output_folder, exist_ok=True) 50 | 51 | with zipfile.ZipFile(zip_path, 'r') as zip_ref: 52 | for member in zip_ref.namelist(): 53 | if (member.endswith('.pth') and not 
(os.path.basename(member).startswith("G_") or os.path.basename(member).startswith("D_")) and zip_ref.getinfo(member).file_size < 200*(1024**2)) or (member.endswith('.index') and not (os.path.basename(member).startswith("trained"))): 54 | # Extract the file to the output folder 55 | zip_ref.extract(member, output_folder) 56 | 57 | # Move the file to the top level of the output folder 58 | file_path = os.path.join(output_folder, member) 59 | new_path = os.path.join(output_folder, os.path.basename(file_path)) 60 | os.rename(file_path, new_path) 61 | 62 | print(f"Model files extracted to folder: {output_folder}") 63 | 64 | 65 | def play_audio(file_path): 66 | if sys.platform == 'win32': 67 | audio_file = os.path.abspath(file_path) 68 | subprocess.call(['start', '', audio_file], shell=True) 69 | elif sys.platform == 'darwin': 70 | audio_file = os.path.abspath(file_path) 71 | subprocess.call(['open', audio_file]) 72 | elif sys.platform == 'linux': 73 | audio_file = os.path.abspath(file_path) 74 | subprocess.call(['xdg-open', audio_file]) 75 | 76 | def get_full_path(path): 77 | return os.path.abspath(path) 78 | 79 | hubert_model = None 80 | device = config.device 81 | print(device) 82 | is_half = config.is_half 83 | 84 | def load_hubert(): 85 | global hubert_model 86 | models, _, _ = checkpoint_utils.load_model_ensemble_and_task( 87 | ["hubert_base.pt"], 88 | suffix="", 89 | ) 90 | hubert_model = models[0] 91 | hubert_model = hubert_model.to(config.device) 92 | if is_half: 93 | hubert_model = hubert_model.half() 94 | else: 95 | hubert_model = hubert_model.float() 96 | hubert_model.eval() 97 | 98 | 99 | def vc_single( 100 | sid, 101 | input_audio, 102 | f0_up_key, 103 | f0_file, 104 | f0_method, 105 | file_index, 106 | index_rate, 107 | crepe_hop_length, 108 | output_path=None, 109 | ): # spk_item, input_audio0, vc_transform0,f0_file,f0method0 110 | global tgt_sr, net_g, vc, hubert_model 111 | if input_audio is None: 112 | return "You need to upload an audio", None 113 | f0_up_key = int(f0_up_key) 114 | try: 115 | audio = load_audio(input_audio, 16000) 116 | times = [0, 0, 0] 117 | if hubert_model is None: 118 | load_hubert() 119 | if_f0 = cpt.get("f0", 1) 120 | file_index = ( 121 | file_index.strip(" ") 122 | .strip('"') 123 | .strip("\n") 124 | .strip('"') 125 | .strip(" ") 126 | .replace("trained", "added") 127 | ) # users often point at the "trained" index by mistake; swap in the "added" one automatically 128 | 129 | audio_opt = vc.pipeline( 130 | hubert_model, 131 | net_g, 132 | sid, 133 | audio, 134 | times, 135 | f0_up_key, 136 | f0_method, 137 | file_index, 138 | # file_big_npy, 139 | index_rate, 140 | if_f0, 141 | version, 142 | crepe_hop_length, 143 | None, 144 | ) 145 | print( 146 | "npy: ", times[0], "s, f0: ", times[1], "s, infer: ", times[2], "s", sep="" 147 | ) 148 | 149 | if output_path is not None: 150 | sf.write(output_path, audio_opt, tgt_sr, format='WAV') 151 | 152 | return "Success", (tgt_sr, audio_opt) 153 | except: 154 | info = traceback.format_exc() 155 | print(info) 156 | return info, (None, None) 157 | 158 | 159 | def vc_multi( 160 | sid, 161 | dir_path, 162 | opt_root, 163 | paths, 164 | f0_up_key, 165 | f0_method, 166 | file_index, 167 | index_rate, 168 | ): 169 | try: 170 | dir_path = ( 171 | dir_path.strip(" ").strip('"').strip("\n").strip('"').strip(" ") 172 | ) # strip stray spaces, quotes, and newlines copied in with the path 173 | opt_root = opt_root.strip(" ").strip( 174 | '"').strip("\n").strip('"').strip(" ") 175 | os.makedirs(opt_root, exist_ok=True) 176 | try: 177 | if dir_path != "": 178 | paths = [os.path.join(dir_path, name) 179 | for name in os.listdir(dir_path)] 180 | else: 181
| paths = [path.name for path in paths] 182 | except: 183 | traceback.print_exc() 184 | paths = [path.name for path in paths] 185 | infos = [] 186 | for path in paths: 187 | info, opt = vc_single( 188 | sid, 189 | path, 190 | f0_up_key, 191 | None, 192 | f0_method, 193 | file_index, 194 | index_rate, 195 | ) 196 | if info == "Success": 197 | try: 198 | tgt_sr, audio_opt = opt 199 | wavfile.write( 200 | "%s/%s" % (opt_root, os.path.basename(path) 201 | ), tgt_sr, audio_opt 202 | ) 203 | except: 204 | info = traceback.format_exc() 205 | infos.append("%s->%s" % (os.path.basename(path), info)) 206 | yield "\n".join(infos) 207 | yield "\n".join(infos) 208 | except: 209 | yield traceback.format_exc() 210 | 211 | 212 | # only one voice model can be loaded globally per tab 213 | def get_vc(weight_root, sid): 214 | global n_spk, tgt_sr, net_g, vc, cpt, version 215 | if sid == "" or sid == []: 216 | global hubert_model 217 | if hubert_model is not None: # polling may call this repeatedly, so check whether sid switched from a loaded model to none 218 | print("clean_empty_cache") 219 | del net_g, n_spk, vc, hubert_model, tgt_sr # ,cpt 220 | hubert_model = net_g = n_spk = vc = tgt_sr = None 221 | if torch.cuda.is_available(): 222 | torch.cuda.empty_cache() 223 | ### without the steps below the cache is not fully cleaned up 224 | if_f0 = cpt.get("f0", 1) 225 | version = cpt.get("version", "v1") 226 | if version == "v1": 227 | if if_f0 == 1: 228 | net_g = SynthesizerTrnMs256NSFsid( 229 | *cpt["config"], is_half=config.is_half 230 | ) 231 | else: 232 | net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"]) 233 | elif version == "v2": 234 | if if_f0 == 1: 235 | net_g = SynthesizerTrnMs768NSFsid( 236 | *cpt["config"], is_half=config.is_half 237 | ) 238 | else: 239 | net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"]) 240 | del net_g, cpt 241 | if torch.cuda.is_available(): 242 | torch.cuda.empty_cache() 243 | cpt = None 244 | return {"visible": False, "__type__": "update"} 245 | person = weight_root 246 | print("loading %s" % person) 247 | cpt = torch.load(person, map_location="cpu") 248 | tgt_sr = cpt["config"][-1] 249 | cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0] # n_spk 250 | if_f0 = cpt.get("f0", 1) 251 | version = cpt.get("version", "v1") 252 | if version == "v1": 253 | if if_f0 == 1: 254 | net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=config.is_half) 255 | else: 256 | net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"]) 257 | elif version == "v2": 258 | if if_f0 == 1: 259 | net_g = SynthesizerTrnMs768NSFsid(*cpt["config"], is_half=config.is_half) 260 | else: 261 | net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"]) 262 | del net_g.enc_q 263 | print(net_g.load_state_dict(cpt["weight"], strict=False)) 264 | net_g.eval().to(config.device) 265 | if config.is_half: 266 | net_g = net_g.half() 267 | else: 268 | net_g = net_g.float() 269 | vc = VC(tgt_sr, config) 270 | n_spk = cpt["config"][-3] 271 | return {"visible": True, "maximum": n_spk, "__type__": "update"} 272 | 273 | 274 | def clean(): 275 | return {"value": "", "__type__": "update"} 276 | 277 | 278 | def if_done(done, p): 279 | while True: 280 | if p.poll() is None: 281 | sleep(0.5) 282 | else: 283 | break 284 | done[0] = True 285 | 286 | 287 | def if_done_multi(done, ps): 288 | while True: 289 | # poll() is None means the process has not finished yet 290 | # keep waiting as long as any process is still running 291 | flag = 1 292 | for p in ps: 293 | if p.poll() is None: 294 | flag = 0 295 | sleep(0.5) 296 | break 297 | if flag == 1: 298 | break 299 | done[0] = True 300 | 301 | 302 | # window 303 | 304 | 305 | def outputkey(length=5): 306 | # generate all possible characters 307 | characters = string.ascii_letters +
string.digits 308 | return ''.join(random.choices(characters, k=length)) 309 | # choose `length` characters randomly from the list and join them into a string 310 | 311 | def refresh_model_list(): 312 | global model_folders 313 | model_folders = [f for f in os.listdir(models_dir) if os.path.isdir(os.path.join( 314 | models_dir, f)) and any(f.endswith(".pth") for f in os.listdir(os.path.join(models_dir, f)))] 315 | model_list.configure(values=model_folders) 316 | model_list.update() 317 | 318 | def browse_zip(): 319 | global zip_file 320 | zip_file = filedialog.askopenfilename( 321 | initialdir=os.getcwd(), 322 | title="Select file", 323 | filetypes=(("zip files", "*.zip"), ("all files", "*.*")), 324 | ) 325 | extract_model_from_zip(zip_file, models_dir) 326 | refresh_model_list() 327 | 328 | def get_output_path(file_path): 329 | 330 | if not os.path.exists(file_path): 331 | # change the file extension to .wav 332 | 333 | return file_path # File path does not exist, return as is 334 | 335 | # Split file path into directory, base filename, and extension 336 | dir_name, file_name = os.path.split(file_path) 337 | file_name, file_ext = os.path.splitext(file_name) 338 | 339 | # Initialize index to 1 340 | index = 1 341 | 342 | # Increment index until a new file path is found 343 | while True: 344 | new_dir = f"{dir_name}\\{chosenOne}\\" 345 | new_file_name = f"{file_name}_RVC_{index}{file_ext}" 346 | new_file_path = os.path.join(new_dir, new_file_name) 347 | if not os.path.exists(new_file_path): 348 | # change the file extension to .wav 349 | if not os.path.exists(new_dir): 350 | os.makedirs(new_dir) 351 | new_file_path = os.path.splitext(new_file_path)[0] + ".wav" 352 | return new_file_path # Found new file path, return it 353 | index += 1 354 | 355 | def on_button_click(): 356 | output_audio_frame.pack_forget() 357 | result_state.pack_forget() 358 | run_button.configure(state="disabled") 359 | 360 | # Get values from user input widgets 361 | sid = sid_entry.get() 362 | input_audio = input_audio_entry.get() 363 | f0_pitch = round(f0_pitch_entry.get()) 364 | crepe_hop_length = round((crepe_hop_length_entry.get()) * 64) 365 | f0_file = None 366 | f0_method = f0_method_entry.get() 367 | file_index = file_index_entry.get() 368 | # file_big_npy = file_big_npy_entry.get() 369 | index_rate = round(index_rate_entry.get(),2) 370 | global output_file 371 | output_file = get_output_path(input_audio) 372 | print("sid: ", sid, "input_audio: ", input_audio, "f0_pitch: ", f0_pitch, "f0_file: ", f0_file, "f0_method: ", f0_method, 373 | "file_index: ", file_index, "file_big_npy: ", "index_rate: ", index_rate, "output_file: ", output_file) 374 | # Call the vc_single function with the user input values 375 | if model_loaded == True and os.path.isfile(input_audio): 376 | try: 377 | loading_frame.pack(padx=10, pady=10) 378 | loading_progress.start() 379 | 380 | result, audio_opt = vc_single( 381 | 0, input_audio, f0_pitch, None, f0_method, file_index, index_rate,crepe_hop_length, output_file) 382 | # output_label.configure(text=result + "\n saved at" + output_file) 383 | print(os.path.join(output_file)) 384 | if os.path.exists(output_file) and os.path.getsize(output_file) > 0: 385 | print(output_file) 386 | 387 | run_button.configure(state="enabled") 388 | message = result 389 | result_state.configure(text_color="green") 390 | last_output_file.configure(text=output_file) 391 | output_audio_frame.pack(padx=10, pady=10) 392 | else: 393 | message = result 394 | result_state.configure(text_color="red") 395 | 396 | 
except Exception as e: 397 | print(e) 398 | message = "Voice conversion failed", e 399 | 400 | # Update the output label with the result 401 | # output_label.configure(text=result + "\n saved at" + output_file) 402 | 403 | run_button.configure(state="enabled") 404 | else: 405 | message = "Please select a model and input audio file" 406 | run_button.configure(state="enabled") 407 | result_state.configure(text_color="red") 408 | 409 | loading_progress.stop() 410 | loading_frame.pack_forget() 411 | result_state.pack(padx=10, pady=10, side="top") 412 | result_state.configure(text=message) 413 | 414 | 415 | def browse_file(): 416 | filepath = filedialog.askopenfilename ( 417 | filetypes=[("Audio Files", ["*.mp3","*.wav"])]) 418 | filepath = os.path.normpath(filepath) # Normalize file path 419 | input_audio_entry.delete(0, tk.END) 420 | input_audio_entry.insert(0, filepath) 421 | 422 | 423 | 424 | def start_processing(): 425 | 426 | t = threading.Thread(target=on_button_click) 427 | t.start() 428 | 429 | 430 | # Create tkinter window and widgets 431 | root = ctk.CTk() 432 | ctk.set_appearance_mode("dark") 433 | root.title("RVC GUI") 434 | # Get screen dimensions 435 | screen_width = root.winfo_screenwidth() 436 | screen_height = root.winfo_screenheight() 437 | 438 | # Set GUI dimensions as a percentage of screen size 439 | 440 | gui_height = int(screen_height * 0.85) # 80% of screen height 441 | gui_dimensions = f"800x{gui_height}" 442 | 443 | root.geometry(gui_dimensions) 444 | root.resizable(False, True) 445 | 446 | model_loaded = False 447 | 448 | def selected_model(choice): 449 | global chosenOne 450 | chosenOne = choice 451 | file_index_entry.delete(0, ctk.END) 452 | model_dir = os.path.join(models_dir, choice) 453 | pth_files = [f for f in os.listdir(model_dir) if os.path.isfile(os.path.join(model_dir, f)) 454 | and f.endswith(".pth") and not (f.startswith("G_") or f.startswith("D_")) 455 | and os.path.getsize(os.path.join(model_dir, f)) < 200*(1024**2)] 456 | 457 | if pth_files: 458 | global pth_file_path 459 | pth_file_path = os.path.join(model_dir, pth_files[0]) 460 | npy_files = [f for f in os.listdir(model_dir) if os.path.isfile(os.path.join(model_dir, f)) 461 | and f.endswith(".index")] 462 | if npy_files: 463 | npy_files_dir = [os.path.join(model_dir, f) for f in npy_files] 464 | if len(npy_files_dir) == 1: 465 | index_file = npy_files_dir[0] 466 | print(f".pth file directory: {pth_file_path}") 467 | print(f".index file directory: {index_file}") 468 | file_index_entry.insert(0, os.path.normpath(index_file)) 469 | else: 470 | print(f"Incomplete set of .index files found in {model_dir}") 471 | else: 472 | print(f"No .index files found in {model_dir}") 473 | get_vc(pth_file_path, 0) 474 | global model_loaded 475 | model_loaded = True 476 | else: 477 | print(f"No eligible .pth files found in {model_dir}") 478 | 479 | 480 | def index_slider_event(value): 481 | index_rate_label.configure( 482 | text='Feature retrieval rate: %s' % round(value, 2)) 483 | # print(value) 484 | 485 | 486 | def pitch_slider_event(value): 487 | f0_pitch_label.configure(text='Pitch: %s' % round(value)) 488 | # print(value) 489 | 490 | def crepe_hop_length_slider_event(value): 491 | crepe_hop_length_label.configure(text='crepe hop: %s' % round((value) * 64)) 492 | # print(value) 493 | 494 | 495 | # hide crepe hop length slider if crepe is not selected 496 | def crepe_hop_length_slider_visibility(value): 497 | if value == "crepe" or value == "crepe-tiny": 498 | crepe_hop_length_label.grid(row=2, column=0, padx=10, 
pady=5, ) 499 | crepe_hop_length_entry.grid(row=2, column=1, padx=10, pady=5, ) 500 | else: 501 | crepe_hop_length_label.grid_remove() 502 | crepe_hop_length_entry.grid_remove() 503 | 504 | def update_config(selected): 505 | global device, is_half # declare newconfig as a global variable 506 | if selected == "GPU": 507 | device = "cuda:0" 508 | # is_half = True 509 | else: 510 | if torch.backends.mps.is_available(): 511 | device = "mps" 512 | # is_half = False 513 | else: 514 | device = "cpu" 515 | is_half = False 516 | 517 | config.device = device 518 | config.is_half = is_half 519 | 520 | 521 | if "pth_file_path" in globals(): 522 | load_hubert() 523 | get_vc(pth_file_path, 0) 524 | 525 | 526 | models_dir = "./models" 527 | model_folders = [f for f in os.listdir(models_dir) if os.path.isdir(os.path.join( 528 | models_dir, f)) and any(f.endswith(".pth") for f in os.listdir(os.path.join(models_dir, f)))] 529 | 530 | 531 | master_frame = ctk.CTkFrame(master=root, height=500) 532 | master_frame.pack(padx=5, pady=5) 533 | 534 | 535 | left_frame = ctk.CTkFrame(master=master_frame, ) 536 | left_frame.grid(row=0, column=0, padx=10, pady=10, sticky="nsew") 537 | 538 | right_frame = ctk.CTkFrame(master=master_frame, ) 539 | right_frame.grid(row=0, column=1, pady=10, padx=10, sticky="nsew") 540 | 541 | 542 | inputpath_frame = ctk.CTkFrame(master=left_frame) 543 | inputpath_frame.grid(row=0, column=0, padx=15, pady=10, sticky="nsew") 544 | 545 | 546 | output_audio_frame = ctk.CTkFrame(master=root) 547 | 548 | select_model_frame = ctk.CTkFrame(left_frame) 549 | select_model_frame.grid(row=1, column=0, padx=15, pady=10, sticky="nsew") 550 | 551 | pitch_frame = ctk.CTkFrame(left_frame) 552 | pitch_frame.grid(row=3, column=0, padx=10, pady=5, sticky="nsew") 553 | 554 | 555 | 556 | # Get the list of .pth files in the models directory 557 | 558 | 559 | 560 | sid_label = ctk.CTkLabel(select_model_frame, text="Speaker ID:") 561 | sid_entry = ctk.CTkEntry(select_model_frame) 562 | sid_entry.insert(0, "0") 563 | sid_entry.configure(state="disabled") 564 | 565 | # intiilizing model select widget 566 | select_model = ctk.StringVar(value="Select a model") 567 | model_list = ctk.CTkOptionMenu(select_model_frame, values=model_folders, 568 | command=selected_model, 569 | variable=select_model 570 | ) 571 | 572 | # intiilizing audio file input widget 573 | input_audio_label = ctk.CTkLabel(inputpath_frame, text="Input audio file:") 574 | browse_button = ctk.CTkButton( 575 | inputpath_frame, text="Browse", command=browse_file) 576 | input_audio_entry = ctk.CTkEntry(inputpath_frame) 577 | 578 | # intiilizing pitch widget 579 | f0_pitch_label = ctk.CTkLabel(pitch_frame, text="Pitch: 0") 580 | f0_pitch_entry = ctk.CTkSlider( 581 | pitch_frame, from_=-20, to=20, number_of_steps=100, command=pitch_slider_event, ) 582 | f0_pitch_entry.set(0) 583 | 584 | # intiilizing crepe hop length widget 585 | crepe_hop_length_label = ctk.CTkLabel(pitch_frame, text="crepe hop: 128") 586 | crepe_hop_length_entry = ctk.CTkSlider( 587 | pitch_frame, from_=1, to=8, number_of_steps=7, command=crepe_hop_length_slider_event) 588 | crepe_hop_length_entry.set(2) 589 | 590 | # intiilizing f0 file widget 591 | #f0_file_label = ctk.CTkLabel(right_frame, text="F0 file (Optional/Not Tested)") 592 | #f0_file_entry = ctk.CTkEntry(right_frame, width=250) 593 | 594 | # intiilizing f0 method widget 595 | f0_method_label = ctk.CTkLabel( 596 | pitch_frame, text="F0 method") 597 | f0_method_entry = ctk.CTkSegmentedButton( 598 | pitch_frame, height=40, 
values=["dio", "pm","harvest", "crepe", "crepe-tiny" ], command=crepe_hop_length_slider_visibility) 599 | f0_method_entry.set("dio") 600 | 601 | # intiilizing index file widget 602 | file_index_label = ctk.CTkLabel(right_frame, text=".index File (Recommended)") 603 | file_index_entry = ctk.CTkEntry(right_frame, width=250) 604 | 605 | # intiilizing big npy file widget 606 | 607 | 608 | 609 | # intiilizing index rate widget 610 | index_rate_entry = ctk.CTkSlider( 611 | right_frame, from_=0, to=1, number_of_steps=20, command=index_slider_event, ) 612 | index_rate_entry.set(0.4) 613 | index_rate_label = ctk.CTkLabel( 614 | right_frame, text="Feature retrieval rate: 0.4" ) 615 | 616 | # intiilizing run button widget 617 | run_button = ctk.CTkButton( 618 | left_frame, fg_color="green", hover_color="darkgreen", text="Convert", command=start_processing) 619 | 620 | # intiilizing output label widget 621 | output_label = ctk.CTkLabel(right_frame, text="") 622 | 623 | # intiilizing Notes label widget 624 | notes_label = ctk.CTkLabel(left_frame, justify="left", text_color="#8A8A8A", text="Tips: \n 1. harvest and crepe are the highest quality, but also the slowest methods. \n 2. dio and pm are the lightest and fastest methods, but also the lowest quality.") 625 | 626 | # intiilizing loading progress bar widget 627 | 628 | loading_frame = ctk.CTkFrame(master=root, width=200) 629 | 630 | laoding_label = ctk.CTkLabel(loading_frame, text="Converting..., If the window is not responding, Please wait.") 631 | laoding_label.pack(padx=10, pady=10) 632 | loading_progress = ctk.CTkProgressBar(master=loading_frame, width=200) 633 | loading_progress.configure(mode="indeterminate") 634 | loading_progress.pack(padx=10, pady=10) 635 | 636 | # intiilizing result state label widget 637 | result_state = ctk.CTkLabel( 638 | root, text="", height=50, width=100, corner_radius=10) 639 | 640 | # intiilizing change device widget 641 | change_device_label = ctk.CTkLabel( right_frame, text="Processing mode") 642 | change_device = ctk.CTkSegmentedButton( 643 | right_frame, command=lambda value: update_config(value)) 644 | change_device.configure( 645 | values=["GPU", "CPU"]) 646 | 647 | if "cpu" in device.lower() or device.lower() == "cpu": 648 | change_device.set("CPU") 649 | change_device.configure(state="disabled") 650 | 651 | else: 652 | change_device.set("GPU") 653 | 654 | # intiilizing last output label & open output button widget 655 | last_output_label = ctk.CTkLabel(output_audio_frame, text="Output path: ") 656 | last_output_file = ctk.CTkLabel(output_audio_frame, text="", text_color="green") 657 | open_output_button = ctk.CTkButton(output_audio_frame, text="Open", command=lambda: play_audio(output_file)) 658 | 659 | # intiilizing import models button widget 660 | import_moodels_button = ctk.CTkButton(right_frame, fg_color="darkred", hover_color="black", corner_radius=20, text="Import model from .zip", command=browse_zip) 661 | 662 | 663 | 664 | # button = ctk.CTkButton(root, text="Open Window", command=open_window) 665 | # button.pack() 666 | 667 | 668 | 669 | # Packing widgets into window 670 | notes_label.grid(row=5, column=0, padx=10, pady=10) 671 | change_device_label.grid(row=1, column=0, columnspan=2, padx=10, pady=5) 672 | change_device.grid(row=2, column=0, columnspan=2, padx=10, pady=5) 673 | last_output_label.grid( pady=10, row=0, column=0) 674 | last_output_file.grid( pady=10, row=0, column=1) 675 | open_output_button.grid(pady=10, row=1, column=0, columnspan=2) 676 | import_moodels_button.grid(padx=10, 
pady=10, row=0, column=0) 677 | model_list.grid(padx=10, pady=10, row=0, column=2) 678 | sid_label.grid(padx=10, pady=10, row=0, column=0) 679 | sid_entry.grid(padx=0, pady=10, row=0, column= 1) 680 | browse_button.grid(padx=10, pady=10, row=0, column=2) 681 | input_audio_label.grid(padx=10, pady=10, row=0, column=0) 682 | input_audio_entry.grid(padx=10, pady=10, row=0, column=1) 683 | f0_method_label.grid(padx=10, pady=10, row=0, column=0) 684 | f0_method_entry.grid(padx=10, pady=10, row=0, column=1) 685 | #crepe_hop_length_label.grid(padx=10, pady=10, row=1, column=0) 686 | #crepe_hop_length_entry.grid(padx=10, pady=10, row=1, column=1) 687 | f0_pitch_label.grid(padx=10, pady=10, row=3, column=0) 688 | f0_pitch_entry.grid(padx=10, pady=10, row=3, column=1) 689 | #0_file_label.grid(padx=10, pady=10) 690 | #f0_file_entry.grid(padx=10, pady=10) 691 | file_index_label.grid(padx=10, pady=10) 692 | file_index_entry.grid(padx=10, pady=10) 693 | 694 | 695 | index_rate_label.grid(padx=10, pady=10) 696 | index_rate_entry.grid(padx=10, pady=10) 697 | run_button.grid(padx=30, pady=30, row=4, column=0, columnspan=2) 698 | output_label.grid(padx=0, pady=10) 699 | 700 | root.mainloop() 701 | -------------------------------------------------------------------------------- /setup.bat: -------------------------------------------------------------------------------- 1 | python -m pip install -U pip setuptools wheel 2 | pip install -U torch torchaudio --index-url https://download.pytorch.org/whl/cu118 3 | pip install -r requirements.txt -------------------------------------------------------------------------------- /trainset_preprocess_pipeline_print.py: -------------------------------------------------------------------------------- 1 | import sys, os, multiprocessing 2 | from scipy import signal 3 | 4 | now_dir = os.getcwd() 5 | sys.path.append(now_dir) 6 | 7 | inp_root = sys.argv[1] 8 | sr = int(sys.argv[2]) 9 | n_p = int(sys.argv[3]) 10 | exp_dir = sys.argv[4] 11 | noparallel = sys.argv[5] == "True" 12 | import numpy as np, os, traceback 13 | from slicer2 import Slicer 14 | import librosa, traceback 15 | from scipy.io import wavfile 16 | import multiprocessing 17 | from my_utils import load_audio 18 | 19 | mutex = multiprocessing.Lock() 20 | f = open("%s/preprocess.log" % exp_dir, "a+") 21 | 22 | 23 | def println(strr): 24 | mutex.acquire() 25 | print(strr) 26 | f.write("%s\n" % strr) 27 | f.flush() 28 | mutex.release() 29 | 30 | 31 | class PreProcess: 32 | def __init__(self, sr, exp_dir): 33 | self.slicer = Slicer( 34 | sr=sr, 35 | threshold=-40, 36 | min_length=800, 37 | min_interval=400, 38 | hop_size=15, 39 | max_sil_kept=150, 40 | ) 41 | self.sr = sr 42 | self.bh, self.ah = signal.butter(N=5, Wn=48, btype="high", fs=self.sr) 43 | self.per = 3.0 44 | self.overlap = 0.3 45 | self.tail = self.per + self.overlap 46 | self.max = 0.95 47 | self.alpha = 0.8 48 | self.exp_dir = exp_dir 49 | self.gt_wavs_dir = "%s/0_gt_wavs" % exp_dir 50 | self.wavs16k_dir = "%s/1_16k_wavs" % exp_dir 51 | os.makedirs(self.exp_dir, exist_ok=True) 52 | os.makedirs(self.gt_wavs_dir, exist_ok=True) 53 | os.makedirs(self.wavs16k_dir, exist_ok=True) 54 | 55 | def norm_write(self, tmp_audio, idx0, idx1): 56 | tmp_audio = (tmp_audio / np.abs(tmp_audio).max() * (self.max * self.alpha)) + ( 57 | 1 - self.alpha 58 | ) * tmp_audio 59 | wavfile.write( 60 | "%s/%s_%s.wav" % (self.gt_wavs_dir, idx0, idx1), 61 | self.sr, 62 | tmp_audio.astype(np.float32), 63 | ) 64 | tmp_audio = librosa.resample(tmp_audio, orig_sr=self.sr, 
    def pipeline(self, path, idx0):
        try:
            audio = load_audio(path, self.sr)
            # a zero-phase digital filter causes pre-ringing noise, so lfilter is used instead
            # audio = signal.filtfilt(self.bh, self.ah, audio)
            audio = signal.lfilter(self.bh, self.ah, audio)

            idx1 = 0
            for audio in self.slicer.slice(audio):
                i = 0
                while 1:
                    start = int(self.sr * (self.per - self.overlap) * i)
                    i += 1
                    if len(audio[start:]) > self.tail * self.sr:
                        tmp_audio = audio[start : start + int(self.per * self.sr)]
                        self.norm_write(tmp_audio, idx0, idx1)
                        idx1 += 1
                    else:
                        # keep the remainder (shorter than per) as its own chunk
                        tmp_audio = audio[start:]
                        idx1 += 1
                        break
                self.norm_write(tmp_audio, idx0, idx1)
            println("%s->Suc." % path)
        except:
            println("%s->%s" % (path, traceback.format_exc()))

    def pipeline_mp(self, infos):
        for path, idx0 in infos:
            self.pipeline(path, idx0)

    def pipeline_mp_inp_dir(self, inp_root, n_p):
        try:
            infos = [
                ("%s/%s" % (inp_root, name), idx)
                for idx, name in enumerate(sorted(list(os.listdir(inp_root))))
            ]
            if noparallel:
                for i in range(n_p):
                    self.pipeline_mp(infos[i::n_p])
            else:
                ps = []
                for i in range(n_p):
                    p = multiprocessing.Process(
                        target=self.pipeline_mp, args=(infos[i::n_p],)
                    )
                    p.start()
                    ps.append(p)
                for p in ps:
                    p.join()
        except:
            println("Fail. %s" % traceback.format_exc())


def preprocess_trainset(inp_root, sr, n_p, exp_dir):
    pp = PreProcess(sr, exp_dir)
    println("start preprocess")
    println(sys.argv)
    pp.pipeline_mp_inp_dir(inp_root, n_p)
    println("end preprocess")


if __name__ == "__main__":
    preprocess_trainset(inp_root, sr, n_p, exp_dir)
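# Illustrative invocation (argument order follows the sys.argv parsing at the
# top of this file: inp_root, sr, n_p, exp_dir, noparallel; paths are examples):
#
#   python trainset_preprocess_pipeline_print.py ./raw_wavs 40000 4 logs/my-exp False
#
# This filters, slices, and normalizes every file in inp_root, writing
# <exp_dir>/0_gt_wavs (at sr) and <exp_dir>/1_16k_wavs (at 16 kHz).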
--------------------------------------------------------------------------------
/vc_infer_pipeline.py:
--------------------------------------------------------------------------------
import numpy as np, parselmouth, torch
from time import time as ttime
import torch.nn.functional as F
import torchcrepe  # Fork feature. Use the crepe f0 algorithm. New dependency (pip install torchcrepe)
import pyworld, os, traceback, faiss
from scipy import signal
from torch import Tensor  # Fork Feature. Used for pitch prediction in the torchcrepe f0 inference computation

bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000)


class VC(object):
    def __init__(self, tgt_sr, config):
        self.x_pad, self.x_query, self.x_center, self.x_max, self.is_half = (
            config.x_pad,
            config.x_query,
            config.x_center,
            config.x_max,
            config.is_half,
        )
        self.sr = 16000  # HuBERT input sample rate
        self.window = 160  # samples per frame
        self.t_pad = self.sr * self.x_pad  # padding added before/after each chunk
        self.t_pad_tgt = tgt_sr * self.x_pad
        self.t_pad2 = self.t_pad * 2
        self.t_query = self.sr * self.x_query  # search radius around each cut point
        self.t_center = self.sr * self.x_center  # spacing between cut-point queries
        self.t_max = self.sr * self.x_max  # duration threshold below which no cutting is needed
        self.device = config.device

    # region f0 Overhaul Region
    # Fork Feature: Get the best torch device to use for f0 algorithms that
    # require a torch device. Returns a torch.device.
    def get_optimal_torch_device(self, index: int = 0) -> torch.device:
        # Prefer a CUDA device
        if torch.cuda.is_available():
            return torch.device(f"cuda:{index % torch.cuda.device_count()}")  # Very fast
        elif torch.backends.mps.is_available():
            return torch.device("mps")
        # TODO: an extra branch could grab "xla" devices here; requires the torch_xla.core.xla_model library
        # Otherwise fall back to the CPU
        return torch.device("cpu")

    # Get the f0 via the parselmouth computation
    def get_f0_pm_computation(self, x, time_step, f0_min, f0_max, p_len):
        f0 = (
            parselmouth.Sound(x, self.sr)
            .to_pitch_ac(
                time_step=time_step / 1000,
                voicing_threshold=0.6,
                pitch_floor=f0_min,
                pitch_ceiling=f0_max,
            )
            .selected_array["frequency"]
        )
        pad_size = (p_len - len(f0) + 1) // 2
        if pad_size > 0 or p_len - len(f0) - pad_size > 0:
            f0 = np.pad(
                f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant"
            )
        return f0

    # Get the f0 via the pyworld computation. Fork Feature: +dio along with harvest
    def get_f0_pyworld_computation(self, x, f0_min, f0_max, f0_type):
        if f0_type == "harvest":
            f0, t = pyworld.harvest(
                x.astype(np.double),
                fs=self.sr,
                f0_ceil=f0_max,
                f0_floor=f0_min,
                frame_period=10,
            )
        elif f0_type == "dio":
            f0, t = pyworld.dio(
                x.astype(np.double),
                fs=self.sr,
                f0_ceil=f0_max,
                f0_floor=f0_min,
                frame_period=10,
            )
        f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.sr)
        f0 = signal.medfilt(f0, 3)
        return f0
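    # Frame-alignment note: with frame_period=10 (ms) at self.sr = 16000,
    # harvest/dio emit one f0 value per 160 samples, which is exactly
    # self.window, the per-frame hop used everywhere else in this pipeline.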
    # Fork Feature: Get the f0 via the crepe algorithm from torchcrepe
    def get_f0_crepe_computation(
        self,
        x,
        f0_min,
        f0_max,
        p_len,
        hop_length=128,  # 512 before. Hop length changes how quickly the pitch track can jump; lower hop lengths mean more pitch accuracy but longer inference time.
        model="full",  # Either use crepe-tiny "tiny" or crepe "full". Default is full
    ):
        x = x.astype(np.float32)  # fixes the F.conv2d exception: double had to be converted to float
        x /= np.quantile(np.abs(x), 0.999)
        torch_device = self.get_optimal_torch_device()
        audio = torch.from_numpy(x).to(torch_device, copy=True)
        audio = torch.unsqueeze(audio, dim=0)
        if audio.ndim == 2 and audio.shape[0] > 1:
            audio = torch.mean(audio, dim=0, keepdim=True).detach()
        audio = audio.detach()
        print("Initiating prediction with a crepe_hop_length of: " + str(hop_length))
        pitch: Tensor = torchcrepe.predict(
            audio,
            self.sr,
            hop_length,
            f0_min,
            f0_max,
            model,
            batch_size=hop_length * 2,
            device=torch_device,
            pad=True,
        )
        p_len = p_len or x.shape[0] // hop_length
        # Resize the pitch curve to p_len frames for the final f0
        source = np.array(pitch.squeeze(0).cpu().float().numpy())
        source[source < 0.001] = np.nan
        target = np.interp(
            np.arange(0, len(source) * p_len, len(source)) / p_len,
            np.arange(0, len(source)),
            source,
        )
        f0 = np.nan_to_num(target)
        return f0  # Resized f0

    # endregion

    def get_f0(self, x, p_len, f0_up_key, f0_method, crepe_hop_length, inp_f0=None):
        time_step = self.window / self.sr * 1000
        f0_min = 50
        f0_max = 1100
        f0_mel_min = 1127 * np.log(1 + f0_min / 700)
        f0_mel_max = 1127 * np.log(1 + f0_max / 700)
        if f0_method == "pm":
            f0 = self.get_f0_pm_computation(x, time_step, f0_min, f0_max, p_len)
        elif f0_method == "harvest":
            f0 = self.get_f0_pyworld_computation(x, f0_min, f0_max, "harvest")
        elif f0_method == "dio":  # Fork Feature
            f0 = self.get_f0_pyworld_computation(x, f0_min, f0_max, "dio")
        elif f0_method == "crepe":  # Fork Feature: the crepe f0 algorithm
            f0 = self.get_f0_crepe_computation(x, f0_min, f0_max, p_len, crepe_hop_length)
        elif f0_method == "crepe-tiny":  # Fork Feature: the crepe-tiny model
            f0 = self.get_f0_crepe_computation(x, f0_min, f0_max, p_len, crepe_hop_length, "tiny")

        print("Using the following f0 method: " + f0_method)
        f0 *= pow(2, f0_up_key / 12)
        # with open("test.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()]))
        tf0 = self.sr // self.window  # f0 points per second
        if inp_f0 is not None:
            delta_t = np.round(
                (inp_f0[:, 0].max() - inp_f0[:, 0].min()) * tf0 + 1
            ).astype("int16")
            replace_f0 = np.interp(
                list(range(delta_t)), inp_f0[:, 0] * 100, inp_f0[:, 1]
            )
            shape = f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)].shape[0]
            f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)] = replace_f0[
                :shape
            ]
        # with open("test_opt.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()]))
        f0bak = f0.copy()
        f0_mel = 1127 * np.log(1 + f0 / 700)
        f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (
            f0_mel_max - f0_mel_min
        ) + 1
        f0_mel[f0_mel <= 1] = 1
        f0_mel[f0_mel > 255] = 255
        f0_coarse = np.rint(f0_mel).astype(int)  # np.int was removed in NumPy >= 1.24

        return f0_coarse, f0bak
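    # Worked example of the coarse quantization above (values approximate):
    # for f0 = 440 Hz,
    #   f0_mel     = 1127 * ln(1 + 440/700)  ~  549.6
    #   f0_mel_min = 1127 * ln(1 +  50/700)  ~   77.8
    #   f0_mel_max = 1127 * ln(1 + 1100/700) ~ 1064.4
    #   coarse     = (549.6 - 77.8) * 254 / (1064.4 - 77.8) + 1  ~  122
    # so every f0 in [50, 1100] Hz lands in an integer bin 1..255, and
    # unvoiced frames (f0 = 0) are clamped to bin 1.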
    def vc(
        self,
        model,
        net_g,
        sid,
        audio0,
        pitch,
        pitchf,
        times,
        index,
        big_npy,
        index_rate,
        version,
    ):  # ,file_index,file_big_npy
        feats = torch.from_numpy(audio0)
        if self.is_half:
            feats = feats.half()
        else:
            feats = feats.float()
        if feats.dim() == 2:  # stereo: average the two channels
            feats = feats.mean(-1)
        assert feats.dim() == 1, feats.dim()
        feats = feats.view(1, -1)
        padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False)

        inputs = {
            "source": feats.to(self.device),
            "padding_mask": padding_mask,
            "output_layer": 9 if version == "v1" else 12,
        }
        t0 = ttime()
        with torch.no_grad():
            logits = model.extract_features(**inputs)
            feats = model.final_proj(logits[0]) if version == "v1" else logits[0]

        if index is not None and big_npy is not None and index_rate != 0:
            npy = feats[0].cpu().numpy()
            if self.is_half:
                npy = npy.astype("float32")

            # _, I = index.search(npy, 1)
            # npy = big_npy[I.squeeze()]

            score, ix = index.search(npy, k=8)
            weight = np.square(1 / score)
            weight /= weight.sum(axis=1, keepdims=True)
            npy = np.sum(big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)

            if self.is_half:
                npy = npy.astype("float16")
            feats = (
                torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate
                + (1 - index_rate) * feats
            )

        feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
        t1 = ttime()
        p_len = audio0.shape[0] // self.window
        if feats.shape[1] < p_len:
            p_len = feats.shape[1]
            if pitch is not None and pitchf is not None:
                pitch = pitch[:, :p_len]
                pitchf = pitchf[:, :p_len]
        p_len = torch.tensor([p_len], device=self.device).long()
        with torch.no_grad():
            if pitch is not None and pitchf is not None:
                audio1 = (
                    (net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0])
                    .data.cpu()
                    .float()
                    .numpy()
                )
            else:
                audio1 = (
                    (net_g.infer(feats, p_len, sid)[0][0, 0]).data.cpu().float().numpy()
                )
        del feats, p_len, padding_mask
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        t2 = ttime()
        times[0] += t1 - t0
        times[2] += t2 - t1
        return audio1
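    # Feature retrieval in vc() above: each HuBERT frame is replaced by an
    # inverse-square-distance weighted average of its k=8 nearest neighbors
    # from the training-set index, then blended back linearly:
    #
    #   weight_j = (1 / score_j**2) / sum_k (1 / score_k**2)
    #   feats    = index_rate * sum_j weight_j * big_npy[ix_j]
    #              + (1 - index_rate) * feats
    #
    # index_rate = 0 bypasses retrieval entirely; index_rate = 1 keeps only
    # retrieved training-set features.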
    def pipeline(
        self,
        model,
        net_g,
        sid,
        audio,
        times,
        f0_up_key,
        f0_method,
        file_index,
        # file_big_npy,
        index_rate,
        if_f0,
        version,
        crepe_hop_length,
        f0_file=None,
    ):
        if (
            file_index != ""
            # and file_big_npy != ""
            # and os.path.exists(file_big_npy)
            and os.path.exists(file_index)
            and index_rate != 0
        ):
            try:
                index = faiss.read_index(file_index)
                # big_npy = np.load(file_big_npy)
                big_npy = index.reconstruct_n(0, index.ntotal)
            except:
                traceback.print_exc()
                index = big_npy = None
        else:
            index = big_npy = None
        audio = signal.filtfilt(bh, ah, audio)
        audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect")
        opt_ts = []
        if audio_pad.shape[0] > self.t_max:
            # cut long audio where the windowed moving sum is smallest (the quietest point)
            audio_sum = np.zeros_like(audio)
            for i in range(self.window):
                audio_sum += audio_pad[i : i - self.window]
            for t in range(self.t_center, audio.shape[0], self.t_center):
                opt_ts.append(
                    t
                    - self.t_query
                    + np.where(
                        np.abs(audio_sum[t - self.t_query : t + self.t_query])
                        == np.abs(audio_sum[t - self.t_query : t + self.t_query]).min()
                    )[0][0]
                )
        s = 0
        audio_opt = []
        t = None
        t1 = ttime()
        audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect")
        p_len = audio_pad.shape[0] // self.window
        inp_f0 = None
        if hasattr(f0_file, "name"):
            try:
                with open(f0_file.name, "r") as f:
                    lines = f.read().strip("\n").split("\n")
                inp_f0 = []
                for line in lines:
                    inp_f0.append([float(i) for i in line.split(",")])
                inp_f0 = np.array(inp_f0, dtype="float32")
            except:
                traceback.print_exc()
        sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
        pitch, pitchf = None, None
        if if_f0 == 1:
            pitch, pitchf = self.get_f0(
                audio_pad, p_len, f0_up_key, f0_method, crepe_hop_length, inp_f0
            )
            pitch = pitch[:p_len]
            pitchf = pitchf[:p_len]
            pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long()
            pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float()
        t2 = ttime()
        times[1] += t2 - t1
        for t in opt_ts:
            t = t // self.window * self.window
            if if_f0 == 1:
                audio_opt.append(
                    self.vc(
                        model,
                        net_g,
                        sid,
                        audio_pad[s : t + self.t_pad2 + self.window],
                        pitch[:, s // self.window : (t + self.t_pad2) // self.window],
                        pitchf[:, s // self.window : (t + self.t_pad2) // self.window],
                        times,
                        index,
                        big_npy,
                        index_rate,
                        version,
                    )[self.t_pad_tgt : -self.t_pad_tgt]
                )
            else:
                audio_opt.append(
                    self.vc(
                        model,
                        net_g,
                        sid,
                        audio_pad[s : t + self.t_pad2 + self.window],
                        None,
                        None,
                        times,
                        index,
                        big_npy,
                        index_rate,
                        version,
                    )[self.t_pad_tgt : -self.t_pad_tgt]
                )
            s = t
        if if_f0 == 1:
            audio_opt.append(
                self.vc(
                    model,
                    net_g,
                    sid,
                    audio_pad[t:],
                    pitch[:, t // self.window :] if t is not None else pitch,
                    pitchf[:, t // self.window :] if t is not None else pitchf,
                    times,
                    index,
                    big_npy,
                    index_rate,
                    version,
                )[self.t_pad_tgt : -self.t_pad_tgt]
            )
        else:
            audio_opt.append(
                self.vc(
                    model,
                    net_g,
                    sid,
                    audio_pad[t:],
                    None,
                    None,
                    times,
                    index,
                    big_npy,
                    index_rate,
                    version,
                )[self.t_pad_tgt : -self.t_pad_tgt]
            )
        audio_opt = np.concatenate(audio_opt)
        del pitch, pitchf, sid
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        return audio_opt
--------------------------------------------------------------------------------
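# Appendix: a minimal sketch of driving VC.pipeline directly from Python.
# Everything below is illustrative: hubert_model, net_g, and config are
# placeholders for objects that rvcgui.py constructs elsewhere (config only
# needs the attributes VC.__init__ reads: x_pad, x_query, x_center, x_max,
# is_half, device), and the paths are examples.
#
#   from my_utils import load_audio
#   from vc_infer_pipeline import VC
#
#   audio = load_audio("input.wav", 16000)   # 16 kHz mono float32
#   vc = VC(tgt_sr=40000, config=config)
#   audio_out = vc.pipeline(
#       hubert_model, net_g, 0, audio, [0, 0, 0],
#       f0_up_key=0, f0_method="harvest", file_index="added.index",
#       index_rate=0.4, if_f0=1, version="v2", crepe_hop_length=128,
#   )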