├── .all-contributorsrc ├── .copier-answers.yml ├── .dockerignore ├── .editorconfig ├── .flake8 ├── .github ├── CODE_OF_CONDUCT.md ├── FUNDING.yml ├── ISSUE_TEMPLATE │ ├── 1-bug_report.yml │ ├── 2-feature-request.yml │ └── config.yml ├── PULL_REQUEST_TEMPLATE.md ├── labels.toml └── workflows │ ├── ci.yml │ ├── hacktoberfest.yml │ ├── issue-manager.yml │ ├── labels.yml │ └── poetry-upgrade.yml ├── .gitignore ├── .gitpod.yml ├── .idea ├── inspectionProfiles │ ├── Project_Default.xml │ └── profiles_settings.xml ├── misc.xml ├── modules.xml ├── so-vits-svc-fork.iml ├── vcs.xml ├── watcherTasks.xml └── workspace.xml ├── .pre-commit-config.yaml ├── .readthedocs.yml ├── CHANGELOG.md ├── CONTRIBUTING.md ├── Dockerfile ├── LICENSE ├── README.md ├── README_zh_CN.md ├── commitlint.config.js ├── docs ├── Makefile ├── _static │ ├── .gitkeep │ └── gui.png ├── changelog.md ├── conf.py ├── contributing.md ├── index.md ├── installation.md ├── make.bat └── usage.md ├── easy-installation ├── install-cn.bat └── install.bat ├── notebooks └── so-vits-svc-fork-4.0.ipynb ├── poetry.lock ├── pyproject.toml ├── renovate.json ├── setup.py ├── src └── so_vits_svc_fork │ ├── __init__.py │ ├── __main__.py │ ├── cluster │ ├── __init__.py │ └── train_cluster.py │ ├── dataset.py │ ├── default_gui_presets.json │ ├── f0.py │ ├── gui.py │ ├── hparams.py │ ├── inference │ ├── __init__.py │ ├── core.py │ └── main.py │ ├── logger.py │ ├── modules │ ├── __init__.py │ ├── attentions.py │ ├── commons.py │ ├── decoders │ │ ├── __init__.py │ │ ├── f0.py │ │ ├── hifigan │ │ │ ├── __init__.py │ │ │ ├── _models.py │ │ │ └── _utils.py │ │ └── mb_istft │ │ │ ├── __init__.py │ │ │ ├── _generators.py │ │ │ ├── _loss.py │ │ │ ├── _pqmf.py │ │ │ ├── _stft.py │ │ │ └── _stft_loss.py │ ├── descriminators.py │ ├── encoders.py │ ├── flows.py │ ├── losses.py │ ├── mel_processing.py │ ├── modules.py │ └── synthesizers.py │ ├── preprocessing │ ├── __init__.py │ ├── config_templates │ │ ├── quickvc.json │ │ ├── so-vits-svc-4.0v1-legacy.json │ │ └── so-vits-svc-4.0v1.json │ ├── preprocess_classify.py │ ├── preprocess_flist_config.py │ ├── preprocess_hubert_f0.py │ ├── preprocess_resample.py │ ├── preprocess_speaker_diarization.py │ ├── preprocess_split.py │ └── preprocess_utils.py │ ├── py.typed │ ├── train.py │ └── utils.py └── tests ├── __init__.py ├── dataset_raw └── test │ ├── LJ001-0001.wav │ ├── LJ001-0002.wav │ ├── LJ001-0003.wav │ ├── LJ001-0004.wav │ ├── LJ001-0005.wav │ ├── LJ001-0006.wav │ ├── LJ001-0007.wav │ ├── LJ001-0008.wav │ ├── LJ001-0009.wav │ ├── LJ001-0010.wav │ └── nested │ ├── LJ001-0001.wav │ └── に.wav └── test_main.py /.all-contributorsrc: -------------------------------------------------------------------------------- 1 | { 2 | "projectName": "so-vits-svc-fork", 3 | "projectOwner": "voicepaw", 4 | "repoType": "github", 5 | "repoHost": "https://github.com", 6 | "files": ["README.md"], 7 | "imageSize": 80, 8 | "commit": true, 9 | "commitConvention": "angular", 10 | "contributors": [ 11 | { 12 | "login": "34j", 13 | "name": "34j", 14 | "avatar_url": "https://avatars.githubusercontent.com/u/55338215?v=4", 15 | "profile": "https://github.com/34j", 16 | "contributions": [ 17 | "code", 18 | "ideas", 19 | "doc", 20 | "example", 21 | "infra", 22 | "maintenance", 23 | "review", 24 | "test", 25 | "tutorial", 26 | "promotion", 27 | "bug" 28 | ] 29 | }, 30 | { 31 | "login": "GarrettConway", 32 | "name": "GarrettConway", 33 | "avatar_url": "https://avatars.githubusercontent.com/u/22782004?v=4", 34 | "profile": 
"https://github.com/GarrettConway", 35 | "contributions": ["code", "bug", "doc", "review"] 36 | }, 37 | { 38 | "login": "BlueAmulet", 39 | "name": "BlueAmulet", 40 | "avatar_url": "https://avatars.githubusercontent.com/u/43395286?v=4", 41 | "profile": "https://github.com/BlueAmulet", 42 | "contributions": ["ideas", "question", "code", "maintenance"] 43 | }, 44 | { 45 | "login": "ThrowawayAccount01", 46 | "name": "ThrowawayAccount01", 47 | "avatar_url": "https://avatars.githubusercontent.com/u/125531852?v=4", 48 | "profile": "https://github.com/ThrowawayAccount01", 49 | "contributions": ["bug"] 50 | }, 51 | { 52 | "login": "MashiroSA", 53 | "name": "緋", 54 | "avatar_url": "https://avatars.githubusercontent.com/u/40637516?v=4", 55 | "profile": "https://github.com/MashiroSA", 56 | "contributions": ["doc", "bug"] 57 | }, 58 | { 59 | "login": "Lordmau5", 60 | "name": "Lordmau5", 61 | "avatar_url": "https://avatars.githubusercontent.com/u/1345036?v=4", 62 | "profile": "https://github.com/Lordmau5", 63 | "contributions": [ 64 | "bug", 65 | "code", 66 | "ideas", 67 | "maintenance", 68 | "question", 69 | "userTesting" 70 | ] 71 | }, 72 | { 73 | "login": "DL909", 74 | "name": "DL909", 75 | "avatar_url": "https://avatars.githubusercontent.com/u/71912115?v=4", 76 | "profile": "https://github.com/DL909", 77 | "contributions": ["bug"] 78 | }, 79 | { 80 | "login": "Satisfy256", 81 | "name": "Satisfy256", 82 | "avatar_url": "https://avatars.githubusercontent.com/u/101394399?v=4", 83 | "profile": "https://github.com/Satisfy256", 84 | "contributions": ["bug"] 85 | }, 86 | { 87 | "login": "pierluigizagaria", 88 | "name": "Pierluigi Zagaria", 89 | "avatar_url": "https://avatars.githubusercontent.com/u/57801386?v=4", 90 | "profile": "https://github.com/pierluigizagaria", 91 | "contributions": ["userTesting"] 92 | }, 93 | { 94 | "login": "ruckusmattster", 95 | "name": "ruckusmattster", 96 | "avatar_url": "https://avatars.githubusercontent.com/u/77196088?v=4", 97 | "profile": "https://github.com/ruckusmattster", 98 | "contributions": ["bug"] 99 | }, 100 | { 101 | "login": "Desuka-art", 102 | "name": "Desuka-art", 103 | "avatar_url": "https://avatars.githubusercontent.com/u/111822082?v=4", 104 | "profile": "https://github.com/Desuka-art", 105 | "contributions": ["bug"] 106 | }, 107 | { 108 | "login": "heyfixit", 109 | "name": "heyfixit", 110 | "avatar_url": "https://avatars.githubusercontent.com/u/41658450?v=4", 111 | "profile": "https://github.com/heyfixit", 112 | "contributions": ["doc"] 113 | }, 114 | { 115 | "login": "nerdyrodent", 116 | "name": "Nerdy Rodent", 117 | "avatar_url": "https://avatars.githubusercontent.com/u/74688049?v=4", 118 | "profile": "https://www.youtube.com/c/NerdyRodent", 119 | "contributions": ["video"] 120 | }, 121 | { 122 | "login": "xieyumc", 123 | "name": "谢宇", 124 | "avatar_url": "https://avatars.githubusercontent.com/u/47858007?v=4", 125 | "profile": "https://github.com/xieyumc", 126 | "contributions": ["doc"] 127 | }, 128 | { 129 | "login": "ColdCawfee", 130 | "name": "ColdCawfee", 131 | "avatar_url": "https://avatars.githubusercontent.com/u/79474598?v=4", 132 | "profile": "https://github.com/ColdCawfee", 133 | "contributions": ["bug"] 134 | }, 135 | { 136 | "login": "sbersier", 137 | "name": "sbersier", 138 | "avatar_url": "https://avatars.githubusercontent.com/u/34165937?v=4", 139 | "profile": "https://github.com/sbersier", 140 | "contributions": ["ideas", "userTesting", "bug"] 141 | }, 142 | { 143 | "login": "Meldoner", 144 | "name": "Meldoner", 145 | "avatar_url": 
"https://avatars.githubusercontent.com/u/43951115?v=4", 146 | "profile": "https://github.com/Meldoner", 147 | "contributions": ["bug", "ideas", "code"] 148 | }, 149 | { 150 | "login": "mmodeusher", 151 | "name": "mmodeusher", 152 | "avatar_url": "https://avatars.githubusercontent.com/u/46575920?v=4", 153 | "profile": "https://github.com/mmodeusher", 154 | "contributions": ["bug"] 155 | }, 156 | { 157 | "login": "AlonDan", 158 | "name": "AlonDan", 159 | "avatar_url": "https://avatars.githubusercontent.com/u/21152334?v=4", 160 | "profile": "https://github.com/AlonDan", 161 | "contributions": ["bug"] 162 | }, 163 | { 164 | "login": "Likkkez", 165 | "name": "Likkkez", 166 | "avatar_url": "https://avatars.githubusercontent.com/u/44336181?v=4", 167 | "profile": "https://github.com/Likkkez", 168 | "contributions": ["bug"] 169 | }, 170 | { 171 | "login": "DuctTapeGames", 172 | "name": "Duct Tape Games", 173 | "avatar_url": "https://avatars.githubusercontent.com/u/84365142?v=4", 174 | "profile": "https://github.com/DuctTapeGames", 175 | "contributions": ["bug"] 176 | }, 177 | { 178 | "login": "hxl9654", 179 | "name": "Xianglong He", 180 | "avatar_url": "https://avatars.githubusercontent.com/u/6624983?v=4", 181 | "profile": "https://tec.hxlxz.com/", 182 | "contributions": ["bug"] 183 | }, 184 | { 185 | "login": "75aosu", 186 | "name": "75aosu", 187 | "avatar_url": "https://avatars.githubusercontent.com/u/79185331?v=4", 188 | "profile": "https://github.com/75aosu", 189 | "contributions": ["bug"] 190 | }, 191 | { 192 | "login": "tonyco82", 193 | "name": "tonyco82", 194 | "avatar_url": "https://avatars.githubusercontent.com/u/56610534?v=4", 195 | "profile": "https://github.com/tonyco82", 196 | "contributions": ["bug"] 197 | }, 198 | { 199 | "login": "yxlllc", 200 | "name": "yxlllc", 201 | "avatar_url": "https://avatars.githubusercontent.com/u/33565655?v=4", 202 | "profile": "https://github.com/yxlllc", 203 | "contributions": ["ideas", "code"] 204 | }, 205 | { 206 | "login": "outhipped", 207 | "name": "outhipped", 208 | "avatar_url": "https://avatars.githubusercontent.com/u/116147475?v=4", 209 | "profile": "https://github.com/outhipped", 210 | "contributions": ["bug"] 211 | }, 212 | { 213 | "login": "escoolioinglesias", 214 | "name": "escoolioinglesias", 215 | "avatar_url": "https://avatars.githubusercontent.com/u/73505402?v=4", 216 | "profile": "https://github.com/escoolioinglesias", 217 | "contributions": ["bug", "userTesting", "video"] 218 | }, 219 | { 220 | "login": "Blacksingh", 221 | "name": "Blacksingh", 222 | "avatar_url": "https://avatars.githubusercontent.com/u/130872856?v=4", 223 | "profile": "https://github.com/Blacksingh", 224 | "contributions": ["bug"] 225 | }, 226 | { 227 | "login": "tybantarnusa", 228 | "name": "Mgs. M. 
Thoyib Antarnusa", 229 | "avatar_url": "https://avatars.githubusercontent.com/u/9532857?v=4", 230 | "profile": "http://tybantarnusa.com", 231 | "contributions": ["bug"] 232 | }, 233 | { 234 | "login": "ZeroHackz", 235 | "name": "Exosfeer", 236 | "avatar_url": "https://avatars.githubusercontent.com/u/15729496?v=4", 237 | "profile": "https://github.com/ZeroHackz", 238 | "contributions": ["bug", "code"] 239 | }, 240 | { 241 | "login": "guranon", 242 | "name": "guranon", 243 | "avatar_url": "https://avatars.githubusercontent.com/u/130421189?v=4", 244 | "profile": "https://github.com/guranon", 245 | "contributions": ["bug", "ideas", "code"] 246 | }, 247 | { 248 | "login": "alexanderkoumis", 249 | "name": "Alexander Koumis", 250 | "avatar_url": "https://avatars.githubusercontent.com/u/5108856?v=4", 251 | "profile": "https://github.com/alexanderkoumis", 252 | "contributions": ["code"] 253 | }, 254 | { 255 | "login": "acekagami", 256 | "name": "acekagami", 257 | "avatar_url": "https://avatars.githubusercontent.com/u/127201056?v=4", 258 | "profile": "https://github.com/acekagami", 259 | "contributions": ["translation"] 260 | }, 261 | { 262 | "login": "Highupech", 263 | "name": "Highupech", 264 | "avatar_url": "https://avatars.githubusercontent.com/u/114140670?v=4", 265 | "profile": "https://github.com/Highupech", 266 | "contributions": ["bug"] 267 | }, 268 | { 269 | "login": "Scorpi", 270 | "name": "Scorpi", 271 | "avatar_url": "https://avatars.githubusercontent.com/u/969654?v=4", 272 | "profile": "https://github.com/Scorpi", 273 | "contributions": ["code"] 274 | }, 275 | { 276 | "login": "maximxlss", 277 | "name": "Maximxls", 278 | "avatar_url": "https://avatars.githubusercontent.com/u/29152154?v=4", 279 | "profile": "http://maximxlss.github.io", 280 | "contributions": ["code"] 281 | }, 282 | { 283 | "login": "Star3Lord", 284 | "name": "Star3Lord", 285 | "avatar_url": "https://avatars.githubusercontent.com/u/57606931?v=4", 286 | "profile": "https://github.com/Star3Lord", 287 | "contributions": ["bug", "code"] 288 | }, 289 | { 290 | "login": "Ph0rk0z", 291 | "name": "Forkoz", 292 | "avatar_url": "https://avatars.githubusercontent.com/u/59298527?v=4", 293 | "profile": "https://github.com/Ph0rk0z", 294 | "contributions": ["bug", "code"] 295 | }, 296 | { 297 | "login": "Zerui18", 298 | "name": "Zerui Chen", 299 | "avatar_url": "https://avatars.githubusercontent.com/u/34794550?v=4", 300 | "profile": "https://github.com/Zerui18", 301 | "contributions": ["code", "ideas"] 302 | }, 303 | { 304 | "login": "shenberg", 305 | "name": "Roee Shenberg", 306 | "avatar_url": "https://avatars.githubusercontent.com/u/653972?v=4", 307 | "profile": "https://www.meimadix.com", 308 | "contributions": ["userTesting", "ideas", "code"] 309 | }, 310 | { 311 | "login": "ShinyJustyZ", 312 | "name": "Justas", 313 | "avatar_url": "https://avatars.githubusercontent.com/u/65282440?v=4", 314 | "profile": "https://github.com/ShinyJustyZ", 315 | "contributions": ["bug", "code"] 316 | }, 317 | { 318 | "login": "Onako2", 319 | "name": "Onako2", 320 | "avatar_url": "https://avatars.githubusercontent.com/u/79749977?v=4", 321 | "profile": "https://onako2.github.io/", 322 | "contributions": ["doc"] 323 | }, 324 | { 325 | "login": "4ll0w3v1l", 326 | "name": "4ll0w3v1l", 327 | "avatar_url": "https://avatars.githubusercontent.com/u/53517147?v=4", 328 | "profile": "https://github.com/4ll0w3v1l", 329 | "contributions": ["code"] 330 | }, 331 | { 332 | "login": "SamuelSwartzberg", 333 | "name": "j5y0V6b", 334 | "avatar_url": 
"https://avatars.githubusercontent.com/u/16353439?v=4", 335 | "profile": "https://github.com/SamuelSwartzberg", 336 | "contributions": ["security"] 337 | }, 338 | { 339 | "login": "marcellocirelli", 340 | "name": "marcellocirelli", 341 | "avatar_url": "https://avatars.githubusercontent.com/u/51972090?v=4", 342 | "profile": "https://github.com/marcellocirelli", 343 | "contributions": ["bug"] 344 | }, 345 | { 346 | "login": "Priyanshu-hawk", 347 | "name": "Priyanshu Patel", 348 | "avatar_url": "https://avatars.githubusercontent.com/u/76026651?v=4", 349 | "profile": "https://github.com/Priyanshu-hawk", 350 | "contributions": ["code"] 351 | }, 352 | { 353 | "login": "annagorshunova", 354 | "name": "Anna Gorshunova", 355 | "avatar_url": "https://avatars.githubusercontent.com/u/5199204?v=4", 356 | "profile": "https://github.com/annagorshunova", 357 | "contributions": ["bug", "code"] 358 | } 359 | ], 360 | "contributorsPerLine": 7, 361 | "skipCi": true, 362 | "commitType": "docs" 363 | } 364 | -------------------------------------------------------------------------------- /.copier-answers.yml: -------------------------------------------------------------------------------- 1 | # Changes here will be overwritten by Copier 2 | _commit: d5acceb 3 | _src_path: gh:34j/pypackage-template-fork 4 | add_me_as_contributor: false 5 | copyright_year: '2023' 6 | documentation: true 7 | email: 34j.95a2p@simplelogin.com 8 | full_name: 34j 9 | github_username: 34j 10 | initial_commit: false 11 | open_source_license: MIT 12 | open_with_vscode: false 13 | package_name: so_vits_svc_fork 14 | project_name: SoftVC VITS Singing Voice Conversion Fork 15 | project_short_description: A fork of so-vits-svc. 16 | project_slug: so-vits-svc-fork 17 | run_poetry_install: true 18 | setup_github: false 19 | setup_pre_commit: false 20 | setup_venv: true 21 | venv_version: '3.10' 22 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | # Ignore everything 2 | * 3 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | # http://editorconfig.org 2 | 3 | root = true 4 | 5 | [*] 6 | indent_style = space 7 | indent_size = 4 8 | trim_trailing_whitespace = true 9 | insert_final_newline = true 10 | charset = utf-8 11 | end_of_line = lf 12 | 13 | [*.bat] 14 | indent_style = tab 15 | end_of_line = crlf 16 | 17 | [LICENSE] 18 | insert_final_newline = false 19 | 20 | [Makefile] 21 | indent_style = tab 22 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | exclude = docs 3 | max-line-length = 88 4 | ignore = E203, E501, E741, E402, E712, W503, E731, E711, E226 5 | -------------------------------------------------------------------------------- /.github/CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/voicepaw/so-vits-svc-fork/0f015e32aada5cf7481f91bbe6758e574c9c5f39/.github/CODE_OF_CONDUCT.md -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | github: ["34j"] 2 | 
-------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/1-bug_report.yml: -------------------------------------------------------------------------------- 1 | name: Bug report 2 | description: Create a report to help us improve 3 | labels: [bug] 4 | body: 5 | - type: textarea 6 | id: description 7 | attributes: 8 | label: Describe the bug 9 | description: A clear and concise description of what the bug is. 10 | placeholder: Describe the bug 11 | validations: 12 | required: true 13 | - type: textarea 14 | id: reproduce 15 | attributes: 16 | label: To Reproduce 17 | description: Steps to reproduce the behavior. 18 | placeholder: To Reproduce 19 | validations: 20 | required: true 21 | - type: textarea 22 | id: context 23 | attributes: 24 | label: Additional context 25 | description: Add any other context about the problem here. 26 | placeholder: Additional context 27 | - type: input 28 | id: version 29 | attributes: 30 | label: Version 31 | description: Version of the project. 32 | placeholder: Version 33 | validations: 34 | required: true 35 | - type: input 36 | id: platform 37 | attributes: 38 | label: Platform 39 | description: Platform where the bug was found. 40 | placeholder: "Example: Windows 11 / macOS 12.0.1 / Ubuntu 20.04" 41 | validations: 42 | required: true 43 | - type: checkboxes 44 | id: terms 45 | attributes: 46 | label: Code of Conduct 47 | description: By submitting this issue, you agree to follow our 48 | [Code of Conduct](https://github.com/34j/so-vits-svc-fork/blob/main/CODE_OF_CONDUCT.md). 49 | options: 50 | - label: I agree to follow this project's Code of Conduct. 51 | required: true 52 | - type: checkboxes 53 | id: no-duplicate 54 | attributes: 55 | label: No Duplicate 56 | description: Please check [existing issues](https://github.com/34j/so-vits-svc-fork/issues) to avoid duplicates. 57 | options: 58 | - label: I have checked existing issues to avoid duplicates. 59 | required: true 60 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/2-feature-request.yml: -------------------------------------------------------------------------------- 1 | name: Feature request 2 | description: Suggest an idea for this project 3 | labels: [enhancement] 4 | body: 5 | - type: textarea 6 | id: description 7 | attributes: 8 | label: Is your feature request related to a problem? Please describe. 9 | description: A clear and concise description of what the problem is. 10 | value: I'm always frustrated when 11 | validations: 12 | required: true 13 | - type: textarea 14 | id: solution 15 | attributes: 16 | label: Describe alternatives you've considered 17 | description: A clear and concise description of any alternative solutions or features you've considered. 18 | placeholder: Describe alternatives you've considered 19 | validations: 20 | required: true 21 | - type: textarea 22 | id: context 23 | attributes: 24 | label: Additional context 25 | description: Add any other context or screenshots about the feature request here. 26 | placeholder: Additional context 27 | - type: checkboxes 28 | id: terms 29 | attributes: 30 | label: Code of Conduct 31 | description: By submitting this issue, you agree to follow our 32 | [Code of Conduct](https://github.com/34j/so-vits-svc-fork/blob/main/CODE_OF_CONDUCT.md). 
33 | options: 34 | - label: I agree to follow this project's Code of Conduct 35 | required: true 36 | - type: checkboxes 37 | id: willing 38 | attributes: 39 | label: Are you willing to resolve this issue by submitting a Pull Request? 40 | description: Remember that first-time contributors are welcome! 🙌 41 | options: 42 | - label: Yes, I have the time, and I know how to start. 43 | - label: Yes, I have the time, but I don't know how to start. I would need guidance. 44 | - label: No, I don't have the time, although I believe I could do it if I had the time... 45 | - label: No, I don't have the time and I wouldn't even know how to start. 46 | validations: 47 | required: true 48 | - type: markdown 49 | attributes: 50 | value: 👋 Have a great day and thank you for the feature request! 51 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: false 2 | contact_links: 3 | - name: Discussions 4 | url: https://github.com/34j/so-vits-svc-fork/discussions 5 | about: Please ask and answer questions here. 6 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | 7 | 8 | ### Description of change 9 | 10 | 13 | 14 | copilot:all 15 | 16 | 29 | 30 | ### Pull-Request Checklist 31 | 32 | 37 | 38 | - [ ] Code is up-to-date with the `main` branch 39 | - [ ] This pull request follows [Contributing.md](https://github.com/34j/so-vits-svc-fork/blob/main/CONTRIBUTING.md) 40 | - [ ] This pull request links relevant issues as `Fixes #0000` 41 | - [ ] `pre-commit run -a` passes with this change or ci passes 42 | - [ ] `poetry run pytest` passes with this change or ci passes 43 | - [ ] (There are new or updated unit tests validating the change) 44 | - [ ] Documentation has been updated to reflect this change 45 | - [ ] The new commits follow conventions outlined in the [conventional commit spec](https://www.conventionalcommits.org/en/v1.0.0/) 46 | 47 | 50 | -------------------------------------------------------------------------------- /.github/labels.toml: -------------------------------------------------------------------------------- 1 | [breaking] 2 | color = "ffcc00" 3 | name = "breaking" 4 | description = "Breaking change." 
5 |
6 | [bug]
7 | color = "d73a4a"
8 | name = "bug"
9 | description = "Something isn't working"
10 |
11 | [dependencies]
12 | color = "0366d6"
13 | name = "dependencies"
14 | description = "Pull requests that update a dependency file"
15 |
16 | [github_actions]
17 | color = "000000"
18 | name = "github_actions"
19 | description = "Update of github actions"
20 |
21 | [documentation]
22 | color = "1bc4a5"
23 | name = "documentation"
24 | description = "Improvements or additions to documentation"
25 |
26 | [duplicate]
27 | color = "cfd3d7"
28 | name = "duplicate"
29 | description = "This issue or pull request already exists"
30 |
31 | [enhancement]
32 | color = "a2eeef"
33 | name = "enhancement"
34 | description = "New feature or request"
35 |
36 | ["good first issue"]
37 | color = "7057ff"
38 | name = "good first issue"
39 | description = "Good for newcomers"
40 |
41 | ["help wanted"]
42 | color = "008672"
43 | name = "help wanted"
44 | description = "Extra attention is needed"
45 |
46 | [invalid]
47 | color = "e4e669"
48 | name = "invalid"
49 | description = "This doesn't seem right"
50 |
51 | [nochangelog]
52 | color = "555555"
53 | name = "nochangelog"
54 | description = "Exclude pull requests from changelog"
55 |
56 | [question]
57 | color = "d876e3"
58 | name = "question"
59 | description = "Further information is requested"
60 |
61 | [removed]
62 | color = "e99695"
63 | name = "removed"
64 | description = "Removed pieces of functionality."
65 |
66 | [tests]
67 | color = "bfd4f2"
68 | name = "tests"
69 | description = "CI, CD and testing related changes"
70 |
71 | [wontfix]
72 | color = "ffffff"
73 | name = "wontfix"
74 | description = "This will not be worked on"
75 |
76 | [discussion]
77 | color = "c2e0c6"
78 | name = "discussion"
79 | description = "Some discussion around the project"
80 |
81 | [hacktoberfest]
82 | color = "ffa663"
83 | name = "hacktoberfest"
84 | description = "Good issues for Hacktoberfest"
85 |
86 | [answered]
87 | color = "0ee2b6"
88 | name = "answered"
89 | description = "Automatically closes as answered after a delay"
90 |
91 | [waiting]
92 | color = "5f7972"
93 | name = "waiting"
94 | description = "Automatically closes if no answer after a delay"
95 |
-------------------------------------------------------------------------------- /.github/workflows/ci.yml: --------------------------------------------------------------------------------
1 | name: CI
2 |
3 | on:
4 | push:
5 | branches:
6 | - main
7 | pull_request:
8 |
9 | concurrency:
10 | group: ${{ github.head_ref || github.run_id }}
11 | cancel-in-progress: true
12 |
13 | jobs:
14 | lint:
15 | runs-on: ubuntu-latest
16 | steps:
17 | - uses: actions/checkout@v3
18 | - uses: actions/setup-python@v5
19 | with:
20 | python-version: "3.9"
21 | - uses: pre-commit/action@v3.0.1
22 |
23 | # Make sure commit messages follow the conventional commits convention:
24 | # https://www.conventionalcommits.org
25 | commitlint:
26 | name: Lint Commit Messages
27 | runs-on: ubuntu-latest
28 | steps:
29 | - uses: actions/checkout@v3
30 | with:
31 | fetch-depth: 0
32 | - uses: wagoid/commitlint-github-action@v5.5.1
33 |
34 | test:
35 | strategy:
36 | fail-fast: false
37 | matrix:
38 | python-version:
39 | - "3.8"
40 | - "3.9"
41 | - "3.10"
42 | - "3.11"
43 | # - "3.12"
44 | os:
45 | - ubuntu-latest
46 | # - windows-latest
47 | # - macOS-latest
48 | runs-on: ${{ matrix.os }}
49 | steps:
50 | - uses: actions/checkout@v3
51 | - name: Set up Python
52 | uses: actions/setup-python@v5
53 | with:
54 | python-version: ${{
matrix.python-version }} 55 | - uses: snok/install-poetry@v1.3.4 56 | - name: Install Dependencies 57 | run: poetry install 58 | shell: bash 59 | - name: Test with Pytest 60 | run: poetry run pytest --cov-report=xml 61 | shell: bash 62 | - name: Upload coverage to Codecov 63 | uses: codecov/codecov-action@v4 64 | with: 65 | token: ${{ secrets.CODECOV_TOKEN }} 66 | 67 | release: 68 | runs-on: ubuntu-latest 69 | environment: release 70 | if: github.ref == 'refs/heads/main' 71 | needs: 72 | - test 73 | - lint 74 | - commitlint 75 | 76 | steps: 77 | - uses: actions/checkout@v3 78 | with: 79 | fetch-depth: 0 80 | 81 | # Run semantic release: 82 | # - Update CHANGELOG.md 83 | # - Update version in code 84 | # - Create git tag 85 | # - Create GitHub release 86 | # - Publish to PyPI 87 | - name: Python Semantic Release 88 | uses: relekang/python-semantic-release@v7.34.6 89 | with: 90 | github_token: ${{ secrets.GITHUB_TOKEN }} 91 | pypi_token: ${{ secrets.PYPI_TOKEN }} 92 | -------------------------------------------------------------------------------- /.github/workflows/hacktoberfest.yml: -------------------------------------------------------------------------------- 1 | name: Hacktoberfest 2 | 3 | on: 4 | schedule: 5 | # Run every day in October 6 | - cron: "0 0 * 10 *" 7 | # Run on the 1st of November to revert 8 | - cron: "0 13 1 11 *" 9 | 10 | jobs: 11 | hacktoberfest: 12 | runs-on: ubuntu-latest 13 | 14 | steps: 15 | - uses: browniebroke/hacktoberfest-labeler-action@v2.3.0 16 | with: 17 | github_token: ${{ secrets.GH_PAT }} 18 | -------------------------------------------------------------------------------- /.github/workflows/issue-manager.yml: -------------------------------------------------------------------------------- 1 | name: Issue Manager 2 | 3 | on: 4 | schedule: 5 | - cron: "0 0 * * *" 6 | issue_comment: 7 | types: 8 | - created 9 | issues: 10 | types: 11 | - labeled 12 | pull_request_target: 13 | types: 14 | - labeled 15 | workflow_dispatch: 16 | 17 | jobs: 18 | issue-manager: 19 | runs-on: ubuntu-latest 20 | steps: 21 | - uses: tiangolo/issue-manager@0.5.0 22 | with: 23 | token: ${{ secrets.GITHUB_TOKEN }} 24 | config: > 25 | { 26 | "answered": { 27 | "message": "Assuming the original issue was solved, it will be automatically closed now." 28 | }, 29 | "waiting": { 30 | "message": "Automatically closing. To re-open, please provide the additional information requested." 
31 | } 32 | } 33 | -------------------------------------------------------------------------------- /.github/workflows/labels.yml: -------------------------------------------------------------------------------- 1 | name: Sync Github labels 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | paths: 8 | - ".github/**" 9 | 10 | jobs: 11 | labels: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - uses: actions/checkout@v3 15 | - name: Set up Python 16 | uses: actions/setup-python@v5 17 | with: 18 | python-version: 3.8 19 | - name: Install labels 20 | run: pip install labels 21 | - name: Sync config with Github 22 | run: labels -u ${{ github.repository_owner }} -t ${{ secrets.GITHUB_TOKEN }} sync -f .github/labels.toml 23 | -------------------------------------------------------------------------------- /.github/workflows/poetry-upgrade.yml: -------------------------------------------------------------------------------- 1 | name: Upgrader 2 | 3 | on: 4 | workflow_dispatch: 5 | schedule: 6 | - cron: "29 23 16 * *" 7 | 8 | jobs: 9 | upgrade: 10 | uses: browniebroke/github-actions/.github/workflows/poetry-upgrade.yml@v1 11 | secrets: 12 | gh_pat: ${{ secrets.GH_PAT }} 13 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by .ignore support plugin (hsz.mobi) 2 | ### Python template 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | share/python-wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | MANIFEST 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .nox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | *.py,cover 52 | .hypothesis/ 53 | .pytest_cache/ 54 | cover/ 55 | 56 | # Translations 57 | *.mo 58 | *.pot 59 | 60 | # Django stuff: 61 | *.log 62 | local_settings.py 63 | db.sqlite3 64 | db.sqlite3-journal 65 | 66 | # Flask stuff: 67 | instance/ 68 | .webassets-cache 69 | 70 | # Scrapy stuff: 71 | .scrapy 72 | 73 | # Sphinx documentation 74 | docs/_build/ 75 | 76 | # PyBuilder 77 | .pybuilder/ 78 | target/ 79 | 80 | # Jupyter Notebook 81 | .ipynb_checkpoints 82 | 83 | # IPython 84 | profile_default/ 85 | ipython_config.py 86 | 87 | # pyenv 88 | # For a library or package, you might want to ignore these files since the code is 89 | # intended to run in multiple environments; otherwise, check them in: 90 | # .python-version 91 | 92 | # pipenv 93 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 94 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 95 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 96 | # install all needed dependencies. 97 | #Pipfile.lock 98 | 99 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 100 | __pypackages__/ 101 | 102 | # Celery stuff 103 | celerybeat-schedule 104 | celerybeat.pid 105 | 106 | # SageMath parsed files 107 | *.sage.py 108 | 109 | # Environments 110 | .env 111 | .venv 112 | env/ 113 | venv/ 114 | ENV/ 115 | env.bak/ 116 | venv.bak/ 117 | 118 | # Spyder {{package_name}} settings 119 | .spyderproject 120 | .spyproject 121 | 122 | # Rope {{package_name}} settings 123 | .ropeproject 124 | 125 | # mkdocs documentation 126 | /site 127 | 128 | # mypy 129 | .mypy_cache/ 130 | .dmypy.json 131 | dmypy.json 132 | 133 | # Pyre type checker 134 | .pyre/ 135 | 136 | # pytype static type analyzer 137 | .pytype/ 138 | 139 | # Cython debug symbols 140 | cython_debug/ 141 | 142 | # additional files 143 | tests/**/*.wav 144 | !tests/dataset_raw/test/**/*.wav 145 | tests/**/*.npy 146 | tests/**/*.pt 147 | tests/**/*.txt 148 | tests/**/*.json 149 | tests/**/*.pth 150 | tests/**/*.download 151 | tests/**/*.lab 152 | tests/**/*.pdf 153 | tests/**/*.csv 154 | tests/**/*.ckpt 155 | tests/**/*.yaml 156 | *.tfevents.* 157 | *.pt 158 | user_gui_presets.json 159 | -------------------------------------------------------------------------------- /.gitpod.yml: -------------------------------------------------------------------------------- 1 | tasks: 2 | - command: | 3 | pip install poetry 4 | PIP_USER=false poetry install 5 | - command: | 6 | pip install pre-commit 7 | pre-commit install 8 | PIP_USER=false pre-commit install-hooks 9 | -------------------------------------------------------------------------------- /.idea/inspectionProfiles/Project_Default.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | 7 | -------------------------------------------------------------------------------- /.idea/inspectionProfiles/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | 7 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /.idea/so-vits-svc-fork.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 18 | 19 | 21 | 22 | 24 | 25 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | -------------------------------------------------------------------------------- /.idea/watcherTasks.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 16 | 24 | 25 | 36 | 44 | 45 | 56 | 64 | 65 | 66 | -------------------------------------------------------------------------------- /.idea/workspace.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 28 | 29 | 34 | 35 | 36 | 38 | 39 | 41 | 42 | 43 | 44 | 45 | 46 | 49 | { 50 | "keyToString": { 51 | "RunOnceActivity.OpenProjectViewOnStart": "true", 
52 | "RunOnceActivity.ShowReadmeOnStart": "true", 53 | "WebServerToolWindowFactoryState": "false", 54 | "node.js.detected.package.eslint": "true", 55 | "node.js.selected.package.eslint": "(autodetect)", 56 | "nodejs_package_manager_path": "npm" 57 | } 58 | } 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 91 | 92 | 93 | 118 | 119 | 120 | 121 | 122 | 123 | 1678892092249 124 | 133 | 134 | 135 | 136 | 138 | 139 | 148 | 149 | 150 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # See https://pre-commit.com for more information 2 | # See https://pre-commit.com/hooks.html for more hooks 3 | exclude: "CHANGELOG.md|.copier-answers.yml" 4 | default_stages: [commit] 5 | 6 | ci: 7 | autofix_commit_msg: "chore(pre-commit.ci): auto fixes" 8 | autoupdate_commit_msg: "chore(pre-commit.ci): pre-commit autoupdate" 9 | 10 | repos: 11 | - repo: https://github.com/commitizen-tools/commitizen 12 | rev: v3.28.0 13 | hooks: 14 | - id: commitizen 15 | stages: [commit-msg] 16 | - repo: https://github.com/pre-commit/pre-commit-hooks 17 | rev: v4.6.0 18 | hooks: 19 | - id: debug-statements 20 | - id: check-builtin-literals 21 | - id: check-case-conflict 22 | - id: check-docstring-first 23 | - id: check-json 24 | - id: check-toml 25 | - id: check-xml 26 | - id: check-yaml 27 | - id: detect-private-key 28 | - id: end-of-file-fixer 29 | - id: trailing-whitespace 30 | - repo: https://github.com/python-poetry/poetry 31 | rev: 1.8.3 32 | hooks: 33 | - id: poetry-check 34 | - repo: https://github.com/pre-commit/mirrors-prettier 35 | rev: v3.1.0 36 | hooks: 37 | - id: prettier 38 | args: ["--tab-width", "2"] 39 | - repo: https://github.com/asottile/pyupgrade 40 | rev: v3.17.0 41 | hooks: 42 | - id: pyupgrade 43 | args: [--py38-plus] 44 | - repo: https://github.com/PyCQA/autoflake 45 | rev: v2.3.1 46 | hooks: 47 | - id: autoflake 48 | - repo: https://github.com/PyCQA/isort 49 | rev: 5.13.2 50 | hooks: 51 | - id: isort 52 | - repo: https://github.com/psf/black 53 | rev: 24.1.0 54 | hooks: 55 | - id: black 56 | - repo: https://github.com/codespell-project/codespell 57 | rev: v2.2.6 58 | hooks: 59 | - id: codespell 60 | args: [-w] 61 | - repo: https://github.com/PyCQA/flake8 62 | rev: 7.1.1 63 | hooks: 64 | - id: flake8 65 | #- repo: https://github.com/pre-commit/mirrors-mypy 66 | # rev: v0.931 67 | # hooks: 68 | # - id: mypy 69 | # additional_dependencies: [] 70 | # - repo: https://github.com/PyCQA/bandit 71 | # rev: 1.7.4 72 | # hooks: 73 | # - id: bandit 74 | # args: [-x, tests] 75 | - repo: https://github.com/srstevenson/nb-clean 76 | rev: "3.3.0" 77 | hooks: 78 | - id: nb-clean 79 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | # Read the Docs configuration file 2 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 3 | 4 | # Required 5 | version: 2 6 | 7 | # Set the version of Python and other tools you might need 8 | build: 9 | os: ubuntu-20.04 10 | tools: 11 | python: "3.9" 12 | jobs: 13 | post_create_environment: 14 | # Install poetry 15 | - pip install poetry 16 | # Tell poetry to not use a virtual environment 17 | - poetry config virtualenvs.create false 18 | post_install: 19 | # Install dependencies 20 | - poetry install --with docs 21 | 22 | # Build documentation in the docs directory with 
Sphinx
23 | sphinx:
24 | configuration: docs/conf.py
25 |
-------------------------------------------------------------------------------- /CONTRIBUTING.md: --------------------------------------------------------------------------------
1 | # Contributing
2 |
3 | Contributions are welcome, and they are greatly appreciated! Every little helps, and credit will always be given.
4 |
5 | You can contribute in many ways:
6 |
7 | ## Types of Contributions
8 |
9 | ### Report Bugs
10 |
11 | Report bugs to [our issue page][gh-issues]. If you are reporting a bug, please include:
12 |
13 | - Your operating system name and version.
14 | - Any details about your local setup that might be helpful in troubleshooting.
15 | - Detailed steps to reproduce the bug.
16 |
17 | ### Fix Bugs
18 |
19 | Look through the GitHub issues for bugs. Anything tagged with "bug" and "help wanted" is open to whoever wants to implement it.
20 |
21 | ### Implement Features
22 |
23 | Look through the GitHub issues for features. Anything tagged with "enhancement" and "help wanted" is open to whoever wants to implement it.
24 |
25 | ### Write Documentation
26 |
27 | SoftVC VITS Singing Voice Conversion Fork could always use more documentation, whether as part of the official SoftVC VITS Singing Voice Conversion Fork docs, in docstrings, or even on the web in blog posts, articles, and such.
28 |
29 | ### Submit Feedback
30 |
31 | The best way to send feedback is via [our issue page][gh-issues] on GitHub. If you are proposing a feature:
32 |
33 | - Explain in detail how it would work.
34 | - Keep the scope as narrow as possible, to make it easier to implement.
35 | - Remember that this is a volunteer-driven project, and that contributions are welcome 😊
36 |
37 | ## Get Started!
38 |
39 | Ready to contribute? Here's how to set yourself up for local development.
40 |
41 | 1. Fork the repo on GitHub.
42 |
43 | 2. Clone your fork locally:
44 |
45 | ```shell
46 | $ git clone git@github.com:your_name_here/so-vits-svc-fork.git
47 | ```
48 |
49 | 3. Install the project dependencies with [Poetry](https://python-poetry.org):
50 |
51 | ```shell
52 | $ poetry install
53 | ```
54 |
55 | 4. Create a branch for local development:
56 |
57 | ```shell
58 | $ git checkout -b name-of-your-bugfix-or-feature
59 | ```
60 |
61 | Now you can make your changes locally.
62 |
63 | 5. When you're done making changes, check that your changes pass our tests:
64 |
65 | ```shell
66 | $ poetry run pytest
67 | ```
68 |
69 | 6. Linting is done through [pre-commit](https://pre-commit.com). Provided you have the tool installed globally, you can run all the hooks as a one-off:
70 |
71 | ```shell
72 | $ pre-commit run -a
73 | ```
74 |
75 | Or better, install the hooks once and have them run automatically each time you commit:
76 |
77 | ```shell
78 | $ pre-commit install
79 | ```
80 |
81 | 7. Commit your changes and push your branch to GitHub:
82 |
83 | ```shell
84 | $ git add .
85 | $ git commit -m "feat(something): your detailed description of your changes"
86 | $ git push origin name-of-your-bugfix-or-feature
87 | ```
88 |
89 | Note: the commit message should follow [the conventional commits](https://www.conventionalcommits.org) specification. We run [`commitlint` on CI](https://github.com/marketplace/actions/commit-linter) to validate it, and if you've installed pre-commit hooks at the previous step, the message will be checked at commit time.
90 |
91 | 8. Submit a pull request through the GitHub website or using the GitHub CLI (if you have it installed):
92 |
93 | ```shell
94 | $ gh pr create --fill
95 | ```
96 |
97 | ## Pull Request Guidelines
98 |
99 | We like to have the pull request open as soon as possible; it is a great place to discuss any piece of work, even unfinished. You can use a draft pull request if it's still a work in progress. Here are a few guidelines to follow:
100 |
101 | 1. Include tests for feature or bug fixes.
102 | 2. Update the documentation for significant features.
103 | 3. Ensure tests are passing on CI.
104 |
105 | ## Tips
106 |
107 | To run a subset of tests:
108 |
109 | ```shell
110 | $ pytest tests
111 | ```
112 |
113 | ## Making a new release
114 |
115 | The deployment should be automated and can be triggered from the Semantic Release workflow in GitHub. The next version will be based on [the commit logs](https://python-semantic-release.readthedocs.io/en/latest/commit-log-parsing.html#commit-log-parsing). This is done by [python-semantic-release](https://python-semantic-release.readthedocs.io/en/latest/index.html) via a GitHub action.
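116 |
117 | For example (illustrative only: the version numbers and the sample commit below are hypothetical, and the exact bump rules come from the python-semantic-release configuration), conventional commit types map to releases roughly as follows:
118 |
119 | ```shell
120 | # fix(...):  commits trigger a patch release, e.g. 1.2.3 -> 1.2.4
121 | # feat(...): commits trigger a minor release, e.g. 1.2.3 -> 1.3.0
122 | # a "BREAKING CHANGE:" footer triggers a major release, e.g. 1.2.3 -> 2.0.0
123 | $ git commit -m "feat(gui): add a preset selector"
124 | ```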
125 |
126 | [gh-issues]: https://github.com/34j/so-vits-svc-fork/issues
127 |
-------------------------------------------------------------------------------- /Dockerfile: --------------------------------------------------------------------------------
1 | FROM pytorch/pytorch:2.0.1-cuda11.7-cudnn8-runtime
2 | RUN ["apt", "update"]
3 | RUN ["apt", "install", "-y", "build-essential"]
4 | RUN ["pip", "install", "-U", "pip", "setuptools", "wheel"]
5 | RUN ["pip", "install", "-U", "so-vits-svc-fork"]
6 | ENTRYPOINT [ "svcg" ]
7 |
-------------------------------------------------------------------------------- /commitlint.config.js: --------------------------------------------------------------------------------
1 | module.exports = {
2 | extends: ["@commitlint/config-conventional"],
3 | rules: {
4 | "header-max-length": [0, "always", Infinity],
5 | "body-max-line-length": [0, "always", Infinity],
6 | "footer-max-line-length": [0, "always", Infinity],
7 | },
8 | };
9 |
-------------------------------------------------------------------------------- /docs/Makefile: --------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line, and also
5 | # from the environment for the first two.
6 | SPHINXOPTS ?=
7 | SPHINXBUILD ?= sphinx-build
8 | SOURCEDIR = .
9 | BUILDDIR = _build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | .PHONY: help Makefile
16 |
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/_static/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/voicepaw/so-vits-svc-fork/0f015e32aada5cf7481f91bbe6758e574c9c5f39/docs/_static/.gitkeep -------------------------------------------------------------------------------- /docs/_static/gui.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/voicepaw/so-vits-svc-fork/0f015e32aada5cf7481f91bbe6758e574c9c5f39/docs/_static/gui.png -------------------------------------------------------------------------------- /docs/changelog.md: -------------------------------------------------------------------------------- 1 | ```{include} ../CHANGELOG.md 2 | 3 | ``` 4 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | from pathlib import Path 7 | from typing import Any 8 | 9 | from sphinx.application import Sphinx 10 | from sphinx.ext import apidoc 11 | 12 | # -- Project information ----------------------------------------------------- 13 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information 14 | 15 | project = "SoftVC VITS Singing Voice Conversion Fork" 16 | copyright = "2023, 34j" 17 | author = "34j" 18 | release = "0.0.0" 19 | 20 | # -- General configuration --------------------------------------------------- 21 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration 22 | 23 | # Add any Sphinx extension module names here, as strings. They can be 24 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 25 | # ones. 26 | extensions = [ 27 | "myst_parser", 28 | "sphinx.ext.napoleon", 29 | "sphinx.ext.autodoc", 30 | "sphinx.ext.viewcode", 31 | ] 32 | napoleon_google_docstring = False 33 | 34 | # The suffix of source filenames. 35 | source_suffix = [".rst", ".md"] 36 | 37 | # Add any paths that contain templates here, relative to this directory. 38 | templates_path = ["_templates"] 39 | 40 | # List of patterns, relative to source directory, that match files and 41 | # directories to ignore when looking for source files. 42 | # This pattern also affects html_static_path and html_extra_path. 43 | exclude_patterns: list[str] = [] 44 | 45 | 46 | # -- Options for HTML output ------------------------------------------------- 47 | 48 | # The theme to use for HTML and HTML Help pages. See the documentation for 49 | # a list of builtin themes. 50 | # 51 | html_theme = "sphinx_rtd_theme" 52 | 53 | # Add any paths that contain custom static files (such as style sheets) here, 54 | # relative to this directory. They are copied after the builtin static files, 55 | # so a file named "default.css" will overwrite the builtin "default.css". 
56 | html_static_path = ["_static"] 57 | 58 | 59 | # -- Automatically run sphinx-apidoc ----------------------------------------- 60 | 61 | 62 | def run_apidoc(_: Any) -> None: 63 | docs_path = Path(__file__).parent 64 | module_path = docs_path.parent / "src" / "so_vits_svc_fork" 65 | 66 | apidoc.main( 67 | [ 68 | "--force", 69 | "--module-first", 70 | "-o", 71 | docs_path.as_posix(), 72 | module_path.as_posix(), 73 | ] 74 | ) 75 | 76 | 77 | def setup(app: Sphinx) -> None: 78 | app.connect("builder-inited", run_apidoc) 79 | -------------------------------------------------------------------------------- /docs/contributing.md: -------------------------------------------------------------------------------- 1 | ```{include} ../CONTRIBUTING.md 2 | 3 | ``` 4 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | # Welcome to SoftVC VITS Singing Voice Conversion Fork documentation! 2 | 3 | ```{toctree} 4 | :caption: Installation & Usage 5 | :maxdepth: 2 6 | 7 | installation 8 | usage 9 | ``` 10 | 11 | ```{toctree} 12 | :caption: Project Info 13 | :maxdepth: 2 14 | 15 | changelog 16 | contributing 17 | ``` 18 | 19 | ```{toctree} 20 | :caption: API Reference 21 | :maxdepth: 2 22 | 23 | so_vits_svc_fork 24 | ``` 25 | 26 | ```{include} ../README.md 27 | 28 | ``` 29 | -------------------------------------------------------------------------------- /docs/installation.md: -------------------------------------------------------------------------------- 1 | # Installation 2 | 3 | The package is published on [PyPI](https://pypi.org/project/so-vits-svc-fork/) and can be installed with `pip` (or any equivalent): 4 | 5 | ```bash 6 | pip install so-vits-svc-fork 7 | ``` 8 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | %SPHINXBUILD% >NUL 2>NUL 14 | if errorlevel 9009 ( 15 | echo. 16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 17 | echo.installed, then set the SPHINXBUILD environment variable to point 18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 19 | echo.may add the Sphinx directory to PATH. 20 | echo. 
21 | echo.If you don't have Sphinx installed, grab it from
22 | echo.https://www.sphinx-doc.org/
23 | exit /b 1
24 | )
25 |
26 | if "%1" == "" goto help
27 |
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 |
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 |
34 | :end
35 | popd
36 |
-------------------------------------------------------------------------------- /docs/usage.md: --------------------------------------------------------------------------------
1 | # Usage
2 |
3 | To use this package, import it:
4 |
5 | ```python
6 | import so_vits_svc_fork
7 | ```
8 |
9 | TODO: Document usage
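10 |
11 | In the meantime, here is a minimal sketch of the typical CLI workflow, based on the commands used in the bundled Colab notebook; the paths and the `-fm dio` flag below are illustrative assumptions, not a complete reference:
12 |
13 | ```bash
14 | # Place your audio as dataset_raw/{speaker_name}/*.wav, then:
15 | svc pre-resample                 # resample the raw audio
16 | svc pre-config                   # generate configs/44k/config.json
17 | svc pre-hubert -fm dio           # extract HuBERT features and F0 (method: dio)
18 | svc train --model-path logs/44k  # train the model
19 | svcg                             # or launch the GUI instead
20 | ```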
21 |
-------------------------------------------------------------------------------- /easy-installation/install-cn.bat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/voicepaw/so-vits-svc-fork/0f015e32aada5cf7481f91bbe6758e574c9c5f39/easy-installation/install-cn.bat -------------------------------------------------------------------------------- /easy-installation/install.bat: --------------------------------------------------------------------------------
1 | @echo off
2 |
3 | echo You can rerun this script to update the installation.
4 |
5 | echo Moving to AppData\Roaming\so-vits-svc-fork...
6 | mkdir "%APPDATA%\so-vits-svc-fork" >nul 2>&1
7 | cd "%APPDATA%\so-vits-svc-fork"
8 |
9 | echo Checking for Python 3.10...
10 |
11 | py -3.10 --version >nul 2>&1
12 | if %errorlevel%==0 (
13 | echo Python 3.10 is already installed.
14 | ) else (
15 | echo Python 3.10 is not installed. Downloading installer...
16 | curl https://www.python.org/ftp/python/3.10.10/python-3.10.10-amd64.exe -o python-3.10.10-amd64.exe
17 |
18 | echo Installing Python 3.10...
19 | python-3.10.10-amd64.exe /quiet InstallAllUsers=1 PrependPath=1
20 |
21 | echo Cleaning up installer...
22 | del python-3.10.10-amd64.exe
23 | )
24 |
25 | echo Creating virtual environment...
26 | py -3.10 -m venv venv
27 |
28 | echo Updating pip and wheel...
29 | venv\Scripts\python.exe -m pip install --upgrade pip wheel
30 |
31 | nvidia-smi >nul 2>&1
32 | if %errorlevel%==0 (
33 | echo Installing PyTorch with GPU support...
34 | venv\Scripts\pip.exe install torch torchaudio --index-url https://download.pytorch.org/whl/cu118
35 | ) else (
36 | echo Installing PyTorch without GPU support...
37 | venv\Scripts\pip.exe install torch torchaudio
38 | )
39 |
40 | echo Installing so-vits-svc-fork...
41 | venv\Scripts\pip.exe install so-vits-svc-fork
42 |
43 | rem echo Creating shortcut...
44 | rem powershell "$s=(New-Object -COM WScript.Shell).CreateShortcut('%USERPROFILE%\Desktop\so-vits-svc-fork.lnk');$s.TargetPath='%APPDATA%\so-vits-svc-fork\venv\Scripts\svcg.exe';$s.Save()"
45 |
46 | echo Creating shortcut to the start menu...
47 | powershell "$s=(New-Object -COM WScript.Shell).CreateShortcut('%APPDATA%\Microsoft\Windows\Start Menu\Programs\so-vits-svc-fork.lnk');$s.TargetPath='%APPDATA%\so-vits-svc-fork\venv\Scripts\svcg.exe';$s.Save()"
48 |
49 | echo Launching so-vits-svc-fork GUI...
50 | venv\Scripts\svcg.exe
51 |
-------------------------------------------------------------------------------- /notebooks/so-vits-svc-fork-4.0.ipynb: --------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## Before training\n",
8 | "\n",
9 | "This program saves the last 3 generations of models to Google Drive. Since one generation of models is >1 GB, you should have at least 3 GB of free space in Google Drive. If you do not have enough free space, consider creating another Google account.\n",
10 | "\n",
11 | "Training requires >10 GB of VRAM (a T4 should be enough). Inference does not require nearly as much VRAM."
12 | ]
13 | },
14 | {
15 | "cell_type": "markdown",
16 | "metadata": {},
17 | "source": [
18 | "## Installation"
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": null,
24 | "metadata": {},
25 | "outputs": [],
26 | "source": [
27 | "#@title Check GPU\n",
28 | "!nvidia-smi"
29 | ]
30 | },
31 | {
32 | "cell_type": "code",
33 | "execution_count": null,
34 | "metadata": {},
35 | "outputs": [],
36 | "source": [
37 | "#@title Mount Google Drive\n",
38 | "from google.colab import drive\n",
39 | "drive.mount('/content/drive')"
40 | ]
41 | },
42 | {
43 | "cell_type": "code",
44 | "execution_count": null,
45 | "metadata": {},
46 | "outputs": [],
47 | "source": [
48 | "#@title Install dependencies\n",
49 | "#@markdown pip may fail to resolve dependencies and raise ERROR, but it can be ignored.\n",
50 | "!python -m pip install -U pip wheel\n",
51 | "%pip install -U ipython\n",
52 | "\n",
53 | "#@markdown Branch (for development)\n",
54 | "BRANCH = \"none\" #@param {\"type\": \"string\"}\n",
55 | "if BRANCH == \"none\":\n",
56 | " %pip install -U so-vits-svc-fork\n",
57 | "else:\n",
58 | " %pip install -U git+https://github.com/34j/so-vits-svc-fork.git@{BRANCH}"
59 | ]
60 | },
61 | {
62 | "cell_type": "markdown",
63 | "metadata": {},
64 | "source": [
65 | "## Training"
66 | ]
67 | },
68 | {
69 | "cell_type": "code",
70 | "execution_count": null,
71 | "metadata": {},
72 | "outputs": [],
73 | "source": [
74 | "#@title Make dataset directory\n",
75 | "!mkdir -p \"dataset_raw\""
76 | ]
77 | },
78 | {
79 | "cell_type": "code",
80 | "execution_count": null,
81 | "metadata": {},
82 | "outputs": [],
83 | "source": [
84 | "#!rm -r \"dataset_raw\"\n",
85 | "#!rm -r \"dataset/44k\""
86 | ]
87 | },
88 | {
89 | "cell_type": "code",
90 | "execution_count": null,
91 | "metadata": {},
92 | "outputs": [],
93 | "source": [
94 | "#@title Copy your dataset\n",
95 | "#@markdown **We assume that your dataset is in your Google Drive's `so-vits-svc-fork/dataset/(speaker_name)` directory.**\n",
96 | "DATASET_NAME = \"kiritan\" #@param {type: \"string\"}\n",
97 | "!cp -R /content/drive/MyDrive/so-vits-svc-fork/dataset/{DATASET_NAME}/ -t \"dataset_raw/\""
98 | ]
99 | },
100 | {
101 | "cell_type": "code",
102 | "execution_count": null,
103 | "metadata": {},
104 | "outputs": [],
105 | "source": [
106 | "#@title Download dataset (Tsukuyomi-chan JVS)\n",
107 | "#@markdown You can download this dataset if you don't have your own dataset.\n",
108 | "#@markdown Make sure you agree to the license when using this dataset.\n",
109 | "#@markdown https://tyc.rei-yumesaki.net/material/corpus/#toc6\n",
110 | "# !wget https://tyc.rei-yumesaki.net/files/sozai-tyc-corpus1.zip\n",
111 | "# !unzip sozai-tyc-corpus1.zip\n",
112 | "# !mv \"/content/つくよみちゃんコーパス Vol.1 声優統計コーパス(JVSコーパス準拠)/おまけ:WAV(+12dB増幅&高音域削減)/WAV(+12dB増幅&高音域削減)\" \"dataset_raw/tsukuyomi\""
113 | ]
114 | },
115 | {
116 | "cell_type": "code",
117 | "execution_count": null,
118 | "metadata": {},
119 | "outputs": [],
120 | "source": [
121 | "#@title Automatic preprocessing\n",
122 | "!svc pre-resample"
123 | ]
124 | },
125 | {
126 | "cell_type": "code",
127 | "execution_count": null,
128 | "metadata": {},
129 | "outputs": [],
130 | "source": [
131 | "!svc pre-config"
pre-config" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": null, 137 | "metadata": {}, 138 | "outputs": [], 139 | "source": [ 140 | "#@title Import configs file\n", 141 | "#@markdown This assumes that the **config.json** you want to use is saved in the default location on Google Drive. A backup of the current local file will also be created, in case this action is done accidentally.\n", "!cp configs/44k/config.json configs/44k/config.bkp.json\n", "!cp drive/MyDrive/so-vits-svc-fork/config.json configs/44k" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": null, 147 | "metadata": {}, 148 | "outputs": [], 149 | "source": [ 150 | "#@title Export configs file (Optional Step, NOT REQUIRED)\n", 151 | "#@markdown This assumes that you want to save the **config.json** in the default location on Google Drive. A backup of the copy already on Drive will also be created, in case this action is done accidentally.\n", "!cp drive/MyDrive/so-vits-svc-fork/config.json drive/MyDrive/so-vits-svc-fork/config.bkp.json\n", "!cp configs/44k/config.json drive/MyDrive/so-vits-svc-fork" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": null, 157 | "metadata": {}, 158 | "outputs": [], 159 | "source": [ 160 | "F0_METHOD = \"dio\" #@param [\"crepe\", \"crepe-tiny\", \"parselmouth\", \"dio\", \"harvest\"]\n", 161 | "!svc pre-hubert -fm {F0_METHOD}" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": null, 167 | "metadata": {}, 168 | "outputs": [], 169 | "source": [ 170 | "#@title Train\n", 171 | "%load_ext tensorboard\n", 172 | "%tensorboard --logdir drive/MyDrive/so-vits-svc-fork/logs/44k\n", 173 | "!svc train --model-path drive/MyDrive/so-vits-svc-fork/logs/44k" 174 | ] 175 | }, 176 | { 177 | "cell_type": "markdown", 178 | "metadata": {}, 179 | "source": [ 180 | "## Training Cluster model" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": null, 186 | "metadata": {}, 187 | "outputs": [], 188 | "source": [ 189 | "!svc train-cluster --output-path drive/MyDrive/so-vits-svc-fork/logs/44k/kmeans.pt" 190 | ] 191 | }, 192 | { 193 | "cell_type": "markdown", 194 | "metadata": {}, 195 | "source": [ 196 | "## Inference" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": null, 202 | "metadata": {}, 203 | "outputs": [], 204 | "source": [ 205 | "#@title Get the author's voice as a source\n", 206 | "import random\n", 207 | "NAME = str(random.randint(1, 49))\n", 208 | "TYPE = \"fsd50k\" #@param [\"\", \"digit\", \"dog\", \"fsd50k\"]\n", 209 | "CUSTOM_FILEPATH = \"\" #@param {type: \"string\"}\n", 210 | "if CUSTOM_FILEPATH != \"\":\n", 211 | " NAME = CUSTOM_FILEPATH\n", 212 | "else:\n", 213 | " # it is extremely difficult to find a voice that can be downloaded from the internet directly; IPython expands {expression} in ! commands, so no f-string prefix is needed\n", 214 | " if TYPE == \"dog\":\n", 215 | " !wget -N \"https://huggingface.co/datasets/437aewuh/dog-dataset/resolve/main/dogs/dogs_{int(NAME):04d}.wav\" -O {NAME}.wav # 4-digit zero padding assumed\n", 216 | " elif TYPE == \"digit\":\n", 217 | " # george, jackson, lucas, nicolas, ...\n", 218 | " !wget -N \"https://github.com/Jakobovski/free-spoken-digit-dataset/raw/master/recordings/0_george_{NAME}.wav\" -O {NAME}.wav\n", 219 | " elif TYPE == \"fsd50k\":\n", 220 | " !wget -N \"https://huggingface.co/datasets/Fhrozen/FSD50k/resolve/main/clips/dev/{10000+int(NAME)}.wav\" -O {NAME}.wav\n", 221 | " else:\n", 222 | " !wget -N \"https://zunko.jp/sozai/utau/voice_{'kiritan' if int(NAME) < 25 else 'itako'}{int(NAME) % 5 + 1}.wav\" -O {NAME}.wav\n", 223 | "from IPython.display import Audio, display\n", 224 | "display(Audio(f\"{NAME}.wav\"))" 225
| ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": null, 230 | "metadata": {}, 231 | "outputs": [], 232 | "source": [ 233 | "#@title Use trained model\n", 234 | "#@markdown **Put your .wav file in `so-vits-svc-fork/audio` directory**\n", 235 | "from IPython.display import Audio, display\n", 236 | "!svc infer drive/MyDrive/so-vits-svc-fork/audio/{NAME}.wav -m drive/MyDrive/so-vits-svc-fork/logs/44k/ -c drive/MyDrive/so-vits-svc-fork/logs/44k/config.json\n", 237 | "display(Audio(f\"drive/MyDrive/so-vits-svc-fork/audio/{NAME}.out.wav\", autoplay=True))" 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": null, 243 | "metadata": {}, 244 | "outputs": [], 245 | "source": [ 246 | "##@title Use trained model (with cluster)\n", 247 | "!svc infer {NAME}.wav -s speaker -r 0.1 -m drive/MyDrive/so-vits-svc-fork/logs/44k/ -c drive/MyDrive/so-vits-svc-fork/logs/44k/config.json -k drive/MyDrive/so-vits-svc-fork/logs/44k/kmeans.pt\n", 248 | "display(Audio(f\"{NAME}.out.wav\", autoplay=True))" 249 | ] 250 | }, 251 | { 252 | "cell_type": "markdown", 253 | "metadata": {}, 254 | "source": [ 255 | "### Pretrained models" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": null, 261 | "metadata": {}, 262 | "outputs": [], 263 | "source": [ 264 | "#@title https://huggingface.co/TachibanaKimika/so-vits-svc-4.0-models/tree/main\n", 265 | "!wget -N \"https://huggingface.co/TachibanaKimika/so-vits-svc-4.0-models/resolve/main/riri/G_riri_220.pth\"\n", 266 | "!wget -N \"https://huggingface.co/TachibanaKimika/so-vits-svc-4.0-models/resolve/main/riri/config.json\"" 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": null, 272 | "metadata": {}, 273 | "outputs": [], 274 | "source": [ 275 | "!svc infer {NAME}.wav -c config.json -m G_riri_220.pth\n", 276 | "display(Audio(f\"{NAME}.out.wav\", autoplay=True))" 277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": null, 282 | "metadata": {}, 283 | "outputs": [], 284 | "source": [ 285 | "#@title https://huggingface.co/therealvul/so-vits-svc-4.0/tree/main\n", 286 | "!wget -N \"https://huggingface.co/therealvul/so-vits-svc-4.0/resolve/main/Pinkie%20(speaking%20sep)/G_166400.pth\"\n", 287 | "!wget -N \"https://huggingface.co/therealvul/so-vits-svc-4.0/resolve/main/Pinkie%20(speaking%20sep)/config.json\"" 288 | ] 289 | }, 290 | { 291 | "cell_type": "code", 292 | "execution_count": null, 293 | "metadata": {}, 294 | "outputs": [], 295 | "source": [ 296 | "!svc infer {NAME}.wav --speaker \"Pinkie {neutral}\" -c config.json -m G_166400.pth\n", 297 | "display(Audio(f\"{NAME}.out.wav\", autoplay=True))" 298 | ] 299 | } 300 | ], 301 | "metadata": { 302 | "accelerator": "GPU", 303 | "colab": { 304 | "provenance": [] 305 | }, 306 | "gpuClass": "standard", 307 | "kernelspec": { 308 | "display_name": "Python 3", 309 | "name": "python3" 310 | }, 311 | "language_info": { 312 | "codemirror_mode": { 313 | "name": "ipython", 314 | "version": 3 315 | }, 316 | "file_extension": ".py", 317 | "mimetype": "text/x-python", 318 | "name": "python", 319 | "nbconvert_exporter": "python", 320 | "pygments_lexer": "ipython3" 321 | } 322 | }, 323 | "nbformat": 4, 324 | "nbformat_minor": 0 325 | } 326 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "so-vits-svc-fork" 3 | version = "4.2.26" 4 | description = "A fork of so-vits-svc." 
5 | authors = ["34j <34j.95a2p@simplelogin.com>"] 6 | license = "MIT" 7 | readme = "README.md" 8 | repository = "https://github.com/34j/so-vits-svc-fork" 9 | documentation = "https://so-vits-svc-fork.readthedocs.io" 10 | classifiers = [ 11 | "Development Status :: 2 - Pre-Alpha", 12 | "Intended Audience :: Developers", 13 | "Natural Language :: English", 14 | "Operating System :: OS Independent", 15 | "Topic :: Software Development :: Libraries", 16 | ] 17 | packages = [ 18 | { include = "so_vits_svc_fork", from = "src" }, 19 | ] 20 | 21 | [tool.poetry.scripts] 22 | so-vits-svc-fork = "so_vits_svc_fork.__main__:cli" 23 | svc = "so_vits_svc_fork.__main__:cli" 24 | svcf = "so_vits_svc_fork.__main__:cli" 25 | svcg = "so_vits_svc_fork.gui:main" 26 | svc-gui = "so_vits_svc_fork.gui:main" 27 | svcf-gui = "so_vits_svc_fork.gui:main" 28 | 29 | [tool.poetry.urls] 30 | "Bug Tracker" = "https://github.com/34j/so-vits-svc-fork/issues" 31 | "Changelog" = "https://github.com/34j/so-vits-svc-fork/blob/main/CHANGELOG.md" 32 | 33 | [tool.poetry.dependencies] 34 | python = ">=3.9,<3.13" 35 | librosa = "*" 36 | numpy = "^1.26.4" 37 | pyworld = "*" 38 | requests = "*" 39 | scipy = "*" 40 | sounddevice = "*" 41 | SoundFile = "*" 42 | tqdm = "*" 43 | praat-parselmouth = "*" 44 | onnx = "*" 45 | onnxsim = "*" 46 | onnxoptimizer = "*" 47 | torch = "^2" 48 | torchaudio = "*" 49 | tensorboard = "*" 50 | rich = "*" 51 | tqdm-joblib = "^0.0.4" 52 | tensorboardx = "*" 53 | cm-time = ">=0.1.2" 54 | pebble = ">=5.0" 55 | torchcrepe = ">=0.0.17" 56 | lightning = "^2.0.1" 57 | fastapi = "==0.111.1" 58 | transformers = "^4.28.1" 59 | matplotlib = "^3.7.1" 60 | click = "^8.1.7" 61 | setuptools = "^69.5.1" 62 | pysimplegui-4-foss = "^4.60.4.1" 63 | 64 | [tool.poetry.group.dev.dependencies] 65 | pre-commit = ">=3" 66 | pytest = "^8.0.0" 67 | pytest-cov = "^4.0.0" 68 | pipdeptree = "^2.7.0" 69 | pip-licenses = "^5.0.0" 70 | 71 | [tool.poetry.group.docs] 72 | optional = true 73 | 74 | [tool.poetry.group.docs.dependencies] 75 | myst-parser = ">=0.16" 76 | sphinx = ">=4.0" 77 | sphinx-rtd-theme = ">=1.0" 78 | 79 | [tool.semantic_release] 80 | branch = "main" 81 | version_toml = "pyproject.toml:tool.poetry.version" 82 | version_variable = "src/so_vits_svc_fork/__init__.py:__version__" 83 | build_command = "pip install poetry && poetry build" 84 | 85 | [tool.pytest.ini_options] 86 | addopts = "-v -Wdefault --cov=so_vits_svc_fork --cov-report=term-missing:skip-covered" 87 | pythonpath = ["src"] 88 | 89 | [tool.coverage.run] 90 | branch = true 91 | 92 | [tool.coverage.report] 93 | exclude_lines = [ 94 | "pragma: no cover", 95 | "@overload", 96 | "if TYPE_CHECKING", 97 | "raise NotImplementedError", 98 | 'if __name__ == "__main__":', 99 | ] 100 | 101 | [tool.isort] 102 | profile = "black" 103 | known_first_party = ["so_vits_svc_fork", "tests"] 104 | 105 | [tool.autoflake] 106 | remove_all_unused_imports = true 107 | 108 | [tool.mypy] 109 | check_untyped_defs = true 110 | disallow_any_generics = true 111 | disallow_incomplete_defs = true 112 | disallow_untyped_defs = true 113 | mypy_path = "src/" 114 | no_implicit_optional = true 115 | show_error_codes = true 116 | warn_unreachable = true 117 | warn_unused_ignores = true 118 | exclude = [ 119 | 'docs/.*', 120 | 'setup.py', 121 | ] 122 | 123 | [[tool.mypy.overrides]] 124 | module = "tests.*" 125 | allow_untyped_defs = true 126 | 127 | [[tool.mypy.overrides]] 128 | module = "docs.*" 129 | ignore_errors = true 130 | 131 | [tool.bandit] 132 | exclude_dirs = ["src"] 133 | 134 | 
[build-system] 135 | requires = ["poetry-core>=1.0.0"] 136 | build-backend = "poetry.core.masonry.api" 137 | -------------------------------------------------------------------------------- /renovate.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": ["github>browniebroke/renovate-configs:python"] 3 | } 4 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # This is a shim to allow GitHub to detect the package; the build is done with poetry 4 | # Taken from https://github.com/Textualize/rich 5 | 6 | import setuptools 7 | 8 | if __name__ == "__main__": 9 | setuptools.setup(name="so-vits-svc-fork") 10 | -------------------------------------------------------------------------------- /src/so_vits_svc_fork/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "4.2.26" 2 | 3 | from .logger import init_logger 4 | 5 | init_logger() 6 | -------------------------------------------------------------------------------- /src/so_vits_svc_fork/cluster/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from pathlib import Path 4 | from typing import Any 5 | 6 | import torch 7 | from sklearn.cluster import KMeans 8 | 9 | 10 | def get_cluster_model(ckpt_path: Path | str): 11 | with Path(ckpt_path).open("rb") as f: 12 | checkpoint = torch.load( 13 | f, map_location="cpu" 14 | ) # pickle-based torch.load: danger of arbitrary code execution with untrusted checkpoints 15 | kmeans_dict = {} 16 | for spk, ckpt in checkpoint.items(): 17 | km = KMeans(ckpt["n_features_in_"]) 18 | km.__dict__["n_features_in_"] = ckpt["n_features_in_"] 19 | km.__dict__["_n_threads"] = ckpt["_n_threads"] 20 | km.__dict__["cluster_centers_"] = ckpt["cluster_centers_"] 21 | kmeans_dict[spk] = km 22 | return kmeans_dict 23 | 24 | 25 | def check_speaker(model: Any, speaker: Any): 26 | if speaker not in model: 27 | raise ValueError(f"Speaker {speaker} not in {list(model.keys())}") 28 | 29 | 30 | def get_cluster_result(model: Any, x: Any, speaker: Any): 31 | """ 32 | x: np.array [t, 256] 33 | return cluster class result 34 | """ 35 | check_speaker(model, speaker) 36 | return model[speaker].predict(x) 37 | 38 | 39 | def get_cluster_center_result(model: Any, x: Any, speaker: Any): 40 | """x: np.array [t, 256]""" 41 | check_speaker(model, speaker) 42 | predict = model[speaker].predict(x) 43 | return model[speaker].cluster_centers_[predict] 44 | 45 | 46 | def get_center(model: Any, x: Any, speaker: Any): 47 | check_speaker(model, speaker) 48 | return model[speaker].cluster_centers_[x] 49 |
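A minimal sketch of how the per-speaker KMeans dict loaded above is consumed at inference time; the checkpoint path, speaker name, and the random feature array are illustrative stand-ins, not shipped defaults:

```python
import numpy as np

from so_vits_svc_fork.cluster import get_cluster_center_result, get_cluster_model

# Load a trained cluster checkpoint (path is a placeholder).
model = get_cluster_model("logs/44k/kmeans.pt")
# Stand-in for HuBERT content features of shape [t, 256].
content = np.random.randn(100, 256).astype(np.float32)
# Snap each frame's features to its nearest cluster center -> [t, 256].
centers = get_cluster_center_result(model, content, "kiritan")
```

-------------------------------------------------------------------------------- /src/so_vits_svc_fork/cluster/train_cluster.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import math 4 | from logging import getLogger 5 | from pathlib import Path 6 | from typing import Any 7 | 8 | import numpy as np 9 | import torch 10 | from cm_time import timer 11 | from joblib import Parallel, delayed 12 | from sklearn.cluster import KMeans, MiniBatchKMeans 13 | from tqdm_joblib import tqdm_joblib 14 | 15 | LOG = getLogger(__name__) 16 | 17 | 18 | def train_cluster( 19 | input_dir: Path | str, 20 | n_clusters: int, 21 | use_minibatch: bool = True, 22 | batch_size: int = 4096, 23 |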
partial_fit: bool = False, 24 | verbose: bool = False, 25 | ) -> dict: 26 | input_dir = Path(input_dir) 27 | if not partial_fit: 28 | LOG.info(f"Loading features from {input_dir}") 29 | features = [] 30 | for path in input_dir.rglob("*.data.pt"): 31 | with path.open("rb") as f: 32 | features.append( 33 | torch.load(f, weights_only=True)["content"].squeeze(0).numpy().T 34 | ) 35 | if not features: 36 | raise ValueError(f"No features found in {input_dir}") 37 | features = np.concatenate(features, axis=0).astype(np.float32) 38 | if features.shape[0] < n_clusters: 39 | raise ValueError( 40 | "Too few HuBERT features to cluster. Consider using a smaller number of clusters." 41 | ) 42 | LOG.info( 43 | f"shape: {features.shape}, size: {features.nbytes/1024**2:.2f} MB, dtype: {features.dtype}" 44 | ) 45 | with timer() as t: 46 | if use_minibatch: 47 | kmeans = MiniBatchKMeans( 48 | n_clusters=n_clusters, 49 | verbose=verbose, 50 | batch_size=batch_size, 51 | max_iter=80, 52 | n_init="auto", 53 | ).fit(features) 54 | else: 55 | kmeans = KMeans( 56 | n_clusters=n_clusters, verbose=verbose, n_init="auto" 57 | ).fit(features) 58 | LOG.info(f"Clustering took {t.elapsed:.2f} seconds") 59 | 60 | x = { 61 | "n_features_in_": kmeans.n_features_in_, 62 | "_n_threads": kmeans._n_threads, 63 | "cluster_centers_": kmeans.cluster_centers_, 64 | } 65 | return x 66 | else: 67 | # minibatch partial fit 68 | paths = list(input_dir.rglob("*.data.pt")) 69 | if len(paths) == 0: 70 | raise ValueError(f"No features found in {input_dir}") 71 | LOG.info(f"Found {len(paths)} features in {input_dir}") 72 | n_batches = math.ceil(len(paths) / batch_size) 73 | LOG.info(f"Splitting into {n_batches} batches") 74 | with timer() as t: 75 | kmeans = MiniBatchKMeans( 76 | n_clusters=n_clusters, 77 | verbose=verbose, 78 | batch_size=batch_size, 79 | max_iter=80, 80 | n_init="auto", 81 | ) 82 | for i in range(0, len(paths), batch_size): 83 | LOG.info( 84 | f"Processing batch {i//batch_size+1}/{n_batches} for speaker {input_dir.stem}" 85 | ) 86 | features = [] 87 | for path in paths[i : i + batch_size]: 88 | with path.open("rb") as f: 89 | features.append( 90 | torch.load(f, weights_only=True)["content"] 91 | .squeeze(0) 92 | .numpy() 93 | .T 94 | ) 95 | features = np.concatenate(features, axis=0).astype(np.float32) 96 | kmeans.partial_fit(features) 97 | LOG.info(f"Clustering took {t.elapsed:.2f} seconds") 98 | 99 | x = { 100 | "n_features_in_": kmeans.n_features_in_, 101 | "_n_threads": kmeans._n_threads, 102 | "cluster_centers_": kmeans.cluster_centers_, 103 | } 104 | return x 105 | 106 | 107 | def main( 108 | input_dir: Path | str, 109 | output_path: Path | str, 110 | n_clusters: int = 10000, 111 | use_minibatch: bool = True, 112 | batch_size: int = 4096, 113 | partial_fit: bool = False, 114 | verbose: bool = False, 115 | ) -> None: 116 | input_dir = Path(input_dir) 117 | output_path = Path(output_path) 118 | 119 | if not (use_minibatch or not partial_fit): 120 | raise ValueError("partial_fit requires use_minibatch") 121 | 122 | def train_cluster_(input_path: Path, **kwargs: Any) -> tuple[str, dict]: 123 | return input_path.stem, train_cluster(input_path, **kwargs) 124 | 125 | with tqdm_joblib(desc="Training clusters", total=len(list(input_dir.iterdir()))): 126 | parallel_result = Parallel(n_jobs=-1)( 127 | delayed(train_cluster_)( 128 | speaker_name, 129 | n_clusters=n_clusters, 130 | use_minibatch=use_minibatch, 131 | batch_size=batch_size, 132 | partial_fit=partial_fit, 133 | verbose=verbose, 134 | ) 135 | for speaker_name in 
input_dir.iterdir() 136 | ) 137 | assert parallel_result is not None 138 | checkpoint = dict(parallel_result) 139 | output_path.parent.mkdir(exist_ok=True, parents=True) 140 | with output_path.open("wb") as f: 141 | torch.save(checkpoint, f) 142 | -------------------------------------------------------------------------------- /src/so_vits_svc_fork/dataset.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from pathlib import Path 4 | from random import Random 5 | from typing import Sequence 6 | 7 | import torch 8 | import torch.nn as nn 9 | import torch.nn.functional as F 10 | from torch.utils.data import Dataset 11 | 12 | from .hparams import HParams 13 | 14 | 15 | class TextAudioDataset(Dataset): 16 | def __init__(self, hps: HParams, is_validation: bool = False): 17 | self.datapaths = [ 18 | Path(x).parent / (Path(x).name + ".data.pt") 19 | for x in Path( 20 | hps.data.validation_files if is_validation else hps.data.training_files 21 | ) 22 | .read_text("utf-8") 23 | .splitlines() 24 | ] 25 | self.hps = hps 26 | self.random = Random(hps.train.seed) 27 | self.random.shuffle(self.datapaths) 28 | self.max_spec_len = 800 29 | 30 | def __getitem__(self, index: int) -> dict[str, torch.Tensor]: 31 | with Path(self.datapaths[index]).open("rb") as f: 32 | data = torch.load(f, weights_only=True, map_location="cpu") 33 | 34 | # cut long data randomly 35 | spec_len = data["mel_spec"].shape[1] 36 | hop_len = self.hps.data.hop_length 37 | if spec_len > self.max_spec_len: 38 | start = self.random.randint(0, spec_len - self.max_spec_len) 39 | end = start + self.max_spec_len - 10 40 | for key in data.keys(): 41 | if key == "audio": 42 | data[key] = data[key][:, start * hop_len : end * hop_len] 43 | elif key == "spk": 44 | continue 45 | else: 46 | data[key] = data[key][..., start:end] 47 | torch.cuda.empty_cache() 48 | return data 49 | 50 | def __len__(self) -> int: 51 | return len(self.datapaths) 52 | 53 | 54 | def _pad_stack(array: Sequence[torch.Tensor]) -> torch.Tensor: 55 | max_idx = torch.argmax(torch.tensor([x_.shape[-1] for x_ in array])) 56 | max_x = array[max_idx] 57 | x_padded = [ 58 | F.pad(x_, (0, max_x.shape[-1] - x_.shape[-1]), mode="constant", value=0) 59 | for x_ in array 60 | ] 61 | return torch.stack(x_padded) 62 | 63 | 64 | class TextAudioCollate(nn.Module): 65 | def forward( 66 | self, batch: Sequence[dict[str, torch.Tensor]] 67 | ) -> tuple[torch.Tensor, ...]: 68 | batch = [b for b in batch if b is not None] 69 | batch = list(sorted(batch, key=lambda x: x["mel_spec"].shape[1], reverse=True)) 70 | lengths = torch.tensor([b["mel_spec"].shape[1] for b in batch]).long() 71 | results = {} 72 | for key in batch[0].keys(): 73 | if key not in ["spk"]: 74 | results[key] = _pad_stack([b[key] for b in batch]).cpu() 75 | else: 76 | results[key] = torch.tensor([[b[key]] for b in batch]).cpu() 77 | 78 | return ( 79 | results["content"], 80 | results["f0"], 81 | results["spec"], 82 | results["mel_spec"], 83 | results["audio"], 84 | results["spk"], 85 | lengths, 86 | results["uv"], 87 | ) 88 | -------------------------------------------------------------------------------- /src/so_vits_svc_fork/default_gui_presets.json: -------------------------------------------------------------------------------- 1 | { 2 | "Default VC (GPU, GTX 1060)": { 3 | "silence_threshold": -35.0, 4 | "transpose": 12.0, 5 | "auto_predict_f0": false, 6 | "f0_method": "dio", 7 | "cluster_infer_ratio": 0.0, 8 | "noise_scale": 0.4, 9 | 
"pad_seconds": 0.1, 10 | "chunk_seconds": 0.5, 11 | "absolute_thresh": true, 12 | "max_chunk_seconds": 40, 13 | "crossfade_seconds": 0.05, 14 | "block_seconds": 0.35, 15 | "additional_infer_before_seconds": 0.15, 16 | "additional_infer_after_seconds": 0.1, 17 | "realtime_algorithm": "1 (Divide constantly)", 18 | "passthrough_original": false, 19 | "use_gpu": true 20 | }, 21 | "Default VC (CPU)": { 22 | "silence_threshold": -35.0, 23 | "transpose": 12.0, 24 | "auto_predict_f0": false, 25 | "f0_method": "dio", 26 | "cluster_infer_ratio": 0.0, 27 | "noise_scale": 0.4, 28 | "pad_seconds": 0.1, 29 | "chunk_seconds": 0.5, 30 | "absolute_thresh": true, 31 | "max_chunk_seconds": 40, 32 | "crossfade_seconds": 0.05, 33 | "block_seconds": 1.5, 34 | "additional_infer_before_seconds": 0.01, 35 | "additional_infer_after_seconds": 0.01, 36 | "realtime_algorithm": "1 (Divide constantly)", 37 | "passthrough_original": false, 38 | "use_gpu": false 39 | }, 40 | "Default VC (Mobile CPU)": { 41 | "silence_threshold": -35.0, 42 | "transpose": 12.0, 43 | "auto_predict_f0": false, 44 | "f0_method": "dio", 45 | "cluster_infer_ratio": 0.0, 46 | "noise_scale": 0.4, 47 | "pad_seconds": 0.1, 48 | "chunk_seconds": 0.5, 49 | "absolute_thresh": true, 50 | "max_chunk_seconds": 40, 51 | "crossfade_seconds": 0.05, 52 | "block_seconds": 2.5, 53 | "additional_infer_before_seconds": 0.01, 54 | "additional_infer_after_seconds": 0.01, 55 | "realtime_algorithm": "1 (Divide constantly)", 56 | "passthrough_original": false, 57 | "use_gpu": false 58 | }, 59 | "Default VC (Crooning)": { 60 | "silence_threshold": -35.0, 61 | "transpose": 12.0, 62 | "auto_predict_f0": false, 63 | "f0_method": "dio", 64 | "cluster_infer_ratio": 0.0, 65 | "noise_scale": 0.4, 66 | "pad_seconds": 0.1, 67 | "chunk_seconds": 0.5, 68 | "absolute_thresh": true, 69 | "max_chunk_seconds": 40, 70 | "crossfade_seconds": 0.04, 71 | "block_seconds": 0.15, 72 | "additional_infer_before_seconds": 0.05, 73 | "additional_infer_after_seconds": 0.05, 74 | "realtime_algorithm": "1 (Divide constantly)", 75 | "passthrough_original": false, 76 | "use_gpu": true 77 | }, 78 | "Default File": { 79 | "silence_threshold": -35.0, 80 | "transpose": 0.0, 81 | "auto_predict_f0": true, 82 | "f0_method": "crepe", 83 | "cluster_infer_ratio": 0.0, 84 | "noise_scale": 0.4, 85 | "pad_seconds": 0.1, 86 | "chunk_seconds": 0.5, 87 | "absolute_thresh": true, 88 | "max_chunk_seconds": 40, 89 | "auto_play": true, 90 | "passthrough_original": false 91 | } 92 | } 93 | -------------------------------------------------------------------------------- /src/so_vits_svc_fork/f0.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from logging import getLogger 4 | from typing import Any, Literal 5 | 6 | import numpy as np 7 | import torch 8 | import torchcrepe 9 | from cm_time import timer 10 | from numpy import dtype, float32, ndarray 11 | from torch import FloatTensor, Tensor 12 | 13 | from so_vits_svc_fork.utils import get_optimal_device 14 | 15 | LOG = getLogger(__name__) 16 | 17 | 18 | def normalize_f0( 19 | f0: FloatTensor, x_mask: FloatTensor, uv: FloatTensor, random_scale=True 20 | ) -> FloatTensor: 21 | # calculate means based on x_mask 22 | uv_sum = torch.sum(uv, dim=1, keepdim=True) 23 | uv_sum[uv_sum == 0] = 9999 24 | means = torch.sum(f0[:, 0, :] * uv, dim=1, keepdim=True) / uv_sum 25 | 26 | if random_scale: 27 | factor = torch.Tensor(f0.shape[0], 1).uniform_(0.8, 1.2).to(f0.device) 28 | else: 29 | factor = 
torch.ones(f0.shape[0], 1).to(f0.device) 30 | # normalize f0 based on means and factor 31 | f0_norm = (f0 - means.unsqueeze(-1)) * factor.unsqueeze(-1) 32 | if torch.isnan(f0_norm).any(): 33 | exit(0) 34 | return f0_norm * x_mask 35 | 36 | 37 | def interpolate_f0( 38 | f0: ndarray[Any, dtype[float32]] 39 | ) -> tuple[ndarray[Any, dtype[float32]], ndarray[Any, dtype[float32]]]: 40 | data = np.reshape(f0, (f0.size, 1)) 41 | 42 | vuv_vector = np.zeros((data.size, 1), dtype=np.float32) 43 | vuv_vector[data > 0.0] = 1.0 44 | vuv_vector[data <= 0.0] = 0.0 45 | 46 | ip_data = data 47 | 48 | frame_number = data.size 49 | last_value = 0.0 50 | for i in range(frame_number): 51 | if data[i] <= 0.0: 52 | j = i + 1 53 | for j in range(i + 1, frame_number): 54 | if data[j] > 0.0: 55 | break 56 | if j < frame_number - 1: 57 | if last_value > 0.0: 58 | step = (data[j] - data[i - 1]) / float(j - i) 59 | for k in range(i, j): 60 | ip_data[k] = data[i - 1] + step * (k - i + 1) 61 | else: 62 | for k in range(i, j): 63 | ip_data[k] = data[j] 64 | else: 65 | for k in range(i, frame_number): 66 | ip_data[k] = last_value 67 | else: 68 | ip_data[i] = data[i] 69 | last_value = data[i] 70 | 71 | return ip_data[:, 0], vuv_vector[:, 0] 72 | 73 | 74 | def compute_f0_parselmouth( 75 | wav_numpy: ndarray[Any, dtype[float32]], 76 | p_len: None | int = None, 77 | sampling_rate: int = 44100, 78 | hop_length: int = 512, 79 | ): 80 | import parselmouth 81 | 82 | x = wav_numpy 83 | if p_len is None: 84 | p_len = x.shape[0] // hop_length 85 | else: 86 | assert abs(p_len - x.shape[0] // hop_length) < 4, "pad length error" 87 | time_step = hop_length / sampling_rate * 1000 88 | f0_min = 50 89 | f0_max = 1100 90 | f0 = ( 91 | parselmouth.Sound(x, sampling_rate) 92 | .to_pitch_ac( 93 | time_step=time_step / 1000, 94 | voicing_threshold=0.6, 95 | pitch_floor=f0_min, 96 | pitch_ceiling=f0_max, 97 | ) 98 | .selected_array["frequency"] 99 | ) 100 | 101 | pad_size = (p_len - len(f0) + 1) // 2 102 | if pad_size > 0 or p_len - len(f0) - pad_size > 0: 103 | f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant") 104 | return f0 105 | 106 | 107 | def _resize_f0( 108 | x: ndarray[Any, dtype[float32]], target_len: int 109 | ) -> ndarray[Any, dtype[float32]]: 110 | source = np.array(x) 111 | source[source < 0.001] = np.nan 112 | target = np.interp( 113 | np.arange(0, len(source) * target_len, len(source)) / target_len, 114 | np.arange(0, len(source)), 115 | source, 116 | ) 117 | res = np.nan_to_num(target) 118 | return res 119 | 120 | 121 | def compute_f0_pyworld( 122 | wav_numpy: ndarray[Any, dtype[float32]], 123 | p_len: None | int = None, 124 | sampling_rate: int = 44100, 125 | hop_length: int = 512, 126 | type_: Literal["dio", "harvest"] = "dio", 127 | ): 128 | import pyworld 129 | 130 | if p_len is None: 131 | p_len = wav_numpy.shape[0] // hop_length 132 | if type_ == "dio": 133 | f0, t = pyworld.dio( 134 | wav_numpy.astype(np.double), 135 | fs=sampling_rate, 136 | f0_ceil=f0_max, 137 | f0_floor=f0_min, 138 | frame_period=1000 * hop_length / sampling_rate, 139 | ) 140 | elif type_ == "harvest": 141 | f0, t = pyworld.harvest( 142 | wav_numpy.astype(np.double), 143 | fs=sampling_rate, 144 | f0_ceil=f0_max, 145 | f0_floor=f0_min, 146 | frame_period=1000 * hop_length / sampling_rate, 147 | ) 148 | f0 = pyworld.stonemask(wav_numpy.astype(np.double), f0, t, sampling_rate) 149 | for index, pitch in enumerate(f0): 150 | f0[index] = round(pitch, 1) 151 | return _resize_f0(f0, p_len) 152 | 153 | 154 | def compute_f0_crepe( 155 | 
wav_numpy: ndarray[Any, dtype[float32]], 156 | p_len: None | int = None, 157 | sampling_rate: int = 44100, 158 | hop_length: int = 512, 159 | device: str | torch.device = get_optimal_device(), 160 | model: Literal["full", "tiny"] = "full", 161 | ): 162 | audio = torch.from_numpy(wav_numpy).to(device, copy=True) 163 | audio = torch.unsqueeze(audio, dim=0) 164 | 165 | if audio.ndim == 2 and audio.shape[0] > 1: 166 | audio = torch.mean(audio, dim=0, keepdim=True).detach() 167 | # (T) -> (1, T) 168 | audio = audio.detach() 169 | 170 | pitch: Tensor = torchcrepe.predict( 171 | audio, 172 | sampling_rate, 173 | hop_length, 174 | f0_min, 175 | f0_max, 176 | model, 177 | batch_size=hop_length * 2, 178 | device=device, 179 | pad=True, 180 | ) 181 | 182 | f0 = pitch.squeeze(0).cpu().float().numpy() 183 | p_len = p_len or wav_numpy.shape[0] // hop_length 184 | f0 = _resize_f0(f0, p_len) 185 | return f0 186 | 187 | 188 | def compute_f0( 189 | wav_numpy: ndarray[Any, dtype[float32]], 190 | p_len: None | int = None, 191 | sampling_rate: int = 44100, 192 | hop_length: int = 512, 193 | method: Literal["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"] = "dio", 194 | **kwargs, 195 | ): 196 | with timer() as t: 197 | wav_numpy = wav_numpy.astype(np.float32) 198 | wav_numpy /= np.quantile(np.abs(wav_numpy), 0.999) 199 | if method in ["dio", "harvest"]: 200 | f0 = compute_f0_pyworld(wav_numpy, p_len, sampling_rate, hop_length, method) 201 | elif method == "crepe": 202 | f0 = compute_f0_crepe(wav_numpy, p_len, sampling_rate, hop_length, **kwargs) 203 | elif method == "crepe-tiny": 204 | f0 = compute_f0_crepe( 205 | wav_numpy, p_len, sampling_rate, hop_length, model="tiny", **kwargs 206 | ) 207 | elif method == "parselmouth": 208 | f0 = compute_f0_parselmouth(wav_numpy, p_len, sampling_rate, hop_length) 209 | else: 210 | raise ValueError( 211 | "method must be dio, crepe, crepe-tiny, harvest or parselmouth" 212 | ) 213 | rtf = t.elapsed / (len(wav_numpy) / sampling_rate) 214 | LOG.info(f"F0 inference time: {t.elapsed:.3f}s, RTF: {rtf:.3f}") 215 | return f0 216 | 217 | 218 | def f0_to_coarse(f0: torch.Tensor | float): 219 | is_torch = isinstance(f0, torch.Tensor) 220 | f0_mel = 1127 * (1 + f0 / 700).log() if is_torch else 1127 * np.log(1 + f0 / 700) 221 | f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * (f0_bin - 2) / ( 222 | f0_mel_max - f0_mel_min 223 | ) + 1 224 | 225 | f0_mel[f0_mel <= 1] = 1 226 | f0_mel[f0_mel > f0_bin - 1] = f0_bin - 1 227 | f0_coarse = (f0_mel + 0.5).long() if is_torch else np.rint(f0_mel).astype(np.int64)  # np.int was removed in NumPy 1.24; use a concrete dtype 228 | assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, ( 229 | f0_coarse.max(), 230 | f0_coarse.min(), 231 | ) 232 | return f0_coarse 233 | 234 | 235 | f0_bin = 256 236 | f0_max = 1100.0 237 | f0_min = 50.0 238 | f0_mel_min = 1127 * np.log(1 + f0_min / 700) 239 | f0_mel_max = 1127 * np.log(1 + f0_max / 700) 240 |
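A minimal sketch of calling `compute_f0` and `f0_to_coarse` from this module; the synthetic sine below is a stand-in for real audio:

```python
import numpy as np
import torch

from so_vits_svc_fork.f0 import compute_f0, f0_to_coarse

# One second of a 220 Hz tone at 44.1 kHz, purely illustrative input.
sr = 44100
t = np.arange(sr, dtype=np.float32) / sr
wav = np.sin(2 * np.pi * 220.0 * t).astype(np.float32)

# F0 per hop: shape (len(wav) // hop_length,).
f0 = compute_f0(wav, sampling_rate=sr, hop_length=512, method="dio")
# Quantize to the module's f0_bin coarse pitch buckets (values in [1, 255]).
coarse = f0_to_coarse(torch.from_numpy(f0))
```

-------------------------------------------------------------------------------- /src/so_vits_svc_fork/hparams.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Any 4 | 5 | 6 | class HParams: 7 | def __init__(self, **kwargs: Any) -> None: 8 | for k, v in kwargs.items(): 9 | if type(v) == dict: # noqa 10 | v = HParams(**v) 11 | self[k] = v 12 | 13 | def keys(self): 14 | return self.__dict__.keys() 15 | 16 | def items(self): 17 | return self.__dict__.items() 18 | 19 | def values(self): 20 | return self.__dict__.values() 21 | 22 | def get(self, key: str, default: Any = None):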
23 | return self.__dict__.get(key, default) 24 | 25 | def __len__(self): 26 | return len(self.__dict__) 27 | 28 | def __getitem__(self, key): 29 | return getattr(self, key) 30 | 31 | def __setitem__(self, key, value): 32 | return setattr(self, key, value) 33 | 34 | def __contains__(self, key): 35 | return key in self.__dict__ 36 | 37 | def __repr__(self): 38 | return self.__dict__.__repr__() 39 | -------------------------------------------------------------------------------- /src/so_vits_svc_fork/inference/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/voicepaw/so-vits-svc-fork/0f015e32aada5cf7481f91bbe6758e574c9c5f39/src/so_vits_svc_fork/inference/__init__.py -------------------------------------------------------------------------------- /src/so_vits_svc_fork/inference/main.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from logging import getLogger 4 | from pathlib import Path 5 | from typing import Literal, Sequence 6 | 7 | import librosa 8 | import numpy as np 9 | import soundfile 10 | import torch 11 | from cm_time import timer 12 | from tqdm import tqdm 13 | 14 | from so_vits_svc_fork.inference.core import RealtimeVC, RealtimeVC2, Svc 15 | from so_vits_svc_fork.utils import get_optimal_device 16 | 17 | LOG = getLogger(__name__) 18 | 19 | 20 | def infer( 21 | *, 22 | # paths 23 | input_path: Path | str | Sequence[Path | str], 24 | output_path: Path | str | Sequence[Path | str], 25 | model_path: Path | str, 26 | config_path: Path | str, 27 | recursive: bool = False, 28 | # svc config 29 | speaker: int | str, 30 | cluster_model_path: Path | str | None = None, 31 | transpose: int = 0, 32 | auto_predict_f0: bool = False, 33 | cluster_infer_ratio: float = 0, 34 | noise_scale: float = 0.4, 35 | f0_method: Literal["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"] = "dio", 36 | # slice config 37 | db_thresh: int = -40, 38 | pad_seconds: float = 0.5, 39 | chunk_seconds: float = 0.5, 40 | absolute_thresh: bool = False, 41 | max_chunk_seconds: float = 40, 42 | device: str | torch.device = get_optimal_device(), 43 | ): 44 | if isinstance(input_path, (str, Path)): 45 | input_path = [input_path] 46 | if isinstance(output_path, (str, Path)): 47 | output_path = [output_path] 48 | if len(input_path) != len(output_path): 49 | raise ValueError( 50 | f"input_path and output_path must have same length, but got {len(input_path)} and {len(output_path)}" 51 | ) 52 | 53 | model_path = Path(model_path) 54 | config_path = Path(config_path) 55 | output_path = [Path(p) for p in output_path] 56 | input_path = [Path(p) for p in input_path] 57 | output_paths = [] 58 | input_paths = [] 59 | 60 | for input_path, output_path in zip(input_path, output_path): 61 | if input_path.is_dir(): 62 | if not recursive: 63 | raise ValueError( 64 | f"input_path is a directory, but recursive is False: {input_path}" 65 | ) 66 | new_paths = list(input_path.rglob("*.*"))  # glob only this directory's files 67 | input_paths.extend(new_paths) 68 | # pair each discovered file with a mirrored output path (was: iterating the whole accumulated input_paths list, wrong for multiple directory inputs) 69 | output_paths.extend([output_path / p.relative_to(input_path) for p in new_paths]) 70 | continue 71 | input_paths.append(input_path) 72 | output_paths.append(output_path) 73 | 74 | cluster_model_path = Path(cluster_model_path) if cluster_model_path else None 75 | svc_model = Svc( 76 | net_g_path=model_path.as_posix(), 77 | config_path=config_path.as_posix(), 78 | cluster_model_path=( 79 | cluster_model_path.as_posix() if cluster_model_path else None 80 | ), 81
| device=device, 82 | ) 83 | 84 | try: 85 | pbar = tqdm(list(zip(input_paths, output_paths)), disable=len(input_paths) == 1) 86 | for input_path, output_path in pbar: 87 | pbar.set_description(f"{input_path}") 88 | try: 89 | audio, _ = librosa.load(str(input_path), sr=svc_model.target_sample) 90 | except Exception as e: 91 | LOG.error(f"Failed to load {input_path}") 92 | LOG.exception(e) 93 | continue 94 | output_path.parent.mkdir(parents=True, exist_ok=True) 95 | audio = svc_model.infer_silence( 96 | audio.astype(np.float32), 97 | speaker=speaker, 98 | transpose=transpose, 99 | auto_predict_f0=auto_predict_f0, 100 | cluster_infer_ratio=cluster_infer_ratio, 101 | noise_scale=noise_scale, 102 | f0_method=f0_method, 103 | db_thresh=db_thresh, 104 | pad_seconds=pad_seconds, 105 | chunk_seconds=chunk_seconds, 106 | absolute_thresh=absolute_thresh, 107 | max_chunk_seconds=max_chunk_seconds, 108 | ) 109 | soundfile.write(str(output_path), audio, svc_model.target_sample) 110 | finally: 111 | del svc_model 112 | torch.cuda.empty_cache() 113 | 114 | 115 | def realtime( 116 | *, 117 | # paths 118 | model_path: Path | str, 119 | config_path: Path | str, 120 | # svc config 121 | speaker: str, 122 | cluster_model_path: Path | str | None = None, 123 | transpose: int = 0, 124 | auto_predict_f0: bool = False, 125 | cluster_infer_ratio: float = 0, 126 | noise_scale: float = 0.4, 127 | f0_method: Literal["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"] = "dio", 128 | # slice config 129 | db_thresh: int = -40, 130 | pad_seconds: float = 0.5, 131 | chunk_seconds: float = 0.5, 132 | # realtime config 133 | crossfade_seconds: float = 0.05, 134 | additional_infer_before_seconds: float = 0.2, 135 | additional_infer_after_seconds: float = 0.1, 136 | block_seconds: float = 0.5, 137 | version: int = 2, 138 | input_device: int | str | None = None, 139 | output_device: int | str | None = None, 140 | device: str | torch.device = get_optimal_device(), 141 | passthrough_original: bool = False, 142 | ): 143 | import sounddevice as sd 144 | 145 | model_path = Path(model_path) 146 | config_path = Path(config_path) 147 | cluster_model_path = Path(cluster_model_path) if cluster_model_path else None 148 | svc_model = Svc( 149 | net_g_path=model_path.as_posix(), 150 | config_path=config_path.as_posix(), 151 | cluster_model_path=( 152 | cluster_model_path.as_posix() if cluster_model_path else None 153 | ), 154 | device=device, 155 | ) 156 | 157 | LOG.info("Creating realtime model...") 158 | if version == 1: 159 | model = RealtimeVC( 160 | svc_model=svc_model, 161 | crossfade_len=int(crossfade_seconds * svc_model.target_sample), 162 | additional_infer_before_len=int( 163 | additional_infer_before_seconds * svc_model.target_sample 164 | ), 165 | additional_infer_after_len=int( 166 | additional_infer_after_seconds * svc_model.target_sample 167 | ), 168 | ) 169 | else: 170 | model = RealtimeVC2( 171 | svc_model=svc_model, 172 | ) 173 | 174 | # LOG all device info 175 | devices = sd.query_devices() 176 | LOG.info(f"Device: {devices}") 177 | if isinstance(input_device, str): 178 | input_device_candidates = [ 179 | i for i, d in enumerate(devices) if d["name"] == input_device 180 | ] 181 | if len(input_device_candidates) == 0: 182 | LOG.warning(f"Input device {input_device} not found, using default") 183 | input_device = None 184 | else: 185 | input_device = input_device_candidates[0] 186 | if isinstance(output_device, str): 187 | output_device_candidates = [ 188 | i for i, d in enumerate(devices) if d["name"] == output_device 
189 | ] 190 | if len(output_device_candidates) == 0: 191 | LOG.warning(f"Output device {output_device} not found, using default") 192 | output_device = None 193 | else: 194 | output_device = output_device_candidates[0] 195 | if input_device is None or input_device >= len(devices): 196 | input_device = sd.default.device[0] 197 | if output_device is None or output_device >= len(devices): 198 | output_device = sd.default.device[1] 199 | LOG.info( 200 | f"Input Device: {devices[input_device]['name']}, Output Device: {devices[output_device]['name']}" 201 | ) 202 | 203 | # the model's RTF is significantly higher only in the first inference 204 | # there is no better way to warm up the model than to run a dummy inference 205 | # (the model behaves identically in the first and later inferences) 206 | # so we do a dummy inference to warm up the model (1 second of audio) 207 | LOG.info("Warming up the model...") 208 | svc_model.infer( 209 | speaker=speaker, 210 | transpose=transpose, 211 | auto_predict_f0=auto_predict_f0, 212 | cluster_infer_ratio=cluster_infer_ratio, 213 | noise_scale=noise_scale, 214 | f0_method=f0_method, 215 | audio=np.zeros(svc_model.target_sample, dtype=np.float32), 216 | ) 217 | 218 | def callback( 219 | indata: np.ndarray, 220 | outdata: np.ndarray, 221 | frames: int, 222 | time: int, 223 | status: sd.CallbackFlags, 224 | ) -> None: 225 | LOG.debug( 226 | f"Frames: {frames}, Status: {status}, Shape: {indata.shape}, Time: {time}" 227 | ) 228 | 229 | kwargs = dict( 230 | input_audio=indata.mean(axis=1).astype(np.float32), 231 | # svc config 232 | speaker=speaker, 233 | transpose=transpose, 234 | auto_predict_f0=auto_predict_f0, 235 | cluster_infer_ratio=cluster_infer_ratio, 236 | noise_scale=noise_scale, 237 | f0_method=f0_method, 238 | # slice config 239 | db_thresh=db_thresh, 240 | # pad_seconds=pad_seconds, 241 | chunk_seconds=chunk_seconds, 242 | ) 243 | if version == 1: 244 | kwargs["pad_seconds"] = pad_seconds 245 | with timer() as t: 246 | inference = model.process( 247 | **kwargs, 248 | ).reshape(-1, 1) 249 | if passthrough_original: 250 | outdata[:] = (indata + inference) / 2 251 | else: 252 | outdata[:] = inference 253 | rtf = t.elapsed / block_seconds 254 | LOG.info(f"Realtime inference time: {t.elapsed:.3f}s, RTF: {rtf:.3f}") 255 | if rtf > 1: 256 | LOG.warning("RTF is too high, consider increasing block_seconds") 257 | 258 | try: 259 | with sd.Stream( 260 | device=(input_device, output_device), 261 | channels=1, 262 | callback=callback, 263 | samplerate=svc_model.target_sample, 264 | blocksize=int(block_seconds * svc_model.target_sample), 265 | latency="low", 266 | ) as stream: 267 | LOG.info(f"Latency: {stream.latency}") 268 | while True: 269 | sd.sleep(1000) 270 | finally: 271 | # del model, svc_model 272 | torch.cuda.empty_cache() 273 |
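A minimal sketch of starting realtime conversion through this function; the checkpoint and config paths and the speaker name are placeholders, and the call blocks until interrupted, streaming from the default input device to the default output device:

```python
from so_vits_svc_fork.inference.main import realtime

# Streams microphone input through the model until interrupted (Ctrl+C).
realtime(
    model_path="logs/44k/G_0.pth",  # hypothetical checkpoint name
    config_path="configs/44k/config.json",
    speaker="kiritan",  # illustrative speaker name
    block_seconds=0.5,  # larger blocks lower the RTF at the cost of latency
)
```

-------------------------------------------------------------------------------- /src/so_vits_svc_fork/logger.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from logging import DEBUG, INFO, StreamHandler, basicConfig, captureWarnings, getLogger 4 | from pathlib import Path 5 | 6 | from rich.logging import RichHandler 7 | 8 | LOGGER_INIT = False 9 | 10 | 11 | def init_logger() -> None: 12 | global LOGGER_INIT 13 | if LOGGER_INIT: 14 | return 15 | 16 | IS_TEST = "test" in Path.cwd().stem 17 | package_name = sys.modules[__name__].__package__ 18 | basicConfig( 19 | level=INFO, 20 | format="%(asctime)s %(message)s", 21 |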
datefmt="[%X]", 22 | handlers=[ 23 | StreamHandler() if is_notebook() else RichHandler(), 24 | # FileHandler(f"{package_name}.log"), 25 | ], 26 | ) 27 | if IS_TEST: 28 | getLogger(package_name).setLevel(DEBUG) 29 | captureWarnings(True) 30 | LOGGER_INIT = True 31 | 32 | 33 | def is_notebook(): 34 | try: 35 | from IPython import get_ipython 36 | 37 | if "IPKernelApp" not in get_ipython().config: # pragma: no cover 38 | raise ImportError("console") 39 | return False 40 | if "VSCODE_PID" in os.environ: # pragma: no cover 41 | raise ImportError("vscode") 42 | return False 43 | except Exception: 44 | return False 45 | else: # pragma: no cover 46 | return True 47 | -------------------------------------------------------------------------------- /src/so_vits_svc_fork/modules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/voicepaw/so-vits-svc-fork/0f015e32aada5cf7481f91bbe6758e574c9c5f39/src/so_vits_svc_fork/modules/__init__.py -------------------------------------------------------------------------------- /src/so_vits_svc_fork/modules/commons.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import torch 4 | import torch.nn.functional as F 5 | from torch import Tensor 6 | 7 | 8 | def slice_segments(x: Tensor, starts: Tensor, length: int) -> Tensor: 9 | if length is None: 10 | return x 11 | length = min(length, x.size(-1)) 12 | x_slice = torch.zeros((x.size()[:-1] + (length,)), dtype=x.dtype, device=x.device) 13 | ends = starts + length 14 | for i, (start, end) in enumerate(zip(starts, ends)): 15 | # LOG.debug(i, start, end, x.size(), x[i, ..., start:end].size(), x_slice.size()) 16 | # x_slice[i, ...] = x[i, ..., start:end] need to pad 17 | # x_slice[i, ..., :end - start] = x[i, ..., start:end] this does not work 18 | x_slice[i, ...] 
= F.pad(x[i, ..., start:end], (0, max(0, length - x.size(-1)))) 19 | return x_slice 20 | 21 | 22 | def rand_slice_segments_with_pitch( 23 | x: Tensor, f0: Tensor, x_lengths: Tensor | int | None, segment_size: int | None 24 | ): 25 | if segment_size is None: 26 | return x, f0, torch.arange(x.size(0), device=x.device) 27 | if x_lengths is None: 28 | x_lengths = x.size(-1) * torch.ones( 29 | x.size(0), dtype=torch.long, device=x.device 30 | ) 31 | # slice_starts = (torch.rand(z.size(0), device=z.device) * (z_lengths - segment_size)).long() 32 | slice_starts = ( 33 | torch.rand(x.size(0), device=x.device) 34 | * torch.max( 35 | x_lengths - segment_size, torch.zeros_like(x_lengths, device=x.device) 36 | ) 37 | ).long() 38 | z_slice = slice_segments(x, slice_starts, segment_size) 39 | f0_slice = slice_segments(f0, slice_starts, segment_size) 40 | return z_slice, f0_slice, slice_starts 41 | 42 | 43 | def slice_2d_segments(x: Tensor, starts: Tensor, length: int) -> Tensor: 44 | batch_size, num_features, seq_len = x.shape 45 | ends = starts + length 46 | idxs = ( 47 | torch.arange(seq_len, device=x.device) 48 | .unsqueeze(0) 49 | .unsqueeze(1) 50 | .repeat(batch_size, num_features, 1) 51 | ) 52 | mask = (idxs >= starts.unsqueeze(-1).unsqueeze(-1)) & ( 53 | idxs < ends.unsqueeze(-1).unsqueeze(-1) 54 | ) 55 | return x[mask].reshape(batch_size, num_features, length) 56 | 57 | 58 | def slice_1d_segments(x: Tensor, starts: Tensor, length: int) -> Tensor: 59 | batch_size, seq_len = x.shape 60 | ends = starts + length 61 | idxs = torch.arange(seq_len, device=x.device).unsqueeze(0).repeat(batch_size, 1) 62 | mask = (idxs >= starts.unsqueeze(-1)) & (idxs < ends.unsqueeze(-1)) 63 | return x[mask].reshape(batch_size, length) 64 | 65 | 66 | def _slice_segments_v3(x: Tensor, starts: Tensor, length: int) -> Tensor: 67 | shape = x.shape[:-1] + (length,) 68 | ends = starts + length 69 | idxs = torch.arange(x.shape[-1], device=x.device).unsqueeze(0).unsqueeze(0) 70 | unsqueeze_dims = len(shape) - len( 71 | x.shape 72 | ) # calculate number of dimensions to unsqueeze 73 | starts = starts.reshape(starts.shape + (1,) * unsqueeze_dims) 74 | ends = ends.reshape(ends.shape + (1,) * unsqueeze_dims) 75 | mask = (idxs >= starts) & (idxs < ends) 76 | return x[mask].reshape(shape) 77 | 78 | 79 | def init_weights(m, mean=0.0, std=0.01): 80 | classname = m.__class__.__name__ 81 | if classname.find("Conv") != -1: 82 | m.weight.data.normal_(mean, std) 83 | 84 | 85 | def get_padding(kernel_size, dilation=1): 86 | return int((kernel_size * dilation - dilation) / 2) 87 | 88 | 89 | def convert_pad_shape(pad_shape): 90 | l = pad_shape[::-1] 91 | pad_shape = [item for sublist in l for item in sublist] 92 | return pad_shape 93 | 94 | 95 | def subsequent_mask(length): 96 | mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0) 97 | return mask 98 | 99 | 100 | @torch.jit.script 101 | def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): 102 | n_channels_int = n_channels[0] 103 | in_act = input_a + input_b 104 | t_act = torch.tanh(in_act[:, :n_channels_int, :]) 105 | s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) 106 | acts = t_act * s_act 107 | return acts 108 | 109 | 110 | def sequence_mask(length, max_length=None): 111 | if max_length is None: 112 | max_length = length.max() 113 | x = torch.arange(max_length, dtype=length.dtype, device=length.device) 114 | return x.unsqueeze(0) < length.unsqueeze(1) 115 | 116 | 117 | def clip_grad_value_(parameters, clip_value, norm_type=2): 118 | if 
isinstance(parameters, torch.Tensor): 119 | parameters = [parameters] 120 | parameters = list(filter(lambda p: p.grad is not None, parameters)) 121 | norm_type = float(norm_type) 122 | if clip_value is not None: 123 | clip_value = float(clip_value) 124 | 125 | total_norm = 0 126 | for p in parameters: 127 | param_norm = p.grad.data.norm(norm_type) 128 | total_norm += param_norm.item() ** norm_type 129 | if clip_value is not None: 130 | p.grad.data.clamp_(min=-clip_value, max=clip_value) 131 | total_norm = total_norm ** (1.0 / norm_type) 132 | return total_norm 133 | -------------------------------------------------------------------------------- /src/so_vits_svc_fork/modules/decoders/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/voicepaw/so-vits-svc-fork/0f015e32aada5cf7481f91bbe6758e574c9c5f39/src/so_vits_svc_fork/modules/decoders/__init__.py -------------------------------------------------------------------------------- /src/so_vits_svc_fork/modules/decoders/f0.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | from so_vits_svc_fork.modules import attentions as attentions 5 | 6 | 7 | class F0Decoder(nn.Module): 8 | def __init__( 9 | self, 10 | out_channels, 11 | hidden_channels, 12 | filter_channels, 13 | n_heads, 14 | n_layers, 15 | kernel_size, 16 | p_dropout, 17 | spk_channels=0, 18 | ): 19 | super().__init__() 20 | self.out_channels = out_channels 21 | self.hidden_channels = hidden_channels 22 | self.filter_channels = filter_channels 23 | self.n_heads = n_heads 24 | self.n_layers = n_layers 25 | self.kernel_size = kernel_size 26 | self.p_dropout = p_dropout 27 | self.spk_channels = spk_channels 28 | 29 | self.prenet = nn.Conv1d(hidden_channels, hidden_channels, 3, padding=1) 30 | self.decoder = attentions.FFT( 31 | hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout 32 | ) 33 | self.proj = nn.Conv1d(hidden_channels, out_channels, 1) 34 | self.f0_prenet = nn.Conv1d(1, hidden_channels, 3, padding=1) 35 | self.cond = nn.Conv1d(spk_channels, hidden_channels, 1) 36 | 37 | def forward(self, x, norm_f0, x_mask, spk_emb=None): 38 | x = torch.detach(x) 39 | if spk_emb is not None: 40 | spk_emb = torch.detach(spk_emb) 41 | x = x + self.cond(spk_emb) 42 | x += self.f0_prenet(norm_f0) 43 | x = self.prenet(x) * x_mask 44 | x = self.decoder(x * x_mask, x_mask) 45 | x = self.proj(x) * x_mask 46 | return x 47 | -------------------------------------------------------------------------------- /src/so_vits_svc_fork/modules/decoders/hifigan/__init__.py: -------------------------------------------------------------------------------- 1 | from ._models import NSFHifiGANGenerator 2 | 3 | __all__ = ["NSFHifiGANGenerator"] 4 | -------------------------------------------------------------------------------- /src/so_vits_svc_fork/modules/decoders/hifigan/_models.py: -------------------------------------------------------------------------------- 1 | from logging import getLogger 2 | 3 | import numpy as np 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | from torch.nn import Conv1d, ConvTranspose1d 8 | from torch.nn.utils import remove_weight_norm, weight_norm 9 | 10 | from ...modules import ResBlock1, ResBlock2 11 | from ._utils import init_weights 12 | 13 | LOG = getLogger(__name__) 14 | 15 | LRELU_SLOPE = 0.1 16 | 17 | 18 | def padDiff(x): 19 | return F.pad( 20 | F.pad(x, (0, 0, -1, 1), "constant", 0) - x, (0, 0, 0, -1), "constant", 0 21 | ) 22 | 23 | 24 | class SineGen(torch.nn.Module): 25 | """Definition of sine generator 26 | SineGen(samp_rate, harmonic_num = 0, 27 | sine_amp = 0.1, noise_std = 0.003, 28 | voiced_threshold = 0, 29 | flag_for_pulse=False) 30 | samp_rate: sampling rate in Hz 31 | harmonic_num: number of harmonic overtones (default 0) 32 | sine_amp: amplitude of sine-waveform (default 0.1) 33 | noise_std: std of Gaussian noise (default 0.003) 34 | voiced_threshold: F0 threshold for U/V classification (default 0) 35 | flag_for_pulse: this SineGen is used inside PulseGen (default False) 36 | Note: when flag_for_pulse is True, the first time step of a voiced 37 | segment is always sin(np.pi) or cos(0) 38 | """ 39 | 40 | def __init__( 41 | self, 42 | samp_rate, 43 | harmonic_num=0, 44 | sine_amp=0.1, 45 | noise_std=0.003, 46 | voiced_threshold=0, 47 | flag_for_pulse=False, 48 | ): 49 | super().__init__() 50 | self.sine_amp = sine_amp 51 | self.noise_std = noise_std 52 | self.harmonic_num = harmonic_num 53 | self.dim = self.harmonic_num + 1 54 | self.sampling_rate = samp_rate 55 | self.voiced_threshold = voiced_threshold 56 | self.flag_for_pulse = flag_for_pulse 57 | 58 | def _f02uv(self, f0): 59 | # generate uv signal 60 | uv = (f0 > self.voiced_threshold).type(torch.float32) 61 | return uv 62 | 63 | def _f02sine(self, f0_values): 64 | """f0_values: (batchsize, length, dim) 65 | where dim indicates fundamental tone and overtones 66 | """ 67 | # convert to F0 in rad. The integer part n can be ignored 68 | # because 2 * np.pi * n doesn't affect phase 69 | rad_values = (f0_values / self.sampling_rate) % 1 70 | 71 | # initial phase noise (no noise for fundamental component) 72 | rand_ini = torch.rand( 73 | f0_values.shape[0], f0_values.shape[2], device=f0_values.device 74 | ) 75 | rand_ini[:, 0] = 0 76 | rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini 77 | 78 | # instantaneous phase sine[t] = sin(2*pi \sum_i=1 ^{t} rad) 79 | if not self.flag_for_pulse: 80 | # for normal case 81 | 82 | # To prevent torch.cumsum numerical overflow, 83 | # it is necessary to add -1 whenever \sum_k=1^n rad_value_k > 1. 84 | # Buffer tmp_over_one_idx indicates the time step to add -1. 
85 | # This will not change F0 of sine because (x-1) * 2*pi = x * 2*pi 86 | tmp_over_one = torch.cumsum(rad_values, 1) % 1 87 | tmp_over_one_idx = (padDiff(tmp_over_one)) < 0 88 | cumsum_shift = torch.zeros_like(rad_values) 89 | cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0 90 | 91 | sines = torch.sin( 92 | torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi 93 | ) 94 | else: 95 | # If necessary, make sure that the first time step of every 96 | # voiced segment is sin(pi) or cos(0) 97 | # This is used for pulse-train generation 98 | 99 | # identify the last time step in unvoiced segments 100 | uv = self._f02uv(f0_values) 101 | uv_1 = torch.roll(uv, shifts=-1, dims=1) 102 | uv_1[:, -1, :] = 1 103 | u_loc = (uv < 1) * (uv_1 > 0) 104 | 105 | # get the instantaneous phase 106 | tmp_cumsum = torch.cumsum(rad_values, dim=1) 107 | # different batches need to be processed differently 108 | for idx in range(f0_values.shape[0]): 109 | temp_sum = tmp_cumsum[idx, u_loc[idx, :, 0], :] 110 | temp_sum[1:, :] = temp_sum[1:, :] - temp_sum[0:-1, :] 111 | # stores the accumulation of i.phase within 112 | # each voiced segment 113 | tmp_cumsum[idx, :, :] = 0 114 | tmp_cumsum[idx, u_loc[idx, :, 0], :] = temp_sum 115 | 116 | # rad_values - tmp_cumsum: remove the accumulation of i.phase 117 | # within the previous voiced segment. 118 | i_phase = torch.cumsum(rad_values - tmp_cumsum, dim=1) 119 | 120 | # get the sines 121 | sines = torch.cos(i_phase * 2 * np.pi) 122 | return sines 123 | 124 | def forward(self, f0): 125 | """sine_tensor, uv = forward(f0) 126 | input F0: tensor(batchsize=1, length, dim=1) 127 | f0 for unvoiced steps should be 0 128 | output sine_tensor: tensor(batchsize=1, length, dim) 129 | output uv: tensor(batchsize=1, length, 1) 130 | """ 131 | with torch.no_grad(): 132 | # f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device) 133 | # fundamental component 134 | # fn = torch.multiply( 135 | # f0, torch.FloatTensor([[range(1, self.harmonic_num + 2)]]).to(f0.device) 136 | # ) 137 | fn = torch.multiply( 138 | f0, torch.arange(1, self.harmonic_num + 2).to(f0.device).to(f0.dtype) 139 | ) 140 | 141 | # generate sine waveforms 142 | sine_waves = self._f02sine(fn) * self.sine_amp 143 | 144 | # generate uv signal 145 | # uv = torch.ones(f0.shape) 146 | # uv = uv * (f0 > self.voiced_threshold) 147 | uv = self._f02uv(f0) 148 | 149 | # noise: for unvoiced should be similar to sine_amp 150 | # std = self.sine_amp/3 -> max value ~ self.sine_amp 151 | # .
for voiced regions is self.noise_std 152 | noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3 153 | noise = noise_amp * torch.randn_like(sine_waves) 154 | 155 | # first: set the unvoiced part to 0 by uv 156 | # then: additive noise 157 | sine_waves = sine_waves * uv + noise 158 | return sine_waves, uv, noise 159 | 160 | 161 | class SourceModuleHnNSF(torch.nn.Module): 162 | """SourceModule for hn-nsf 163 | SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1, 164 | add_noise_std=0.003, voiced_threshod=0) 165 | sampling_rate: sampling_rate in Hz 166 | harmonic_num: number of harmonic above F0 (default: 0) 167 | sine_amp: amplitude of sine source signal (default: 0.1) 168 | add_noise_std: std of additive Gaussian noise (default: 0.003) 169 | note that amplitude of noise in unvoiced is decided 170 | by sine_amp 171 | voiced_threshold: threshold to set U/V given F0 (default: 0) 172 | Sine_source, noise_source = SourceModuleHnNSF(F0_sampled) 173 | F0_sampled (batchsize, length, 1) 174 | Sine_source (batchsize, length, 1) 175 | noise_source (batchsize, length 1) 176 | uv (batchsize, length, 1) 177 | """ 178 | 179 | def __init__( 180 | self, 181 | sampling_rate, 182 | harmonic_num=0, 183 | sine_amp=0.1, 184 | add_noise_std=0.003, 185 | voiced_threshod=0, 186 | ): 187 | super().__init__() 188 | 189 | self.sine_amp = sine_amp 190 | self.noise_std = add_noise_std 191 | 192 | # to produce sine waveforms 193 | self.l_sin_gen = SineGen( 194 | sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod 195 | ) 196 | 197 | # to merge source harmonics into a single excitation 198 | self.l_linear = torch.nn.Linear(harmonic_num + 1, 1) 199 | self.l_tanh = torch.nn.Tanh() 200 | 201 | def forward(self, x): 202 | """ 203 | Sine_source, noise_source = SourceModuleHnNSF(F0_sampled) 204 | F0_sampled (batchsize, length, 1) 205 | Sine_source (batchsize, length, 1) 206 | noise_source (batchsize, length 1) 207 | """ 208 | # source for harmonic branch 209 | sine_wavs, uv, _ = self.l_sin_gen(x) 210 | sine_merge = self.l_tanh(self.l_linear(sine_wavs)) 211 | 212 | # source for noise branch, in the same shape as uv 213 | noise = torch.randn_like(uv) * self.sine_amp / 3 214 | return sine_merge, noise, uv 215 | 216 | 217 | class NSFHifiGANGenerator(torch.nn.Module): 218 | def __init__(self, h): 219 | super().__init__() 220 | self.h = h 221 | 222 | self.num_kernels = len(h["resblock_kernel_sizes"]) 223 | self.num_upsamples = len(h["upsample_rates"]) 224 | self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(h["upsample_rates"])) 225 | self.m_source = SourceModuleHnNSF( 226 | sampling_rate=h["sampling_rate"], harmonic_num=8 227 | ) 228 | self.noise_convs = nn.ModuleList() 229 | self.conv_pre = weight_norm( 230 | Conv1d(h["inter_channels"], h["upsample_initial_channel"], 7, 1, padding=3) 231 | ) 232 | resblock = ResBlock1 if h["resblock"] == "1" else ResBlock2 233 | self.ups = nn.ModuleList() 234 | for i, (u, k) in enumerate( 235 | zip(h["upsample_rates"], h["upsample_kernel_sizes"]) 236 | ): 237 | c_cur = h["upsample_initial_channel"] // (2 ** (i + 1)) 238 | self.ups.append( 239 | weight_norm( 240 | ConvTranspose1d( 241 | h["upsample_initial_channel"] // (2**i), 242 | h["upsample_initial_channel"] // (2 ** (i + 1)), 243 | k, 244 | u, 245 | padding=(k - u) // 2, 246 | ) 247 | ) 248 | ) 249 | if i + 1 < len(h["upsample_rates"]): # 250 | stride_f0 = np.prod(h["upsample_rates"][i + 1 :]) 251 | self.noise_convs.append( 252 | Conv1d( 253 | 1, 254 | c_cur, 255 | kernel_size=stride_f0 * 2, 256 | 
stride=stride_f0, 257 | padding=stride_f0 // 2, 258 | ) 259 | ) 260 | else: 261 | self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1)) 262 | self.resblocks = nn.ModuleList() 263 | for i in range(len(self.ups)): 264 | ch = h["upsample_initial_channel"] // (2 ** (i + 1)) 265 | for j, (k, d) in enumerate( 266 | zip(h["resblock_kernel_sizes"], h["resblock_dilation_sizes"]) 267 | ): 268 | self.resblocks.append(resblock(ch, k, d)) 269 | 270 | self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3)) 271 | self.ups.apply(init_weights) 272 | self.conv_post.apply(init_weights) 273 | self.cond = nn.Conv1d(h["gin_channels"], h["upsample_initial_channel"], 1) 274 | 275 | def forward(self, x, f0, g=None): 276 | # LOG.info(1,x.shape,f0.shape,f0[:, None].shape) 277 | f0 = self.f0_upsamp(f0[:, None]).transpose(1, 2) # bs,n,t 278 | # LOG.info(2,f0.shape) 279 | har_source, noi_source, uv = self.m_source(f0) 280 | har_source = har_source.transpose(1, 2) 281 | x = self.conv_pre(x) 282 | x = x + self.cond(g) 283 | # LOG.info(124,x.shape,har_source.shape) 284 | for i in range(self.num_upsamples): 285 | x = F.leaky_relu(x, LRELU_SLOPE) 286 | # LOG.info(3,x.shape) 287 | x = self.ups[i](x) 288 | x_source = self.noise_convs[i](har_source) 289 | # LOG.info(4,x_source.shape,har_source.shape,x.shape) 290 | x = x + x_source 291 | xs = None 292 | for j in range(self.num_kernels): 293 | if xs is None: 294 | xs = self.resblocks[i * self.num_kernels + j](x) 295 | else: 296 | xs += self.resblocks[i * self.num_kernels + j](x) 297 | x = xs / self.num_kernels 298 | x = F.leaky_relu(x) 299 | x = self.conv_post(x) 300 | x = torch.tanh(x) 301 | 302 | return x 303 | 304 | def remove_weight_norm(self): 305 | LOG.info("Removing weight norm...") 306 | for l in self.ups: 307 | remove_weight_norm(l) 308 | for l in self.resblocks: 309 | l.remove_weight_norm() 310 | remove_weight_norm(self.conv_pre) 311 | remove_weight_norm(self.conv_post) 312 | -------------------------------------------------------------------------------- /src/so_vits_svc_fork/modules/decoders/hifigan/_utils.py: -------------------------------------------------------------------------------- 1 | from logging import getLogger 2 | 3 | # matplotlib.use("Agg") 4 | 5 | LOG = getLogger(__name__) 6 | 7 | 8 | def init_weights(m, mean=0.0, std=0.01): 9 | classname = m.__class__.__name__ 10 | if classname.find("Conv") != -1: 11 | m.weight.data.normal_(mean, std) 12 | 13 | 14 | def get_padding(kernel_size, dilation=1): 15 | return int((kernel_size * dilation - dilation) / 2) 16 | -------------------------------------------------------------------------------- /src/so_vits_svc_fork/modules/decoders/mb_istft/__init__.py: -------------------------------------------------------------------------------- 1 | from ._generators import ( 2 | Multiband_iSTFT_Generator, 3 | Multistream_iSTFT_Generator, 4 | iSTFT_Generator, 5 | ) 6 | from ._loss import subband_stft_loss 7 | from ._pqmf import PQMF 8 | 9 | __all__ = [ 10 | "subband_stft_loss", 11 | "PQMF", 12 | "iSTFT_Generator", 13 | "Multiband_iSTFT_Generator", 14 | "Multistream_iSTFT_Generator", 15 | ] 16 | -------------------------------------------------------------------------------- /src/so_vits_svc_fork/modules/decoders/mb_istft/_loss.py: -------------------------------------------------------------------------------- 1 | from ._stft_loss import MultiResolutionSTFTLoss 2 | 3 | 4 | def subband_stft_loss(h, y_mb, y_hat_mb): 5 | sub_stft_loss = MultiResolutionSTFTLoss( 6 | h.train.fft_sizes, h.train.hop_sizes, h.train.win_lengths 
7 | ) 8 | y_mb = y_mb.view(-1, y_mb.size(2)) 9 | y_hat_mb = y_hat_mb.view(-1, y_hat_mb.size(2)) 10 | sub_sc_loss, sub_mag_loss = sub_stft_loss(y_hat_mb[:, : y_mb.size(-1)], y_mb) 11 | return sub_sc_loss + sub_mag_loss 12 | -------------------------------------------------------------------------------- /src/so_vits_svc_fork/modules/decoders/mb_istft/_pqmf.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Tomoki Hayashi 2 | # MIT License (https://opensource.org/licenses/MIT) 3 | 4 | """Pseudo QMF modules.""" 5 | 6 | import numpy as np 7 | import torch 8 | import torch.nn.functional as F 9 | from scipy.signal.windows import kaiser 10 | 11 | 12 | def design_prototype_filter(taps=62, cutoff_ratio=0.15, beta=9.0): 13 | """Design prototype filter for PQMF. 14 | This method is based on `A Kaiser window approach for the design of prototype 15 | filters of cosine modulated filterbanks`_. 16 | Args: 17 | taps (int): The number of filter taps. 18 | cutoff_ratio (float): Cut-off frequency ratio. 19 | beta (float): Beta coefficient for kaiser window. 20 | Returns: 21 | ndarray: Impulse response of prototype filter (taps + 1,). 22 | .. _`A Kaiser window approach for the design of prototype filters of cosine modulated filterbanks`: 23 | https://ieeexplore.ieee.org/abstract/document/681427 24 | """ 25 | # check the arguments are valid 26 | assert taps % 2 == 0, "The number of taps must be an even number." 27 | assert 0.0 < cutoff_ratio < 1.0, "Cutoff ratio must be > 0.0 and < 1.0." 28 | 29 | # make initial filter 30 | omega_c = np.pi * cutoff_ratio 31 | with np.errstate(invalid="ignore"): 32 | h_i = np.sin(omega_c * (np.arange(taps + 1) - 0.5 * taps)) / ( 33 | np.pi * (np.arange(taps + 1) - 0.5 * taps) 34 | ) 35 | h_i[taps // 2] = np.cos(0) * cutoff_ratio # fix nan due to indeterminate form 36 | 37 | # apply kaiser window 38 | w = kaiser(taps + 1, beta) 39 | h = h_i * w 40 | 41 | return h 42 | 43 | 44 | class PQMF(torch.nn.Module): 45 | """PQMF module. 46 | This module is based on `Near-perfect-reconstruction pseudo-QMF banks`_. 47 | .. _`Near-perfect-reconstruction pseudo-QMF banks`: 48 | https://ieeexplore.ieee.org/document/258122 49 | """ 50 | 51 | def __init__(self, device, subbands=8, taps=62, cutoff_ratio=0.15, beta=9.0): 52 | """Initialize PQMF module. 53 | Args: 54 | subbands (int): The number of subbands. 55 | taps (int): The number of filter taps. 56 | cutoff_ratio (float): Cut-off frequency ratio. 57 | beta (float): Beta coefficient for kaiser window.
58 | """ 59 | super().__init__() 60 | 61 | # define filter coefficient 62 | h_proto = design_prototype_filter(taps, cutoff_ratio, beta) 63 | h_analysis = np.zeros((subbands, len(h_proto))) 64 | h_synthesis = np.zeros((subbands, len(h_proto))) 65 | for k in range(subbands): 66 | h_analysis[k] = ( 67 | 2 68 | * h_proto 69 | * np.cos( 70 | (2 * k + 1) 71 | * (np.pi / (2 * subbands)) 72 | * (np.arange(taps + 1) - ((taps - 1) / 2)) 73 | + (-1) ** k * np.pi / 4 74 | ) 75 | ) 76 | h_synthesis[k] = ( 77 | 2 78 | * h_proto 79 | * np.cos( 80 | (2 * k + 1) 81 | * (np.pi / (2 * subbands)) 82 | * (np.arange(taps + 1) - ((taps - 1) / 2)) 83 | - (-1) ** k * np.pi / 4 84 | ) 85 | ) 86 | 87 | # convert to tensor 88 | analysis_filter = torch.from_numpy(h_analysis).float().unsqueeze(1).to(device) 89 | synthesis_filter = torch.from_numpy(h_synthesis).float().unsqueeze(0).to(device) 90 | 91 | # register coefficients as buffer 92 | self.register_buffer("analysis_filter", analysis_filter) 93 | self.register_buffer("synthesis_filter", synthesis_filter) 94 | 95 | # filter for downsampling & upsampling 96 | updown_filter = torch.zeros((subbands, subbands, subbands)).float().to(device) 97 | for k in range(subbands): 98 | updown_filter[k, k, 0] = 1.0 99 | self.register_buffer("updown_filter", updown_filter) 100 | self.subbands = subbands 101 | 102 | # keep padding info 103 | self.pad_fn = torch.nn.ConstantPad1d(taps // 2, 0.0) 104 | 105 | def analysis(self, x): 106 | """Analysis with PQMF. 107 | Args: 108 | x (Tensor): Input tensor (B, 1, T). 109 | Returns: 110 | Tensor: Output tensor (B, subbands, T // subbands). 111 | """ 112 | x = F.conv1d(self.pad_fn(x), self.analysis_filter) 113 | return F.conv1d(x, self.updown_filter, stride=self.subbands) 114 | 115 | def synthesis(self, x): 116 | """Synthesis with PQMF. 117 | Args: 118 | x (Tensor): Input tensor (B, subbands, T // subbands). 119 | Returns: 120 | Tensor: Output tensor (B, 1, T). 121 | """ 122 | # NOTE(kan-bayashi): Power will be decreased so here multiply by # subbands. 123 | # Not sure this is the correct way, it is better to check again. 124 | # TODO(kan-bayashi): Understand the reconstruction procedure 125 | x = F.conv_transpose1d( 126 | x, self.updown_filter * self.subbands, stride=self.subbands 127 | ) 128 | return F.conv1d(self.pad_fn(x), self.synthesis_filter) 129 | -------------------------------------------------------------------------------- /src/so_vits_svc_fork/modules/decoders/mb_istft/_stft.py: -------------------------------------------------------------------------------- 1 | """ 2 | BSD 3-Clause License 3 | Copyright (c) 2017, Prem Seetharaman 4 | All rights reserved. 5 | * Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | * Redistributions of source code must retain the above copyright notice, 8 | this list of conditions and the following disclaimer. 9 | * Redistributions in binary form must reproduce the above copyright notice, this 10 | list of conditions and the following disclaimer in the 11 | documentation and/or other materials provided with the distribution. 12 | * Neither the name of the copyright holder nor the names of its 13 | contributors may be used to endorse or promote products derived from this 14 | software without specific prior written permission.
15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 19 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 22 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | """ 26 | 27 | import librosa.util as librosa_util 28 | import numpy as np 29 | import torch 30 | import torch.nn.functional as F 31 | from librosa.util import pad_center, tiny 32 | from scipy.signal import get_window 33 | from torch.autograd import Variable 34 | 35 | 36 | def window_sumsquare( 37 | window, 38 | n_frames, 39 | hop_length=200, 40 | win_length=800, 41 | n_fft=800, 42 | dtype=np.float32, 43 | norm=None, 44 | ): 45 | """ 46 | # from librosa 0.6 47 | Compute the sum-square envelope of a window function at a given hop length. 48 | This is used to estimate modulation effects induced by windowing 49 | observations in short-time fourier transforms. 50 | Parameters 51 | ---------- 52 | window : string, tuple, number, callable, or list-like 53 | Window specification, as in `get_window` 54 | n_frames : int > 0 55 | The number of analysis frames 56 | hop_length : int > 0 57 | The number of samples to advance between frames 58 | win_length : [optional] 59 | The length of the window function. By default, this matches `n_fft`. 60 | n_fft : int > 0 61 | The length of each analysis frame. 
62 | dtype : np.dtype 63 | The data type of the output 64 | Returns 65 | ------- 66 | wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))` 67 | The sum-squared envelope of the window function 68 | """ 69 | if win_length is None: 70 | win_length = n_fft 71 | 72 | n = n_fft + hop_length * (n_frames - 1) 73 | x = np.zeros(n, dtype=dtype) 74 | 75 | # Compute the squared window at the desired length 76 | win_sq = get_window(window, win_length, fftbins=True) 77 | win_sq = librosa_util.normalize(win_sq, norm=norm) ** 2 78 | win_sq = librosa_util.pad_center(win_sq, n_fft) 79 | 80 | # Fill the envelope 81 | for i in range(n_frames): 82 | sample = i * hop_length 83 | x[sample : min(n, sample + n_fft)] += win_sq[: max(0, min(n_fft, n - sample))] 84 | return x 85 | 86 | 87 | class STFT(torch.nn.Module): 88 | """adapted from Prem Seetharaman's https://github.com/pseeth/pytorch-stft""" 89 | 90 | def __init__( 91 | self, filter_length=800, hop_length=200, win_length=800, window="hann" 92 | ): 93 | super().__init__() 94 | self.filter_length = filter_length 95 | self.hop_length = hop_length 96 | self.win_length = win_length 97 | self.window = window 98 | self.forward_transform = None 99 | scale = self.filter_length / self.hop_length 100 | fourier_basis = np.fft.fft(np.eye(self.filter_length)) 101 | 102 | cutoff = int(self.filter_length / 2 + 1) 103 | fourier_basis = np.vstack( 104 | [np.real(fourier_basis[:cutoff, :]), np.imag(fourier_basis[:cutoff, :])] 105 | ) 106 | 107 | forward_basis = torch.FloatTensor(fourier_basis[:, None, :]) 108 | inverse_basis = torch.FloatTensor( 109 | np.linalg.pinv(scale * fourier_basis).T[:, None, :] 110 | ) 111 | 112 | if window is not None: 113 | assert filter_length >= win_length 114 | # get window and zero center pad it to filter_length 115 | fft_window = get_window(window, win_length, fftbins=True) 116 | fft_window = pad_center(fft_window, filter_length) 117 | fft_window = torch.from_numpy(fft_window).float() 118 | 119 | # window the bases 120 | forward_basis *= fft_window 121 | inverse_basis *= fft_window 122 | 123 | self.register_buffer("forward_basis", forward_basis.float()) 124 | self.register_buffer("inverse_basis", inverse_basis.float()) 125 | 126 | def transform(self, input_data): 127 | num_batches = input_data.size(0) 128 | num_samples = input_data.size(1) 129 | 130 | self.num_samples = num_samples 131 | 132 | # similar to librosa, reflect-pad the input 133 | input_data = input_data.view(num_batches, 1, num_samples) 134 | input_data = F.pad( 135 | input_data.unsqueeze(1), 136 | (int(self.filter_length / 2), int(self.filter_length / 2), 0, 0), 137 | mode="reflect", 138 | ) 139 | input_data = input_data.squeeze(1) 140 | 141 | forward_transform = F.conv1d( 142 | input_data, 143 | Variable(self.forward_basis, requires_grad=False), 144 | stride=self.hop_length, 145 | padding=0, 146 | ) 147 | 148 | cutoff = int((self.filter_length / 2) + 1) 149 | real_part = forward_transform[:, :cutoff, :] 150 | imag_part = forward_transform[:, cutoff:, :] 151 | 152 | magnitude = torch.sqrt(real_part**2 + imag_part**2) 153 | phase = torch.autograd.Variable(torch.atan2(imag_part.data, real_part.data)) 154 | 155 | return magnitude, phase 156 | 157 | def inverse(self, magnitude, phase): 158 | recombine_magnitude_phase = torch.cat( 159 | [magnitude * torch.cos(phase), magnitude * torch.sin(phase)], dim=1 160 | ) 161 | 162 | inverse_transform = F.conv_transpose1d( 163 | recombine_magnitude_phase, 164 | Variable(self.inverse_basis, requires_grad=False), 165 | 
stride=self.hop_length, 166 | padding=0, 167 | ) 168 | 169 | if self.window is not None: 170 | window_sum = window_sumsquare( 171 | self.window, 172 | magnitude.size(-1), 173 | hop_length=self.hop_length, 174 | win_length=self.win_length, 175 | n_fft=self.filter_length, 176 | dtype=np.float32, 177 | ) 178 | # remove modulation effects 179 | approx_nonzero_indices = torch.from_numpy( 180 | np.where(window_sum > tiny(window_sum))[0] 181 | ) 182 | window_sum = torch.autograd.Variable( 183 | torch.from_numpy(window_sum), requires_grad=False 184 | ) 185 | window_sum = window_sum.to(inverse_transform.device)  # Tensor.device is a property, not a method 186 | inverse_transform[:, :, approx_nonzero_indices] /= window_sum[ 187 | approx_nonzero_indices 188 | ] 189 | 190 | # scale by hop ratio 191 | inverse_transform *= float(self.filter_length) / self.hop_length 192 | 193 | inverse_transform = inverse_transform[:, :, int(self.filter_length / 2) :] 194 | inverse_transform = inverse_transform[:, :, : -int(self.filter_length / 2) :] 195 | 196 | return inverse_transform 197 | 198 | def forward(self, input_data): 199 | self.magnitude, self.phase = self.transform(input_data) 200 | reconstruction = self.inverse(self.magnitude, self.phase) 201 | return reconstruction 202 | 203 | 204 | class TorchSTFT(torch.nn.Module): 205 | def __init__( 206 | self, filter_length=800, hop_length=200, win_length=800, window="hann" 207 | ): 208 | super().__init__() 209 | self.filter_length = filter_length 210 | self.hop_length = hop_length 211 | self.win_length = win_length 212 | self.window = torch.from_numpy( 213 | get_window(window, win_length, fftbins=True).astype(np.float32) 214 | ) 215 | 216 | def transform(self, input_data): 217 | forward_transform = torch.stft( 218 | input_data, 219 | self.filter_length, 220 | self.hop_length, 221 | self.win_length, 222 | window=self.window, 223 | return_complex=True, 224 | ) 225 | 226 | return torch.abs(forward_transform), torch.angle(forward_transform) 227 | 228 | def inverse(self, magnitude, phase): 229 | inverse_transform = torch.istft( 230 | magnitude * torch.exp(phase * 1j), 231 | self.filter_length, 232 | self.hop_length, 233 | self.win_length, 234 | window=self.window.to(magnitude.device), 235 | ) 236 | 237 | return inverse_transform.unsqueeze( 238 | -2 239 | ) # unsqueeze to stay consistent with conv_transpose1d implementation 240 | 241 | def forward(self, input_data): 242 | self.magnitude, self.phase = self.transform(input_data) 243 | reconstruction = self.inverse(self.magnitude, self.phase) 244 | return reconstruction 245 | -------------------------------------------------------------------------------- /src/so_vits_svc_fork/modules/decoders/mb_istft/_stft_loss.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Tomoki Hayashi 2 | # MIT License (https://opensource.org/licenses/MIT) 3 | 4 | """STFT-based Loss modules.""" 5 | 6 | import torch 7 | import torch.nn.functional as F 8 | 9 | 10 | def stft(x, fft_size, hop_size, win_length, window): 11 | """Perform STFT and convert to magnitude spectrogram. 12 | Args: 13 | x (Tensor): Input signal tensor (B, T). 14 | fft_size (int): FFT size. 15 | hop_size (int): Hop size. 16 | win_length (int): Window length. 17 | window (Tensor): Window tensor. 18 | Returns: 19 | Tensor: Magnitude spectrogram (B, #frames, fft_size // 2 + 1).
20 | """ 21 | x_stft = torch.stft( 22 | x, fft_size, hop_size, win_length, window.to(x.device), return_complex=False 23 | ) 24 | real = x_stft[..., 0] 25 | imag = x_stft[..., 1] 26 | 27 | # NOTE(kan-bayashi): clamp is needed to avoid nan or inf 28 | return torch.sqrt(torch.clamp(real**2 + imag**2, min=1e-7)).transpose(2, 1) 29 | 30 | 31 | class SpectralConvergengeLoss(torch.nn.Module): 32 | """Spectral convergence loss module.""" 33 | 34 | def __init__(self): 35 | """Initialize spectral convergence loss module.""" 36 | super().__init__() 37 | 38 | def forward(self, x_mag, y_mag): 39 | """Calculate forward propagation. 40 | Args: 41 | x_mag (Tensor): Magnitude spectrogram of predicted signal (B, #frames, #freq_bins). 42 | y_mag (Tensor): Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins). 43 | Returns: 44 | Tensor: Spectral convergence loss value. 45 | """ 46 | return torch.norm(y_mag - x_mag) / torch.norm( 47 | y_mag 48 | ) # MB-iSTFT-VITS changed here due to codespell 49 | 50 | 51 | class LogSTFTMagnitudeLoss(torch.nn.Module): 52 | """Log STFT magnitude loss module.""" 53 | 54 | def __init__(self): 55 | """Initialize log STFT magnitude loss module.""" 56 | super().__init__() 57 | 58 | def forward(self, x_mag, y_mag): 59 | """Calculate forward propagation. 60 | Args: 61 | x_mag (Tensor): Magnitude spectrogram of predicted signal (B, #frames, #freq_bins). 62 | y_mag (Tensor): Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins). 63 | Returns: 64 | Tensor: Log STFT magnitude loss value. 65 | """ 66 | return F.l1_loss(torch.log(y_mag), torch.log(x_mag)) 67 | 68 | 69 | class STFTLoss(torch.nn.Module): 70 | """STFT loss module.""" 71 | 72 | def __init__( 73 | self, fft_size=1024, shift_size=120, win_length=600, window="hann_window" 74 | ): 75 | """Initialize STFT loss module.""" 76 | super().__init__() 77 | self.fft_size = fft_size 78 | self.shift_size = shift_size 79 | self.win_length = win_length 80 | self.window = getattr(torch, window)(win_length) 81 | self.spectral_convergenge_loss = SpectralConvergengeLoss() 82 | self.log_stft_magnitude_loss = LogSTFTMagnitudeLoss() 83 | 84 | def forward(self, x, y): 85 | """Calculate forward propagation. 86 | Args: 87 | x (Tensor): Predicted signal (B, T). 88 | y (Tensor): Groundtruth signal (B, T). 89 | Returns: 90 | Tensor: Spectral convergence loss value. 91 | Tensor: Log STFT magnitude loss value. 92 | """ 93 | x_mag = stft(x, self.fft_size, self.shift_size, self.win_length, self.window) 94 | y_mag = stft(y, self.fft_size, self.shift_size, self.win_length, self.window) 95 | sc_loss = self.spectral_convergenge_loss(x_mag, y_mag) 96 | mag_loss = self.log_stft_magnitude_loss(x_mag, y_mag) 97 | 98 | return sc_loss, mag_loss 99 | 100 | 101 | class MultiResolutionSTFTLoss(torch.nn.Module): 102 | """Multi resolution STFT loss module.""" 103 | 104 | def __init__( 105 | self, 106 | fft_sizes=[1024, 2048, 512], 107 | hop_sizes=[120, 240, 50], 108 | win_lengths=[600, 1200, 240], 109 | window="hann_window", 110 | ): 111 | """Initialize Multi resolution STFT loss module. 112 | Args: 113 | fft_sizes (list): List of FFT sizes. 114 | hop_sizes (list): List of hop sizes. 115 | win_lengths (list): List of window lengths. 116 | window (str): Window function type.
117 | """ 118 | super().__init__() 119 | assert len(fft_sizes) == len(hop_sizes) == len(win_lengths) 120 | self.stft_losses = torch.nn.ModuleList() 121 | for fs, ss, wl in zip(fft_sizes, hop_sizes, win_lengths): 122 | self.stft_losses += [STFTLoss(fs, ss, wl, window)] 123 | 124 | def forward(self, x, y): 125 | """Calculate forward propagation. 126 | Args: 127 | x (Tensor): Predicted signal (B, T). 128 | y (Tensor): Groundtruth signal (B, T). 129 | Returns: 130 | Tensor: Multi resolution spectral convergence loss value. 131 | Tensor: Multi resolution log STFT magnitude loss value. 132 | """ 133 | sc_loss = 0.0 134 | mag_loss = 0.0 135 | for f in self.stft_losses: 136 | sc_l, mag_l = f(x, y) 137 | sc_loss += sc_l 138 | mag_loss += mag_l 139 | sc_loss /= len(self.stft_losses) 140 | mag_loss /= len(self.stft_losses) 141 | 142 | return sc_loss, mag_loss 143 | -------------------------------------------------------------------------------- /src/so_vits_svc_fork/modules/descriminators.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch.nn import AvgPool1d, Conv1d, Conv2d 4 | from torch.nn import functional as F 5 | from torch.nn.utils import spectral_norm, weight_norm 6 | 7 | from so_vits_svc_fork.modules import modules as modules 8 | from so_vits_svc_fork.modules.commons import get_padding 9 | 10 | 11 | class DiscriminatorP(torch.nn.Module): 12 | def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False): 13 | super().__init__() 14 | self.period = period 15 | self.use_spectral_norm = use_spectral_norm 16 | norm_f = weight_norm if use_spectral_norm == False else spectral_norm 17 | self.convs = nn.ModuleList( 18 | [ 19 | norm_f( 20 | Conv2d( 21 | 1, 22 | 32, 23 | (kernel_size, 1), 24 | (stride, 1), 25 | padding=(get_padding(kernel_size, 1), 0), 26 | ) 27 | ), 28 | norm_f( 29 | Conv2d( 30 | 32, 31 | 128, 32 | (kernel_size, 1), 33 | (stride, 1), 34 | padding=(get_padding(kernel_size, 1), 0), 35 | ) 36 | ), 37 | norm_f( 38 | Conv2d( 39 | 128, 40 | 512, 41 | (kernel_size, 1), 42 | (stride, 1), 43 | padding=(get_padding(kernel_size, 1), 0), 44 | ) 45 | ), 46 | norm_f( 47 | Conv2d( 48 | 512, 49 | 1024, 50 | (kernel_size, 1), 51 | (stride, 1), 52 | padding=(get_padding(kernel_size, 1), 0), 53 | ) 54 | ), 55 | norm_f( 56 | Conv2d( 57 | 1024, 58 | 1024, 59 | (kernel_size, 1), 60 | 1, 61 | padding=(get_padding(kernel_size, 1), 0), 62 | ) 63 | ), 64 | ] 65 | ) 66 | self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) 67 | 68 | def forward(self, x): 69 | fmap = [] 70 | 71 | # 1d to 2d 72 | b, c, t = x.shape 73 | if t % self.period != 0: # pad first 74 | n_pad = self.period - (t % self.period) 75 | x = F.pad(x, (0, n_pad), "reflect") 76 | t = t + n_pad 77 | x = x.view(b, c, t // self.period, self.period) 78 | 79 | for l in self.convs: 80 | x = l(x) 81 | x = F.leaky_relu(x, modules.LRELU_SLOPE) 82 | fmap.append(x) 83 | x = self.conv_post(x) 84 | fmap.append(x) 85 | x = torch.flatten(x, 1, -1) 86 | 87 | return x, fmap 88 | 89 | 90 | class DiscriminatorS(torch.nn.Module): 91 | def __init__(self, use_spectral_norm=False): 92 | super().__init__() 93 | norm_f = weight_norm if use_spectral_norm == False else spectral_norm 94 | self.convs = nn.ModuleList( 95 | [ 96 | norm_f(Conv1d(1, 16, 15, 1, padding=7)), 97 | norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)), 98 | norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)), 99 | norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)), 100 | 
norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)), 101 | norm_f(Conv1d(1024, 1024, 5, 1, padding=2)), 102 | ] 103 | ) 104 | self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1)) 105 | 106 | def forward(self, x): 107 | fmap = [] 108 | 109 | for l in self.convs: 110 | x = l(x) 111 | x = F.leaky_relu(x, modules.LRELU_SLOPE) 112 | fmap.append(x) 113 | x = self.conv_post(x) 114 | fmap.append(x) 115 | x = torch.flatten(x, 1, -1) 116 | 117 | return x, fmap 118 | 119 | 120 | class MultiPeriodDiscriminator(torch.nn.Module): 121 | def __init__(self, use_spectral_norm=False): 122 | super().__init__() 123 | periods = [2, 3, 5, 7, 11] 124 | 125 | discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)] 126 | discs = discs + [ 127 | DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods 128 | ] 129 | self.discriminators = nn.ModuleList(discs) 130 | 131 | def forward(self, y, y_hat): 132 | y_d_rs = [] 133 | y_d_gs = [] 134 | fmap_rs = [] 135 | fmap_gs = [] 136 | for i, d in enumerate(self.discriminators): 137 | y_d_r, fmap_r = d(y) 138 | y_d_g, fmap_g = d(y_hat) 139 | y_d_rs.append(y_d_r) 140 | y_d_gs.append(y_d_g) 141 | fmap_rs.append(fmap_r) 142 | fmap_gs.append(fmap_g) 143 | 144 | return y_d_rs, y_d_gs, fmap_rs, fmap_gs 145 | 146 | 147 | class MultiScaleDiscriminator(torch.nn.Module): 148 | def __init__(self): 149 | super().__init__() 150 | self.discriminators = nn.ModuleList( 151 | [ 152 | DiscriminatorS(use_spectral_norm=True), 153 | DiscriminatorS(), 154 | DiscriminatorS(), 155 | ] 156 | ) 157 | self.meanpools = nn.ModuleList( 158 | [AvgPool1d(4, 2, padding=2), AvgPool1d(4, 2, padding=2)] 159 | ) 160 | 161 | def forward(self, y, y_hat): 162 | y_d_rs = [] 163 | y_d_gs = [] 164 | fmap_rs = [] 165 | fmap_gs = [] 166 | for i, d in enumerate(self.discriminators): 167 | if i != 0: 168 | y = self.meanpools[i - 1](y) 169 | y_hat = self.meanpools[i - 1](y_hat) 170 | y_d_r, fmap_r = d(y) 171 | y_d_g, fmap_g = d(y_hat) 172 | y_d_rs.append(y_d_r) 173 | fmap_rs.append(fmap_r) 174 | y_d_gs.append(y_d_g) 175 | fmap_gs.append(fmap_g) 176 | 177 | return y_d_rs, y_d_gs, fmap_rs, fmap_gs 178 | -------------------------------------------------------------------------------- /src/so_vits_svc_fork/modules/encoders.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | from so_vits_svc_fork.modules import attentions as attentions 5 | from so_vits_svc_fork.modules import commons as commons 6 | from so_vits_svc_fork.modules import modules as modules 7 | 8 | 9 | class SpeakerEncoder(torch.nn.Module): 10 | def __init__( 11 | self, 12 | mel_n_channels=80, 13 | model_num_layers=3, 14 | model_hidden_size=256, 15 | model_embedding_size=256, 16 | ): 17 | super().__init__() 18 | self.lstm = nn.LSTM( 19 | mel_n_channels, model_hidden_size, model_num_layers, batch_first=True 20 | ) 21 | self.linear = nn.Linear(model_hidden_size, model_embedding_size) 22 | self.relu = nn.ReLU() 23 | 24 | def forward(self, mels): 25 | self.lstm.flatten_parameters() 26 | _, (hidden, _) = self.lstm(mels) 27 | embeds_raw = self.relu(self.linear(hidden[-1])) 28 | return embeds_raw / torch.norm(embeds_raw, dim=1, keepdim=True) 29 | 30 | def compute_partial_slices(self, total_frames, partial_frames, partial_hop): 31 | mel_slices = [] 32 | for i in range(0, total_frames - partial_frames, partial_hop): 33 | mel_range = torch.arange(i, i + partial_frames) 34 | mel_slices.append(mel_range) 35 | 36 | return mel_slices 37 | 38 | def 
embed_utterance(self, mel, partial_frames=128, partial_hop=64): 39 | mel_len = mel.size(1) 40 | last_mel = mel[:, -partial_frames:] 41 | 42 | if mel_len > partial_frames: 43 | mel_slices = self.compute_partial_slices( 44 | mel_len, partial_frames, partial_hop 45 | ) 46 | mels = list(mel[:, s] for s in mel_slices) 47 | mels.append(last_mel) 48 | mels = torch.stack(tuple(mels), 0).squeeze(1) 49 | 50 | with torch.no_grad(): 51 | partial_embeds = self(mels) 52 | embed = torch.mean(partial_embeds, axis=0).unsqueeze(0) 53 | # embed = embed / torch.linalg.norm(embed, 2) 54 | else: 55 | with torch.no_grad(): 56 | embed = self(last_mel) 57 | 58 | return embed 59 | 60 | 61 | class Encoder(nn.Module): 62 | def __init__( 63 | self, 64 | in_channels, 65 | out_channels, 66 | hidden_channels, 67 | kernel_size, 68 | dilation_rate, 69 | n_layers, 70 | gin_channels=0, 71 | ): 72 | super().__init__() 73 | self.in_channels = in_channels 74 | self.out_channels = out_channels 75 | self.hidden_channels = hidden_channels 76 | self.kernel_size = kernel_size 77 | self.dilation_rate = dilation_rate 78 | self.n_layers = n_layers 79 | self.gin_channels = gin_channels 80 | 81 | self.pre = nn.Conv1d(in_channels, hidden_channels, 1) 82 | self.enc = modules.WN( 83 | hidden_channels, 84 | kernel_size, 85 | dilation_rate, 86 | n_layers, 87 | gin_channels=gin_channels, 88 | ) 89 | self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) 90 | 91 | def forward(self, x, x_lengths, g=None): 92 | # print(x.shape,x_lengths.shape) 93 | x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to( 94 | x.dtype 95 | ) 96 | x = self.pre(x) * x_mask 97 | x = self.enc(x, x_mask, g=g) 98 | stats = self.proj(x) * x_mask 99 | m, logs = torch.split(stats, self.out_channels, dim=1) 100 | z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask 101 | return z, m, logs, x_mask 102 | 103 | 104 | class TextEncoder(nn.Module): 105 | def __init__( 106 | self, 107 | out_channels, 108 | hidden_channels, 109 | kernel_size, 110 | n_layers, 111 | gin_channels=0, 112 | filter_channels=None, 113 | n_heads=None, 114 | p_dropout=None, 115 | ): 116 | super().__init__() 117 | self.out_channels = out_channels 118 | self.hidden_channels = hidden_channels 119 | self.kernel_size = kernel_size 120 | self.n_layers = n_layers 121 | self.gin_channels = gin_channels 122 | self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) 123 | self.f0_emb = nn.Embedding(256, hidden_channels) 124 | 125 | self.enc_ = attentions.Encoder( 126 | hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout 127 | ) 128 | 129 | def forward(self, x, x_mask, f0=None, noice_scale=1): 130 | x = x + self.f0_emb(f0).transpose(1, 2) 131 | x = self.enc_(x * x_mask, x_mask) 132 | stats = self.proj(x) * x_mask 133 | m, logs = torch.split(stats, self.out_channels, dim=1) 134 | z = (m + torch.randn_like(m) * torch.exp(logs) * noice_scale) * x_mask 135 | 136 | return z, m, logs, x_mask 137 | -------------------------------------------------------------------------------- /src/so_vits_svc_fork/modules/flows.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | 3 | from so_vits_svc_fork.modules import modules as modules 4 | 5 | 6 | class ResidualCouplingBlock(nn.Module): 7 | def __init__( 8 | self, 9 | channels, 10 | hidden_channels, 11 | kernel_size, 12 | dilation_rate, 13 | n_layers, 14 | n_flows=4, 15 | gin_channels=0, 16 | ): 17 | super().__init__() 18 | self.channels = channels 19 | 
self.hidden_channels = hidden_channels 20 | self.kernel_size = kernel_size 21 | self.dilation_rate = dilation_rate 22 | self.n_layers = n_layers 23 | self.n_flows = n_flows 24 | self.gin_channels = gin_channels 25 | 26 | self.flows = nn.ModuleList() 27 | for i in range(n_flows): 28 | self.flows.append( 29 | modules.ResidualCouplingLayer( 30 | channels, 31 | hidden_channels, 32 | kernel_size, 33 | dilation_rate, 34 | n_layers, 35 | gin_channels=gin_channels, 36 | mean_only=True, 37 | ) 38 | ) 39 | self.flows.append(modules.Flip()) 40 | 41 | def forward(self, x, x_mask, g=None, reverse=False): 42 | if not reverse: 43 | for flow in self.flows: 44 | x, _ = flow(x, x_mask, g=g, reverse=reverse) 45 | else: 46 | for flow in reversed(self.flows): 47 | x = flow(x, x_mask, g=g, reverse=reverse) 48 | return x 49 | -------------------------------------------------------------------------------- /src/so_vits_svc_fork/modules/losses.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def feature_loss(fmap_r, fmap_g): 5 | loss = 0 6 | for dr, dg in zip(fmap_r, fmap_g): 7 | for rl, gl in zip(dr, dg): 8 | rl = rl.float().detach() 9 | gl = gl.float() 10 | loss += torch.mean(torch.abs(rl - gl)) 11 | 12 | return loss * 2 13 | 14 | 15 | def discriminator_loss(disc_real_outputs, disc_generated_outputs): 16 | loss = 0 17 | r_losses = [] 18 | g_losses = [] 19 | for dr, dg in zip(disc_real_outputs, disc_generated_outputs): 20 | dr = dr.float() 21 | dg = dg.float() 22 | r_loss = torch.mean((1 - dr) ** 2) 23 | g_loss = torch.mean(dg**2) 24 | loss += r_loss + g_loss 25 | r_losses.append(r_loss.item()) 26 | g_losses.append(g_loss.item()) 27 | 28 | return loss, r_losses, g_losses 29 | 30 | 31 | def generator_loss(disc_outputs): 32 | loss = 0 33 | gen_losses = [] 34 | for dg in disc_outputs: 35 | dg = dg.float() 36 | l = torch.mean((1 - dg) ** 2) 37 | gen_losses.append(l) 38 | loss += l 39 | 40 | return loss, gen_losses 41 | 42 | 43 | def kl_loss(z_p, logs_q, m_p, logs_p, z_mask): 44 | """ 45 | z_p, logs_q: [b, h, t_t] 46 | m_p, logs_p: [b, h, t_t] 47 | """ 48 | z_p = z_p.float() 49 | logs_q = logs_q.float() 50 | m_p = m_p.float() 51 | logs_p = logs_p.float() 52 | z_mask = z_mask.float() 53 | # print(logs_p) 54 | kl = logs_p - logs_q - 0.5 55 | kl += 0.5 * ((z_p - m_p) ** 2) * torch.exp(-2.0 * logs_p) 56 | kl = torch.sum(kl * z_mask) 57 | l = kl / torch.sum(z_mask) 58 | return l 59 | -------------------------------------------------------------------------------- /src/so_vits_svc_fork/modules/mel_processing.py: -------------------------------------------------------------------------------- 1 | """from logging import getLogger 2 | 3 | import torch 4 | import torch.utils.data 5 | import torchaudio 6 | 7 | LOG = getLogger(__name__) 8 | 9 | 10 | from ..hparams import HParams 11 | 12 | 13 | def spectrogram_torch(audio: torch.Tensor, hps: HParams) -> torch.Tensor: 14 | return torchaudio.transforms.Spectrogram( 15 | n_fft=hps.data.filter_length, 16 | win_length=hps.data.win_length, 17 | hop_length=hps.data.hop_length, 18 | power=1.0, 19 | window_fn=torch.hann_window, 20 | normalized=False, 21 | ).to(audio.device)(audio) 22 | 23 | 24 | def spec_to_mel_torch(spec: torch.Tensor, hps: HParams) -> torch.Tensor: 25 | return torchaudio.transforms.MelScale( 26 | n_mels=hps.data.n_mel_channels, 27 | sample_rate=hps.data.sampling_rate, 28 | f_min=hps.data.mel_fmin, 29 | f_max=hps.data.mel_fmax, 30 | ).to(spec.device)(spec) 31 | 32 | 33 | def mel_spectrogram_torch(audio: 
torch.Tensor, hps: HParams) -> torch.Tensor: 34 | return torchaudio.transforms.MelSpectrogram( 35 | sample_rate=hps.data.sampling_rate, 36 | n_fft=hps.data.filter_length, 37 | n_mels=hps.data.n_mel_channels, 38 | win_length=hps.data.win_length, 39 | hop_length=hps.data.hop_length, 40 | f_min=hps.data.mel_fmin, 41 | f_max=hps.data.mel_fmax, 42 | power=1.0, 43 | window_fn=torch.hann_window, 44 | normalized=False, 45 | ).to(audio.device)(audio)""" 46 | 47 | from logging import getLogger 48 | 49 | import torch 50 | import torch.utils.data 51 | from librosa.filters import mel as librosa_mel_fn 52 | 53 | LOG = getLogger(__name__) 54 | 55 | MAX_WAV_VALUE = 32768.0 56 | 57 | 58 | def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): 59 | """ 60 | PARAMS 61 | ------ 62 | C: compression factor 63 | """ 64 | return torch.log(torch.clamp(x, min=clip_val) * C) 65 | 66 | 67 | def dynamic_range_decompression_torch(x, C=1): 68 | """ 69 | PARAMS 70 | ------ 71 | C: compression factor used to compress 72 | """ 73 | return torch.exp(x) / C 74 | 75 | 76 | def spectral_normalize_torch(magnitudes): 77 | output = dynamic_range_compression_torch(magnitudes) 78 | return output 79 | 80 | 81 | def spectral_de_normalize_torch(magnitudes): 82 | output = dynamic_range_decompression_torch(magnitudes) 83 | return output 84 | 85 | 86 | mel_basis = {} 87 | hann_window = {} 88 | 89 | 90 | def spectrogram_torch(y, hps, center=False): 91 | if torch.min(y) < -1.0: 92 | LOG.info(f"min value is {torch.min(y)}")  # f-string so the value is actually logged 93 | if torch.max(y) > 1.0: 94 | LOG.info(f"max value is {torch.max(y)}") 95 | n_fft = hps.data.filter_length 96 | hop_size = hps.data.hop_length 97 | win_size = hps.data.win_length 98 | global hann_window 99 | dtype_device = str(y.dtype) + "_" + str(y.device) 100 | wnsize_dtype_device = str(win_size) + "_" + dtype_device 101 | if wnsize_dtype_device not in hann_window: 102 | hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to( 103 | dtype=y.dtype, device=y.device 104 | ) 105 | 106 | y = torch.nn.functional.pad( 107 | y.unsqueeze(1), 108 | (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), 109 | mode="reflect", 110 | ) 111 | y = y.squeeze(1) 112 | 113 | spec = torch.stft( 114 | y, 115 | n_fft, 116 | hop_length=hop_size, 117 | win_length=win_size, 118 | window=hann_window[wnsize_dtype_device], 119 | center=center, 120 | pad_mode="reflect", 121 | normalized=False, 122 | onesided=True, 123 | return_complex=False, 124 | ) 125 | 126 | spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) 127 | return spec 128 | 129 | 130 | def spec_to_mel_torch(spec, hps): 131 | sampling_rate = hps.data.sampling_rate 132 | n_fft = hps.data.filter_length 133 | num_mels = hps.data.n_mel_channels 134 | fmin = hps.data.mel_fmin 135 | fmax = hps.data.mel_fmax 136 | global mel_basis 137 | dtype_device = str(spec.dtype) + "_" + str(spec.device) 138 | fmax_dtype_device = str(fmax) + "_" + dtype_device 139 | if fmax_dtype_device not in mel_basis: 140 | mel = librosa_mel_fn( 141 | sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax 142 | ) 143 | mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to( 144 | dtype=spec.dtype, device=spec.device 145 | ) 146 | spec = torch.matmul(mel_basis[fmax_dtype_device], spec) 147 | spec = spectral_normalize_torch(spec) 148 | return spec 149 | 150 | 151 | def mel_spectrogram_torch(y, hps, center=False): 152 | sampling_rate = hps.data.sampling_rate 153 | n_fft = hps.data.filter_length 154 | num_mels = hps.data.n_mel_channels 155 | fmin = hps.data.mel_fmin 156 | fmax =
hps.data.mel_fmax 157 | hop_size = hps.data.hop_length 158 | win_size = hps.data.win_length 159 | if torch.min(y) < -1.0: 160 | LOG.info(f"min value is {torch.min(y)}") 161 | if torch.max(y) > 1.0: 162 | LOG.info(f"max value is {torch.max(y)}") 163 | 164 | global mel_basis, hann_window 165 | dtype_device = str(y.dtype) + "_" + str(y.device) 166 | fmax_dtype_device = str(fmax) + "_" + dtype_device 167 | wnsize_dtype_device = str(win_size) + "_" + dtype_device 168 | if fmax_dtype_device not in mel_basis: 169 | mel = librosa_mel_fn( 170 | sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax 171 | ) 172 | mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to( 173 | dtype=y.dtype, device=y.device 174 | ) 175 | if wnsize_dtype_device not in hann_window: 176 | hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to( 177 | dtype=y.dtype, device=y.device 178 | ) 179 | 180 | y = torch.nn.functional.pad( 181 | y.unsqueeze(1), 182 | (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), 183 | mode="reflect", 184 | ) 185 | y = y.squeeze(1) 186 | 187 | spec = torch.stft( 188 | y, 189 | n_fft, 190 | hop_length=hop_size, 191 | win_length=win_size, 192 | window=hann_window[wnsize_dtype_device], 193 | center=center, 194 | pad_mode="reflect", 195 | normalized=False, 196 | onesided=True, 197 | return_complex=False, 198 | ) 199 | 200 | spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) 201 | 202 | spec = torch.matmul(mel_basis[fmax_dtype_device], spec) 203 | spec = spectral_normalize_torch(spec) 204 | 205 | return spec 206 | -------------------------------------------------------------------------------- /src/so_vits_svc_fork/modules/synthesizers.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | from logging import getLogger 3 | from typing import Any, Literal, Sequence 4 | 5 | import torch 6 | from torch import nn 7 | 8 | import so_vits_svc_fork.f0 9 | from so_vits_svc_fork.f0 import f0_to_coarse 10 | from so_vits_svc_fork.modules import commons as commons 11 | from so_vits_svc_fork.modules.decoders.f0 import F0Decoder 12 | from so_vits_svc_fork.modules.decoders.hifigan import NSFHifiGANGenerator 13 | from so_vits_svc_fork.modules.decoders.mb_istft import ( 14 | Multiband_iSTFT_Generator, 15 | Multistream_iSTFT_Generator, 16 | iSTFT_Generator, 17 | ) 18 | from so_vits_svc_fork.modules.encoders import Encoder, TextEncoder 19 | from so_vits_svc_fork.modules.flows import ResidualCouplingBlock 20 | 21 | LOG = getLogger(__name__) 22 | 23 | 24 | class SynthesizerTrn(nn.Module): 25 | """ 26 | Synthesizer for Training 27 | """ 28 | 29 | def __init__( 30 | self, 31 | spec_channels: int, 32 | segment_size: int, 33 | inter_channels: int, 34 | hidden_channels: int, 35 | filter_channels: int, 36 | n_heads: int, 37 | n_layers: int, 38 | kernel_size: int, 39 | p_dropout: int, 40 | resblock: str, 41 | resblock_kernel_sizes: Sequence[int], 42 | resblock_dilation_sizes: Sequence[Sequence[int]], 43 | upsample_rates: Sequence[int], 44 | upsample_initial_channel: int, 45 | upsample_kernel_sizes: Sequence[int], 46 | gin_channels: int, 47 | ssl_dim: int, 48 | n_speakers: int, 49 | sampling_rate: int = 44100, 50 | type_: Literal["hifi-gan", "istft", "ms-istft", "mb-istft"] = "hifi-gan", 51 | gen_istft_n_fft: int = 16, 52 | gen_istft_hop_size: int = 4, 53 | subbands: int = 4, 54 | **kwargs: Any, 55 | ): 56 | super().__init__() 57 | self.spec_channels = spec_channels 58 | self.inter_channels = inter_channels 59 | self.hidden_channels = 
hidden_channels 60 | self.filter_channels = filter_channels 61 | self.n_heads = n_heads 62 | self.n_layers = n_layers 63 | self.kernel_size = kernel_size 64 | self.p_dropout = p_dropout 65 | self.resblock = resblock 66 | self.resblock_kernel_sizes = resblock_kernel_sizes 67 | self.resblock_dilation_sizes = resblock_dilation_sizes 68 | self.upsample_rates = upsample_rates 69 | self.upsample_initial_channel = upsample_initial_channel 70 | self.upsample_kernel_sizes = upsample_kernel_sizes 71 | self.segment_size = segment_size 72 | self.gin_channels = gin_channels 73 | self.ssl_dim = ssl_dim 74 | self.n_speakers = n_speakers 75 | self.sampling_rate = sampling_rate 76 | self.type_ = type_ 77 | self.gen_istft_n_fft = gen_istft_n_fft 78 | self.gen_istft_hop_size = gen_istft_hop_size 79 | self.subbands = subbands 80 | if kwargs: 81 | warnings.warn(f"Unused arguments: {kwargs}") 82 | 83 | self.emb_g = nn.Embedding(n_speakers, gin_channels) 84 | 85 | if ssl_dim is None: 86 | self.pre = nn.LazyConv1d(hidden_channels, kernel_size=5, padding=2) 87 | else: 88 | self.pre = nn.Conv1d(ssl_dim, hidden_channels, kernel_size=5, padding=2) 89 | 90 | self.enc_p = TextEncoder( 91 | inter_channels, 92 | hidden_channels, 93 | filter_channels=filter_channels, 94 | n_heads=n_heads, 95 | n_layers=n_layers, 96 | kernel_size=kernel_size, 97 | p_dropout=p_dropout, 98 | ) 99 | 100 | LOG.info(f"Decoder type: {type_}") 101 | if type_ == "hifi-gan": 102 | hps = { 103 | "sampling_rate": sampling_rate, 104 | "inter_channels": inter_channels, 105 | "resblock": resblock, 106 | "resblock_kernel_sizes": resblock_kernel_sizes, 107 | "resblock_dilation_sizes": resblock_dilation_sizes, 108 | "upsample_rates": upsample_rates, 109 | "upsample_initial_channel": upsample_initial_channel, 110 | "upsample_kernel_sizes": upsample_kernel_sizes, 111 | "gin_channels": gin_channels, 112 | } 113 | self.dec = NSFHifiGANGenerator(h=hps) 114 | self.mb = False 115 | else: 116 | hps = { 117 | "initial_channel": inter_channels, 118 | "resblock": resblock, 119 | "resblock_kernel_sizes": resblock_kernel_sizes, 120 | "resblock_dilation_sizes": resblock_dilation_sizes, 121 | "upsample_rates": upsample_rates, 122 | "upsample_initial_channel": upsample_initial_channel, 123 | "upsample_kernel_sizes": upsample_kernel_sizes, 124 | "gin_channels": gin_channels, 125 | "gen_istft_n_fft": gen_istft_n_fft, 126 | "gen_istft_hop_size": gen_istft_hop_size, 127 | "subbands": subbands, 128 | } 129 | 130 | # gen_istft_n_fft, gen_istft_hop_size, subbands 131 | if type_ == "istft": 132 | del hps["subbands"] 133 | self.dec = iSTFT_Generator(**hps) 134 | elif type_ == "ms-istft": 135 | self.dec = Multistream_iSTFT_Generator(**hps) 136 | elif type_ == "mb-istft": 137 | self.dec = Multiband_iSTFT_Generator(**hps) 138 | else: 139 | raise ValueError(f"Unknown type: {type_}") 140 | self.mb = True 141 | 142 | self.enc_q = Encoder( 143 | spec_channels, 144 | inter_channels, 145 | hidden_channels, 146 | 5, 147 | 1, 148 | 16, 149 | gin_channels=gin_channels, 150 | ) 151 | self.flow = ResidualCouplingBlock( 152 | inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels 153 | ) 154 | self.f0_decoder = F0Decoder( 155 | 1, 156 | hidden_channels, 157 | filter_channels, 158 | n_heads, 159 | n_layers, 160 | kernel_size, 161 | p_dropout, 162 | spk_channels=gin_channels, 163 | ) 164 | self.emb_uv = nn.Embedding(2, hidden_channels) 165 | 166 | def forward(self, c, f0, uv, spec, g=None, c_lengths=None, spec_lengths=None): 167 | g = self.emb_g(g).transpose(1, 2) 168 | # ssl prenet 
169 | x_mask = torch.unsqueeze(commons.sequence_mask(c_lengths, c.size(2)), 1).to( 170 | c.dtype 171 | ) 172 | x = self.pre(c) * x_mask + self.emb_uv(uv.long()).transpose(1, 2) 173 | 174 | # f0 predict 175 | lf0 = 2595.0 * torch.log10(1.0 + f0.unsqueeze(1) / 700.0) / 500 176 | norm_lf0 = so_vits_svc_fork.f0.normalize_f0(lf0, x_mask, uv) 177 | pred_lf0 = self.f0_decoder(x, norm_lf0, x_mask, spk_emb=g) 178 | 179 | # encoder 180 | z_ptemp, m_p, logs_p, _ = self.enc_p(x, x_mask, f0=f0_to_coarse(f0)) 181 | z, m_q, logs_q, spec_mask = self.enc_q(spec, spec_lengths, g=g) 182 | 183 | # flow 184 | z_p = self.flow(z, spec_mask, g=g) 185 | z_slice, pitch_slice, ids_slice = commons.rand_slice_segments_with_pitch( 186 | z, f0, spec_lengths, self.segment_size 187 | ) 188 | 189 | # MB-iSTFT-VITS 190 | if self.mb: 191 | o, o_mb = self.dec(z_slice, g=g) 192 | # HiFi-GAN 193 | else: 194 | o = self.dec(z_slice, g=g, f0=pitch_slice) 195 | o_mb = None 196 | return ( 197 | o, 198 | o_mb, 199 | ids_slice, 200 | spec_mask, 201 | (z, z_p, m_p, logs_p, m_q, logs_q), 202 | pred_lf0, 203 | norm_lf0, 204 | lf0, 205 | ) 206 | 207 | def infer(self, c, f0, uv, g=None, noice_scale=0.35, predict_f0=False): 208 | c_lengths = (torch.ones(c.size(0)) * c.size(-1)).to(c.device) 209 | g = self.emb_g(g).transpose(1, 2) 210 | x_mask = torch.unsqueeze(commons.sequence_mask(c_lengths, c.size(2)), 1).to( 211 | c.dtype 212 | ) 213 | x = self.pre(c) * x_mask + self.emb_uv(uv.long()).transpose(1, 2) 214 | 215 | if predict_f0: 216 | lf0 = 2595.0 * torch.log10(1.0 + f0.unsqueeze(1) / 700.0) / 500 217 | norm_lf0 = so_vits_svc_fork.f0.normalize_f0( 218 | lf0, x_mask, uv, random_scale=False 219 | ) 220 | pred_lf0 = self.f0_decoder(x, norm_lf0, x_mask, spk_emb=g) 221 | f0 = (700 * (torch.pow(10, pred_lf0 * 500 / 2595) - 1)).squeeze(1) 222 | 223 | z_p, m_p, logs_p, c_mask = self.enc_p( 224 | x, x_mask, f0=f0_to_coarse(f0), noice_scale=noice_scale 225 | ) 226 | z = self.flow(z_p, c_mask, g=g, reverse=True) 227 | 228 | # MB-iSTFT-VITS 229 | if self.mb: 230 | o, o_mb = self.dec(z * c_mask, g=g) 231 | else: 232 | o = self.dec(z * c_mask, g=g, f0=f0) 233 | return o 234 | -------------------------------------------------------------------------------- /src/so_vits_svc_fork/preprocessing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/voicepaw/so-vits-svc-fork/0f015e32aada5cf7481f91bbe6758e574c9c5f39/src/so_vits_svc_fork/preprocessing/__init__.py -------------------------------------------------------------------------------- /src/so_vits_svc_fork/preprocessing/config_templates/quickvc.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "log_interval": 100, 4 | "eval_interval": 200, 5 | "seed": 1234, 6 | "epochs": 10000, 7 | "learning_rate": 0.0001, 8 | "betas": [0.8, 0.99], 9 | "eps": 1e-9, 10 | "batch_size": 16, 11 | "fp16_run": false, 12 | "bf16_run": false, 13 | "lr_decay": 0.999875, 14 | "segment_size": 10240, 15 | "init_lr_ratio": 1, 16 | "warmup_epochs": 0, 17 | "c_mel": 45, 18 | "c_kl": 1.0, 19 | "use_sr": true, 20 | "max_speclen": 512, 21 | "port": "8001", 22 | "keep_ckpts": 3, 23 | "fft_sizes": [768, 1366, 342], 24 | "hop_sizes": [60, 120, 20], 25 | "win_lengths": [300, 600, 120], 26 | "window": "hann_window", 27 | "num_workers": 4, 28 | "log_version": 0, 29 | "ckpt_name_by_step": false, 30 | "accumulate_grad_batches": 1 31 | }, 32 | "data": { 33 | "training_files": "filelists/44k/train.txt", 34 | 
"validation_files": "filelists/44k/val.txt", 35 | "max_wav_value": 32768.0, 36 | "sampling_rate": 44100, 37 | "filter_length": 2048, 38 | "hop_length": 512, 39 | "win_length": 2048, 40 | "n_mel_channels": 80, 41 | "mel_fmin": 0.0, 42 | "mel_fmax": 22050, 43 | "contentvec_final_proj": false 44 | }, 45 | "model": { 46 | "inter_channels": 192, 47 | "hidden_channels": 192, 48 | "filter_channels": 768, 49 | "n_heads": 2, 50 | "n_layers": 6, 51 | "kernel_size": 3, 52 | "p_dropout": 0.1, 53 | "resblock": "1", 54 | "resblock_kernel_sizes": [3, 7, 11], 55 | "resblock_dilation_sizes": [ 56 | [1, 3, 5], 57 | [1, 3, 5], 58 | [1, 3, 5] 59 | ], 60 | "upsample_rates": [8, 4], 61 | "upsample_initial_channel": 512, 62 | "upsample_kernel_sizes": [32, 16], 63 | "n_layers_q": 3, 64 | "use_spectral_norm": false, 65 | "gin_channels": 256, 66 | "ssl_dim": 768, 67 | "n_speakers": 200, 68 | "type_": "ms-istft", 69 | "gen_istft_n_fft": 16, 70 | "gen_istft_hop_size": 4, 71 | "subbands": 4, 72 | "pretrained": { 73 | "D_0.pth": "https://huggingface.co/datasets/ms903/sovits4.0-768vec-layer12/resolve/main/sovits_768l12_pre_large_320k/clean_D_320000.pth", 74 | "G_0.pth": "https://huggingface.co/datasets/ms903/sovits4.0-768vec-layer12/resolve/main/sovits_768l12_pre_large_320k/clean_G_320000.pth" 75 | } 76 | }, 77 | "spk": {} 78 | } 79 | -------------------------------------------------------------------------------- /src/so_vits_svc_fork/preprocessing/config_templates/so-vits-svc-4.0v1-legacy.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "log_interval": 200, 4 | "eval_interval": 800, 5 | "seed": 1234, 6 | "epochs": 10000, 7 | "learning_rate": 0.0001, 8 | "betas": [0.8, 0.99], 9 | "eps": 1e-9, 10 | "batch_size": 16, 11 | "fp16_run": false, 12 | "bf16_run": false, 13 | "lr_decay": 0.999875, 14 | "segment_size": 10240, 15 | "init_lr_ratio": 1, 16 | "warmup_epochs": 0, 17 | "c_mel": 45, 18 | "c_kl": 1.0, 19 | "use_sr": true, 20 | "max_speclen": 512, 21 | "port": "8001", 22 | "keep_ckpts": 3, 23 | "num_workers": 4, 24 | "log_version": 0, 25 | "ckpt_name_by_step": false, 26 | "accumulate_grad_batches": 1 27 | }, 28 | "data": { 29 | "training_files": "filelists/44k/train.txt", 30 | "validation_files": "filelists/44k/val.txt", 31 | "max_wav_value": 32768.0, 32 | "sampling_rate": 44100, 33 | "filter_length": 2048, 34 | "hop_length": 512, 35 | "win_length": 2048, 36 | "n_mel_channels": 80, 37 | "mel_fmin": 0.0, 38 | "mel_fmax": 22050 39 | }, 40 | "model": { 41 | "inter_channels": 192, 42 | "hidden_channels": 192, 43 | "filter_channels": 768, 44 | "n_heads": 2, 45 | "n_layers": 6, 46 | "kernel_size": 3, 47 | "p_dropout": 0.1, 48 | "resblock": "1", 49 | "resblock_kernel_sizes": [3, 7, 11], 50 | "resblock_dilation_sizes": [ 51 | [1, 3, 5], 52 | [1, 3, 5], 53 | [1, 3, 5] 54 | ], 55 | "upsample_rates": [8, 8, 2, 2, 2], 56 | "upsample_initial_channel": 512, 57 | "upsample_kernel_sizes": [16, 16, 4, 4, 4], 58 | "n_layers_q": 3, 59 | "use_spectral_norm": false, 60 | "gin_channels": 256, 61 | "ssl_dim": 256, 62 | "n_speakers": 200, 63 | "pretrained": { 64 | "D_0.pth": "https://huggingface.co/therealvul/so-vits-svc-4.0-init/resolve/main/D_0.pth", 65 | "G_0.pth": "https://huggingface.co/therealvul/so-vits-svc-4.0-init/resolve/main/G_0.pth" 66 | } 67 | }, 68 | "spk": {} 69 | } 70 | -------------------------------------------------------------------------------- /src/so_vits_svc_fork/preprocessing/config_templates/so-vits-svc-4.0v1.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "log_interval": 100, 4 | "eval_interval": 200, 5 | "seed": 1234, 6 | "epochs": 10000, 7 | "learning_rate": 0.0001, 8 | "betas": [0.8, 0.99], 9 | "eps": 1e-9, 10 | "batch_size": 16, 11 | "fp16_run": false, 12 | "bf16_run": false, 13 | "lr_decay": 0.999875, 14 | "segment_size": 10240, 15 | "init_lr_ratio": 1, 16 | "warmup_epochs": 0, 17 | "c_mel": 45, 18 | "c_kl": 1.0, 19 | "use_sr": true, 20 | "max_speclen": 512, 21 | "port": "8001", 22 | "keep_ckpts": 3, 23 | "num_workers": 4, 24 | "log_version": 0, 25 | "ckpt_name_by_step": false, 26 | "accumulate_grad_batches": 1 27 | }, 28 | "data": { 29 | "training_files": "filelists/44k/train.txt", 30 | "validation_files": "filelists/44k/val.txt", 31 | "max_wav_value": 32768.0, 32 | "sampling_rate": 44100, 33 | "filter_length": 2048, 34 | "hop_length": 512, 35 | "win_length": 2048, 36 | "n_mel_channels": 80, 37 | "mel_fmin": 0.0, 38 | "mel_fmax": 22050, 39 | "contentvec_final_proj": false 40 | }, 41 | "model": { 42 | "inter_channels": 192, 43 | "hidden_channels": 192, 44 | "filter_channels": 768, 45 | "n_heads": 2, 46 | "n_layers": 6, 47 | "kernel_size": 3, 48 | "p_dropout": 0.1, 49 | "resblock": "1", 50 | "resblock_kernel_sizes": [3, 7, 11], 51 | "resblock_dilation_sizes": [ 52 | [1, 3, 5], 53 | [1, 3, 5], 54 | [1, 3, 5] 55 | ], 56 | "upsample_rates": [8, 8, 2, 2, 2], 57 | "upsample_initial_channel": 512, 58 | "upsample_kernel_sizes": [16, 16, 4, 4, 4], 59 | "n_layers_q": 3, 60 | "use_spectral_norm": false, 61 | "gin_channels": 256, 62 | "ssl_dim": 768, 63 | "n_speakers": 200, 64 | "type_": "hifi-gan", 65 | "pretrained": { 66 | "D_0.pth": "https://huggingface.co/datasets/ms903/sovits4.0-768vec-layer12/resolve/main/sovits_768l12_pre_large_320k/clean_D_320000.pth", 67 | "G_0.pth": "https://huggingface.co/datasets/ms903/sovits4.0-768vec-layer12/resolve/main/sovits_768l12_pre_large_320k/clean_G_320000.pth" 68 | } 69 | }, 70 | "spk": {} 71 | } 72 | -------------------------------------------------------------------------------- /src/so_vits_svc_fork/preprocessing/preprocess_classify.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from logging import getLogger 4 | from pathlib import Path 5 | 6 | import keyboard 7 | import librosa 8 | import sounddevice as sd 9 | import soundfile as sf 10 | from rich.console import Console 11 | from tqdm.rich import tqdm 12 | 13 | LOG = getLogger(__name__) 14 | 15 | 16 | def preprocess_classify( 17 | input_dir: Path | str, output_dir: Path | str, create_new: bool = True 18 | ) -> None: 19 | # paths 20 | input_dir_ = Path(input_dir) 21 | output_dir_ = Path(output_dir) 22 | speed = 1 23 | if not input_dir_.is_dir(): 24 | raise ValueError(f"{input_dir} is not a directory.") 25 | output_dir_.mkdir(exist_ok=True) 26 | 27 | console = Console() 28 | # get audio paths and folders 29 | audio_paths = list(input_dir_.glob("*.*")) 30 | last_folders = [x for x in output_dir_.glob("*") if x.is_dir()] 31 | console.print("Press ↑ or ↓ to change speed. 
Press any other key to classify.") 32 | console.print(f"Folders: {[x.name for x in last_folders]}") 33 | 34 | pbar_description = "" 35 | 36 | pbar = tqdm(audio_paths) 37 | for audio_path in pbar: 38 | # read file 39 | audio, sr = sf.read(audio_path) 40 | 41 | # update description 42 | duration = librosa.get_duration(y=audio, sr=sr) 43 | pbar_description = f"{duration:.1f} {pbar_description}" 44 | pbar.set_description(pbar_description) 45 | 46 | while True: 47 | # start playing 48 | sd.play(librosa.effects.time_stretch(audio, rate=speed), sr, loop=True) 49 | 50 | # wait for key press 51 | key = str(keyboard.read_key()) 52 | if key == "down": 53 | speed /= 1.1 54 | console.print(f"Speed: {speed:.2f}") 55 | elif key == "up": 56 | speed *= 1.1 57 | console.print(f"Speed: {speed:.2f}") 58 | else: 59 | break 60 | 61 | # stop playing 62 | sd.stop() 63 | 64 | # print if folder changed 65 | folders = [x for x in output_dir_.glob("*") if x.is_dir()] 66 | if folders != last_folders: 67 | console.print(f"Folders updated: {[x.name for x in folders]}") 68 | last_folders = folders 69 | 70 | # get folder 71 | folder_candidates = [x for x in folders if x.name.startswith(key)] 72 | if len(folder_candidates) == 0: 73 | if create_new: 74 | folder = output_dir_ / key 75 | else: 76 | console.print(f"No folder starts with {key}.") 77 | continue 78 | else: 79 | if len(folder_candidates) > 1: 80 | LOG.warning( 81 | f"Multiple folders ({[x.name for x in folder_candidates]}) start with {key}. " 82 | f"Using first one ({folder_candidates[0].name})." 83 | ) 84 | folder = folder_candidates[0] 85 | folder.mkdir(exist_ok=True) 86 | 87 | # move file 88 | new_path = folder / audio_path.name 89 | audio_path.rename(new_path) 90 | 91 | # update description 92 | pbar_description = f"Last: {audio_path.name} -> {folder.name}" 93 | 94 | # yield result 95 | # yield audio_path, key, folder, new_path 96 | -------------------------------------------------------------------------------- /src/so_vits_svc_fork/preprocessing/preprocess_flist_config.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import json 4 | import os 5 | from copy import deepcopy 6 | from logging import getLogger 7 | from pathlib import Path 8 | 9 | import numpy as np 10 | from librosa import get_duration 11 | from tqdm import tqdm 12 | 13 | LOG = getLogger(__name__) 14 | CONFIG_TEMPLATE_DIR = Path(__file__).parent / "config_templates" 15 | 16 | 17 | def preprocess_config( 18 | input_dir: Path | str, 19 | train_list_path: Path | str, 20 | val_list_path: Path | str, 21 | test_list_path: Path | str, 22 | config_path: Path | str, 23 | config_name: str, 24 | ): 25 | input_dir = Path(input_dir) 26 | train_list_path = Path(train_list_path) 27 | val_list_path = Path(val_list_path) 28 | test_list_path = Path(test_list_path) 29 | config_path = Path(config_path) 30 | train = [] 31 | val = [] 32 | test = [] 33 | spk_dict = {} 34 | spk_id = 0 35 | random = np.random.RandomState(1234) 36 | for speaker in os.listdir(input_dir): 37 | spk_dict[speaker] = spk_id 38 | spk_id += 1 39 | paths = [] 40 | for path in tqdm(list((input_dir / speaker).rglob("*.wav"))): 41 | if get_duration(filename=path) < 0.3: 42 | LOG.warning(f"skip {path} because it is too short.") 43 | continue 44 | paths.append(path) 45 | random.shuffle(paths) 46 | if len(paths) <= 4: 47 | raise ValueError( 48 | f"too few files in {input_dir / speaker} (expected at least 5)." 
49 | ) 50 | train += paths[2:-2] 51 | val += paths[:2] 52 | test += paths[-2:] 53 | 54 | LOG.info(f"Writing {train_list_path}") 55 | train_list_path.parent.mkdir(parents=True, exist_ok=True) 56 | train_list_path.write_text( 57 | "\n".join([x.as_posix() for x in train]), encoding="utf-8" 58 | ) 59 | 60 | LOG.info(f"Writing {val_list_path}") 61 | val_list_path.parent.mkdir(parents=True, exist_ok=True) 62 | val_list_path.write_text("\n".join([x.as_posix() for x in val]), encoding="utf-8") 63 | 64 | LOG.info(f"Writing {test_list_path}") 65 | test_list_path.parent.mkdir(parents=True, exist_ok=True) 66 | test_list_path.write_text("\n".join([x.as_posix() for x in test]), encoding="utf-8") 67 | 68 | config = deepcopy( 69 | json.loads( 70 | ( 71 | CONFIG_TEMPLATE_DIR 72 | / ( 73 | config_name 74 | if config_name.endswith(".json") 75 | else config_name + ".json" 76 | ) 77 | ).read_text(encoding="utf-8") 78 | ) 79 | ) 80 | config["spk"] = spk_dict 81 | config["data"]["training_files"] = train_list_path.as_posix() 82 | config["data"]["validation_files"] = val_list_path.as_posix() 83 | LOG.info(f"Writing {config_path}") 84 | config_path.parent.mkdir(parents=True, exist_ok=True) 85 | with config_path.open("w", encoding="utf-8") as f: 86 | json.dump(config, f, indent=2) 87 | -------------------------------------------------------------------------------- /src/so_vits_svc_fork/preprocessing/preprocess_hubert_f0.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from logging import getLogger 4 | from pathlib import Path 5 | from random import shuffle 6 | from typing import Iterable, Literal 7 | 8 | import librosa 9 | import numpy as np 10 | import torch 11 | import torchaudio 12 | from joblib import Parallel, cpu_count, delayed 13 | from tqdm import tqdm 14 | from transformers import HubertModel 15 | 16 | import so_vits_svc_fork.f0 17 | from so_vits_svc_fork import utils 18 | 19 | from ..hparams import HParams 20 | from ..modules.mel_processing import spec_to_mel_torch, spectrogram_torch 21 | from ..utils import get_optimal_device, get_total_gpu_memory 22 | from .preprocess_utils import check_hubert_min_duration 23 | 24 | LOG = getLogger(__name__) 25 | HUBERT_MEMORY = 2900 26 | HUBERT_MEMORY_CREPE = 3900 27 | 28 | 29 | def _process_one( 30 | *, 31 | filepath: Path, 32 | content_model: HubertModel, 33 | device: torch.device | str = get_optimal_device(), 34 | f0_method: Literal["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"] = "dio", 35 | force_rebuild: bool = False, 36 | hps: HParams, 37 | ): 38 | audio, sr = librosa.load(filepath, sr=hps.data.sampling_rate, mono=True) 39 | 40 | if not check_hubert_min_duration(audio, sr): 41 | LOG.info(f"Skip {filepath} because it is too short.") 42 | return 43 | 44 | data_path = filepath.parent / (filepath.name + ".data.pt") 45 | if data_path.exists() and not force_rebuild: 46 | return 47 | 48 | # Compute f0 49 | f0 = so_vits_svc_fork.f0.compute_f0( 50 | audio, sampling_rate=sr, hop_length=hps.data.hop_length, method=f0_method 51 | ) 52 | f0, uv = so_vits_svc_fork.f0.interpolate_f0(f0) 53 | f0 = torch.from_numpy(f0).float() 54 | uv = torch.from_numpy(uv).float() 55 | 56 | # Compute HuBERT content 57 | audio = torch.from_numpy(audio).float().to(device) 58 | c = utils.get_content( 59 | content_model, 60 | audio, 61 | device, 62 | sr=sr, 63 | legacy_final_proj=hps.data.get("contentvec_final_proj", True), 64 | ) 65 | c = utils.repeat_expand_2d(c.squeeze(0), f0.shape[0]) 66 | 
torch.cuda.empty_cache() 67 | 68 | # Compute spectrogram 69 | audio, sr = torchaudio.load(filepath) 70 | spec = spectrogram_torch(audio, hps).squeeze(0) 71 | mel_spec = spec_to_mel_torch(spec, hps) 72 | torch.cuda.empty_cache() 73 | 74 | # fix lengths 75 | lmin = min(spec.shape[1], mel_spec.shape[1], f0.shape[0], uv.shape[0], c.shape[1]) 76 | spec, mel_spec, f0, uv, c = ( 77 | spec[:, :lmin], 78 | mel_spec[:, :lmin], 79 | f0[:lmin], 80 | uv[:lmin], 81 | c[:, :lmin], 82 | ) 83 | 84 | # get speaker id 85 | spk_name = filepath.parent.name 86 | spk = hps.spk.__dict__[spk_name] 87 | spk = torch.tensor(spk).long() 88 | assert ( 89 | spec.shape[1] == mel_spec.shape[1] == f0.shape[0] == uv.shape[0] == c.shape[1] 90 | ), (spec.shape, mel_spec.shape, f0.shape, uv.shape, c.shape) 91 | data = { 92 | "spec": spec, 93 | "mel_spec": mel_spec, 94 | "f0": f0, 95 | "uv": uv, 96 | "content": c, 97 | "audio": audio, 98 | "spk": spk, 99 | } 100 | data = {k: v.cpu() for k, v in data.items()} 101 | with data_path.open("wb") as f: 102 | torch.save(data, f) 103 | 104 | 105 | def _process_batch(filepaths: Iterable[Path], pbar_position: int, **kwargs): 106 | hps = kwargs["hps"] 107 | content_model = utils.get_hubert_model( 108 | get_optimal_device(), hps.data.get("contentvec_final_proj", True) 109 | ) 110 | 111 | for filepath in tqdm(filepaths, position=pbar_position): 112 | _process_one( 113 | content_model=content_model, 114 | filepath=filepath, 115 | **kwargs, 116 | ) 117 | 118 | 119 | def preprocess_hubert_f0( 120 | input_dir: Path | str, 121 | config_path: Path | str, 122 | n_jobs: int | None = None, 123 | f0_method: Literal["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"] = "dio", 124 | force_rebuild: bool = False, 125 | ): 126 | input_dir = Path(input_dir) 127 | config_path = Path(config_path) 128 | hps = utils.get_hparams(config_path) 129 | if n_jobs is None: 130 | # cap n_jobs at cpu_count() to avoid SIGKILL 131 | memory = get_total_gpu_memory("total") 132 | n_jobs = min( 133 | max( 134 | ( 135 | memory 136 | // (HUBERT_MEMORY_CREPE if f0_method == "crepe" else HUBERT_MEMORY) 137 | if memory is not None 138 | else 1 139 | ), 140 | 1, 141 | ), 142 | cpu_count(), 143 | ) 144 | LOG.info(f"n_jobs automatically set to {n_jobs}, memory: {memory} MiB") 145 | 146 | filepaths = list(input_dir.rglob("*.wav")) 147 | n_jobs = min(len(filepaths) // 16 + 1, n_jobs) 148 | shuffle(filepaths) 149 | filepath_chunks = np.array_split(filepaths, n_jobs) 150 | Parallel(n_jobs=n_jobs)( 151 | delayed(_process_batch)( 152 | filepaths=chunk, 153 | pbar_position=pbar_position, 154 | f0_method=f0_method, 155 | force_rebuild=force_rebuild, 156 | hps=hps, 157 | ) 158 | for (pbar_position, chunk) in enumerate(filepath_chunks) 159 | ) 160 | -------------------------------------------------------------------------------- /src/so_vits_svc_fork/preprocessing/preprocess_resample.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import warnings 4 | from logging import getLogger 5 | from pathlib import Path 6 | from typing import Iterable 7 | 8 | import librosa 9 | import soundfile 10 | from joblib import Parallel, delayed 11 | from tqdm_joblib import tqdm_joblib 12 | 13 | from .preprocess_utils import check_hubert_min_duration 14 | 15 | LOG = getLogger(__name__) 16 | 17 | # input_dir and output_dir already exist. 18 | # Convert the audio files in input_dir into audio files in output_dir, 19 | # without changing the folder structure. Use joblib to parallelize. 
20 | # Converting audio files includes: 21 | # - resampling to the specified sampling rate 22 | # - trimming silence 23 | # - adjusting volume in a smart way 24 | # - saving as a 16-bit wav file 25 | 26 | 27 | def _get_unique_filename(path: Path, existing_paths: Iterable[Path]) -> Path: 28 | """Return a unique path by appending a number to the original path.""" 29 | if path not in existing_paths: 30 | return path 31 | i = 1 32 | while True: 33 | new_path = path.parent / f"{path.stem}_{i}{path.suffix}" 34 | if new_path not in existing_paths: 35 | return new_path 36 | i += 1 37 | 38 | 39 | def is_relative_to(path: Path, *other): 40 | """Return True if the path is relative to another path or False. 41 | Python 3.9+ has Path.is_relative_to() method, but we need to support Python 3.8. 42 | """ 43 | try: 44 | path.relative_to(*other) 45 | return True 46 | except ValueError: 47 | return False 48 | 49 | 50 | def _preprocess_one( 51 | input_path: Path, 52 | output_path: Path, 53 | sr: int, 54 | *, 55 | top_db: int, 56 | frame_seconds: float, 57 | hop_seconds: float, 58 | ) -> None: 59 | """Preprocess one audio file.""" 60 | 61 | try: 62 | audio, sr = librosa.load(input_path, sr=sr, mono=True) 63 | 64 | # Audioread is the last backend it will attempt, so this is the exception thrown on failure 65 | except Exception as e: 66 | # Failure due to attempting to load a file that is not audio, so return early 67 | LOG.warning(f"Failed to load {input_path} due to {e}") 68 | return 69 | 70 | if not check_hubert_min_duration(audio, sr): 71 | LOG.info(f"Skip {input_path} because it is too short.") 72 | return 73 | 74 | # Adjust volume 75 | audio /= max(audio.max(), -audio.min()) 76 | 77 | # Trim silence 78 | audio, _ = librosa.effects.trim( 79 | audio, 80 | top_db=top_db, 81 | frame_length=int(frame_seconds * sr), 82 | hop_length=int(hop_seconds * sr), 83 | ) 84 | 85 | if not check_hubert_min_duration(audio, sr): 86 | LOG.info(f"Skip {input_path} because it is too short.") 87 | return 88 | 89 | soundfile.write(output_path, audio, samplerate=sr, subtype="PCM_16") 90 | 91 | 92 | def preprocess_resample( 93 | input_dir: Path | str, 94 | output_dir: Path | str, 95 | sampling_rate: int, 96 | n_jobs: int = -1, 97 | *, 98 | top_db: int = 30, 99 | frame_seconds: float = 0.1, 100 | hop_seconds: float = 0.05, 101 | ) -> None: 102 | """Preprocess audio files in input_dir and save them to output_dir.""" 103 | input_dir = Path(input_dir) 104 | output_dir = Path(output_dir) 105 | kept_in_paths = [] 106 | out_paths = [] 107 | in_paths = list(input_dir.rglob("*.*")) 108 | if not in_paths: 109 | raise ValueError(f"No audio files found in {input_dir}") 110 | for in_path in in_paths: 111 | in_path_relative = in_path.relative_to(input_dir) 112 | if not in_path.is_absolute() and is_relative_to( 113 | in_path, Path("dataset_raw") / "44k" 114 | ): 115 | new_in_path_relative = in_path_relative.relative_to("44k") 116 | warnings.warn( 117 | f"Recommended folder structure has changed since v1.0.0. " 118 | "Please move your dataset directly under the dataset_raw folder. 
" 119 | f"Recoginzed {in_path_relative} as {new_in_path_relative}" 120 | ) 121 | in_path_relative = new_in_path_relative 122 | 123 | if len(in_path_relative.parts) < 2: 124 | continue 125 | speaker_name = in_path_relative.parts[0] 126 | file_name = in_path_relative.with_suffix(".wav").name 127 | out_path = output_dir / speaker_name / file_name 128 | out_path = _get_unique_filename(out_path, out_paths) 129 | out_path.parent.mkdir(parents=True, exist_ok=True) 130 | out_paths.append(out_path) 131 | 132 | in_and_out_paths = list(zip(in_paths, out_paths)) 133 | 134 | with tqdm_joblib(desc="Preprocessing", total=len(in_and_out_paths)): 135 | Parallel(n_jobs=n_jobs)( 136 | delayed(_preprocess_one)( 137 | *args, 138 | sr=sampling_rate, 139 | top_db=top_db, 140 | frame_seconds=frame_seconds, 141 | hop_seconds=hop_seconds, 142 | ) 143 | for args in in_and_out_paths 144 | ) 145 | -------------------------------------------------------------------------------- /src/so_vits_svc_fork/preprocessing/preprocess_speaker_diarization.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from collections import defaultdict 4 | from logging import getLogger 5 | from pathlib import Path 6 | 7 | import librosa 8 | import soundfile as sf 9 | import torch 10 | from joblib import Parallel, delayed 11 | from pyannote.audio import Pipeline 12 | from tqdm import tqdm 13 | from tqdm_joblib import tqdm_joblib 14 | 15 | LOG = getLogger(__name__) 16 | 17 | 18 | def _process_one( 19 | input_path: Path, 20 | output_dir: Path, 21 | sr: int, 22 | *, 23 | min_speakers: int = 1, 24 | max_speakers: int = 1, 25 | huggingface_token: str | None = None, 26 | ) -> None: 27 | try: 28 | audio, sr = librosa.load(input_path, sr=sr, mono=True) 29 | except Exception as e: 30 | LOG.warning(f"Failed to read {input_path}: {e}") 31 | return 32 | pipeline = Pipeline.from_pretrained( 33 | "pyannote/speaker-diarization-3.1", use_auth_token=huggingface_token 34 | ) 35 | if pipeline is None: 36 | raise ValueError("Failed to load pipeline") 37 | pipeline = pipeline.to(torch.device("cuda")) 38 | LOG.info(f"Processing {input_path}. This may take a while...") 39 | diarization = pipeline( 40 | input_path, min_speakers=min_speakers, max_speakers=max_speakers 41 | ) 42 | 43 | LOG.info(f"Found {len(diarization)} tracks, writing to {output_dir}") 44 | speaker_count = defaultdict(int) 45 | 46 | output_dir.mkdir(parents=True, exist_ok=True) 47 | for segment, track, speaker in tqdm( 48 | list(diarization.itertracks(yield_label=True)), desc=f"Writing {input_path}" 49 | ): 50 | if segment.end - segment.start < 1: 51 | continue 52 | speaker_count[speaker] += 1 53 | audio_cut = audio[int(segment.start * sr) : int(segment.end * sr)] 54 | sf.write( 55 | (output_dir / f"{speaker}_{speaker_count[speaker]:04d}.wav"), 56 | audio_cut, 57 | sr, 58 | ) 59 | 60 | LOG.info(f"Speaker count: {speaker_count}") 61 | 62 | 63 | def preprocess_speaker_diarization( 64 | input_dir: Path | str, 65 | output_dir: Path | str, 66 | sr: int, 67 | *, 68 | min_speakers: int = 1, 69 | max_speakers: int = 1, 70 | huggingface_token: str | None = None, 71 | n_jobs: int = -1, 72 | ) -> None: 73 | if huggingface_token is not None and not huggingface_token.startswith("hf_"): 74 | LOG.warning("Huggingface token probably should start with hf_") 75 | if not torch.cuda.is_available(): 76 | LOG.warning("CUDA is not available. 
This will be extremely slow.") 77 | input_dir = Path(input_dir) 78 | output_dir = Path(output_dir) 79 | input_dir.mkdir(parents=True, exist_ok=True) 80 | output_dir.mkdir(parents=True, exist_ok=True) 81 | input_paths = list(input_dir.rglob("*.*")) 82 | with tqdm_joblib(desc="Preprocessing speaker diarization", total=len(input_paths)): 83 | Parallel(n_jobs=n_jobs)( 84 | delayed(_process_one)( 85 | input_path, 86 | output_dir / input_path.relative_to(input_dir).parent / input_path.stem, 87 | sr, 88 | max_speakers=max_speakers, 89 | min_speakers=min_speakers, 90 | huggingface_token=huggingface_token, 91 | ) 92 | for input_path in input_paths 93 | ) 94 | -------------------------------------------------------------------------------- /src/so_vits_svc_fork/preprocessing/preprocess_split.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from logging import getLogger 4 | from pathlib import Path 5 | 6 | import librosa 7 | import soundfile as sf 8 | from joblib import Parallel, delayed 9 | from tqdm import tqdm 10 | from tqdm_joblib import tqdm_joblib 11 | 12 | LOG = getLogger(__name__) 13 | 14 | 15 | def _process_one( 16 | input_path: Path, 17 | output_dir: Path, 18 | sr: int, 19 | *, 20 | max_length: float = 10.0, 21 | top_db: int = 30, 22 | frame_seconds: float = 0.5, 23 | hop_seconds: float = 0.1, 24 | ): 25 | try: 26 | audio, sr = librosa.load(input_path, sr=sr, mono=True) 27 | except Exception as e: 28 | LOG.warning(f"Failed to read {input_path}: {e}") 29 | return 30 | intervals = librosa.effects.split( 31 | audio, 32 | top_db=top_db, 33 | frame_length=int(sr * frame_seconds), 34 | hop_length=int(sr * hop_seconds), 35 | ) 36 | output_dir.mkdir(parents=True, exist_ok=True) 37 | for start, end in tqdm(intervals, desc=f"Writing {input_path}"): 38 | for sub_start in range(start, end, int(sr * max_length)): 39 | sub_end = min(sub_start + int(sr * max_length), end) 40 | audio_cut = audio[sub_start:sub_end] 41 | sf.write( 42 | ( 43 | output_dir 44 | / f"{input_path.stem}_{sub_start / sr:.3f}_{sub_end / sr:.3f}.wav" 45 | ), 46 | audio_cut, 47 | sr, 48 | ) 49 | 50 | 51 | def preprocess_split( 52 | input_dir: Path | str, 53 | output_dir: Path | str, 54 | sr: int, 55 | *, 56 | max_length: float = 10.0, 57 | top_db: int = 30, 58 | frame_seconds: float = 0.5, 59 | hop_seconds: float = 0.1, 60 | n_jobs: int = -1, 61 | ): 62 | input_dir = Path(input_dir) 63 | output_dir = Path(output_dir) 64 | output_dir.mkdir(parents=True, exist_ok=True) 65 | input_paths = list(input_dir.rglob("*.*")) 66 | with tqdm_joblib(desc="Splitting", total=len(input_paths)): 67 | Parallel(n_jobs=n_jobs)( 68 | delayed(_process_one)( 69 | input_path, 70 | output_dir / input_path.relative_to(input_dir).parent, 71 | sr, 72 | max_length=max_length, 73 | top_db=top_db, 74 | frame_seconds=frame_seconds, 75 | hop_seconds=hop_seconds, 76 | ) 77 | for input_path in input_paths 78 | ) 79 | -------------------------------------------------------------------------------- /src/so_vits_svc_fork/preprocessing/preprocess_utils.py: -------------------------------------------------------------------------------- 1 | from numpy import ndarray 2 | 3 | 4 | def check_hubert_min_duration(audio: ndarray, sr: int) -> bool: 5 | return len(audio) / sr >= 0.3 6 | -------------------------------------------------------------------------------- /src/so_vits_svc_fork/py.typed: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/voicepaw/so-vits-svc-fork/0f015e32aada5cf7481f91bbe6758e574c9c5f39/src/so_vits_svc_fork/py.typed -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/voicepaw/so-vits-svc-fork/0f015e32aada5cf7481f91bbe6758e574c9c5f39/tests/__init__.py -------------------------------------------------------------------------------- /tests/dataset_raw/test/LJ001-0001.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/voicepaw/so-vits-svc-fork/0f015e32aada5cf7481f91bbe6758e574c9c5f39/tests/dataset_raw/test/LJ001-0001.wav -------------------------------------------------------------------------------- /tests/dataset_raw/test/LJ001-0002.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/voicepaw/so-vits-svc-fork/0f015e32aada5cf7481f91bbe6758e574c9c5f39/tests/dataset_raw/test/LJ001-0002.wav -------------------------------------------------------------------------------- /tests/dataset_raw/test/LJ001-0003.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/voicepaw/so-vits-svc-fork/0f015e32aada5cf7481f91bbe6758e574c9c5f39/tests/dataset_raw/test/LJ001-0003.wav -------------------------------------------------------------------------------- /tests/dataset_raw/test/LJ001-0004.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/voicepaw/so-vits-svc-fork/0f015e32aada5cf7481f91bbe6758e574c9c5f39/tests/dataset_raw/test/LJ001-0004.wav -------------------------------------------------------------------------------- /tests/dataset_raw/test/LJ001-0005.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/voicepaw/so-vits-svc-fork/0f015e32aada5cf7481f91bbe6758e574c9c5f39/tests/dataset_raw/test/LJ001-0005.wav -------------------------------------------------------------------------------- /tests/dataset_raw/test/LJ001-0006.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/voicepaw/so-vits-svc-fork/0f015e32aada5cf7481f91bbe6758e574c9c5f39/tests/dataset_raw/test/LJ001-0006.wav -------------------------------------------------------------------------------- /tests/dataset_raw/test/LJ001-0007.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/voicepaw/so-vits-svc-fork/0f015e32aada5cf7481f91bbe6758e574c9c5f39/tests/dataset_raw/test/LJ001-0007.wav -------------------------------------------------------------------------------- /tests/dataset_raw/test/LJ001-0008.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/voicepaw/so-vits-svc-fork/0f015e32aada5cf7481f91bbe6758e574c9c5f39/tests/dataset_raw/test/LJ001-0008.wav -------------------------------------------------------------------------------- /tests/dataset_raw/test/LJ001-0009.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/voicepaw/so-vits-svc-fork/0f015e32aada5cf7481f91bbe6758e574c9c5f39/tests/dataset_raw/test/LJ001-0009.wav 
-------------------------------------------------------------------------------- /tests/dataset_raw/test/LJ001-0010.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/voicepaw/so-vits-svc-fork/0f015e32aada5cf7481f91bbe6758e574c9c5f39/tests/dataset_raw/test/LJ001-0010.wav -------------------------------------------------------------------------------- /tests/dataset_raw/test/nested/LJ001-0001.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/voicepaw/so-vits-svc-fork/0f015e32aada5cf7481f91bbe6758e574c9c5f39/tests/dataset_raw/test/nested/LJ001-0001.wav -------------------------------------------------------------------------------- /tests/dataset_raw/test/nested/に.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/voicepaw/so-vits-svc-fork/0f015e32aada5cf7481f91bbe6758e574c9c5f39/tests/dataset_raw/test/nested/に.wav -------------------------------------------------------------------------------- /tests/test_main.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from pathlib import Path 4 | from unittest import SkipTest, TestCase 5 | 6 | IS_CI = os.environ.get("GITHUB_ACTIONS", False) 7 | IS_COLAB = os.getenv("COLAB_RELEASE_TAG", False) 8 | 9 | 10 | class TestMain(TestCase): 11 | def test_import(self): 12 | import so_vits_svc_fork.cluster.train_cluster # noqa 13 | import so_vits_svc_fork.inference.main # noqa 14 | 15 | # import so_vits_svc_fork.modules.onnx._export # noqa 16 | import so_vits_svc_fork.preprocessing.preprocess_flist_config # noqa 17 | import so_vits_svc_fork.preprocessing.preprocess_hubert_f0 # noqa 18 | import so_vits_svc_fork.preprocessing.preprocess_resample # noqa 19 | import so_vits_svc_fork.preprocessing.preprocess_split # noqa 20 | import so_vits_svc_fork.train # noqa 21 | 22 | def test_infer(self): 23 | if IS_CI: 24 | raise SkipTest("Skip inference test on CI") 25 | from so_vits_svc_fork.inference.main import infer # noqa 26 | 27 | # infer("tests/dataset_raw/34j/1.wav", "tests/configs/config.json", "tests/logs/44k") 28 | 29 | def test_preprocess(self): 30 | from so_vits_svc_fork.preprocessing.preprocess_resample import ( 31 | preprocess_resample, 32 | ) 33 | 34 | preprocess_resample( 35 | "tests/dataset_raw", "tests/dataset/44k", 44100, n_jobs=1 if IS_CI else -1 36 | ) 37 | 38 | from so_vits_svc_fork.preprocessing.preprocess_flist_config import ( 39 | preprocess_config, 40 | ) 41 | 42 | preprocess_config( 43 | "tests/dataset/44k", 44 | "tests/filelists/train.txt", 45 | "tests/filelists/val.txt", 46 | "tests/filelists/test.txt", 47 | "tests/configs/44k/config.json", 48 | "so-vits-svc-4.0v1", 49 | ) 50 | 51 | if IS_CI: 52 | raise SkipTest("Skip hubert and f0 test on CI") 53 | from so_vits_svc_fork.preprocessing.preprocess_hubert_f0 import ( 54 | preprocess_hubert_f0, 55 | ) 56 | 57 | preprocess_hubert_f0("tests/dataset/44k", "tests/configs/44k/config.json") 58 | 59 | def test_train(self): 60 | if not IS_COLAB: 61 | raise SkipTest("Skip training test on non-colab") 62 | # requires >10 GB of GPU memory, so it can only be tested on Colab 63 | from so_vits_svc_fork.train import train 64 | 65 | config_path = Path("tests/logs/44k/config.json") 66 | config_json = json.loads(config_path.read_text("utf-8")) 67 | config_json["train"]["epochs"] = 1 68 | config_path.write_text(json.dumps(config_json), "utf-8") 69 | 
train(config_path, "tests/logs/44k") 70 | --------------------------------------------------------------------------------
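
The preprocessing modules above form a pipeline, and tests/test_main.py exercises it in this order. Below is a minimal sketch of chaining the stages by hand; the directory names are illustrative (only the template name "so-vits-svc-4.0v1" and the 44100 Hz rate are taken from the tests and config templates):

from so_vits_svc_fork.preprocessing.preprocess_split import preprocess_split
from so_vits_svc_fork.preprocessing.preprocess_resample import preprocess_resample
from so_vits_svc_fork.preprocessing.preprocess_flist_config import preprocess_config
from so_vits_svc_fork.preprocessing.preprocess_hubert_f0 import preprocess_hubert_f0

# 1. Cut long recordings into <= 10 s chunks at silence boundaries.
preprocess_split("recordings", "dataset_raw", sr=44100)

# 2. Resample to 16-bit 44.1 kHz wav, trimming silence and normalizing volume;
#    expects one subdirectory per speaker under dataset_raw.
preprocess_resample("dataset_raw", "dataset/44k", sampling_rate=44100)

# 3. Shuffle each speaker's files, write train/val/test lists, and fill a
#    config template with the speaker-to-id map and the list paths.
preprocess_config(
    "dataset/44k",
    "filelists/44k/train.txt",
    "filelists/44k/val.txt",
    "filelists/44k/test.txt",
    "configs/44k/config.json",
    "so-vits-svc-4.0v1",
)

# 4. Cache f0, HuBERT content and spectrograms next to each wav as .data.pt.
preprocess_hubert_f0("dataset/44k", "configs/44k/config.json", f0_method="dio")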
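
When n_jobs is not given, preprocess_hubert_f0 sizes its worker pool from total GPU memory: it divides by a per-worker budget (HUBERT_MEMORY_CREPE = 3900 MiB for crepe, HUBERT_MEMORY = 2900 MiB otherwise), clamps to at least 1, caps at cpu_count(), and finally caps at roughly one worker per 16 files. A worked example, assuming a hypothetical 8 GiB (8192 MiB) GPU, 8 CPU cores, and 100 wav files:

HUBERT_MEMORY = 2900        # MiB per worker for dio/harvest/parselmouth
HUBERT_MEMORY_CREPE = 3900  # MiB per worker for crepe

memory, cores, n_files = 8192, 8, 100                       # assumed machine and dataset
n_jobs = min(max(memory // HUBERT_MEMORY_CREPE, 1), cores)  # 8192 // 3900 == 2
n_jobs = min(n_files // 16 + 1, n_jobs)                     # min(7, 2) == 2
assert n_jobs == 2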
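
The split rule in preprocess_config is easy to misread: after a seeded shuffle, the first two files of each speaker become validation, the last two become test, and everything in between becomes train, which is why a speaker with four or fewer files raises an error. A tiny illustration with ten dummy names:

import numpy as np

paths = [f"{i:02d}.wav" for i in range(10)]
np.random.RandomState(1234).shuffle(paths)  # same seeded RNG as the module
val, test, train = paths[:2], paths[-2:], paths[2:-2]
assert (len(val), len(test), len(train)) == (2, 2, 6)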
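
Each cached training example that _process_one in preprocess_hubert_f0 writes is a plain torch pickle stored next to its source wav as <name>.wav.data.pt. A sketch of inspecting one such file (the path is hypothetical):

import torch

data = torch.load("dataset/44k/speaker0/clip.wav.data.pt")
# Keys written by _process_one; every value is a CPU tensor,
# and "spk" is a 0-dim long tensor holding the speaker id.
for key in ("spec", "mel_spec", "f0", "uv", "content", "audio", "spk"):
    print(key, tuple(data[key].shape))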