├── .all-contributorsrc
├── .copier-answers.yml
├── .dockerignore
├── .editorconfig
├── .flake8
├── .github
│   ├── CODE_OF_CONDUCT.md
│   ├── FUNDING.yml
│   ├── ISSUE_TEMPLATE
│   │   ├── 1-bug_report.yml
│   │   ├── 2-feature-request.yml
│   │   └── config.yml
│   ├── PULL_REQUEST_TEMPLATE.md
│   ├── labels.toml
│   └── workflows
│       ├── ci.yml
│       ├── hacktoberfest.yml
│       ├── issue-manager.yml
│       ├── labels.yml
│       └── poetry-upgrade.yml
├── .gitignore
├── .gitpod.yml
├── .idea
│   ├── inspectionProfiles
│   │   ├── Project_Default.xml
│   │   └── profiles_settings.xml
│   ├── misc.xml
│   ├── modules.xml
│   ├── so-vits-svc-fork.iml
│   ├── vcs.xml
│   ├── watcherTasks.xml
│   └── workspace.xml
├── .pre-commit-config.yaml
├── .readthedocs.yml
├── CHANGELOG.md
├── CONTRIBUTING.md
├── Dockerfile
├── LICENSE
├── README.md
├── README_zh_CN.md
├── commitlint.config.js
├── docs
│   ├── Makefile
│   ├── _static
│   │   ├── .gitkeep
│   │   └── gui.png
│   ├── changelog.md
│   ├── conf.py
│   ├── contributing.md
│   ├── index.md
│   ├── installation.md
│   ├── make.bat
│   └── usage.md
├── easy-installation
│   ├── install-cn.bat
│   └── install.bat
├── notebooks
│   └── so-vits-svc-fork-4.0.ipynb
├── poetry.lock
├── pyproject.toml
├── renovate.json
├── setup.py
├── src
│   └── so_vits_svc_fork
│       ├── __init__.py
│       ├── __main__.py
│       ├── cluster
│       │   ├── __init__.py
│       │   └── train_cluster.py
│       ├── dataset.py
│       ├── default_gui_presets.json
│       ├── f0.py
│       ├── gui.py
│       ├── hparams.py
│       ├── inference
│       │   ├── __init__.py
│       │   ├── core.py
│       │   └── main.py
│       ├── logger.py
│       ├── modules
│       │   ├── __init__.py
│       │   ├── attentions.py
│       │   ├── commons.py
│       │   ├── decoders
│       │   │   ├── __init__.py
│       │   │   ├── f0.py
│       │   │   ├── hifigan
│       │   │   │   ├── __init__.py
│       │   │   │   ├── _models.py
│       │   │   │   └── _utils.py
│       │   │   └── mb_istft
│       │   │       ├── __init__.py
│       │   │       ├── _generators.py
│       │   │       ├── _loss.py
│       │   │       ├── _pqmf.py
│       │   │       ├── _stft.py
│       │   │       └── _stft_loss.py
│       │   ├── descriminators.py
│       │   ├── encoders.py
│       │   ├── flows.py
│       │   ├── losses.py
│       │   ├── mel_processing.py
│       │   ├── modules.py
│       │   └── synthesizers.py
│       ├── preprocessing
│       │   ├── __init__.py
│       │   ├── config_templates
│       │   │   ├── quickvc.json
│       │   │   ├── so-vits-svc-4.0v1-legacy.json
│       │   │   └── so-vits-svc-4.0v1.json
│       │   ├── preprocess_classify.py
│       │   ├── preprocess_flist_config.py
│       │   ├── preprocess_hubert_f0.py
│       │   ├── preprocess_resample.py
│       │   ├── preprocess_speaker_diarization.py
│       │   ├── preprocess_split.py
│       │   └── preprocess_utils.py
│       ├── py.typed
│       ├── train.py
│       └── utils.py
└── tests
    ├── __init__.py
    ├── dataset_raw
    │   └── test
    │       ├── LJ001-0001.wav
    │       ├── LJ001-0002.wav
    │       ├── LJ001-0003.wav
    │       ├── LJ001-0004.wav
    │       ├── LJ001-0005.wav
    │       ├── LJ001-0006.wav
    │       ├── LJ001-0007.wav
    │       ├── LJ001-0008.wav
    │       ├── LJ001-0009.wav
    │       ├── LJ001-0010.wav
    │       └── nested
    │           ├── LJ001-0001.wav
    │           └── に.wav
    └── test_main.py
/.all-contributorsrc:
--------------------------------------------------------------------------------
1 | {
2 | "projectName": "so-vits-svc-fork",
3 | "projectOwner": "voicepaw",
4 | "repoType": "github",
5 | "repoHost": "https://github.com",
6 | "files": ["README.md"],
7 | "imageSize": 80,
8 | "commit": true,
9 | "commitConvention": "angular",
10 | "contributors": [
11 | {
12 | "login": "34j",
13 | "name": "34j",
14 | "avatar_url": "https://avatars.githubusercontent.com/u/55338215?v=4",
15 | "profile": "https://github.com/34j",
16 | "contributions": [
17 | "code",
18 | "ideas",
19 | "doc",
20 | "example",
21 | "infra",
22 | "maintenance",
23 | "review",
24 | "test",
25 | "tutorial",
26 | "promotion",
27 | "bug"
28 | ]
29 | },
30 | {
31 | "login": "GarrettConway",
32 | "name": "GarrettConway",
33 | "avatar_url": "https://avatars.githubusercontent.com/u/22782004?v=4",
34 | "profile": "https://github.com/GarrettConway",
35 | "contributions": ["code", "bug", "doc", "review"]
36 | },
37 | {
38 | "login": "BlueAmulet",
39 | "name": "BlueAmulet",
40 | "avatar_url": "https://avatars.githubusercontent.com/u/43395286?v=4",
41 | "profile": "https://github.com/BlueAmulet",
42 | "contributions": ["ideas", "question", "code", "maintenance"]
43 | },
44 | {
45 | "login": "ThrowawayAccount01",
46 | "name": "ThrowawayAccount01",
47 | "avatar_url": "https://avatars.githubusercontent.com/u/125531852?v=4",
48 | "profile": "https://github.com/ThrowawayAccount01",
49 | "contributions": ["bug"]
50 | },
51 | {
52 | "login": "MashiroSA",
53 | "name": "緋",
54 | "avatar_url": "https://avatars.githubusercontent.com/u/40637516?v=4",
55 | "profile": "https://github.com/MashiroSA",
56 | "contributions": ["doc", "bug"]
57 | },
58 | {
59 | "login": "Lordmau5",
60 | "name": "Lordmau5",
61 | "avatar_url": "https://avatars.githubusercontent.com/u/1345036?v=4",
62 | "profile": "https://github.com/Lordmau5",
63 | "contributions": [
64 | "bug",
65 | "code",
66 | "ideas",
67 | "maintenance",
68 | "question",
69 | "userTesting"
70 | ]
71 | },
72 | {
73 | "login": "DL909",
74 | "name": "DL909",
75 | "avatar_url": "https://avatars.githubusercontent.com/u/71912115?v=4",
76 | "profile": "https://github.com/DL909",
77 | "contributions": ["bug"]
78 | },
79 | {
80 | "login": "Satisfy256",
81 | "name": "Satisfy256",
82 | "avatar_url": "https://avatars.githubusercontent.com/u/101394399?v=4",
83 | "profile": "https://github.com/Satisfy256",
84 | "contributions": ["bug"]
85 | },
86 | {
87 | "login": "pierluigizagaria",
88 | "name": "Pierluigi Zagaria",
89 | "avatar_url": "https://avatars.githubusercontent.com/u/57801386?v=4",
90 | "profile": "https://github.com/pierluigizagaria",
91 | "contributions": ["userTesting"]
92 | },
93 | {
94 | "login": "ruckusmattster",
95 | "name": "ruckusmattster",
96 | "avatar_url": "https://avatars.githubusercontent.com/u/77196088?v=4",
97 | "profile": "https://github.com/ruckusmattster",
98 | "contributions": ["bug"]
99 | },
100 | {
101 | "login": "Desuka-art",
102 | "name": "Desuka-art",
103 | "avatar_url": "https://avatars.githubusercontent.com/u/111822082?v=4",
104 | "profile": "https://github.com/Desuka-art",
105 | "contributions": ["bug"]
106 | },
107 | {
108 | "login": "heyfixit",
109 | "name": "heyfixit",
110 | "avatar_url": "https://avatars.githubusercontent.com/u/41658450?v=4",
111 | "profile": "https://github.com/heyfixit",
112 | "contributions": ["doc"]
113 | },
114 | {
115 | "login": "nerdyrodent",
116 | "name": "Nerdy Rodent",
117 | "avatar_url": "https://avatars.githubusercontent.com/u/74688049?v=4",
118 | "profile": "https://www.youtube.com/c/NerdyRodent",
119 | "contributions": ["video"]
120 | },
121 | {
122 | "login": "xieyumc",
123 | "name": "谢宇",
124 | "avatar_url": "https://avatars.githubusercontent.com/u/47858007?v=4",
125 | "profile": "https://github.com/xieyumc",
126 | "contributions": ["doc"]
127 | },
128 | {
129 | "login": "ColdCawfee",
130 | "name": "ColdCawfee",
131 | "avatar_url": "https://avatars.githubusercontent.com/u/79474598?v=4",
132 | "profile": "https://github.com/ColdCawfee",
133 | "contributions": ["bug"]
134 | },
135 | {
136 | "login": "sbersier",
137 | "name": "sbersier",
138 | "avatar_url": "https://avatars.githubusercontent.com/u/34165937?v=4",
139 | "profile": "https://github.com/sbersier",
140 | "contributions": ["ideas", "userTesting", "bug"]
141 | },
142 | {
143 | "login": "Meldoner",
144 | "name": "Meldoner",
145 | "avatar_url": "https://avatars.githubusercontent.com/u/43951115?v=4",
146 | "profile": "https://github.com/Meldoner",
147 | "contributions": ["bug", "ideas", "code"]
148 | },
149 | {
150 | "login": "mmodeusher",
151 | "name": "mmodeusher",
152 | "avatar_url": "https://avatars.githubusercontent.com/u/46575920?v=4",
153 | "profile": "https://github.com/mmodeusher",
154 | "contributions": ["bug"]
155 | },
156 | {
157 | "login": "AlonDan",
158 | "name": "AlonDan",
159 | "avatar_url": "https://avatars.githubusercontent.com/u/21152334?v=4",
160 | "profile": "https://github.com/AlonDan",
161 | "contributions": ["bug"]
162 | },
163 | {
164 | "login": "Likkkez",
165 | "name": "Likkkez",
166 | "avatar_url": "https://avatars.githubusercontent.com/u/44336181?v=4",
167 | "profile": "https://github.com/Likkkez",
168 | "contributions": ["bug"]
169 | },
170 | {
171 | "login": "DuctTapeGames",
172 | "name": "Duct Tape Games",
173 | "avatar_url": "https://avatars.githubusercontent.com/u/84365142?v=4",
174 | "profile": "https://github.com/DuctTapeGames",
175 | "contributions": ["bug"]
176 | },
177 | {
178 | "login": "hxl9654",
179 | "name": "Xianglong He",
180 | "avatar_url": "https://avatars.githubusercontent.com/u/6624983?v=4",
181 | "profile": "https://tec.hxlxz.com/",
182 | "contributions": ["bug"]
183 | },
184 | {
185 | "login": "75aosu",
186 | "name": "75aosu",
187 | "avatar_url": "https://avatars.githubusercontent.com/u/79185331?v=4",
188 | "profile": "https://github.com/75aosu",
189 | "contributions": ["bug"]
190 | },
191 | {
192 | "login": "tonyco82",
193 | "name": "tonyco82",
194 | "avatar_url": "https://avatars.githubusercontent.com/u/56610534?v=4",
195 | "profile": "https://github.com/tonyco82",
196 | "contributions": ["bug"]
197 | },
198 | {
199 | "login": "yxlllc",
200 | "name": "yxlllc",
201 | "avatar_url": "https://avatars.githubusercontent.com/u/33565655?v=4",
202 | "profile": "https://github.com/yxlllc",
203 | "contributions": ["ideas", "code"]
204 | },
205 | {
206 | "login": "outhipped",
207 | "name": "outhipped",
208 | "avatar_url": "https://avatars.githubusercontent.com/u/116147475?v=4",
209 | "profile": "https://github.com/outhipped",
210 | "contributions": ["bug"]
211 | },
212 | {
213 | "login": "escoolioinglesias",
214 | "name": "escoolioinglesias",
215 | "avatar_url": "https://avatars.githubusercontent.com/u/73505402?v=4",
216 | "profile": "https://github.com/escoolioinglesias",
217 | "contributions": ["bug", "userTesting", "video"]
218 | },
219 | {
220 | "login": "Blacksingh",
221 | "name": "Blacksingh",
222 | "avatar_url": "https://avatars.githubusercontent.com/u/130872856?v=4",
223 | "profile": "https://github.com/Blacksingh",
224 | "contributions": ["bug"]
225 | },
226 | {
227 | "login": "tybantarnusa",
228 | "name": "Mgs. M. Thoyib Antarnusa",
229 | "avatar_url": "https://avatars.githubusercontent.com/u/9532857?v=4",
230 | "profile": "http://tybantarnusa.com",
231 | "contributions": ["bug"]
232 | },
233 | {
234 | "login": "ZeroHackz",
235 | "name": "Exosfeer",
236 | "avatar_url": "https://avatars.githubusercontent.com/u/15729496?v=4",
237 | "profile": "https://github.com/ZeroHackz",
238 | "contributions": ["bug", "code"]
239 | },
240 | {
241 | "login": "guranon",
242 | "name": "guranon",
243 | "avatar_url": "https://avatars.githubusercontent.com/u/130421189?v=4",
244 | "profile": "https://github.com/guranon",
245 | "contributions": ["bug", "ideas", "code"]
246 | },
247 | {
248 | "login": "alexanderkoumis",
249 | "name": "Alexander Koumis",
250 | "avatar_url": "https://avatars.githubusercontent.com/u/5108856?v=4",
251 | "profile": "https://github.com/alexanderkoumis",
252 | "contributions": ["code"]
253 | },
254 | {
255 | "login": "acekagami",
256 | "name": "acekagami",
257 | "avatar_url": "https://avatars.githubusercontent.com/u/127201056?v=4",
258 | "profile": "https://github.com/acekagami",
259 | "contributions": ["translation"]
260 | },
261 | {
262 | "login": "Highupech",
263 | "name": "Highupech",
264 | "avatar_url": "https://avatars.githubusercontent.com/u/114140670?v=4",
265 | "profile": "https://github.com/Highupech",
266 | "contributions": ["bug"]
267 | },
268 | {
269 | "login": "Scorpi",
270 | "name": "Scorpi",
271 | "avatar_url": "https://avatars.githubusercontent.com/u/969654?v=4",
272 | "profile": "https://github.com/Scorpi",
273 | "contributions": ["code"]
274 | },
275 | {
276 | "login": "maximxlss",
277 | "name": "Maximxls",
278 | "avatar_url": "https://avatars.githubusercontent.com/u/29152154?v=4",
279 | "profile": "http://maximxlss.github.io",
280 | "contributions": ["code"]
281 | },
282 | {
283 | "login": "Star3Lord",
284 | "name": "Star3Lord",
285 | "avatar_url": "https://avatars.githubusercontent.com/u/57606931?v=4",
286 | "profile": "https://github.com/Star3Lord",
287 | "contributions": ["bug", "code"]
288 | },
289 | {
290 | "login": "Ph0rk0z",
291 | "name": "Forkoz",
292 | "avatar_url": "https://avatars.githubusercontent.com/u/59298527?v=4",
293 | "profile": "https://github.com/Ph0rk0z",
294 | "contributions": ["bug", "code"]
295 | },
296 | {
297 | "login": "Zerui18",
298 | "name": "Zerui Chen",
299 | "avatar_url": "https://avatars.githubusercontent.com/u/34794550?v=4",
300 | "profile": "https://github.com/Zerui18",
301 | "contributions": ["code", "ideas"]
302 | },
303 | {
304 | "login": "shenberg",
305 | "name": "Roee Shenberg",
306 | "avatar_url": "https://avatars.githubusercontent.com/u/653972?v=4",
307 | "profile": "https://www.meimadix.com",
308 | "contributions": ["userTesting", "ideas", "code"]
309 | },
310 | {
311 | "login": "ShinyJustyZ",
312 | "name": "Justas",
313 | "avatar_url": "https://avatars.githubusercontent.com/u/65282440?v=4",
314 | "profile": "https://github.com/ShinyJustyZ",
315 | "contributions": ["bug", "code"]
316 | },
317 | {
318 | "login": "Onako2",
319 | "name": "Onako2",
320 | "avatar_url": "https://avatars.githubusercontent.com/u/79749977?v=4",
321 | "profile": "https://onako2.github.io/",
322 | "contributions": ["doc"]
323 | },
324 | {
325 | "login": "4ll0w3v1l",
326 | "name": "4ll0w3v1l",
327 | "avatar_url": "https://avatars.githubusercontent.com/u/53517147?v=4",
328 | "profile": "https://github.com/4ll0w3v1l",
329 | "contributions": ["code"]
330 | },
331 | {
332 | "login": "SamuelSwartzberg",
333 | "name": "j5y0V6b",
334 | "avatar_url": "https://avatars.githubusercontent.com/u/16353439?v=4",
335 | "profile": "https://github.com/SamuelSwartzberg",
336 | "contributions": ["security"]
337 | },
338 | {
339 | "login": "marcellocirelli",
340 | "name": "marcellocirelli",
341 | "avatar_url": "https://avatars.githubusercontent.com/u/51972090?v=4",
342 | "profile": "https://github.com/marcellocirelli",
343 | "contributions": ["bug"]
344 | },
345 | {
346 | "login": "Priyanshu-hawk",
347 | "name": "Priyanshu Patel",
348 | "avatar_url": "https://avatars.githubusercontent.com/u/76026651?v=4",
349 | "profile": "https://github.com/Priyanshu-hawk",
350 | "contributions": ["code"]
351 | },
352 | {
353 | "login": "annagorshunova",
354 | "name": "Anna Gorshunova",
355 | "avatar_url": "https://avatars.githubusercontent.com/u/5199204?v=4",
356 | "profile": "https://github.com/annagorshunova",
357 | "contributions": ["bug", "code"]
358 | }
359 | ],
360 | "contributorsPerLine": 7,
361 | "skipCi": true,
362 | "commitType": "docs"
363 | }
364 |
--------------------------------------------------------------------------------
/.copier-answers.yml:
--------------------------------------------------------------------------------
1 | # Changes here will be overwritten by Copier
2 | _commit: d5acceb
3 | _src_path: gh:34j/pypackage-template-fork
4 | add_me_as_contributor: false
5 | copyright_year: '2023'
6 | documentation: true
7 | email: 34j.95a2p@simplelogin.com
8 | full_name: 34j
9 | github_username: 34j
10 | initial_commit: false
11 | open_source_license: MIT
12 | open_with_vscode: false
13 | package_name: so_vits_svc_fork
14 | project_name: SoftVC VITS Singing Voice Conversion Fork
15 | project_short_description: A fork of so-vits-svc.
16 | project_slug: so-vits-svc-fork
17 | run_poetry_install: true
18 | setup_github: false
19 | setup_pre_commit: false
20 | setup_venv: true
21 | venv_version: '3.10'
22 |
--------------------------------------------------------------------------------
/.dockerignore:
--------------------------------------------------------------------------------
1 | # Ignore everything
2 | *
3 |
--------------------------------------------------------------------------------
/.editorconfig:
--------------------------------------------------------------------------------
1 | # http://editorconfig.org
2 |
3 | root = true
4 |
5 | [*]
6 | indent_style = space
7 | indent_size = 4
8 | trim_trailing_whitespace = true
9 | insert_final_newline = true
10 | charset = utf-8
11 | end_of_line = lf
12 |
13 | [*.bat]
14 | indent_style = tab
15 | end_of_line = crlf
16 |
17 | [LICENSE]
18 | insert_final_newline = false
19 |
20 | [Makefile]
21 | indent_style = tab
22 |
--------------------------------------------------------------------------------
/.flake8:
--------------------------------------------------------------------------------
1 | [flake8]
2 | exclude = docs
3 | max-line-length = 88
4 | ignore = E203, E501, E741, E402, E712, W503, E731, E711, E226
5 |
--------------------------------------------------------------------------------
/.github/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/voicepaw/so-vits-svc-fork/0f015e32aada5cf7481f91bbe6758e574c9c5f39/.github/CODE_OF_CONDUCT.md
--------------------------------------------------------------------------------
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | github: ["34j"]
2 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/1-bug_report.yml:
--------------------------------------------------------------------------------
1 | name: Bug report
2 | description: Create a report to help us improve
3 | labels: [bug]
4 | body:
5 | - type: textarea
6 | id: description
7 | attributes:
8 | label: Describe the bug
9 | description: A clear and concise description of what the bug is.
10 | placeholder: Describe the bug
11 | validations:
12 | required: true
13 | - type: textarea
14 | id: reproduce
15 | attributes:
16 | label: To Reproduce
17 | description: Steps to reproduce the behavior.
18 | placeholder: To Reproduce
19 | validations:
20 | required: true
21 | - type: textarea
22 | id: context
23 | attributes:
24 | label: Additional context
25 | description: Add any other context about the problem here.
26 | placeholder: Additional context
27 | - type: input
28 | id: version
29 | attributes:
30 | label: Version
31 | description: Version of the project.
32 | placeholder: Version
33 | validations:
34 | required: true
35 | - type: input
36 | id: platform
37 | attributes:
38 | label: Platform
39 | description: Platform where the bug was found.
40 | placeholder: "Example: Windows 11 / macOS 12.0.1 / Ubuntu 20.04"
41 | validations:
42 | required: true
43 | - type: checkboxes
44 | id: terms
45 | attributes:
46 | label: Code of Conduct
47 | description: By submitting this issue, you agree to follow our
48 | [Code of Conduct](https://github.com/34j/so-vits-svc-fork/blob/main/CODE_OF_CONDUCT.md).
49 | options:
50 | - label: I agree to follow this project's Code of Conduct.
51 | required: true
52 | - type: checkboxes
53 | id: no-duplicate
54 | attributes:
55 | label: No Duplicate
56 | description: Please check [existing issues](https://github.com/34j/so-vits-svc-fork/issues) to avoid duplicates.
57 | options:
58 | - label: I have checked existing issues to avoid duplicates.
59 | required: true
60 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/2-feature-request.yml:
--------------------------------------------------------------------------------
1 | name: Feature request
2 | description: Suggest an idea for this project
3 | labels: [enhancement]
4 | body:
5 | - type: textarea
6 | id: description
7 | attributes:
8 | label: Is your feature request related to a problem? Please describe.
9 | description: A clear and concise description of what the problem is.
10 | value: I'm always frustrated when
11 | validations:
12 | required: true
13 | - type: textarea
14 | id: solution
15 | attributes:
16 | label: Describe alternatives you've considered
17 | description: A clear and concise description of any alternative solutions or features you've considered.
18 | placeholder: Describe alternatives you've considered
19 | validations:
20 | required: true
21 | - type: textarea
22 | id: context
23 | attributes:
24 | label: Additional context
25 | description: Add any other context or screenshots about the feature request here.
26 | placeholder: Additional context
27 | - type: checkboxes
28 | id: terms
29 | attributes:
30 | label: Code of Conduct
31 | description: By submitting this issue, you agree to follow our
32 | [Code of Conduct](https://github.com/34j/so-vits-svc-fork/blob/main/CODE_OF_CONDUCT.md).
33 | options:
34 | - label: I agree to follow this project's Code of Conduct
35 | required: true
36 | - type: checkboxes
37 | id: willing
38 | attributes:
39 | label: Are you willing to resolve this issue by submitting a Pull Request?
40 | description: Remember that first-time contributors are welcome! 🙌
41 | options:
42 | - label: Yes, I have the time, and I know how to start.
43 | - label: Yes, I have the time, but I don't know how to start. I would need guidance.
44 | - label: No, I don't have the time, although I believe I could do it if I had the time...
45 | - label: No, I don't have the time and I wouldn't even know how to start.
46 | validations:
47 | required: true
48 | - type: markdown
49 | attributes:
50 | value: 👋 Have a great day and thank you for the feature request!
51 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/config.yml:
--------------------------------------------------------------------------------
1 | blank_issues_enabled: false
2 | contact_links:
3 | - name: Discussions
4 | url: https://github.com/34j/so-vits-svc-fork/discussions
5 | about: Please ask and answer questions here.
6 |
--------------------------------------------------------------------------------
/.github/PULL_REQUEST_TEMPLATE.md:
--------------------------------------------------------------------------------
1 |
7 |
8 | ### Description of change
9 |
10 |
13 |
14 | copilot:all
15 |
16 |
29 |
30 | ### Pull-Request Checklist
31 |
32 |
37 |
38 | - [ ] Code is up-to-date with the `main` branch
39 | - [ ] This pull request follows [Contributing.md](https://github.com/34j/so-vits-svc-fork/blob/main/CONTRIBUTING.md)
40 | - [ ] This pull request links relevant issues as `Fixes #0000`
41 | - [ ] `pre-commit run -a` passes with this change or ci passes
42 | - [ ] `poetry run pytest` passes with this change or ci passes
43 | - [ ] (There are new or updated unit tests validating the change)
44 | - [ ] Documentation has been updated to reflect this change
45 | - [ ] The new commits follow conventions outlined in the [conventional commit spec](https://www.conventionalcommits.org/en/v1.0.0/)
46 |
47 |
50 |
--------------------------------------------------------------------------------
/.github/labels.toml:
--------------------------------------------------------------------------------
1 | [breaking]
2 | color = "ffcc00"
3 | name = "breaking"
4 | description = "Breaking change."
5 |
6 | [bug]
7 | color = "d73a4a"
8 | name = "bug"
9 | description = "Something isn't working"
10 |
11 | [dependencies]
12 | color = "0366d6"
13 | name = "dependencies"
14 | description = "Pull requests that update a dependency file"
15 |
16 | [github_actions]
17 | color = "000000"
18 | name = "github_actions"
19 | description = "Update of github actions"
20 |
21 | [documentation]
22 | color = "1bc4a5"
23 | name = "documentation"
24 | description = "Improvements or additions to documentation"
25 |
26 | [duplicate]
27 | color = "cfd3d7"
28 | name = "duplicate"
29 | description = "This issue or pull request already exists"
30 |
31 | [enhancement]
32 | color = "a2eeef"
33 | name = "enhancement"
34 | description = "New feature or request"
35 |
36 | ["good first issue"]
37 | color = "7057ff"
38 | name = "good first issue"
39 | description = "Good for newcomers"
40 |
41 | ["help wanted"]
42 | color = "008672"
43 | name = "help wanted"
44 | description = "Extra attention is needed"
45 |
46 | [invalid]
47 | color = "e4e669"
48 | name = "invalid"
49 | description = "This doesn't seem right"
50 |
51 | [nochangelog]
52 | color = "555555"
53 | name = "nochangelog"
54 | description = "Exclude pull requests from changelog"
55 |
56 | [question]
57 | color = "d876e3"
58 | name = "question"
59 | description = "Further information is requested"
60 |
61 | [removed]
62 | color = "e99695"
63 | name = "removed"
64 | description = "Removed piece of functionalities."
65 |
66 | [tests]
67 | color = "bfd4f2"
68 | name = "tests"
69 | description = "CI, CD and testing related changes"
70 |
71 | [wontfix]
72 | color = "ffffff"
73 | name = "wontfix"
74 | description = "This will not be worked on"
75 |
76 | [discussion]
77 | color = "c2e0c6"
78 | name = "discussion"
79 | description = "Some discussion around the project"
80 |
81 | [hacktoberfest]
82 | color = "ffa663"
83 | name = "hacktoberfest"
84 | description = "Good issues for Hacktoberfest"
85 |
86 | [answered]
87 | color = "0ee2b6"
88 | name = "answered"
89 | description = "Automatically closes as answered after a delay"
90 |
91 | [waiting]
92 | color = "5f7972"
93 | name = "waiting"
94 | description = "Automatically closes if no answer after a delay"
95 |
--------------------------------------------------------------------------------
/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
1 | name: CI
2 |
3 | on:
4 | push:
5 | branches:
6 | - main
7 | pull_request:
8 |
9 | concurrency:
10 | group: ${{ github.head_ref || github.run_id }}
11 | cancel-in-progress: true
12 |
13 | jobs:
14 | lint:
15 | runs-on: ubuntu-latest
16 | steps:
17 | - uses: actions/checkout@v3
18 | - uses: actions/setup-python@v5
19 | with:
20 | python-version: "3.9"
21 | - uses: pre-commit/action@v3.0.1
22 |
23 | # Make sure commit messages follow the conventional commits convention:
24 | # https://www.conventionalcommits.org
25 | commitlint:
26 | name: Lint Commit Messages
27 | runs-on: ubuntu-latest
28 | steps:
29 | - uses: actions/checkout@v3
30 | with:
31 | fetch-depth: 0
32 | - uses: wagoid/commitlint-github-action@v5.5.1
33 |
34 | test:
35 | strategy:
36 | fail-fast: false
37 | matrix:
38 | python-version:
39 | - "3.8"
40 | - "3.9"
41 | - "3.10"
42 | - "3.11"
43 | # - "3.12"
44 | os:
45 | - ubuntu-latest
46 | # - windows-latest
47 | # - macOS-latest
48 | runs-on: ${{ matrix.os }}
49 | steps:
50 | - uses: actions/checkout@v3
51 | - name: Set up Python
52 | uses: actions/setup-python@v5
53 | with:
54 | python-version: ${{ matrix.python-version }}
55 | - uses: snok/install-poetry@v1.3.4
56 | - name: Install Dependencies
57 | run: poetry install
58 | shell: bash
59 | - name: Test with Pytest
60 | run: poetry run pytest --cov-report=xml
61 | shell: bash
62 | - name: Upload coverage to Codecov
63 | uses: codecov/codecov-action@v4
64 | with:
65 | token: ${{ secrets.CODECOV_TOKEN }}
66 |
67 | release:
68 | runs-on: ubuntu-latest
69 | environment: release
70 | if: github.ref == 'refs/heads/main'
71 | needs:
72 | - test
73 | - lint
74 | - commitlint
75 |
76 | steps:
77 | - uses: actions/checkout@v3
78 | with:
79 | fetch-depth: 0
80 |
81 | # Run semantic release:
82 | # - Update CHANGELOG.md
83 | # - Update version in code
84 | # - Create git tag
85 | # - Create GitHub release
86 | # - Publish to PyPI
87 | - name: Python Semantic Release
88 | uses: relekang/python-semantic-release@v7.34.6
89 | with:
90 | github_token: ${{ secrets.GITHUB_TOKEN }}
91 | pypi_token: ${{ secrets.PYPI_TOKEN }}
92 |
--------------------------------------------------------------------------------
/.github/workflows/hacktoberfest.yml:
--------------------------------------------------------------------------------
1 | name: Hacktoberfest
2 |
3 | on:
4 | schedule:
5 | # Run every day in October
6 | - cron: "0 0 * 10 *"
7 | # Run on the 1st of November to revert
8 | - cron: "0 13 1 11 *"
9 |
10 | jobs:
11 | hacktoberfest:
12 | runs-on: ubuntu-latest
13 |
14 | steps:
15 | - uses: browniebroke/hacktoberfest-labeler-action@v2.3.0
16 | with:
17 | github_token: ${{ secrets.GH_PAT }}
18 |
--------------------------------------------------------------------------------
/.github/workflows/issue-manager.yml:
--------------------------------------------------------------------------------
1 | name: Issue Manager
2 |
3 | on:
4 | schedule:
5 | - cron: "0 0 * * *"
6 | issue_comment:
7 | types:
8 | - created
9 | issues:
10 | types:
11 | - labeled
12 | pull_request_target:
13 | types:
14 | - labeled
15 | workflow_dispatch:
16 |
17 | jobs:
18 | issue-manager:
19 | runs-on: ubuntu-latest
20 | steps:
21 | - uses: tiangolo/issue-manager@0.5.0
22 | with:
23 | token: ${{ secrets.GITHUB_TOKEN }}
24 | config: >
25 | {
26 | "answered": {
27 | "message": "Assuming the original issue was solved, it will be automatically closed now."
28 | },
29 | "waiting": {
30 | "message": "Automatically closing. To re-open, please provide the additional information requested."
31 | }
32 | }
33 |
--------------------------------------------------------------------------------
/.github/workflows/labels.yml:
--------------------------------------------------------------------------------
1 | name: Sync Github labels
2 |
3 | on:
4 | push:
5 | branches:
6 | - main
7 | paths:
8 | - ".github/**"
9 |
10 | jobs:
11 | labels:
12 | runs-on: ubuntu-latest
13 | steps:
14 | - uses: actions/checkout@v3
15 | - name: Set up Python
16 | uses: actions/setup-python@v5
17 | with:
18 | python-version: 3.8
19 | - name: Install labels
20 | run: pip install labels
21 | - name: Sync config with Github
22 | run: labels -u ${{ github.repository_owner }} -t ${{ secrets.GITHUB_TOKEN }} sync -f .github/labels.toml
23 |
--------------------------------------------------------------------------------
/.github/workflows/poetry-upgrade.yml:
--------------------------------------------------------------------------------
1 | name: Upgrader
2 |
3 | on:
4 | workflow_dispatch:
5 | schedule:
6 | - cron: "29 23 16 * *"
7 |
8 | jobs:
9 | upgrade:
10 | uses: browniebroke/github-actions/.github/workflows/poetry-upgrade.yml@v1
11 | secrets:
12 | gh_pat: ${{ secrets.GH_PAT }}
13 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Created by .ignore support plugin (hsz.mobi)
2 | ### Python template
3 | # Byte-compiled / optimized / DLL files
4 | __pycache__/
5 | *.py[cod]
6 | *$py.class
7 |
8 | # C extensions
9 | *.so
10 |
11 | # Distribution / packaging
12 | .Python
13 | build/
14 | develop-eggs/
15 | dist/
16 | downloads/
17 | eggs/
18 | .eggs/
19 | lib/
20 | lib64/
21 | parts/
22 | sdist/
23 | var/
24 | wheels/
25 | share/python-wheels/
26 | *.egg-info/
27 | .installed.cfg
28 | *.egg
29 | MANIFEST
30 |
31 | # PyInstaller
32 | # Usually these files are written by a python script from a template
33 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
34 | *.manifest
35 | *.spec
36 |
37 | # Installer logs
38 | pip-log.txt
39 | pip-delete-this-directory.txt
40 |
41 | # Unit test / coverage reports
42 | htmlcov/
43 | .tox/
44 | .nox/
45 | .coverage
46 | .coverage.*
47 | .cache
48 | nosetests.xml
49 | coverage.xml
50 | *.cover
51 | *.py,cover
52 | .hypothesis/
53 | .pytest_cache/
54 | cover/
55 |
56 | # Translations
57 | *.mo
58 | *.pot
59 |
60 | # Django stuff:
61 | *.log
62 | local_settings.py
63 | db.sqlite3
64 | db.sqlite3-journal
65 |
66 | # Flask stuff:
67 | instance/
68 | .webassets-cache
69 |
70 | # Scrapy stuff:
71 | .scrapy
72 |
73 | # Sphinx documentation
74 | docs/_build/
75 |
76 | # PyBuilder
77 | .pybuilder/
78 | target/
79 |
80 | # Jupyter Notebook
81 | .ipynb_checkpoints
82 |
83 | # IPython
84 | profile_default/
85 | ipython_config.py
86 |
87 | # pyenv
88 | # For a library or package, you might want to ignore these files since the code is
89 | # intended to run in multiple environments; otherwise, check them in:
90 | # .python-version
91 |
92 | # pipenv
93 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
94 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
95 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
96 | # install all needed dependencies.
97 | #Pipfile.lock
98 |
99 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
100 | __pypackages__/
101 |
102 | # Celery stuff
103 | celerybeat-schedule
104 | celerybeat.pid
105 |
106 | # SageMath parsed files
107 | *.sage.py
108 |
109 | # Environments
110 | .env
111 | .venv
112 | env/
113 | venv/
114 | ENV/
115 | env.bak/
116 | venv.bak/
117 |
118 | # Spyder project settings
119 | .spyderproject
120 | .spyproject
121 |
122 | # Rope project settings
123 | .ropeproject
124 |
125 | # mkdocs documentation
126 | /site
127 |
128 | # mypy
129 | .mypy_cache/
130 | .dmypy.json
131 | dmypy.json
132 |
133 | # Pyre type checker
134 | .pyre/
135 |
136 | # pytype static type analyzer
137 | .pytype/
138 |
139 | # Cython debug symbols
140 | cython_debug/
141 |
142 | # additional files
143 | tests/**/*.wav
144 | !tests/dataset_raw/test/**/*.wav
145 | tests/**/*.npy
146 | tests/**/*.pt
147 | tests/**/*.txt
148 | tests/**/*.json
149 | tests/**/*.pth
150 | tests/**/*.download
151 | tests/**/*.lab
152 | tests/**/*.pdf
153 | tests/**/*.csv
154 | tests/**/*.ckpt
155 | tests/**/*.yaml
156 | *.tfevents.*
157 | *.pt
158 | user_gui_presets.json
159 |
--------------------------------------------------------------------------------
/.gitpod.yml:
--------------------------------------------------------------------------------
1 | tasks:
2 | - command: |
3 | pip install poetry
4 | PIP_USER=false poetry install
5 | - command: |
6 | pip install pre-commit
7 | pre-commit install
8 | PIP_USER=false pre-commit install-hooks
9 |
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/Project_Default.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/profiles_settings.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
--------------------------------------------------------------------------------
/.idea/so-vits-svc-fork.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
--------------------------------------------------------------------------------
/.idea/watcherTasks.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 |
52 |
53 |
56 |
57 |
58 |
59 |
60 |
61 |
62 |
63 |
64 |
65 |
66 |
--------------------------------------------------------------------------------
/.idea/workspace.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 | {
50 | "keyToString": {
51 | "RunOnceActivity.OpenProjectViewOnStart": "true",
52 | "RunOnceActivity.ShowReadmeOnStart": "true",
53 | "WebServerToolWindowFactoryState": "false",
54 | "node.js.detected.package.eslint": "true",
55 | "node.js.selected.package.eslint": "(autodetect)",
56 | "nodejs_package_manager_path": "npm"
57 | }
58 | }
59 |
60 |
61 |
62 |
63 |
64 |
65 |
66 |
67 |
68 |
69 |
70 |
71 |
72 |
73 |
74 |
75 |
76 |
77 |
78 |
79 |
80 |
81 |
82 |
83 |
84 |
85 |
86 |
87 |
88 |
89 |
90 |
91 |
92 |
93 |
94 |
95 |
96 |
97 |
98 |
99 |
100 |
101 |
102 |
103 |
104 |
105 |
106 |
107 |
108 |
109 |
110 |
111 |
112 |
113 |
114 |
115 |
116 |
117 |
118 |
119 |
120 |
121 |
122 |
123 | 1678892092249
124 |
125 |
126 | 1678892092249
127 |
128 |
129 |
130 |
131 |
132 |
133 |
134 |
135 |
136 |
137 |
138 |
139 |
140 |
147 |
148 |
149 |
150 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | # See https://pre-commit.com for more information
2 | # See https://pre-commit.com/hooks.html for more hooks
3 | exclude: "CHANGELOG.md|.copier-answers.yml"
4 | default_stages: [commit]
5 |
6 | ci:
7 | autofix_commit_msg: "chore(pre-commit.ci): auto fixes"
8 | autoupdate_commit_msg: "chore(pre-commit.ci): pre-commit autoupdate"
9 |
10 | repos:
11 | - repo: https://github.com/commitizen-tools/commitizen
12 | rev: v3.28.0
13 | hooks:
14 | - id: commitizen
15 | stages: [commit-msg]
16 | - repo: https://github.com/pre-commit/pre-commit-hooks
17 | rev: v4.6.0
18 | hooks:
19 | - id: debug-statements
20 | - id: check-builtin-literals
21 | - id: check-case-conflict
22 | - id: check-docstring-first
23 | - id: check-json
24 | - id: check-toml
25 | - id: check-xml
26 | - id: check-yaml
27 | - id: detect-private-key
28 | - id: end-of-file-fixer
29 | - id: trailing-whitespace
30 | - repo: https://github.com/python-poetry/poetry
31 | rev: 1.8.3
32 | hooks:
33 | - id: poetry-check
34 | - repo: https://github.com/pre-commit/mirrors-prettier
35 | rev: v3.1.0
36 | hooks:
37 | - id: prettier
38 | args: ["--tab-width", "2"]
39 | - repo: https://github.com/asottile/pyupgrade
40 | rev: v3.17.0
41 | hooks:
42 | - id: pyupgrade
43 | args: [--py38-plus]
44 | - repo: https://github.com/PyCQA/autoflake
45 | rev: v2.3.1
46 | hooks:
47 | - id: autoflake
48 | - repo: https://github.com/PyCQA/isort
49 | rev: 5.13.2
50 | hooks:
51 | - id: isort
52 | - repo: https://github.com/psf/black
53 | rev: 24.1.0
54 | hooks:
55 | - id: black
56 | - repo: https://github.com/codespell-project/codespell
57 | rev: v2.2.6
58 | hooks:
59 | - id: codespell
60 | args: [-w]
61 | - repo: https://github.com/PyCQA/flake8
62 | rev: 7.1.1
63 | hooks:
64 | - id: flake8
65 | #- repo: https://github.com/pre-commit/mirrors-mypy
66 | # rev: v0.931
67 | # hooks:
68 | # - id: mypy
69 | # additional_dependencies: []
70 | # - repo: https://github.com/PyCQA/bandit
71 | # rev: 1.7.4
72 | # hooks:
73 | # - id: bandit
74 | # args: [-x, tests]
75 | - repo: https://github.com/srstevenson/nb-clean
76 | rev: "3.3.0"
77 | hooks:
78 | - id: nb-clean
79 |
--------------------------------------------------------------------------------
/.readthedocs.yml:
--------------------------------------------------------------------------------
1 | # Read the Docs configuration file
2 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
3 |
4 | # Required
5 | version: 2
6 |
7 | # Set the version of Python and other tools you might need
8 | build:
9 | os: ubuntu-20.04
10 | tools:
11 | python: "3.9"
12 | jobs:
13 | post_create_environment:
14 | # Install poetry
15 | - pip install poetry
16 | # Tell poetry to not use a virtual environment
17 | - poetry config virtualenvs.create false
18 | post_install:
19 | # Install dependencies
20 | - poetry install --with docs
21 |
22 | # Build documentation in the docs directory with Sphinx
23 | sphinx:
24 | configuration: docs/conf.py
25 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing
2 |
3 | Contributions are welcome, and they are greatly appreciated! Every little helps, and credit will always be given.
4 |
5 | You can contribute in many ways:
6 |
7 | ## Types of Contributions
8 |
9 | ### Report Bugs
10 |
11 | Report bugs to [our issue page][gh-issues]. If you are reporting a bug, please include:
12 |
13 | - Your operating system name and version.
14 | - Any details about your local setup that might be helpful in troubleshooting.
15 | - Detailed steps to reproduce the bug.
16 |
17 | ### Fix Bugs
18 |
19 | Look through the GitHub issues for bugs. Anything tagged with "bug" and "help wanted" is open to whoever wants to implement it.
20 |
21 | ### Implement Features
22 |
23 | Look through the GitHub issues for features. Anything tagged with "enhancement" and "help wanted" is open to whoever wants to implement it.
24 |
25 | ### Write Documentation
26 |
27 | SoftVC VITS Singing Voice Conversion Fork could always use more documentation, whether as part of the official SoftVC VITS Singing Voice Conversion Fork docs, in docstrings, or even on the web in blog posts, articles, and such.
28 |
29 | ### Submit Feedback
30 |
31 | The best way to send feedback is to file an issue at [our issue page][gh-issues] on GitHub. If you are proposing a feature:
32 |
33 | - Explain in detail how it would work.
34 | - Keep the scope as narrow as possible, to make it easier to implement.
35 | - Remember that this is a volunteer-driven project, and that contributions are welcome 😊
36 |
37 | ## Get Started!
38 |
39 | Ready to contribute? Here's how to set yourself up for local development.
40 |
41 | 1. Fork the repo on GitHub.
42 |
43 | 2. Clone your fork locally:
44 |
45 | ```shell
46 | $ git clone git@github.com:your_name_here/so-vits-svc-fork.git
47 | ```
48 |
49 | 3. Install the project dependencies with [Poetry](https://python-poetry.org):
50 |
51 | ```shell
52 | $ poetry install
53 | ```
54 |
55 | 4. Create a branch for local development:
56 |
57 | ```shell
58 | $ git checkout -b name-of-your-bugfix-or-feature
59 | ```
60 |
61 | Now you can make your changes locally.
62 |
63 | 5. When you're done making changes, check that your changes pass our tests:
64 |
65 | ```shell
66 | $ poetry run pytest
67 | ```
68 |
69 | 6. Linting is done through [pre-commit](https://pre-commit.com). Provided you have the tool installed globally, you can run them all as a one-off:
70 |
71 | ```shell
72 | $ pre-commit run -a
73 | ```
74 |
75 | Or better, install the hooks once and have them run automatically each time you commit:
76 |
77 | ```shell
78 | $ pre-commit install
79 | ```
80 |
81 | 7. Commit your changes and push your branch to GitHub:
82 |
83 | ```shell
84 | $ git add .
85 | $ git commit -m "feat(something): your detailed description of your changes"
86 | $ git push origin name-of-your-bugfix-or-feature
87 | ```
88 |
89 | Note: the commit message should follow [the conventional commits](https://www.conventionalcommits.org). We run [`commitlint` on CI](https://github.com/marketplace/actions/commit-linter) to validate it, and if you've installed pre-commit hooks at the previous step, the message will be checked at commit time.
90 |
91 | 8. Submit a pull request through the GitHub website or using the GitHub CLI (if you have it installed):
92 |
93 | ```shell
94 | $ gh pr create --fill
95 | ```
96 |
97 | ## Pull Request Guidelines
98 |
99 | We like to have pull requests opened as soon as possible; they are a great place to discuss any piece of work, even unfinished. You can use a draft pull request if it is still a work in progress. Here are a few guidelines to follow:
100 |
101 | 1. Include tests for new features or bug fixes.
102 | 2. Update the documentation for significant features.
103 | 3. Ensure tests are passing on CI.
104 |
105 | ## Tips
106 |
107 | To run a subset of tests:
108 |
109 | ```shell
110 | $ pytest tests
111 | ```
112 |
113 | ## Making a new release
114 |
115 | The deployment should be automated and can be triggered from the Semantic Release workflow in GitHub. The next version will be based on [the commit logs](https://python-semantic-release.readthedocs.io/en/latest/commit-log-parsing.html#commit-log-parsing). This is done by [python-semantic-release](https://python-semantic-release.readthedocs.io/en/latest/index.html) via a GitHub action.
116 |
117 | [gh-issues]: https://github.com/34j/so-vits-svc-fork/issues
118 |
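119 | As a sketch of how commit types map to releases (assuming python-semantic-release's default Angular-style commit parsing; the commit messages below are made-up examples):
120 |
121 | ```shell
122 | $ git commit -m "fix(gui): handle a missing preset file"      # patch release, e.g. 4.0.0 -> 4.0.1
123 | $ git commit -m "feat(inference): add a new f0 method"        # minor release, e.g. 4.0.1 -> 4.1.0
124 | $ git commit -m "feat(train)!: change the checkpoint format" -m "BREAKING CHANGE: old checkpoints must be converted"    # major release, e.g. 4.1.0 -> 5.0.0
125 | ```
126 |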
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM pytorch/pytorch:2.0.1-cuda11.7-cudnn8-runtime
2 | RUN ["apt", "update"]
3 | RUN ["apt", "install", "-y", "build-essential"]
4 | RUN ["pip", "install", "-U", "pip", "setuptools", "wheel"]
5 | RUN ["pip", "install", "-U", "so-vits-svc-fork"]
6 | ENTRYPOINT [ "svcg" ]
7 |
--------------------------------------------------------------------------------
/commitlint.config.js:
--------------------------------------------------------------------------------
1 | module.exports = {
2 | extends: ["@commitlint/config-conventional"],
3 | rules: {
4 | "header-max-length": [0, "always", Infinity],
5 | "body-max-line-length": [0, "always", Infinity],
6 | "footer-max-line-length": [0, "always", Infinity],
7 | },
8 | };
9 |
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line, and also
5 | # from the environment for the first two.
6 | SPHINXOPTS ?=
7 | SPHINXBUILD ?= sphinx-build
8 | SOURCEDIR = .
9 | BUILDDIR = _build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | .PHONY: help Makefile
16 |
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 |
--------------------------------------------------------------------------------
/docs/_static/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/voicepaw/so-vits-svc-fork/0f015e32aada5cf7481f91bbe6758e574c9c5f39/docs/_static/.gitkeep
--------------------------------------------------------------------------------
/docs/_static/gui.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/voicepaw/so-vits-svc-fork/0f015e32aada5cf7481f91bbe6758e574c9c5f39/docs/_static/gui.png
--------------------------------------------------------------------------------
/docs/changelog.md:
--------------------------------------------------------------------------------
1 | ```{include} ../CHANGELOG.md
2 |
3 | ```
4 |
--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
1 | # Configuration file for the Sphinx documentation builder.
2 | #
3 | # This file only contains a selection of the most common options. For a full
4 | # list see the documentation:
5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html
6 | from pathlib import Path
7 | from typing import Any
8 |
9 | from sphinx.application import Sphinx
10 | from sphinx.ext import apidoc
11 |
12 | # -- Project information -----------------------------------------------------
13 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
14 |
15 | project = "SoftVC VITS Singing Voice Conversion Fork"
16 | copyright = "2023, 34j"
17 | author = "34j"
18 | release = "0.0.0"
19 |
20 | # -- General configuration ---------------------------------------------------
21 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
22 |
23 | # Add any Sphinx extension module names here, as strings. They can be
24 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
25 | # ones.
26 | extensions = [
27 | "myst_parser",
28 | "sphinx.ext.napoleon",
29 | "sphinx.ext.autodoc",
30 | "sphinx.ext.viewcode",
31 | ]
32 | napoleon_google_docstring = False
33 |
34 | # The suffix of source filenames.
35 | source_suffix = [".rst", ".md"]
36 |
37 | # Add any paths that contain templates here, relative to this directory.
38 | templates_path = ["_templates"]
39 |
40 | # List of patterns, relative to source directory, that match files and
41 | # directories to ignore when looking for source files.
42 | # This pattern also affects html_static_path and html_extra_path.
43 | exclude_patterns: list[str] = []
44 |
45 |
46 | # -- Options for HTML output -------------------------------------------------
47 |
48 | # The theme to use for HTML and HTML Help pages. See the documentation for
49 | # a list of builtin themes.
50 | #
51 | html_theme = "sphinx_rtd_theme"
52 |
53 | # Add any paths that contain custom static files (such as style sheets) here,
54 | # relative to this directory. They are copied after the builtin static files,
55 | # so a file named "default.css" will overwrite the builtin "default.css".
56 | html_static_path = ["_static"]
57 |
58 |
59 | # -- Automatically run sphinx-apidoc -----------------------------------------
60 |
61 |
62 | def run_apidoc(_: Any) -> None:
63 | docs_path = Path(__file__).parent
64 | module_path = docs_path.parent / "src" / "so_vits_svc_fork"
65 |
66 | apidoc.main(
67 | [
68 | "--force",
69 | "--module-first",
70 | "-o",
71 | docs_path.as_posix(),
72 | module_path.as_posix(),
73 | ]
74 | )
75 |
76 |
77 | def setup(app: Sphinx) -> None:
78 | app.connect("builder-inited", run_apidoc)
79 |
--------------------------------------------------------------------------------
/docs/contributing.md:
--------------------------------------------------------------------------------
1 | ```{include} ../CONTRIBUTING.md
2 |
3 | ```
4 |
--------------------------------------------------------------------------------
/docs/index.md:
--------------------------------------------------------------------------------
1 | # Welcome to SoftVC VITS Singing Voice Conversion Fork documentation!
2 |
3 | ```{toctree}
4 | :caption: Installation & Usage
5 | :maxdepth: 2
6 |
7 | installation
8 | usage
9 | ```
10 |
11 | ```{toctree}
12 | :caption: Project Info
13 | :maxdepth: 2
14 |
15 | changelog
16 | contributing
17 | ```
18 |
19 | ```{toctree}
20 | :caption: API Reference
21 | :maxdepth: 2
22 |
23 | so_vits_svc_fork
24 | ```
25 |
26 | ```{include} ../README.md
27 |
28 | ```
29 |
--------------------------------------------------------------------------------
/docs/installation.md:
--------------------------------------------------------------------------------
1 | # Installation
2 |
3 | The package is published on [PyPI](https://pypi.org/project/so-vits-svc-fork/) and can be installed with `pip` (or any equivalent):
4 |
5 | ```bash
6 | pip install so-vits-svc-fork
7 | ```
8 |
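 9 | For NVIDIA GPU support, you will likely also want a CUDA build of PyTorch. As a sketch (mirroring what `easy-installation/install.bat` does; the `cu118` index is an assumption that depends on your driver and CUDA version):
10 |
11 | ```bash
12 | pip install torch torchaudio --index-url https://download.pytorch.org/whl/cu118
13 | pip install so-vits-svc-fork
14 | ```
15 |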
--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | pushd %~dp0
4 |
5 | REM Command file for Sphinx documentation
6 |
7 | if "%SPHINXBUILD%" == "" (
8 | set SPHINXBUILD=sphinx-build
9 | )
10 | set SOURCEDIR=.
11 | set BUILDDIR=_build
12 |
13 | %SPHINXBUILD% >NUL 2>NUL
14 | if errorlevel 9009 (
15 | echo.
16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
17 | echo.installed, then set the SPHINXBUILD environment variable to point
18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you
19 | echo.may add the Sphinx directory to PATH.
20 | echo.
21 | echo.If you don't have Sphinx installed, grab it from
22 | echo.https://www.sphinx-doc.org/
23 | exit /b 1
24 | )
25 |
26 | if "%1" == "" goto help
27 |
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 |
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 |
34 | :end
35 | popd
36 |
--------------------------------------------------------------------------------
/docs/usage.md:
--------------------------------------------------------------------------------
1 | # Usage
2 |
3 | To use this package, import it:
4 |
5 | ```python
6 | import so_vits_svc_fork
7 | ```
8 |
9 | TODO: Document usage
10 |
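11 | In the meantime, here is a minimal sketch of the command-line workflow, using the `svc` subcommands from `notebooks/so-vits-svc-fork-4.0.ipynb` and the `svcg` GUI entry point from the `Dockerfile`:
12 |
13 | ```bash
14 | # Preprocess a dataset placed under dataset_raw/(speaker_name)
15 | svc pre-resample          # resample audio to the target sample rate
16 | svc pre-config            # generate file lists and config.json
17 | svc pre-hubert -fm dio    # extract HuBERT features and f0 (dio method)
18 |
19 | # Launch the GUI for inference
20 | svcg
21 | ```
22 |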
--------------------------------------------------------------------------------
/easy-installation/install-cn.bat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/voicepaw/so-vits-svc-fork/0f015e32aada5cf7481f91bbe6758e574c9c5f39/easy-installation/install-cn.bat
--------------------------------------------------------------------------------
/easy-installation/install.bat:
--------------------------------------------------------------------------------
1 | @echo off
2 |
3 | echo You can rerun this script to update the installation.
4 |
5 | echo Moving to AppData\Roaming\so-vits-svc-fork...
6 | mkdir "%APPDATA%\so-vits-svc-fork" >nul 2>&1
7 | cd "%APPDATA%\so-vits-svc-fork"
8 |
9 | echo Checking for Python 3.10...
10 |
11 | py -3.10 --version >nul 2>&1
12 | if %errorlevel%==0 (
13 | echo Python 3.10 is already installed.
14 | ) else (
15 | echo Python 3.10 is not installed. Downloading installer...
16 | curl https://www.python.org/ftp/python/3.10.10/python-3.10.10-amd64.exe -o python-3.10.10-amd64.exe
17 |
18 | echo Installing Python 3.10...
19 | python-3.10.10-amd64.exe /quiet InstallAllUsers=1 PrependPath=1
20 |
21 | echo Cleaning up installer...
22 | del python-3.10.10-amd64.exe
23 | )
24 |
25 | echo Creating virtual environment...
26 | py -3.10 -m venv venv
27 |
28 | echo Updating pip and wheel...
29 | venv\Scripts\python.exe -m pip install --upgrade pip wheel
30 |
31 | nvidia-smi >nul 2>&1
32 | if %errorlevel%==0 (
33 | echo Installing PyTorch with GPU support...
34 | venv\Scripts\pip.exe install torch torchaudio --index-url https://download.pytorch.org/whl/cu118
35 | ) else (
36 | echo Installing PyTorch without GPU support...
37 | venv\Scripts\pip.exe install torch torchaudio
38 | )
39 |
40 | echo Installing so-vits-svc-fork...
41 | venv\Scripts\pip.exe install so-vits-svc-fork
42 |
43 | rem echo Creating shortcut...
44 | rem powershell "$s=(New-Object -COM WScript.Shell).CreateShortcut('%USERPROFILE%\Desktop\so-vits-svc-fork.lnk');$s.TargetPath='%APPDATA%\so-vits-svc-fork\venv\Scripts\svcg.exe';$s.Save()"
45 |
46 | echo Creating shortcut to the start menu...
47 | powershell "$s=(New-Object -COM WScript.Shell).CreateShortcut('%APPDATA%\Microsoft\Windows\Start Menu\Programs\so-vits-svc-fork.lnk');$s.TargetPath='%APPDATA%\so-vits-svc-fork\venv\Scripts\svcg.exe';$s.Save()"
48 |
49 | echo Launching so-vits-svc-fork GUI...
50 | venv\Scripts\svcg.exe
51 |
--------------------------------------------------------------------------------
/notebooks/so-vits-svc-fork-4.0.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## Before training\n",
8 | "\n",
9 | "This program saves the last 3 generations of models to Google Drive. Since one generation of models is >1GB, you should have at least 3GB of free space in your Google Drive. If you do not have enough free space, it is recommended to create another Google Account.\n",
10 | "\n",
11 | "Training requires >10GB of VRAM (a T4 should be enough). Inference does not require nearly as much VRAM."
12 | ]
13 | },
14 | {
15 | "cell_type": "markdown",
16 | "metadata": {},
17 | "source": [
18 | "## Installation"
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": null,
24 | "metadata": {},
25 | "outputs": [],
26 | "source": [
27 | "#@title Check GPU\n",
28 | "!nvidia-smi"
29 | ]
30 | },
31 | {
32 | "cell_type": "code",
33 | "execution_count": null,
34 | "metadata": {},
35 | "outputs": [],
36 | "source": [
37 | "#@title Mount Google Drive\n",
38 | "from google.colab import drive\n",
39 | "drive.mount('/content/drive')"
40 | ]
41 | },
42 | {
43 | "cell_type": "code",
44 | "execution_count": null,
45 | "metadata": {},
46 | "outputs": [],
47 | "source": [
48 | "#@title Install dependencies\n",
49 | "#@markdown pip may fail to resolve dependencies and raise an ERROR, but this can usually be ignored.\n",
50 | "!python -m pip install -U pip wheel\n",
51 | "%pip install -U ipython \n",
52 | "\n",
53 | "#@markdown Branch (for development)\n",
54 | "BRANCH = \"none\" #@param {\"type\": \"string\"}\n",
55 | "if BRANCH == \"none\":\n",
56 | " %pip install -U so-vits-svc-fork\n",
57 | "else:\n",
58 | " %pip install -U git+https://github.com/34j/so-vits-svc-fork.git@{BRANCH}"
59 | ]
60 | },
61 | {
62 | "cell_type": "markdown",
63 | "metadata": {},
64 | "source": [
65 | "## Training"
66 | ]
67 | },
68 | {
69 | "cell_type": "code",
70 | "execution_count": null,
71 | "metadata": {},
72 | "outputs": [],
73 | "source": [
74 | "#@title Make dataset directory\n",
75 | "!mkdir -p \"dataset_raw\""
76 | ]
77 | },
78 | {
79 | "cell_type": "code",
80 | "execution_count": null,
81 | "metadata": {},
82 | "outputs": [],
83 | "source": [
84 | "#!rm -r \"dataset_raw\"\n",
85 | "#!rm -r \"dataset/44k\""
86 | ]
87 | },
88 | {
89 | "cell_type": "code",
90 | "execution_count": null,
91 | "metadata": {},
92 | "outputs": [],
93 | "source": [
94 | "#@title Copy your dataset\n",
95 | "#@markdown **We assume that your dataset is in your Google Drive's `so-vits-svc-fork/dataset/(speaker_name)` directory.**\n",
96 | "DATASET_NAME = \"kiritan\" #@param {type: \"string\"}\n",
97 | "!cp -R /content/drive/MyDrive/so-vits-svc-fork/dataset/{DATASET_NAME}/ -t \"dataset_raw/\""
98 | ]
99 | },
100 | {
101 | "cell_type": "code",
102 | "execution_count": null,
103 | "metadata": {},
104 | "outputs": [],
105 | "source": [
106 | "#@title Download dataset (Tsukuyomi-chan JVS)\n",
107 | "#@markdown You can download this dataset if you don't have your own dataset.\n",
108 | "#@markdown Make sure you agree to the license when using this dataset.\n",
109 | "#@markdown https://tyc.rei-yumesaki.net/material/corpus/#toc6\n",
110 | "# !wget https://tyc.rei-yumesaki.net/files/sozai-tyc-corpus1.zip\n",
111 | "# !unzip sozai-tyc-corpus1.zip\n",
112 | "# !mv \"/content/つくよみちゃんコーパス Vol.1 声優統計コーパス(JVSコーパス準拠)/おまけ:WAV(+12dB増幅&高音域削減)/WAV(+12dB増幅&高音域削減)\" \"dataset_raw/tsukuyomi\""
113 | ]
114 | },
115 | {
116 | "cell_type": "code",
117 | "execution_count": null,
118 | "metadata": {},
119 | "outputs": [],
120 | "source": [
121 | "#@title Automatic preprocessing\n",
122 | "!svc pre-resample"
123 | ]
124 | },
125 | {
126 | "cell_type": "code",
127 | "execution_count": null,
128 | "metadata": {},
129 | "outputs": [],
130 | "source": [
131 | "!svc pre-config"
132 | ]
133 | },
134 | {
135 | "cell_type": "code",
136 | "execution_count": null,
137 | "metadata": {},
138 | "outputs": [],
139 | "source": [
140 | "#@title Import config file from Google Drive\n",
141 | "#@markdown This copies **config.json** from Google Drive to its default location (`configs/44k`). A backup of the current file is created first in case this is run accidentally.\n", "!cp configs/44k/config.json configs/44k/config.bkp.json\n", "!cp drive/MyDrive/so-vits-svc-fork/config.json configs/44k"
142 | ]
143 | },
144 | {
145 | "cell_type": "code",
146 | "execution_count": null,
147 | "metadata": {},
148 | "outputs": [],
149 | "source": [
150 | "#@title Export config file to Google Drive (Optional Step, NOT REQUIRED)\n",
151 | "#@markdown This copies **config.json** from its default location (`configs/44k`) to Google Drive. A backup of the copy already on Drive is created first in case this is run accidentally.\n", "!cp drive/MyDrive/so-vits-svc-fork/config.json drive/MyDrive/so-vits-svc-fork/config.bkp.json\n", "!cp configs/44k/config.json drive/MyDrive/so-vits-svc-fork"
152 | ]
153 | },
154 | {
155 | "cell_type": "code",
156 | "execution_count": null,
157 | "metadata": {},
158 | "outputs": [],
159 | "source": [
160 | "F0_METHOD = \"dio\" #@param [\"crepe\", \"crepe-tiny\", \"parselmouth\", \"dio\", \"harvest\"]\n",
161 | "!svc pre-hubert -fm {F0_METHOD}"
162 | ]
163 | },
164 | {
165 | "cell_type": "code",
166 | "execution_count": null,
167 | "metadata": {},
168 | "outputs": [],
169 | "source": [
170 | "#@title Train\n",
171 | "%load_ext tensorboard\n",
172 | "%tensorboard --logdir drive/MyDrive/so-vits-svc-fork/logs/44k\n",
173 | "!svc train --model-path drive/MyDrive/so-vits-svc-fork/logs/44k"
174 | ]
175 | },
176 | {
177 | "cell_type": "markdown",
178 | "metadata": {},
179 | "source": [
180 | "## Training Cluster model"
181 | ]
182 | },
183 | {
184 | "cell_type": "code",
185 | "execution_count": null,
186 | "metadata": {},
187 | "outputs": [],
188 | "source": [
189 | "!svc train-cluster --output-path drive/MyDrive/so-vits-svc-fork/logs/44k/kmeans.pt"
190 | ]
191 | },
192 | {
193 | "cell_type": "markdown",
194 | "metadata": {},
195 | "source": [
196 | "## Inference"
197 | ]
198 | },
199 | {
200 | "cell_type": "code",
201 | "execution_count": null,
202 | "metadata": {},
203 | "outputs": [],
204 | "source": [
205 | "#@title Get the author's voice as a source\n",
206 | "import random\n",
207 | "NAME = str(random.randint(1, 49))\n",
208 | "TYPE = \"fsd50k\" #@param [\"\", \"digit\", \"dog\", \"fsd50k\"]\n",
209 | "CUSTOM_FILEPATH = \"\" #@param {type: \"string\"}\n",
210 | "if CUSTOM_FILEPATH != \"\":\n",
211 | "    NAME = CUSTOM_FILEPATH\n",
212 | "else:\n",
213 | "    # it is extremely difficult to find a voice that can be downloaded from the internet directly\n",
214 | "    if TYPE == \"dog\":\n",
215 | "        # note: the zero-padded width of the filename is assumed here\n", "        !wget -N \"https://huggingface.co/datasets/437aewuh/dog-dataset/resolve/main/dogs/dogs_{int(NAME):04d}.wav\" -O {NAME}.wav\n",
216 | "    elif TYPE == \"digit\":\n",
217 | "        # george, jackson, lucas, nicolas, ...\n",
218 | "        !wget -N \"https://github.com/Jakobovski/free-spoken-digit-dataset/raw/master/recordings/0_george_{NAME}.wav\" -O {NAME}.wav\n",
219 | "    elif TYPE == \"fsd50k\":\n",
220 | "        # use /resolve/ instead of /blob/ so that the raw file is downloaded\n", "        !wget -N \"https://huggingface.co/datasets/Fhrozen/FSD50k/resolve/main/clips/dev/{10000 + int(NAME)}.wav\" -O {NAME}.wav\n",
221 | "    else:\n",
222 | "        !wget -N \"https://zunko.jp/sozai/utau/voice_{'kiritan' if int(NAME) < 25 else 'itako'}{int(NAME) % 5 + 1}.wav\" -O {NAME}.wav\n",
223 | "from IPython.display import Audio, display\n",
224 | "display(Audio(f\"{NAME}.wav\"))"
225 | ]
226 | },
227 | {
228 | "cell_type": "code",
229 | "execution_count": null,
230 | "metadata": {},
231 | "outputs": [],
232 | "source": [
233 | "#@title Use trained model\n",
234 | "#@markdown **Put your .wav file in your Google Drive's `so-vits-svc-fork/audio` directory.**\n",
235 | "from IPython.display import Audio, display\n",
236 | "!svc infer drive/MyDrive/so-vits-svc-fork/audio/{NAME}.wav -m drive/MyDrive/so-vits-svc-fork/logs/44k/ -c drive/MyDrive/so-vits-svc-fork/logs/44k/config.json\n",
237 | "display(Audio(f\"drive/MyDrive/so-vits-svc-fork/audio/{NAME}.out.wav\", autoplay=True))"
238 | ]
239 | },
240 | {
241 | "cell_type": "code",
242 | "execution_count": null,
243 | "metadata": {},
244 | "outputs": [],
245 | "source": [
246 | "#@title Use trained model (with cluster)\n",
247 | "!svc infer {NAME}.wav -s speaker -r 0.1 -m drive/MyDrive/so-vits-svc-fork/logs/44k/ -c drive/MyDrive/so-vits-svc-fork/logs/44k/config.json -k drive/MyDrive/so-vits-svc-fork/logs/44k/kmeans.pt\n",
248 | "display(Audio(f\"{NAME}.out.wav\", autoplay=True))"
249 | ]
250 | },
251 | {
252 | "cell_type": "markdown",
253 | "metadata": {},
254 | "source": [
255 | "### Pretrained models"
256 | ]
257 | },
258 | {
259 | "cell_type": "code",
260 | "execution_count": null,
261 | "metadata": {},
262 | "outputs": [],
263 | "source": [
264 | "#@title https://huggingface.co/TachibanaKimika/so-vits-svc-4.0-models/tree/main\n",
265 | "!wget -N \"https://huggingface.co/TachibanaKimika/so-vits-svc-4.0-models/resolve/main/riri/G_riri_220.pth\"\n",
266 | "!wget -N \"https://huggingface.co/TachibanaKimika/so-vits-svc-4.0-models/resolve/main/riri/config.json\""
267 | ]
268 | },
269 | {
270 | "cell_type": "code",
271 | "execution_count": null,
272 | "metadata": {},
273 | "outputs": [],
274 | "source": [
275 | "!svc infer {NAME}.wav -c config.json -m G_riri_220.pth\n",
276 | "display(Audio(f\"{NAME}.out.wav\", autoplay=True))"
277 | ]
278 | },
279 | {
280 | "cell_type": "code",
281 | "execution_count": null,
282 | "metadata": {},
283 | "outputs": [],
284 | "source": [
285 | "#@title https://huggingface.co/therealvul/so-vits-svc-4.0/tree/main\n",
286 | "!wget -N \"https://huggingface.co/therealvul/so-vits-svc-4.0/resolve/main/Pinkie%20(speaking%20sep)/G_166400.pth\"\n",
287 | "!wget -N \"https://huggingface.co/therealvul/so-vits-svc-4.0/resolve/main/Pinkie%20(speaking%20sep)/config.json\""
288 | ]
289 | },
290 | {
291 | "cell_type": "code",
292 | "execution_count": null,
293 | "metadata": {},
294 | "outputs": [],
295 | "source": [
296 | "# escape the literal braces so IPython does not expand {neutral} as an expression\n", "!svc infer {NAME}.wav --speaker \"Pinkie {{neutral}}\" -c config.json -m G_166400.pth\n",
297 | "display(Audio(f\"{NAME}.out.wav\", autoplay=True))"
298 | ]
299 | }
300 | ],
301 | "metadata": {
302 | "accelerator": "GPU",
303 | "colab": {
304 | "provenance": []
305 | },
306 | "gpuClass": "standard",
307 | "kernelspec": {
308 | "display_name": "Python 3",
309 | "name": "python3"
310 | },
311 | "language_info": {
312 | "codemirror_mode": {
313 | "name": "ipython",
314 | "version": 3
315 | },
316 | "file_extension": ".py",
317 | "mimetype": "text/x-python",
318 | "name": "python",
319 | "nbconvert_exporter": "python",
320 | "pygments_lexer": "ipython3"
321 | }
322 | },
323 | "nbformat": 4,
324 | "nbformat_minor": 0
325 | }
326 |
--------------------------------------------------------------------------------
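The `svc` commands used throughout the notebook are thin wrappers around the package's Python API; `svc infer`, for instance, corresponds to `so_vits_svc_fork.inference.main.infer` (listed later in this dump). A minimal sketch of calling it directly, with placeholder paths:

from so_vits_svc_fork.inference.main import infer

# Placeholder paths: point these at your own audio and checkpoints.
infer(
    input_path="input.wav",
    output_path="input.out.wav",
    model_path="logs/44k/G_0.pth",
    config_path="logs/44k/config.json",
    speaker="kiritan",
    f0_method="dio",
)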
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.poetry]
2 | name = "so-vits-svc-fork"
3 | version = "4.2.26"
4 | description = "A fork of so-vits-svc."
5 | authors = ["34j <34j.95a2p@simplelogin.com>"]
6 | license = "MIT"
7 | readme = "README.md"
8 | repository = "https://github.com/34j/so-vits-svc-fork"
9 | documentation = "https://so-vits-svc-fork.readthedocs.io"
10 | classifiers = [
11 | "Development Status :: 2 - Pre-Alpha",
12 | "Intended Audience :: Developers",
13 | "Natural Language :: English",
14 | "Operating System :: OS Independent",
15 | "Topic :: Software Development :: Libraries",
16 | ]
17 | packages = [
18 | { include = "so_vits_svc_fork", from = "src" },
19 | ]
20 |
21 | [tool.poetry.scripts]
22 | so-vits-svc-fork = "so_vits_svc_fork.__main__:cli"
23 | svc = "so_vits_svc_fork.__main__:cli"
24 | svcf = "so_vits_svc_fork.__main__:cli"
25 | svcg = "so_vits_svc_fork.gui:main"
26 | svc-gui = "so_vits_svc_fork.gui:main"
27 | svcf-gui = "so_vits_svc_fork.gui:main"
28 |
29 | [tool.poetry.urls]
30 | "Bug Tracker" = "https://github.com/34j/so-vits-svc-fork/issues"
31 | "Changelog" = "https://github.com/34j/so-vits-svc-fork/blob/main/CHANGELOG.md"
32 |
33 | [tool.poetry.dependencies]
34 | python = ">=3.9,<3.13"
35 | librosa = "*"
36 | numpy = "^1.26.4"
37 | pyworld = "*"
38 | requests = "*"
39 | scipy = "*"
40 | sounddevice = "*"
41 | SoundFile = "*"
42 | tqdm = "*"
43 | praat-parselmouth = "*"
44 | onnx = "*"
45 | onnxsim = "*"
46 | onnxoptimizer = "*"
47 | torch = "^2"
48 | torchaudio = "*"
49 | tensorboard = "*"
50 | rich = "*"
51 | tqdm-joblib = "^0.0.4"
52 | tensorboardx = "*"
53 | cm-time = ">=0.1.2"
54 | pebble = ">=5.0"
55 | torchcrepe = ">=0.0.17"
56 | lightning = "^2.0.1"
57 | fastapi = "==0.111.1"
58 | transformers = "^4.28.1"
59 | matplotlib = "^3.7.1"
60 | click = "^8.1.7"
61 | setuptools = "^69.5.1"
62 | pysimplegui-4-foss = "^4.60.4.1"
63 |
64 | [tool.poetry.group.dev.dependencies]
65 | pre-commit = ">=3"
66 | pytest = "^8.0.0"
67 | pytest-cov = "^4.0.0"
68 | pipdeptree = "^2.7.0"
69 | pip-licenses = "^5.0.0"
70 |
71 | [tool.poetry.group.docs]
72 | optional = true
73 |
74 | [tool.poetry.group.docs.dependencies]
75 | myst-parser = ">=0.16"
76 | sphinx = ">=4.0"
77 | sphinx-rtd-theme = ">=1.0"
78 |
79 | [tool.semantic_release]
80 | branch = "main"
81 | version_toml = "pyproject.toml:tool.poetry.version"
82 | version_variable = "src/so_vits_svc_fork/__init__.py:__version__"
83 | build_command = "pip install poetry && poetry build"
84 |
85 | [tool.pytest.ini_options]
86 | addopts = "-v -Wdefault --cov=so_vits_svc_fork --cov-report=term-missing:skip-covered"
87 | pythonpath = ["src"]
88 |
89 | [tool.coverage.run]
90 | branch = true
91 |
92 | [tool.coverage.report]
93 | exclude_lines = [
94 | "pragma: no cover",
95 | "@overload",
96 | "if TYPE_CHECKING",
97 | "raise NotImplementedError",
98 | 'if __name__ == "__main__":',
99 | ]
100 |
101 | [tool.isort]
102 | profile = "black"
103 | known_first_party = ["so_vits_svc_fork", "tests"]
104 |
105 | [tool.autoflake]
106 | remove_all_unused_imports = true
107 |
108 | [tool.mypy]
109 | check_untyped_defs = true
110 | disallow_any_generics = true
111 | disallow_incomplete_defs = true
112 | disallow_untyped_defs = true
113 | mypy_path = "src/"
114 | no_implicit_optional = true
115 | show_error_codes = true
116 | warn_unreachable = true
117 | warn_unused_ignores = true
118 | exclude = [
119 | 'docs/.*',
120 | 'setup.py',
121 | ]
122 |
123 | [[tool.mypy.overrides]]
124 | module = "tests.*"
125 | allow_untyped_defs = true
126 |
127 | [[tool.mypy.overrides]]
128 | module = "docs.*"
129 | ignore_errors = true
130 |
131 | [tool.bandit]
132 | exclude_dirs = ["src"]
133 |
134 | [build-system]
135 | requires = ["poetry-core>=1.0.0"]
136 | build-backend = "poetry.core.masonry.api"
137 |
--------------------------------------------------------------------------------
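`[tool.semantic_release]` keeps the version in `pyproject.toml` and in `src/so_vits_svc_fork/__init__.py` in sync on release. A quick consistency check from an environment where the package is installed (illustrative only):

from importlib.metadata import version

import so_vits_svc_fork

# Both should report 4.2.26 for this snapshot of the repository.
assert so_vits_svc_fork.__version__ == version("so-vits-svc-fork")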
/renovate.json:
--------------------------------------------------------------------------------
1 | {
2 | "extends": ["github>browniebroke/renovate-configs:python"]
3 | }
4 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | # This is a shim to allow GitHub to detect the package, build is done with poetry
4 | # Taken from https://github.com/Textualize/rich
5 |
6 | import setuptools
7 |
8 | if __name__ == "__main__":
9 | setuptools.setup(name="so-vits-svc-fork")
10 |
--------------------------------------------------------------------------------
/src/so_vits_svc_fork/__init__.py:
--------------------------------------------------------------------------------
1 | __version__ = "4.2.26"
2 |
3 | from .logger import init_logger
4 |
5 | init_logger()
6 |
--------------------------------------------------------------------------------
/src/so_vits_svc_fork/cluster/__init__.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from pathlib import Path
4 | from typing import Any
5 |
6 | import torch
7 | from sklearn.cluster import KMeans
8 |
9 |
10 | def get_cluster_model(ckpt_path: Path | str):
11 | with Path(ckpt_path).open("rb") as f:
12 | checkpoint = torch.load(
13 | f, map_location="cpu"
14 | ) # Danger of arbitrary code execution
15 | kmeans_dict = {}
16 | for spk, ckpt in checkpoint.items():
17 | km = KMeans(ckpt["n_features_in_"])
18 | km.__dict__["n_features_in_"] = ckpt["n_features_in_"]
19 | km.__dict__["_n_threads"] = ckpt["_n_threads"]
20 | km.__dict__["cluster_centers_"] = ckpt["cluster_centers_"]
21 | kmeans_dict[spk] = km
22 | return kmeans_dict
23 |
24 |
25 | def check_speaker(model: Any, speaker: Any):
26 | if speaker not in model:
27 | raise ValueError(f"Speaker {speaker} not in {list(model.keys())}")
28 |
29 |
30 | def get_cluster_result(model: Any, x: Any, speaker: Any):
31 | """
32 | x: np.array [t, 256]
33 | return cluster class result
34 | """
35 | check_speaker(model, speaker)
36 | return model[speaker].predict(x)
37 |
38 |
39 | def get_cluster_center_result(model: Any, x: Any, speaker: Any):
40 | """x: np.array [t, 256]"""
41 | check_speaker(model, speaker)
42 | predict = model[speaker].predict(x)
43 | return model[speaker].cluster_centers_[predict]
44 |
45 |
46 | def get_center(model: Any, x: Any, speaker: Any):
47 | check_speaker(model, speaker)
48 | return model[speaker].cluster_centers_[x]
49 |
--------------------------------------------------------------------------------
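These helpers rebuild a minimal `KMeans` object per speaker so that `.predict` works without refitting. A sketch of how they are typically used at inference time to pull HuBERT content features toward their cluster centers (the ratio mixing follows the usual so-vits-svc recipe; the path, speaker name, and ratio are illustrative):

import numpy as np

from so_vits_svc_fork.cluster import get_cluster_center_result, get_cluster_model

kmeans = get_cluster_model("logs/44k/kmeans.pt")
features = np.random.randn(100, 256).astype(np.float32)  # [t, 256] content features
centers = get_cluster_center_result(kmeans, features, speaker="kiritan")
ratio = 0.5  # cluster_infer_ratio: 0 = original features, 1 = pure cluster centers
mixed = ratio * centers + (1 - ratio) * features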
/src/so_vits_svc_fork/cluster/train_cluster.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import math
4 | from logging import getLogger
5 | from pathlib import Path
6 | from typing import Any
7 |
8 | import numpy as np
9 | import torch
10 | from cm_time import timer
11 | from joblib import Parallel, delayed
12 | from sklearn.cluster import KMeans, MiniBatchKMeans
13 | from tqdm_joblib import tqdm_joblib
14 |
15 | LOG = getLogger(__name__)
16 |
17 |
18 | def train_cluster(
19 | input_dir: Path | str,
20 | n_clusters: int,
21 | use_minibatch: bool = True,
22 | batch_size: int = 4096,
23 | partial_fit: bool = False,
24 | verbose: bool = False,
25 | ) -> dict:
26 | input_dir = Path(input_dir)
27 | if not partial_fit:
28 | LOG.info(f"Loading features from {input_dir}")
29 | features = []
30 | for path in input_dir.rglob("*.data.pt"):
31 | with path.open("rb") as f:
32 | features.append(
33 | torch.load(f, weights_only=True)["content"].squeeze(0).numpy().T
34 | )
35 | if not features:
36 | raise ValueError(f"No features found in {input_dir}")
37 | features = np.concatenate(features, axis=0).astype(np.float32)
38 | if features.shape[0] < n_clusters:
39 | raise ValueError(
40 | "Too few HuBERT features to cluster. Consider using a smaller number of clusters."
41 | )
42 | LOG.info(
43 | f"shape: {features.shape}, size: {features.nbytes/1024**2:.2f} MB, dtype: {features.dtype}"
44 | )
45 | with timer() as t:
46 | if use_minibatch:
47 | kmeans = MiniBatchKMeans(
48 | n_clusters=n_clusters,
49 | verbose=verbose,
50 | batch_size=batch_size,
51 | max_iter=80,
52 | n_init="auto",
53 | ).fit(features)
54 | else:
55 | kmeans = KMeans(
56 | n_clusters=n_clusters, verbose=verbose, n_init="auto"
57 | ).fit(features)
58 | LOG.info(f"Clustering took {t.elapsed:.2f} seconds")
59 |
60 | x = {
61 | "n_features_in_": kmeans.n_features_in_,
62 | "_n_threads": kmeans._n_threads,
63 | "cluster_centers_": kmeans.cluster_centers_,
64 | }
65 | return x
66 | else:
67 | # minibatch partial fit
68 | paths = list(input_dir.rglob("*.data.pt"))
69 | if len(paths) == 0:
70 | raise ValueError(f"No features found in {input_dir}")
71 | LOG.info(f"Found {len(paths)} features in {input_dir}")
72 | n_batches = math.ceil(len(paths) / batch_size)
73 | LOG.info(f"Splitting into {n_batches} batches")
74 | with timer() as t:
75 | kmeans = MiniBatchKMeans(
76 | n_clusters=n_clusters,
77 | verbose=verbose,
78 | batch_size=batch_size,
79 | max_iter=80,
80 | n_init="auto",
81 | )
82 | for i in range(0, len(paths), batch_size):
83 | LOG.info(
84 | f"Processing batch {i//batch_size+1}/{n_batches} for speaker {input_dir.stem}"
85 | )
86 | features = []
87 | for path in paths[i : i + batch_size]:
88 | with path.open("rb") as f:
89 | features.append(
90 | torch.load(f, weights_only=True)["content"]
91 | .squeeze(0)
92 | .numpy()
93 | .T
94 | )
95 | features = np.concatenate(features, axis=0).astype(np.float32)
96 | kmeans.partial_fit(features)
97 | LOG.info(f"Clustering took {t.elapsed:.2f} seconds")
98 |
99 | x = {
100 | "n_features_in_": kmeans.n_features_in_,
101 | "_n_threads": kmeans._n_threads,
102 | "cluster_centers_": kmeans.cluster_centers_,
103 | }
104 | return x
105 |
106 |
107 | def main(
108 | input_dir: Path | str,
109 | output_path: Path | str,
110 | n_clusters: int = 10000,
111 | use_minibatch: bool = True,
112 | batch_size: int = 4096,
113 | partial_fit: bool = False,
114 | verbose: bool = False,
115 | ) -> None:
116 | input_dir = Path(input_dir)
117 | output_path = Path(output_path)
118 |
119 |     if partial_fit and not use_minibatch:
120 |         raise ValueError("partial_fit requires use_minibatch")
121 |
122 | def train_cluster_(input_path: Path, **kwargs: Any) -> tuple[str, dict]:
123 | return input_path.stem, train_cluster(input_path, **kwargs)
124 |
125 | with tqdm_joblib(desc="Training clusters", total=len(list(input_dir.iterdir()))):
126 | parallel_result = Parallel(n_jobs=-1)(
127 | delayed(train_cluster_)(
128 | speaker_name,
129 | n_clusters=n_clusters,
130 | use_minibatch=use_minibatch,
131 | batch_size=batch_size,
132 | partial_fit=partial_fit,
133 | verbose=verbose,
134 | )
135 | for speaker_name in input_dir.iterdir()
136 | )
137 | assert parallel_result is not None
138 | checkpoint = dict(parallel_result)
139 | output_path.parent.mkdir(exist_ok=True, parents=True)
140 | with output_path.open("wb") as f:
141 | torch.save(checkpoint, f)
142 |
--------------------------------------------------------------------------------
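`main` expects `input_dir` to contain one subdirectory per speaker, each holding the `*.data.pt` files written during preprocessing, and fits one k-means model per speaker in parallel. A minimal invocation (paths illustrative):

from so_vits_svc_fork.cluster.train_cluster import main

main(
    input_dir="dataset/44k",  # one subdirectory per speaker
    output_path="logs/44k/kmeans.pt",
    n_clusters=10000,
)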
/src/so_vits_svc_fork/dataset.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from pathlib import Path
4 | from random import Random
5 | from typing import Sequence
6 |
7 | import torch
8 | import torch.nn as nn
9 | import torch.nn.functional as F
10 | from torch.utils.data import Dataset
11 |
12 | from .hparams import HParams
13 |
14 |
15 | class TextAudioDataset(Dataset):
16 | def __init__(self, hps: HParams, is_validation: bool = False):
17 | self.datapaths = [
18 | Path(x).parent / (Path(x).name + ".data.pt")
19 | for x in Path(
20 | hps.data.validation_files if is_validation else hps.data.training_files
21 | )
22 | .read_text("utf-8")
23 | .splitlines()
24 | ]
25 | self.hps = hps
26 | self.random = Random(hps.train.seed)
27 | self.random.shuffle(self.datapaths)
28 | self.max_spec_len = 800
29 |
30 | def __getitem__(self, index: int) -> dict[str, torch.Tensor]:
31 | with Path(self.datapaths[index]).open("rb") as f:
32 | data = torch.load(f, weights_only=True, map_location="cpu")
33 |
34 | # cut long data randomly
35 | spec_len = data["mel_spec"].shape[1]
36 | hop_len = self.hps.data.hop_length
37 | if spec_len > self.max_spec_len:
38 | start = self.random.randint(0, spec_len - self.max_spec_len)
39 | end = start + self.max_spec_len - 10
40 | for key in data.keys():
41 | if key == "audio":
42 | data[key] = data[key][:, start * hop_len : end * hop_len]
43 | elif key == "spk":
44 | continue
45 | else:
46 | data[key] = data[key][..., start:end]
47 | torch.cuda.empty_cache()
48 | return data
49 |
50 | def __len__(self) -> int:
51 | return len(self.datapaths)
52 |
53 |
54 | def _pad_stack(array: Sequence[torch.Tensor]) -> torch.Tensor:
55 | max_idx = torch.argmax(torch.tensor([x_.shape[-1] for x_ in array]))
56 | max_x = array[max_idx]
57 | x_padded = [
58 | F.pad(x_, (0, max_x.shape[-1] - x_.shape[-1]), mode="constant", value=0)
59 | for x_ in array
60 | ]
61 | return torch.stack(x_padded)
62 |
63 |
64 | class TextAudioCollate(nn.Module):
65 | def forward(
66 | self, batch: Sequence[dict[str, torch.Tensor]]
67 | ) -> tuple[torch.Tensor, ...]:
68 | batch = [b for b in batch if b is not None]
69 | batch = list(sorted(batch, key=lambda x: x["mel_spec"].shape[1], reverse=True))
70 | lengths = torch.tensor([b["mel_spec"].shape[1] for b in batch]).long()
71 | results = {}
72 | for key in batch[0].keys():
73 | if key not in ["spk"]:
74 | results[key] = _pad_stack([b[key] for b in batch]).cpu()
75 | else:
76 | results[key] = torch.tensor([[b[key]] for b in batch]).cpu()
77 |
78 | return (
79 | results["content"],
80 | results["f0"],
81 | results["spec"],
82 | results["mel_spec"],
83 | results["audio"],
84 | results["spk"],
85 | lengths,
86 | results["uv"],
87 | )
88 |
--------------------------------------------------------------------------------
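`TextAudioDataset` yields per-utterance dicts and `TextAudioCollate` pads them into the fixed tuple consumed by training. A sketch of wiring the two together, assuming `hps` is the `HParams` tree loaded from a training `config.json`:

from torch.utils.data import DataLoader

from so_vits_svc_fork.dataset import TextAudioCollate, TextAudioDataset

# hps: HParams loaded from configs/44k/config.json (see hparams.py); assumed here.
dataset = TextAudioDataset(hps, is_validation=False)
loader = DataLoader(dataset, batch_size=8, collate_fn=TextAudioCollate())
content, f0, spec, mel_spec, audio, spk, lengths, uv = next(iter(loader))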
/src/so_vits_svc_fork/default_gui_presets.json:
--------------------------------------------------------------------------------
1 | {
2 | "Default VC (GPU, GTX 1060)": {
3 | "silence_threshold": -35.0,
4 | "transpose": 12.0,
5 | "auto_predict_f0": false,
6 | "f0_method": "dio",
7 | "cluster_infer_ratio": 0.0,
8 | "noise_scale": 0.4,
9 | "pad_seconds": 0.1,
10 | "chunk_seconds": 0.5,
11 | "absolute_thresh": true,
12 | "max_chunk_seconds": 40,
13 | "crossfade_seconds": 0.05,
14 | "block_seconds": 0.35,
15 | "additional_infer_before_seconds": 0.15,
16 | "additional_infer_after_seconds": 0.1,
17 | "realtime_algorithm": "1 (Divide constantly)",
18 | "passthrough_original": false,
19 | "use_gpu": true
20 | },
21 | "Default VC (CPU)": {
22 | "silence_threshold": -35.0,
23 | "transpose": 12.0,
24 | "auto_predict_f0": false,
25 | "f0_method": "dio",
26 | "cluster_infer_ratio": 0.0,
27 | "noise_scale": 0.4,
28 | "pad_seconds": 0.1,
29 | "chunk_seconds": 0.5,
30 | "absolute_thresh": true,
31 | "max_chunk_seconds": 40,
32 | "crossfade_seconds": 0.05,
33 | "block_seconds": 1.5,
34 | "additional_infer_before_seconds": 0.01,
35 | "additional_infer_after_seconds": 0.01,
36 | "realtime_algorithm": "1 (Divide constantly)",
37 | "passthrough_original": false,
38 | "use_gpu": false
39 | },
40 | "Default VC (Mobile CPU)": {
41 | "silence_threshold": -35.0,
42 | "transpose": 12.0,
43 | "auto_predict_f0": false,
44 | "f0_method": "dio",
45 | "cluster_infer_ratio": 0.0,
46 | "noise_scale": 0.4,
47 | "pad_seconds": 0.1,
48 | "chunk_seconds": 0.5,
49 | "absolute_thresh": true,
50 | "max_chunk_seconds": 40,
51 | "crossfade_seconds": 0.05,
52 | "block_seconds": 2.5,
53 | "additional_infer_before_seconds": 0.01,
54 | "additional_infer_after_seconds": 0.01,
55 | "realtime_algorithm": "1 (Divide constantly)",
56 | "passthrough_original": false,
57 | "use_gpu": false
58 | },
59 | "Default VC (Crooning)": {
60 | "silence_threshold": -35.0,
61 | "transpose": 12.0,
62 | "auto_predict_f0": false,
63 | "f0_method": "dio",
64 | "cluster_infer_ratio": 0.0,
65 | "noise_scale": 0.4,
66 | "pad_seconds": 0.1,
67 | "chunk_seconds": 0.5,
68 | "absolute_thresh": true,
69 | "max_chunk_seconds": 40,
70 | "crossfade_seconds": 0.04,
71 | "block_seconds": 0.15,
72 | "additional_infer_before_seconds": 0.05,
73 | "additional_infer_after_seconds": 0.05,
74 | "realtime_algorithm": "1 (Divide constantly)",
75 | "passthrough_original": false,
76 | "use_gpu": true
77 | },
78 | "Default File": {
79 | "silence_threshold": -35.0,
80 | "transpose": 0.0,
81 | "auto_predict_f0": true,
82 | "f0_method": "crepe",
83 | "cluster_infer_ratio": 0.0,
84 | "noise_scale": 0.4,
85 | "pad_seconds": 0.1,
86 | "chunk_seconds": 0.5,
87 | "absolute_thresh": true,
88 | "max_chunk_seconds": 40,
89 | "auto_play": true,
90 | "passthrough_original": false
91 | }
92 | }
93 |
--------------------------------------------------------------------------------
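The GUI ships these presets inside the package. One way to inspect them programmatically (a sketch using importlib.resources, not necessarily how gui.py itself loads them):

import json
from importlib.resources import files

presets = json.loads(
    files("so_vits_svc_fork").joinpath("default_gui_presets.json").read_text()
)
print(list(presets))  # "Default VC (GPU, GTX 1060)", ..., "Default File"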
/src/so_vits_svc_fork/f0.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from logging import getLogger
4 | from typing import Any, Literal
5 |
6 | import numpy as np
7 | import torch
8 | import torchcrepe
9 | from cm_time import timer
10 | from numpy import dtype, float32, ndarray
11 | from torch import FloatTensor, Tensor
12 |
13 | from so_vits_svc_fork.utils import get_optimal_device
14 |
15 | LOG = getLogger(__name__)
16 |
17 |
18 | def normalize_f0(
19 | f0: FloatTensor, x_mask: FloatTensor, uv: FloatTensor, random_scale=True
20 | ) -> FloatTensor:
21 | # calculate means based on x_mask
22 | uv_sum = torch.sum(uv, dim=1, keepdim=True)
23 | uv_sum[uv_sum == 0] = 9999
24 | means = torch.sum(f0[:, 0, :] * uv, dim=1, keepdim=True) / uv_sum
25 |
26 | if random_scale:
27 | factor = torch.Tensor(f0.shape[0], 1).uniform_(0.8, 1.2).to(f0.device)
28 | else:
29 | factor = torch.ones(f0.shape[0], 1).to(f0.device)
30 | # normalize f0 based on means and factor
31 | f0_norm = (f0 - means.unsqueeze(-1)) * factor.unsqueeze(-1)
32 |     if torch.isnan(f0_norm).any():
33 |         raise RuntimeError("NaN values encountered while normalizing f0")
34 | return f0_norm * x_mask
35 |
36 |
37 | def interpolate_f0(
38 | f0: ndarray[Any, dtype[float32]]
39 | ) -> tuple[ndarray[Any, dtype[float32]], ndarray[Any, dtype[float32]]]:
40 | data = np.reshape(f0, (f0.size, 1))
41 |
42 | vuv_vector = np.zeros((data.size, 1), dtype=np.float32)
43 | vuv_vector[data > 0.0] = 1.0
44 | vuv_vector[data <= 0.0] = 0.0
45 |
46 | ip_data = data
47 |
48 | frame_number = data.size
49 | last_value = 0.0
50 | for i in range(frame_number):
51 | if data[i] <= 0.0:
52 | j = i + 1
53 | for j in range(i + 1, frame_number):
54 | if data[j] > 0.0:
55 | break
56 | if j < frame_number - 1:
57 | if last_value > 0.0:
58 | step = (data[j] - data[i - 1]) / float(j - i)
59 | for k in range(i, j):
60 | ip_data[k] = data[i - 1] + step * (k - i + 1)
61 | else:
62 | for k in range(i, j):
63 | ip_data[k] = data[j]
64 | else:
65 | for k in range(i, frame_number):
66 | ip_data[k] = last_value
67 | else:
68 | ip_data[i] = data[i]
69 | last_value = data[i]
70 |
71 | return ip_data[:, 0], vuv_vector[:, 0]
72 |
73 |
74 | def compute_f0_parselmouth(
75 | wav_numpy: ndarray[Any, dtype[float32]],
76 | p_len: None | int = None,
77 | sampling_rate: int = 44100,
78 | hop_length: int = 512,
79 | ):
80 | import parselmouth
81 |
82 | x = wav_numpy
83 | if p_len is None:
84 | p_len = x.shape[0] // hop_length
85 | else:
86 | assert abs(p_len - x.shape[0] // hop_length) < 4, "pad length error"
87 | time_step = hop_length / sampling_rate * 1000
88 | f0_min = 50
89 | f0_max = 1100
90 | f0 = (
91 | parselmouth.Sound(x, sampling_rate)
92 | .to_pitch_ac(
93 | time_step=time_step / 1000,
94 | voicing_threshold=0.6,
95 | pitch_floor=f0_min,
96 | pitch_ceiling=f0_max,
97 | )
98 | .selected_array["frequency"]
99 | )
100 |
101 | pad_size = (p_len - len(f0) + 1) // 2
102 | if pad_size > 0 or p_len - len(f0) - pad_size > 0:
103 | f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant")
104 | return f0
105 |
106 |
107 | def _resize_f0(
108 | x: ndarray[Any, dtype[float32]], target_len: int
109 | ) -> ndarray[Any, dtype[float32]]:
110 | source = np.array(x)
111 | source[source < 0.001] = np.nan
112 | target = np.interp(
113 | np.arange(0, len(source) * target_len, len(source)) / target_len,
114 | np.arange(0, len(source)),
115 | source,
116 | )
117 | res = np.nan_to_num(target)
118 | return res
119 |
120 |
121 | def compute_f0_pyworld(
122 | wav_numpy: ndarray[Any, dtype[float32]],
123 | p_len: None | int = None,
124 | sampling_rate: int = 44100,
125 | hop_length: int = 512,
126 | type_: Literal["dio", "harvest"] = "dio",
127 | ):
128 | import pyworld
129 |
130 | if p_len is None:
131 | p_len = wav_numpy.shape[0] // hop_length
132 | if type_ == "dio":
133 | f0, t = pyworld.dio(
134 | wav_numpy.astype(np.double),
135 | fs=sampling_rate,
136 | f0_ceil=f0_max,
137 | f0_floor=f0_min,
138 | frame_period=1000 * hop_length / sampling_rate,
139 | )
140 | elif type_ == "harvest":
141 | f0, t = pyworld.harvest(
142 | wav_numpy.astype(np.double),
143 | fs=sampling_rate,
144 | f0_ceil=f0_max,
145 | f0_floor=f0_min,
146 | frame_period=1000 * hop_length / sampling_rate,
147 | )
148 | f0 = pyworld.stonemask(wav_numpy.astype(np.double), f0, t, sampling_rate)
149 | for index, pitch in enumerate(f0):
150 | f0[index] = round(pitch, 1)
151 | return _resize_f0(f0, p_len)
152 |
153 |
154 | def compute_f0_crepe(
155 | wav_numpy: ndarray[Any, dtype[float32]],
156 | p_len: None | int = None,
157 | sampling_rate: int = 44100,
158 | hop_length: int = 512,
159 | device: str | torch.device = get_optimal_device(),
160 | model: Literal["full", "tiny"] = "full",
161 | ):
162 | audio = torch.from_numpy(wav_numpy).to(device, copy=True)
163 | audio = torch.unsqueeze(audio, dim=0)
164 |
165 | if audio.ndim == 2 and audio.shape[0] > 1:
166 | audio = torch.mean(audio, dim=0, keepdim=True).detach()
167 | # (T) -> (1, T)
168 | audio = audio.detach()
169 |
170 | pitch: Tensor = torchcrepe.predict(
171 | audio,
172 | sampling_rate,
173 | hop_length,
174 | f0_min,
175 | f0_max,
176 | model,
177 | batch_size=hop_length * 2,
178 | device=device,
179 | pad=True,
180 | )
181 |
182 | f0 = pitch.squeeze(0).cpu().float().numpy()
183 | p_len = p_len or wav_numpy.shape[0] // hop_length
184 | f0 = _resize_f0(f0, p_len)
185 | return f0
186 |
187 |
188 | def compute_f0(
189 | wav_numpy: ndarray[Any, dtype[float32]],
190 | p_len: None | int = None,
191 | sampling_rate: int = 44100,
192 | hop_length: int = 512,
193 | method: Literal["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"] = "dio",
194 | **kwargs,
195 | ):
196 | with timer() as t:
197 | wav_numpy = wav_numpy.astype(np.float32)
198 | wav_numpy /= np.quantile(np.abs(wav_numpy), 0.999)
199 | if method in ["dio", "harvest"]:
200 | f0 = compute_f0_pyworld(wav_numpy, p_len, sampling_rate, hop_length, method)
201 | elif method == "crepe":
202 | f0 = compute_f0_crepe(wav_numpy, p_len, sampling_rate, hop_length, **kwargs)
203 | elif method == "crepe-tiny":
204 | f0 = compute_f0_crepe(
205 | wav_numpy, p_len, sampling_rate, hop_length, model="tiny", **kwargs
206 | )
207 | elif method == "parselmouth":
208 | f0 = compute_f0_parselmouth(wav_numpy, p_len, sampling_rate, hop_length)
209 | else:
210 | raise ValueError(
211 | "type must be dio, crepe, crepe-tiny, harvest or parselmouth"
212 | )
213 | rtf = t.elapsed / (len(wav_numpy) / sampling_rate)
214 | LOG.info(f"F0 inference time: {t.elapsed:.3f}s, RTF: {rtf:.3f}")
215 | return f0
216 |
217 |
218 | def f0_to_coarse(f0: torch.Tensor | float):
219 | is_torch = isinstance(f0, torch.Tensor)
220 | f0_mel = 1127 * (1 + f0 / 700).log() if is_torch else 1127 * np.log(1 + f0 / 700)
221 | f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * (f0_bin - 2) / (
222 | f0_mel_max - f0_mel_min
223 | ) + 1
224 |
225 | f0_mel[f0_mel <= 1] = 1
226 | f0_mel[f0_mel > f0_bin - 1] = f0_bin - 1
227 |     f0_coarse = (f0_mel + 0.5).long() if is_torch else np.rint(f0_mel).astype(int)  # np.int was removed in NumPy 1.24
228 | assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, (
229 | f0_coarse.max(),
230 | f0_coarse.min(),
231 | )
232 | return f0_coarse
233 |
234 |
235 | f0_bin = 256
236 | f0_max = 1100.0
237 | f0_min = 50.0
238 | f0_mel_min = 1127 * np.log(1 + f0_min / 700)
239 | f0_mel_max = 1127 * np.log(1 + f0_max / 700)
240 |
--------------------------------------------------------------------------------
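A minimal end-to-end use of the extractors above: load audio at the project's 44.1 kHz rate, estimate F0, then quantize it to the 256-bin coarse representation (the file name is illustrative):

import librosa
import torch

from so_vits_svc_fork.f0 import compute_f0, f0_to_coarse

audio, sr = librosa.load("input.wav", sr=44100)
f0 = compute_f0(audio, sampling_rate=sr, hop_length=512, method="dio")
coarse = f0_to_coarse(torch.from_numpy(f0))  # integer bins in 1..255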
/src/so_vits_svc_fork/hparams.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from typing import Any
4 |
5 |
6 | class HParams:
7 | def __init__(self, **kwargs: Any) -> None:
8 | for k, v in kwargs.items():
9 | if type(v) == dict: # noqa
10 | v = HParams(**v)
11 | self[k] = v
12 |
13 | def keys(self):
14 | return self.__dict__.keys()
15 |
16 | def items(self):
17 | return self.__dict__.items()
18 |
19 | def values(self):
20 | return self.__dict__.values()
21 |
22 | def get(self, key: str, default: Any = None):
23 | return self.__dict__.get(key, default)
24 |
25 | def __len__(self):
26 | return len(self.__dict__)
27 |
28 | def __getitem__(self, key):
29 | return getattr(self, key)
30 |
31 | def __setitem__(self, key, value):
32 | return setattr(self, key, value)
33 |
34 | def __contains__(self, key):
35 | return key in self.__dict__
36 |
37 | def __repr__(self):
38 | return self.__dict__.__repr__()
39 |
--------------------------------------------------------------------------------
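`HParams` recursively wraps nested dicts, so a parsed `config.json` supports both attribute and item access. A tiny sketch:

from so_vits_svc_fork.hparams import HParams

hps = HParams(train={"seed": 1234}, data={"hop_length": 512})
assert hps.train.seed == 1234
assert hps["data"].hop_length == 512
assert "train" in hps and len(hps) == 2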
/src/so_vits_svc_fork/inference/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/voicepaw/so-vits-svc-fork/0f015e32aada5cf7481f91bbe6758e574c9c5f39/src/so_vits_svc_fork/inference/__init__.py
--------------------------------------------------------------------------------
/src/so_vits_svc_fork/inference/main.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from logging import getLogger
4 | from pathlib import Path
5 | from typing import Literal, Sequence
6 |
7 | import librosa
8 | import numpy as np
9 | import soundfile
10 | import torch
11 | from cm_time import timer
12 | from tqdm import tqdm
13 |
14 | from so_vits_svc_fork.inference.core import RealtimeVC, RealtimeVC2, Svc
15 | from so_vits_svc_fork.utils import get_optimal_device
16 |
17 | LOG = getLogger(__name__)
18 |
19 |
20 | def infer(
21 | *,
22 | # paths
23 | input_path: Path | str | Sequence[Path | str],
24 | output_path: Path | str | Sequence[Path | str],
25 | model_path: Path | str,
26 | config_path: Path | str,
27 | recursive: bool = False,
28 | # svc config
29 | speaker: int | str,
30 | cluster_model_path: Path | str | None = None,
31 | transpose: int = 0,
32 | auto_predict_f0: bool = False,
33 | cluster_infer_ratio: float = 0,
34 | noise_scale: float = 0.4,
35 | f0_method: Literal["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"] = "dio",
36 | # slice config
37 | db_thresh: int = -40,
38 | pad_seconds: float = 0.5,
39 | chunk_seconds: float = 0.5,
40 | absolute_thresh: bool = False,
41 | max_chunk_seconds: float = 40,
42 | device: str | torch.device = get_optimal_device(),
43 | ):
44 | if isinstance(input_path, (str, Path)):
45 | input_path = [input_path]
46 | if isinstance(output_path, (str, Path)):
47 | output_path = [output_path]
48 | if len(input_path) != len(output_path):
49 | raise ValueError(
50 | f"input_path and output_path must have same length, but got {len(input_path)} and {len(output_path)}"
51 | )
52 |
53 | model_path = Path(model_path)
54 | config_path = Path(config_path)
55 | output_path = [Path(p) for p in output_path]
56 | input_path = [Path(p) for p in input_path]
57 | output_paths = []
58 | input_paths = []
59 |
60 | for input_path, output_path in zip(input_path, output_path):
61 | if input_path.is_dir():
62 | if not recursive:
63 | raise ValueError(
64 | f"input_path is a directory, but recursive is False: {input_path}"
65 | )
66 |             new_input_paths = list(input_path.rglob("*.*"))
67 |             input_paths.extend(new_input_paths)
68 |             output_paths.extend(
69 |                 output_path / p.relative_to(input_path) for p in new_input_paths)
70 | continue
71 | input_paths.append(input_path)
72 | output_paths.append(output_path)
73 |
74 | cluster_model_path = Path(cluster_model_path) if cluster_model_path else None
75 | svc_model = Svc(
76 | net_g_path=model_path.as_posix(),
77 | config_path=config_path.as_posix(),
78 | cluster_model_path=(
79 | cluster_model_path.as_posix() if cluster_model_path else None
80 | ),
81 | device=device,
82 | )
83 |
84 | try:
85 | pbar = tqdm(list(zip(input_paths, output_paths)), disable=len(input_paths) == 1)
86 | for input_path, output_path in pbar:
87 | pbar.set_description(f"{input_path}")
88 | try:
89 | audio, _ = librosa.load(str(input_path), sr=svc_model.target_sample)
90 | except Exception as e:
91 | LOG.error(f"Failed to load {input_path}")
92 | LOG.exception(e)
93 | continue
94 | output_path.parent.mkdir(parents=True, exist_ok=True)
95 | audio = svc_model.infer_silence(
96 | audio.astype(np.float32),
97 | speaker=speaker,
98 | transpose=transpose,
99 | auto_predict_f0=auto_predict_f0,
100 | cluster_infer_ratio=cluster_infer_ratio,
101 | noise_scale=noise_scale,
102 | f0_method=f0_method,
103 | db_thresh=db_thresh,
104 | pad_seconds=pad_seconds,
105 | chunk_seconds=chunk_seconds,
106 | absolute_thresh=absolute_thresh,
107 | max_chunk_seconds=max_chunk_seconds,
108 | )
109 | soundfile.write(str(output_path), audio, svc_model.target_sample)
110 | finally:
111 | del svc_model
112 | torch.cuda.empty_cache()
113 |
114 |
115 | def realtime(
116 | *,
117 | # paths
118 | model_path: Path | str,
119 | config_path: Path | str,
120 | # svc config
121 | speaker: str,
122 | cluster_model_path: Path | str | None = None,
123 | transpose: int = 0,
124 | auto_predict_f0: bool = False,
125 | cluster_infer_ratio: float = 0,
126 | noise_scale: float = 0.4,
127 | f0_method: Literal["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"] = "dio",
128 | # slice config
129 | db_thresh: int = -40,
130 | pad_seconds: float = 0.5,
131 | chunk_seconds: float = 0.5,
132 | # realtime config
133 | crossfade_seconds: float = 0.05,
134 | additional_infer_before_seconds: float = 0.2,
135 | additional_infer_after_seconds: float = 0.1,
136 | block_seconds: float = 0.5,
137 | version: int = 2,
138 | input_device: int | str | None = None,
139 | output_device: int | str | None = None,
140 | device: str | torch.device = get_optimal_device(),
141 | passthrough_original: bool = False,
142 | ):
143 | import sounddevice as sd
144 |
145 | model_path = Path(model_path)
146 | config_path = Path(config_path)
147 | cluster_model_path = Path(cluster_model_path) if cluster_model_path else None
148 | svc_model = Svc(
149 | net_g_path=model_path.as_posix(),
150 | config_path=config_path.as_posix(),
151 | cluster_model_path=(
152 | cluster_model_path.as_posix() if cluster_model_path else None
153 | ),
154 | device=device,
155 | )
156 |
157 | LOG.info("Creating realtime model...")
158 | if version == 1:
159 | model = RealtimeVC(
160 | svc_model=svc_model,
161 | crossfade_len=int(crossfade_seconds * svc_model.target_sample),
162 | additional_infer_before_len=int(
163 | additional_infer_before_seconds * svc_model.target_sample
164 | ),
165 | additional_infer_after_len=int(
166 | additional_infer_after_seconds * svc_model.target_sample
167 | ),
168 | )
169 | else:
170 | model = RealtimeVC2(
171 | svc_model=svc_model,
172 | )
173 |
174 | # LOG all device info
175 | devices = sd.query_devices()
176 | LOG.info(f"Device: {devices}")
177 | if isinstance(input_device, str):
178 | input_device_candidates = [
179 | i for i, d in enumerate(devices) if d["name"] == input_device
180 | ]
181 | if len(input_device_candidates) == 0:
182 | LOG.warning(f"Input device {input_device} not found, using default")
183 | input_device = None
184 | else:
185 | input_device = input_device_candidates[0]
186 | if isinstance(output_device, str):
187 | output_device_candidates = [
188 | i for i, d in enumerate(devices) if d["name"] == output_device
189 | ]
190 | if len(output_device_candidates) == 0:
191 | LOG.warning(f"Output device {output_device} not found, using default")
192 | output_device = None
193 | else:
194 | output_device = output_device_candidates[0]
195 | if input_device is None or input_device >= len(devices):
196 | input_device = sd.default.device[0]
197 | if output_device is None or output_device >= len(devices):
198 | output_device = sd.default.device[1]
199 | LOG.info(
200 | f"Input Device: {devices[input_device]['name']}, Output Device: {devices[output_device]['name']}"
201 | )
202 |
203 |     # The RTF is significantly high only for the first inference,
204 |     # and there is no better way to warm up the model than to run a dummy inference
205 |     # (the model behaves identically in the first and later inferences),
206 |     # so we run a dummy inference with 1 second of silence to warm up the model.
207 | LOG.info("Warming up the model...")
208 | svc_model.infer(
209 | speaker=speaker,
210 | transpose=transpose,
211 | auto_predict_f0=auto_predict_f0,
212 | cluster_infer_ratio=cluster_infer_ratio,
213 | noise_scale=noise_scale,
214 | f0_method=f0_method,
215 | audio=np.zeros(svc_model.target_sample, dtype=np.float32),
216 | )
217 |
218 | def callback(
219 | indata: np.ndarray,
220 | outdata: np.ndarray,
221 | frames: int,
222 | time: int,
223 | status: sd.CallbackFlags,
224 | ) -> None:
225 | LOG.debug(
226 | f"Frames: {frames}, Status: {status}, Shape: {indata.shape}, Time: {time}"
227 | )
228 |
229 | kwargs = dict(
230 | input_audio=indata.mean(axis=1).astype(np.float32),
231 | # svc config
232 | speaker=speaker,
233 | transpose=transpose,
234 | auto_predict_f0=auto_predict_f0,
235 | cluster_infer_ratio=cluster_infer_ratio,
236 | noise_scale=noise_scale,
237 | f0_method=f0_method,
238 | # slice config
239 | db_thresh=db_thresh,
240 | # pad_seconds=pad_seconds,
241 | chunk_seconds=chunk_seconds,
242 | )
243 | if version == 1:
244 | kwargs["pad_seconds"] = pad_seconds
245 | with timer() as t:
246 | inference = model.process(
247 | **kwargs,
248 | ).reshape(-1, 1)
249 | if passthrough_original:
250 | outdata[:] = (indata + inference) / 2
251 | else:
252 | outdata[:] = inference
253 | rtf = t.elapsed / block_seconds
254 | LOG.info(f"Realtime inference time: {t.elapsed:.3f}s, RTF: {rtf:.3f}")
255 | if rtf > 1:
256 | LOG.warning("RTF is too high, consider increasing block_seconds")
257 |
258 | try:
259 | with sd.Stream(
260 | device=(input_device, output_device),
261 | channels=1,
262 | callback=callback,
263 | samplerate=svc_model.target_sample,
264 | blocksize=int(block_seconds * svc_model.target_sample),
265 | latency="low",
266 | ) as stream:
267 | LOG.info(f"Latency: {stream.latency}")
268 | while True:
269 | sd.sleep(1000)
270 | finally:
271 | # del model, svc_model
272 | torch.cuda.empty_cache()
273 |
--------------------------------------------------------------------------------
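Like `infer`, `realtime` can be driven directly from Python rather than through the CLI. A minimal sketch with placeholder paths; it opens an audio stream on the default devices and converts until interrupted:

from so_vits_svc_fork.inference.main import realtime

realtime(
    model_path="logs/44k/G_0.pth",
    config_path="logs/44k/config.json",
    speaker="kiritan",
    block_seconds=0.5,
)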
/src/so_vits_svc_fork/logger.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | from logging import DEBUG, INFO, StreamHandler, basicConfig, captureWarnings, getLogger
4 | from pathlib import Path
5 |
6 | from rich.logging import RichHandler
7 |
8 | LOGGER_INIT = False
9 |
10 |
11 | def init_logger() -> None:
12 | global LOGGER_INIT
13 | if LOGGER_INIT:
14 | return
15 |
16 | IS_TEST = "test" in Path.cwd().stem
17 | package_name = sys.modules[__name__].__package__
18 | basicConfig(
19 | level=INFO,
20 | format="%(asctime)s %(message)s",
21 | datefmt="[%X]",
22 | handlers=[
23 | StreamHandler() if is_notebook() else RichHandler(),
24 | # FileHandler(f"{package_name}.log"),
25 | ],
26 | )
27 | if IS_TEST:
28 | getLogger(package_name).setLevel(DEBUG)
29 | captureWarnings(True)
30 | LOGGER_INIT = True
31 |
32 |
33 | def is_notebook():
34 | try:
35 | from IPython import get_ipython
36 |
37 | if "IPKernelApp" not in get_ipython().config: # pragma: no cover
38 | raise ImportError("console")
39 | return False
40 | if "VSCODE_PID" in os.environ: # pragma: no cover
41 | raise ImportError("vscode")
42 | return False
43 | except Exception:
44 | return False
45 | else: # pragma: no cover
46 | return True
47 |
--------------------------------------------------------------------------------
/src/so_vits_svc_fork/modules/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/voicepaw/so-vits-svc-fork/0f015e32aada5cf7481f91bbe6758e574c9c5f39/src/so_vits_svc_fork/modules/__init__.py
--------------------------------------------------------------------------------
/src/so_vits_svc_fork/modules/commons.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import torch
4 | import torch.nn.functional as F
5 | from torch import Tensor
6 |
7 |
8 | def slice_segments(x: Tensor, starts: Tensor, length: int) -> Tensor:
9 | if length is None:
10 | return x
11 | length = min(length, x.size(-1))
12 | x_slice = torch.zeros((x.size()[:-1] + (length,)), dtype=x.dtype, device=x.device)
13 | ends = starts + length
14 | for i, (start, end) in enumerate(zip(starts, ends)):
15 | # LOG.debug(i, start, end, x.size(), x[i, ..., start:end].size(), x_slice.size())
16 | # x_slice[i, ...] = x[i, ..., start:end] need to pad
17 | # x_slice[i, ..., :end - start] = x[i, ..., start:end] this does not work
18 | x_slice[i, ...] = F.pad(x[i, ..., start:end], (0, max(0, length - x.size(-1))))
19 | return x_slice
20 |
21 |
22 | def rand_slice_segments_with_pitch(
23 | x: Tensor, f0: Tensor, x_lengths: Tensor | int | None, segment_size: int | None
24 | ):
25 | if segment_size is None:
26 | return x, f0, torch.arange(x.size(0), device=x.device)
27 | if x_lengths is None:
28 | x_lengths = x.size(-1) * torch.ones(
29 | x.size(0), dtype=torch.long, device=x.device
30 | )
31 | # slice_starts = (torch.rand(z.size(0), device=z.device) * (z_lengths - segment_size)).long()
32 | slice_starts = (
33 | torch.rand(x.size(0), device=x.device)
34 | * torch.max(
35 | x_lengths - segment_size, torch.zeros_like(x_lengths, device=x.device)
36 | )
37 | ).long()
38 | z_slice = slice_segments(x, slice_starts, segment_size)
39 | f0_slice = slice_segments(f0, slice_starts, segment_size)
40 | return z_slice, f0_slice, slice_starts
41 |
42 |
43 | def slice_2d_segments(x: Tensor, starts: Tensor, length: int) -> Tensor:
44 | batch_size, num_features, seq_len = x.shape
45 | ends = starts + length
46 | idxs = (
47 | torch.arange(seq_len, device=x.device)
48 | .unsqueeze(0)
49 | .unsqueeze(1)
50 | .repeat(batch_size, num_features, 1)
51 | )
52 | mask = (idxs >= starts.unsqueeze(-1).unsqueeze(-1)) & (
53 | idxs < ends.unsqueeze(-1).unsqueeze(-1)
54 | )
55 | return x[mask].reshape(batch_size, num_features, length)
56 |
57 |
58 | def slice_1d_segments(x: Tensor, starts: Tensor, length: int) -> Tensor:
59 | batch_size, seq_len = x.shape
60 | ends = starts + length
61 | idxs = torch.arange(seq_len, device=x.device).unsqueeze(0).repeat(batch_size, 1)
62 | mask = (idxs >= starts.unsqueeze(-1)) & (idxs < ends.unsqueeze(-1))
63 | return x[mask].reshape(batch_size, length)
64 |
65 |
66 | def _slice_segments_v3(x: Tensor, starts: Tensor, length: int) -> Tensor:
67 | shape = x.shape[:-1] + (length,)
68 | ends = starts + length
69 | idxs = torch.arange(x.shape[-1], device=x.device).unsqueeze(0).unsqueeze(0)
70 | unsqueeze_dims = len(shape) - len(
71 | x.shape
72 | ) # calculate number of dimensions to unsqueeze
73 | starts = starts.reshape(starts.shape + (1,) * unsqueeze_dims)
74 | ends = ends.reshape(ends.shape + (1,) * unsqueeze_dims)
75 | mask = (idxs >= starts) & (idxs < ends)
76 | return x[mask].reshape(shape)
77 |
78 |
79 | def init_weights(m, mean=0.0, std=0.01):
80 | classname = m.__class__.__name__
81 | if classname.find("Conv") != -1:
82 | m.weight.data.normal_(mean, std)
83 |
84 |
85 | def get_padding(kernel_size, dilation=1):
86 | return int((kernel_size * dilation - dilation) / 2)
87 |
88 |
89 | def convert_pad_shape(pad_shape):
90 |     pad_list = pad_shape[::-1]
91 |     pad_shape = [item for sublist in pad_list for item in sublist]
92 | return pad_shape
93 |
94 |
95 | def subsequent_mask(length):
96 | mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0)
97 | return mask
98 |
99 |
100 | @torch.jit.script
101 | def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
102 | n_channels_int = n_channels[0]
103 | in_act = input_a + input_b
104 | t_act = torch.tanh(in_act[:, :n_channels_int, :])
105 | s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
106 | acts = t_act * s_act
107 | return acts
108 |
109 |
110 | def sequence_mask(length, max_length=None):
111 | if max_length is None:
112 | max_length = length.max()
113 | x = torch.arange(max_length, dtype=length.dtype, device=length.device)
114 | return x.unsqueeze(0) < length.unsqueeze(1)
115 |
116 |
117 | def clip_grad_value_(parameters, clip_value, norm_type=2):
118 | if isinstance(parameters, torch.Tensor):
119 | parameters = [parameters]
120 | parameters = list(filter(lambda p: p.grad is not None, parameters))
121 | norm_type = float(norm_type)
122 | if clip_value is not None:
123 | clip_value = float(clip_value)
124 |
125 | total_norm = 0
126 | for p in parameters:
127 | param_norm = p.grad.data.norm(norm_type)
128 | total_norm += param_norm.item() ** norm_type
129 | if clip_value is not None:
130 | p.grad.data.clamp_(min=-clip_value, max=clip_value)
131 | total_norm = total_norm ** (1.0 / norm_type)
132 | return total_norm
133 |
--------------------------------------------------------------------------------
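`rand_slice_segments_with_pitch` is how training crops latents and F0 to a fixed `segment_size` with a shared random offset per batch element. A shape-level sketch:

import torch

from so_vits_svc_fork.modules.commons import rand_slice_segments_with_pitch

x = torch.randn(2, 192, 400)  # (batch, channels, frames)
f0 = torch.randn(2, 1, 400)
z_slice, f0_slice, starts = rand_slice_segments_with_pitch(
    x, f0, x_lengths=None, segment_size=100
)
assert z_slice.shape == (2, 192, 100) and f0_slice.shape == (2, 1, 100)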
/src/so_vits_svc_fork/modules/decoders/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/voicepaw/so-vits-svc-fork/0f015e32aada5cf7481f91bbe6758e574c9c5f39/src/so_vits_svc_fork/modules/decoders/__init__.py
--------------------------------------------------------------------------------
/src/so_vits_svc_fork/modules/decoders/f0.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch import nn
3 |
4 | from so_vits_svc_fork.modules import attentions as attentions
5 |
6 |
7 | class F0Decoder(nn.Module):
8 | def __init__(
9 | self,
10 | out_channels,
11 | hidden_channels,
12 | filter_channels,
13 | n_heads,
14 | n_layers,
15 | kernel_size,
16 | p_dropout,
17 | spk_channels=0,
18 | ):
19 | super().__init__()
20 | self.out_channels = out_channels
21 | self.hidden_channels = hidden_channels
22 | self.filter_channels = filter_channels
23 | self.n_heads = n_heads
24 | self.n_layers = n_layers
25 | self.kernel_size = kernel_size
26 | self.p_dropout = p_dropout
27 | self.spk_channels = spk_channels
28 |
29 | self.prenet = nn.Conv1d(hidden_channels, hidden_channels, 3, padding=1)
30 | self.decoder = attentions.FFT(
31 | hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
32 | )
33 | self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
34 | self.f0_prenet = nn.Conv1d(1, hidden_channels, 3, padding=1)
35 | self.cond = nn.Conv1d(spk_channels, hidden_channels, 1)
36 |
37 | def forward(self, x, norm_f0, x_mask, spk_emb=None):
38 | x = torch.detach(x)
39 | if spk_emb is not None:
40 | spk_emb = torch.detach(spk_emb)
41 | x = x + self.cond(spk_emb)
42 | x += self.f0_prenet(norm_f0)
43 | x = self.prenet(x) * x_mask
44 | x = self.decoder(x * x_mask, x_mask)
45 | x = self.proj(x) * x_mask
46 | return x
47 |
--------------------------------------------------------------------------------
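`F0Decoder` predicts an F0 track from content features, conditioned on a normalized-F0 prenet and an optional speaker embedding. A shape-level sketch; the channel sizes are illustrative rather than taken from a particular config, and it assumes the `attentions.FFT` block accepts `(x, x_mask)` as called above:

import torch

from so_vits_svc_fork.modules.decoders.f0 import F0Decoder

dec = F0Decoder(
    out_channels=1,
    hidden_channels=192,
    filter_channels=768,
    n_heads=2,
    n_layers=3,
    kernel_size=3,
    p_dropout=0.1,
    spk_channels=256,
)
x = torch.randn(1, 192, 100)  # content features
norm_f0 = torch.randn(1, 1, 100)  # normalized F0 (see f0.normalize_f0)
x_mask = torch.ones(1, 1, 100)
spk_emb = torch.randn(1, 256, 1)
pred_f0 = dec(x, norm_f0, x_mask, spk_emb=spk_emb)  # (1, 1, 100)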
/src/so_vits_svc_fork/modules/decoders/hifigan/__init__.py:
--------------------------------------------------------------------------------
1 | from ._models import NSFHifiGANGenerator
2 |
3 | __all__ = ["NSFHifiGANGenerator"]
4 |
--------------------------------------------------------------------------------
/src/so_vits_svc_fork/modules/decoders/hifigan/_models.py:
--------------------------------------------------------------------------------
1 | from logging import getLogger
2 |
3 | import numpy as np
4 | import torch
5 | import torch.nn as nn
6 | import torch.nn.functional as F
7 | from torch.nn import Conv1d, ConvTranspose1d
8 | from torch.nn.utils import remove_weight_norm, weight_norm
9 |
10 | from ...modules import ResBlock1, ResBlock2
11 | from ._utils import init_weights
12 |
13 | LOG = getLogger(__name__)
14 |
15 | LRELU_SLOPE = 0.1
16 |
17 |
18 | def padDiff(x):
19 | return F.pad(
20 | F.pad(x, (0, 0, -1, 1), "constant", 0) - x, (0, 0, 0, -1), "constant", 0
21 | )
22 |
23 |
24 | class SineGen(torch.nn.Module):
25 | """Definition of sine generator
26 | SineGen(samp_rate, harmonic_num = 0,
27 | sine_amp = 0.1, noise_std = 0.003,
28 | voiced_threshold = 0,
29 | flag_for_pulse=False)
30 | samp_rate: sampling rate in Hz
31 | harmonic_num: number of harmonic overtones (default 0)
32 |     sine_amp: amplitude of sine waveform (default 0.1)
33 |     noise_std: std of Gaussian noise (default 0.003)
34 |     voiced_threshold: F0 threshold for U/V classification (default 0)
35 |     flag_for_pulse: this SineGen is used inside PulseGen (default False)
36 | Note: when flag_for_pulse is True, the first time step of a voiced
37 | segment is always sin(np.pi) or cos(0)
38 | """
39 |
40 | def __init__(
41 | self,
42 | samp_rate,
43 | harmonic_num=0,
44 | sine_amp=0.1,
45 | noise_std=0.003,
46 | voiced_threshold=0,
47 | flag_for_pulse=False,
48 | ):
49 | super().__init__()
50 | self.sine_amp = sine_amp
51 | self.noise_std = noise_std
52 | self.harmonic_num = harmonic_num
53 | self.dim = self.harmonic_num + 1
54 | self.sampling_rate = samp_rate
55 | self.voiced_threshold = voiced_threshold
56 | self.flag_for_pulse = flag_for_pulse
57 |
58 | def _f02uv(self, f0):
59 | # generate uv signal
60 | uv = (f0 > self.voiced_threshold).type(torch.float32)
61 | return uv
62 |
63 | def _f02sine(self, f0_values):
64 | """f0_values: (batchsize, length, dim)
65 | where dim indicates fundamental tone and overtones
66 | """
67 | # convert to F0 in rad. The integer part n can be ignored
68 | # because 2 * np.pi * n doesn't affect phase
69 | rad_values = (f0_values / self.sampling_rate) % 1
70 |
71 | # initial phase noise (no noise for fundamental component)
72 | rand_ini = torch.rand(
73 | f0_values.shape[0], f0_values.shape[2], device=f0_values.device
74 | )
75 | rand_ini[:, 0] = 0
76 | rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
77 |
78 |         # instantaneous phase: sine[t] = sin(2 * pi * \sum_{i=1}^{t} rad)
79 | if not self.flag_for_pulse:
80 | # for normal case
81 |
82 | # To prevent torch.cumsum numerical overflow,
83 | # it is necessary to add -1 whenever \sum_k=1^n rad_value_k > 1.
84 | # Buffer tmp_over_one_idx indicates the time step to add -1.
85 | # This will not change F0 of sine because (x-1) * 2*pi = x * 2*pi
86 | tmp_over_one = torch.cumsum(rad_values, 1) % 1
87 | tmp_over_one_idx = (padDiff(tmp_over_one)) < 0
88 | cumsum_shift = torch.zeros_like(rad_values)
89 | cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
90 |
91 | sines = torch.sin(
92 | torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi
93 | )
94 | else:
95 | # If necessary, make sure that the first time step of every
96 | # voiced segments is sin(pi) or cos(0)
97 | # This is used for pulse-train generation
98 |
99 | # identify the last time step in unvoiced segments
100 | uv = self._f02uv(f0_values)
101 | uv_1 = torch.roll(uv, shifts=-1, dims=1)
102 | uv_1[:, -1, :] = 1
103 | u_loc = (uv < 1) * (uv_1 > 0)
104 |
105 |                 # get the instantaneous phase
106 | tmp_cumsum = torch.cumsum(rad_values, dim=1)
107 | # different batch needs to be processed differently
108 | for idx in range(f0_values.shape[0]):
109 | temp_sum = tmp_cumsum[idx, u_loc[idx, :, 0], :]
110 | temp_sum[1:, :] = temp_sum[1:, :] - temp_sum[0:-1, :]
111 | # stores the accumulation of i.phase within
112 | # each voiced segment
113 | tmp_cumsum[idx, :, :] = 0
114 | tmp_cumsum[idx, u_loc[idx, :, 0], :] = temp_sum
115 |
116 | # rad_values - tmp_cumsum: remove the accumulation of i.phase
117 | # within the previous voiced segment.
118 | i_phase = torch.cumsum(rad_values - tmp_cumsum, dim=1)
119 |
120 | # get the sines
121 | sines = torch.cos(i_phase * 2 * np.pi)
122 | return sines
123 |
124 | def forward(self, f0):
125 | """sine_tensor, uv = forward(f0)
126 | input F0: tensor(batchsize=1, length, dim=1)
127 | f0 for unvoiced steps should be 0
128 | output sine_tensor: tensor(batchsize=1, length, dim)
129 | output uv: tensor(batchsize=1, length, 1)
130 | """
131 | with torch.no_grad():
132 | # f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device)
133 | # fundamental component
134 | # fn = torch.multiply(
135 | # f0, torch.FloatTensor([[range(1, self.harmonic_num + 2)]]).to(f0.device)
136 | # )
137 | fn = torch.multiply(
138 | f0, torch.arange(1, self.harmonic_num + 2).to(f0.device).to(f0.dtype)
139 | )
140 |
141 | # generate sine waveforms
142 | sine_waves = self._f02sine(fn) * self.sine_amp
143 |
144 | # generate uv signal
145 | # uv = torch.ones(f0.shape)
146 | # uv = uv * (f0 > self.voiced_threshold)
147 | uv = self._f02uv(f0)
148 |
149 | # noise: for unvoiced should be similar to sine_amp
150 | # std = self.sine_amp/3 -> max value ~ self.sine_amp
151 | # . for voiced regions is self.noise_std
152 | noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
153 | noise = noise_amp * torch.randn_like(sine_waves)
154 |
155 | # first: set the unvoiced part to 0 by uv
156 | # then: additive noise
157 | sine_waves = sine_waves * uv + noise
158 | return sine_waves, uv, noise
159 |
160 |
161 | class SourceModuleHnNSF(torch.nn.Module):
162 | """SourceModule for hn-nsf
163 | SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,
164 | add_noise_std=0.003, voiced_threshold=0)
165 | sampling_rate: sampling_rate in Hz
166 | harmonic_num: number of harmonic above F0 (default: 0)
167 | sine_amp: amplitude of sine source signal (default: 0.1)
168 | add_noise_std: std of additive Gaussian noise (default: 0.003)
169 | note that amplitude of noise in unvoiced is decided
170 | by sine_amp
171 | voiced_threshold: threshold to set U/V given F0 (default: 0)
172 | Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
173 | F0_sampled (batchsize, length, 1)
174 | Sine_source (batchsize, length, 1)
175 | noise_source (batchsize, length, 1)
176 | uv (batchsize, length, 1)
177 | """
178 |
179 | def __init__(
180 | self,
181 | sampling_rate,
182 | harmonic_num=0,
183 | sine_amp=0.1,
184 | add_noise_std=0.003,
185 | voiced_threshold=0,
186 | ):
187 | super().__init__()
188 |
189 | self.sine_amp = sine_amp
190 | self.noise_std = add_noise_std
191 |
192 | # to produce sine waveforms
193 | self.l_sin_gen = SineGen(
194 | sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshold
195 | )
196 |
197 | # to merge source harmonics into a single excitation
198 | self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
199 | self.l_tanh = torch.nn.Tanh()
200 |
201 | def forward(self, x):
202 | """
203 | Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
204 | F0_sampled (batchsize, length, 1)
205 | Sine_source (batchsize, length, 1)
206 | noise_source (batchsize, length, 1)
207 | """
208 | # source for harmonic branch
209 | sine_wavs, uv, _ = self.l_sin_gen(x)
210 | sine_merge = self.l_tanh(self.l_linear(sine_wavs))
211 |
212 | # source for noise branch, in the same shape as uv
213 | noise = torch.randn_like(uv) * self.sine_amp / 3
214 | return sine_merge, noise, uv
215 |
216 |
217 | class NSFHifiGANGenerator(torch.nn.Module):
218 | def __init__(self, h):
219 | super().__init__()
220 | self.h = h
221 |
222 | self.num_kernels = len(h["resblock_kernel_sizes"])
223 | self.num_upsamples = len(h["upsample_rates"])
224 | self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(h["upsample_rates"]))
225 | self.m_source = SourceModuleHnNSF(
226 | sampling_rate=h["sampling_rate"], harmonic_num=8
227 | )
228 | self.noise_convs = nn.ModuleList()
229 | self.conv_pre = weight_norm(
230 | Conv1d(h["inter_channels"], h["upsample_initial_channel"], 7, 1, padding=3)
231 | )
232 | resblock = ResBlock1 if h["resblock"] == "1" else ResBlock2
233 | self.ups = nn.ModuleList()
234 | for i, (u, k) in enumerate(
235 | zip(h["upsample_rates"], h["upsample_kernel_sizes"])
236 | ):
237 | c_cur = h["upsample_initial_channel"] // (2 ** (i + 1))
238 | self.ups.append(
239 | weight_norm(
240 | ConvTranspose1d(
241 | h["upsample_initial_channel"] // (2**i),
242 | h["upsample_initial_channel"] // (2 ** (i + 1)),
243 | k,
244 | u,
245 | padding=(k - u) // 2,
246 | )
247 | )
248 | )
249 | if i + 1 < len(h["upsample_rates"]):
250 | stride_f0 = np.prod(h["upsample_rates"][i + 1 :])
251 | self.noise_convs.append(
252 | Conv1d(
253 | 1,
254 | c_cur,
255 | kernel_size=stride_f0 * 2,
256 | stride=stride_f0,
257 | padding=stride_f0 // 2,
258 | )
259 | )
260 | else:
261 | self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1))
262 | self.resblocks = nn.ModuleList()
263 | for i in range(len(self.ups)):
264 | ch = h["upsample_initial_channel"] // (2 ** (i + 1))
265 | for j, (k, d) in enumerate(
266 | zip(h["resblock_kernel_sizes"], h["resblock_dilation_sizes"])
267 | ):
268 | self.resblocks.append(resblock(ch, k, d))
269 |
270 | self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3))
271 | self.ups.apply(init_weights)
272 | self.conv_post.apply(init_weights)
273 | self.cond = nn.Conv1d(h["gin_channels"], h["upsample_initial_channel"], 1)
274 |
275 | def forward(self, x, f0, g=None):
276 | # LOG.info(1,x.shape,f0.shape,f0[:, None].shape)
277 | f0 = self.f0_upsamp(f0[:, None]).transpose(1, 2) # bs,n,t
278 | # LOG.info(2,f0.shape)
279 | har_source, noi_source, uv = self.m_source(f0)
280 | har_source = har_source.transpose(1, 2)
281 | x = self.conv_pre(x)
282 | x = x + self.cond(g)
283 | # LOG.info(124,x.shape,har_source.shape)
284 | for i in range(self.num_upsamples):
285 | x = F.leaky_relu(x, LRELU_SLOPE)
286 | # LOG.info(3,x.shape)
287 | x = self.ups[i](x)
288 | x_source = self.noise_convs[i](har_source)
289 | # LOG.info(4,x_source.shape,har_source.shape,x.shape)
290 | x = x + x_source
291 | xs = None
292 | for j in range(self.num_kernels):
293 | if xs is None:
294 | xs = self.resblocks[i * self.num_kernels + j](x)
295 | else:
296 | xs += self.resblocks[i * self.num_kernels + j](x)
297 | x = xs / self.num_kernels
298 | x = F.leaky_relu(x)
299 | x = self.conv_post(x)
300 | x = torch.tanh(x)
301 |
302 | return x
303 |
304 | def remove_weight_norm(self):
305 | LOG.info("Removing weight norm...")
306 | for l in self.ups:
307 | remove_weight_norm(l)
308 | for l in self.resblocks:
309 | l.remove_weight_norm()
310 | remove_weight_norm(self.conv_pre)
311 | remove_weight_norm(self.conv_post)
312 |
--------------------------------------------------------------------------------
/src/so_vits_svc_fork/modules/decoders/hifigan/_utils.py:
--------------------------------------------------------------------------------
1 | from logging import getLogger
2 |
3 | # matplotlib.use("Agg")
4 |
5 | LOG = getLogger(__name__)
6 |
7 |
8 | def init_weights(m, mean=0.0, std=0.01):
9 | classname = m.__class__.__name__
10 | if classname.find("Conv") != -1:
11 | m.weight.data.normal_(mean, std)
12 |
13 |
14 | def get_padding(kernel_size, dilation=1):
15 | return int((kernel_size * dilation - dilation) / 2)
16 |
--------------------------------------------------------------------------------
/src/so_vits_svc_fork/modules/decoders/mb_istft/__init__.py:
--------------------------------------------------------------------------------
1 | from ._generators import (
2 | Multiband_iSTFT_Generator,
3 | Multistream_iSTFT_Generator,
4 | iSTFT_Generator,
5 | )
6 | from ._loss import subband_stft_loss
7 | from ._pqmf import PQMF
8 |
9 | __all__ = [
10 | "subband_stft_loss",
11 | "PQMF",
12 | "iSTFT_Generator",
13 | "Multiband_iSTFT_Generator",
14 | "Multistream_iSTFT_Generator",
15 | ]
16 |
--------------------------------------------------------------------------------
/src/so_vits_svc_fork/modules/decoders/mb_istft/_loss.py:
--------------------------------------------------------------------------------
1 | from ._stft_loss import MultiResolutionSTFTLoss
2 |
3 |
4 | def subband_stft_loss(h, y_mb, y_hat_mb):
5 | sub_stft_loss = MultiResolutionSTFTLoss(
6 | h.train.fft_sizes, h.train.hop_sizes, h.train.win_lengths
7 | )
8 | y_mb = y_mb.view(-1, y_mb.size(2))
9 | y_hat_mb = y_hat_mb.view(-1, y_hat_mb.size(2))
10 | sub_sc_loss, sub_mag_loss = sub_stft_loss(y_hat_mb[:, : y_mb.size(-1)], y_mb)
11 | return sub_sc_loss + sub_mag_loss
12 |
--------------------------------------------------------------------------------
/src/so_vits_svc_fork/modules/decoders/mb_istft/_pqmf.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 Tomoki Hayashi
2 | # MIT License (https://opensource.org/licenses/MIT)
3 |
4 | """Pseudo QMF modules."""
5 |
6 | import numpy as np
7 | import torch
8 | import torch.nn.functional as F
9 | from scipy.signal.windows import kaiser
10 |
11 |
12 | def design_prototype_filter(taps=62, cutoff_ratio=0.15, beta=9.0):
13 | """Design prototype filter for PQMF.
14 | This method is based on `A Kaiser window approach for the design of prototype
15 | filters of cosine modulated filterbanks`_.
16 | Args:
17 | taps (int): The number of filter taps.
18 | cutoff_ratio (float): Cut-off frequency ratio.
19 | beta (float): Beta coefficient for kaiser window.
20 | Returns:
21 | ndarray: Impulse response of prototype filter (taps + 1,).
22 | .. _`A Kaiser window approach for the design of prototype filters of cosine modulated filterbanks`:
23 | https://ieeexplore.ieee.org/abstract/document/681427
24 | """
25 | # check the arguments are valid
26 | assert taps % 2 == 0, "The number of taps must be an even number."
27 | assert 0.0 < cutoff_ratio < 1.0, "Cutoff ratio must be > 0.0 and < 1.0."
28 |
29 | # make initial filter
30 | omega_c = np.pi * cutoff_ratio
31 | with np.errstate(invalid="ignore"):
32 | h_i = np.sin(omega_c * (np.arange(taps + 1) - 0.5 * taps)) / (
33 | np.pi * (np.arange(taps + 1) - 0.5 * taps)
34 | )
35 | h_i[taps // 2] = np.cos(0) * cutoff_ratio # fix nan due to indeterminate form
36 |
37 | # apply kaiser window
38 | w = kaiser(taps + 1, beta)
39 | h = h_i * w
40 |
41 | return h
42 |
43 |
44 | class PQMF(torch.nn.Module):
45 | """PQMF module.
46 | This module is based on `Near-perfect-reconstruction pseudo-QMF banks`_.
47 | .. _`Near-perfect-reconstruction pseudo-QMF banks`:
48 | https://ieeexplore.ieee.org/document/258122
49 | """
50 |
51 | def __init__(self, device, subbands=8, taps=62, cutoff_ratio=0.15, beta=9.0):
52 | """Initialize PQMF module.
53 | Args:
54 | subbands (int): The number of subbands.
55 | taps (int): The number of filter taps.
56 | cutoff_ratio (float): Cut-off frequency ratio.
57 | beta (float): Beta coefficient for kaiser window.
58 | """
59 | super().__init__()
60 |
61 | # define filter coefficient
62 | h_proto = design_prototype_filter(taps, cutoff_ratio, beta)
63 | h_analysis = np.zeros((subbands, len(h_proto)))
64 | h_synthesis = np.zeros((subbands, len(h_proto)))
65 | for k in range(subbands):
66 | h_analysis[k] = (
67 | 2
68 | * h_proto
69 | * np.cos(
70 | (2 * k + 1)
71 | * (np.pi / (2 * subbands))
72 | * (np.arange(taps + 1) - ((taps - 1) / 2))
73 | + (-1) ** k * np.pi / 4
74 | )
75 | )
76 | h_synthesis[k] = (
77 | 2
78 | * h_proto
79 | * np.cos(
80 | (2 * k + 1)
81 | * (np.pi / (2 * subbands))
82 | * (np.arange(taps + 1) - ((taps - 1) / 2))
83 | - (-1) ** k * np.pi / 4
84 | )
85 | )
86 |
87 | # convert to tensor
88 | analysis_filter = torch.from_numpy(h_analysis).float().unsqueeze(1).to(device)
89 | synthesis_filter = torch.from_numpy(h_synthesis).float().unsqueeze(0).to(device)
90 |
91 | # register coefficients as buffer
92 | self.register_buffer("analysis_filter", analysis_filter)
93 | self.register_buffer("synthesis_filter", synthesis_filter)
94 |
95 | # filter for downsampling & upsampling
96 | updown_filter = torch.zeros((subbands, subbands, subbands)).float().to(device)
97 | for k in range(subbands):
98 | updown_filter[k, k, 0] = 1.0
99 | self.register_buffer("updown_filter", updown_filter)
100 | self.subbands = subbands
101 |
102 | # keep padding info
103 | self.pad_fn = torch.nn.ConstantPad1d(taps // 2, 0.0)
104 |
105 | def analysis(self, x):
106 | """Analysis with PQMF.
107 | Args:
108 | x (Tensor): Input tensor (B, 1, T).
109 | Returns:
110 | Tensor: Output tensor (B, subbands, T // subbands).
111 | """
112 | x = F.conv1d(self.pad_fn(x), self.analysis_filter)
113 | return F.conv1d(x, self.updown_filter, stride=self.subbands)
114 |
115 | def synthesis(self, x):
116 | """Synthesis with PQMF.
117 | Args:
118 | x (Tensor): Input tensor (B, subbands, T // subbands).
119 | Returns:
120 | Tensor: Output tensor (B, 1, T).
121 | """
122 | # NOTE(kan-bayashi): Power will be decreased so here multiply by # subbands.
123 | # Not sure this is the correct way, it is better to check again.
124 | # TODO(kan-bayashi): Understand the reconstruction procedure
125 | x = F.conv_transpose1d(
126 | x, self.updown_filter * self.subbands, stride=self.subbands
127 | )
128 | return F.conv1d(self.pad_fn(x), self.synthesis_filter)
129 |
--------------------------------------------------------------------------------
/src/so_vits_svc_fork/modules/decoders/mb_istft/_stft.py:
--------------------------------------------------------------------------------
1 | """
2 | BSD 3-Clause License
3 | Copyright (c) 2017, Prem Seetharaman
4 | All rights reserved.
5 | * Redistribution and use in source and binary forms, with or without
6 | modification, are permitted provided that the following conditions are met:
7 | * Redistributions of source code must retain the above copyright notice,
8 | this list of conditions and the following disclaimer.
9 | * Redistributions in binary form must reproduce the above copyright notice, this
10 | list of conditions and the following disclaimer in the
11 | documentation and/or other materials provided with the distribution.
12 | * Neither the name of the copyright holder nor the names of its
13 | contributors may be used to endorse or promote products derived from this
14 | software without specific prior written permission.
15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
19 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
22 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 | """
26 |
27 | import librosa.util as librosa_util
28 | import numpy as np
29 | import torch
30 | import torch.nn.functional as F
31 | from librosa.util import pad_center, tiny
32 | from scipy.signal import get_window
33 | from torch.autograd import Variable
34 |
35 |
36 | def window_sumsquare(
37 | window,
38 | n_frames,
39 | hop_length=200,
40 | win_length=800,
41 | n_fft=800,
42 | dtype=np.float32,
43 | norm=None,
44 | ):
45 | """
46 | # from librosa 0.6
47 | Compute the sum-square envelope of a window function at a given hop length.
48 | This is used to estimate modulation effects induced by windowing
49 | observations in short-time Fourier transforms.
50 | Parameters
51 | ----------
52 | window : string, tuple, number, callable, or list-like
53 | Window specification, as in `get_window`
54 | n_frames : int > 0
55 | The number of analysis frames
56 | hop_length : int > 0
57 | The number of samples to advance between frames
58 | win_length : [optional]
59 | The length of the window function. By default, this matches `n_fft`.
60 | n_fft : int > 0
61 | The length of each analysis frame.
62 | dtype : np.dtype
63 | The data type of the output
64 | Returns
65 | -------
66 | wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))`
67 | The sum-squared envelope of the window function
68 | """
69 | if win_length is None:
70 | win_length = n_fft
71 |
72 | n = n_fft + hop_length * (n_frames - 1)
73 | x = np.zeros(n, dtype=dtype)
74 |
75 | # Compute the squared window at the desired length
76 | win_sq = get_window(window, win_length, fftbins=True)
77 | win_sq = librosa_util.normalize(win_sq, norm=norm) ** 2
78 | win_sq = librosa_util.pad_center(win_sq, n_fft)
79 |
80 | # Fill the envelope
81 | for i in range(n_frames):
82 | sample = i * hop_length
83 | x[sample : min(n, sample + n_fft)] += win_sq[: max(0, min(n_fft, n - sample))]
84 | return x
85 |
86 |
87 | class STFT(torch.nn.Module):
88 | """adapted from Prem Seetharaman's https://github.com/pseeth/pytorch-stft"""
89 |
90 | def __init__(
91 | self, filter_length=800, hop_length=200, win_length=800, window="hann"
92 | ):
93 | super().__init__()
94 | self.filter_length = filter_length
95 | self.hop_length = hop_length
96 | self.win_length = win_length
97 | self.window = window
98 | self.forward_transform = None
99 | scale = self.filter_length / self.hop_length
100 | fourier_basis = np.fft.fft(np.eye(self.filter_length))
101 |
102 | cutoff = int(self.filter_length / 2 + 1)
103 | fourier_basis = np.vstack(
104 | [np.real(fourier_basis[:cutoff, :]), np.imag(fourier_basis[:cutoff, :])]
105 | )
106 |
107 | forward_basis = torch.FloatTensor(fourier_basis[:, None, :])
108 | inverse_basis = torch.FloatTensor(
109 | np.linalg.pinv(scale * fourier_basis).T[:, None, :]
110 | )
111 |
112 | if window is not None:
113 | assert filter_length >= win_length
114 | # get window and zero center pad it to filter_length
115 | fft_window = get_window(window, win_length, fftbins=True)
116 | fft_window = pad_center(fft_window, filter_length)
117 | fft_window = torch.from_numpy(fft_window).float()
118 |
119 | # window the bases
120 | forward_basis *= fft_window
121 | inverse_basis *= fft_window
122 |
123 | self.register_buffer("forward_basis", forward_basis.float())
124 | self.register_buffer("inverse_basis", inverse_basis.float())
125 |
126 | def transform(self, input_data):
127 | num_batches = input_data.size(0)
128 | num_samples = input_data.size(1)
129 |
130 | self.num_samples = num_samples
131 |
132 | # similar to librosa, reflect-pad the input
133 | input_data = input_data.view(num_batches, 1, num_samples)
134 | input_data = F.pad(
135 | input_data.unsqueeze(1),
136 | (int(self.filter_length / 2), int(self.filter_length / 2), 0, 0),
137 | mode="reflect",
138 | )
139 | input_data = input_data.squeeze(1)
140 |
141 | forward_transform = F.conv1d(
142 | input_data,
143 | Variable(self.forward_basis, requires_grad=False),
144 | stride=self.hop_length,
145 | padding=0,
146 | )
147 |
148 | cutoff = int((self.filter_length / 2) + 1)
149 | real_part = forward_transform[:, :cutoff, :]
150 | imag_part = forward_transform[:, cutoff:, :]
151 |
152 | magnitude = torch.sqrt(real_part**2 + imag_part**2)
153 | phase = torch.autograd.Variable(torch.atan2(imag_part.data, real_part.data))
154 |
155 | return magnitude, phase
156 |
157 | def inverse(self, magnitude, phase):
158 | recombine_magnitude_phase = torch.cat(
159 | [magnitude * torch.cos(phase), magnitude * torch.sin(phase)], dim=1
160 | )
161 |
162 | inverse_transform = F.conv_transpose1d(
163 | recombine_magnitude_phase,
164 | Variable(self.inverse_basis, requires_grad=False),
165 | stride=self.hop_length,
166 | padding=0,
167 | )
168 |
169 | if self.window is not None:
170 | window_sum = window_sumsquare(
171 | self.window,
172 | magnitude.size(-1),
173 | hop_length=self.hop_length,
174 | win_length=self.win_length,
175 | n_fft=self.filter_length,
176 | dtype=np.float32,
177 | )
178 | # remove modulation effects
179 | approx_nonzero_indices = torch.from_numpy(
180 | np.where(window_sum > tiny(window_sum))[0]
181 | )
182 | window_sum = torch.autograd.Variable(
183 | torch.from_numpy(window_sum), requires_grad=False
184 | )
185 | window_sum = window_sum.to(inverse_transform.device)
186 | inverse_transform[:, :, approx_nonzero_indices] /= window_sum[
187 | approx_nonzero_indices
188 | ]
189 |
190 | # scale by hop ratio
191 | inverse_transform *= float(self.filter_length) / self.hop_length
192 |
193 | inverse_transform = inverse_transform[:, :, int(self.filter_length / 2) :]
194 | inverse_transform = inverse_transform[:, :, : -int(self.filter_length / 2)]
195 |
196 | return inverse_transform
197 |
198 | def forward(self, input_data):
199 | self.magnitude, self.phase = self.transform(input_data)
200 | reconstruction = self.inverse(self.magnitude, self.phase)
201 | return reconstruction
202 |
203 |
204 | class TorchSTFT(torch.nn.Module):
205 | def __init__(
206 | self, filter_length=800, hop_length=200, win_length=800, window="hann"
207 | ):
208 | super().__init__()
209 | self.filter_length = filter_length
210 | self.hop_length = hop_length
211 | self.win_length = win_length
212 | self.window = torch.from_numpy(
213 | get_window(window, win_length, fftbins=True).astype(np.float32)
214 | )
215 |
216 | def transform(self, input_data):
217 | forward_transform = torch.stft(
218 | input_data,
219 | self.filter_length,
220 | self.hop_length,
221 | self.win_length,
222 | window=self.window,
223 | return_complex=True,
224 | )
225 |
226 | return torch.abs(forward_transform), torch.angle(forward_transform)
227 |
228 | def inverse(self, magnitude, phase):
229 | inverse_transform = torch.istft(
230 | magnitude * torch.exp(phase * 1j),
231 | self.filter_length,
232 | self.hop_length,
233 | self.win_length,
234 | window=self.window.to(magnitude.device),
235 | )
236 |
237 | return inverse_transform.unsqueeze(
238 | -2
239 | ) # unsqueeze to stay consistent with conv_transpose1d implementation
240 |
241 | def forward(self, input_data):
242 | self.magnitude, self.phase = self.transform(input_data)
243 | reconstruction = self.inverse(self.magnitude, self.phase)
244 | return reconstruction
245 |
--------------------------------------------------------------------------------
/src/so_vits_svc_fork/modules/decoders/mb_istft/_stft_loss.py:
--------------------------------------------------------------------------------
1 | # Copyright 2019 Tomoki Hayashi
2 | # MIT License (https://opensource.org/licenses/MIT)
3 |
4 | """STFT-based Loss modules."""
5 |
6 | import torch
7 | import torch.nn.functional as F
8 |
9 |
10 | def stft(x, fft_size, hop_size, win_length, window):
11 | """Perform STFT and convert to magnitude spectrogram.
12 | Args:
13 | x (Tensor): Input signal tensor (B, T).
14 | fft_size (int): FFT size.
15 | hop_size (int): Hop size.
16 | win_length (int): Window length.
17 | window (str): Window function type.
18 | Returns:
19 | Tensor: Magnitude spectrogram (B, #frames, fft_size // 2 + 1).
20 | """
21 | x_stft = torch.stft(
22 | x, fft_size, hop_size, win_length, window.to(x.device), return_complex=False
23 | )
24 | real = x_stft[..., 0]
25 | imag = x_stft[..., 1]
26 |
27 | # NOTE(kan-bayashi): clamp is needed to avoid nan or inf
28 | return torch.sqrt(torch.clamp(real**2 + imag**2, min=1e-7)).transpose(2, 1)
29 |
30 |
31 | class SpectralConvergenceLoss(torch.nn.Module):
32 | """Spectral convergence loss module."""
33 |
34 | def __init__(self):
35 | """Initialize spectral convergence loss module."""
36 | super().__init__()
37 |
38 | def forward(self, x_mag, y_mag):
39 | """Calculate forward propagation.
40 | Args:
41 | x_mag (Tensor): Magnitude spectrogram of predicted signal (B, #frames, #freq_bins).
42 | y_mag (Tensor): Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins).
43 | Returns:
44 | Tensor: Spectral convergence loss value.
45 | """
46 | return torch.norm(y_mag - x_mag) / torch.norm(
47 | y_mag
48 | )
49 |
50 |
51 | class LogSTFTMagnitudeLoss(torch.nn.Module):
52 | """Log STFT magnitude loss module."""
53 |
54 | def __init__(self):
55 | """Initialize los STFT magnitude loss module."""
56 | super().__init__()
57 |
58 | def forward(self, x_mag, y_mag):
59 | """Calculate forward propagation.
60 | Args:
61 | x_mag (Tensor): Magnitude spectrogram of predicted signal (B, #frames, #freq_bins).
62 | y_mag (Tensor): Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins).
63 | Returns:
64 | Tensor: Log STFT magnitude loss value.
65 | """
66 | return F.l1_loss(torch.log(y_mag), torch.log(x_mag))
67 |
68 |
69 | class STFTLoss(torch.nn.Module):
70 | """STFT loss module."""
71 |
72 | def __init__(
73 | self, fft_size=1024, shift_size=120, win_length=600, window="hann_window"
74 | ):
75 | """Initialize STFT loss module."""
76 | super().__init__()
77 | self.fft_size = fft_size
78 | self.shift_size = shift_size
79 | self.win_length = win_length
80 | self.window = getattr(torch, window)(win_length)
81 | self.spectral_convergence_loss = SpectralConvergenceLoss()
82 | self.log_stft_magnitude_loss = LogSTFTMagnitudeLoss()
83 |
84 | def forward(self, x, y):
85 | """Calculate forward propagation.
86 | Args:
87 | x (Tensor): Predicted signal (B, T).
88 | y (Tensor): Groundtruth signal (B, T).
89 | Returns:
90 | Tensor: Spectral convergence loss value.
91 | Tensor: Log STFT magnitude loss value.
92 | """
93 | x_mag = stft(x, self.fft_size, self.shift_size, self.win_length, self.window)
94 | y_mag = stft(y, self.fft_size, self.shift_size, self.win_length, self.window)
95 | sc_loss = self.spectral_convergence_loss(x_mag, y_mag)
96 | mag_loss = self.log_stft_magnitude_loss(x_mag, y_mag)
97 |
98 | return sc_loss, mag_loss
99 |
100 |
101 | class MultiResolutionSTFTLoss(torch.nn.Module):
102 | """Multi resolution STFT loss module."""
103 |
104 | def __init__(
105 | self,
106 | fft_sizes=[1024, 2048, 512],
107 | hop_sizes=[120, 240, 50],
108 | win_lengths=[600, 1200, 240],
109 | window="hann_window",
110 | ):
111 | """Initialize Multi resolution STFT loss module.
112 | Args:
113 | fft_sizes (list): List of FFT sizes.
114 | hop_sizes (list): List of hop sizes.
115 | win_lengths (list): List of window lengths.
116 | window (str): Window function type.
117 | """
118 | super().__init__()
119 | assert len(fft_sizes) == len(hop_sizes) == len(win_lengths)
120 | self.stft_losses = torch.nn.ModuleList()
121 | for fs, ss, wl in zip(fft_sizes, hop_sizes, win_lengths):
122 | self.stft_losses += [STFTLoss(fs, ss, wl, window)]
123 |
124 | def forward(self, x, y):
125 | """Calculate forward propagation.
126 | Args:
127 | x (Tensor): Predicted signal (B, T).
128 | y (Tensor): Groundtruth signal (B, T).
129 | Returns:
130 | Tensor: Multi resolution spectral convergence loss value.
131 | Tensor: Multi resolution log STFT magnitude loss value.
132 | """
133 | sc_loss = 0.0
134 | mag_loss = 0.0
135 | for f in self.stft_losses:
136 | sc_l, mag_l = f(x, y)
137 | sc_loss += sc_l
138 | mag_loss += mag_l
139 | sc_loss /= len(self.stft_losses)
140 | mag_loss /= len(self.stft_losses)
141 |
142 | return sc_loss, mag_loss
143 |
--------------------------------------------------------------------------------
/src/so_vits_svc_fork/modules/descriminators.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch import nn
3 | from torch.nn import AvgPool1d, Conv1d, Conv2d
4 | from torch.nn import functional as F
5 | from torch.nn.utils import spectral_norm, weight_norm
6 |
7 | from so_vits_svc_fork.modules import modules as modules
8 | from so_vits_svc_fork.modules.commons import get_padding
9 |
10 |
11 | class DiscriminatorP(torch.nn.Module):
12 | def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
13 | super().__init__()
14 | self.period = period
15 | self.use_spectral_norm = use_spectral_norm
16 | norm_f = spectral_norm if use_spectral_norm else weight_norm
17 | self.convs = nn.ModuleList(
18 | [
19 | norm_f(
20 | Conv2d(
21 | 1,
22 | 32,
23 | (kernel_size, 1),
24 | (stride, 1),
25 | padding=(get_padding(kernel_size, 1), 0),
26 | )
27 | ),
28 | norm_f(
29 | Conv2d(
30 | 32,
31 | 128,
32 | (kernel_size, 1),
33 | (stride, 1),
34 | padding=(get_padding(kernel_size, 1), 0),
35 | )
36 | ),
37 | norm_f(
38 | Conv2d(
39 | 128,
40 | 512,
41 | (kernel_size, 1),
42 | (stride, 1),
43 | padding=(get_padding(kernel_size, 1), 0),
44 | )
45 | ),
46 | norm_f(
47 | Conv2d(
48 | 512,
49 | 1024,
50 | (kernel_size, 1),
51 | (stride, 1),
52 | padding=(get_padding(kernel_size, 1), 0),
53 | )
54 | ),
55 | norm_f(
56 | Conv2d(
57 | 1024,
58 | 1024,
59 | (kernel_size, 1),
60 | 1,
61 | padding=(get_padding(kernel_size, 1), 0),
62 | )
63 | ),
64 | ]
65 | )
66 | self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
67 |
68 | def forward(self, x):
69 | fmap = []
70 |
71 | # 1d to 2d
72 | b, c, t = x.shape
73 | if t % self.period != 0: # pad first
74 | n_pad = self.period - (t % self.period)
75 | x = F.pad(x, (0, n_pad), "reflect")
76 | t = t + n_pad
77 | x = x.view(b, c, t // self.period, self.period)
78 |
79 | for l in self.convs:
80 | x = l(x)
81 | x = F.leaky_relu(x, modules.LRELU_SLOPE)
82 | fmap.append(x)
83 | x = self.conv_post(x)
84 | fmap.append(x)
85 | x = torch.flatten(x, 1, -1)
86 |
87 | return x, fmap
88 |
89 |
90 | class DiscriminatorS(torch.nn.Module):
91 | def __init__(self, use_spectral_norm=False):
92 | super().__init__()
93 | norm_f = spectral_norm if use_spectral_norm else weight_norm
94 | self.convs = nn.ModuleList(
95 | [
96 | norm_f(Conv1d(1, 16, 15, 1, padding=7)),
97 | norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),
98 | norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)),
99 | norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
100 | norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
101 | norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
102 | ]
103 | )
104 | self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
105 |
106 | def forward(self, x):
107 | fmap = []
108 |
109 | for l in self.convs:
110 | x = l(x)
111 | x = F.leaky_relu(x, modules.LRELU_SLOPE)
112 | fmap.append(x)
113 | x = self.conv_post(x)
114 | fmap.append(x)
115 | x = torch.flatten(x, 1, -1)
116 |
117 | return x, fmap
118 |
119 |
120 | class MultiPeriodDiscriminator(torch.nn.Module):
121 | def __init__(self, use_spectral_norm=False):
122 | super().__init__()
123 | periods = [2, 3, 5, 7, 11]
124 |
125 | discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
126 | discs = discs + [
127 | DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods
128 | ]
129 | self.discriminators = nn.ModuleList(discs)
130 |
131 | def forward(self, y, y_hat):
132 | y_d_rs = []
133 | y_d_gs = []
134 | fmap_rs = []
135 | fmap_gs = []
136 | for i, d in enumerate(self.discriminators):
137 | y_d_r, fmap_r = d(y)
138 | y_d_g, fmap_g = d(y_hat)
139 | y_d_rs.append(y_d_r)
140 | y_d_gs.append(y_d_g)
141 | fmap_rs.append(fmap_r)
142 | fmap_gs.append(fmap_g)
143 |
144 | return y_d_rs, y_d_gs, fmap_rs, fmap_gs
145 |
146 |
147 | class MultiScaleDiscriminator(torch.nn.Module):
148 | def __init__(self):
149 | super().__init__()
150 | self.discriminators = nn.ModuleList(
151 | [
152 | DiscriminatorS(use_spectral_norm=True),
153 | DiscriminatorS(),
154 | DiscriminatorS(),
155 | ]
156 | )
157 | self.meanpools = nn.ModuleList(
158 | [AvgPool1d(4, 2, padding=2), AvgPool1d(4, 2, padding=2)]
159 | )
160 |
161 | def forward(self, y, y_hat):
162 | y_d_rs = []
163 | y_d_gs = []
164 | fmap_rs = []
165 | fmap_gs = []
166 | for i, d in enumerate(self.discriminators):
167 | if i != 0:
168 | y = self.meanpools[i - 1](y)
169 | y_hat = self.meanpools[i - 1](y_hat)
170 | y_d_r, fmap_r = d(y)
171 | y_d_g, fmap_g = d(y_hat)
172 | y_d_rs.append(y_d_r)
173 | fmap_rs.append(fmap_r)
174 | y_d_gs.append(y_d_g)
175 | fmap_gs.append(fmap_g)
176 |
177 | return y_d_rs, y_d_gs, fmap_rs, fmap_gs
178 |
--------------------------------------------------------------------------------
/src/so_vits_svc_fork/modules/encoders.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch import nn
3 |
4 | from so_vits_svc_fork.modules import attentions as attentions
5 | from so_vits_svc_fork.modules import commons as commons
6 | from so_vits_svc_fork.modules import modules as modules
7 |
8 |
9 | class SpeakerEncoder(torch.nn.Module):
10 | def __init__(
11 | self,
12 | mel_n_channels=80,
13 | model_num_layers=3,
14 | model_hidden_size=256,
15 | model_embedding_size=256,
16 | ):
17 | super().__init__()
18 | self.lstm = nn.LSTM(
19 | mel_n_channels, model_hidden_size, model_num_layers, batch_first=True
20 | )
21 | self.linear = nn.Linear(model_hidden_size, model_embedding_size)
22 | self.relu = nn.ReLU()
23 |
24 | def forward(self, mels):
25 | self.lstm.flatten_parameters()
26 | _, (hidden, _) = self.lstm(mels)
27 | embeds_raw = self.relu(self.linear(hidden[-1]))
28 | return embeds_raw / torch.norm(embeds_raw, dim=1, keepdim=True)
29 |
30 | def compute_partial_slices(self, total_frames, partial_frames, partial_hop):
31 | mel_slices = []
32 | for i in range(0, total_frames - partial_frames, partial_hop):
33 | mel_range = torch.arange(i, i + partial_frames)
34 | mel_slices.append(mel_range)
35 |
36 | return mel_slices
37 |
38 | def embed_utterance(self, mel, partial_frames=128, partial_hop=64):
39 | mel_len = mel.size(1)
40 | last_mel = mel[:, -partial_frames:]
41 |
42 | if mel_len > partial_frames:
43 | mel_slices = self.compute_partial_slices(
44 | mel_len, partial_frames, partial_hop
45 | )
46 | mels = list(mel[:, s] for s in mel_slices)
47 | mels.append(last_mel)
48 | mels = torch.stack(tuple(mels), 0).squeeze(1)
49 |
50 | with torch.no_grad():
51 | partial_embeds = self(mels)
52 | embed = torch.mean(partial_embeds, axis=0).unsqueeze(0)
53 | # embed = embed / torch.linalg.norm(embed, 2)
54 | else:
55 | with torch.no_grad():
56 | embed = self(last_mel)
57 |
58 | return embed
59 |
60 |
61 | class Encoder(nn.Module):
62 | def __init__(
63 | self,
64 | in_channels,
65 | out_channels,
66 | hidden_channels,
67 | kernel_size,
68 | dilation_rate,
69 | n_layers,
70 | gin_channels=0,
71 | ):
72 | super().__init__()
73 | self.in_channels = in_channels
74 | self.out_channels = out_channels
75 | self.hidden_channels = hidden_channels
76 | self.kernel_size = kernel_size
77 | self.dilation_rate = dilation_rate
78 | self.n_layers = n_layers
79 | self.gin_channels = gin_channels
80 |
81 | self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
82 | self.enc = modules.WN(
83 | hidden_channels,
84 | kernel_size,
85 | dilation_rate,
86 | n_layers,
87 | gin_channels=gin_channels,
88 | )
89 | self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
90 |
91 | def forward(self, x, x_lengths, g=None):
92 | # print(x.shape,x_lengths.shape)
93 | x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(
94 | x.dtype
95 | )
96 | x = self.pre(x) * x_mask
97 | x = self.enc(x, x_mask, g=g)
98 | stats = self.proj(x) * x_mask
99 | m, logs = torch.split(stats, self.out_channels, dim=1)
100 | z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
101 | return z, m, logs, x_mask
102 |
103 |
104 | class TextEncoder(nn.Module):
105 | def __init__(
106 | self,
107 | out_channels,
108 | hidden_channels,
109 | kernel_size,
110 | n_layers,
111 | gin_channels=0,
112 | filter_channels=None,
113 | n_heads=None,
114 | p_dropout=None,
115 | ):
116 | super().__init__()
117 | self.out_channels = out_channels
118 | self.hidden_channels = hidden_channels
119 | self.kernel_size = kernel_size
120 | self.n_layers = n_layers
121 | self.gin_channels = gin_channels
122 | self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
123 | self.f0_emb = nn.Embedding(256, hidden_channels)
124 |
125 | self.enc_ = attentions.Encoder(
126 | hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
127 | )
128 |
129 | def forward(self, x, x_mask, f0=None, noice_scale=1):
130 | x = x + self.f0_emb(f0).transpose(1, 2)
131 | x = self.enc_(x * x_mask, x_mask)
132 | stats = self.proj(x) * x_mask
133 | m, logs = torch.split(stats, self.out_channels, dim=1)
134 | z = (m + torch.randn_like(m) * torch.exp(logs) * noice_scale) * x_mask
135 |
136 | return z, m, logs, x_mask
137 |
--------------------------------------------------------------------------------
/src/so_vits_svc_fork/modules/flows.py:
--------------------------------------------------------------------------------
1 | from torch import nn
2 |
3 | from so_vits_svc_fork.modules import modules as modules
4 |
5 |
6 | class ResidualCouplingBlock(nn.Module):
7 | def __init__(
8 | self,
9 | channels,
10 | hidden_channels,
11 | kernel_size,
12 | dilation_rate,
13 | n_layers,
14 | n_flows=4,
15 | gin_channels=0,
16 | ):
17 | super().__init__()
18 | self.channels = channels
19 | self.hidden_channels = hidden_channels
20 | self.kernel_size = kernel_size
21 | self.dilation_rate = dilation_rate
22 | self.n_layers = n_layers
23 | self.n_flows = n_flows
24 | self.gin_channels = gin_channels
25 |
26 | self.flows = nn.ModuleList()
27 | for i in range(n_flows):
28 | self.flows.append(
29 | modules.ResidualCouplingLayer(
30 | channels,
31 | hidden_channels,
32 | kernel_size,
33 | dilation_rate,
34 | n_layers,
35 | gin_channels=gin_channels,
36 | mean_only=True,
37 | )
38 | )
39 | self.flows.append(modules.Flip())
40 |
41 | def forward(self, x, x_mask, g=None, reverse=False):
42 | if not reverse:
43 | for flow in self.flows:
44 | x, _ = flow(x, x_mask, g=g, reverse=reverse)
45 | else:
46 | for flow in reversed(self.flows):
47 | x = flow(x, x_mask, g=g, reverse=reverse)
48 | return x
49 |
--------------------------------------------------------------------------------
/src/so_vits_svc_fork/modules/losses.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 |
4 | def feature_loss(fmap_r, fmap_g):
5 | loss = 0
6 | for dr, dg in zip(fmap_r, fmap_g):
7 | for rl, gl in zip(dr, dg):
8 | rl = rl.float().detach()
9 | gl = gl.float()
10 | loss += torch.mean(torch.abs(rl - gl))
11 |
12 | return loss * 2
13 |
14 |
15 | def discriminator_loss(disc_real_outputs, disc_generated_outputs):
16 | loss = 0
17 | r_losses = []
18 | g_losses = []
19 | for dr, dg in zip(disc_real_outputs, disc_generated_outputs):
20 | dr = dr.float()
21 | dg = dg.float()
22 | r_loss = torch.mean((1 - dr) ** 2)
23 | g_loss = torch.mean(dg**2)
24 | loss += r_loss + g_loss
25 | r_losses.append(r_loss.item())
26 | g_losses.append(g_loss.item())
27 |
28 | return loss, r_losses, g_losses
29 |
30 |
31 | def generator_loss(disc_outputs):
32 | loss = 0
33 | gen_losses = []
34 | for dg in disc_outputs:
35 | dg = dg.float()
36 | l = torch.mean((1 - dg) ** 2)
37 | gen_losses.append(l)
38 | loss += l
39 |
40 | return loss, gen_losses
41 |
42 |
43 | def kl_loss(z_p, logs_q, m_p, logs_p, z_mask):
44 | """
45 | z_p, logs_q: [b, h, t_t]
46 | m_p, logs_p: [b, h, t_t]
47 | """
48 | z_p = z_p.float()
49 | logs_q = logs_q.float()
50 | m_p = m_p.float()
51 | logs_p = logs_p.float()
52 | z_mask = z_mask.float()
53 | # print(logs_p)
54 | kl = logs_p - logs_q - 0.5
55 | kl += 0.5 * ((z_p - m_p) ** 2) * torch.exp(-2.0 * logs_p)
56 | kl = torch.sum(kl * z_mask)
57 | l = kl / torch.sum(z_mask)
58 | return l
59 |
--------------------------------------------------------------------------------
/src/so_vits_svc_fork/modules/mel_processing.py:
--------------------------------------------------------------------------------
1 | """from logging import getLogger
2 |
3 | import torch
4 | import torch.utils.data
5 | import torchaudio
6 |
7 | LOG = getLogger(__name__)
8 |
9 |
10 | from ..hparams import HParams
11 |
12 |
13 | def spectrogram_torch(audio: torch.Tensor, hps: HParams) -> torch.Tensor:
14 | return torchaudio.transforms.Spectrogram(
15 | n_fft=hps.data.filter_length,
16 | win_length=hps.data.win_length,
17 | hop_length=hps.data.hop_length,
18 | power=1.0,
19 | window_fn=torch.hann_window,
20 | normalized=False,
21 | ).to(audio.device)(audio)
22 |
23 |
24 | def spec_to_mel_torch(spec: torch.Tensor, hps: HParams) -> torch.Tensor:
25 | return torchaudio.transforms.MelScale(
26 | n_mels=hps.data.n_mel_channels,
27 | sample_rate=hps.data.sampling_rate,
28 | f_min=hps.data.mel_fmin,
29 | f_max=hps.data.mel_fmax,
30 | ).to(spec.device)(spec)
31 |
32 |
33 | def mel_spectrogram_torch(audio: torch.Tensor, hps: HParams) -> torch.Tensor:
34 | return torchaudio.transforms.MelSpectrogram(
35 | sample_rate=hps.data.sampling_rate,
36 | n_fft=hps.data.filter_length,
37 | n_mels=hps.data.n_mel_channels,
38 | win_length=hps.data.win_length,
39 | hop_length=hps.data.hop_length,
40 | f_min=hps.data.mel_fmin,
41 | f_max=hps.data.mel_fmax,
42 | power=1.0,
43 | window_fn=torch.hann_window,
44 | normalized=False,
45 | ).to(audio.device)(audio)"""
46 |
47 | from logging import getLogger
48 |
49 | import torch
50 | import torch.utils.data
51 | from librosa.filters import mel as librosa_mel_fn
52 |
53 | LOG = getLogger(__name__)
54 |
55 | MAX_WAV_VALUE = 32768.0
56 |
57 |
58 | def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
59 | """
60 | PARAMS
61 | ------
62 | C: compression factor
63 | """
64 | return torch.log(torch.clamp(x, min=clip_val) * C)
65 |
66 |
67 | def dynamic_range_decompression_torch(x, C=1):
68 | """
69 | PARAMS
70 | ------
71 | C: compression factor used to compress
72 | """
73 | return torch.exp(x) / C
74 |
75 |
76 | def spectral_normalize_torch(magnitudes):
77 | output = dynamic_range_compression_torch(magnitudes)
78 | return output
79 |
80 |
81 | def spectral_de_normalize_torch(magnitudes):
82 | output = dynamic_range_decompression_torch(magnitudes)
83 | return output
84 |
85 |
86 | mel_basis = {}
87 | hann_window = {}
88 |
89 |
90 | def spectrogram_torch(y, hps, center=False):
91 | if torch.min(y) < -1.0:
92 | LOG.info("min value is ", torch.min(y))
93 | if torch.max(y) > 1.0:
94 | LOG.info("max value is ", torch.max(y))
95 | n_fft = hps.data.filter_length
96 | hop_size = hps.data.hop_length
97 | win_size = hps.data.win_length
98 | global hann_window
99 | dtype_device = str(y.dtype) + "_" + str(y.device)
100 | wnsize_dtype_device = str(win_size) + "_" + dtype_device
101 | if wnsize_dtype_device not in hann_window:
102 | hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(
103 | dtype=y.dtype, device=y.device
104 | )
105 |
106 | y = torch.nn.functional.pad(
107 | y.unsqueeze(1),
108 | (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)),
109 | mode="reflect",
110 | )
111 | y = y.squeeze(1)
112 |
113 | spec = torch.stft(
114 | y,
115 | n_fft,
116 | hop_length=hop_size,
117 | win_length=win_size,
118 | window=hann_window[wnsize_dtype_device],
119 | center=center,
120 | pad_mode="reflect",
121 | normalized=False,
122 | onesided=True,
123 | return_complex=False,
124 | )
125 |
126 | spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
127 | return spec
128 |
129 |
130 | def spec_to_mel_torch(spec, hps):
131 | sampling_rate = hps.data.sampling_rate
132 | n_fft = hps.data.filter_length
133 | num_mels = hps.data.n_mel_channels
134 | fmin = hps.data.mel_fmin
135 | fmax = hps.data.mel_fmax
136 | global mel_basis
137 | dtype_device = str(spec.dtype) + "_" + str(spec.device)
138 | fmax_dtype_device = str(fmax) + "_" + dtype_device
139 | if fmax_dtype_device not in mel_basis:
140 | mel = librosa_mel_fn(
141 | sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax
142 | )
143 | mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(
144 | dtype=spec.dtype, device=spec.device
145 | )
146 | spec = torch.matmul(mel_basis[fmax_dtype_device], spec)
147 | spec = spectral_normalize_torch(spec)
148 | return spec
149 |
150 |
151 | def mel_spectrogram_torch(y, hps, center=False):
152 | sampling_rate = hps.data.sampling_rate
153 | n_fft = hps.data.filter_length
154 | num_mels = hps.data.n_mel_channels
155 | fmin = hps.data.mel_fmin
156 | fmax = hps.data.mel_fmax
157 | hop_size = hps.data.hop_length
158 | win_size = hps.data.win_length
159 | if torch.min(y) < -1.0:
160 | LOG.info(f"min value is {torch.min(y)}")
161 | if torch.max(y) > 1.0:
162 | LOG.info(f"max value is {torch.max(y)}")
163 |
164 | global mel_basis, hann_window
165 | dtype_device = str(y.dtype) + "_" + str(y.device)
166 | fmax_dtype_device = str(fmax) + "_" + dtype_device
167 | wnsize_dtype_device = str(win_size) + "_" + dtype_device
168 | if fmax_dtype_device not in mel_basis:
169 | mel = librosa_mel_fn(
170 | sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax
171 | )
172 | mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(
173 | dtype=y.dtype, device=y.device
174 | )
175 | if wnsize_dtype_device not in hann_window:
176 | hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(
177 | dtype=y.dtype, device=y.device
178 | )
179 |
180 | y = torch.nn.functional.pad(
181 | y.unsqueeze(1),
182 | (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)),
183 | mode="reflect",
184 | )
185 | y = y.squeeze(1)
186 |
187 | spec = torch.stft(
188 | y,
189 | n_fft,
190 | hop_length=hop_size,
191 | win_length=win_size,
192 | window=hann_window[wnsize_dtype_device],
193 | center=center,
194 | pad_mode="reflect",
195 | normalized=False,
196 | onesided=True,
197 | return_complex=False,
198 | )
199 |
200 | spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
201 |
202 | spec = torch.matmul(mel_basis[fmax_dtype_device], spec)
203 | spec = spectral_normalize_torch(spec)
204 |
205 | return spec
206 |
--------------------------------------------------------------------------------
/src/so_vits_svc_fork/modules/synthesizers.py:
--------------------------------------------------------------------------------
1 | import warnings
2 | from logging import getLogger
3 | from typing import Any, Literal, Sequence
4 |
5 | import torch
6 | from torch import nn
7 |
8 | import so_vits_svc_fork.f0
9 | from so_vits_svc_fork.f0 import f0_to_coarse
10 | from so_vits_svc_fork.modules import commons as commons
11 | from so_vits_svc_fork.modules.decoders.f0 import F0Decoder
12 | from so_vits_svc_fork.modules.decoders.hifigan import NSFHifiGANGenerator
13 | from so_vits_svc_fork.modules.decoders.mb_istft import (
14 | Multiband_iSTFT_Generator,
15 | Multistream_iSTFT_Generator,
16 | iSTFT_Generator,
17 | )
18 | from so_vits_svc_fork.modules.encoders import Encoder, TextEncoder
19 | from so_vits_svc_fork.modules.flows import ResidualCouplingBlock
20 |
21 | LOG = getLogger(__name__)
22 |
23 |
24 | class SynthesizerTrn(nn.Module):
25 | """
26 | Synthesizer for Training
27 | """
28 |
29 | def __init__(
30 | self,
31 | spec_channels: int,
32 | segment_size: int,
33 | inter_channels: int,
34 | hidden_channels: int,
35 | filter_channels: int,
36 | n_heads: int,
37 | n_layers: int,
38 | kernel_size: int,
39 | p_dropout: int,
40 | resblock: str,
41 | resblock_kernel_sizes: Sequence[int],
42 | resblock_dilation_sizes: Sequence[Sequence[int]],
43 | upsample_rates: Sequence[int],
44 | upsample_initial_channel: int,
45 | upsample_kernel_sizes: Sequence[int],
46 | gin_channels: int,
47 | ssl_dim: int,
48 | n_speakers: int,
49 | sampling_rate: int = 44100,
50 | type_: Literal["hifi-gan", "istft", "ms-istft", "mb-istft"] = "hifi-gan",
51 | gen_istft_n_fft: int = 16,
52 | gen_istft_hop_size: int = 4,
53 | subbands: int = 4,
54 | **kwargs: Any,
55 | ):
56 | super().__init__()
57 | self.spec_channels = spec_channels
58 | self.inter_channels = inter_channels
59 | self.hidden_channels = hidden_channels
60 | self.filter_channels = filter_channels
61 | self.n_heads = n_heads
62 | self.n_layers = n_layers
63 | self.kernel_size = kernel_size
64 | self.p_dropout = p_dropout
65 | self.resblock = resblock
66 | self.resblock_kernel_sizes = resblock_kernel_sizes
67 | self.resblock_dilation_sizes = resblock_dilation_sizes
68 | self.upsample_rates = upsample_rates
69 | self.upsample_initial_channel = upsample_initial_channel
70 | self.upsample_kernel_sizes = upsample_kernel_sizes
71 | self.segment_size = segment_size
72 | self.gin_channels = gin_channels
73 | self.ssl_dim = ssl_dim
74 | self.n_speakers = n_speakers
75 | self.sampling_rate = sampling_rate
76 | self.type_ = type_
77 | self.gen_istft_n_fft = gen_istft_n_fft
78 | self.gen_istft_hop_size = gen_istft_hop_size
79 | self.subbands = subbands
80 | if kwargs:
81 | warnings.warn(f"Unused arguments: {kwargs}")
82 |
83 | self.emb_g = nn.Embedding(n_speakers, gin_channels)
84 |
85 | if ssl_dim is None:
86 | self.pre = nn.LazyConv1d(hidden_channels, kernel_size=5, padding=2)
87 | else:
88 | self.pre = nn.Conv1d(ssl_dim, hidden_channels, kernel_size=5, padding=2)
89 |
90 | self.enc_p = TextEncoder(
91 | inter_channels,
92 | hidden_channels,
93 | filter_channels=filter_channels,
94 | n_heads=n_heads,
95 | n_layers=n_layers,
96 | kernel_size=kernel_size,
97 | p_dropout=p_dropout,
98 | )
99 |
100 | LOG.info(f"Decoder type: {type_}")
101 | if type_ == "hifi-gan":
102 | hps = {
103 | "sampling_rate": sampling_rate,
104 | "inter_channels": inter_channels,
105 | "resblock": resblock,
106 | "resblock_kernel_sizes": resblock_kernel_sizes,
107 | "resblock_dilation_sizes": resblock_dilation_sizes,
108 | "upsample_rates": upsample_rates,
109 | "upsample_initial_channel": upsample_initial_channel,
110 | "upsample_kernel_sizes": upsample_kernel_sizes,
111 | "gin_channels": gin_channels,
112 | }
113 | self.dec = NSFHifiGANGenerator(h=hps)
114 | self.mb = False
115 | else:
116 | hps = {
117 | "initial_channel": inter_channels,
118 | "resblock": resblock,
119 | "resblock_kernel_sizes": resblock_kernel_sizes,
120 | "resblock_dilation_sizes": resblock_dilation_sizes,
121 | "upsample_rates": upsample_rates,
122 | "upsample_initial_channel": upsample_initial_channel,
123 | "upsample_kernel_sizes": upsample_kernel_sizes,
124 | "gin_channels": gin_channels,
125 | "gen_istft_n_fft": gen_istft_n_fft,
126 | "gen_istft_hop_size": gen_istft_hop_size,
127 | "subbands": subbands,
128 | }
129 |
130 | # gen_istft_n_fft, gen_istft_hop_size, subbands
131 | if type_ == "istft":
132 | del hps["subbands"]
133 | self.dec = iSTFT_Generator(**hps)
134 | elif type_ == "ms-istft":
135 | self.dec = Multistream_iSTFT_Generator(**hps)
136 | elif type_ == "mb-istft":
137 | self.dec = Multiband_iSTFT_Generator(**hps)
138 | else:
139 | raise ValueError(f"Unknown type: {type_}")
140 | self.mb = True
141 |
142 | self.enc_q = Encoder(
143 | spec_channels,
144 | inter_channels,
145 | hidden_channels,
146 | 5,
147 | 1,
148 | 16,
149 | gin_channels=gin_channels,
150 | )
151 | self.flow = ResidualCouplingBlock(
152 | inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels
153 | )
154 | self.f0_decoder = F0Decoder(
155 | 1,
156 | hidden_channels,
157 | filter_channels,
158 | n_heads,
159 | n_layers,
160 | kernel_size,
161 | p_dropout,
162 | spk_channels=gin_channels,
163 | )
164 | self.emb_uv = nn.Embedding(2, hidden_channels)
165 |
166 | def forward(self, c, f0, uv, spec, g=None, c_lengths=None, spec_lengths=None):
167 | g = self.emb_g(g).transpose(1, 2)
168 | # ssl prenet
169 | x_mask = torch.unsqueeze(commons.sequence_mask(c_lengths, c.size(2)), 1).to(
170 | c.dtype
171 | )
172 | x = self.pre(c) * x_mask + self.emb_uv(uv.long()).transpose(1, 2)
173 |
174 | # f0 predict
175 | lf0 = 2595.0 * torch.log10(1.0 + f0.unsqueeze(1) / 700.0) / 500
176 | norm_lf0 = so_vits_svc_fork.f0.normalize_f0(lf0, x_mask, uv)
177 | pred_lf0 = self.f0_decoder(x, norm_lf0, x_mask, spk_emb=g)
178 |
179 | # encoder
180 | z_ptemp, m_p, logs_p, _ = self.enc_p(x, x_mask, f0=f0_to_coarse(f0))
181 | z, m_q, logs_q, spec_mask = self.enc_q(spec, spec_lengths, g=g)
182 |
183 | # flow
184 | z_p = self.flow(z, spec_mask, g=g)
185 | z_slice, pitch_slice, ids_slice = commons.rand_slice_segments_with_pitch(
186 | z, f0, spec_lengths, self.segment_size
187 | )
188 |
189 | # MB-iSTFT-VITS
190 | if self.mb:
191 | o, o_mb = self.dec(z_slice, g=g)
192 | # HiFi-GAN
193 | else:
194 | o = self.dec(z_slice, g=g, f0=pitch_slice)
195 | o_mb = None
196 | return (
197 | o,
198 | o_mb,
199 | ids_slice,
200 | spec_mask,
201 | (z, z_p, m_p, logs_p, m_q, logs_q),
202 | pred_lf0,
203 | norm_lf0,
204 | lf0,
205 | )
206 |
207 | def infer(self, c, f0, uv, g=None, noice_scale=0.35, predict_f0=False):
208 | c_lengths = (torch.ones(c.size(0)) * c.size(-1)).to(c.device)
209 | g = self.emb_g(g).transpose(1, 2)
210 | x_mask = torch.unsqueeze(commons.sequence_mask(c_lengths, c.size(2)), 1).to(
211 | c.dtype
212 | )
213 | x = self.pre(c) * x_mask + self.emb_uv(uv.long()).transpose(1, 2)
214 |
215 | if predict_f0:
216 | lf0 = 2595.0 * torch.log10(1.0 + f0.unsqueeze(1) / 700.0) / 500
217 | norm_lf0 = so_vits_svc_fork.f0.normalize_f0(
218 | lf0, x_mask, uv, random_scale=False
219 | )
220 | pred_lf0 = self.f0_decoder(x, norm_lf0, x_mask, spk_emb=g)
221 | f0 = (700 * (torch.pow(10, pred_lf0 * 500 / 2595) - 1)).squeeze(1)
222 |
223 | z_p, m_p, logs_p, c_mask = self.enc_p(
224 | x, x_mask, f0=f0_to_coarse(f0), noice_scale=noice_scale
225 | )
226 | z = self.flow(z_p, c_mask, g=g, reverse=True)
227 |
228 | # MB-iSTFT-VITS
229 | if self.mb:
230 | o, o_mb = self.dec(z * c_mask, g=g)
231 | else:
232 | o = self.dec(z * c_mask, g=g, f0=f0)
233 | return o
234 |
--------------------------------------------------------------------------------
/src/so_vits_svc_fork/preprocessing/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/voicepaw/so-vits-svc-fork/0f015e32aada5cf7481f91bbe6758e574c9c5f39/src/so_vits_svc_fork/preprocessing/__init__.py
--------------------------------------------------------------------------------
/src/so_vits_svc_fork/preprocessing/config_templates/quickvc.json:
--------------------------------------------------------------------------------
1 | {
2 | "train": {
3 | "log_interval": 100,
4 | "eval_interval": 200,
5 | "seed": 1234,
6 | "epochs": 10000,
7 | "learning_rate": 0.0001,
8 | "betas": [0.8, 0.99],
9 | "eps": 1e-9,
10 | "batch_size": 16,
11 | "fp16_run": false,
12 | "bf16_run": false,
13 | "lr_decay": 0.999875,
14 | "segment_size": 10240,
15 | "init_lr_ratio": 1,
16 | "warmup_epochs": 0,
17 | "c_mel": 45,
18 | "c_kl": 1.0,
19 | "use_sr": true,
20 | "max_speclen": 512,
21 | "port": "8001",
22 | "keep_ckpts": 3,
23 | "fft_sizes": [768, 1366, 342],
24 | "hop_sizes": [60, 120, 20],
25 | "win_lengths": [300, 600, 120],
26 | "window": "hann_window",
27 | "num_workers": 4,
28 | "log_version": 0,
29 | "ckpt_name_by_step": false,
30 | "accumulate_grad_batches": 1
31 | },
32 | "data": {
33 | "training_files": "filelists/44k/train.txt",
34 | "validation_files": "filelists/44k/val.txt",
35 | "max_wav_value": 32768.0,
36 | "sampling_rate": 44100,
37 | "filter_length": 2048,
38 | "hop_length": 512,
39 | "win_length": 2048,
40 | "n_mel_channels": 80,
41 | "mel_fmin": 0.0,
42 | "mel_fmax": 22050,
43 | "contentvec_final_proj": false
44 | },
45 | "model": {
46 | "inter_channels": 192,
47 | "hidden_channels": 192,
48 | "filter_channels": 768,
49 | "n_heads": 2,
50 | "n_layers": 6,
51 | "kernel_size": 3,
52 | "p_dropout": 0.1,
53 | "resblock": "1",
54 | "resblock_kernel_sizes": [3, 7, 11],
55 | "resblock_dilation_sizes": [
56 | [1, 3, 5],
57 | [1, 3, 5],
58 | [1, 3, 5]
59 | ],
60 | "upsample_rates": [8, 4],
61 | "upsample_initial_channel": 512,
62 | "upsample_kernel_sizes": [32, 16],
63 | "n_layers_q": 3,
64 | "use_spectral_norm": false,
65 | "gin_channels": 256,
66 | "ssl_dim": 768,
67 | "n_speakers": 200,
68 | "type_": "ms-istft",
69 | "gen_istft_n_fft": 16,
70 | "gen_istft_hop_size": 4,
71 | "subbands": 4,
72 | "pretrained": {
73 | "D_0.pth": "https://huggingface.co/datasets/ms903/sovits4.0-768vec-layer12/resolve/main/sovits_768l12_pre_large_320k/clean_D_320000.pth",
74 | "G_0.pth": "https://huggingface.co/datasets/ms903/sovits4.0-768vec-layer12/resolve/main/sovits_768l12_pre_large_320k/clean_G_320000.pth"
75 | }
76 | },
77 | "spk": {}
78 | }
79 |
--------------------------------------------------------------------------------
/src/so_vits_svc_fork/preprocessing/config_templates/so-vits-svc-4.0v1-legacy.json:
--------------------------------------------------------------------------------
1 | {
2 | "train": {
3 | "log_interval": 200,
4 | "eval_interval": 800,
5 | "seed": 1234,
6 | "epochs": 10000,
7 | "learning_rate": 0.0001,
8 | "betas": [0.8, 0.99],
9 | "eps": 1e-9,
10 | "batch_size": 16,
11 | "fp16_run": false,
12 | "bf16_run": false,
13 | "lr_decay": 0.999875,
14 | "segment_size": 10240,
15 | "init_lr_ratio": 1,
16 | "warmup_epochs": 0,
17 | "c_mel": 45,
18 | "c_kl": 1.0,
19 | "use_sr": true,
20 | "max_speclen": 512,
21 | "port": "8001",
22 | "keep_ckpts": 3,
23 | "num_workers": 4,
24 | "log_version": 0,
25 | "ckpt_name_by_step": false,
26 | "accumulate_grad_batches": 1
27 | },
28 | "data": {
29 | "training_files": "filelists/44k/train.txt",
30 | "validation_files": "filelists/44k/val.txt",
31 | "max_wav_value": 32768.0,
32 | "sampling_rate": 44100,
33 | "filter_length": 2048,
34 | "hop_length": 512,
35 | "win_length": 2048,
36 | "n_mel_channels": 80,
37 | "mel_fmin": 0.0,
38 | "mel_fmax": 22050
39 | },
40 | "model": {
41 | "inter_channels": 192,
42 | "hidden_channels": 192,
43 | "filter_channels": 768,
44 | "n_heads": 2,
45 | "n_layers": 6,
46 | "kernel_size": 3,
47 | "p_dropout": 0.1,
48 | "resblock": "1",
49 | "resblock_kernel_sizes": [3, 7, 11],
50 | "resblock_dilation_sizes": [
51 | [1, 3, 5],
52 | [1, 3, 5],
53 | [1, 3, 5]
54 | ],
55 | "upsample_rates": [8, 8, 2, 2, 2],
56 | "upsample_initial_channel": 512,
57 | "upsample_kernel_sizes": [16, 16, 4, 4, 4],
58 | "n_layers_q": 3,
59 | "use_spectral_norm": false,
60 | "gin_channels": 256,
61 | "ssl_dim": 256,
62 | "n_speakers": 200,
63 | "pretrained": {
64 | "D_0.pth": "https://huggingface.co/therealvul/so-vits-svc-4.0-init/resolve/main/D_0.pth",
65 | "G_0.pth": "https://huggingface.co/therealvul/so-vits-svc-4.0-init/resolve/main/G_0.pth"
66 | }
67 | },
68 | "spk": {}
69 | }
70 |
--------------------------------------------------------------------------------
/src/so_vits_svc_fork/preprocessing/config_templates/so-vits-svc-4.0v1.json:
--------------------------------------------------------------------------------
1 | {
2 | "train": {
3 | "log_interval": 100,
4 | "eval_interval": 200,
5 | "seed": 1234,
6 | "epochs": 10000,
7 | "learning_rate": 0.0001,
8 | "betas": [0.8, 0.99],
9 | "eps": 1e-9,
10 | "batch_size": 16,
11 | "fp16_run": false,
12 | "bf16_run": false,
13 | "lr_decay": 0.999875,
14 | "segment_size": 10240,
15 | "init_lr_ratio": 1,
16 | "warmup_epochs": 0,
17 | "c_mel": 45,
18 | "c_kl": 1.0,
19 | "use_sr": true,
20 | "max_speclen": 512,
21 | "port": "8001",
22 | "keep_ckpts": 3,
23 | "num_workers": 4,
24 | "log_version": 0,
25 | "ckpt_name_by_step": false,
26 | "accumulate_grad_batches": 1
27 | },
28 | "data": {
29 | "training_files": "filelists/44k/train.txt",
30 | "validation_files": "filelists/44k/val.txt",
31 | "max_wav_value": 32768.0,
32 | "sampling_rate": 44100,
33 | "filter_length": 2048,
34 | "hop_length": 512,
35 | "win_length": 2048,
36 | "n_mel_channels": 80,
37 | "mel_fmin": 0.0,
38 | "mel_fmax": 22050,
39 | "contentvec_final_proj": false
40 | },
41 | "model": {
42 | "inter_channels": 192,
43 | "hidden_channels": 192,
44 | "filter_channels": 768,
45 | "n_heads": 2,
46 | "n_layers": 6,
47 | "kernel_size": 3,
48 | "p_dropout": 0.1,
49 | "resblock": "1",
50 | "resblock_kernel_sizes": [3, 7, 11],
51 | "resblock_dilation_sizes": [
52 | [1, 3, 5],
53 | [1, 3, 5],
54 | [1, 3, 5]
55 | ],
56 | "upsample_rates": [8, 8, 2, 2, 2],
57 | "upsample_initial_channel": 512,
58 | "upsample_kernel_sizes": [16, 16, 4, 4, 4],
59 | "n_layers_q": 3,
60 | "use_spectral_norm": false,
61 | "gin_channels": 256,
62 | "ssl_dim": 768,
63 | "n_speakers": 200,
64 | "type_": "hifi-gan",
65 | "pretrained": {
66 | "D_0.pth": "https://huggingface.co/datasets/ms903/sovits4.0-768vec-layer12/resolve/main/sovits_768l12_pre_large_320k/clean_D_320000.pth",
67 | "G_0.pth": "https://huggingface.co/datasets/ms903/sovits4.0-768vec-layer12/resolve/main/sovits_768l12_pre_large_320k/clean_G_320000.pth"
68 | }
69 | },
70 | "spk": {}
71 | }
72 |
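The three templates differ mainly in the decoder (type_ is "ms-istft" for QuickVC, "hifi-gan" here) and in the content-feature width (ssl_dim 768 here and for QuickVC versus 256 in the legacy template, which also omits the contentvec_final_proj flag). preprocess_config below copies a template verbatim and fills in spk and the file lists; a hedged sketch of doing the same by hand, with the output path as an assumption:

import json
from pathlib import Path

import so_vits_svc_fork.preprocessing as preprocessing

template_path = (
    Path(preprocessing.__file__).parent
    / "config_templates"
    / "so-vits-svc-4.0v1.json"
)
config = json.loads(template_path.read_text(encoding="utf-8"))
config["train"]["batch_size"] = 8  # e.g. shrink to fit a smaller GPU

out_path = Path("configs/44k/config.json")  # hypothetical destination
out_path.parent.mkdir(parents=True, exist_ok=True)
out_path.write_text(json.dumps(config, indent=2), encoding="utf-8")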
--------------------------------------------------------------------------------
/src/so_vits_svc_fork/preprocessing/preprocess_classify.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from logging import getLogger
4 | from pathlib import Path
5 |
6 | import keyboard
7 | import librosa
8 | import sounddevice as sd
9 | import soundfile as sf
10 | from rich.console import Console
11 | from tqdm.rich import tqdm
12 |
13 | LOG = getLogger(__name__)
14 |
15 |
16 | def preprocess_classify(
17 | input_dir: Path | str, output_dir: Path | str, create_new: bool = True
18 | ) -> None:
19 | # paths
20 | input_dir_ = Path(input_dir)
21 | output_dir_ = Path(output_dir)
22 | speed = 1
23 | if not input_dir_.is_dir():
24 | raise ValueError(f"{input_dir} is not a directory.")
25 | output_dir_.mkdir(exist_ok=True)
26 |
27 | console = Console()
28 | # get audio paths and folders
29 | audio_paths = list(input_dir_.glob("*.*"))
30 | last_folders = [x for x in output_dir_.glob("*") if x.is_dir()]
31 | console.print("Press ↑ or ↓ to change speed. Press any other key to classify.")
32 | console.print(f"Folders: {[x.name for x in last_folders]}")
33 |
34 | pbar_description = ""
35 |
36 | pbar = tqdm(audio_paths)
37 | for audio_path in pbar:
38 | # read file
39 | audio, sr = sf.read(audio_path)
40 |
41 | # update description
42 | duration = librosa.get_duration(y=audio, sr=sr)
43 | pbar_description = f"{duration:.1f} {pbar_description}"
44 | pbar.set_description(pbar_description)
45 |
46 | while True:
47 | # start playing
48 | sd.play(librosa.effects.time_stretch(audio, rate=speed), sr, loop=True)
49 |
50 | # wait for key press
51 | key = str(keyboard.read_key())
52 | if key == "down":
53 | speed /= 1.1
54 | console.print(f"Speed: {speed:.2f}")
55 | elif key == "up":
56 | speed *= 1.1
57 | console.print(f"Speed: {speed:.2f}")
58 | else:
59 | break
60 |
61 | # stop playing
62 | sd.stop()
63 |
64 | # print if folder changed
65 | folders = [x for x in output_dir_.glob("*") if x.is_dir()]
66 | if folders != last_folders:
67 | console.print(f"Folders updated: {[x.name for x in folders]}")
68 | last_folders = folders
69 |
70 | # get folder
71 | folder_candidates = [x for x in folders if x.name.startswith(key)]
72 | if len(folder_candidates) == 0:
73 | if create_new:
74 | folder = output_dir_ / key
75 | else:
76 | console.print(f"No folder starts with {key}.")
77 | continue
78 | else:
79 | if len(folder_candidates) > 1:
80 | LOG.warning(
81 | f"Multiple folders ({[x.name for x in folder_candidates]}) start with {key}. "
82 | f"Using first one ({folder_candidates[0].name})."
83 | )
84 | folder = folder_candidates[0]
85 | folder.mkdir(exist_ok=True)
86 |
87 | # move file
88 | new_path = folder / audio_path.name
89 | audio_path.rename(new_path)
90 |
91 | # update description
92 | pbar_description = f"Last: {audio_path.name} -> {folder.name}"
93 |
94 | # yield result
95 | # yield audio_path, key, folder, new_path
96 |
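A hedged usage sketch for the classifier above; the directory names are placeholders. Each keypress moves the current clip into the first folder under output_dir whose name starts with that key (creating it when create_new is set), while the arrow keys only change playback speed. Note that the keyboard package generally requires root privileges on Linux.

from so_vits_svc_fork.preprocessing.preprocess_classify import (
    preprocess_classify,
)

preprocess_classify("dataset_raw_raw", "dataset_raw", create_new=True)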
--------------------------------------------------------------------------------
/src/so_vits_svc_fork/preprocessing/preprocess_flist_config.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import json
4 | import os
5 | from copy import deepcopy
6 | from logging import getLogger
7 | from pathlib import Path
8 |
9 | import numpy as np
10 | from librosa import get_duration
11 | from tqdm import tqdm
12 |
13 | LOG = getLogger(__name__)
14 | CONFIG_TEMPLATE_DIR = Path(__file__).parent / "config_templates"
15 |
16 |
17 | def preprocess_config(
18 | input_dir: Path | str,
19 | train_list_path: Path | str,
20 | val_list_path: Path | str,
21 | test_list_path: Path | str,
22 | config_path: Path | str,
23 | config_name: str,
24 | ):
25 | input_dir = Path(input_dir)
26 | train_list_path = Path(train_list_path)
27 | val_list_path = Path(val_list_path)
28 | test_list_path = Path(test_list_path)
29 | config_path = Path(config_path)
30 | train = []
31 | val = []
32 | test = []
33 | spk_dict = {}
34 | spk_id = 0
35 | random = np.random.RandomState(1234)
36 | for speaker in os.listdir(input_dir):
37 | spk_dict[speaker] = spk_id
38 | spk_id += 1
39 | paths = []
40 | for path in tqdm(list((input_dir / speaker).rglob("*.wav"))):
41 | if get_duration(filename=path) < 0.3:
42 |                 LOG.warning(f"Skip {path} because it is too short.")
43 | continue
44 | paths.append(path)
45 | random.shuffle(paths)
46 | if len(paths) <= 4:
47 | raise ValueError(
48 | f"too few files in {input_dir / speaker} (expected at least 5)."
49 | )
50 | train += paths[2:-2]
51 | val += paths[:2]
52 | test += paths[-2:]
53 |
54 | LOG.info(f"Writing {train_list_path}")
55 | train_list_path.parent.mkdir(parents=True, exist_ok=True)
56 | train_list_path.write_text(
57 | "\n".join([x.as_posix() for x in train]), encoding="utf-8"
58 | )
59 |
60 | LOG.info(f"Writing {val_list_path}")
61 | val_list_path.parent.mkdir(parents=True, exist_ok=True)
62 | val_list_path.write_text("\n".join([x.as_posix() for x in val]), encoding="utf-8")
63 |
64 | LOG.info(f"Writing {test_list_path}")
65 | test_list_path.parent.mkdir(parents=True, exist_ok=True)
66 | test_list_path.write_text("\n".join([x.as_posix() for x in test]), encoding="utf-8")
67 |
68 | config = deepcopy(
69 | json.loads(
70 | (
71 | CONFIG_TEMPLATE_DIR
72 | / (
73 | config_name
74 | if config_name.endswith(".json")
75 | else config_name + ".json"
76 | )
77 | ).read_text(encoding="utf-8")
78 | )
79 | )
80 | config["spk"] = spk_dict
81 | config["data"]["training_files"] = train_list_path.as_posix()
82 | config["data"]["validation_files"] = val_list_path.as_posix()
83 | LOG.info(f"Writing {config_path}")
84 | config_path.parent.mkdir(parents=True, exist_ok=True)
85 | with config_path.open("w", encoding="utf-8") as f:
86 | json.dump(config, f, indent=2)
87 |
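Per speaker, the shuffled file list contributes its first two entries to val, its last two to test, and everything in between to train, which is why at least five files per speaker are required. A hedged invocation sketch; the paths are placeholders mirroring tests/test_main.py:

from so_vits_svc_fork.preprocessing.preprocess_flist_config import (
    preprocess_config,
)

preprocess_config(
    input_dir="dataset/44k",
    train_list_path="filelists/44k/train.txt",
    val_list_path="filelists/44k/val.txt",
    test_list_path="filelists/44k/test.txt",
    config_path="configs/44k/config.json",
    config_name="so-vits-svc-4.0v1",
)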
--------------------------------------------------------------------------------
/src/so_vits_svc_fork/preprocessing/preprocess_hubert_f0.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from logging import getLogger
4 | from pathlib import Path
5 | from random import shuffle
6 | from typing import Iterable, Literal
7 |
8 | import librosa
9 | import numpy as np
10 | import torch
11 | import torchaudio
12 | from joblib import Parallel, cpu_count, delayed
13 | from tqdm import tqdm
14 | from transformers import HubertModel
15 |
16 | import so_vits_svc_fork.f0
17 | from so_vits_svc_fork import utils
18 |
19 | from ..hparams import HParams
20 | from ..modules.mel_processing import spec_to_mel_torch, spectrogram_torch
21 | from ..utils import get_optimal_device, get_total_gpu_memory
22 | from .preprocess_utils import check_hubert_min_duration
23 |
24 | LOG = getLogger(__name__)
25 | HUBERT_MEMORY = 2900
26 | HUBERT_MEMORY_CREPE = 3900
27 |
28 |
29 | def _process_one(
30 | *,
31 | filepath: Path,
32 | content_model: HubertModel,
33 | device: torch.device | str = get_optimal_device(),
34 | f0_method: Literal["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"] = "dio",
35 | force_rebuild: bool = False,
36 | hps: HParams,
37 | ):
38 | audio, sr = librosa.load(filepath, sr=hps.data.sampling_rate, mono=True)
39 |
40 | if not check_hubert_min_duration(audio, sr):
41 | LOG.info(f"Skip {filepath} because it is too short.")
42 | return
43 |
44 | data_path = filepath.parent / (filepath.name + ".data.pt")
45 | if data_path.exists() and not force_rebuild:
46 | return
47 |
48 | # Compute f0
49 | f0 = so_vits_svc_fork.f0.compute_f0(
50 | audio, sampling_rate=sr, hop_length=hps.data.hop_length, method=f0_method
51 | )
52 | f0, uv = so_vits_svc_fork.f0.interpolate_f0(f0)
53 | f0 = torch.from_numpy(f0).float()
54 | uv = torch.from_numpy(uv).float()
55 |
56 | # Compute HuBERT content
57 | audio = torch.from_numpy(audio).float().to(device)
58 | c = utils.get_content(
59 | content_model,
60 | audio,
61 | device,
62 | sr=sr,
63 | legacy_final_proj=hps.data.get("contentvec_final_proj", True),
64 | )
65 | c = utils.repeat_expand_2d(c.squeeze(0), f0.shape[0])
66 | torch.cuda.empty_cache()
67 |
68 | # Compute spectrogram
69 | audio, sr = torchaudio.load(filepath)
70 | spec = spectrogram_torch(audio, hps).squeeze(0)
71 | mel_spec = spec_to_mel_torch(spec, hps)
72 | torch.cuda.empty_cache()
73 |
74 | # fix lengths
75 | lmin = min(spec.shape[1], mel_spec.shape[1], f0.shape[0], uv.shape[0], c.shape[1])
76 | spec, mel_spec, f0, uv, c = (
77 | spec[:, :lmin],
78 | mel_spec[:, :lmin],
79 | f0[:lmin],
80 | uv[:lmin],
81 | c[:, :lmin],
82 | )
83 |
84 | # get speaker id
85 | spk_name = filepath.parent.name
86 | spk = hps.spk.__dict__[spk_name]
87 | spk = torch.tensor(spk).long()
88 | assert (
89 | spec.shape[1] == mel_spec.shape[1] == f0.shape[0] == uv.shape[0] == c.shape[1]
90 | ), (spec.shape, mel_spec.shape, f0.shape, uv.shape, c.shape)
91 | data = {
92 | "spec": spec,
93 | "mel_spec": mel_spec,
94 | "f0": f0,
95 | "uv": uv,
96 | "content": c,
97 | "audio": audio,
98 | "spk": spk,
99 | }
100 | data = {k: v.cpu() for k, v in data.items()}
101 | with data_path.open("wb") as f:
102 | torch.save(data, f)
103 |
104 |
105 | def _process_batch(filepaths: Iterable[Path], pbar_position: int, **kwargs):
106 | hps = kwargs["hps"]
107 | content_model = utils.get_hubert_model(
108 | get_optimal_device(), hps.data.get("contentvec_final_proj", True)
109 | )
110 |
111 | for filepath in tqdm(filepaths, position=pbar_position):
112 | _process_one(
113 | content_model=content_model,
114 | filepath=filepath,
115 | **kwargs,
116 | )
117 |
118 |
119 | def preprocess_hubert_f0(
120 | input_dir: Path | str,
121 | config_path: Path | str,
122 | n_jobs: int | None = None,
123 | f0_method: Literal["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"] = "dio",
124 | force_rebuild: bool = False,
125 | ):
126 | input_dir = Path(input_dir)
127 | config_path = Path(config_path)
128 | hps = utils.get_hparams(config_path)
129 | if n_jobs is None:
130 |         # cap n_jobs at cpu_count() to avoid SIGKILL from oversubscription
131 | memory = get_total_gpu_memory("total")
132 | n_jobs = min(
133 | max(
134 | (
135 | memory
136 | // (HUBERT_MEMORY_CREPE if f0_method == "crepe" else HUBERT_MEMORY)
137 | if memory is not None
138 | else 1
139 | ),
140 | 1,
141 | ),
142 | cpu_count(),
143 | )
144 | LOG.info(f"n_jobs automatically set to {n_jobs}, memory: {memory} MiB")
145 |
146 | filepaths = list(input_dir.rglob("*.wav"))
147 | n_jobs = min(len(filepaths) // 16 + 1, n_jobs)
148 | shuffle(filepaths)
149 | filepath_chunks = np.array_split(filepaths, n_jobs)
150 | Parallel(n_jobs=n_jobs)(
151 | delayed(_process_batch)(
152 | filepaths=chunk,
153 | pbar_position=pbar_position,
154 | f0_method=f0_method,
155 | force_rebuild=force_rebuild,
156 | hps=hps,
157 | )
158 | for (pbar_position, chunk) in enumerate(filepath_chunks)
159 | )
160 |
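When n_jobs is not given, preprocess_hubert_f0 divides total GPU memory by an assumed per-worker footprint (2900 MiB, or 3900 MiB when f0_method is "crepe"), then caps the result at cpu_count() and at roughly one worker per 16 files. A hedged arithmetic sketch, assuming a 12 GiB GPU:

HUBERT_MEMORY, HUBERT_MEMORY_CREPE = 2900, 3900  # MiB per worker, from above

memory = 12288  # assumed total GPU memory in MiB
print(max(memory // HUBERT_MEMORY, 1))        # 4 workers for dio/harvest
print(max(memory // HUBERT_MEMORY_CREPE, 1))  # 3 workers for crepe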
--------------------------------------------------------------------------------
/src/so_vits_svc_fork/preprocessing/preprocess_resample.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import warnings
4 | from logging import getLogger
5 | from pathlib import Path
6 | from typing import Iterable
7 |
8 | import librosa
9 | import soundfile
10 | from joblib import Parallel, delayed
11 | from tqdm_joblib import tqdm_joblib
12 |
13 | from .preprocess_utils import check_hubert_min_duration
14 |
15 | LOG = getLogger(__name__)
16 |
17 | # Convert audio files from input_dir to output_dir without changing the
18 | # folder structure; both directories must already exist. Conversion is
19 | # parallelized with joblib and includes:
20 | # - resampling to the specified sampling rate
21 | # - trimming silence
22 | # - adjusting volume via peak normalization
23 | # - saving as a 16-bit wav file
24 | # See _preprocess_one below for the per-file implementation.
25 |
26 |
27 | def _get_unique_filename(path: Path, existing_paths: Iterable[Path]) -> Path:
28 | """Return a unique path by appending a number to the original path."""
29 | if path not in existing_paths:
30 | return path
31 | i = 1
32 | while True:
33 | new_path = path.parent / f"{path.stem}_{i}{path.suffix}"
34 | if new_path not in existing_paths:
35 | return new_path
36 | i += 1
37 |
38 |
39 | def is_relative_to(path: Path, *other):
40 | """Return True if the path is relative to another path or False.
41 | Python 3.9+ has Path.is_relative_to() method, but we need to support Python 3.8.
42 | """
43 | try:
44 | path.relative_to(*other)
45 | return True
46 | except ValueError:
47 | return False
48 |
49 |
50 | def _preprocess_one(
51 | input_path: Path,
52 | output_path: Path,
53 | sr: int,
54 | *,
55 | top_db: int,
56 | frame_seconds: float,
57 | hop_seconds: float,
58 | ) -> None:
59 | """Preprocess one audio file."""
60 |
61 | try:
62 | audio, sr = librosa.load(input_path, sr=sr, mono=True)
63 |
64 |     # audioread is the last backend librosa will try, so any decoding failure surfaces here
65 | except Exception as e:
66 | # Failure due to attempting to load a file that is not audio, so return early
67 | LOG.warning(f"Failed to load {input_path} due to {e}")
68 | return
69 |
70 | if not check_hubert_min_duration(audio, sr):
71 | LOG.info(f"Skip {input_path} because it is too short.")
72 | return
73 |
74 | # Adjust volume
75 | audio /= max(audio.max(), -audio.min())
76 |
77 | # Trim silence
78 | audio, _ = librosa.effects.trim(
79 | audio,
80 | top_db=top_db,
81 | frame_length=int(frame_seconds * sr),
82 | hop_length=int(hop_seconds * sr),
83 | )
84 |
85 | if not check_hubert_min_duration(audio, sr):
86 | LOG.info(f"Skip {input_path} because it is too short.")
87 | return
88 |
89 | soundfile.write(output_path, audio, samplerate=sr, subtype="PCM_16")
90 |
91 |
92 | def preprocess_resample(
93 | input_dir: Path | str,
94 | output_dir: Path | str,
95 | sampling_rate: int,
96 | n_jobs: int = -1,
97 | *,
98 | top_db: int = 30,
99 | frame_seconds: float = 0.1,
100 | hop_seconds: float = 0.05,
101 | ) -> None:
102 |     """Preprocess audio files in input_dir and save them to output_dir."""
103 |     input_dir = Path(input_dir)
104 |     output_dir = Path(output_dir)
105 |
106 | out_paths = []
107 | in_paths = list(input_dir.rglob("*.*"))
108 | if not in_paths:
109 | raise ValueError(f"No audio files found in {input_dir}")
110 | for in_path in in_paths:
111 | in_path_relative = in_path.relative_to(input_dir)
112 | if not in_path.is_absolute() and is_relative_to(
113 | in_path, Path("dataset_raw") / "44k"
114 | ):
115 | new_in_path_relative = in_path_relative.relative_to("44k")
116 | warnings.warn(
117 |                 "The recommended folder structure has changed since v1.0.0. "
118 |                 "Please move your dataset directly under the dataset_raw folder. "
119 |                 f"Recognized {in_path_relative} as {new_in_path_relative}"
120 | )
121 | in_path_relative = new_in_path_relative
122 |
123 | if len(in_path_relative.parts) < 2:
124 | continue
125 | speaker_name = in_path_relative.parts[0]
126 | file_name = in_path_relative.with_suffix(".wav").name
127 | out_path = output_dir / speaker_name / file_name
128 | out_path = _get_unique_filename(out_path, out_paths)
129 | out_path.parent.mkdir(parents=True, exist_ok=True)
130 | out_paths.append(out_path)
131 |
132 | in_and_out_paths = list(zip(in_paths, out_paths))
133 |
134 | with tqdm_joblib(desc="Preprocessing", total=len(in_and_out_paths)):
135 | Parallel(n_jobs=n_jobs)(
136 | delayed(_preprocess_one)(
137 | *args,
138 | sr=sampling_rate,
139 | top_db=top_db,
140 | frame_seconds=frame_seconds,
141 | hop_seconds=hop_seconds,
142 | )
143 | for args in in_and_out_paths
144 | )
145 |
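A hedged usage sketch matching the defaults above; the paths are illustrative. input_dir must contain one subfolder per speaker (files placed directly at the top level are skipped by the len(in_path_relative.parts) < 2 check), and outputs land as 16-bit PCM WAVs under output_dir/<speaker>/.

from so_vits_svc_fork.preprocessing.preprocess_resample import (
    preprocess_resample,
)

preprocess_resample("dataset_raw", "dataset/44k", sampling_rate=44100)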
--------------------------------------------------------------------------------
/src/so_vits_svc_fork/preprocessing/preprocess_speaker_diarization.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from collections import defaultdict
4 | from logging import getLogger
5 | from pathlib import Path
6 |
7 | import librosa
8 | import soundfile as sf
9 | import torch
10 | from joblib import Parallel, delayed
11 | from pyannote.audio import Pipeline
12 | from tqdm import tqdm
13 | from tqdm_joblib import tqdm_joblib
14 |
15 | LOG = getLogger(__name__)
16 |
17 |
18 | def _process_one(
19 | input_path: Path,
20 | output_dir: Path,
21 | sr: int,
22 | *,
23 | min_speakers: int = 1,
24 | max_speakers: int = 1,
25 | huggingface_token: str | None = None,
26 | ) -> None:
27 | try:
28 | audio, sr = librosa.load(input_path, sr=sr, mono=True)
29 | except Exception as e:
30 | LOG.warning(f"Failed to read {input_path}: {e}")
31 | return
32 | pipeline = Pipeline.from_pretrained(
33 | "pyannote/speaker-diarization-3.1", use_auth_token=huggingface_token
34 | )
35 | if pipeline is None:
36 | raise ValueError("Failed to load pipeline")
37 | pipeline = pipeline.to(torch.device("cuda"))
38 | LOG.info(f"Processing {input_path}. This may take a while...")
39 | diarization = pipeline(
40 | input_path, min_speakers=min_speakers, max_speakers=max_speakers
41 | )
42 |
43 | LOG.info(f"Found {len(diarization)} tracks, writing to {output_dir}")
44 | speaker_count = defaultdict(int)
45 |
46 | output_dir.mkdir(parents=True, exist_ok=True)
47 | for segment, track, speaker in tqdm(
48 | list(diarization.itertracks(yield_label=True)), desc=f"Writing {input_path}"
49 | ):
50 | if segment.end - segment.start < 1:
51 | continue
52 | speaker_count[speaker] += 1
53 | audio_cut = audio[int(segment.start * sr) : int(segment.end * sr)]
54 | sf.write(
55 | (output_dir / f"{speaker}_{speaker_count[speaker]:04d}.wav"),
56 | audio_cut,
57 | sr,
58 | )
59 |
60 | LOG.info(f"Speaker count: {speaker_count}")
61 |
62 |
63 | def preprocess_speaker_diarization(
64 | input_dir: Path | str,
65 | output_dir: Path | str,
66 | sr: int,
67 | *,
68 | min_speakers: int = 1,
69 | max_speakers: int = 1,
70 | huggingface_token: str | None = None,
71 | n_jobs: int = -1,
72 | ) -> None:
73 | if huggingface_token is not None and not huggingface_token.startswith("hf_"):
74 |         LOG.warning("Hugging Face tokens usually start with 'hf_'; check your token.")
75 | if not torch.cuda.is_available():
76 | LOG.warning("CUDA is not available. This will be extremely slow.")
77 | input_dir = Path(input_dir)
78 | output_dir = Path(output_dir)
79 | input_dir.mkdir(parents=True, exist_ok=True)
80 | output_dir.mkdir(parents=True, exist_ok=True)
81 | input_paths = list(input_dir.rglob("*.*"))
82 | with tqdm_joblib(desc="Preprocessing speaker diarization", total=len(input_paths)):
83 | Parallel(n_jobs=n_jobs)(
84 | delayed(_process_one)(
85 | input_path,
86 | output_dir / input_path.relative_to(input_dir).parent / input_path.stem,
87 | sr,
88 | max_speakers=max_speakers,
89 | min_speakers=min_speakers,
90 | huggingface_token=huggingface_token,
91 | )
92 | for input_path in input_paths
93 | )
94 |
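A hedged sketch of the diarization step; it assumes a CUDA GPU and a Hugging Face token that has accepted the gating terms of pyannote/speaker-diarization-3.1. The token string and paths are placeholders.

from so_vits_svc_fork.preprocessing.preprocess_speaker_diarization import (
    preprocess_speaker_diarization,
)

preprocess_speaker_diarization(
    "dataset_raw_raw",
    "dataset_raw",
    sr=44100,
    min_speakers=1,
    max_speakers=3,
    huggingface_token="hf_xxx",  # placeholder; supply a real token
)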
--------------------------------------------------------------------------------
/src/so_vits_svc_fork/preprocessing/preprocess_split.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from logging import getLogger
4 | from pathlib import Path
5 |
6 | import librosa
7 | import soundfile as sf
8 | from joblib import Parallel, delayed
9 | from tqdm import tqdm
10 | from tqdm_joblib import tqdm_joblib
11 |
12 | LOG = getLogger(__name__)
13 |
14 |
15 | def _process_one(
16 | input_path: Path,
17 | output_dir: Path,
18 | sr: int,
19 | *,
20 | max_length: float = 10.0,
21 | top_db: int = 30,
22 | frame_seconds: float = 0.5,
23 | hop_seconds: float = 0.1,
24 | ):
25 | try:
26 | audio, sr = librosa.load(input_path, sr=sr, mono=True)
27 | except Exception as e:
28 | LOG.warning(f"Failed to read {input_path}: {e}")
29 | return
30 | intervals = librosa.effects.split(
31 | audio,
32 | top_db=top_db,
33 | frame_length=int(sr * frame_seconds),
34 | hop_length=int(sr * hop_seconds),
35 | )
36 | output_dir.mkdir(parents=True, exist_ok=True)
37 | for start, end in tqdm(intervals, desc=f"Writing {input_path}"):
38 | for sub_start in range(start, end, int(sr * max_length)):
39 | sub_end = min(sub_start + int(sr * max_length), end)
40 | audio_cut = audio[sub_start:sub_end]
41 | sf.write(
42 | (
43 | output_dir
44 | / f"{input_path.stem}_{sub_start / sr:.3f}_{sub_end / sr:.3f}.wav"
45 | ),
46 | audio_cut,
47 | sr,
48 | )
49 |
50 |
51 | def preprocess_split(
52 | input_dir: Path | str,
53 | output_dir: Path | str,
54 | sr: int,
55 | *,
56 | max_length: float = 10.0,
57 | top_db: int = 30,
58 | frame_seconds: float = 0.5,
59 | hop_seconds: float = 0.1,
60 | n_jobs: int = -1,
61 | ):
62 | input_dir = Path(input_dir)
63 | output_dir = Path(output_dir)
64 | output_dir.mkdir(parents=True, exist_ok=True)
65 | input_paths = list(input_dir.rglob("*.*"))
66 | with tqdm_joblib(desc="Splitting", total=len(input_paths)):
67 | Parallel(n_jobs=n_jobs)(
68 | delayed(_process_one)(
69 | input_path,
70 | output_dir / input_path.relative_to(input_dir).parent,
71 | sr,
72 | max_length=max_length,
73 | top_db=top_db,
74 | frame_seconds=frame_seconds,
75 | hop_seconds=hop_seconds,
76 | )
77 | for input_path in input_paths
78 | )
79 |
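A hedged usage sketch: split long recordings on silence into chunks of at most max_length seconds, preserving the relative folder structure; the paths are placeholders.

from so_vits_svc_fork.preprocessing.preprocess_split import preprocess_split

preprocess_split("dataset_raw_raw", "dataset_raw", sr=44100, max_length=10.0)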
--------------------------------------------------------------------------------
/src/so_vits_svc_fork/preprocessing/preprocess_utils.py:
--------------------------------------------------------------------------------
1 | from numpy import ndarray
2 |
3 |
4 | def check_hubert_min_duration(audio: ndarray, sr: int) -> bool:
5 | return len(audio) / sr >= 0.3
6 |
--------------------------------------------------------------------------------
/src/so_vits_svc_fork/py.typed:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/voicepaw/so-vits-svc-fork/0f015e32aada5cf7481f91bbe6758e574c9c5f39/src/so_vits_svc_fork/py.typed
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/voicepaw/so-vits-svc-fork/0f015e32aada5cf7481f91bbe6758e574c9c5f39/tests/__init__.py
--------------------------------------------------------------------------------
/tests/dataset_raw/test/LJ001-0001.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/voicepaw/so-vits-svc-fork/0f015e32aada5cf7481f91bbe6758e574c9c5f39/tests/dataset_raw/test/LJ001-0001.wav
--------------------------------------------------------------------------------
/tests/dataset_raw/test/LJ001-0002.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/voicepaw/so-vits-svc-fork/0f015e32aada5cf7481f91bbe6758e574c9c5f39/tests/dataset_raw/test/LJ001-0002.wav
--------------------------------------------------------------------------------
/tests/dataset_raw/test/LJ001-0003.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/voicepaw/so-vits-svc-fork/0f015e32aada5cf7481f91bbe6758e574c9c5f39/tests/dataset_raw/test/LJ001-0003.wav
--------------------------------------------------------------------------------
/tests/dataset_raw/test/LJ001-0004.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/voicepaw/so-vits-svc-fork/0f015e32aada5cf7481f91bbe6758e574c9c5f39/tests/dataset_raw/test/LJ001-0004.wav
--------------------------------------------------------------------------------
/tests/dataset_raw/test/LJ001-0005.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/voicepaw/so-vits-svc-fork/0f015e32aada5cf7481f91bbe6758e574c9c5f39/tests/dataset_raw/test/LJ001-0005.wav
--------------------------------------------------------------------------------
/tests/dataset_raw/test/LJ001-0006.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/voicepaw/so-vits-svc-fork/0f015e32aada5cf7481f91bbe6758e574c9c5f39/tests/dataset_raw/test/LJ001-0006.wav
--------------------------------------------------------------------------------
/tests/dataset_raw/test/LJ001-0007.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/voicepaw/so-vits-svc-fork/0f015e32aada5cf7481f91bbe6758e574c9c5f39/tests/dataset_raw/test/LJ001-0007.wav
--------------------------------------------------------------------------------
/tests/dataset_raw/test/LJ001-0008.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/voicepaw/so-vits-svc-fork/0f015e32aada5cf7481f91bbe6758e574c9c5f39/tests/dataset_raw/test/LJ001-0008.wav
--------------------------------------------------------------------------------
/tests/dataset_raw/test/LJ001-0009.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/voicepaw/so-vits-svc-fork/0f015e32aada5cf7481f91bbe6758e574c9c5f39/tests/dataset_raw/test/LJ001-0009.wav
--------------------------------------------------------------------------------
/tests/dataset_raw/test/LJ001-0010.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/voicepaw/so-vits-svc-fork/0f015e32aada5cf7481f91bbe6758e574c9c5f39/tests/dataset_raw/test/LJ001-0010.wav
--------------------------------------------------------------------------------
/tests/dataset_raw/test/nested/LJ001-0001.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/voicepaw/so-vits-svc-fork/0f015e32aada5cf7481f91bbe6758e574c9c5f39/tests/dataset_raw/test/nested/LJ001-0001.wav
--------------------------------------------------------------------------------
/tests/dataset_raw/test/nested/に.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/voicepaw/so-vits-svc-fork/0f015e32aada5cf7481f91bbe6758e574c9c5f39/tests/dataset_raw/test/nested/に.wav
--------------------------------------------------------------------------------
/tests/test_main.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 | from pathlib import Path
4 | from unittest import SkipTest, TestCase
5 |
6 | IS_CI = os.environ.get("GITHUB_ACTIONS", False)
7 | IS_COLAB = os.getenv("COLAB_RELEASE_TAG", False)
8 |
9 |
10 | class TestMain(TestCase):
11 | def test_import(self):
12 | import so_vits_svc_fork.cluster.train_cluster # noqa
13 | import so_vits_svc_fork.inference.main # noqa
14 |
15 | # import so_vits_svc_fork.modules.onnx._export # noqa
16 | import so_vits_svc_fork.preprocessing.preprocess_flist_config # noqa
17 | import so_vits_svc_fork.preprocessing.preprocess_hubert_f0 # noqa
18 | import so_vits_svc_fork.preprocessing.preprocess_resample # noqa
19 | import so_vits_svc_fork.preprocessing.preprocess_split # noqa
20 | import so_vits_svc_fork.train # noqa
21 |
22 | def test_infer(self):
23 | if IS_CI:
24 | raise SkipTest("Skip inference test on CI")
25 | from so_vits_svc_fork.inference.main import infer # noqa
26 |
27 | # infer("tests/dataset_raw/34j/1.wav", "tests/configs/config.json", "tests/logs/44k")
28 |
29 | def test_preprocess(self):
30 | from so_vits_svc_fork.preprocessing.preprocess_resample import (
31 | preprocess_resample,
32 | )
33 |
34 | preprocess_resample(
35 | "tests/dataset_raw", "tests/dataset/44k", 44100, n_jobs=1 if IS_CI else -1
36 | )
37 |
38 | from so_vits_svc_fork.preprocessing.preprocess_flist_config import (
39 | preprocess_config,
40 | )
41 |
42 | preprocess_config(
43 | "tests/dataset/44k",
44 | "tests/filelists/train.txt",
45 | "tests/filelists/val.txt",
46 | "tests/filelists/test.txt",
47 | "tests/configs/44k/config.json",
48 | "so-vits-svc-4.0v1",
49 | )
50 |
51 | if IS_CI:
52 | raise SkipTest("Skip hubert and f0 test on CI")
53 | from so_vits_svc_fork.preprocessing.preprocess_hubert_f0 import (
54 | preprocess_hubert_f0,
55 | )
56 |
57 | preprocess_hubert_f0("tests/dataset/44k", "tests/configs/44k/config.json")
58 |
59 | def test_train(self):
60 | if not IS_COLAB:
61 | raise SkipTest("Skip training test on non-colab")
62 |         # requires >10 GB of GPU memory, so it can only be tested on Colab
63 | from so_vits_svc_fork.train import train
64 |
65 | config_path = Path("tests/logs/44k/config.json")
66 | config_json = json.loads(config_path.read_text("utf-8"))
67 | config_json["train"]["epochs"] = 1
68 | config_path.write_text(json.dumps(config_json), "utf-8")
69 | train(config_path, "tests/logs/44k")
70 |
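The suite gates its heavy paths on environment variables: inference and HuBERT/f0 extraction are skipped when GITHUB_ACTIONS is set, and training runs only when COLAB_RELEASE_TAG indicates a Colab session. A hedged sketch for running just the lightweight preprocessing test locally:

import pytest

# Select a single test by keyword; pytest.main returns the exit code.
raise SystemExit(pytest.main(["tests/test_main.py", "-k", "test_preprocess"]))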
--------------------------------------------------------------------------------