├── .all-contributorsrc ├── .copier-answers.yml ├── .dockerignore ├── .editorconfig ├── .flake8 ├── .github ├── CODE_OF_CONDUCT.md ├── FUNDING.yml ├── ISSUE_TEMPLATE │ ├── 1-bug_report.yml │ ├── 2-feature-request.yml │ └── config.yml ├── PULL_REQUEST_TEMPLATE.md ├── labels.toml └── workflows │ ├── ci.yml │ ├── hacktoberfest.yml │ ├── issue-manager.yml │ ├── labels.yml │ └── poetry-upgrade.yml ├── .gitignore ├── .gitpod.yml ├── .idea ├── inspectionProfiles │ ├── Project_Default.xml │ └── profiles_settings.xml ├── misc.xml ├── modules.xml ├── so-vits-svc-fork.iml ├── vcs.xml ├── watcherTasks.xml └── workspace.xml ├── .pre-commit-config.yaml ├── .readthedocs.yml ├── CHANGELOG.md ├── CONTRIBUTING.md ├── Dockerfile ├── LICENSE ├── README.md ├── README_zh_CN.md ├── commitlint.config.js ├── docs ├── Makefile ├── _static │ ├── .gitkeep │ └── gui.png ├── changelog.md ├── conf.py ├── contributing.md ├── index.md ├── installation.md ├── make.bat └── usage.md ├── easy-installation ├── install-cn.bat └── install.bat ├── notebooks └── so-vits-svc-fork-4.0.ipynb ├── poetry.lock ├── pyproject.toml ├── renovate.json ├── setup.py ├── src └── so_vits_svc_fork │ ├── __init__.py │ ├── __main__.py │ ├── cluster │ ├── __init__.py │ └── train_cluster.py │ ├── dataset.py │ ├── default_gui_presets.json │ ├── f0.py │ ├── gui.py │ ├── hparams.py │ ├── inference │ ├── __init__.py │ ├── core.py │ └── main.py │ ├── logger.py │ ├── modules │ ├── __init__.py │ ├── attentions.py │ ├── commons.py │ ├── decoders │ │ ├── __init__.py │ │ ├── f0.py │ │ ├── hifigan │ │ │ ├── __init__.py │ │ │ ├── _models.py │ │ │ └── _utils.py │ │ └── mb_istft │ │ │ ├── __init__.py │ │ │ ├── _generators.py │ │ │ ├── _loss.py │ │ │ ├── _pqmf.py │ │ │ ├── _stft.py │ │ │ └── _stft_loss.py │ ├── descriminators.py │ ├── encoders.py │ ├── flows.py │ ├── losses.py │ ├── mel_processing.py │ ├── modules.py │ └── synthesizers.py │ ├── preprocessing │ ├── __init__.py │ ├── config_templates │ │ ├── quickvc.json │ │ ├── so-vits-svc-4.0v1-legacy.json │ │ └── so-vits-svc-4.0v1.json │ ├── preprocess_classify.py │ ├── preprocess_flist_config.py │ ├── preprocess_hubert_f0.py │ ├── preprocess_resample.py │ ├── preprocess_speaker_diarization.py │ ├── preprocess_split.py │ └── preprocess_utils.py │ ├── py.typed │ ├── train.py │ └── utils.py └── tests ├── __init__.py ├── dataset_raw └── test │ ├── LJ001-0001.wav │ ├── LJ001-0002.wav │ ├── LJ001-0003.wav │ ├── LJ001-0004.wav │ ├── LJ001-0005.wav │ ├── LJ001-0006.wav │ ├── LJ001-0007.wav │ ├── LJ001-0008.wav │ ├── LJ001-0009.wav │ ├── LJ001-0010.wav │ └── nested │ ├── LJ001-0001.wav │ └── に.wav └── test_main.py /.all-contributorsrc: -------------------------------------------------------------------------------- 1 | { 2 | "projectName": "so-vits-svc-fork", 3 | "projectOwner": "voicepaw", 4 | "repoType": "github", 5 | "repoHost": "https://github.com", 6 | "files": ["README.md"], 7 | "imageSize": 80, 8 | "commit": true, 9 | "commitConvention": "angular", 10 | "contributors": [ 11 | { 12 | "login": "34j", 13 | "name": "34j", 14 | "avatar_url": "https://avatars.githubusercontent.com/u/55338215?v=4", 15 | "profile": "https://github.com/34j", 16 | "contributions": [ 17 | "code", 18 | "ideas", 19 | "doc", 20 | "example", 21 | "infra", 22 | "maintenance", 23 | "review", 24 | "test", 25 | "tutorial", 26 | "promotion", 27 | "bug" 28 | ] 29 | }, 30 | { 31 | "login": "GarrettConway", 32 | "name": "GarrettConway", 33 | "avatar_url": "https://avatars.githubusercontent.com/u/22782004?v=4", 34 | "profile": 
"https://github.com/GarrettConway", 35 | "contributions": ["code", "bug", "doc", "review"] 36 | }, 37 | { 38 | "login": "BlueAmulet", 39 | "name": "BlueAmulet", 40 | "avatar_url": "https://avatars.githubusercontent.com/u/43395286?v=4", 41 | "profile": "https://github.com/BlueAmulet", 42 | "contributions": ["ideas", "question", "code", "maintenance"] 43 | }, 44 | { 45 | "login": "ThrowawayAccount01", 46 | "name": "ThrowawayAccount01", 47 | "avatar_url": "https://avatars.githubusercontent.com/u/125531852?v=4", 48 | "profile": "https://github.com/ThrowawayAccount01", 49 | "contributions": ["bug"] 50 | }, 51 | { 52 | "login": "MashiroSA", 53 | "name": "緋", 54 | "avatar_url": "https://avatars.githubusercontent.com/u/40637516?v=4", 55 | "profile": "https://github.com/MashiroSA", 56 | "contributions": ["doc", "bug"] 57 | }, 58 | { 59 | "login": "Lordmau5", 60 | "name": "Lordmau5", 61 | "avatar_url": "https://avatars.githubusercontent.com/u/1345036?v=4", 62 | "profile": "https://github.com/Lordmau5", 63 | "contributions": [ 64 | "bug", 65 | "code", 66 | "ideas", 67 | "maintenance", 68 | "question", 69 | "userTesting" 70 | ] 71 | }, 72 | { 73 | "login": "DL909", 74 | "name": "DL909", 75 | "avatar_url": "https://avatars.githubusercontent.com/u/71912115?v=4", 76 | "profile": "https://github.com/DL909", 77 | "contributions": ["bug"] 78 | }, 79 | { 80 | "login": "Satisfy256", 81 | "name": "Satisfy256", 82 | "avatar_url": "https://avatars.githubusercontent.com/u/101394399?v=4", 83 | "profile": "https://github.com/Satisfy256", 84 | "contributions": ["bug"] 85 | }, 86 | { 87 | "login": "pierluigizagaria", 88 | "name": "Pierluigi Zagaria", 89 | "avatar_url": "https://avatars.githubusercontent.com/u/57801386?v=4", 90 | "profile": "https://github.com/pierluigizagaria", 91 | "contributions": ["userTesting"] 92 | }, 93 | { 94 | "login": "ruckusmattster", 95 | "name": "ruckusmattster", 96 | "avatar_url": "https://avatars.githubusercontent.com/u/77196088?v=4", 97 | "profile": "https://github.com/ruckusmattster", 98 | "contributions": ["bug"] 99 | }, 100 | { 101 | "login": "Desuka-art", 102 | "name": "Desuka-art", 103 | "avatar_url": "https://avatars.githubusercontent.com/u/111822082?v=4", 104 | "profile": "https://github.com/Desuka-art", 105 | "contributions": ["bug"] 106 | }, 107 | { 108 | "login": "heyfixit", 109 | "name": "heyfixit", 110 | "avatar_url": "https://avatars.githubusercontent.com/u/41658450?v=4", 111 | "profile": "https://github.com/heyfixit", 112 | "contributions": ["doc"] 113 | }, 114 | { 115 | "login": "nerdyrodent", 116 | "name": "Nerdy Rodent", 117 | "avatar_url": "https://avatars.githubusercontent.com/u/74688049?v=4", 118 | "profile": "https://www.youtube.com/c/NerdyRodent", 119 | "contributions": ["video"] 120 | }, 121 | { 122 | "login": "xieyumc", 123 | "name": "谢宇", 124 | "avatar_url": "https://avatars.githubusercontent.com/u/47858007?v=4", 125 | "profile": "https://github.com/xieyumc", 126 | "contributions": ["doc"] 127 | }, 128 | { 129 | "login": "ColdCawfee", 130 | "name": "ColdCawfee", 131 | "avatar_url": "https://avatars.githubusercontent.com/u/79474598?v=4", 132 | "profile": "https://github.com/ColdCawfee", 133 | "contributions": ["bug"] 134 | }, 135 | { 136 | "login": "sbersier", 137 | "name": "sbersier", 138 | "avatar_url": "https://avatars.githubusercontent.com/u/34165937?v=4", 139 | "profile": "https://github.com/sbersier", 140 | "contributions": ["ideas", "userTesting", "bug"] 141 | }, 142 | { 143 | "login": "Meldoner", 144 | "name": "Meldoner", 145 | "avatar_url": 
"https://avatars.githubusercontent.com/u/43951115?v=4", 146 | "profile": "https://github.com/Meldoner", 147 | "contributions": ["bug", "ideas", "code"] 148 | }, 149 | { 150 | "login": "mmodeusher", 151 | "name": "mmodeusher", 152 | "avatar_url": "https://avatars.githubusercontent.com/u/46575920?v=4", 153 | "profile": "https://github.com/mmodeusher", 154 | "contributions": ["bug"] 155 | }, 156 | { 157 | "login": "AlonDan", 158 | "name": "AlonDan", 159 | "avatar_url": "https://avatars.githubusercontent.com/u/21152334?v=4", 160 | "profile": "https://github.com/AlonDan", 161 | "contributions": ["bug"] 162 | }, 163 | { 164 | "login": "Likkkez", 165 | "name": "Likkkez", 166 | "avatar_url": "https://avatars.githubusercontent.com/u/44336181?v=4", 167 | "profile": "https://github.com/Likkkez", 168 | "contributions": ["bug"] 169 | }, 170 | { 171 | "login": "DuctTapeGames", 172 | "name": "Duct Tape Games", 173 | "avatar_url": "https://avatars.githubusercontent.com/u/84365142?v=4", 174 | "profile": "https://github.com/DuctTapeGames", 175 | "contributions": ["bug"] 176 | }, 177 | { 178 | "login": "hxl9654", 179 | "name": "Xianglong He", 180 | "avatar_url": "https://avatars.githubusercontent.com/u/6624983?v=4", 181 | "profile": "https://tec.hxlxz.com/", 182 | "contributions": ["bug"] 183 | }, 184 | { 185 | "login": "75aosu", 186 | "name": "75aosu", 187 | "avatar_url": "https://avatars.githubusercontent.com/u/79185331?v=4", 188 | "profile": "https://github.com/75aosu", 189 | "contributions": ["bug"] 190 | }, 191 | { 192 | "login": "tonyco82", 193 | "name": "tonyco82", 194 | "avatar_url": "https://avatars.githubusercontent.com/u/56610534?v=4", 195 | "profile": "https://github.com/tonyco82", 196 | "contributions": ["bug"] 197 | }, 198 | { 199 | "login": "yxlllc", 200 | "name": "yxlllc", 201 | "avatar_url": "https://avatars.githubusercontent.com/u/33565655?v=4", 202 | "profile": "https://github.com/yxlllc", 203 | "contributions": ["ideas", "code"] 204 | }, 205 | { 206 | "login": "outhipped", 207 | "name": "outhipped", 208 | "avatar_url": "https://avatars.githubusercontent.com/u/116147475?v=4", 209 | "profile": "https://github.com/outhipped", 210 | "contributions": ["bug"] 211 | }, 212 | { 213 | "login": "escoolioinglesias", 214 | "name": "escoolioinglesias", 215 | "avatar_url": "https://avatars.githubusercontent.com/u/73505402?v=4", 216 | "profile": "https://github.com/escoolioinglesias", 217 | "contributions": ["bug", "userTesting", "video"] 218 | }, 219 | { 220 | "login": "Blacksingh", 221 | "name": "Blacksingh", 222 | "avatar_url": "https://avatars.githubusercontent.com/u/130872856?v=4", 223 | "profile": "https://github.com/Blacksingh", 224 | "contributions": ["bug"] 225 | }, 226 | { 227 | "login": "tybantarnusa", 228 | "name": "Mgs. M. 
Thoyib Antarnusa", 229 | "avatar_url": "https://avatars.githubusercontent.com/u/9532857?v=4", 230 | "profile": "http://tybantarnusa.com", 231 | "contributions": ["bug"] 232 | }, 233 | { 234 | "login": "ZeroHackz", 235 | "name": "Exosfeer", 236 | "avatar_url": "https://avatars.githubusercontent.com/u/15729496?v=4", 237 | "profile": "https://github.com/ZeroHackz", 238 | "contributions": ["bug", "code"] 239 | }, 240 | { 241 | "login": "guranon", 242 | "name": "guranon", 243 | "avatar_url": "https://avatars.githubusercontent.com/u/130421189?v=4", 244 | "profile": "https://github.com/guranon", 245 | "contributions": ["bug", "ideas", "code"] 246 | }, 247 | { 248 | "login": "alexanderkoumis", 249 | "name": "Alexander Koumis", 250 | "avatar_url": "https://avatars.githubusercontent.com/u/5108856?v=4", 251 | "profile": "https://github.com/alexanderkoumis", 252 | "contributions": ["code"] 253 | }, 254 | { 255 | "login": "acekagami", 256 | "name": "acekagami", 257 | "avatar_url": "https://avatars.githubusercontent.com/u/127201056?v=4", 258 | "profile": "https://github.com/acekagami", 259 | "contributions": ["translation"] 260 | }, 261 | { 262 | "login": "Highupech", 263 | "name": "Highupech", 264 | "avatar_url": "https://avatars.githubusercontent.com/u/114140670?v=4", 265 | "profile": "https://github.com/Highupech", 266 | "contributions": ["bug"] 267 | }, 268 | { 269 | "login": "Scorpi", 270 | "name": "Scorpi", 271 | "avatar_url": "https://avatars.githubusercontent.com/u/969654?v=4", 272 | "profile": "https://github.com/Scorpi", 273 | "contributions": ["code"] 274 | }, 275 | { 276 | "login": "maximxlss", 277 | "name": "Maximxls", 278 | "avatar_url": "https://avatars.githubusercontent.com/u/29152154?v=4", 279 | "profile": "http://maximxlss.github.io", 280 | "contributions": ["code"] 281 | }, 282 | { 283 | "login": "Star3Lord", 284 | "name": "Star3Lord", 285 | "avatar_url": "https://avatars.githubusercontent.com/u/57606931?v=4", 286 | "profile": "https://github.com/Star3Lord", 287 | "contributions": ["bug", "code"] 288 | }, 289 | { 290 | "login": "Ph0rk0z", 291 | "name": "Forkoz", 292 | "avatar_url": "https://avatars.githubusercontent.com/u/59298527?v=4", 293 | "profile": "https://github.com/Ph0rk0z", 294 | "contributions": ["bug", "code"] 295 | }, 296 | { 297 | "login": "Zerui18", 298 | "name": "Zerui Chen", 299 | "avatar_url": "https://avatars.githubusercontent.com/u/34794550?v=4", 300 | "profile": "https://github.com/Zerui18", 301 | "contributions": ["code", "ideas"] 302 | }, 303 | { 304 | "login": "shenberg", 305 | "name": "Roee Shenberg", 306 | "avatar_url": "https://avatars.githubusercontent.com/u/653972?v=4", 307 | "profile": "https://www.meimadix.com", 308 | "contributions": ["userTesting", "ideas", "code"] 309 | }, 310 | { 311 | "login": "ShinyJustyZ", 312 | "name": "Justas", 313 | "avatar_url": "https://avatars.githubusercontent.com/u/65282440?v=4", 314 | "profile": "https://github.com/ShinyJustyZ", 315 | "contributions": ["bug", "code"] 316 | }, 317 | { 318 | "login": "Onako2", 319 | "name": "Onako2", 320 | "avatar_url": "https://avatars.githubusercontent.com/u/79749977?v=4", 321 | "profile": "https://onako2.github.io/", 322 | "contributions": ["doc"] 323 | }, 324 | { 325 | "login": "4ll0w3v1l", 326 | "name": "4ll0w3v1l", 327 | "avatar_url": "https://avatars.githubusercontent.com/u/53517147?v=4", 328 | "profile": "https://github.com/4ll0w3v1l", 329 | "contributions": ["code"] 330 | }, 331 | { 332 | "login": "SamuelSwartzberg", 333 | "name": "j5y0V6b", 334 | "avatar_url": 
"https://avatars.githubusercontent.com/u/16353439?v=4", 335 | "profile": "https://github.com/SamuelSwartzberg", 336 | "contributions": ["security"] 337 | }, 338 | { 339 | "login": "marcellocirelli", 340 | "name": "marcellocirelli", 341 | "avatar_url": "https://avatars.githubusercontent.com/u/51972090?v=4", 342 | "profile": "https://github.com/marcellocirelli", 343 | "contributions": ["bug"] 344 | }, 345 | { 346 | "login": "Priyanshu-hawk", 347 | "name": "Priyanshu Patel", 348 | "avatar_url": "https://avatars.githubusercontent.com/u/76026651?v=4", 349 | "profile": "https://github.com/Priyanshu-hawk", 350 | "contributions": ["code"] 351 | }, 352 | { 353 | "login": "annagorshunova", 354 | "name": "Anna Gorshunova", 355 | "avatar_url": "https://avatars.githubusercontent.com/u/5199204?v=4", 356 | "profile": "https://github.com/annagorshunova", 357 | "contributions": ["bug", "code"] 358 | } 359 | ], 360 | "contributorsPerLine": 7, 361 | "skipCi": true, 362 | "commitType": "docs" 363 | } 364 | -------------------------------------------------------------------------------- /.copier-answers.yml: -------------------------------------------------------------------------------- 1 | # Changes here will be overwritten by Copier 2 | _commit: d5acceb 3 | _src_path: gh:34j/pypackage-template-fork 4 | add_me_as_contributor: false 5 | copyright_year: '2023' 6 | documentation: true 7 | email: 34j.95a2p@simplelogin.com 8 | full_name: 34j 9 | github_username: 34j 10 | initial_commit: false 11 | open_source_license: MIT 12 | open_with_vscode: false 13 | package_name: so_vits_svc_fork 14 | project_name: SoftVC VITS Singing Voice Conversion Fork 15 | project_short_description: A fork of so-vits-svc. 16 | project_slug: so-vits-svc-fork 17 | run_poetry_install: true 18 | setup_github: false 19 | setup_pre_commit: false 20 | setup_venv: true 21 | venv_version: '3.10' 22 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | # Ignore everything 2 | * 3 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | # http://editorconfig.org 2 | 3 | root = true 4 | 5 | [*] 6 | indent_style = space 7 | indent_size = 4 8 | trim_trailing_whitespace = true 9 | insert_final_newline = true 10 | charset = utf-8 11 | end_of_line = lf 12 | 13 | [*.bat] 14 | indent_style = tab 15 | end_of_line = crlf 16 | 17 | [LICENSE] 18 | insert_final_newline = false 19 | 20 | [Makefile] 21 | indent_style = tab 22 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | exclude = docs 3 | max-line-length = 88 4 | ignore = E203, E501, E741, E402, E712, W503, E731, E711, E226 5 | -------------------------------------------------------------------------------- /.github/CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/voicepaw/so-vits-svc-fork/0f015e32aada5cf7481f91bbe6758e574c9c5f39/.github/CODE_OF_CONDUCT.md -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | github: ["34j"] 2 | 
-------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/1-bug_report.yml: -------------------------------------------------------------------------------- 1 | name: Bug report 2 | description: Create a report to help us improve 3 | labels: [bug] 4 | body: 5 | - type: textarea 6 | id: description 7 | attributes: 8 | label: Describe the bug 9 | description: A clear and concise description of what the bug is. 10 | placeholder: Describe the bug 11 | validations: 12 | required: true 13 | - type: textarea 14 | id: reproduce 15 | attributes: 16 | label: To Reproduce 17 | description: Steps to reproduce the behavior. 18 | placeholder: To Reproduce 19 | validations: 20 | required: true 21 | - type: textarea 22 | id: context 23 | attributes: 24 | label: Additional context 25 | description: Add any other context about the problem here. 26 | placeholder: Additional context 27 | - type: input 28 | id: version 29 | attributes: 30 | label: Version 31 | description: Version of the project. 32 | placeholder: Version 33 | validations: 34 | required: true 35 | - type: input 36 | id: platform 37 | attributes: 38 | label: Platform 39 | description: Platform where the bug was found. 40 | placeholder: "Example: Windows 11 / macOS 12.0.1 / Ubuntu 20.04" 41 | validations: 42 | required: true 43 | - type: checkboxes 44 | id: terms 45 | attributes: 46 | label: Code of Conduct 47 | description: By submitting this issue, you agree to follow our 48 | [Code of Conduct](https://github.com/34j/so-vits-svc-fork/blob/main/CODE_OF_CONDUCT.md). 49 | options: 50 | - label: I agree to follow this project's Code of Conduct. 51 | required: true 52 | - type: checkboxes 53 | id: no-duplicate 54 | attributes: 55 | label: No Duplicate 56 | description: Please check [existing issues](https://github.com/34j/so-vits-svc-fork/issues) to avoid duplicates. 57 | options: 58 | - label: I have checked existing issues to avoid duplicates. 59 | required: true 60 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/2-feature-request.yml: -------------------------------------------------------------------------------- 1 | name: Feature request 2 | description: Suggest an idea for this project 3 | labels: [enhancement] 4 | body: 5 | - type: textarea 6 | id: description 7 | attributes: 8 | label: Is your feature request related to a problem? Please describe. 9 | description: A clear and concise description of what the problem is. 10 | value: I'm always frustrated when 11 | validations: 12 | required: true 13 | - type: textarea 14 | id: solution 15 | attributes: 16 | label: Describe alternatives you've considered 17 | description: A clear and concise description of any alternative solutions or features you've considered. 18 | placeholder: Describe alternatives you've considered 19 | validations: 20 | required: true 21 | - type: textarea 22 | id: context 23 | attributes: 24 | label: Additional context 25 | description: Add any other context or screenshots about the feature request here. 26 | placeholder: Additional context 27 | - type: checkboxes 28 | id: terms 29 | attributes: 30 | label: Code of Conduct 31 | description: By submitting this issue, you agree to follow our 32 | [Code of Conduct](https://github.com/34j/so-vits-svc-fork/blob/main/CODE_OF_CONDUCT.md). 
33 | options: 34 | - label: I agree to follow this project's Code of Conduct 35 | required: true 36 | - type: checkboxes 37 | id: willing 38 | attributes: 39 | label: Are you willing to resolve this issue by submitting a Pull Request? 40 | description: Remember that first-time contributors are welcome! 🙌 41 | options: 42 | - label: Yes, I have the time, and I know how to start. 43 | - label: Yes, I have the time, but I don't know how to start. I would need guidance. 44 | - label: No, I don't have the time, although I believe I could do it if I had the time... 45 | - label: No, I don't have the time and I wouldn't even know how to start. 46 | validations: 47 | required: true 48 | - type: markdown 49 | attributes: 50 | value: 👋 Have a great day and thank you for the feature request! 51 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: false 2 | contact_links: 3 | - name: Discussions 4 | url: https://github.com/34j/so-vits-svc-fork/discussions 5 | about: Please ask and answer questions here. 6 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | 7 | 8 | ### Description of change 9 | 10 | 13 | 14 | copilot:all 15 | 16 | 29 | 30 | ### Pull-Request Checklist 31 | 32 | 37 | 38 | - [ ] Code is up-to-date with the `main` branch 39 | - [ ] This pull request follows [Contributing.md](https://github.com/34j/so-vits-svc-fork/blob/main/CONTRIBUTING.md) 40 | - [ ] This pull request links relevant issues as `Fixes #0000` 41 | - [ ] `pre-commit run -a` passes with this change or ci passes 42 | - [ ] `poetry run pytest` passes with this change or ci passes 43 | - [ ] (There are new or updated unit tests validating the change) 44 | - [ ] Documentation has been updated to reflect this change 45 | - [ ] The new commits follow conventions outlined in the [conventional commit spec](https://www.conventionalcommits.org/en/v1.0.0/) 46 | 47 | 50 | -------------------------------------------------------------------------------- /.github/labels.toml: -------------------------------------------------------------------------------- 1 | [breaking] 2 | color = "ffcc00" 3 | name = "breaking" 4 | description = "Breaking change." 
5 |
6 | [bug]
7 | color = "d73a4a"
8 | name = "bug"
9 | description = "Something isn't working"
10 |
11 | [dependencies]
12 | color = "0366d6"
13 | name = "dependencies"
14 | description = "Pull requests that update a dependency file"
15 |
16 | [github_actions]
17 | color = "000000"
18 | name = "github_actions"
19 | description = "Update of github actions"
20 |
21 | [documentation]
22 | color = "1bc4a5"
23 | name = "documentation"
24 | description = "Improvements or additions to documentation"
25 |
26 | [duplicate]
27 | color = "cfd3d7"
28 | name = "duplicate"
29 | description = "This issue or pull request already exists"
30 |
31 | [enhancement]
32 | color = "a2eeef"
33 | name = "enhancement"
34 | description = "New feature or request"
35 |
36 | ["good first issue"]
37 | color = "7057ff"
38 | name = "good first issue"
39 | description = "Good for newcomers"
40 |
41 | ["help wanted"]
42 | color = "008672"
43 | name = "help wanted"
44 | description = "Extra attention is needed"
45 |
46 | [invalid]
47 | color = "e4e669"
48 | name = "invalid"
49 | description = "This doesn't seem right"
50 |
51 | [nochangelog]
52 | color = "555555"
53 | name = "nochangelog"
54 | description = "Exclude pull requests from changelog"
55 |
56 | [question]
57 | color = "d876e3"
58 | name = "question"
59 | description = "Further information is requested"
60 |
61 | [removed]
62 | color = "e99695"
63 | name = "removed"
64 | description = "Removed pieces of functionality."
65 |
66 | [tests]
67 | color = "bfd4f2"
68 | name = "tests"
69 | description = "CI, CD and testing related changes"
70 |
71 | [wontfix]
72 | color = "ffffff"
73 | name = "wontfix"
74 | description = "This will not be worked on"
75 |
76 | [discussion]
77 | color = "c2e0c6"
78 | name = "discussion"
79 | description = "Some discussion around the project"
80 |
81 | [hacktoberfest]
82 | color = "ffa663"
83 | name = "hacktoberfest"
84 | description = "Good issues for Hacktoberfest"
85 |
86 | [answered]
87 | color = "0ee2b6"
88 | name = "answered"
89 | description = "Automatically closes as answered after a delay"
90 |
91 | [waiting]
92 | color = "5f7972"
93 | name = "waiting"
94 | description = "Automatically closes if no answer after a delay"
95 |
-------------------------------------------------------------------------------- /.github/workflows/ci.yml: --------------------------------------------------------------------------------
1 | name: CI
2 |
3 | on:
4 | push:
5 | branches:
6 | - main
7 | pull_request:
8 |
9 | concurrency:
10 | group: ${{ github.head_ref || github.run_id }}
11 | cancel-in-progress: true
12 |
13 | jobs:
14 | lint:
15 | runs-on: ubuntu-latest
16 | steps:
17 | - uses: actions/checkout@v3
18 | - uses: actions/setup-python@v5
19 | with:
20 | python-version: "3.9"
21 | - uses: pre-commit/action@v3.0.1
22 |
23 | # Make sure commit messages follow the conventional commits convention:
24 | # https://www.conventionalcommits.org
25 | commitlint:
26 | name: Lint Commit Messages
27 | runs-on: ubuntu-latest
28 | steps:
29 | - uses: actions/checkout@v3
30 | with:
31 | fetch-depth: 0
32 | - uses: wagoid/commitlint-github-action@v5.5.1
33 |
34 | test:
35 | strategy:
36 | fail-fast: false
37 | matrix:
38 | python-version:
39 | - "3.8"
40 | - "3.9"
41 | - "3.10"
42 | - "3.11"
43 | # - "3.12"
44 | os:
45 | - ubuntu-latest
46 | # - windows-latest
47 | # - macOS-latest
48 | runs-on: ${{ matrix.os }}
49 | steps:
50 | - uses: actions/checkout@v3
51 | - name: Set up Python
52 | uses: actions/setup-python@v5
53 | with:
54 | python-version: ${{
matrix.python-version }} 55 | - uses: snok/install-poetry@v1.3.4 56 | - name: Install Dependencies 57 | run: poetry install 58 | shell: bash 59 | - name: Test with Pytest 60 | run: poetry run pytest --cov-report=xml 61 | shell: bash 62 | - name: Upload coverage to Codecov 63 | uses: codecov/codecov-action@v4 64 | with: 65 | token: ${{ secrets.CODECOV_TOKEN }} 66 | 67 | release: 68 | runs-on: ubuntu-latest 69 | environment: release 70 | if: github.ref == 'refs/heads/main' 71 | needs: 72 | - test 73 | - lint 74 | - commitlint 75 | 76 | steps: 77 | - uses: actions/checkout@v3 78 | with: 79 | fetch-depth: 0 80 | 81 | # Run semantic release: 82 | # - Update CHANGELOG.md 83 | # - Update version in code 84 | # - Create git tag 85 | # - Create GitHub release 86 | # - Publish to PyPI 87 | - name: Python Semantic Release 88 | uses: relekang/python-semantic-release@v7.34.6 89 | with: 90 | github_token: ${{ secrets.GITHUB_TOKEN }} 91 | pypi_token: ${{ secrets.PYPI_TOKEN }} 92 | -------------------------------------------------------------------------------- /.github/workflows/hacktoberfest.yml: -------------------------------------------------------------------------------- 1 | name: Hacktoberfest 2 | 3 | on: 4 | schedule: 5 | # Run every day in October 6 | - cron: "0 0 * 10 *" 7 | # Run on the 1st of November to revert 8 | - cron: "0 13 1 11 *" 9 | 10 | jobs: 11 | hacktoberfest: 12 | runs-on: ubuntu-latest 13 | 14 | steps: 15 | - uses: browniebroke/hacktoberfest-labeler-action@v2.3.0 16 | with: 17 | github_token: ${{ secrets.GH_PAT }} 18 | -------------------------------------------------------------------------------- /.github/workflows/issue-manager.yml: -------------------------------------------------------------------------------- 1 | name: Issue Manager 2 | 3 | on: 4 | schedule: 5 | - cron: "0 0 * * *" 6 | issue_comment: 7 | types: 8 | - created 9 | issues: 10 | types: 11 | - labeled 12 | pull_request_target: 13 | types: 14 | - labeled 15 | workflow_dispatch: 16 | 17 | jobs: 18 | issue-manager: 19 | runs-on: ubuntu-latest 20 | steps: 21 | - uses: tiangolo/issue-manager@0.5.0 22 | with: 23 | token: ${{ secrets.GITHUB_TOKEN }} 24 | config: > 25 | { 26 | "answered": { 27 | "message": "Assuming the original issue was solved, it will be automatically closed now." 28 | }, 29 | "waiting": { 30 | "message": "Automatically closing. To re-open, please provide the additional information requested." 
31 | } 32 | } 33 | -------------------------------------------------------------------------------- /.github/workflows/labels.yml: -------------------------------------------------------------------------------- 1 | name: Sync Github labels 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | paths: 8 | - ".github/**" 9 | 10 | jobs: 11 | labels: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - uses: actions/checkout@v3 15 | - name: Set up Python 16 | uses: actions/setup-python@v5 17 | with: 18 | python-version: 3.8 19 | - name: Install labels 20 | run: pip install labels 21 | - name: Sync config with Github 22 | run: labels -u ${{ github.repository_owner }} -t ${{ secrets.GITHUB_TOKEN }} sync -f .github/labels.toml 23 | -------------------------------------------------------------------------------- /.github/workflows/poetry-upgrade.yml: -------------------------------------------------------------------------------- 1 | name: Upgrader 2 | 3 | on: 4 | workflow_dispatch: 5 | schedule: 6 | - cron: "29 23 16 * *" 7 | 8 | jobs: 9 | upgrade: 10 | uses: browniebroke/github-actions/.github/workflows/poetry-upgrade.yml@v1 11 | secrets: 12 | gh_pat: ${{ secrets.GH_PAT }} 13 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by .ignore support plugin (hsz.mobi) 2 | ### Python template 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | share/python-wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | MANIFEST 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .nox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | *.py,cover 52 | .hypothesis/ 53 | .pytest_cache/ 54 | cover/ 55 | 56 | # Translations 57 | *.mo 58 | *.pot 59 | 60 | # Django stuff: 61 | *.log 62 | local_settings.py 63 | db.sqlite3 64 | db.sqlite3-journal 65 | 66 | # Flask stuff: 67 | instance/ 68 | .webassets-cache 69 | 70 | # Scrapy stuff: 71 | .scrapy 72 | 73 | # Sphinx documentation 74 | docs/_build/ 75 | 76 | # PyBuilder 77 | .pybuilder/ 78 | target/ 79 | 80 | # Jupyter Notebook 81 | .ipynb_checkpoints 82 | 83 | # IPython 84 | profile_default/ 85 | ipython_config.py 86 | 87 | # pyenv 88 | # For a library or package, you might want to ignore these files since the code is 89 | # intended to run in multiple environments; otherwise, check them in: 90 | # .python-version 91 | 92 | # pipenv 93 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 94 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 95 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 96 | # install all needed dependencies. 97 | #Pipfile.lock 98 | 99 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 100 | __pypackages__/ 101 | 102 | # Celery stuff 103 | celerybeat-schedule 104 | celerybeat.pid 105 | 106 | # SageMath parsed files 107 | *.sage.py 108 | 109 | # Environments 110 | .env 111 | .venv 112 | env/ 113 | venv/ 114 | ENV/ 115 | env.bak/ 116 | venv.bak/ 117 | 118 | # Spyder {{package_name}} settings 119 | .spyderproject 120 | .spyproject 121 | 122 | # Rope {{package_name}} settings 123 | .ropeproject 124 | 125 | # mkdocs documentation 126 | /site 127 | 128 | # mypy 129 | .mypy_cache/ 130 | .dmypy.json 131 | dmypy.json 132 | 133 | # Pyre type checker 134 | .pyre/ 135 | 136 | # pytype static type analyzer 137 | .pytype/ 138 | 139 | # Cython debug symbols 140 | cython_debug/ 141 | 142 | # additional files 143 | tests/**/*.wav 144 | !tests/dataset_raw/test/**/*.wav 145 | tests/**/*.npy 146 | tests/**/*.pt 147 | tests/**/*.txt 148 | tests/**/*.json 149 | tests/**/*.pth 150 | tests/**/*.download 151 | tests/**/*.lab 152 | tests/**/*.pdf 153 | tests/**/*.csv 154 | tests/**/*.ckpt 155 | tests/**/*.yaml 156 | *.tfevents.* 157 | *.pt 158 | user_gui_presets.json 159 | -------------------------------------------------------------------------------- /.gitpod.yml: -------------------------------------------------------------------------------- 1 | tasks: 2 | - command: | 3 | pip install poetry 4 | PIP_USER=false poetry install 5 | - command: | 6 | pip install pre-commit 7 | pre-commit install 8 | PIP_USER=false pre-commit install-hooks 9 | -------------------------------------------------------------------------------- /.idea/inspectionProfiles/Project_Default.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | 7 | -------------------------------------------------------------------------------- /.idea/inspectionProfiles/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | 7 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /.idea/so-vits-svc-fork.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 18 | 19 | 21 | 22 | 24 | 25 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | -------------------------------------------------------------------------------- /.idea/watcherTasks.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 16 | 24 | 25 | 36 | 44 | 45 | 56 | 64 | 65 | 66 | -------------------------------------------------------------------------------- /.idea/workspace.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 28 | 29 | 34 | 35 | 36 | 38 | 39 | 41 | 42 | 43 | 44 | 45 | 46 | 49 | { 50 | "keyToString": { 51 | "RunOnceActivity.OpenProjectViewOnStart": "true", 
52 | "RunOnceActivity.ShowReadmeOnStart": "true", 53 | "WebServerToolWindowFactoryState": "false", 54 | "node.js.detected.package.eslint": "true", 55 | "node.js.selected.package.eslint": "(autodetect)", 56 | "nodejs_package_manager_path": "npm" 57 | } 58 | } 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 91 | 92 | 93 | 118 | 119 | 120 | 121 | 122 | 123 | 1678892092249 124 | 133 | 134 | 135 | 136 | 138 | 139 | 148 | 149 | 150 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # See https://pre-commit.com for more information 2 | # See https://pre-commit.com/hooks.html for more hooks 3 | exclude: "CHANGELOG.md|.copier-answers.yml" 4 | default_stages: [commit] 5 | 6 | ci: 7 | autofix_commit_msg: "chore(pre-commit.ci): auto fixes" 8 | autoupdate_commit_msg: "chore(pre-commit.ci): pre-commit autoupdate" 9 | 10 | repos: 11 | - repo: https://github.com/commitizen-tools/commitizen 12 | rev: v3.28.0 13 | hooks: 14 | - id: commitizen 15 | stages: [commit-msg] 16 | - repo: https://github.com/pre-commit/pre-commit-hooks 17 | rev: v4.6.0 18 | hooks: 19 | - id: debug-statements 20 | - id: check-builtin-literals 21 | - id: check-case-conflict 22 | - id: check-docstring-first 23 | - id: check-json 24 | - id: check-toml 25 | - id: check-xml 26 | - id: check-yaml 27 | - id: detect-private-key 28 | - id: end-of-file-fixer 29 | - id: trailing-whitespace 30 | - repo: https://github.com/python-poetry/poetry 31 | rev: 1.8.3 32 | hooks: 33 | - id: poetry-check 34 | - repo: https://github.com/pre-commit/mirrors-prettier 35 | rev: v3.1.0 36 | hooks: 37 | - id: prettier 38 | args: ["--tab-width", "2"] 39 | - repo: https://github.com/asottile/pyupgrade 40 | rev: v3.17.0 41 | hooks: 42 | - id: pyupgrade 43 | args: [--py38-plus] 44 | - repo: https://github.com/PyCQA/autoflake 45 | rev: v2.3.1 46 | hooks: 47 | - id: autoflake 48 | - repo: https://github.com/PyCQA/isort 49 | rev: 5.13.2 50 | hooks: 51 | - id: isort 52 | - repo: https://github.com/psf/black 53 | rev: 24.1.0 54 | hooks: 55 | - id: black 56 | - repo: https://github.com/codespell-project/codespell 57 | rev: v2.2.6 58 | hooks: 59 | - id: codespell 60 | args: [-w] 61 | - repo: https://github.com/PyCQA/flake8 62 | rev: 7.1.1 63 | hooks: 64 | - id: flake8 65 | #- repo: https://github.com/pre-commit/mirrors-mypy 66 | # rev: v0.931 67 | # hooks: 68 | # - id: mypy 69 | # additional_dependencies: [] 70 | # - repo: https://github.com/PyCQA/bandit 71 | # rev: 1.7.4 72 | # hooks: 73 | # - id: bandit 74 | # args: [-x, tests] 75 | - repo: https://github.com/srstevenson/nb-clean 76 | rev: "3.3.0" 77 | hooks: 78 | - id: nb-clean 79 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | # Read the Docs configuration file 2 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 3 | 4 | # Required 5 | version: 2 6 | 7 | # Set the version of Python and other tools you might need 8 | build: 9 | os: ubuntu-20.04 10 | tools: 11 | python: "3.9" 12 | jobs: 13 | post_create_environment: 14 | # Install poetry 15 | - pip install poetry 16 | # Tell poetry to not use a virtual environment 17 | - poetry config virtualenvs.create false 18 | post_install: 19 | # Install dependencies 20 | - poetry install --with docs 21 | 22 | # Build documentation in the docs directory with 
Sphinx
23 | sphinx:
24 | configuration: docs/conf.py
25 |
-------------------------------------------------------------------------------- /CONTRIBUTING.md: --------------------------------------------------------------------------------
1 | # Contributing
2 |
3 | Contributions are welcome, and they are greatly appreciated! Every little helps, and credit will always be given.
4 |
5 | You can contribute in many ways:
6 |
7 | ## Types of Contributions
8 |
9 | ### Report Bugs
10 |
11 | Report bugs to [our issue page][gh-issues]. If you are reporting a bug, please include:
12 |
13 | - Your operating system name and version.
14 | - Any details about your local setup that might be helpful in troubleshooting.
15 | - Detailed steps to reproduce the bug.
16 |
17 | ### Fix Bugs
18 |
19 | Look through the GitHub issues for bugs. Anything tagged with "bug" and "help wanted" is open to whoever wants to implement it.
20 |
21 | ### Implement Features
22 |
23 | Look through the GitHub issues for features. Anything tagged with "enhancement" and "help wanted" is open to whoever wants to implement it.
24 |
25 | ### Write Documentation
26 |
27 | SoftVC VITS Singing Voice Conversion Fork could always use more documentation, whether as part of the official SoftVC VITS Singing Voice Conversion Fork docs, in docstrings, or even on the web in blog posts, articles, and such.
28 |
29 | ### Submit Feedback
30 |
31 | The best way to send feedback is via [our issue page][gh-issues] on GitHub. If you are proposing a feature:
32 |
33 | - Explain in detail how it would work.
34 | - Keep the scope as narrow as possible, to make it easier to implement.
35 | - Remember that this is a volunteer-driven project, and that contributions are welcome 😊
36 |
37 | ## Get Started!
38 |
39 | Ready to contribute? Here's how to set yourself up for local development.
40 |
41 | 1. Fork the repo on GitHub.
42 |
43 | 2. Clone your fork locally:
44 |
45 | ```shell
46 | $ git clone git@github.com:your_name_here/so-vits-svc-fork.git
47 | ```
48 |
49 | 3. Install the project dependencies with [Poetry](https://python-poetry.org):
50 |
51 | ```shell
52 | $ poetry install
53 | ```
54 |
55 | 4. Create a branch for local development:
56 |
57 | ```shell
58 | $ git checkout -b name-of-your-bugfix-or-feature
59 | ```
60 |
61 | Now you can make your changes locally.
62 |
63 | 5. When you're done making changes, check that your changes pass our tests:
64 |
65 | ```shell
66 | $ poetry run pytest
67 | ```
68 |
69 | 6. Linting is done through [pre-commit](https://pre-commit.com). Provided you have the tool installed globally, you can run all the hooks as a one-off:
70 |
71 | ```shell
72 | $ pre-commit run -a
73 | ```
74 |
75 | Or better, install the hooks once and have them run automatically each time you commit:
76 |
77 | ```shell
78 | $ pre-commit install
79 | ```
80 |
81 | 7. Commit your changes and push your branch to GitHub:
82 |
83 | ```shell
84 | $ git add .
85 | $ git commit -m "feat(something): your detailed description of your changes"
86 | $ git push origin name-of-your-bugfix-or-feature
87 | ```
88 |
89 | Note: the commit message should follow [the conventional commits](https://www.conventionalcommits.org) specification. We run [`commitlint` on CI](https://github.com/marketplace/actions/commit-linter) to validate it, and if you've installed pre-commit hooks at the previous step, the message will be checked at commit time.
90 |
91 | 8. Submit a pull request through the GitHub website or using the GitHub CLI (if you have it installed):
92 |
93 | ```shell
94 | $ gh pr create --fill
95 | ```
96 |
97 | ## Pull Request Guidelines
98 |
99 | We like to have the pull request open as soon as possible; it is a great place to discuss any piece of work, even unfinished. You can use a draft pull request if it's still a work in progress. Here are a few guidelines to follow:
100 |
101 | 1. Include tests for feature or bug fixes.
102 | 2. Update the documentation for significant features.
103 | 3. Ensure tests are passing on CI.
104 |
105 | ## Tips
106 |
107 | To run a subset of tests:
108 |
109 | ```shell
110 | $ pytest tests
111 | ```
112 |
113 | ## Making a new release
114 |
115 | The deployment should be automated and can be triggered from the Semantic Release workflow in GitHub. The next version will be based on [the commit logs](https://python-semantic-release.readthedocs.io/en/latest/commit-log-parsing.html#commit-log-parsing). This is done by [python-semantic-release](https://python-semantic-release.readthedocs.io/en/latest/index.html) via a GitHub action.
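116 |
117 | For example (illustrative only: the version numbers and the sample commit below are hypothetical, and the exact bump rules come from the python-semantic-release configuration), conventional commit types map to releases roughly as follows:
118 |
119 | ```shell
120 | # fix(...):  commits trigger a patch release, e.g. 1.2.3 -> 1.2.4
121 | # feat(...): commits trigger a minor release, e.g. 1.2.3 -> 1.3.0
122 | # a "BREAKING CHANGE:" footer triggers a major release, e.g. 1.2.3 -> 2.0.0
123 | $ git commit -m "feat(gui): add a preset selector"
124 | ```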
125 |
126 | [gh-issues]: https://github.com/34j/so-vits-svc-fork/issues
127 |
-------------------------------------------------------------------------------- /Dockerfile: --------------------------------------------------------------------------------
1 | FROM pytorch/pytorch:2.0.1-cuda11.7-cudnn8-runtime
2 | RUN ["apt", "update"]
3 | RUN ["apt", "install", "-y", "build-essential"]
4 | RUN ["pip", "install", "-U", "pip", "setuptools", "wheel"]
5 | RUN ["pip", "install", "-U", "so-vits-svc-fork"]
6 | ENTRYPOINT [ "svcg" ]
7 |
-------------------------------------------------------------------------------- /commitlint.config.js: --------------------------------------------------------------------------------
1 | module.exports = {
2 | extends: ["@commitlint/config-conventional"],
3 | rules: {
4 | "header-max-length": [0, "always", Infinity],
5 | "body-max-line-length": [0, "always", Infinity],
6 | "footer-max-line-length": [0, "always", Infinity],
7 | },
8 | };
9 |
-------------------------------------------------------------------------------- /docs/Makefile: --------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line, and also
5 | # from the environment for the first two.
6 | SPHINXOPTS ?=
7 | SPHINXBUILD ?= sphinx-build
8 | SOURCEDIR = .
9 | BUILDDIR = _build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | .PHONY: help Makefile
16 |
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/_static/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/voicepaw/so-vits-svc-fork/0f015e32aada5cf7481f91bbe6758e574c9c5f39/docs/_static/.gitkeep -------------------------------------------------------------------------------- /docs/_static/gui.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/voicepaw/so-vits-svc-fork/0f015e32aada5cf7481f91bbe6758e574c9c5f39/docs/_static/gui.png -------------------------------------------------------------------------------- /docs/changelog.md: -------------------------------------------------------------------------------- 1 | ```{include} ../CHANGELOG.md 2 | 3 | ``` 4 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | from pathlib import Path 7 | from typing import Any 8 | 9 | from sphinx.application import Sphinx 10 | from sphinx.ext import apidoc 11 | 12 | # -- Project information ----------------------------------------------------- 13 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information 14 | 15 | project = "SoftVC VITS Singing Voice Conversion Fork" 16 | copyright = "2023, 34j" 17 | author = "34j" 18 | release = "0.0.0" 19 | 20 | # -- General configuration --------------------------------------------------- 21 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration 22 | 23 | # Add any Sphinx extension module names here, as strings. They can be 24 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 25 | # ones. 26 | extensions = [ 27 | "myst_parser", 28 | "sphinx.ext.napoleon", 29 | "sphinx.ext.autodoc", 30 | "sphinx.ext.viewcode", 31 | ] 32 | napoleon_google_docstring = False 33 | 34 | # The suffix of source filenames. 35 | source_suffix = [".rst", ".md"] 36 | 37 | # Add any paths that contain templates here, relative to this directory. 38 | templates_path = ["_templates"] 39 | 40 | # List of patterns, relative to source directory, that match files and 41 | # directories to ignore when looking for source files. 42 | # This pattern also affects html_static_path and html_extra_path. 43 | exclude_patterns: list[str] = [] 44 | 45 | 46 | # -- Options for HTML output ------------------------------------------------- 47 | 48 | # The theme to use for HTML and HTML Help pages. See the documentation for 49 | # a list of builtin themes. 50 | # 51 | html_theme = "sphinx_rtd_theme" 52 | 53 | # Add any paths that contain custom static files (such as style sheets) here, 54 | # relative to this directory. They are copied after the builtin static files, 55 | # so a file named "default.css" will overwrite the builtin "default.css". 
56 | html_static_path = ["_static"] 57 | 58 | 59 | # -- Automatically run sphinx-apidoc ----------------------------------------- 60 | 61 | 62 | def run_apidoc(_: Any) -> None: 63 | docs_path = Path(__file__).parent 64 | module_path = docs_path.parent / "src" / "so_vits_svc_fork" 65 | 66 | apidoc.main( 67 | [ 68 | "--force", 69 | "--module-first", 70 | "-o", 71 | docs_path.as_posix(), 72 | module_path.as_posix(), 73 | ] 74 | ) 75 | 76 | 77 | def setup(app: Sphinx) -> None: 78 | app.connect("builder-inited", run_apidoc) 79 | -------------------------------------------------------------------------------- /docs/contributing.md: -------------------------------------------------------------------------------- 1 | ```{include} ../CONTRIBUTING.md 2 | 3 | ``` 4 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | # Welcome to SoftVC VITS Singing Voice Conversion Fork documentation! 2 | 3 | ```{toctree} 4 | :caption: Installation & Usage 5 | :maxdepth: 2 6 | 7 | installation 8 | usage 9 | ``` 10 | 11 | ```{toctree} 12 | :caption: Project Info 13 | :maxdepth: 2 14 | 15 | changelog 16 | contributing 17 | ``` 18 | 19 | ```{toctree} 20 | :caption: API Reference 21 | :maxdepth: 2 22 | 23 | so_vits_svc_fork 24 | ``` 25 | 26 | ```{include} ../README.md 27 | 28 | ``` 29 | -------------------------------------------------------------------------------- /docs/installation.md: -------------------------------------------------------------------------------- 1 | # Installation 2 | 3 | The package is published on [PyPI](https://pypi.org/project/so-vits-svc-fork/) and can be installed with `pip` (or any equivalent): 4 | 5 | ```bash 6 | pip install so-vits-svc-fork 7 | ``` 8 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | %SPHINXBUILD% >NUL 2>NUL 14 | if errorlevel 9009 ( 15 | echo. 16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 17 | echo.installed, then set the SPHINXBUILD environment variable to point 18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 19 | echo.may add the Sphinx directory to PATH. 20 | echo. 
21 | echo.If you don't have Sphinx installed, grab it from
22 | echo.https://www.sphinx-doc.org/
23 | exit /b 1
24 | )
25 |
26 | if "%1" == "" goto help
27 |
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 |
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 |
34 | :end
35 | popd
36 |
-------------------------------------------------------------------------------- /docs/usage.md: --------------------------------------------------------------------------------
1 | # Usage
2 |
3 | To use this package, import it:
4 |
5 | ```python
6 | import so_vits_svc_fork
7 | ```
8 |
9 | TODO: Document usage
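10 |
11 | In the meantime, here is a minimal sketch of the typical CLI workflow, based on the commands used in the bundled Colab notebook; the paths and the `-fm dio` flag below are illustrative assumptions, not a complete reference:
12 |
13 | ```bash
14 | # Place your audio as dataset_raw/{speaker_name}/*.wav, then:
15 | svc pre-resample                 # resample the raw audio
16 | svc pre-config                   # generate configs/44k/config.json
17 | svc pre-hubert -fm dio           # extract HuBERT features and F0 (method: dio)
18 | svc train --model-path logs/44k  # train the model
19 | svcg                             # or launch the GUI instead
20 | ```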
21 |
-------------------------------------------------------------------------------- /easy-installation/install-cn.bat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/voicepaw/so-vits-svc-fork/0f015e32aada5cf7481f91bbe6758e574c9c5f39/easy-installation/install-cn.bat -------------------------------------------------------------------------------- /easy-installation/install.bat: --------------------------------------------------------------------------------
1 | @echo off
2 |
3 | echo You can rerun this script to update the installation.
4 |
5 | echo Moving to AppData\Roaming\so-vits-svc-fork...
6 | mkdir "%APPDATA%\so-vits-svc-fork" >nul 2>&1
7 | cd "%APPDATA%\so-vits-svc-fork"
8 |
9 | echo Checking for Python 3.10...
10 |
11 | py -3.10 --version >nul 2>&1
12 | if %errorlevel%==0 (
13 | echo Python 3.10 is already installed.
14 | ) else (
15 | echo Python 3.10 is not installed. Downloading installer...
16 | curl https://www.python.org/ftp/python/3.10.10/python-3.10.10-amd64.exe -o python-3.10.10-amd64.exe
17 |
18 | echo Installing Python 3.10...
19 | python-3.10.10-amd64.exe /quiet InstallAllUsers=1 PrependPath=1
20 |
21 | echo Cleaning up installer...
22 | del python-3.10.10-amd64.exe
23 | )
24 |
25 | echo Creating virtual environment...
26 | py -3.10 -m venv venv
27 |
28 | echo Updating pip and wheel...
29 | venv\Scripts\python.exe -m pip install --upgrade pip wheel
30 |
31 | nvidia-smi >nul 2>&1
32 | if %errorlevel%==0 (
33 | echo Installing PyTorch with GPU support...
34 | venv\Scripts\pip.exe install torch torchaudio --index-url https://download.pytorch.org/whl/cu118
35 | ) else (
36 | echo Installing PyTorch without GPU support...
37 | venv\Scripts\pip.exe install torch torchaudio
38 | )
39 |
40 | echo Installing so-vits-svc-fork...
41 | venv\Scripts\pip.exe install so-vits-svc-fork
42 |
43 | rem echo Creating shortcut...
44 | rem powershell "$s=(New-Object -COM WScript.Shell).CreateShortcut('%USERPROFILE%\Desktop\so-vits-svc-fork.lnk');$s.TargetPath='%APPDATA%\so-vits-svc-fork\venv\Scripts\svcg.exe';$s.Save()"
45 |
46 | echo Creating shortcut to the start menu...
47 | powershell "$s=(New-Object -COM WScript.Shell).CreateShortcut('%APPDATA%\Microsoft\Windows\Start Menu\Programs\so-vits-svc-fork.lnk');$s.TargetPath='%APPDATA%\so-vits-svc-fork\venv\Scripts\svcg.exe';$s.Save()"
48 |
49 | echo Launching so-vits-svc-fork GUI...
50 | venv\Scripts\svcg.exe
51 |
-------------------------------------------------------------------------------- /notebooks/so-vits-svc-fork-4.0.ipynb: --------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## Before training\n",
8 | "\n",
9 | "This program saves the last 3 generations of models to Google Drive. Since one generation of models is >1 GB, you should have at least 3 GB of free space in Google Drive. If you do not have enough free space, consider creating another Google account.\n",
10 | "\n",
11 | "Training requires >10 GB of VRAM (a T4 should be enough). Inference does not require nearly as much VRAM."
12 | ]
13 | },
14 | {
15 | "cell_type": "markdown",
16 | "metadata": {},
17 | "source": [
18 | "## Installation"
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": null,
24 | "metadata": {},
25 | "outputs": [],
26 | "source": [
27 | "#@title Check GPU\n",
28 | "!nvidia-smi"
29 | ]
30 | },
31 | {
32 | "cell_type": "code",
33 | "execution_count": null,
34 | "metadata": {},
35 | "outputs": [],
36 | "source": [
37 | "#@title Mount Google Drive\n",
38 | "from google.colab import drive\n",
39 | "drive.mount('/content/drive')"
40 | ]
41 | },
42 | {
43 | "cell_type": "code",
44 | "execution_count": null,
45 | "metadata": {},
46 | "outputs": [],
47 | "source": [
48 | "#@title Install dependencies\n",
49 | "#@markdown pip may fail to resolve dependencies and raise ERROR, but it can be ignored.\n",
50 | "!python -m pip install -U pip wheel\n",
51 | "%pip install -U ipython\n",
52 | "\n",
53 | "#@markdown Branch (for development)\n",
54 | "BRANCH = \"none\" #@param {\"type\": \"string\"}\n",
55 | "if BRANCH == \"none\":\n",
56 | " %pip install -U so-vits-svc-fork\n",
57 | "else:\n",
58 | " %pip install -U git+https://github.com/34j/so-vits-svc-fork.git@{BRANCH}"
59 | ]
60 | },
61 | {
62 | "cell_type": "markdown",
63 | "metadata": {},
64 | "source": [
65 | "## Training"
66 | ]
67 | },
68 | {
69 | "cell_type": "code",
70 | "execution_count": null,
71 | "metadata": {},
72 | "outputs": [],
73 | "source": [
74 | "#@title Make dataset directory\n",
75 | "!mkdir -p \"dataset_raw\""
76 | ]
77 | },
78 | {
79 | "cell_type": "code",
80 | "execution_count": null,
81 | "metadata": {},
82 | "outputs": [],
83 | "source": [
84 | "#!rm -r \"dataset_raw\"\n",
85 | "#!rm -r \"dataset/44k\""
86 | ]
87 | },
88 | {
89 | "cell_type": "code",
90 | "execution_count": null,
91 | "metadata": {},
92 | "outputs": [],
93 | "source": [
94 | "#@title Copy your dataset\n",
95 | "#@markdown **We assume that your dataset is in your Google Drive's `so-vits-svc-fork/dataset/(speaker_name)` directory.**\n",
96 | "DATASET_NAME = \"kiritan\" #@param {type: \"string\"}\n",
97 | "!cp -R /content/drive/MyDrive/so-vits-svc-fork/dataset/{DATASET_NAME}/ -t \"dataset_raw/\""
98 | ]
99 | },
100 | {
101 | "cell_type": "code",
102 | "execution_count": null,
103 | "metadata": {},
104 | "outputs": [],
105 | "source": [
106 | "#@title Download dataset (Tsukuyomi-chan JVS)\n",
107 | "#@markdown You can download this dataset if you don't have your own dataset.\n",
108 | "#@markdown Make sure you agree to the license when using this dataset.\n",
109 | "#@markdown https://tyc.rei-yumesaki.net/material/corpus/#toc6\n",
110 | "# !wget https://tyc.rei-yumesaki.net/files/sozai-tyc-corpus1.zip\n",
111 | "# !unzip sozai-tyc-corpus1.zip\n",
112 | "# !mv \"/content/つくよみちゃんコーパス Vol.1 声優統計コーパス(JVSコーパス準拠)/おまけ:WAV(+12dB増幅&高音域削減)/WAV(+12dB増幅&高音域削減)\" \"dataset_raw/tsukuyomi\""
113 | ]
114 | },
115 | {
116 | "cell_type": "code",
117 | "execution_count": null,
118 | "metadata": {},
119 | "outputs": [],
120 | "source": [
121 | "#@title Automatic preprocessing\n",
122 | "!svc pre-resample"
123 | ]
124 | },
125 | {
126 | "cell_type": "code",
127 | "execution_count": null,
128 | "metadata": {},
129 | "outputs": [],
130 | "source": [
131 | "!svc pre-config"
pre-config" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": null, 137 | "metadata": {}, 138 | "outputs": [], 139 | "source": [ 140 | "#@title Import configs file\n", 141 | "#@markdown This assumes that the **config.json** you want to use is saved in the default location on Google Drive. A backup of the current local file will also be created, in case this action is done accidentally.\n", "!cp configs/44k/config.json configs/44k/config.bkp.json\n", "!cp drive/MyDrive/so-vits-svc-fork/config.json configs/44k" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": null, 147 | "metadata": {}, 148 | "outputs": [], 149 | "source": [ 150 | "#@title Export configs file (Optional Step, NOT REQUIRED)\n", 151 | "#@markdown This assumes that you want to save the **config.json** in the default location on Google Drive. A backup of the copy already on Drive will also be created, in case this action is done accidentally.\n", "!cp drive/MyDrive/so-vits-svc-fork/config.json drive/MyDrive/so-vits-svc-fork/config.bkp.json\n", "!cp configs/44k/config.json drive/MyDrive/so-vits-svc-fork" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": null, 157 | "metadata": {}, 158 | "outputs": [], 159 | "source": [ 160 | "F0_METHOD = \"dio\" #@param [\"crepe\", \"crepe-tiny\", \"parselmouth\", \"dio\", \"harvest\"]\n", 161 | "!svc pre-hubert -fm {F0_METHOD}" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": null, 167 | "metadata": {}, 168 | "outputs": [], 169 | "source": [ 170 | "#@title Train\n", 171 | "%load_ext tensorboard\n", 172 | "%tensorboard --logdir drive/MyDrive/so-vits-svc-fork/logs/44k\n", 173 | "!svc train --model-path drive/MyDrive/so-vits-svc-fork/logs/44k" 174 | ] 175 | }, 176 | { 177 | "cell_type": "markdown", 178 | "metadata": {}, 179 | "source": [ 180 | "## Training Cluster model" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": null, 186 | "metadata": {}, 187 | "outputs": [], 188 | "source": [ 189 | "!svc train-cluster --output-path drive/MyDrive/so-vits-svc-fork/logs/44k/kmeans.pt" 190 | ] 191 | }, 192 | { 193 | "cell_type": "markdown", 194 | "metadata": {}, 195 | "source": [ 196 | "## Inference" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": null, 202 | "metadata": {}, 203 | "outputs": [], 204 | "source": [ 205 | "#@title Get the author's voice as a source\n", 206 | "import random\n", 207 | "NAME = str(random.randint(1, 49))\n", 208 | "TYPE = \"fsd50k\" #@param [\"\", \"digit\", \"dog\", \"fsd50k\"]\n", 209 | "CUSTOM_FILEPATH = \"\" #@param {type: \"string\"}\n", 210 | "if CUSTOM_FILEPATH != \"\":\n", 211 | " NAME = CUSTOM_FILEPATH\n", 212 | "else:\n", 213 | " # it is extremely difficult to find a voice that can be downloaded from the internet directly; IPython expands {expression} in ! commands, so no f-string prefix is needed\n", 214 | " if TYPE == \"dog\":\n", 215 | " !wget -N \"https://huggingface.co/datasets/437aewuh/dog-dataset/resolve/main/dogs/dogs_{int(NAME):04d}.wav\" -O {NAME}.wav # 4-digit zero padding assumed\n", 216 | " elif TYPE == \"digit\":\n", 217 | " # george, jackson, lucas, nicolas, ...\n", 218 | " !wget -N \"https://github.com/Jakobovski/free-spoken-digit-dataset/raw/master/recordings/0_george_{NAME}.wav\" -O {NAME}.wav\n", 219 | " elif TYPE == \"fsd50k\":\n", 220 | " !wget -N \"https://huggingface.co/datasets/Fhrozen/FSD50k/resolve/main/clips/dev/{10000+int(NAME)}.wav\" -O {NAME}.wav\n", 221 | " else:\n", 222 | " !wget -N \"https://zunko.jp/sozai/utau/voice_{'kiritan' if int(NAME) < 25 else 'itako'}{int(NAME) % 5 + 1}.wav\" -O {NAME}.wav\n", 223 | "from IPython.display import Audio, display\n", 224 | "display(Audio(f\"{NAME}.wav\"))" 225
| ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": null, 230 | "metadata": {}, 231 | "outputs": [], 232 | "source": [ 233 | "#@title Use trained model\n", 234 | "#@markdown **Put your .wav file in `so-vits-svc-fork/audio` directory**\n", 235 | "from IPython.display import Audio, display\n", 236 | "!svc infer drive/MyDrive/so-vits-svc-fork/audio/{NAME}.wav -m drive/MyDrive/so-vits-svc-fork/logs/44k/ -c drive/MyDrive/so-vits-svc-fork/logs/44k/config.json\n", 237 | "display(Audio(f\"drive/MyDrive/so-vits-svc-fork/audio/{NAME}.out.wav\", autoplay=True))" 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": null, 243 | "metadata": {}, 244 | "outputs": [], 245 | "source": [ 246 | "##@title Use trained model (with cluster)\n", 247 | "!svc infer {NAME}.wav -s speaker -r 0.1 -m drive/MyDrive/so-vits-svc-fork/logs/44k/ -c drive/MyDrive/so-vits-svc-fork/logs/44k/config.json -k drive/MyDrive/so-vits-svc-fork/logs/44k/kmeans.pt\n", 248 | "display(Audio(f\"{NAME}.out.wav\", autoplay=True))" 249 | ] 250 | }, 251 | { 252 | "cell_type": "markdown", 253 | "metadata": {}, 254 | "source": [ 255 | "### Pretrained models" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": null, 261 | "metadata": {}, 262 | "outputs": [], 263 | "source": [ 264 | "#@title https://huggingface.co/TachibanaKimika/so-vits-svc-4.0-models/tree/main\n", 265 | "!wget -N \"https://huggingface.co/TachibanaKimika/so-vits-svc-4.0-models/resolve/main/riri/G_riri_220.pth\"\n", 266 | "!wget -N \"https://huggingface.co/TachibanaKimika/so-vits-svc-4.0-models/resolve/main/riri/config.json\"" 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": null, 272 | "metadata": {}, 273 | "outputs": [], 274 | "source": [ 275 | "!svc infer {NAME}.wav -c config.json -m G_riri_220.pth\n", 276 | "display(Audio(f\"{NAME}.out.wav\", autoplay=True))" 277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": null, 282 | "metadata": {}, 283 | "outputs": [], 284 | "source": [ 285 | "#@title https://huggingface.co/therealvul/so-vits-svc-4.0/tree/main\n", 286 | "!wget -N \"https://huggingface.co/therealvul/so-vits-svc-4.0/resolve/main/Pinkie%20(speaking%20sep)/G_166400.pth\"\n", 287 | "!wget -N \"https://huggingface.co/therealvul/so-vits-svc-4.0/resolve/main/Pinkie%20(speaking%20sep)/config.json\"" 288 | ] 289 | }, 290 | { 291 | "cell_type": "code", 292 | "execution_count": null, 293 | "metadata": {}, 294 | "outputs": [], 295 | "source": [ 296 | "!svc infer {NAME}.wav --speaker \"Pinkie {neutral}\" -c config.json -m G_166400.pth\n", 297 | "display(Audio(f\"{NAME}.out.wav\", autoplay=True))" 298 | ] 299 | } 300 | ], 301 | "metadata": { 302 | "accelerator": "GPU", 303 | "colab": { 304 | "provenance": [] 305 | }, 306 | "gpuClass": "standard", 307 | "kernelspec": { 308 | "display_name": "Python 3", 309 | "name": "python3" 310 | }, 311 | "language_info": { 312 | "codemirror_mode": { 313 | "name": "ipython", 314 | "version": 3 315 | }, 316 | "file_extension": ".py", 317 | "mimetype": "text/x-python", 318 | "name": "python", 319 | "nbconvert_exporter": "python", 320 | "pygments_lexer": "ipython3" 321 | } 322 | }, 323 | "nbformat": 4, 324 | "nbformat_minor": 0 325 | } 326 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "so-vits-svc-fork" 3 | version = "4.2.26" 4 | description = "A fork of so-vits-svc." 
5 | authors = ["34j <34j.95a2p@simplelogin.com>"] 6 | license = "MIT" 7 | readme = "README.md" 8 | repository = "https://github.com/34j/so-vits-svc-fork" 9 | documentation = "https://so-vits-svc-fork.readthedocs.io" 10 | classifiers = [ 11 | "Development Status :: 2 - Pre-Alpha", 12 | "Intended Audience :: Developers", 13 | "Natural Language :: English", 14 | "Operating System :: OS Independent", 15 | "Topic :: Software Development :: Libraries", 16 | ] 17 | packages = [ 18 | { include = "so_vits_svc_fork", from = "src" }, 19 | ] 20 | 21 | [tool.poetry.scripts] 22 | so-vits-svc-fork = "so_vits_svc_fork.__main__:cli" 23 | svc = "so_vits_svc_fork.__main__:cli" 24 | svcf = "so_vits_svc_fork.__main__:cli" 25 | svcg = "so_vits_svc_fork.gui:main" 26 | svc-gui = "so_vits_svc_fork.gui:main" 27 | svcf-gui = "so_vits_svc_fork.gui:main" 28 | 29 | [tool.poetry.urls] 30 | "Bug Tracker" = "https://github.com/34j/so-vits-svc-fork/issues" 31 | "Changelog" = "https://github.com/34j/so-vits-svc-fork/blob/main/CHANGELOG.md" 32 | 33 | [tool.poetry.dependencies] 34 | python = ">=3.9,<3.13" 35 | librosa = "*" 36 | numpy = "^1.26.4" 37 | pyworld = "*" 38 | requests = "*" 39 | scipy = "*" 40 | sounddevice = "*" 41 | SoundFile = "*" 42 | tqdm = "*" 43 | praat-parselmouth = "*" 44 | onnx = "*" 45 | onnxsim = "*" 46 | onnxoptimizer = "*" 47 | torch = "^2" 48 | torchaudio = "*" 49 | tensorboard = "*" 50 | rich = "*" 51 | tqdm-joblib = "^0.0.4" 52 | tensorboardx = "*" 53 | cm-time = ">=0.1.2" 54 | pebble = ">=5.0" 55 | torchcrepe = ">=0.0.17" 56 | lightning = "^2.0.1" 57 | fastapi = "==0.111.1" 58 | transformers = "^4.28.1" 59 | matplotlib = "^3.7.1" 60 | click = "^8.1.7" 61 | setuptools = "^69.5.1" 62 | pysimplegui-4-foss = "^4.60.4.1" 63 | 64 | [tool.poetry.group.dev.dependencies] 65 | pre-commit = ">=3" 66 | pytest = "^8.0.0" 67 | pytest-cov = "^4.0.0" 68 | pipdeptree = "^2.7.0" 69 | pip-licenses = "^5.0.0" 70 | 71 | [tool.poetry.group.docs] 72 | optional = true 73 | 74 | [tool.poetry.group.docs.dependencies] 75 | myst-parser = ">=0.16" 76 | sphinx = ">=4.0" 77 | sphinx-rtd-theme = ">=1.0" 78 | 79 | [tool.semantic_release] 80 | branch = "main" 81 | version_toml = "pyproject.toml:tool.poetry.version" 82 | version_variable = "src/so_vits_svc_fork/__init__.py:__version__" 83 | build_command = "pip install poetry && poetry build" 84 | 85 | [tool.pytest.ini_options] 86 | addopts = "-v -Wdefault --cov=so_vits_svc_fork --cov-report=term-missing:skip-covered" 87 | pythonpath = ["src"] 88 | 89 | [tool.coverage.run] 90 | branch = true 91 | 92 | [tool.coverage.report] 93 | exclude_lines = [ 94 | "pragma: no cover", 95 | "@overload", 96 | "if TYPE_CHECKING", 97 | "raise NotImplementedError", 98 | 'if __name__ == "__main__":', 99 | ] 100 | 101 | [tool.isort] 102 | profile = "black" 103 | known_first_party = ["so_vits_svc_fork", "tests"] 104 | 105 | [tool.autoflake] 106 | remove_all_unused_imports = true 107 | 108 | [tool.mypy] 109 | check_untyped_defs = true 110 | disallow_any_generics = true 111 | disallow_incomplete_defs = true 112 | disallow_untyped_defs = true 113 | mypy_path = "src/" 114 | no_implicit_optional = true 115 | show_error_codes = true 116 | warn_unreachable = true 117 | warn_unused_ignores = true 118 | exclude = [ 119 | 'docs/.*', 120 | 'setup.py', 121 | ] 122 | 123 | [[tool.mypy.overrides]] 124 | module = "tests.*" 125 | allow_untyped_defs = true 126 | 127 | [[tool.mypy.overrides]] 128 | module = "docs.*" 129 | ignore_errors = true 130 | 131 | [tool.bandit] 132 | exclude_dirs = ["src"] 133 | 134 | 
[build-system] 135 | requires = ["poetry-core>=1.0.0"] 136 | build-backend = "poetry.core.masonry.api" 137 | -------------------------------------------------------------------------------- /renovate.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": ["github>browniebroke/renovate-configs:python"] 3 | } 4 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # This is a shim to allow GitHub to detect the package; the build is done with poetry 4 | # Taken from https://github.com/Textualize/rich 5 | 6 | import setuptools 7 | 8 | if __name__ == "__main__": 9 | setuptools.setup(name="so-vits-svc-fork") 10 | -------------------------------------------------------------------------------- /src/so_vits_svc_fork/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "4.2.26" 2 | 3 | from .logger import init_logger 4 | 5 | init_logger() 6 | -------------------------------------------------------------------------------- /src/so_vits_svc_fork/cluster/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from pathlib import Path 4 | from typing import Any 5 | 6 | import torch 7 | from sklearn.cluster import KMeans 8 | 9 | 10 | def get_cluster_model(ckpt_path: Path | str): 11 | with Path(ckpt_path).open("rb") as f: 12 | checkpoint = torch.load( 13 | f, map_location="cpu" 14 | ) # pickle-based torch.load: danger of arbitrary code execution with untrusted checkpoints 15 | kmeans_dict = {} 16 | for spk, ckpt in checkpoint.items(): 17 | km = KMeans(ckpt["n_features_in_"]) 18 | km.__dict__["n_features_in_"] = ckpt["n_features_in_"] 19 | km.__dict__["_n_threads"] = ckpt["_n_threads"] 20 | km.__dict__["cluster_centers_"] = ckpt["cluster_centers_"] 21 | kmeans_dict[spk] = km 22 | return kmeans_dict 23 | 24 | 25 | def check_speaker(model: Any, speaker: Any): 26 | if speaker not in model: 27 | raise ValueError(f"Speaker {speaker} not in {list(model.keys())}") 28 | 29 | 30 | def get_cluster_result(model: Any, x: Any, speaker: Any): 31 | """ 32 | x: np.array [t, 256] 33 | return cluster class result 34 | """ 35 | check_speaker(model, speaker) 36 | return model[speaker].predict(x) 37 | 38 | 39 | def get_cluster_center_result(model: Any, x: Any, speaker: Any): 40 | """x: np.array [t, 256]""" 41 | check_speaker(model, speaker) 42 | predict = model[speaker].predict(x) 43 | return model[speaker].cluster_centers_[predict] 44 | 45 | 46 | def get_center(model: Any, x: Any, speaker: Any): 47 | check_speaker(model, speaker) 48 | return model[speaker].cluster_centers_[x] 49 |
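A minimal sketch of how the per-speaker KMeans dict loaded above is consumed at inference time; the checkpoint path, speaker name, and the random feature array are illustrative stand-ins, not shipped defaults:

```python
import numpy as np

from so_vits_svc_fork.cluster import get_cluster_center_result, get_cluster_model

# Load a trained cluster checkpoint (path is a placeholder).
model = get_cluster_model("logs/44k/kmeans.pt")
# Stand-in for HuBERT content features of shape [t, 256].
content = np.random.randn(100, 256).astype(np.float32)
# Snap each frame's features to its nearest cluster center -> [t, 256].
centers = get_cluster_center_result(model, content, "kiritan")
```

-------------------------------------------------------------------------------- /src/so_vits_svc_fork/cluster/train_cluster.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import math 4 | from logging import getLogger 5 | from pathlib import Path 6 | from typing import Any 7 | 8 | import numpy as np 9 | import torch 10 | from cm_time import timer 11 | from joblib import Parallel, delayed 12 | from sklearn.cluster import KMeans, MiniBatchKMeans 13 | from tqdm_joblib import tqdm_joblib 14 | 15 | LOG = getLogger(__name__) 16 | 17 | 18 | def train_cluster( 19 | input_dir: Path | str, 20 | n_clusters: int, 21 | use_minibatch: bool = True, 22 | batch_size: int = 4096, 23 |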
partial_fit: bool = False, 24 | verbose: bool = False, 25 | ) -> dict: 26 | input_dir = Path(input_dir) 27 | if not partial_fit: 28 | LOG.info(f"Loading features from {input_dir}") 29 | features = [] 30 | for path in input_dir.rglob("*.data.pt"): 31 | with path.open("rb") as f: 32 | features.append( 33 | torch.load(f, weights_only=True)["content"].squeeze(0).numpy().T 34 | ) 35 | if not features: 36 | raise ValueError(f"No features found in {input_dir}") 37 | features = np.concatenate(features, axis=0).astype(np.float32) 38 | if features.shape[0] < n_clusters: 39 | raise ValueError( 40 | "Too few HuBERT features to cluster. Consider using a smaller number of clusters." 41 | ) 42 | LOG.info( 43 | f"shape: {features.shape}, size: {features.nbytes/1024**2:.2f} MB, dtype: {features.dtype}" 44 | ) 45 | with timer() as t: 46 | if use_minibatch: 47 | kmeans = MiniBatchKMeans( 48 | n_clusters=n_clusters, 49 | verbose=verbose, 50 | batch_size=batch_size, 51 | max_iter=80, 52 | n_init="auto", 53 | ).fit(features) 54 | else: 55 | kmeans = KMeans( 56 | n_clusters=n_clusters, verbose=verbose, n_init="auto" 57 | ).fit(features) 58 | LOG.info(f"Clustering took {t.elapsed:.2f} seconds") 59 | 60 | x = { 61 | "n_features_in_": kmeans.n_features_in_, 62 | "_n_threads": kmeans._n_threads, 63 | "cluster_centers_": kmeans.cluster_centers_, 64 | } 65 | return x 66 | else: 67 | # minibatch partial fit 68 | paths = list(input_dir.rglob("*.data.pt")) 69 | if len(paths) == 0: 70 | raise ValueError(f"No features found in {input_dir}") 71 | LOG.info(f"Found {len(paths)} features in {input_dir}") 72 | n_batches = math.ceil(len(paths) / batch_size) 73 | LOG.info(f"Splitting into {n_batches} batches") 74 | with timer() as t: 75 | kmeans = MiniBatchKMeans( 76 | n_clusters=n_clusters, 77 | verbose=verbose, 78 | batch_size=batch_size, 79 | max_iter=80, 80 | n_init="auto", 81 | ) 82 | for i in range(0, len(paths), batch_size): 83 | LOG.info( 84 | f"Processing batch {i//batch_size+1}/{n_batches} for speaker {input_dir.stem}" 85 | ) 86 | features = [] 87 | for path in paths[i : i + batch_size]: 88 | with path.open("rb") as f: 89 | features.append( 90 | torch.load(f, weights_only=True)["content"] 91 | .squeeze(0) 92 | .numpy() 93 | .T 94 | ) 95 | features = np.concatenate(features, axis=0).astype(np.float32) 96 | kmeans.partial_fit(features) 97 | LOG.info(f"Clustering took {t.elapsed:.2f} seconds") 98 | 99 | x = { 100 | "n_features_in_": kmeans.n_features_in_, 101 | "_n_threads": kmeans._n_threads, 102 | "cluster_centers_": kmeans.cluster_centers_, 103 | } 104 | return x 105 | 106 | 107 | def main( 108 | input_dir: Path | str, 109 | output_path: Path | str, 110 | n_clusters: int = 10000, 111 | use_minibatch: bool = True, 112 | batch_size: int = 4096, 113 | partial_fit: bool = False, 114 | verbose: bool = False, 115 | ) -> None: 116 | input_dir = Path(input_dir) 117 | output_path = Path(output_path) 118 | 119 | if not (use_minibatch or not partial_fit): 120 | raise ValueError("partial_fit requires use_minibatch") 121 | 122 | def train_cluster_(input_path: Path, **kwargs: Any) -> tuple[str, dict]: 123 | return input_path.stem, train_cluster(input_path, **kwargs) 124 | 125 | with tqdm_joblib(desc="Training clusters", total=len(list(input_dir.iterdir()))): 126 | parallel_result = Parallel(n_jobs=-1)( 127 | delayed(train_cluster_)( 128 | speaker_name, 129 | n_clusters=n_clusters, 130 | use_minibatch=use_minibatch, 131 | batch_size=batch_size, 132 | partial_fit=partial_fit, 133 | verbose=verbose, 134 | ) 135 | for speaker_name in 
input_dir.iterdir() 136 | ) 137 | assert parallel_result is not None 138 | checkpoint = dict(parallel_result) 139 | output_path.parent.mkdir(exist_ok=True, parents=True) 140 | with output_path.open("wb") as f: 141 | torch.save(checkpoint, f) 142 | -------------------------------------------------------------------------------- /src/so_vits_svc_fork/dataset.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from pathlib import Path 4 | from random import Random 5 | from typing import Sequence 6 | 7 | import torch 8 | import torch.nn as nn 9 | import torch.nn.functional as F 10 | from torch.utils.data import Dataset 11 | 12 | from .hparams import HParams 13 | 14 | 15 | class TextAudioDataset(Dataset): 16 | def __init__(self, hps: HParams, is_validation: bool = False): 17 | self.datapaths = [ 18 | Path(x).parent / (Path(x).name + ".data.pt") 19 | for x in Path( 20 | hps.data.validation_files if is_validation else hps.data.training_files 21 | ) 22 | .read_text("utf-8") 23 | .splitlines() 24 | ] 25 | self.hps = hps 26 | self.random = Random(hps.train.seed) 27 | self.random.shuffle(self.datapaths) 28 | self.max_spec_len = 800 29 | 30 | def __getitem__(self, index: int) -> dict[str, torch.Tensor]: 31 | with Path(self.datapaths[index]).open("rb") as f: 32 | data = torch.load(f, weights_only=True, map_location="cpu") 33 | 34 | # cut long data randomly 35 | spec_len = data["mel_spec"].shape[1] 36 | hop_len = self.hps.data.hop_length 37 | if spec_len > self.max_spec_len: 38 | start = self.random.randint(0, spec_len - self.max_spec_len) 39 | end = start + self.max_spec_len - 10 40 | for key in data.keys(): 41 | if key == "audio": 42 | data[key] = data[key][:, start * hop_len : end * hop_len] 43 | elif key == "spk": 44 | continue 45 | else: 46 | data[key] = data[key][..., start:end] 47 | torch.cuda.empty_cache() 48 | return data 49 | 50 | def __len__(self) -> int: 51 | return len(self.datapaths) 52 | 53 | 54 | def _pad_stack(array: Sequence[torch.Tensor]) -> torch.Tensor: 55 | max_idx = torch.argmax(torch.tensor([x_.shape[-1] for x_ in array])) 56 | max_x = array[max_idx] 57 | x_padded = [ 58 | F.pad(x_, (0, max_x.shape[-1] - x_.shape[-1]), mode="constant", value=0) 59 | for x_ in array 60 | ] 61 | return torch.stack(x_padded) 62 | 63 | 64 | class TextAudioCollate(nn.Module): 65 | def forward( 66 | self, batch: Sequence[dict[str, torch.Tensor]] 67 | ) -> tuple[torch.Tensor, ...]: 68 | batch = [b for b in batch if b is not None] 69 | batch = list(sorted(batch, key=lambda x: x["mel_spec"].shape[1], reverse=True)) 70 | lengths = torch.tensor([b["mel_spec"].shape[1] for b in batch]).long() 71 | results = {} 72 | for key in batch[0].keys(): 73 | if key not in ["spk"]: 74 | results[key] = _pad_stack([b[key] for b in batch]).cpu() 75 | else: 76 | results[key] = torch.tensor([[b[key]] for b in batch]).cpu() 77 | 78 | return ( 79 | results["content"], 80 | results["f0"], 81 | results["spec"], 82 | results["mel_spec"], 83 | results["audio"], 84 | results["spk"], 85 | lengths, 86 | results["uv"], 87 | ) 88 | -------------------------------------------------------------------------------- /src/so_vits_svc_fork/default_gui_presets.json: -------------------------------------------------------------------------------- 1 | { 2 | "Default VC (GPU, GTX 1060)": { 3 | "silence_threshold": -35.0, 4 | "transpose": 12.0, 5 | "auto_predict_f0": false, 6 | "f0_method": "dio", 7 | "cluster_infer_ratio": 0.0, 8 | "noise_scale": 0.4, 9 | 
"pad_seconds": 0.1, 10 | "chunk_seconds": 0.5, 11 | "absolute_thresh": true, 12 | "max_chunk_seconds": 40, 13 | "crossfade_seconds": 0.05, 14 | "block_seconds": 0.35, 15 | "additional_infer_before_seconds": 0.15, 16 | "additional_infer_after_seconds": 0.1, 17 | "realtime_algorithm": "1 (Divide constantly)", 18 | "passthrough_original": false, 19 | "use_gpu": true 20 | }, 21 | "Default VC (CPU)": { 22 | "silence_threshold": -35.0, 23 | "transpose": 12.0, 24 | "auto_predict_f0": false, 25 | "f0_method": "dio", 26 | "cluster_infer_ratio": 0.0, 27 | "noise_scale": 0.4, 28 | "pad_seconds": 0.1, 29 | "chunk_seconds": 0.5, 30 | "absolute_thresh": true, 31 | "max_chunk_seconds": 40, 32 | "crossfade_seconds": 0.05, 33 | "block_seconds": 1.5, 34 | "additional_infer_before_seconds": 0.01, 35 | "additional_infer_after_seconds": 0.01, 36 | "realtime_algorithm": "1 (Divide constantly)", 37 | "passthrough_original": false, 38 | "use_gpu": false 39 | }, 40 | "Default VC (Mobile CPU)": { 41 | "silence_threshold": -35.0, 42 | "transpose": 12.0, 43 | "auto_predict_f0": false, 44 | "f0_method": "dio", 45 | "cluster_infer_ratio": 0.0, 46 | "noise_scale": 0.4, 47 | "pad_seconds": 0.1, 48 | "chunk_seconds": 0.5, 49 | "absolute_thresh": true, 50 | "max_chunk_seconds": 40, 51 | "crossfade_seconds": 0.05, 52 | "block_seconds": 2.5, 53 | "additional_infer_before_seconds": 0.01, 54 | "additional_infer_after_seconds": 0.01, 55 | "realtime_algorithm": "1 (Divide constantly)", 56 | "passthrough_original": false, 57 | "use_gpu": false 58 | }, 59 | "Default VC (Crooning)": { 60 | "silence_threshold": -35.0, 61 | "transpose": 12.0, 62 | "auto_predict_f0": false, 63 | "f0_method": "dio", 64 | "cluster_infer_ratio": 0.0, 65 | "noise_scale": 0.4, 66 | "pad_seconds": 0.1, 67 | "chunk_seconds": 0.5, 68 | "absolute_thresh": true, 69 | "max_chunk_seconds": 40, 70 | "crossfade_seconds": 0.04, 71 | "block_seconds": 0.15, 72 | "additional_infer_before_seconds": 0.05, 73 | "additional_infer_after_seconds": 0.05, 74 | "realtime_algorithm": "1 (Divide constantly)", 75 | "passthrough_original": false, 76 | "use_gpu": true 77 | }, 78 | "Default File": { 79 | "silence_threshold": -35.0, 80 | "transpose": 0.0, 81 | "auto_predict_f0": true, 82 | "f0_method": "crepe", 83 | "cluster_infer_ratio": 0.0, 84 | "noise_scale": 0.4, 85 | "pad_seconds": 0.1, 86 | "chunk_seconds": 0.5, 87 | "absolute_thresh": true, 88 | "max_chunk_seconds": 40, 89 | "auto_play": true, 90 | "passthrough_original": false 91 | } 92 | } 93 | -------------------------------------------------------------------------------- /src/so_vits_svc_fork/f0.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from logging import getLogger 4 | from typing import Any, Literal 5 | 6 | import numpy as np 7 | import torch 8 | import torchcrepe 9 | from cm_time import timer 10 | from numpy import dtype, float32, ndarray 11 | from torch import FloatTensor, Tensor 12 | 13 | from so_vits_svc_fork.utils import get_optimal_device 14 | 15 | LOG = getLogger(__name__) 16 | 17 | 18 | def normalize_f0( 19 | f0: FloatTensor, x_mask: FloatTensor, uv: FloatTensor, random_scale=True 20 | ) -> FloatTensor: 21 | # calculate means based on x_mask 22 | uv_sum = torch.sum(uv, dim=1, keepdim=True) 23 | uv_sum[uv_sum == 0] = 9999 24 | means = torch.sum(f0[:, 0, :] * uv, dim=1, keepdim=True) / uv_sum 25 | 26 | if random_scale: 27 | factor = torch.Tensor(f0.shape[0], 1).uniform_(0.8, 1.2).to(f0.device) 28 | else: 29 | factor = 
torch.ones(f0.shape[0], 1).to(f0.device) 30 | # normalize f0 based on means and factor 31 | f0_norm = (f0 - means.unsqueeze(-1)) * factor.unsqueeze(-1) 32 | if torch.isnan(f0_norm).any(): 33 | exit(0) 34 | return f0_norm * x_mask 35 | 36 | 37 | def interpolate_f0( 38 | f0: ndarray[Any, dtype[float32]] 39 | ) -> tuple[ndarray[Any, dtype[float32]], ndarray[Any, dtype[float32]]]: 40 | data = np.reshape(f0, (f0.size, 1)) 41 | 42 | vuv_vector = np.zeros((data.size, 1), dtype=np.float32) 43 | vuv_vector[data > 0.0] = 1.0 44 | vuv_vector[data <= 0.0] = 0.0 45 | 46 | ip_data = data 47 | 48 | frame_number = data.size 49 | last_value = 0.0 50 | for i in range(frame_number): 51 | if data[i] <= 0.0: 52 | j = i + 1 53 | for j in range(i + 1, frame_number): 54 | if data[j] > 0.0: 55 | break 56 | if j < frame_number - 1: 57 | if last_value > 0.0: 58 | step = (data[j] - data[i - 1]) / float(j - i) 59 | for k in range(i, j): 60 | ip_data[k] = data[i - 1] + step * (k - i + 1) 61 | else: 62 | for k in range(i, j): 63 | ip_data[k] = data[j] 64 | else: 65 | for k in range(i, frame_number): 66 | ip_data[k] = last_value 67 | else: 68 | ip_data[i] = data[i] 69 | last_value = data[i] 70 | 71 | return ip_data[:, 0], vuv_vector[:, 0] 72 | 73 | 74 | def compute_f0_parselmouth( 75 | wav_numpy: ndarray[Any, dtype[float32]], 76 | p_len: None | int = None, 77 | sampling_rate: int = 44100, 78 | hop_length: int = 512, 79 | ): 80 | import parselmouth 81 | 82 | x = wav_numpy 83 | if p_len is None: 84 | p_len = x.shape[0] // hop_length 85 | else: 86 | assert abs(p_len - x.shape[0] // hop_length) < 4, "pad length error" 87 | time_step = hop_length / sampling_rate * 1000 88 | f0_min = 50 89 | f0_max = 1100 90 | f0 = ( 91 | parselmouth.Sound(x, sampling_rate) 92 | .to_pitch_ac( 93 | time_step=time_step / 1000, 94 | voicing_threshold=0.6, 95 | pitch_floor=f0_min, 96 | pitch_ceiling=f0_max, 97 | ) 98 | .selected_array["frequency"] 99 | ) 100 | 101 | pad_size = (p_len - len(f0) + 1) // 2 102 | if pad_size > 0 or p_len - len(f0) - pad_size > 0: 103 | f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant") 104 | return f0 105 | 106 | 107 | def _resize_f0( 108 | x: ndarray[Any, dtype[float32]], target_len: int 109 | ) -> ndarray[Any, dtype[float32]]: 110 | source = np.array(x) 111 | source[source < 0.001] = np.nan 112 | target = np.interp( 113 | np.arange(0, len(source) * target_len, len(source)) / target_len, 114 | np.arange(0, len(source)), 115 | source, 116 | ) 117 | res = np.nan_to_num(target) 118 | return res 119 | 120 | 121 | def compute_f0_pyworld( 122 | wav_numpy: ndarray[Any, dtype[float32]], 123 | p_len: None | int = None, 124 | sampling_rate: int = 44100, 125 | hop_length: int = 512, 126 | type_: Literal["dio", "harvest"] = "dio", 127 | ): 128 | import pyworld 129 | 130 | if p_len is None: 131 | p_len = wav_numpy.shape[0] // hop_length 132 | if type_ == "dio": 133 | f0, t = pyworld.dio( 134 | wav_numpy.astype(np.double), 135 | fs=sampling_rate, 136 | f0_ceil=f0_max, 137 | f0_floor=f0_min, 138 | frame_period=1000 * hop_length / sampling_rate, 139 | ) 140 | elif type_ == "harvest": 141 | f0, t = pyworld.harvest( 142 | wav_numpy.astype(np.double), 143 | fs=sampling_rate, 144 | f0_ceil=f0_max, 145 | f0_floor=f0_min, 146 | frame_period=1000 * hop_length / sampling_rate, 147 | ) 148 | f0 = pyworld.stonemask(wav_numpy.astype(np.double), f0, t, sampling_rate) 149 | for index, pitch in enumerate(f0): 150 | f0[index] = round(pitch, 1) 151 | return _resize_f0(f0, p_len) 152 | 153 | 154 | def compute_f0_crepe( 155 | 
wav_numpy: ndarray[Any, dtype[float32]], 156 | p_len: None | int = None, 157 | sampling_rate: int = 44100, 158 | hop_length: int = 512, 159 | device: str | torch.device = get_optimal_device(), 160 | model: Literal["full", "tiny"] = "full", 161 | ): 162 | audio = torch.from_numpy(wav_numpy).to(device, copy=True) 163 | audio = torch.unsqueeze(audio, dim=0) 164 | 165 | if audio.ndim == 2 and audio.shape[0] > 1: 166 | audio = torch.mean(audio, dim=0, keepdim=True).detach() 167 | # (T) -> (1, T) 168 | audio = audio.detach() 169 | 170 | pitch: Tensor = torchcrepe.predict( 171 | audio, 172 | sampling_rate, 173 | hop_length, 174 | f0_min, 175 | f0_max, 176 | model, 177 | batch_size=hop_length * 2, 178 | device=device, 179 | pad=True, 180 | ) 181 | 182 | f0 = pitch.squeeze(0).cpu().float().numpy() 183 | p_len = p_len or wav_numpy.shape[0] // hop_length 184 | f0 = _resize_f0(f0, p_len) 185 | return f0 186 | 187 | 188 | def compute_f0( 189 | wav_numpy: ndarray[Any, dtype[float32]], 190 | p_len: None | int = None, 191 | sampling_rate: int = 44100, 192 | hop_length: int = 512, 193 | method: Literal["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"] = "dio", 194 | **kwargs, 195 | ): 196 | with timer() as t: 197 | wav_numpy = wav_numpy.astype(np.float32) 198 | wav_numpy /= np.quantile(np.abs(wav_numpy), 0.999) 199 | if method in ["dio", "harvest"]: 200 | f0 = compute_f0_pyworld(wav_numpy, p_len, sampling_rate, hop_length, method) 201 | elif method == "crepe": 202 | f0 = compute_f0_crepe(wav_numpy, p_len, sampling_rate, hop_length, **kwargs) 203 | elif method == "crepe-tiny": 204 | f0 = compute_f0_crepe( 205 | wav_numpy, p_len, sampling_rate, hop_length, model="tiny", **kwargs 206 | ) 207 | elif method == "parselmouth": 208 | f0 = compute_f0_parselmouth(wav_numpy, p_len, sampling_rate, hop_length) 209 | else: 210 | raise ValueError( 211 | "method must be dio, crepe, crepe-tiny, harvest or parselmouth" 212 | ) 213 | rtf = t.elapsed / (len(wav_numpy) / sampling_rate) 214 | LOG.info(f"F0 inference time: {t.elapsed:.3f}s, RTF: {rtf:.3f}") 215 | return f0 216 | 217 | 218 | def f0_to_coarse(f0: torch.Tensor | float): 219 | is_torch = isinstance(f0, torch.Tensor) 220 | f0_mel = 1127 * (1 + f0 / 700).log() if is_torch else 1127 * np.log(1 + f0 / 700) 221 | f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * (f0_bin - 2) / ( 222 | f0_mel_max - f0_mel_min 223 | ) + 1 224 | 225 | f0_mel[f0_mel <= 1] = 1 226 | f0_mel[f0_mel > f0_bin - 1] = f0_bin - 1 227 | f0_coarse = (f0_mel + 0.5).long() if is_torch else np.rint(f0_mel).astype(np.int64)  # np.int was removed in NumPy 1.24; use a concrete dtype 228 | assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, ( 229 | f0_coarse.max(), 230 | f0_coarse.min(), 231 | ) 232 | return f0_coarse 233 | 234 | 235 | f0_bin = 256 236 | f0_max = 1100.0 237 | f0_min = 50.0 238 | f0_mel_min = 1127 * np.log(1 + f0_min / 700) 239 | f0_mel_max = 1127 * np.log(1 + f0_max / 700) 240 |
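A minimal sketch of calling `compute_f0` and `f0_to_coarse` from this module; the synthetic sine below is a stand-in for real audio:

```python
import numpy as np
import torch

from so_vits_svc_fork.f0 import compute_f0, f0_to_coarse

# One second of a 220 Hz tone at 44.1 kHz, purely illustrative input.
sr = 44100
t = np.arange(sr, dtype=np.float32) / sr
wav = np.sin(2 * np.pi * 220.0 * t).astype(np.float32)

# F0 per hop: shape (len(wav) // hop_length,).
f0 = compute_f0(wav, sampling_rate=sr, hop_length=512, method="dio")
# Quantize to the module's f0_bin coarse pitch buckets (values in [1, 255]).
coarse = f0_to_coarse(torch.from_numpy(f0))
```

-------------------------------------------------------------------------------- /src/so_vits_svc_fork/hparams.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Any 4 | 5 | 6 | class HParams: 7 | def __init__(self, **kwargs: Any) -> None: 8 | for k, v in kwargs.items(): 9 | if type(v) == dict: # noqa 10 | v = HParams(**v) 11 | self[k] = v 12 | 13 | def keys(self): 14 | return self.__dict__.keys() 15 | 16 | def items(self): 17 | return self.__dict__.items() 18 | 19 | def values(self): 20 | return self.__dict__.values() 21 | 22 | def get(self, key: str, default: Any = None):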
23 | return self.__dict__.get(key, default) 24 | 25 | def __len__(self): 26 | return len(self.__dict__) 27 | 28 | def __getitem__(self, key): 29 | return getattr(self, key) 30 | 31 | def __setitem__(self, key, value): 32 | return setattr(self, key, value) 33 | 34 | def __contains__(self, key): 35 | return key in self.__dict__ 36 | 37 | def __repr__(self): 38 | return self.__dict__.__repr__() 39 | -------------------------------------------------------------------------------- /src/so_vits_svc_fork/inference/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/voicepaw/so-vits-svc-fork/0f015e32aada5cf7481f91bbe6758e574c9c5f39/src/so_vits_svc_fork/inference/__init__.py -------------------------------------------------------------------------------- /src/so_vits_svc_fork/inference/main.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from logging import getLogger 4 | from pathlib import Path 5 | from typing import Literal, Sequence 6 | 7 | import librosa 8 | import numpy as np 9 | import soundfile 10 | import torch 11 | from cm_time import timer 12 | from tqdm import tqdm 13 | 14 | from so_vits_svc_fork.inference.core import RealtimeVC, RealtimeVC2, Svc 15 | from so_vits_svc_fork.utils import get_optimal_device 16 | 17 | LOG = getLogger(__name__) 18 | 19 | 20 | def infer( 21 | *, 22 | # paths 23 | input_path: Path | str | Sequence[Path | str], 24 | output_path: Path | str | Sequence[Path | str], 25 | model_path: Path | str, 26 | config_path: Path | str, 27 | recursive: bool = False, 28 | # svc config 29 | speaker: int | str, 30 | cluster_model_path: Path | str | None = None, 31 | transpose: int = 0, 32 | auto_predict_f0: bool = False, 33 | cluster_infer_ratio: float = 0, 34 | noise_scale: float = 0.4, 35 | f0_method: Literal["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"] = "dio", 36 | # slice config 37 | db_thresh: int = -40, 38 | pad_seconds: float = 0.5, 39 | chunk_seconds: float = 0.5, 40 | absolute_thresh: bool = False, 41 | max_chunk_seconds: float = 40, 42 | device: str | torch.device = get_optimal_device(), 43 | ): 44 | if isinstance(input_path, (str, Path)): 45 | input_path = [input_path] 46 | if isinstance(output_path, (str, Path)): 47 | output_path = [output_path] 48 | if len(input_path) != len(output_path): 49 | raise ValueError( 50 | f"input_path and output_path must have same length, but got {len(input_path)} and {len(output_path)}" 51 | ) 52 | 53 | model_path = Path(model_path) 54 | config_path = Path(config_path) 55 | output_path = [Path(p) for p in output_path] 56 | input_path = [Path(p) for p in input_path] 57 | output_paths = [] 58 | input_paths = [] 59 | 60 | for input_path, output_path in zip(input_path, output_path): 61 | if input_path.is_dir(): 62 | if not recursive: 63 | raise ValueError( 64 | f"input_path is a directory, but recursive is False: {input_path}" 65 | ) 66 | new_paths = list(input_path.rglob("*.*"))  # glob only this directory's files 67 | input_paths.extend(new_paths) 68 | # pair each discovered file with a mirrored output path (was: iterating the whole accumulated input_paths list, wrong for multiple directory inputs) 69 | output_paths.extend([output_path / p.relative_to(input_path) for p in new_paths]) 70 | continue 71 | input_paths.append(input_path) 72 | output_paths.append(output_path) 73 | 74 | cluster_model_path = Path(cluster_model_path) if cluster_model_path else None 75 | svc_model = Svc( 76 | net_g_path=model_path.as_posix(), 77 | config_path=config_path.as_posix(), 78 | cluster_model_path=( 79 | cluster_model_path.as_posix() if cluster_model_path else None 80 | ), 81
| device=device, 82 | ) 83 | 84 | try: 85 | pbar = tqdm(list(zip(input_paths, output_paths)), disable=len(input_paths) == 1) 86 | for input_path, output_path in pbar: 87 | pbar.set_description(f"{input_path}") 88 | try: 89 | audio, _ = librosa.load(str(input_path), sr=svc_model.target_sample) 90 | except Exception as e: 91 | LOG.error(f"Failed to load {input_path}") 92 | LOG.exception(e) 93 | continue 94 | output_path.parent.mkdir(parents=True, exist_ok=True) 95 | audio = svc_model.infer_silence( 96 | audio.astype(np.float32), 97 | speaker=speaker, 98 | transpose=transpose, 99 | auto_predict_f0=auto_predict_f0, 100 | cluster_infer_ratio=cluster_infer_ratio, 101 | noise_scale=noise_scale, 102 | f0_method=f0_method, 103 | db_thresh=db_thresh, 104 | pad_seconds=pad_seconds, 105 | chunk_seconds=chunk_seconds, 106 | absolute_thresh=absolute_thresh, 107 | max_chunk_seconds=max_chunk_seconds, 108 | ) 109 | soundfile.write(str(output_path), audio, svc_model.target_sample) 110 | finally: 111 | del svc_model 112 | torch.cuda.empty_cache() 113 | 114 | 115 | def realtime( 116 | *, 117 | # paths 118 | model_path: Path | str, 119 | config_path: Path | str, 120 | # svc config 121 | speaker: str, 122 | cluster_model_path: Path | str | None = None, 123 | transpose: int = 0, 124 | auto_predict_f0: bool = False, 125 | cluster_infer_ratio: float = 0, 126 | noise_scale: float = 0.4, 127 | f0_method: Literal["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"] = "dio", 128 | # slice config 129 | db_thresh: int = -40, 130 | pad_seconds: float = 0.5, 131 | chunk_seconds: float = 0.5, 132 | # realtime config 133 | crossfade_seconds: float = 0.05, 134 | additional_infer_before_seconds: float = 0.2, 135 | additional_infer_after_seconds: float = 0.1, 136 | block_seconds: float = 0.5, 137 | version: int = 2, 138 | input_device: int | str | None = None, 139 | output_device: int | str | None = None, 140 | device: str | torch.device = get_optimal_device(), 141 | passthrough_original: bool = False, 142 | ): 143 | import sounddevice as sd 144 | 145 | model_path = Path(model_path) 146 | config_path = Path(config_path) 147 | cluster_model_path = Path(cluster_model_path) if cluster_model_path else None 148 | svc_model = Svc( 149 | net_g_path=model_path.as_posix(), 150 | config_path=config_path.as_posix(), 151 | cluster_model_path=( 152 | cluster_model_path.as_posix() if cluster_model_path else None 153 | ), 154 | device=device, 155 | ) 156 | 157 | LOG.info("Creating realtime model...") 158 | if version == 1: 159 | model = RealtimeVC( 160 | svc_model=svc_model, 161 | crossfade_len=int(crossfade_seconds * svc_model.target_sample), 162 | additional_infer_before_len=int( 163 | additional_infer_before_seconds * svc_model.target_sample 164 | ), 165 | additional_infer_after_len=int( 166 | additional_infer_after_seconds * svc_model.target_sample 167 | ), 168 | ) 169 | else: 170 | model = RealtimeVC2( 171 | svc_model=svc_model, 172 | ) 173 | 174 | # LOG all device info 175 | devices = sd.query_devices() 176 | LOG.info(f"Device: {devices}") 177 | if isinstance(input_device, str): 178 | input_device_candidates = [ 179 | i for i, d in enumerate(devices) if d["name"] == input_device 180 | ] 181 | if len(input_device_candidates) == 0: 182 | LOG.warning(f"Input device {input_device} not found, using default") 183 | input_device = None 184 | else: 185 | input_device = input_device_candidates[0] 186 | if isinstance(output_device, str): 187 | output_device_candidates = [ 188 | i for i, d in enumerate(devices) if d["name"] == output_device 
189 | ] 190 | if len(output_device_candidates) == 0: 191 | LOG.warning(f"Output device {output_device} not found, using default") 192 | output_device = None 193 | else: 194 | output_device = output_device_candidates[0] 195 | if input_device is None or input_device >= len(devices): 196 | input_device = sd.default.device[0] 197 | if output_device is None or output_device >= len(devices): 198 | output_device = sd.default.device[1] 199 | LOG.info( 200 | f"Input Device: {devices[input_device]['name']}, Output Device: {devices[output_device]['name']}" 201 | ) 202 | 203 | # the model's RTF is significantly higher only in the first inference 204 | # there is no better way to warm up the model than to run a dummy inference 205 | # (the model behaves identically in the first and later inferences) 206 | # so we do a dummy inference to warm up the model (1 second of audio) 207 | LOG.info("Warming up the model...") 208 | svc_model.infer( 209 | speaker=speaker, 210 | transpose=transpose, 211 | auto_predict_f0=auto_predict_f0, 212 | cluster_infer_ratio=cluster_infer_ratio, 213 | noise_scale=noise_scale, 214 | f0_method=f0_method, 215 | audio=np.zeros(svc_model.target_sample, dtype=np.float32), 216 | ) 217 | 218 | def callback( 219 | indata: np.ndarray, 220 | outdata: np.ndarray, 221 | frames: int, 222 | time: int, 223 | status: sd.CallbackFlags, 224 | ) -> None: 225 | LOG.debug( 226 | f"Frames: {frames}, Status: {status}, Shape: {indata.shape}, Time: {time}" 227 | ) 228 | 229 | kwargs = dict( 230 | input_audio=indata.mean(axis=1).astype(np.float32), 231 | # svc config 232 | speaker=speaker, 233 | transpose=transpose, 234 | auto_predict_f0=auto_predict_f0, 235 | cluster_infer_ratio=cluster_infer_ratio, 236 | noise_scale=noise_scale, 237 | f0_method=f0_method, 238 | # slice config 239 | db_thresh=db_thresh, 240 | # pad_seconds=pad_seconds, 241 | chunk_seconds=chunk_seconds, 242 | ) 243 | if version == 1: 244 | kwargs["pad_seconds"] = pad_seconds 245 | with timer() as t: 246 | inference = model.process( 247 | **kwargs, 248 | ).reshape(-1, 1) 249 | if passthrough_original: 250 | outdata[:] = (indata + inference) / 2 251 | else: 252 | outdata[:] = inference 253 | rtf = t.elapsed / block_seconds 254 | LOG.info(f"Realtime inference time: {t.elapsed:.3f}s, RTF: {rtf:.3f}") 255 | if rtf > 1: 256 | LOG.warning("RTF is too high, consider increasing block_seconds") 257 | 258 | try: 259 | with sd.Stream( 260 | device=(input_device, output_device), 261 | channels=1, 262 | callback=callback, 263 | samplerate=svc_model.target_sample, 264 | blocksize=int(block_seconds * svc_model.target_sample), 265 | latency="low", 266 | ) as stream: 267 | LOG.info(f"Latency: {stream.latency}") 268 | while True: 269 | sd.sleep(1000) 270 | finally: 271 | # del model, svc_model 272 | torch.cuda.empty_cache() 273 |
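A minimal sketch of starting realtime conversion through this function; the checkpoint and config paths and the speaker name are placeholders, and the call blocks until interrupted, streaming from the default input device to the default output device:

```python
from so_vits_svc_fork.inference.main import realtime

# Streams microphone input through the model until interrupted (Ctrl+C).
realtime(
    model_path="logs/44k/G_0.pth",  # hypothetical checkpoint name
    config_path="configs/44k/config.json",
    speaker="kiritan",  # illustrative speaker name
    block_seconds=0.5,  # larger blocks lower the RTF at the cost of latency
)
```

-------------------------------------------------------------------------------- /src/so_vits_svc_fork/logger.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from logging import DEBUG, INFO, StreamHandler, basicConfig, captureWarnings, getLogger 4 | from pathlib import Path 5 | 6 | from rich.logging import RichHandler 7 | 8 | LOGGER_INIT = False 9 | 10 | 11 | def init_logger() -> None: 12 | global LOGGER_INIT 13 | if LOGGER_INIT: 14 | return 15 | 16 | IS_TEST = "test" in Path.cwd().stem 17 | package_name = sys.modules[__name__].__package__ 18 | basicConfig( 19 | level=INFO, 20 | format="%(asctime)s %(message)s", 21 |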
datefmt="[%X]", 22 | handlers=[ 23 | StreamHandler() if is_notebook() else RichHandler(), 24 | # FileHandler(f"{package_name}.log"), 25 | ], 26 | ) 27 | if IS_TEST: 28 | getLogger(package_name).setLevel(DEBUG) 29 | captureWarnings(True) 30 | LOGGER_INIT = True 31 | 32 | 33 | def is_notebook(): 34 | try: 35 | from IPython import get_ipython 36 | 37 | if "IPKernelApp" not in get_ipython().config: # pragma: no cover 38 | raise ImportError("console") 39 | return False 40 | if "VSCODE_PID" in os.environ: # pragma: no cover 41 | raise ImportError("vscode") 42 | return False 43 | except Exception: 44 | return False 45 | else: # pragma: no cover 46 | return True 47 | -------------------------------------------------------------------------------- /src/so_vits_svc_fork/modules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/voicepaw/so-vits-svc-fork/0f015e32aada5cf7481f91bbe6758e574c9c5f39/src/so_vits_svc_fork/modules/__init__.py -------------------------------------------------------------------------------- /src/so_vits_svc_fork/modules/commons.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import torch 4 | import torch.nn.functional as F 5 | from torch import Tensor 6 | 7 | 8 | def slice_segments(x: Tensor, starts: Tensor, length: int) -> Tensor: 9 | if length is None: 10 | return x 11 | length = min(length, x.size(-1)) 12 | x_slice = torch.zeros((x.size()[:-1] + (length,)), dtype=x.dtype, device=x.device) 13 | ends = starts + length 14 | for i, (start, end) in enumerate(zip(starts, ends)): 15 | # LOG.debug(i, start, end, x.size(), x[i, ..., start:end].size(), x_slice.size()) 16 | # x_slice[i, ...] = x[i, ..., start:end] need to pad 17 | # x_slice[i, ..., :end - start] = x[i, ..., start:end] this does not work 18 | x_slice[i, ...] 
= F.pad(x[i, ..., start:end], (0, max(0, length - x.size(-1)))) 19 | return x_slice 20 | 21 | 22 | def rand_slice_segments_with_pitch( 23 | x: Tensor, f0: Tensor, x_lengths: Tensor | int | None, segment_size: int | None 24 | ): 25 | if segment_size is None: 26 | return x, f0, torch.arange(x.size(0), device=x.device) 27 | if x_lengths is None: 28 | x_lengths = x.size(-1) * torch.ones( 29 | x.size(0), dtype=torch.long, device=x.device 30 | ) 31 | # slice_starts = (torch.rand(z.size(0), device=z.device) * (z_lengths - segment_size)).long() 32 | slice_starts = ( 33 | torch.rand(x.size(0), device=x.device) 34 | * torch.max( 35 | x_lengths - segment_size, torch.zeros_like(x_lengths, device=x.device) 36 | ) 37 | ).long() 38 | z_slice = slice_segments(x, slice_starts, segment_size) 39 | f0_slice = slice_segments(f0, slice_starts, segment_size) 40 | return z_slice, f0_slice, slice_starts 41 | 42 | 43 | def slice_2d_segments(x: Tensor, starts: Tensor, length: int) -> Tensor: 44 | batch_size, num_features, seq_len = x.shape 45 | ends = starts + length 46 | idxs = ( 47 | torch.arange(seq_len, device=x.device) 48 | .unsqueeze(0) 49 | .unsqueeze(1) 50 | .repeat(batch_size, num_features, 1) 51 | ) 52 | mask = (idxs >= starts.unsqueeze(-1).unsqueeze(-1)) & ( 53 | idxs < ends.unsqueeze(-1).unsqueeze(-1) 54 | ) 55 | return x[mask].reshape(batch_size, num_features, length) 56 | 57 | 58 | def slice_1d_segments(x: Tensor, starts: Tensor, length: int) -> Tensor: 59 | batch_size, seq_len = x.shape 60 | ends = starts + length 61 | idxs = torch.arange(seq_len, device=x.device).unsqueeze(0).repeat(batch_size, 1) 62 | mask = (idxs >= starts.unsqueeze(-1)) & (idxs < ends.unsqueeze(-1)) 63 | return x[mask].reshape(batch_size, length) 64 | 65 | 66 | def _slice_segments_v3(x: Tensor, starts: Tensor, length: int) -> Tensor: 67 | shape = x.shape[:-1] + (length,) 68 | ends = starts + length 69 | idxs = torch.arange(x.shape[-1], device=x.device).unsqueeze(0).unsqueeze(0) 70 | unsqueeze_dims = len(shape) - len( 71 | x.shape 72 | ) # calculate number of dimensions to unsqueeze 73 | starts = starts.reshape(starts.shape + (1,) * unsqueeze_dims) 74 | ends = ends.reshape(ends.shape + (1,) * unsqueeze_dims) 75 | mask = (idxs >= starts) & (idxs < ends) 76 | return x[mask].reshape(shape) 77 | 78 | 79 | def init_weights(m, mean=0.0, std=0.01): 80 | classname = m.__class__.__name__ 81 | if classname.find("Conv") != -1: 82 | m.weight.data.normal_(mean, std) 83 | 84 | 85 | def get_padding(kernel_size, dilation=1): 86 | return int((kernel_size * dilation - dilation) / 2) 87 | 88 | 89 | def convert_pad_shape(pad_shape): 90 | l = pad_shape[::-1] 91 | pad_shape = [item for sublist in l for item in sublist] 92 | return pad_shape 93 | 94 | 95 | def subsequent_mask(length): 96 | mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0) 97 | return mask 98 | 99 | 100 | @torch.jit.script 101 | def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): 102 | n_channels_int = n_channels[0] 103 | in_act = input_a + input_b 104 | t_act = torch.tanh(in_act[:, :n_channels_int, :]) 105 | s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) 106 | acts = t_act * s_act 107 | return acts 108 | 109 | 110 | def sequence_mask(length, max_length=None): 111 | if max_length is None: 112 | max_length = length.max() 113 | x = torch.arange(max_length, dtype=length.dtype, device=length.device) 114 | return x.unsqueeze(0) < length.unsqueeze(1) 115 | 116 | 117 | def clip_grad_value_(parameters, clip_value, norm_type=2): 118 | if 
isinstance(parameters, torch.Tensor): 119 | parameters = [parameters] 120 | parameters = list(filter(lambda p: p.grad is not None, parameters)) 121 | norm_type = float(norm_type) 122 | if clip_value is not None: 123 | clip_value = float(clip_value) 124 | 125 | total_norm = 0 126 | for p in parameters: 127 | param_norm = p.grad.data.norm(norm_type) 128 | total_norm += param_norm.item() ** norm_type 129 | if clip_value is not None: 130 | p.grad.data.clamp_(min=-clip_value, max=clip_value) 131 | total_norm = total_norm ** (1.0 / norm_type) 132 | return total_norm 133 | -------------------------------------------------------------------------------- /src/so_vits_svc_fork/modules/decoders/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/voicepaw/so-vits-svc-fork/0f015e32aada5cf7481f91bbe6758e574c9c5f39/src/so_vits_svc_fork/modules/decoders/__init__.py -------------------------------------------------------------------------------- /src/so_vits_svc_fork/modules/decoders/f0.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | from so_vits_svc_fork.modules import attentions as attentions 5 | 6 | 7 | class F0Decoder(nn.Module): 8 | def __init__( 9 | self, 10 | out_channels, 11 | hidden_channels, 12 | filter_channels, 13 | n_heads, 14 | n_layers, 15 | kernel_size, 16 | p_dropout, 17 | spk_channels=0, 18 | ): 19 | super().__init__() 20 | self.out_channels = out_channels 21 | self.hidden_channels = hidden_channels 22 | self.filter_channels = filter_channels 23 | self.n_heads = n_heads 24 | self.n_layers = n_layers 25 | self.kernel_size = kernel_size 26 | self.p_dropout = p_dropout 27 | self.spk_channels = spk_channels 28 | 29 | self.prenet = nn.Conv1d(hidden_channels, hidden_channels, 3, padding=1) 30 | self.decoder = attentions.FFT( 31 | hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout 32 | ) 33 | self.proj = nn.Conv1d(hidden_channels, out_channels, 1) 34 | self.f0_prenet = nn.Conv1d(1, hidden_channels, 3, padding=1) 35 | self.cond = nn.Conv1d(spk_channels, hidden_channels, 1) 36 | 37 | def forward(self, x, norm_f0, x_mask, spk_emb=None): 38 | x = torch.detach(x) 39 | if spk_emb is not None: 40 | spk_emb = torch.detach(spk_emb) 41 | x = x + self.cond(spk_emb) 42 | x += self.f0_prenet(norm_f0) 43 | x = self.prenet(x) * x_mask 44 | x = self.decoder(x * x_mask, x_mask) 45 | x = self.proj(x) * x_mask 46 | return x 47 | -------------------------------------------------------------------------------- /src/so_vits_svc_fork/modules/decoders/hifigan/__init__.py: -------------------------------------------------------------------------------- 1 | from ._models import NSFHifiGANGenerator 2 | 3 | __all__ = ["NSFHifiGANGenerator"] 4 | -------------------------------------------------------------------------------- /src/so_vits_svc_fork/modules/decoders/hifigan/_models.py: -------------------------------------------------------------------------------- 1 | from logging import getLogger 2 | 3 | import numpy as np 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | from torch.nn import Conv1d, ConvTranspose1d 8 | from torch.nn.utils import remove_weight_norm, weight_norm 9 | 10 | from ...modules import ResBlock1, ResBlock2 11 | from ._utils import init_weights 12 | 13 | LOG = getLogger(__name__) 14 | 15 | LRELU_SLOPE = 0.1 16 | 17 | 18 | def padDiff(x): 19 | return F.pad( 20 | F.pad(x, (0, 0, -1, 1), "constant", 0) - x, (0, 0, 0, -1), "constant", 0 21 | ) 22 | 23 | 24 | class SineGen(torch.nn.Module): 25 | """Definition of sine generator 26 | SineGen(samp_rate, harmonic_num = 0, 27 | sine_amp = 0.1, noise_std = 0.003, 28 | voiced_threshold = 0, 29 | flag_for_pulse=False) 30 | samp_rate: sampling rate in Hz 31 | harmonic_num: number of harmonic overtones (default 0) 32 | sine_amp: amplitude of sine-waveform (default 0.1) 33 | noise_std: std of Gaussian noise (default 0.003) 34 | voiced_threshold: F0 threshold for U/V classification (default 0) 35 | flag_for_pulse: this SineGen is used inside PulseGen (default False) 36 | Note: when flag_for_pulse is True, the first time step of a voiced 37 | segment is always sin(np.pi) or cos(0) 38 | """ 39 | 40 | def __init__( 41 | self, 42 | samp_rate, 43 | harmonic_num=0, 44 | sine_amp=0.1, 45 | noise_std=0.003, 46 | voiced_threshold=0, 47 | flag_for_pulse=False, 48 | ): 49 | super().__init__() 50 | self.sine_amp = sine_amp 51 | self.noise_std = noise_std 52 | self.harmonic_num = harmonic_num 53 | self.dim = self.harmonic_num + 1 54 | self.sampling_rate = samp_rate 55 | self.voiced_threshold = voiced_threshold 56 | self.flag_for_pulse = flag_for_pulse 57 | 58 | def _f02uv(self, f0): 59 | # generate uv signal 60 | uv = (f0 > self.voiced_threshold).type(torch.float32) 61 | return uv 62 | 63 | def _f02sine(self, f0_values): 64 | """f0_values: (batchsize, length, dim) 65 | where dim indicates fundamental tone and overtones 66 | """ 67 | # convert to F0 in rad. The integer part n can be ignored 68 | # because 2 * np.pi * n doesn't affect phase 69 | rad_values = (f0_values / self.sampling_rate) % 1 70 | 71 | # initial phase noise (no noise for fundamental component) 72 | rand_ini = torch.rand( 73 | f0_values.shape[0], f0_values.shape[2], device=f0_values.device 74 | ) 75 | rand_ini[:, 0] = 0 76 | rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini 77 | 78 | # instantaneous phase sine[t] = sin(2*pi \sum_i=1 ^{t} rad) 79 | if not self.flag_for_pulse: 80 | # for normal case 81 | 82 | # To prevent torch.cumsum numerical overflow, 83 | # it is necessary to add -1 whenever \sum_k=1^n rad_value_k > 1. 84 | # Buffer tmp_over_one_idx indicates the time step to add -1. 
85 | # This will not change F0 of sine because (x-1) * 2*pi = x * 2*pi 86 | tmp_over_one = torch.cumsum(rad_values, 1) % 1 87 | tmp_over_one_idx = (padDiff(tmp_over_one)) < 0 88 | cumsum_shift = torch.zeros_like(rad_values) 89 | cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0 90 | 91 | sines = torch.sin( 92 | torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi 93 | ) 94 | else: 95 | # If necessary, make sure that the first time step of every 96 | # voiced segment is sin(pi) or cos(0) 97 | # This is used for pulse-train generation 98 | 99 | # identify the last time step in unvoiced segments 100 | uv = self._f02uv(f0_values) 101 | uv_1 = torch.roll(uv, shifts=-1, dims=1) 102 | uv_1[:, -1, :] = 1 103 | u_loc = (uv < 1) * (uv_1 > 0) 104 | 105 | # get the instantaneous phase 106 | tmp_cumsum = torch.cumsum(rad_values, dim=1) 107 | # different batches need to be processed differently 108 | for idx in range(f0_values.shape[0]): 109 | temp_sum = tmp_cumsum[idx, u_loc[idx, :, 0], :] 110 | temp_sum[1:, :] = temp_sum[1:, :] - temp_sum[0:-1, :] 111 | # stores the accumulation of i.phase within 112 | # each voiced segment 113 | tmp_cumsum[idx, :, :] = 0 114 | tmp_cumsum[idx, u_loc[idx, :, 0], :] = temp_sum 115 | 116 | # rad_values - tmp_cumsum: remove the accumulation of i.phase 117 | # within the previous voiced segment. 118 | i_phase = torch.cumsum(rad_values - tmp_cumsum, dim=1) 119 | 120 | # get the sines 121 | sines = torch.cos(i_phase * 2 * np.pi) 122 | return sines 123 | 124 | def forward(self, f0): 125 | """sine_tensor, uv = forward(f0) 126 | input F0: tensor(batchsize=1, length, dim=1) 127 | f0 for unvoiced steps should be 0 128 | output sine_tensor: tensor(batchsize=1, length, dim) 129 | output uv: tensor(batchsize=1, length, 1) 130 | """ 131 | with torch.no_grad(): 132 | # f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device) 133 | # fundamental component 134 | # fn = torch.multiply( 135 | # f0, torch.FloatTensor([[range(1, self.harmonic_num + 2)]]).to(f0.device) 136 | # ) 137 | fn = torch.multiply( 138 | f0, torch.arange(1, self.harmonic_num + 2).to(f0.device).to(f0.dtype) 139 | ) 140 | 141 | # generate sine waveforms 142 | sine_waves = self._f02sine(fn) * self.sine_amp 143 | 144 | # generate uv signal 145 | # uv = torch.ones(f0.shape) 146 | # uv = uv * (f0 > self.voiced_threshold) 147 | uv = self._f02uv(f0) 148 | 149 | # noise: for unvoiced should be similar to sine_amp 150 | # std = self.sine_amp/3 -> max value ~ self.sine_amp 151 | # .
for voiced regions is self.noise_std 152 | noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3 153 | noise = noise_amp * torch.randn_like(sine_waves) 154 | 155 | # first: set the unvoiced part to 0 by uv 156 | # then: additive noise 157 | sine_waves = sine_waves * uv + noise 158 | return sine_waves, uv, noise 159 | 160 | 161 | class SourceModuleHnNSF(torch.nn.Module): 162 | """SourceModule for hn-nsf 163 | SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1, 164 | add_noise_std=0.003, voiced_threshod=0) 165 | sampling_rate: sampling_rate in Hz 166 | harmonic_num: number of harmonic above F0 (default: 0) 167 | sine_amp: amplitude of sine source signal (default: 0.1) 168 | add_noise_std: std of additive Gaussian noise (default: 0.003) 169 | note that amplitude of noise in unvoiced is decided 170 | by sine_amp 171 | voiced_threshold: threshold to set U/V given F0 (default: 0) 172 | Sine_source, noise_source = SourceModuleHnNSF(F0_sampled) 173 | F0_sampled (batchsize, length, 1) 174 | Sine_source (batchsize, length, 1) 175 | noise_source (batchsize, length 1) 176 | uv (batchsize, length, 1) 177 | """ 178 | 179 | def __init__( 180 | self, 181 | sampling_rate, 182 | harmonic_num=0, 183 | sine_amp=0.1, 184 | add_noise_std=0.003, 185 | voiced_threshod=0, 186 | ): 187 | super().__init__() 188 | 189 | self.sine_amp = sine_amp 190 | self.noise_std = add_noise_std 191 | 192 | # to produce sine waveforms 193 | self.l_sin_gen = SineGen( 194 | sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod 195 | ) 196 | 197 | # to merge source harmonics into a single excitation 198 | self.l_linear = torch.nn.Linear(harmonic_num + 1, 1) 199 | self.l_tanh = torch.nn.Tanh() 200 | 201 | def forward(self, x): 202 | """ 203 | Sine_source, noise_source = SourceModuleHnNSF(F0_sampled) 204 | F0_sampled (batchsize, length, 1) 205 | Sine_source (batchsize, length, 1) 206 | noise_source (batchsize, length 1) 207 | """ 208 | # source for harmonic branch 209 | sine_wavs, uv, _ = self.l_sin_gen(x) 210 | sine_merge = self.l_tanh(self.l_linear(sine_wavs)) 211 | 212 | # source for noise branch, in the same shape as uv 213 | noise = torch.randn_like(uv) * self.sine_amp / 3 214 | return sine_merge, noise, uv 215 | 216 | 217 | class NSFHifiGANGenerator(torch.nn.Module): 218 | def __init__(self, h): 219 | super().__init__() 220 | self.h = h 221 | 222 | self.num_kernels = len(h["resblock_kernel_sizes"]) 223 | self.num_upsamples = len(h["upsample_rates"]) 224 | self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(h["upsample_rates"])) 225 | self.m_source = SourceModuleHnNSF( 226 | sampling_rate=h["sampling_rate"], harmonic_num=8 227 | ) 228 | self.noise_convs = nn.ModuleList() 229 | self.conv_pre = weight_norm( 230 | Conv1d(h["inter_channels"], h["upsample_initial_channel"], 7, 1, padding=3) 231 | ) 232 | resblock = ResBlock1 if h["resblock"] == "1" else ResBlock2 233 | self.ups = nn.ModuleList() 234 | for i, (u, k) in enumerate( 235 | zip(h["upsample_rates"], h["upsample_kernel_sizes"]) 236 | ): 237 | c_cur = h["upsample_initial_channel"] // (2 ** (i + 1)) 238 | self.ups.append( 239 | weight_norm( 240 | ConvTranspose1d( 241 | h["upsample_initial_channel"] // (2**i), 242 | h["upsample_initial_channel"] // (2 ** (i + 1)), 243 | k, 244 | u, 245 | padding=(k - u) // 2, 246 | ) 247 | ) 248 | ) 249 | if i + 1 < len(h["upsample_rates"]): # 250 | stride_f0 = np.prod(h["upsample_rates"][i + 1 :]) 251 | self.noise_convs.append( 252 | Conv1d( 253 | 1, 254 | c_cur, 255 | kernel_size=stride_f0 * 2, 256 | 
stride=stride_f0, 257 | padding=stride_f0 // 2, 258 | ) 259 | ) 260 | else: 261 | self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1)) 262 | self.resblocks = nn.ModuleList() 263 | for i in range(len(self.ups)): 264 | ch = h["upsample_initial_channel"] // (2 ** (i + 1)) 265 | for j, (k, d) in enumerate( 266 | zip(h["resblock_kernel_sizes"], h["resblock_dilation_sizes"]) 267 | ): 268 | self.resblocks.append(resblock(ch, k, d)) 269 | 270 | self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3)) 271 | self.ups.apply(init_weights) 272 | self.conv_post.apply(init_weights) 273 | self.cond = nn.Conv1d(h["gin_channels"], h["upsample_initial_channel"], 1) 274 | 275 | def forward(self, x, f0, g=None): 276 | # LOG.info(1,x.shape,f0.shape,f0[:, None].shape) 277 | f0 = self.f0_upsamp(f0[:, None]).transpose(1, 2) # bs,n,t 278 | # LOG.info(2,f0.shape) 279 | har_source, noi_source, uv = self.m_source(f0) 280 | har_source = har_source.transpose(1, 2) 281 | x = self.conv_pre(x) 282 | x = x + self.cond(g) 283 | # LOG.info(124,x.shape,har_source.shape) 284 | for i in range(self.num_upsamples): 285 | x = F.leaky_relu(x, LRELU_SLOPE) 286 | # LOG.info(3,x.shape) 287 | x = self.ups[i](x) 288 | x_source = self.noise_convs[i](har_source) 289 | # LOG.info(4,x_source.shape,har_source.shape,x.shape) 290 | x = x + x_source 291 | xs = None 292 | for j in range(self.num_kernels): 293 | if xs is None: 294 | xs = self.resblocks[i * self.num_kernels + j](x) 295 | else: 296 | xs += self.resblocks[i * self.num_kernels + j](x) 297 | x = xs / self.num_kernels 298 | x = F.leaky_relu(x) 299 | x = self.conv_post(x) 300 | x = torch.tanh(x) 301 | 302 | return x 303 | 304 | def remove_weight_norm(self): 305 | LOG.info("Removing weight norm...") 306 | for l in self.ups: 307 | remove_weight_norm(l) 308 | for l in self.resblocks: 309 | l.remove_weight_norm() 310 | remove_weight_norm(self.conv_pre) 311 | remove_weight_norm(self.conv_post) 312 | -------------------------------------------------------------------------------- /src/so_vits_svc_fork/modules/decoders/hifigan/_utils.py: -------------------------------------------------------------------------------- 1 | from logging import getLogger 2 | 3 | # matplotlib.use("Agg") 4 | 5 | LOG = getLogger(__name__) 6 | 7 | 8 | def init_weights(m, mean=0.0, std=0.01): 9 | classname = m.__class__.__name__ 10 | if classname.find("Conv") != -1: 11 | m.weight.data.normal_(mean, std) 12 | 13 | 14 | def get_padding(kernel_size, dilation=1): 15 | return int((kernel_size * dilation - dilation) / 2) 16 | -------------------------------------------------------------------------------- /src/so_vits_svc_fork/modules/decoders/mb_istft/__init__.py: -------------------------------------------------------------------------------- 1 | from ._generators import ( 2 | Multiband_iSTFT_Generator, 3 | Multistream_iSTFT_Generator, 4 | iSTFT_Generator, 5 | ) 6 | from ._loss import subband_stft_loss 7 | from ._pqmf import PQMF 8 | 9 | __all__ = [ 10 | "subband_stft_loss", 11 | "PQMF", 12 | "iSTFT_Generator", 13 | "Multiband_iSTFT_Generator", 14 | "Multistream_iSTFT_Generator", 15 | ] 16 | -------------------------------------------------------------------------------- /src/so_vits_svc_fork/modules/decoders/mb_istft/_loss.py: -------------------------------------------------------------------------------- 1 | from ._stft_loss import MultiResolutionSTFTLoss 2 | 3 | 4 | def subband_stft_loss(h, y_mb, y_hat_mb): 5 | sub_stft_loss = MultiResolutionSTFTLoss( 6 | h.train.fft_sizes, h.train.hop_sizes, h.train.win_lengths 
7 | ) 8 | y_mb = y_mb.view(-1, y_mb.size(2)) 9 | y_hat_mb = y_hat_mb.view(-1, y_hat_mb.size(2)) 10 | sub_sc_loss, sub_mag_loss = sub_stft_loss(y_hat_mb[:, : y_mb.size(-1)], y_mb) 11 | return sub_sc_loss + sub_mag_loss 12 | -------------------------------------------------------------------------------- /src/so_vits_svc_fork/modules/decoders/mb_istft/_pqmf.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Tomoki Hayashi 2 | # MIT License (https://opensource.org/licenses/MIT) 3 | 4 | """Pseudo QMF modules.""" 5 | 6 | import numpy as np 7 | import torch 8 | import torch.nn.functional as F 9 | from scipy.signal.windows import kaiser 10 | 11 | 12 | def design_prototype_filter(taps=62, cutoff_ratio=0.15, beta=9.0): 13 | """Design prototype filter for PQMF. 14 | This method is based on `A Kaiser window approach for the design of prototype 15 | filters of cosine modulated filterbanks`_. 16 | Args: 17 | taps (int): The number of filter taps. 18 | cutoff_ratio (float): Cut-off frequency ratio. 19 | beta (float): Beta coefficient for kaiser window. 20 | Returns: 21 | ndarray: Impulse response of prototype filter (taps + 1,). 22 | .. _`A Kaiser window approach for the design of prototype filters of cosine modulated filterbanks`: 23 | https://ieeexplore.ieee.org/abstract/document/681427 24 | """ 25 | # check the arguments are valid 26 | assert taps % 2 == 0, "The number of taps must be an even number." 27 | assert 0.0 < cutoff_ratio < 1.0, "Cutoff ratio must be > 0.0 and < 1.0." 28 | 29 | # make initial filter 30 | omega_c = np.pi * cutoff_ratio 31 | with np.errstate(invalid="ignore"): 32 | h_i = np.sin(omega_c * (np.arange(taps + 1) - 0.5 * taps)) / ( 33 | np.pi * (np.arange(taps + 1) - 0.5 * taps) 34 | ) 35 | h_i[taps // 2] = np.cos(0) * cutoff_ratio # fix nan due to indeterminate form 36 | 37 | # apply kaiser window 38 | w = kaiser(taps + 1, beta) 39 | h = h_i * w 40 | 41 | return h 42 | 43 | 44 | class PQMF(torch.nn.Module): 45 | """PQMF module. 46 | This module is based on `Near-perfect-reconstruction pseudo-QMF banks`_. 47 | .. _`Near-perfect-reconstruction pseudo-QMF banks`: 48 | https://ieeexplore.ieee.org/document/258122 49 | """ 50 | 51 | def __init__(self, device, subbands=8, taps=62, cutoff_ratio=0.15, beta=9.0): 52 | """Initialize PQMF module. 53 | Args: 54 | subbands (int): The number of subbands. 55 | taps (int): The number of filter taps. 56 | cutoff_ratio (float): Cut-off frequency ratio. 57 | beta (float): Beta coefficient for kaiser window.
58 | """ 59 | super().__init__() 60 | 61 | # define filter coefficient 62 | h_proto = design_prototype_filter(taps, cutoff_ratio, beta) 63 | h_analysis = np.zeros((subbands, len(h_proto))) 64 | h_synthesis = np.zeros((subbands, len(h_proto))) 65 | for k in range(subbands): 66 | h_analysis[k] = ( 67 | 2 68 | * h_proto 69 | * np.cos( 70 | (2 * k + 1) 71 | * (np.pi / (2 * subbands)) 72 | * (np.arange(taps + 1) - ((taps - 1) / 2)) 73 | + (-1) ** k * np.pi / 4 74 | ) 75 | ) 76 | h_synthesis[k] = ( 77 | 2 78 | * h_proto 79 | * np.cos( 80 | (2 * k + 1) 81 | * (np.pi / (2 * subbands)) 82 | * (np.arange(taps + 1) - ((taps - 1) / 2)) 83 | - (-1) ** k * np.pi / 4 84 | ) 85 | ) 86 | 87 | # convert to tensor 88 | analysis_filter = torch.from_numpy(h_analysis).float().unsqueeze(1).to(device) 89 | synthesis_filter = torch.from_numpy(h_synthesis).float().unsqueeze(0).to(device) 90 | 91 | # register coefficients as buffer 92 | self.register_buffer("analysis_filter", analysis_filter) 93 | self.register_buffer("synthesis_filter", synthesis_filter) 94 | 95 | # filter for downsampling & upsampling 96 | updown_filter = torch.zeros((subbands, subbands, subbands)).float().to(device) 97 | for k in range(subbands): 98 | updown_filter[k, k, 0] = 1.0 99 | self.register_buffer("updown_filter", updown_filter) 100 | self.subbands = subbands 101 | 102 | # keep padding info 103 | self.pad_fn = torch.nn.ConstantPad1d(taps // 2, 0.0) 104 | 105 | def analysis(self, x): 106 | """Analysis with PQMF. 107 | Args: 108 | x (Tensor): Input tensor (B, 1, T). 109 | Returns: 110 | Tensor: Output tensor (B, subbands, T // subbands). 111 | """ 112 | x = F.conv1d(self.pad_fn(x), self.analysis_filter) 113 | return F.conv1d(x, self.updown_filter, stride=self.subbands) 114 | 115 | def synthesis(self, x): 116 | """Synthesis with PQMF. 117 | Args: 118 | x (Tensor): Input tensor (B, subbands, T // subbands). 119 | Returns: 120 | Tensor: Output tensor (B, 1, T). 121 | """ 122 | # NOTE(kan-bayashi): Power will be decreased so here multiply by # subbands. 123 | # Not sure this is the correct way, it is better to check again. 124 | # TODO(kan-bayashi): Understand the reconstruction procedure 125 | x = F.conv_transpose1d( 126 | x, self.updown_filter * self.subbands, stride=self.subbands 127 | ) 128 | return F.conv1d(self.pad_fn(x), self.synthesis_filter) 129 | -------------------------------------------------------------------------------- /src/so_vits_svc_fork/modules/decoders/mb_istft/_stft.py: -------------------------------------------------------------------------------- 1 | """ 2 | BSD 3-Clause License 3 | Copyright (c) 2017, Prem Seetharaman 4 | All rights reserved. 5 | * Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | * Redistributions of source code must retain the above copyright notice, 8 | this list of conditions and the following disclaimer. 9 | * Redistributions in binary form must reproduce the above copyright notice, this 10 | list of conditions and the following disclaimer in the 11 | documentation and/or other materials provided with the distribution. 12 | * Neither the name of the copyright holder nor the names of its 13 | contributors may be used to endorse or promote products derived from this 14 | software without specific prior written permission.
15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 19 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 22 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | """ 26 | 27 | import librosa.util as librosa_util 28 | import numpy as np 29 | import torch 30 | import torch.nn.functional as F 31 | from librosa.util import pad_center, tiny 32 | from scipy.signal import get_window 33 | from torch.autograd import Variable 34 | 35 | 36 | def window_sumsquare( 37 | window, 38 | n_frames, 39 | hop_length=200, 40 | win_length=800, 41 | n_fft=800, 42 | dtype=np.float32, 43 | norm=None, 44 | ): 45 | """ 46 | # from librosa 0.6 47 | Compute the sum-square envelope of a window function at a given hop length. 48 | This is used to estimate modulation effects induced by windowing 49 | observations in short-time fourier transforms. 50 | Parameters 51 | ---------- 52 | window : string, tuple, number, callable, or list-like 53 | Window specification, as in `get_window` 54 | n_frames : int > 0 55 | The number of analysis frames 56 | hop_length : int > 0 57 | The number of samples to advance between frames 58 | win_length : [optional] 59 | The length of the window function. By default, this matches `n_fft`. 60 | n_fft : int > 0 61 | The length of each analysis frame. 
62 | dtype : np.dtype 63 | The data type of the output 64 | Returns 65 | ------- 66 | wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))` 67 | The sum-squared envelope of the window function 68 | """ 69 | if win_length is None: 70 | win_length = n_fft 71 | 72 | n = n_fft + hop_length * (n_frames - 1) 73 | x = np.zeros(n, dtype=dtype) 74 | 75 | # Compute the squared window at the desired length 76 | win_sq = get_window(window, win_length, fftbins=True) 77 | win_sq = librosa_util.normalize(win_sq, norm=norm) ** 2 78 | win_sq = librosa_util.pad_center(win_sq, n_fft) 79 | 80 | # Fill the envelope 81 | for i in range(n_frames): 82 | sample = i * hop_length 83 | x[sample : min(n, sample + n_fft)] += win_sq[: max(0, min(n_fft, n - sample))] 84 | return x 85 | 86 | 87 | class STFT(torch.nn.Module): 88 | """adapted from Prem Seetharaman's https://github.com/pseeth/pytorch-stft""" 89 | 90 | def __init__( 91 | self, filter_length=800, hop_length=200, win_length=800, window="hann" 92 | ): 93 | super().__init__() 94 | self.filter_length = filter_length 95 | self.hop_length = hop_length 96 | self.win_length = win_length 97 | self.window = window 98 | self.forward_transform = None 99 | scale = self.filter_length / self.hop_length 100 | fourier_basis = np.fft.fft(np.eye(self.filter_length)) 101 | 102 | cutoff = int(self.filter_length / 2 + 1) 103 | fourier_basis = np.vstack( 104 | [np.real(fourier_basis[:cutoff, :]), np.imag(fourier_basis[:cutoff, :])] 105 | ) 106 | 107 | forward_basis = torch.FloatTensor(fourier_basis[:, None, :]) 108 | inverse_basis = torch.FloatTensor( 109 | np.linalg.pinv(scale * fourier_basis).T[:, None, :] 110 | ) 111 | 112 | if window is not None: 113 | assert filter_length >= win_length 114 | # get window and zero center pad it to filter_length 115 | fft_window = get_window(window, win_length, fftbins=True) 116 | fft_window = pad_center(fft_window, filter_length) 117 | fft_window = torch.from_numpy(fft_window).float() 118 | 119 | # window the bases 120 | forward_basis *= fft_window 121 | inverse_basis *= fft_window 122 | 123 | self.register_buffer("forward_basis", forward_basis.float()) 124 | self.register_buffer("inverse_basis", inverse_basis.float()) 125 | 126 | def transform(self, input_data): 127 | num_batches = input_data.size(0) 128 | num_samples = input_data.size(1) 129 | 130 | self.num_samples = num_samples 131 | 132 | # similar to librosa, reflect-pad the input 133 | input_data = input_data.view(num_batches, 1, num_samples) 134 | input_data = F.pad( 135 | input_data.unsqueeze(1), 136 | (int(self.filter_length / 2), int(self.filter_length / 2), 0, 0), 137 | mode="reflect", 138 | ) 139 | input_data = input_data.squeeze(1) 140 | 141 | forward_transform = F.conv1d( 142 | input_data, 143 | Variable(self.forward_basis, requires_grad=False), 144 | stride=self.hop_length, 145 | padding=0, 146 | ) 147 | 148 | cutoff = int((self.filter_length / 2) + 1) 149 | real_part = forward_transform[:, :cutoff, :] 150 | imag_part = forward_transform[:, cutoff:, :] 151 | 152 | magnitude = torch.sqrt(real_part**2 + imag_part**2) 153 | phase = torch.autograd.Variable(torch.atan2(imag_part.data, real_part.data)) 154 | 155 | return magnitude, phase 156 | 157 | def inverse(self, magnitude, phase): 158 | recombine_magnitude_phase = torch.cat( 159 | [magnitude * torch.cos(phase), magnitude * torch.sin(phase)], dim=1 160 | ) 161 | 162 | inverse_transform = F.conv_transpose1d( 163 | recombine_magnitude_phase, 164 | Variable(self.inverse_basis, requires_grad=False), 165 | 
stride=self.hop_length, 166 | padding=0, 167 | ) 168 | 169 | if self.window is not None: 170 | window_sum = window_sumsquare( 171 | self.window, 172 | magnitude.size(-1), 173 | hop_length=self.hop_length, 174 | win_length=self.win_length, 175 | n_fft=self.filter_length, 176 | dtype=np.float32, 177 | ) 178 | # remove modulation effects 179 | approx_nonzero_indices = torch.from_numpy( 180 | np.where(window_sum > tiny(window_sum))[0] 181 | ) 182 | window_sum = torch.autograd.Variable( 183 | torch.from_numpy(window_sum), requires_grad=False 184 | ) 185 | window_sum = window_sum.to(inverse_transform.device)  # Tensor.device is a property, not a method 186 | inverse_transform[:, :, approx_nonzero_indices] /= window_sum[ 187 | approx_nonzero_indices 188 | ] 189 | 190 | # scale by hop ratio 191 | inverse_transform *= float(self.filter_length) / self.hop_length 192 | 193 | inverse_transform = inverse_transform[:, :, int(self.filter_length / 2) :] 194 | inverse_transform = inverse_transform[:, :, : -int(self.filter_length / 2) :] 195 | 196 | return inverse_transform 197 | 198 | def forward(self, input_data): 199 | self.magnitude, self.phase = self.transform(input_data) 200 | reconstruction = self.inverse(self.magnitude, self.phase) 201 | return reconstruction 202 | 203 | 204 | class TorchSTFT(torch.nn.Module): 205 | def __init__( 206 | self, filter_length=800, hop_length=200, win_length=800, window="hann" 207 | ): 208 | super().__init__() 209 | self.filter_length = filter_length 210 | self.hop_length = hop_length 211 | self.win_length = win_length 212 | self.window = torch.from_numpy( 213 | get_window(window, win_length, fftbins=True).astype(np.float32) 214 | ) 215 | 216 | def transform(self, input_data): 217 | forward_transform = torch.stft( 218 | input_data, 219 | self.filter_length, 220 | self.hop_length, 221 | self.win_length, 222 | window=self.window, 223 | return_complex=True, 224 | ) 225 | 226 | return torch.abs(forward_transform), torch.angle(forward_transform) 227 | 228 | def inverse(self, magnitude, phase): 229 | inverse_transform = torch.istft( 230 | magnitude * torch.exp(phase * 1j), 231 | self.filter_length, 232 | self.hop_length, 233 | self.win_length, 234 | window=self.window.to(magnitude.device), 235 | ) 236 | 237 | return inverse_transform.unsqueeze( 238 | -2 239 | ) # unsqueeze to stay consistent with conv_transpose1d implementation 240 | 241 | def forward(self, input_data): 242 | self.magnitude, self.phase = self.transform(input_data) 243 | reconstruction = self.inverse(self.magnitude, self.phase) 244 | return reconstruction 245 | -------------------------------------------------------------------------------- /src/so_vits_svc_fork/modules/decoders/mb_istft/_stft_loss.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Tomoki Hayashi 2 | # MIT License (https://opensource.org/licenses/MIT) 3 | 4 | """STFT-based Loss modules.""" 5 | 6 | import torch 7 | import torch.nn.functional as F 8 | 9 | 10 | def stft(x, fft_size, hop_size, win_length, window): 11 | """Perform STFT and convert to magnitude spectrogram. 12 | Args: 13 | x (Tensor): Input signal tensor (B, T). 14 | fft_size (int): FFT size. 15 | hop_size (int): Hop size. 16 | win_length (int): Window length. 17 | window (Tensor): Window tensor. 18 | Returns: 19 | Tensor: Magnitude spectrogram (B, #frames, fft_size // 2 + 1).
20 | """ 21 | x_stft = torch.stft( 22 | x, fft_size, hop_size, win_length, window.to(x.device), return_complex=False 23 | ) 24 | real = x_stft[..., 0] 25 | imag = x_stft[..., 1] 26 | 27 | # NOTE(kan-bayashi): clamp is needed to avoid nan or inf 28 | return torch.sqrt(torch.clamp(real**2 + imag**2, min=1e-7)).transpose(2, 1) 29 | 30 | 31 | class SpectralConvergengeLoss(torch.nn.Module): 32 | """Spectral convergence loss module.""" 33 | 34 | def __init__(self): 35 | """Initialize spectral convergence loss module.""" 36 | super().__init__() 37 | 38 | def forward(self, x_mag, y_mag): 39 | """Calculate forward propagation. 40 | Args: 41 | x_mag (Tensor): Magnitude spectrogram of predicted signal (B, #frames, #freq_bins). 42 | y_mag (Tensor): Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins). 43 | Returns: 44 | Tensor: Spectral convergence loss value. 45 | """ 46 | return torch.norm(y_mag - x_mag) / torch.norm( 47 | y_mag 48 | ) # MB-iSTFT-VITS changed here due to codespell 49 | 50 | 51 | class LogSTFTMagnitudeLoss(torch.nn.Module): 52 | """Log STFT magnitude loss module.""" 53 | 54 | def __init__(self): 55 | """Initialize log STFT magnitude loss module.""" 56 | super().__init__() 57 | 58 | def forward(self, x_mag, y_mag): 59 | """Calculate forward propagation. 60 | Args: 61 | x_mag (Tensor): Magnitude spectrogram of predicted signal (B, #frames, #freq_bins). 62 | y_mag (Tensor): Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins). 63 | Returns: 64 | Tensor: Log STFT magnitude loss value. 65 | """ 66 | return F.l1_loss(torch.log(y_mag), torch.log(x_mag)) 67 | 68 | 69 | class STFTLoss(torch.nn.Module): 70 | """STFT loss module.""" 71 | 72 | def __init__( 73 | self, fft_size=1024, shift_size=120, win_length=600, window="hann_window" 74 | ): 75 | """Initialize STFT loss module.""" 76 | super().__init__() 77 | self.fft_size = fft_size 78 | self.shift_size = shift_size 79 | self.win_length = win_length 80 | self.window = getattr(torch, window)(win_length) 81 | self.spectral_convergenge_loss = SpectralConvergengeLoss() 82 | self.log_stft_magnitude_loss = LogSTFTMagnitudeLoss() 83 | 84 | def forward(self, x, y): 85 | """Calculate forward propagation. 86 | Args: 87 | x (Tensor): Predicted signal (B, T). 88 | y (Tensor): Groundtruth signal (B, T). 89 | Returns: 90 | Tensor: Spectral convergence loss value. 91 | Tensor: Log STFT magnitude loss value. 92 | """ 93 | x_mag = stft(x, self.fft_size, self.shift_size, self.win_length, self.window) 94 | y_mag = stft(y, self.fft_size, self.shift_size, self.win_length, self.window) 95 | sc_loss = self.spectral_convergenge_loss(x_mag, y_mag) 96 | mag_loss = self.log_stft_magnitude_loss(x_mag, y_mag) 97 | 98 | return sc_loss, mag_loss 99 | 100 | 101 | class MultiResolutionSTFTLoss(torch.nn.Module): 102 | """Multi resolution STFT loss module.""" 103 | 104 | def __init__( 105 | self, 106 | fft_sizes=[1024, 2048, 512], 107 | hop_sizes=[120, 240, 50], 108 | win_lengths=[600, 1200, 240], 109 | window="hann_window", 110 | ): 111 | """Initialize Multi resolution STFT loss module. 112 | Args: 113 | fft_sizes (list): List of FFT sizes. 114 | hop_sizes (list): List of hop sizes. 115 | win_lengths (list): List of window lengths. 116 | window (str): Window function type.
117 | """ 118 | super().__init__() 119 | assert len(fft_sizes) == len(hop_sizes) == len(win_lengths) 120 | self.stft_losses = torch.nn.ModuleList() 121 | for fs, ss, wl in zip(fft_sizes, hop_sizes, win_lengths): 122 | self.stft_losses += [STFTLoss(fs, ss, wl, window)] 123 | 124 | def forward(self, x, y): 125 | """Calculate forward propagation. 126 | Args: 127 | x (Tensor): Predicted signal (B, T). 128 | y (Tensor): Groundtruth signal (B, T). 129 | Returns: 130 | Tensor: Multi resolution spectral convergence loss value. 131 | Tensor: Multi resolution log STFT magnitude loss value. 132 | """ 133 | sc_loss = 0.0 134 | mag_loss = 0.0 135 | for f in self.stft_losses: 136 | sc_l, mag_l = f(x, y) 137 | sc_loss += sc_l 138 | mag_loss += mag_l 139 | sc_loss /= len(self.stft_losses) 140 | mag_loss /= len(self.stft_losses) 141 | 142 | return sc_loss, mag_loss 143 | -------------------------------------------------------------------------------- /src/so_vits_svc_fork/modules/descriminators.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch.nn import AvgPool1d, Conv1d, Conv2d 4 | from torch.nn import functional as F 5 | from torch.nn.utils import spectral_norm, weight_norm 6 | 7 | from so_vits_svc_fork.modules import modules as modules 8 | from so_vits_svc_fork.modules.commons import get_padding 9 | 10 | 11 | class DiscriminatorP(torch.nn.Module): 12 | def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False): 13 | super().__init__() 14 | self.period = period 15 | self.use_spectral_norm = use_spectral_norm 16 | norm_f = weight_norm if use_spectral_norm == False else spectral_norm 17 | self.convs = nn.ModuleList( 18 | [ 19 | norm_f( 20 | Conv2d( 21 | 1, 22 | 32, 23 | (kernel_size, 1), 24 | (stride, 1), 25 | padding=(get_padding(kernel_size, 1), 0), 26 | ) 27 | ), 28 | norm_f( 29 | Conv2d( 30 | 32, 31 | 128, 32 | (kernel_size, 1), 33 | (stride, 1), 34 | padding=(get_padding(kernel_size, 1), 0), 35 | ) 36 | ), 37 | norm_f( 38 | Conv2d( 39 | 128, 40 | 512, 41 | (kernel_size, 1), 42 | (stride, 1), 43 | padding=(get_padding(kernel_size, 1), 0), 44 | ) 45 | ), 46 | norm_f( 47 | Conv2d( 48 | 512, 49 | 1024, 50 | (kernel_size, 1), 51 | (stride, 1), 52 | padding=(get_padding(kernel_size, 1), 0), 53 | ) 54 | ), 55 | norm_f( 56 | Conv2d( 57 | 1024, 58 | 1024, 59 | (kernel_size, 1), 60 | 1, 61 | padding=(get_padding(kernel_size, 1), 0), 62 | ) 63 | ), 64 | ] 65 | ) 66 | self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) 67 | 68 | def forward(self, x): 69 | fmap = [] 70 | 71 | # 1d to 2d 72 | b, c, t = x.shape 73 | if t % self.period != 0: # pad first 74 | n_pad = self.period - (t % self.period) 75 | x = F.pad(x, (0, n_pad), "reflect") 76 | t = t + n_pad 77 | x = x.view(b, c, t // self.period, self.period) 78 | 79 | for l in self.convs: 80 | x = l(x) 81 | x = F.leaky_relu(x, modules.LRELU_SLOPE) 82 | fmap.append(x) 83 | x = self.conv_post(x) 84 | fmap.append(x) 85 | x = torch.flatten(x, 1, -1) 86 | 87 | return x, fmap 88 | 89 | 90 | class DiscriminatorS(torch.nn.Module): 91 | def __init__(self, use_spectral_norm=False): 92 | super().__init__() 93 | norm_f = weight_norm if use_spectral_norm == False else spectral_norm 94 | self.convs = nn.ModuleList( 95 | [ 96 | norm_f(Conv1d(1, 16, 15, 1, padding=7)), 97 | norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)), 98 | norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)), 99 | norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)), 100 | 
norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)), 101 | norm_f(Conv1d(1024, 1024, 5, 1, padding=2)), 102 | ] 103 | ) 104 | self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1)) 105 | 106 | def forward(self, x): 107 | fmap = [] 108 | 109 | for l in self.convs: 110 | x = l(x) 111 | x = F.leaky_relu(x, modules.LRELU_SLOPE) 112 | fmap.append(x) 113 | x = self.conv_post(x) 114 | fmap.append(x) 115 | x = torch.flatten(x, 1, -1) 116 | 117 | return x, fmap 118 | 119 | 120 | class MultiPeriodDiscriminator(torch.nn.Module): 121 | def __init__(self, use_spectral_norm=False): 122 | super().__init__() 123 | periods = [2, 3, 5, 7, 11] 124 | 125 | discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)] 126 | discs = discs + [ 127 | DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods 128 | ] 129 | self.discriminators = nn.ModuleList(discs) 130 | 131 | def forward(self, y, y_hat): 132 | y_d_rs = [] 133 | y_d_gs = [] 134 | fmap_rs = [] 135 | fmap_gs = [] 136 | for i, d in enumerate(self.discriminators): 137 | y_d_r, fmap_r = d(y) 138 | y_d_g, fmap_g = d(y_hat) 139 | y_d_rs.append(y_d_r) 140 | y_d_gs.append(y_d_g) 141 | fmap_rs.append(fmap_r) 142 | fmap_gs.append(fmap_g) 143 | 144 | return y_d_rs, y_d_gs, fmap_rs, fmap_gs 145 | 146 | 147 | class MultiScaleDiscriminator(torch.nn.Module): 148 | def __init__(self): 149 | super().__init__() 150 | self.discriminators = nn.ModuleList( 151 | [ 152 | DiscriminatorS(use_spectral_norm=True), 153 | DiscriminatorS(), 154 | DiscriminatorS(), 155 | ] 156 | ) 157 | self.meanpools = nn.ModuleList( 158 | [AvgPool1d(4, 2, padding=2), AvgPool1d(4, 2, padding=2)] 159 | ) 160 | 161 | def forward(self, y, y_hat): 162 | y_d_rs = [] 163 | y_d_gs = [] 164 | fmap_rs = [] 165 | fmap_gs = [] 166 | for i, d in enumerate(self.discriminators): 167 | if i != 0: 168 | y = self.meanpools[i - 1](y) 169 | y_hat = self.meanpools[i - 1](y_hat) 170 | y_d_r, fmap_r = d(y) 171 | y_d_g, fmap_g = d(y_hat) 172 | y_d_rs.append(y_d_r) 173 | fmap_rs.append(fmap_r) 174 | y_d_gs.append(y_d_g) 175 | fmap_gs.append(fmap_g) 176 | 177 | return y_d_rs, y_d_gs, fmap_rs, fmap_gs 178 | -------------------------------------------------------------------------------- /src/so_vits_svc_fork/modules/encoders.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | from so_vits_svc_fork.modules import attentions as attentions 5 | from so_vits_svc_fork.modules import commons as commons 6 | from so_vits_svc_fork.modules import modules as modules 7 | 8 | 9 | class SpeakerEncoder(torch.nn.Module): 10 | def __init__( 11 | self, 12 | mel_n_channels=80, 13 | model_num_layers=3, 14 | model_hidden_size=256, 15 | model_embedding_size=256, 16 | ): 17 | super().__init__() 18 | self.lstm = nn.LSTM( 19 | mel_n_channels, model_hidden_size, model_num_layers, batch_first=True 20 | ) 21 | self.linear = nn.Linear(model_hidden_size, model_embedding_size) 22 | self.relu = nn.ReLU() 23 | 24 | def forward(self, mels): 25 | self.lstm.flatten_parameters() 26 | _, (hidden, _) = self.lstm(mels) 27 | embeds_raw = self.relu(self.linear(hidden[-1])) 28 | return embeds_raw / torch.norm(embeds_raw, dim=1, keepdim=True) 29 | 30 | def compute_partial_slices(self, total_frames, partial_frames, partial_hop): 31 | mel_slices = [] 32 | for i in range(0, total_frames - partial_frames, partial_hop): 33 | mel_range = torch.arange(i, i + partial_frames) 34 | mel_slices.append(mel_range) 35 | 36 | return mel_slices 37 | 38 | def 
embed_utterance(self, mel, partial_frames=128, partial_hop=64): 39 | mel_len = mel.size(1) 40 | last_mel = mel[:, -partial_frames:] 41 | 42 | if mel_len > partial_frames: 43 | mel_slices = self.compute_partial_slices( 44 | mel_len, partial_frames, partial_hop 45 | ) 46 | mels = list(mel[:, s] for s in mel_slices) 47 | mels.append(last_mel) 48 | mels = torch.stack(tuple(mels), 0).squeeze(1) 49 | 50 | with torch.no_grad(): 51 | partial_embeds = self(mels) 52 | embed = torch.mean(partial_embeds, axis=0).unsqueeze(0) 53 | # embed = embed / torch.linalg.norm(embed, 2) 54 | else: 55 | with torch.no_grad(): 56 | embed = self(last_mel) 57 | 58 | return embed 59 | 60 | 61 | class Encoder(nn.Module): 62 | def __init__( 63 | self, 64 | in_channels, 65 | out_channels, 66 | hidden_channels, 67 | kernel_size, 68 | dilation_rate, 69 | n_layers, 70 | gin_channels=0, 71 | ): 72 | super().__init__() 73 | self.in_channels = in_channels 74 | self.out_channels = out_channels 75 | self.hidden_channels = hidden_channels 76 | self.kernel_size = kernel_size 77 | self.dilation_rate = dilation_rate 78 | self.n_layers = n_layers 79 | self.gin_channels = gin_channels 80 | 81 | self.pre = nn.Conv1d(in_channels, hidden_channels, 1) 82 | self.enc = modules.WN( 83 | hidden_channels, 84 | kernel_size, 85 | dilation_rate, 86 | n_layers, 87 | gin_channels=gin_channels, 88 | ) 89 | self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) 90 | 91 | def forward(self, x, x_lengths, g=None): 92 | # print(x.shape,x_lengths.shape) 93 | x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to( 94 | x.dtype 95 | ) 96 | x = self.pre(x) * x_mask 97 | x = self.enc(x, x_mask, g=g) 98 | stats = self.proj(x) * x_mask 99 | m, logs = torch.split(stats, self.out_channels, dim=1) 100 | z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask 101 | return z, m, logs, x_mask 102 | 103 | 104 | class TextEncoder(nn.Module): 105 | def __init__( 106 | self, 107 | out_channels, 108 | hidden_channels, 109 | kernel_size, 110 | n_layers, 111 | gin_channels=0, 112 | filter_channels=None, 113 | n_heads=None, 114 | p_dropout=None, 115 | ): 116 | super().__init__() 117 | self.out_channels = out_channels 118 | self.hidden_channels = hidden_channels 119 | self.kernel_size = kernel_size 120 | self.n_layers = n_layers 121 | self.gin_channels = gin_channels 122 | self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) 123 | self.f0_emb = nn.Embedding(256, hidden_channels) 124 | 125 | self.enc_ = attentions.Encoder( 126 | hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout 127 | ) 128 | 129 | def forward(self, x, x_mask, f0=None, noice_scale=1): 130 | x = x + self.f0_emb(f0).transpose(1, 2) 131 | x = self.enc_(x * x_mask, x_mask) 132 | stats = self.proj(x) * x_mask 133 | m, logs = torch.split(stats, self.out_channels, dim=1) 134 | z = (m + torch.randn_like(m) * torch.exp(logs) * noice_scale) * x_mask 135 | 136 | return z, m, logs, x_mask 137 | -------------------------------------------------------------------------------- /src/so_vits_svc_fork/modules/flows.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | 3 | from so_vits_svc_fork.modules import modules as modules 4 | 5 | 6 | class ResidualCouplingBlock(nn.Module): 7 | def __init__( 8 | self, 9 | channels, 10 | hidden_channels, 11 | kernel_size, 12 | dilation_rate, 13 | n_layers, 14 | n_flows=4, 15 | gin_channels=0, 16 | ): 17 | super().__init__() 18 | self.channels = channels 19 | 
self.hidden_channels = hidden_channels 20 | self.kernel_size = kernel_size 21 | self.dilation_rate = dilation_rate 22 | self.n_layers = n_layers 23 | self.n_flows = n_flows 24 | self.gin_channels = gin_channels 25 | 26 | self.flows = nn.ModuleList() 27 | for i in range(n_flows): 28 | self.flows.append( 29 | modules.ResidualCouplingLayer( 30 | channels, 31 | hidden_channels, 32 | kernel_size, 33 | dilation_rate, 34 | n_layers, 35 | gin_channels=gin_channels, 36 | mean_only=True, 37 | ) 38 | ) 39 | self.flows.append(modules.Flip()) 40 | 41 | def forward(self, x, x_mask, g=None, reverse=False): 42 | if not reverse: 43 | for flow in self.flows: 44 | x, _ = flow(x, x_mask, g=g, reverse=reverse) 45 | else: 46 | for flow in reversed(self.flows): 47 | x = flow(x, x_mask, g=g, reverse=reverse) 48 | return x 49 | -------------------------------------------------------------------------------- /src/so_vits_svc_fork/modules/losses.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def feature_loss(fmap_r, fmap_g): 5 | loss = 0 6 | for dr, dg in zip(fmap_r, fmap_g): 7 | for rl, gl in zip(dr, dg): 8 | rl = rl.float().detach() 9 | gl = gl.float() 10 | loss += torch.mean(torch.abs(rl - gl)) 11 | 12 | return loss * 2 13 | 14 | 15 | def discriminator_loss(disc_real_outputs, disc_generated_outputs): 16 | loss = 0 17 | r_losses = [] 18 | g_losses = [] 19 | for dr, dg in zip(disc_real_outputs, disc_generated_outputs): 20 | dr = dr.float() 21 | dg = dg.float() 22 | r_loss = torch.mean((1 - dr) ** 2) 23 | g_loss = torch.mean(dg**2) 24 | loss += r_loss + g_loss 25 | r_losses.append(r_loss.item()) 26 | g_losses.append(g_loss.item()) 27 | 28 | return loss, r_losses, g_losses 29 | 30 | 31 | def generator_loss(disc_outputs): 32 | loss = 0 33 | gen_losses = [] 34 | for dg in disc_outputs: 35 | dg = dg.float() 36 | l = torch.mean((1 - dg) ** 2) 37 | gen_losses.append(l) 38 | loss += l 39 | 40 | return loss, gen_losses 41 | 42 | 43 | def kl_loss(z_p, logs_q, m_p, logs_p, z_mask): 44 | """ 45 | z_p, logs_q: [b, h, t_t] 46 | m_p, logs_p: [b, h, t_t] 47 | """ 48 | z_p = z_p.float() 49 | logs_q = logs_q.float() 50 | m_p = m_p.float() 51 | logs_p = logs_p.float() 52 | z_mask = z_mask.float() 53 | # print(logs_p) 54 | kl = logs_p - logs_q - 0.5 55 | kl += 0.5 * ((z_p - m_p) ** 2) * torch.exp(-2.0 * logs_p) 56 | kl = torch.sum(kl * z_mask) 57 | l = kl / torch.sum(z_mask) 58 | return l 59 | -------------------------------------------------------------------------------- /src/so_vits_svc_fork/modules/mel_processing.py: -------------------------------------------------------------------------------- 1 | """from logging import getLogger 2 | 3 | import torch 4 | import torch.utils.data 5 | import torchaudio 6 | 7 | LOG = getLogger(__name__) 8 | 9 | 10 | from ..hparams import HParams 11 | 12 | 13 | def spectrogram_torch(audio: torch.Tensor, hps: HParams) -> torch.Tensor: 14 | return torchaudio.transforms.Spectrogram( 15 | n_fft=hps.data.filter_length, 16 | win_length=hps.data.win_length, 17 | hop_length=hps.data.hop_length, 18 | power=1.0, 19 | window_fn=torch.hann_window, 20 | normalized=False, 21 | ).to(audio.device)(audio) 22 | 23 | 24 | def spec_to_mel_torch(spec: torch.Tensor, hps: HParams) -> torch.Tensor: 25 | return torchaudio.transforms.MelScale( 26 | n_mels=hps.data.n_mel_channels, 27 | sample_rate=hps.data.sampling_rate, 28 | f_min=hps.data.mel_fmin, 29 | f_max=hps.data.mel_fmax, 30 | ).to(spec.device)(spec) 31 | 32 | 33 | def mel_spectrogram_torch(audio: 
torch.Tensor, hps: HParams) -> torch.Tensor: 34 | return torchaudio.transforms.MelSpectrogram( 35 | sample_rate=hps.data.sampling_rate, 36 | n_fft=hps.data.filter_length, 37 | n_mels=hps.data.n_mel_channels, 38 | win_length=hps.data.win_length, 39 | hop_length=hps.data.hop_length, 40 | f_min=hps.data.mel_fmin, 41 | f_max=hps.data.mel_fmax, 42 | power=1.0, 43 | window_fn=torch.hann_window, 44 | normalized=False, 45 | ).to(audio.device)(audio)""" 46 | 47 | from logging import getLogger 48 | 49 | import torch 50 | import torch.utils.data 51 | from librosa.filters import mel as librosa_mel_fn 52 | 53 | LOG = getLogger(__name__) 54 | 55 | MAX_WAV_VALUE = 32768.0 56 | 57 | 58 | def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): 59 | """ 60 | PARAMS 61 | ------ 62 | C: compression factor 63 | """ 64 | return torch.log(torch.clamp(x, min=clip_val) * C) 65 | 66 | 67 | def dynamic_range_decompression_torch(x, C=1): 68 | """ 69 | PARAMS 70 | ------ 71 | C: compression factor used to compress 72 | """ 73 | return torch.exp(x) / C 74 | 75 | 76 | def spectral_normalize_torch(magnitudes): 77 | output = dynamic_range_compression_torch(magnitudes) 78 | return output 79 | 80 | 81 | def spectral_de_normalize_torch(magnitudes): 82 | output = dynamic_range_decompression_torch(magnitudes) 83 | return output 84 | 85 | 86 | mel_basis = {} 87 | hann_window = {} 88 | 89 | 90 | def spectrogram_torch(y, hps, center=False): 91 | if torch.min(y) < -1.0: 92 | LOG.info(f"min value is {torch.min(y)}")  # f-string so the value is actually logged 93 | if torch.max(y) > 1.0: 94 | LOG.info(f"max value is {torch.max(y)}") 95 | n_fft = hps.data.filter_length 96 | hop_size = hps.data.hop_length 97 | win_size = hps.data.win_length 98 | global hann_window 99 | dtype_device = str(y.dtype) + "_" + str(y.device) 100 | wnsize_dtype_device = str(win_size) + "_" + dtype_device 101 | if wnsize_dtype_device not in hann_window: 102 | hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to( 103 | dtype=y.dtype, device=y.device 104 | ) 105 | 106 | y = torch.nn.functional.pad( 107 | y.unsqueeze(1), 108 | (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), 109 | mode="reflect", 110 | ) 111 | y = y.squeeze(1) 112 | 113 | spec = torch.stft( 114 | y, 115 | n_fft, 116 | hop_length=hop_size, 117 | win_length=win_size, 118 | window=hann_window[wnsize_dtype_device], 119 | center=center, 120 | pad_mode="reflect", 121 | normalized=False, 122 | onesided=True, 123 | return_complex=False, 124 | ) 125 | 126 | spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) 127 | return spec 128 | 129 | 130 | def spec_to_mel_torch(spec, hps): 131 | sampling_rate = hps.data.sampling_rate 132 | n_fft = hps.data.filter_length 133 | num_mels = hps.data.n_mel_channels 134 | fmin = hps.data.mel_fmin 135 | fmax = hps.data.mel_fmax 136 | global mel_basis 137 | dtype_device = str(spec.dtype) + "_" + str(spec.device) 138 | fmax_dtype_device = str(fmax) + "_" + dtype_device 139 | if fmax_dtype_device not in mel_basis: 140 | mel = librosa_mel_fn( 141 | sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax 142 | ) 143 | mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to( 144 | dtype=spec.dtype, device=spec.device 145 | ) 146 | spec = torch.matmul(mel_basis[fmax_dtype_device], spec) 147 | spec = spectral_normalize_torch(spec) 148 | return spec 149 | 150 | 151 | def mel_spectrogram_torch(y, hps, center=False): 152 | sampling_rate = hps.data.sampling_rate 153 | n_fft = hps.data.filter_length 154 | num_mels = hps.data.n_mel_channels 155 | fmin = hps.data.mel_fmin 156 | fmax =
hps.data.mel_fmax 157 | hop_size = hps.data.hop_length 158 | win_size = hps.data.win_length 159 | if torch.min(y) < -1.0: 160 | LOG.info(f"min value is {torch.min(y)}") 161 | if torch.max(y) > 1.0: 162 | LOG.info(f"max value is {torch.max(y)}") 163 | 164 | global mel_basis, hann_window 165 | dtype_device = str(y.dtype) + "_" + str(y.device) 166 | fmax_dtype_device = str(fmax) + "_" + dtype_device 167 | wnsize_dtype_device = str(win_size) + "_" + dtype_device 168 | if fmax_dtype_device not in mel_basis: 169 | mel = librosa_mel_fn( 170 | sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax 171 | ) 172 | mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to( 173 | dtype=y.dtype, device=y.device 174 | ) 175 | if wnsize_dtype_device not in hann_window: 176 | hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to( 177 | dtype=y.dtype, device=y.device 178 | ) 179 | 180 | y = torch.nn.functional.pad( 181 | y.unsqueeze(1), 182 | (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), 183 | mode="reflect", 184 | ) 185 | y = y.squeeze(1) 186 | 187 | spec = torch.stft( 188 | y, 189 | n_fft, 190 | hop_length=hop_size, 191 | win_length=win_size, 192 | window=hann_window[wnsize_dtype_device], 193 | center=center, 194 | pad_mode="reflect", 195 | normalized=False, 196 | onesided=True, 197 | return_complex=False, 198 | ) 199 | 200 | spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) 201 | 202 | spec = torch.matmul(mel_basis[fmax_dtype_device], spec) 203 | spec = spectral_normalize_torch(spec) 204 | 205 | return spec 206 | -------------------------------------------------------------------------------- /src/so_vits_svc_fork/modules/synthesizers.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | from logging import getLogger 3 | from typing import Any, Literal, Sequence 4 | 5 | import torch 6 | from torch import nn 7 | 8 | import so_vits_svc_fork.f0 9 | from so_vits_svc_fork.f0 import f0_to_coarse 10 | from so_vits_svc_fork.modules import commons as commons 11 | from so_vits_svc_fork.modules.decoders.f0 import F0Decoder 12 | from so_vits_svc_fork.modules.decoders.hifigan import NSFHifiGANGenerator 13 | from so_vits_svc_fork.modules.decoders.mb_istft import ( 14 | Multiband_iSTFT_Generator, 15 | Multistream_iSTFT_Generator, 16 | iSTFT_Generator, 17 | ) 18 | from so_vits_svc_fork.modules.encoders import Encoder, TextEncoder 19 | from so_vits_svc_fork.modules.flows import ResidualCouplingBlock 20 | 21 | LOG = getLogger(__name__) 22 | 23 | 24 | class SynthesizerTrn(nn.Module): 25 | """ 26 | Synthesizer for Training 27 | """ 28 | 29 | def __init__( 30 | self, 31 | spec_channels: int, 32 | segment_size: int, 33 | inter_channels: int, 34 | hidden_channels: int, 35 | filter_channels: int, 36 | n_heads: int, 37 | n_layers: int, 38 | kernel_size: int, 39 | p_dropout: int, 40 | resblock: str, 41 | resblock_kernel_sizes: Sequence[int], 42 | resblock_dilation_sizes: Sequence[Sequence[int]], 43 | upsample_rates: Sequence[int], 44 | upsample_initial_channel: int, 45 | upsample_kernel_sizes: Sequence[int], 46 | gin_channels: int, 47 | ssl_dim: int, 48 | n_speakers: int, 49 | sampling_rate: int = 44100, 50 | type_: Literal["hifi-gan", "istft", "ms-istft", "mb-istft"] = "hifi-gan", 51 | gen_istft_n_fft: int = 16, 52 | gen_istft_hop_size: int = 4, 53 | subbands: int = 4, 54 | **kwargs: Any, 55 | ): 56 | super().__init__() 57 | self.spec_channels = spec_channels 58 | self.inter_channels = inter_channels 59 | self.hidden_channels = 
hidden_channels 60 | self.filter_channels = filter_channels 61 | self.n_heads = n_heads 62 | self.n_layers = n_layers 63 | self.kernel_size = kernel_size 64 | self.p_dropout = p_dropout 65 | self.resblock = resblock 66 | self.resblock_kernel_sizes = resblock_kernel_sizes 67 | self.resblock_dilation_sizes = resblock_dilation_sizes 68 | self.upsample_rates = upsample_rates 69 | self.upsample_initial_channel = upsample_initial_channel 70 | self.upsample_kernel_sizes = upsample_kernel_sizes 71 | self.segment_size = segment_size 72 | self.gin_channels = gin_channels 73 | self.ssl_dim = ssl_dim 74 | self.n_speakers = n_speakers 75 | self.sampling_rate = sampling_rate 76 | self.type_ = type_ 77 | self.gen_istft_n_fft = gen_istft_n_fft 78 | self.gen_istft_hop_size = gen_istft_hop_size 79 | self.subbands = subbands 80 | if kwargs: 81 | warnings.warn(f"Unused arguments: {kwargs}") 82 | 83 | self.emb_g = nn.Embedding(n_speakers, gin_channels) 84 | 85 | if ssl_dim is None: 86 | self.pre = nn.LazyConv1d(hidden_channels, kernel_size=5, padding=2) 87 | else: 88 | self.pre = nn.Conv1d(ssl_dim, hidden_channels, kernel_size=5, padding=2) 89 | 90 | self.enc_p = TextEncoder( 91 | inter_channels, 92 | hidden_channels, 93 | filter_channels=filter_channels, 94 | n_heads=n_heads, 95 | n_layers=n_layers, 96 | kernel_size=kernel_size, 97 | p_dropout=p_dropout, 98 | ) 99 | 100 | LOG.info(f"Decoder type: {type_}") 101 | if type_ == "hifi-gan": 102 | hps = { 103 | "sampling_rate": sampling_rate, 104 | "inter_channels": inter_channels, 105 | "resblock": resblock, 106 | "resblock_kernel_sizes": resblock_kernel_sizes, 107 | "resblock_dilation_sizes": resblock_dilation_sizes, 108 | "upsample_rates": upsample_rates, 109 | "upsample_initial_channel": upsample_initial_channel, 110 | "upsample_kernel_sizes": upsample_kernel_sizes, 111 | "gin_channels": gin_channels, 112 | } 113 | self.dec = NSFHifiGANGenerator(h=hps) 114 | self.mb = False 115 | else: 116 | hps = { 117 | "initial_channel": inter_channels, 118 | "resblock": resblock, 119 | "resblock_kernel_sizes": resblock_kernel_sizes, 120 | "resblock_dilation_sizes": resblock_dilation_sizes, 121 | "upsample_rates": upsample_rates, 122 | "upsample_initial_channel": upsample_initial_channel, 123 | "upsample_kernel_sizes": upsample_kernel_sizes, 124 | "gin_channels": gin_channels, 125 | "gen_istft_n_fft": gen_istft_n_fft, 126 | "gen_istft_hop_size": gen_istft_hop_size, 127 | "subbands": subbands, 128 | } 129 | 130 | # gen_istft_n_fft, gen_istft_hop_size, subbands 131 | if type_ == "istft": 132 | del hps["subbands"] 133 | self.dec = iSTFT_Generator(**hps) 134 | elif type_ == "ms-istft": 135 | self.dec = Multistream_iSTFT_Generator(**hps) 136 | elif type_ == "mb-istft": 137 | self.dec = Multiband_iSTFT_Generator(**hps) 138 | else: 139 | raise ValueError(f"Unknown type: {type_}") 140 | self.mb = True 141 | 142 | self.enc_q = Encoder( 143 | spec_channels, 144 | inter_channels, 145 | hidden_channels, 146 | 5, 147 | 1, 148 | 16, 149 | gin_channels=gin_channels, 150 | ) 151 | self.flow = ResidualCouplingBlock( 152 | inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels 153 | ) 154 | self.f0_decoder = F0Decoder( 155 | 1, 156 | hidden_channels, 157 | filter_channels, 158 | n_heads, 159 | n_layers, 160 | kernel_size, 161 | p_dropout, 162 | spk_channels=gin_channels, 163 | ) 164 | self.emb_uv = nn.Embedding(2, hidden_channels) 165 | 166 | def forward(self, c, f0, uv, spec, g=None, c_lengths=None, spec_lengths=None): 167 | g = self.emb_g(g).transpose(1, 2) 168 | # ssl prenet 
169 | x_mask = torch.unsqueeze(commons.sequence_mask(c_lengths, c.size(2)), 1).to( 170 | c.dtype 171 | ) 172 | x = self.pre(c) * x_mask + self.emb_uv(uv.long()).transpose(1, 2) 173 | 174 | # f0 predict 175 | lf0 = 2595.0 * torch.log10(1.0 + f0.unsqueeze(1) / 700.0) / 500 176 | norm_lf0 = so_vits_svc_fork.f0.normalize_f0(lf0, x_mask, uv) 177 | pred_lf0 = self.f0_decoder(x, norm_lf0, x_mask, spk_emb=g) 178 | 179 | # encoder 180 | z_ptemp, m_p, logs_p, _ = self.enc_p(x, x_mask, f0=f0_to_coarse(f0)) 181 | z, m_q, logs_q, spec_mask = self.enc_q(spec, spec_lengths, g=g) 182 | 183 | # flow 184 | z_p = self.flow(z, spec_mask, g=g) 185 | z_slice, pitch_slice, ids_slice = commons.rand_slice_segments_with_pitch( 186 | z, f0, spec_lengths, self.segment_size 187 | ) 188 | 189 | # MB-iSTFT-VITS 190 | if self.mb: 191 | o, o_mb = self.dec(z_slice, g=g) 192 | # HiFi-GAN 193 | else: 194 | o = self.dec(z_slice, g=g, f0=pitch_slice) 195 | o_mb = None 196 | return ( 197 | o, 198 | o_mb, 199 | ids_slice, 200 | spec_mask, 201 | (z, z_p, m_p, logs_p, m_q, logs_q), 202 | pred_lf0, 203 | norm_lf0, 204 | lf0, 205 | ) 206 | 207 | def infer(self, c, f0, uv, g=None, noice_scale=0.35, predict_f0=False): 208 | c_lengths = (torch.ones(c.size(0)) * c.size(-1)).to(c.device) 209 | g = self.emb_g(g).transpose(1, 2) 210 | x_mask = torch.unsqueeze(commons.sequence_mask(c_lengths, c.size(2)), 1).to( 211 | c.dtype 212 | ) 213 | x = self.pre(c) * x_mask + self.emb_uv(uv.long()).transpose(1, 2) 214 | 215 | if predict_f0: 216 | lf0 = 2595.0 * torch.log10(1.0 + f0.unsqueeze(1) / 700.0) / 500 217 | norm_lf0 = so_vits_svc_fork.f0.normalize_f0( 218 | lf0, x_mask, uv, random_scale=False 219 | ) 220 | pred_lf0 = self.f0_decoder(x, norm_lf0, x_mask, spk_emb=g) 221 | f0 = (700 * (torch.pow(10, pred_lf0 * 500 / 2595) - 1)).squeeze(1) 222 | 223 | z_p, m_p, logs_p, c_mask = self.enc_p( 224 | x, x_mask, f0=f0_to_coarse(f0), noice_scale=noice_scale 225 | ) 226 | z = self.flow(z_p, c_mask, g=g, reverse=True) 227 | 228 | # MB-iSTFT-VITS 229 | if self.mb: 230 | o, o_mb = self.dec(z * c_mask, g=g) 231 | else: 232 | o = self.dec(z * c_mask, g=g, f0=f0) 233 | return o 234 | -------------------------------------------------------------------------------- /src/so_vits_svc_fork/preprocessing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/voicepaw/so-vits-svc-fork/0f015e32aada5cf7481f91bbe6758e574c9c5f39/src/so_vits_svc_fork/preprocessing/__init__.py -------------------------------------------------------------------------------- /src/so_vits_svc_fork/preprocessing/config_templates/quickvc.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "log_interval": 100, 4 | "eval_interval": 200, 5 | "seed": 1234, 6 | "epochs": 10000, 7 | "learning_rate": 0.0001, 8 | "betas": [0.8, 0.99], 9 | "eps": 1e-9, 10 | "batch_size": 16, 11 | "fp16_run": false, 12 | "bf16_run": false, 13 | "lr_decay": 0.999875, 14 | "segment_size": 10240, 15 | "init_lr_ratio": 1, 16 | "warmup_epochs": 0, 17 | "c_mel": 45, 18 | "c_kl": 1.0, 19 | "use_sr": true, 20 | "max_speclen": 512, 21 | "port": "8001", 22 | "keep_ckpts": 3, 23 | "fft_sizes": [768, 1366, 342], 24 | "hop_sizes": [60, 120, 20], 25 | "win_lengths": [300, 600, 120], 26 | "window": "hann_window", 27 | "num_workers": 4, 28 | "log_version": 0, 29 | "ckpt_name_by_step": false, 30 | "accumulate_grad_batches": 1 31 | }, 32 | "data": { 33 | "training_files": "filelists/44k/train.txt", 34 | 
"validation_files": "filelists/44k/val.txt", 35 | "max_wav_value": 32768.0, 36 | "sampling_rate": 44100, 37 | "filter_length": 2048, 38 | "hop_length": 512, 39 | "win_length": 2048, 40 | "n_mel_channels": 80, 41 | "mel_fmin": 0.0, 42 | "mel_fmax": 22050, 43 | "contentvec_final_proj": false 44 | }, 45 | "model": { 46 | "inter_channels": 192, 47 | "hidden_channels": 192, 48 | "filter_channels": 768, 49 | "n_heads": 2, 50 | "n_layers": 6, 51 | "kernel_size": 3, 52 | "p_dropout": 0.1, 53 | "resblock": "1", 54 | "resblock_kernel_sizes": [3, 7, 11], 55 | "resblock_dilation_sizes": [ 56 | [1, 3, 5], 57 | [1, 3, 5], 58 | [1, 3, 5] 59 | ], 60 | "upsample_rates": [8, 4], 61 | "upsample_initial_channel": 512, 62 | "upsample_kernel_sizes": [32, 16], 63 | "n_layers_q": 3, 64 | "use_spectral_norm": false, 65 | "gin_channels": 256, 66 | "ssl_dim": 768, 67 | "n_speakers": 200, 68 | "type_": "ms-istft", 69 | "gen_istft_n_fft": 16, 70 | "gen_istft_hop_size": 4, 71 | "subbands": 4, 72 | "pretrained": { 73 | "D_0.pth": "https://huggingface.co/datasets/ms903/sovits4.0-768vec-layer12/resolve/main/sovits_768l12_pre_large_320k/clean_D_320000.pth", 74 | "G_0.pth": "https://huggingface.co/datasets/ms903/sovits4.0-768vec-layer12/resolve/main/sovits_768l12_pre_large_320k/clean_G_320000.pth" 75 | } 76 | }, 77 | "spk": {} 78 | } 79 | -------------------------------------------------------------------------------- /src/so_vits_svc_fork/preprocessing/config_templates/so-vits-svc-4.0v1-legacy.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "log_interval": 200, 4 | "eval_interval": 800, 5 | "seed": 1234, 6 | "epochs": 10000, 7 | "learning_rate": 0.0001, 8 | "betas": [0.8, 0.99], 9 | "eps": 1e-9, 10 | "batch_size": 16, 11 | "fp16_run": false, 12 | "bf16_run": false, 13 | "lr_decay": 0.999875, 14 | "segment_size": 10240, 15 | "init_lr_ratio": 1, 16 | "warmup_epochs": 0, 17 | "c_mel": 45, 18 | "c_kl": 1.0, 19 | "use_sr": true, 20 | "max_speclen": 512, 21 | "port": "8001", 22 | "keep_ckpts": 3, 23 | "num_workers": 4, 24 | "log_version": 0, 25 | "ckpt_name_by_step": false, 26 | "accumulate_grad_batches": 1 27 | }, 28 | "data": { 29 | "training_files": "filelists/44k/train.txt", 30 | "validation_files": "filelists/44k/val.txt", 31 | "max_wav_value": 32768.0, 32 | "sampling_rate": 44100, 33 | "filter_length": 2048, 34 | "hop_length": 512, 35 | "win_length": 2048, 36 | "n_mel_channels": 80, 37 | "mel_fmin": 0.0, 38 | "mel_fmax": 22050 39 | }, 40 | "model": { 41 | "inter_channels": 192, 42 | "hidden_channels": 192, 43 | "filter_channels": 768, 44 | "n_heads": 2, 45 | "n_layers": 6, 46 | "kernel_size": 3, 47 | "p_dropout": 0.1, 48 | "resblock": "1", 49 | "resblock_kernel_sizes": [3, 7, 11], 50 | "resblock_dilation_sizes": [ 51 | [1, 3, 5], 52 | [1, 3, 5], 53 | [1, 3, 5] 54 | ], 55 | "upsample_rates": [8, 8, 2, 2, 2], 56 | "upsample_initial_channel": 512, 57 | "upsample_kernel_sizes": [16, 16, 4, 4, 4], 58 | "n_layers_q": 3, 59 | "use_spectral_norm": false, 60 | "gin_channels": 256, 61 | "ssl_dim": 256, 62 | "n_speakers": 200, 63 | "pretrained": { 64 | "D_0.pth": "https://huggingface.co/therealvul/so-vits-svc-4.0-init/resolve/main/D_0.pth", 65 | "G_0.pth": "https://huggingface.co/therealvul/so-vits-svc-4.0-init/resolve/main/G_0.pth" 66 | } 67 | }, 68 | "spk": {} 69 | } 70 | -------------------------------------------------------------------------------- /src/so_vits_svc_fork/preprocessing/config_templates/so-vits-svc-4.0v1.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "log_interval": 100, 4 | "eval_interval": 200, 5 | "seed": 1234, 6 | "epochs": 10000, 7 | "learning_rate": 0.0001, 8 | "betas": [0.8, 0.99], 9 | "eps": 1e-9, 10 | "batch_size": 16, 11 | "fp16_run": false, 12 | "bf16_run": false, 13 | "lr_decay": 0.999875, 14 | "segment_size": 10240, 15 | "init_lr_ratio": 1, 16 | "warmup_epochs": 0, 17 | "c_mel": 45, 18 | "c_kl": 1.0, 19 | "use_sr": true, 20 | "max_speclen": 512, 21 | "port": "8001", 22 | "keep_ckpts": 3, 23 | "num_workers": 4, 24 | "log_version": 0, 25 | "ckpt_name_by_step": false, 26 | "accumulate_grad_batches": 1 27 | }, 28 | "data": { 29 | "training_files": "filelists/44k/train.txt", 30 | "validation_files": "filelists/44k/val.txt", 31 | "max_wav_value": 32768.0, 32 | "sampling_rate": 44100, 33 | "filter_length": 2048, 34 | "hop_length": 512, 35 | "win_length": 2048, 36 | "n_mel_channels": 80, 37 | "mel_fmin": 0.0, 38 | "mel_fmax": 22050, 39 | "contentvec_final_proj": false 40 | }, 41 | "model": { 42 | "inter_channels": 192, 43 | "hidden_channels": 192, 44 | "filter_channels": 768, 45 | "n_heads": 2, 46 | "n_layers": 6, 47 | "kernel_size": 3, 48 | "p_dropout": 0.1, 49 | "resblock": "1", 50 | "resblock_kernel_sizes": [3, 7, 11], 51 | "resblock_dilation_sizes": [ 52 | [1, 3, 5], 53 | [1, 3, 5], 54 | [1, 3, 5] 55 | ], 56 | "upsample_rates": [8, 8, 2, 2, 2], 57 | "upsample_initial_channel": 512, 58 | "upsample_kernel_sizes": [16, 16, 4, 4, 4], 59 | "n_layers_q": 3, 60 | "use_spectral_norm": false, 61 | "gin_channels": 256, 62 | "ssl_dim": 768, 63 | "n_speakers": 200, 64 | "type_": "hifi-gan", 65 | "pretrained": { 66 | "D_0.pth": "https://huggingface.co/datasets/ms903/sovits4.0-768vec-layer12/resolve/main/sovits_768l12_pre_large_320k/clean_D_320000.pth", 67 | "G_0.pth": "https://huggingface.co/datasets/ms903/sovits4.0-768vec-layer12/resolve/main/sovits_768l12_pre_large_320k/clean_G_320000.pth" 68 | } 69 | }, 70 | "spk": {} 71 | } 72 | -------------------------------------------------------------------------------- /src/so_vits_svc_fork/preprocessing/preprocess_classify.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from logging import getLogger 4 | from pathlib import Path 5 | 6 | import keyboard 7 | import librosa 8 | import sounddevice as sd 9 | import soundfile as sf 10 | from rich.console import Console 11 | from tqdm.rich import tqdm 12 | 13 | LOG = getLogger(__name__) 14 | 15 | 16 | def preprocess_classify( 17 | input_dir: Path | str, output_dir: Path | str, create_new: bool = True 18 | ) -> None: 19 | # paths 20 | input_dir_ = Path(input_dir) 21 | output_dir_ = Path(output_dir) 22 | speed = 1 23 | if not input_dir_.is_dir(): 24 | raise ValueError(f"{input_dir} is not a directory.") 25 | output_dir_.mkdir(exist_ok=True) 26 | 27 | console = Console() 28 | # get audio paths and folders 29 | audio_paths = list(input_dir_.glob("*.*")) 30 | last_folders = [x for x in output_dir_.glob("*") if x.is_dir()] 31 | console.print("Press ↑ or ↓ to change speed. 
Press any other key to classify.") 32 | console.print(f"Folders: {[x.name for x in last_folders]}") 33 | 34 | pbar_description = "" 35 | 36 | pbar = tqdm(audio_paths) 37 | for audio_path in pbar: 38 | # read file 39 | audio, sr = sf.read(audio_path) 40 | 41 | # update description 42 | duration = librosa.get_duration(y=audio, sr=sr) 43 | pbar_description = f"{duration:.1f} {pbar_description}" 44 | pbar.set_description(pbar_description) 45 | 46 | while True: 47 | # start playing 48 | sd.play(librosa.effects.time_stretch(audio, rate=speed), sr, loop=True) 49 | 50 | # wait for key press 51 | key = str(keyboard.read_key()) 52 | if key == "down": 53 | speed /= 1.1 54 | console.print(f"Speed: {speed:.2f}") 55 | elif key == "up": 56 | speed *= 1.1 57 | console.print(f"Speed: {speed:.2f}") 58 | else: 59 | break 60 | 61 | # stop playing 62 | sd.stop() 63 | 64 | # print if folder changed 65 | folders = [x for x in output_dir_.glob("*") if x.is_dir()] 66 | if folders != last_folders: 67 | console.print(f"Folders updated: {[x.name for x in folders]}") 68 | last_folders = folders 69 | 70 | # get folder 71 | folder_candidates = [x for x in folders if x.name.startswith(key)] 72 | if len(folder_candidates) == 0: 73 | if create_new: 74 | folder = output_dir_ / key 75 | else: 76 | console.print(f"No folder starts with {key}.") 77 | continue 78 | else: 79 | if len(folder_candidates) > 1: 80 | LOG.warning( 81 | f"Multiple folders ({[x.name for x in folder_candidates]}) start with {key}. " 82 | f"Using first one ({folder_candidates[0].name})." 83 | ) 84 | folder = folder_candidates[0] 85 | folder.mkdir(exist_ok=True) 86 | 87 | # move file 88 | new_path = folder / audio_path.name 89 | audio_path.rename(new_path) 90 | 91 | # update description 92 | pbar_description = f"Last: {audio_path.name} -> {folder.name}" 93 | 94 | # yield result 95 | # yield audio_path, key, folder, new_path 96 | -------------------------------------------------------------------------------- /src/so_vits_svc_fork/preprocessing/preprocess_flist_config.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import json 4 | import os 5 | from copy import deepcopy 6 | from logging import getLogger 7 | from pathlib import Path 8 | 9 | import numpy as np 10 | from librosa import get_duration 11 | from tqdm import tqdm 12 | 13 | LOG = getLogger(__name__) 14 | CONFIG_TEMPLATE_DIR = Path(__file__).parent / "config_templates" 15 | 16 | 17 | def preprocess_config( 18 | input_dir: Path | str, 19 | train_list_path: Path | str, 20 | val_list_path: Path | str, 21 | test_list_path: Path | str, 22 | config_path: Path | str, 23 | config_name: str, 24 | ): 25 | input_dir = Path(input_dir) 26 | train_list_path = Path(train_list_path) 27 | val_list_path = Path(val_list_path) 28 | test_list_path = Path(test_list_path) 29 | config_path = Path(config_path) 30 | train = [] 31 | val = [] 32 | test = [] 33 | spk_dict = {} 34 | spk_id = 0 35 | random = np.random.RandomState(1234) 36 | for speaker in os.listdir(input_dir): 37 | spk_dict[speaker] = spk_id 38 | spk_id += 1 39 | paths = [] 40 | for path in tqdm(list((input_dir / speaker).rglob("*.wav"))): 41 | if get_duration(filename=path) < 0.3: 42 | LOG.warning(f"skip {path} because it is too short.") 43 | continue 44 | paths.append(path) 45 | random.shuffle(paths) 46 | if len(paths) <= 4: 47 | raise ValueError( 48 | f"too few files in {input_dir / speaker} (expected at least 5)." 
49 | ) 50 | train += paths[2:-2] 51 | val += paths[:2] 52 | test += paths[-2:] 53 | 54 | LOG.info(f"Writing {train_list_path}") 55 | train_list_path.parent.mkdir(parents=True, exist_ok=True) 56 | train_list_path.write_text( 57 | "\n".join([x.as_posix() for x in train]), encoding="utf-8" 58 | ) 59 | 60 | LOG.info(f"Writing {val_list_path}") 61 | val_list_path.parent.mkdir(parents=True, exist_ok=True) 62 | val_list_path.write_text("\n".join([x.as_posix() for x in val]), encoding="utf-8") 63 | 64 | LOG.info(f"Writing {test_list_path}") 65 | test_list_path.parent.mkdir(parents=True, exist_ok=True) 66 | test_list_path.write_text("\n".join([x.as_posix() for x in test]), encoding="utf-8") 67 | 68 | config = deepcopy( 69 | json.loads( 70 | ( 71 | CONFIG_TEMPLATE_DIR 72 | / ( 73 | config_name 74 | if config_name.endswith(".json") 75 | else config_name + ".json" 76 | ) 77 | ).read_text(encoding="utf-8") 78 | ) 79 | ) 80 | config["spk"] = spk_dict 81 | config["data"]["training_files"] = train_list_path.as_posix() 82 | config["data"]["validation_files"] = val_list_path.as_posix() 83 | LOG.info(f"Writing {config_path}") 84 | config_path.parent.mkdir(parents=True, exist_ok=True) 85 | with config_path.open("w", encoding="utf-8") as f: 86 | json.dump(config, f, indent=2) 87 | -------------------------------------------------------------------------------- /src/so_vits_svc_fork/preprocessing/preprocess_hubert_f0.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from logging import getLogger 4 | from pathlib import Path 5 | from random import shuffle 6 | from typing import Iterable, Literal 7 | 8 | import librosa 9 | import numpy as np 10 | import torch 11 | import torchaudio 12 | from joblib import Parallel, cpu_count, delayed 13 | from tqdm import tqdm 14 | from transformers import HubertModel 15 | 16 | import so_vits_svc_fork.f0 17 | from so_vits_svc_fork import utils 18 | 19 | from ..hparams import HParams 20 | from ..modules.mel_processing import spec_to_mel_torch, spectrogram_torch 21 | from ..utils import get_optimal_device, get_total_gpu_memory 22 | from .preprocess_utils import check_hubert_min_duration 23 | 24 | LOG = getLogger(__name__) 25 | HUBERT_MEMORY = 2900 26 | HUBERT_MEMORY_CREPE = 3900 27 | 28 | 29 | def _process_one( 30 | *, 31 | filepath: Path, 32 | content_model: HubertModel, 33 | device: torch.device | str = get_optimal_device(), 34 | f0_method: Literal["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"] = "dio", 35 | force_rebuild: bool = False, 36 | hps: HParams, 37 | ): 38 | audio, sr = librosa.load(filepath, sr=hps.data.sampling_rate, mono=True) 39 | 40 | if not check_hubert_min_duration(audio, sr): 41 | LOG.info(f"Skip {filepath} because it is too short.") 42 | return 43 | 44 | data_path = filepath.parent / (filepath.name + ".data.pt") 45 | if data_path.exists() and not force_rebuild: 46 | return 47 | 48 | # Compute f0 49 | f0 = so_vits_svc_fork.f0.compute_f0( 50 | audio, sampling_rate=sr, hop_length=hps.data.hop_length, method=f0_method 51 | ) 52 | f0, uv = so_vits_svc_fork.f0.interpolate_f0(f0) 53 | f0 = torch.from_numpy(f0).float() 54 | uv = torch.from_numpy(uv).float() 55 | 56 | # Compute HuBERT content 57 | audio = torch.from_numpy(audio).float().to(device) 58 | c = utils.get_content( 59 | content_model, 60 | audio, 61 | device, 62 | sr=sr, 63 | legacy_final_proj=hps.data.get("contentvec_final_proj", True), 64 | ) 65 | c = utils.repeat_expand_2d(c.squeeze(0), f0.shape[0]) 66 | 
torch.cuda.empty_cache() 67 | 68 | # Compute spectrogram 69 | audio, sr = torchaudio.load(filepath) 70 | spec = spectrogram_torch(audio, hps).squeeze(0) 71 | mel_spec = spec_to_mel_torch(spec, hps) 72 | torch.cuda.empty_cache() 73 | 74 | # fix lengths 75 | lmin = min(spec.shape[1], mel_spec.shape[1], f0.shape[0], uv.shape[0], c.shape[1]) 76 | spec, mel_spec, f0, uv, c = ( 77 | spec[:, :lmin], 78 | mel_spec[:, :lmin], 79 | f0[:lmin], 80 | uv[:lmin], 81 | c[:, :lmin], 82 | ) 83 | 84 | # get speaker id 85 | spk_name = filepath.parent.name 86 | spk = hps.spk.__dict__[spk_name] 87 | spk = torch.tensor(spk).long() 88 | assert ( 89 | spec.shape[1] == mel_spec.shape[1] == f0.shape[0] == uv.shape[0] == c.shape[1] 90 | ), (spec.shape, mel_spec.shape, f0.shape, uv.shape, c.shape) 91 | data = { 92 | "spec": spec, 93 | "mel_spec": mel_spec, 94 | "f0": f0, 95 | "uv": uv, 96 | "content": c, 97 | "audio": audio, 98 | "spk": spk, 99 | } 100 | data = {k: v.cpu() for k, v in data.items()} 101 | with data_path.open("wb") as f: 102 | torch.save(data, f) 103 | 104 | 105 | def _process_batch(filepaths: Iterable[Path], pbar_position: int, **kwargs): 106 | hps = kwargs["hps"] 107 | content_model = utils.get_hubert_model( 108 | get_optimal_device(), hps.data.get("contentvec_final_proj", True) 109 | ) 110 | 111 | for filepath in tqdm(filepaths, position=pbar_position): 112 | _process_one( 113 | content_model=content_model, 114 | filepath=filepath, 115 | **kwargs, 116 | ) 117 | 118 | 119 | def preprocess_hubert_f0( 120 | input_dir: Path | str, 121 | config_path: Path | str, 122 | n_jobs: int | None = None, 123 | f0_method: Literal["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"] = "dio", 124 | force_rebuild: bool = False, 125 | ): 126 | input_dir = Path(input_dir) 127 | config_path = Path(config_path) 128 | hps = utils.get_hparams(config_path) 129 | if n_jobs is None: 130 | # cap n_jobs at cpu_count() to avoid SIGKILL 131 | memory = get_total_gpu_memory("total") 132 | n_jobs = min( 133 | max( 134 | ( 135 | memory 136 | // (HUBERT_MEMORY_CREPE if f0_method == "crepe" else HUBERT_MEMORY) 137 | if memory is not None 138 | else 1 139 | ), 140 | 1, 141 | ), 142 | cpu_count(), 143 | ) 144 | LOG.info(f"n_jobs automatically set to {n_jobs}, memory: {memory} MiB") 145 | 146 | filepaths = list(input_dir.rglob("*.wav")) 147 | n_jobs = min(len(filepaths) // 16 + 1, n_jobs) 148 | shuffle(filepaths) 149 | filepath_chunks = np.array_split(filepaths, n_jobs) 150 | Parallel(n_jobs=n_jobs)( 151 | delayed(_process_batch)( 152 | filepaths=chunk, 153 | pbar_position=pbar_position, 154 | f0_method=f0_method, 155 | force_rebuild=force_rebuild, 156 | hps=hps, 157 | ) 158 | for (pbar_position, chunk) in enumerate(filepath_chunks) 159 | ) 160 | -------------------------------------------------------------------------------- /src/so_vits_svc_fork/preprocessing/preprocess_resample.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import warnings 4 | from logging import getLogger 5 | from pathlib import Path 6 | from typing import Iterable 7 | 8 | import librosa 9 | import soundfile 10 | from joblib import Parallel, delayed 11 | from tqdm_joblib import tqdm_joblib 12 | 13 | from .preprocess_utils import check_hubert_min_duration 14 | 15 | LOG = getLogger(__name__) 16 | 17 | # input_dir and output_dir already exist. 18 | # Convert the audio files in input_dir into audio files in output_dir, 19 | # without changing the folder structure. Use joblib to parallelize. 
20 | # Converting audio files includes: 21 | # - resampling to the specified sampling rate 22 | # - trimming silence 23 | # - adjusting volume in a smart way 24 | # - saving as a 16-bit wav file 25 | 26 | 27 | def _get_unique_filename(path: Path, existing_paths: Iterable[Path]) -> Path: 28 | """Return a unique path by appending a number to the original path.""" 29 | if path not in existing_paths: 30 | return path 31 | i = 1 32 | while True: 33 | new_path = path.parent / f"{path.stem}_{i}{path.suffix}" 34 | if new_path not in existing_paths: 35 | return new_path 36 | i += 1 37 | 38 | 39 | def is_relative_to(path: Path, *other): 40 | """Return True if the path is relative to another path or False. 41 | Python 3.9+ has Path.is_relative_to() method, but we need to support Python 3.8. 42 | """ 43 | try: 44 | path.relative_to(*other) 45 | return True 46 | except ValueError: 47 | return False 48 | 49 | 50 | def _preprocess_one( 51 | input_path: Path, 52 | output_path: Path, 53 | sr: int, 54 | *, 55 | top_db: int, 56 | frame_seconds: float, 57 | hop_seconds: float, 58 | ) -> None: 59 | """Preprocess one audio file.""" 60 | 61 | try: 62 | audio, sr = librosa.load(input_path, sr=sr, mono=True) 63 | 64 | # Audioread is the last backend it will attempt, so this is the exception thrown on failure 65 | except Exception as e: 66 | # Failure due to attempting to load a file that is not audio, so return early 67 | LOG.warning(f"Failed to load {input_path} due to {e}") 68 | return 69 | 70 | if not check_hubert_min_duration(audio, sr): 71 | LOG.info(f"Skip {input_path} because it is too short.") 72 | return 73 | 74 | # Adjust volume 75 | audio /= max(audio.max(), -audio.min()) 76 | 77 | # Trim silence 78 | audio, _ = librosa.effects.trim( 79 | audio, 80 | top_db=top_db, 81 | frame_length=int(frame_seconds * sr), 82 | hop_length=int(hop_seconds * sr), 83 | ) 84 | 85 | if not check_hubert_min_duration(audio, sr): 86 | LOG.info(f"Skip {input_path} because it is too short.") 87 | return 88 | 89 | soundfile.write(output_path, audio, samplerate=sr, subtype="PCM_16") 90 | 91 | 92 | def preprocess_resample( 93 | input_dir: Path | str, 94 | output_dir: Path | str, 95 | sampling_rate: int, 96 | n_jobs: int = -1, 97 | *, 98 | top_db: int = 30, 99 | frame_seconds: float = 0.1, 100 | hop_seconds: float = 0.05, 101 | ) -> None: 102 | """Preprocess audio files in input_dir and save them to output_dir.""" 103 | input_dir = Path(input_dir) 104 | output_dir = Path(output_dir) 105 | kept_in_paths = [] 106 | out_paths = [] 107 | in_paths = list(input_dir.rglob("*.*")) 108 | if not in_paths: 109 | raise ValueError(f"No audio files found in {input_dir}") 110 | for in_path in in_paths: 111 | in_path_relative = in_path.relative_to(input_dir) 112 | if not in_path.is_absolute() and is_relative_to( 113 | in_path, Path("dataset_raw") / "44k" 114 | ): 115 | new_in_path_relative = in_path_relative.relative_to("44k") 116 | warnings.warn( 117 | f"Recommended folder structure has changed since v1.0.0. " 118 | "Please move your dataset directly under the dataset_raw folder. 
" 119 | f"Recoginzed {in_path_relative} as {new_in_path_relative}" 120 | ) 121 | in_path_relative = new_in_path_relative 122 | 123 | if len(in_path_relative.parts) < 2: 124 | continue 125 | speaker_name = in_path_relative.parts[0] 126 | file_name = in_path_relative.with_suffix(".wav").name 127 | out_path = output_dir / speaker_name / file_name 128 | out_path = _get_unique_filename(out_path, out_paths) 129 | out_path.parent.mkdir(parents=True, exist_ok=True) 130 | out_paths.append(out_path) 131 | 132 | in_and_out_paths = list(zip(in_paths, out_paths)) 133 | 134 | with tqdm_joblib(desc="Preprocessing", total=len(in_and_out_paths)): 135 | Parallel(n_jobs=n_jobs)( 136 | delayed(_preprocess_one)( 137 | *args, 138 | sr=sampling_rate, 139 | top_db=top_db, 140 | frame_seconds=frame_seconds, 141 | hop_seconds=hop_seconds, 142 | ) 143 | for args in in_and_out_paths 144 | ) 145 | -------------------------------------------------------------------------------- /src/so_vits_svc_fork/preprocessing/preprocess_speaker_diarization.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from collections import defaultdict 4 | from logging import getLogger 5 | from pathlib import Path 6 | 7 | import librosa 8 | import soundfile as sf 9 | import torch 10 | from joblib import Parallel, delayed 11 | from pyannote.audio import Pipeline 12 | from tqdm import tqdm 13 | from tqdm_joblib import tqdm_joblib 14 | 15 | LOG = getLogger(__name__) 16 | 17 | 18 | def _process_one( 19 | input_path: Path, 20 | output_dir: Path, 21 | sr: int, 22 | *, 23 | min_speakers: int = 1, 24 | max_speakers: int = 1, 25 | huggingface_token: str | None = None, 26 | ) -> None: 27 | try: 28 | audio, sr = librosa.load(input_path, sr=sr, mono=True) 29 | except Exception as e: 30 | LOG.warning(f"Failed to read {input_path}: {e}") 31 | return 32 | pipeline = Pipeline.from_pretrained( 33 | "pyannote/speaker-diarization-3.1", use_auth_token=huggingface_token 34 | ) 35 | if pipeline is None: 36 | raise ValueError("Failed to load pipeline") 37 | pipeline = pipeline.to(torch.device("cuda")) 38 | LOG.info(f"Processing {input_path}. This may take a while...") 39 | diarization = pipeline( 40 | input_path, min_speakers=min_speakers, max_speakers=max_speakers 41 | ) 42 | 43 | LOG.info(f"Found {len(diarization)} tracks, writing to {output_dir}") 44 | speaker_count = defaultdict(int) 45 | 46 | output_dir.mkdir(parents=True, exist_ok=True) 47 | for segment, track, speaker in tqdm( 48 | list(diarization.itertracks(yield_label=True)), desc=f"Writing {input_path}" 49 | ): 50 | if segment.end - segment.start < 1: 51 | continue 52 | speaker_count[speaker] += 1 53 | audio_cut = audio[int(segment.start * sr) : int(segment.end * sr)] 54 | sf.write( 55 | (output_dir / f"{speaker}_{speaker_count[speaker]:04d}.wav"), 56 | audio_cut, 57 | sr, 58 | ) 59 | 60 | LOG.info(f"Speaker count: {speaker_count}") 61 | 62 | 63 | def preprocess_speaker_diarization( 64 | input_dir: Path | str, 65 | output_dir: Path | str, 66 | sr: int, 67 | *, 68 | min_speakers: int = 1, 69 | max_speakers: int = 1, 70 | huggingface_token: str | None = None, 71 | n_jobs: int = -1, 72 | ) -> None: 73 | if huggingface_token is not None and not huggingface_token.startswith("hf_"): 74 | LOG.warning("Huggingface token probably should start with hf_") 75 | if not torch.cuda.is_available(): 76 | LOG.warning("CUDA is not available. 
This will be extremely slow.") 77 | input_dir = Path(input_dir) 78 | output_dir = Path(output_dir) 79 | input_dir.mkdir(parents=True, exist_ok=True) 80 | output_dir.mkdir(parents=True, exist_ok=True) 81 | input_paths = list(input_dir.rglob("*.*")) 82 | with tqdm_joblib(desc="Preprocessing speaker diarization", total=len(input_paths)): 83 | Parallel(n_jobs=n_jobs)( 84 | delayed(_process_one)( 85 | input_path, 86 | output_dir / input_path.relative_to(input_dir).parent / input_path.stem, 87 | sr, 88 | max_speakers=max_speakers, 89 | min_speakers=min_speakers, 90 | huggingface_token=huggingface_token, 91 | ) 92 | for input_path in input_paths 93 | ) 94 | -------------------------------------------------------------------------------- /src/so_vits_svc_fork/preprocessing/preprocess_split.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from logging import getLogger 4 | from pathlib import Path 5 | 6 | import librosa 7 | import soundfile as sf 8 | from joblib import Parallel, delayed 9 | from tqdm import tqdm 10 | from tqdm_joblib import tqdm_joblib 11 | 12 | LOG = getLogger(__name__) 13 | 14 | 15 | def _process_one( 16 | input_path: Path, 17 | output_dir: Path, 18 | sr: int, 19 | *, 20 | max_length: float = 10.0, 21 | top_db: int = 30, 22 | frame_seconds: float = 0.5, 23 | hop_seconds: float = 0.1, 24 | ): 25 | try: 26 | audio, sr = librosa.load(input_path, sr=sr, mono=True) 27 | except Exception as e: 28 | LOG.warning(f"Failed to read {input_path}: {e}") 29 | return 30 | intervals = librosa.effects.split( 31 | audio, 32 | top_db=top_db, 33 | frame_length=int(sr * frame_seconds), 34 | hop_length=int(sr * hop_seconds), 35 | ) 36 | output_dir.mkdir(parents=True, exist_ok=True) 37 | for start, end in tqdm(intervals, desc=f"Writing {input_path}"): 38 | for sub_start in range(start, end, int(sr * max_length)): 39 | sub_end = min(sub_start + int(sr * max_length), end) 40 | audio_cut = audio[sub_start:sub_end] 41 | sf.write( 42 | ( 43 | output_dir 44 | / f"{input_path.stem}_{sub_start / sr:.3f}_{sub_end / sr:.3f}.wav" 45 | ), 46 | audio_cut, 47 | sr, 48 | ) 49 | 50 | 51 | def preprocess_split( 52 | input_dir: Path | str, 53 | output_dir: Path | str, 54 | sr: int, 55 | *, 56 | max_length: float = 10.0, 57 | top_db: int = 30, 58 | frame_seconds: float = 0.5, 59 | hop_seconds: float = 0.1, 60 | n_jobs: int = -1, 61 | ): 62 | input_dir = Path(input_dir) 63 | output_dir = Path(output_dir) 64 | output_dir.mkdir(parents=True, exist_ok=True) 65 | input_paths = list(input_dir.rglob("*.*")) 66 | with tqdm_joblib(desc="Splitting", total=len(input_paths)): 67 | Parallel(n_jobs=n_jobs)( 68 | delayed(_process_one)( 69 | input_path, 70 | output_dir / input_path.relative_to(input_dir).parent, 71 | sr, 72 | max_length=max_length, 73 | top_db=top_db, 74 | frame_seconds=frame_seconds, 75 | hop_seconds=hop_seconds, 76 | ) 77 | for input_path in input_paths 78 | ) 79 | -------------------------------------------------------------------------------- /src/so_vits_svc_fork/preprocessing/preprocess_utils.py: -------------------------------------------------------------------------------- 1 | from numpy import ndarray 2 | 3 | 4 | def check_hubert_min_duration(audio: ndarray, sr: int) -> bool: 5 | return len(audio) / sr >= 0.3 6 | -------------------------------------------------------------------------------- /src/so_vits_svc_fork/py.typed: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/voicepaw/so-vits-svc-fork/0f015e32aada5cf7481f91bbe6758e574c9c5f39/src/so_vits_svc_fork/py.typed -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/voicepaw/so-vits-svc-fork/0f015e32aada5cf7481f91bbe6758e574c9c5f39/tests/__init__.py -------------------------------------------------------------------------------- /tests/dataset_raw/test/LJ001-0001.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/voicepaw/so-vits-svc-fork/0f015e32aada5cf7481f91bbe6758e574c9c5f39/tests/dataset_raw/test/LJ001-0001.wav -------------------------------------------------------------------------------- /tests/dataset_raw/test/LJ001-0002.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/voicepaw/so-vits-svc-fork/0f015e32aada5cf7481f91bbe6758e574c9c5f39/tests/dataset_raw/test/LJ001-0002.wav -------------------------------------------------------------------------------- /tests/dataset_raw/test/LJ001-0003.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/voicepaw/so-vits-svc-fork/0f015e32aada5cf7481f91bbe6758e574c9c5f39/tests/dataset_raw/test/LJ001-0003.wav -------------------------------------------------------------------------------- /tests/dataset_raw/test/LJ001-0004.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/voicepaw/so-vits-svc-fork/0f015e32aada5cf7481f91bbe6758e574c9c5f39/tests/dataset_raw/test/LJ001-0004.wav -------------------------------------------------------------------------------- /tests/dataset_raw/test/LJ001-0005.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/voicepaw/so-vits-svc-fork/0f015e32aada5cf7481f91bbe6758e574c9c5f39/tests/dataset_raw/test/LJ001-0005.wav -------------------------------------------------------------------------------- /tests/dataset_raw/test/LJ001-0006.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/voicepaw/so-vits-svc-fork/0f015e32aada5cf7481f91bbe6758e574c9c5f39/tests/dataset_raw/test/LJ001-0006.wav -------------------------------------------------------------------------------- /tests/dataset_raw/test/LJ001-0007.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/voicepaw/so-vits-svc-fork/0f015e32aada5cf7481f91bbe6758e574c9c5f39/tests/dataset_raw/test/LJ001-0007.wav -------------------------------------------------------------------------------- /tests/dataset_raw/test/LJ001-0008.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/voicepaw/so-vits-svc-fork/0f015e32aada5cf7481f91bbe6758e574c9c5f39/tests/dataset_raw/test/LJ001-0008.wav -------------------------------------------------------------------------------- /tests/dataset_raw/test/LJ001-0009.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/voicepaw/so-vits-svc-fork/0f015e32aada5cf7481f91bbe6758e574c9c5f39/tests/dataset_raw/test/LJ001-0009.wav 
-------------------------------------------------------------------------------- /tests/dataset_raw/test/LJ001-0010.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/voicepaw/so-vits-svc-fork/0f015e32aada5cf7481f91bbe6758e574c9c5f39/tests/dataset_raw/test/LJ001-0010.wav -------------------------------------------------------------------------------- /tests/dataset_raw/test/nested/LJ001-0001.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/voicepaw/so-vits-svc-fork/0f015e32aada5cf7481f91bbe6758e574c9c5f39/tests/dataset_raw/test/nested/LJ001-0001.wav -------------------------------------------------------------------------------- /tests/dataset_raw/test/nested/に.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/voicepaw/so-vits-svc-fork/0f015e32aada5cf7481f91bbe6758e574c9c5f39/tests/dataset_raw/test/nested/に.wav -------------------------------------------------------------------------------- /tests/test_main.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from pathlib import Path 4 | from unittest import SkipTest, TestCase 5 | 6 | IS_CI = os.environ.get("GITHUB_ACTIONS", False) 7 | IS_COLAB = os.getenv("COLAB_RELEASE_TAG", False) 8 | 9 | 10 | class TestMain(TestCase): 11 | def test_import(self): 12 | import so_vits_svc_fork.cluster.train_cluster # noqa 13 | import so_vits_svc_fork.inference.main # noqa 14 | 15 | # import so_vits_svc_fork.modules.onnx._export # noqa 16 | import so_vits_svc_fork.preprocessing.preprocess_flist_config # noqa 17 | import so_vits_svc_fork.preprocessing.preprocess_hubert_f0 # noqa 18 | import so_vits_svc_fork.preprocessing.preprocess_resample # noqa 19 | import so_vits_svc_fork.preprocessing.preprocess_split # noqa 20 | import so_vits_svc_fork.train # noqa 21 | 22 | def test_infer(self): 23 | if IS_CI: 24 | raise SkipTest("Skip inference test on CI") 25 | from so_vits_svc_fork.inference.main import infer # noqa 26 | 27 | # infer("tests/dataset_raw/34j/1.wav", "tests/configs/config.json", "tests/logs/44k") 28 | 29 | def test_preprocess(self): 30 | from so_vits_svc_fork.preprocessing.preprocess_resample import ( 31 | preprocess_resample, 32 | ) 33 | 34 | preprocess_resample( 35 | "tests/dataset_raw", "tests/dataset/44k", 44100, n_jobs=1 if IS_CI else -1 36 | ) 37 | 38 | from so_vits_svc_fork.preprocessing.preprocess_flist_config import ( 39 | preprocess_config, 40 | ) 41 | 42 | preprocess_config( 43 | "tests/dataset/44k", 44 | "tests/filelists/train.txt", 45 | "tests/filelists/val.txt", 46 | "tests/filelists/test.txt", 47 | "tests/configs/44k/config.json", 48 | "so-vits-svc-4.0v1", 49 | ) 50 | 51 | if IS_CI: 52 | raise SkipTest("Skip hubert and f0 test on CI") 53 | from so_vits_svc_fork.preprocessing.preprocess_hubert_f0 import ( 54 | preprocess_hubert_f0, 55 | ) 56 | 57 | preprocess_hubert_f0("tests/dataset/44k", "tests/configs/44k/config.json") 58 | 59 | def test_train(self): 60 | if not IS_COLAB: 61 | raise SkipTest("Skip training test on non-colab") 62 | # requires >10 GB of GPU memory, so it can only be tested on Colab 63 | from so_vits_svc_fork.train import train 64 | 65 | config_path = Path("tests/logs/44k/config.json") 66 | config_json = json.loads(config_path.read_text("utf-8")) 67 | config_json["train"]["epochs"] = 1 68 | config_path.write_text(json.dumps(config_json), "utf-8") 69 | 
train(config_path, "tests/logs/44k") 70 | --------------------------------------------------------------------------------
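
The preprocessing modules above form a pipeline, and tests/test_main.py exercises it in this order. Below is a minimal sketch of chaining the stages by hand; the directory names are illustrative (only the template name "so-vits-svc-4.0v1" and the 44100 Hz rate are taken from the tests and config templates):

from so_vits_svc_fork.preprocessing.preprocess_split import preprocess_split
from so_vits_svc_fork.preprocessing.preprocess_resample import preprocess_resample
from so_vits_svc_fork.preprocessing.preprocess_flist_config import preprocess_config
from so_vits_svc_fork.preprocessing.preprocess_hubert_f0 import preprocess_hubert_f0

# 1. Cut long recordings into <= 10 s chunks at silence boundaries.
preprocess_split("recordings", "dataset_raw", sr=44100)

# 2. Resample to 16-bit 44.1 kHz wav, trimming silence and normalizing volume;
#    expects one subdirectory per speaker under dataset_raw.
preprocess_resample("dataset_raw", "dataset/44k", sampling_rate=44100)

# 3. Shuffle each speaker's files, write train/val/test lists, and fill a
#    config template with the speaker-to-id map and the list paths.
preprocess_config(
    "dataset/44k",
    "filelists/44k/train.txt",
    "filelists/44k/val.txt",
    "filelists/44k/test.txt",
    "configs/44k/config.json",
    "so-vits-svc-4.0v1",
)

# 4. Cache f0, HuBERT content and spectrograms next to each wav as .data.pt.
preprocess_hubert_f0("dataset/44k", "configs/44k/config.json", f0_method="dio")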
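
When n_jobs is not given, preprocess_hubert_f0 sizes its worker pool from total GPU memory: it divides by a per-worker budget (HUBERT_MEMORY_CREPE = 3900 MiB for crepe, HUBERT_MEMORY = 2900 MiB otherwise), clamps to at least 1, caps at cpu_count(), and finally caps at roughly one worker per 16 files. A worked example, assuming a hypothetical 8 GiB (8192 MiB) GPU, 8 CPU cores, and 100 wav files:

HUBERT_MEMORY = 2900        # MiB per worker for dio/harvest/parselmouth
HUBERT_MEMORY_CREPE = 3900  # MiB per worker for crepe

memory, cores, n_files = 8192, 8, 100                       # assumed machine and dataset
n_jobs = min(max(memory // HUBERT_MEMORY_CREPE, 1), cores)  # 8192 // 3900 == 2
n_jobs = min(n_files // 16 + 1, n_jobs)                     # min(7, 2) == 2
assert n_jobs == 2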
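
The split rule in preprocess_config is easy to misread: after a seeded shuffle, the first two files of each speaker become validation, the last two become test, and everything in between becomes train, which is why a speaker with four or fewer files raises an error. A tiny illustration with ten dummy names:

import numpy as np

paths = [f"{i:02d}.wav" for i in range(10)]
np.random.RandomState(1234).shuffle(paths)  # same seeded RNG as the module
val, test, train = paths[:2], paths[-2:], paths[2:-2]
assert (len(val), len(test), len(train)) == (2, 2, 6)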
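
Each cached training example that _process_one in preprocess_hubert_f0 writes is a plain torch pickle stored next to its source wav as <name>.wav.data.pt. A sketch of inspecting one such file (the path is hypothetical):

import torch

data = torch.load("dataset/44k/speaker0/clip.wav.data.pt")
# Keys written by _process_one; every value is a CPU tensor,
# and "spk" is a 0-dim long tensor holding the speaker id.
for key in ("spec", "mel_spec", "f0", "uv", "content", "audio", "spk"):
    print(key, tuple(data[key].shape))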