├── .gitignore ├── README.md ├── code ├── conf_generalists.yaml ├── conf_specialists.yaml ├── datasets │ ├── demand.csv │ ├── fsd50k.csv │ ├── librispeech.csv │ └── musan.csv ├── exp_data.py ├── exp_models.py ├── exp_utils.py ├── finetune.py ├── notebooks │ └── noise_sparsity.ipynb ├── requirements.txt ├── run.py ├── snr_predictor ├── speakers │ ├── test.csv │ ├── train.csv │ └── validation.csv ├── test.py ├── train_generalists.py └── train_specialists.py └── docs └── images ├── pse_ssl_overview.png ├── waveforms_cm.png └── waveforms_pseudose.png /.gitignore: -------------------------------------------------------------------------------- 1 | ## subdirectories 2 | notebooks/* 3 | notebooks/figures/ 4 | 5 | 6 | # Created by https://www.toptal.com/developers/gitignore/api/audio,python,pycharm+all,jupyternotebooks,windows,macos,vim,visualstudio 7 | # Edit at https://www.toptal.com/developers/gitignore?templates=audio,python,pycharm+all,jupyternotebooks,windows,macos,vim,visualstudio 8 | 9 | ### Audio ### 10 | *.aif 11 | *.iff 12 | *.m3u 13 | *.m4a 14 | *.mid 15 | *.mp3 16 | *.mpa 17 | *.ra 18 | *.wav 19 | *.wma 20 | *.ogg 21 | *.flac 22 | 23 | ### JupyterNotebooks ### 24 | # gitignore template for Jupyter Notebooks 25 | # website: http://jupyter.org/ 26 | 27 | .ipynb_checkpoints 28 | */.ipynb_checkpoints/* 29 | 30 | # IPython 31 | profile_default/ 32 | ipython_config.py 33 | 34 | # Remove previous ipynb_checkpoints 35 | # git rm -r .ipynb_checkpoints/ 36 | 37 | ### macOS ### 38 | # General 39 | .DS_Store 40 | .AppleDouble 41 | .LSOverride 42 | 43 | # Icon must end with two \r 44 | Icon 45 | 46 | 47 | # Thumbnails 48 | ._* 49 | 50 | # Files that might appear in the root of a volume 51 | .DocumentRevisions-V100 52 | .fseventsd 53 | .Spotlight-V100 54 | .TemporaryItems 55 | .Trashes 56 | .VolumeIcon.icns 57 | .com.apple.timemachine.donotpresent 58 | 59 | # Directories potentially created on remote AFP share 60 | .AppleDB 61 | .AppleDesktop 62 | Network Trash Folder 63 | Temporary Items 64 | .apdisk 65 | 66 | ### PyCharm+all ### 67 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider 68 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 69 | 70 | # User-specific stuff 71 | .idea/**/workspace.xml 72 | .idea/**/tasks.xml 73 | .idea/**/usage.statistics.xml 74 | .idea/**/dictionaries 75 | .idea/**/shelf 76 | 77 | # AWS User-specific 78 | .idea/**/aws.xml 79 | 80 | # Generated files 81 | .idea/**/contentModel.xml 82 | 83 | # Sensitive or high-churn files 84 | .idea/**/dataSources/ 85 | .idea/**/dataSources.ids 86 | .idea/**/dataSources.local.xml 87 | .idea/**/sqlDataSources.xml 88 | .idea/**/dynamic.xml 89 | .idea/**/uiDesigner.xml 90 | .idea/**/dbnavigator.xml 91 | 92 | # Gradle 93 | .idea/**/gradle.xml 94 | .idea/**/libraries 95 | 96 | # Gradle and Maven with auto-import 97 | # When using Gradle or Maven with auto-import, you should exclude module files, 98 | # since they will be recreated, and may cause churn. Uncomment if using 99 | # auto-import. 
100 | # .idea/artifacts 101 | # .idea/compiler.xml 102 | # .idea/jarRepositories.xml 103 | # .idea/modules.xml 104 | # .idea/*.iml 105 | # .idea/modules 106 | # *.iml 107 | # *.ipr 108 | 109 | # CMake 110 | cmake-build-*/ 111 | 112 | # Mongo Explorer plugin 113 | .idea/**/mongoSettings.xml 114 | 115 | # File-based project format 116 | *.iws 117 | 118 | # IntelliJ 119 | out/ 120 | 121 | # mpeltonen/sbt-idea plugin 122 | .idea_modules/ 123 | 124 | # JIRA plugin 125 | atlassian-ide-plugin.xml 126 | 127 | # Cursive Clojure plugin 128 | .idea/replstate.xml 129 | 130 | # Crashlytics plugin (for Android Studio and IntelliJ) 131 | com_crashlytics_export_strings.xml 132 | crashlytics.properties 133 | crashlytics-build.properties 134 | fabric.properties 135 | 136 | # Editor-based Rest Client 137 | .idea/httpRequests 138 | 139 | # Android studio 3.1+ serialized cache file 140 | .idea/caches/build_file_checksums.ser 141 | 142 | ### PyCharm+all Patch ### 143 | # Ignores the whole .idea folder and all .iml files 144 | # See https://github.com/joeblau/gitignore.io/issues/186 and https://github.com/joeblau/gitignore.io/issues/360 145 | 146 | .idea/ 147 | 148 | # Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-249601023 149 | 150 | *.iml 151 | modules.xml 152 | .idea/misc.xml 153 | *.ipr 154 | 155 | # Sonarlint plugin 156 | .idea/sonarlint 157 | 158 | ### Python ### 159 | # Byte-compiled / optimized / DLL files 160 | __pycache__/ 161 | *.py[cod] 162 | *$py.class 163 | 164 | # C extensions 165 | *.so 166 | 167 | # Distribution / packaging 168 | .Python 169 | build/ 170 | develop-eggs/ 171 | dist/ 172 | downloads/ 173 | eggs/ 174 | .eggs/ 175 | lib/ 176 | lib64/ 177 | parts/ 178 | sdist/ 179 | var/ 180 | wheels/ 181 | share/python-wheels/ 182 | *.egg-info/ 183 | .installed.cfg 184 | *.egg 185 | MANIFEST 186 | 187 | # PyInstaller 188 | # Usually these files are written by a python script from a template 189 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 190 | *.manifest 191 | *.spec 192 | 193 | # Installer logs 194 | pip-log.txt 195 | pip-delete-this-directory.txt 196 | 197 | # Unit test / coverage reports 198 | htmlcov/ 199 | .tox/ 200 | .nox/ 201 | .coverage 202 | .coverage.* 203 | .cache 204 | nosetests.xml 205 | coverage.xml 206 | *.cover 207 | *.py,cover 208 | .hypothesis/ 209 | .pytest_cache/ 210 | cover/ 211 | 212 | # Translations 213 | *.mo 214 | *.pot 215 | 216 | # Django stuff: 217 | *.log 218 | local_settings.py 219 | db.sqlite3 220 | db.sqlite3-journal 221 | 222 | # Flask stuff: 223 | instance/ 224 | .webassets-cache 225 | 226 | # Scrapy stuff: 227 | .scrapy 228 | 229 | # Sphinx documentation 230 | docs/_build/ 231 | 232 | # PyBuilder 233 | .pybuilder/ 234 | target/ 235 | 236 | # Jupyter Notebook 237 | 238 | # IPython 239 | 240 | # pyenv 241 | # For a library or package, you might want to ignore these files since the code is 242 | # intended to run in multiple environments; otherwise, check them in: 243 | # .python-version 244 | 245 | # pipenv 246 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 247 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 248 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 249 | # install all needed dependencies. 250 | #Pipfile.lock 251 | 252 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 253 | __pypackages__/ 254 | 255 | # Celery stuff 256 | celerybeat-schedule 257 | celerybeat.pid 258 | 259 | # SageMath parsed files 260 | *.sage.py 261 | 262 | # Environments 263 | .env 264 | .venv 265 | env/ 266 | venv/ 267 | ENV/ 268 | env.bak/ 269 | venv.bak/ 270 | 271 | # Spyder project settings 272 | .spyderproject 273 | .spyproject 274 | 275 | # Rope project settings 276 | .ropeproject 277 | 278 | # mkdocs documentation 279 | /site 280 | 281 | # mypy 282 | .mypy_cache/ 283 | .dmypy.json 284 | dmypy.json 285 | 286 | # Pyre type checker 287 | .pyre/ 288 | 289 | # pytype static type analyzer 290 | .pytype/ 291 | 292 | # Cython debug symbols 293 | cython_debug/ 294 | 295 | ### Vim ### 296 | # Swap 297 | [._]*.s[a-v][a-z] 298 | !*.svg # comment out if you don't need vector files 299 | [._]*.sw[a-p] 300 | [._]s[a-rt-v][a-z] 301 | [._]ss[a-gi-z] 302 | [._]sw[a-p] 303 | 304 | # Session 305 | Session.vim 306 | Sessionx.vim 307 | 308 | # Temporary 309 | .netrwhist 310 | *~ 311 | # Auto-generated tag files 312 | tags 313 | # Persistent undo 314 | [._]*.un~ 315 | 316 | ### Windows ### 317 | # Windows thumbnail cache files 318 | Thumbs.db 319 | Thumbs.db:encryptable 320 | ehthumbs.db 321 | ehthumbs_vista.db 322 | 323 | # Dump file 324 | *.stackdump 325 | 326 | # Folder config file 327 | [Dd]esktop.ini 328 | 329 | # Recycle Bin used on file shares 330 | $RECYCLE.BIN/ 331 | 332 | # Windows Installer files 333 | *.cab 334 | *.msi 335 | *.msix 336 | *.msm 337 | *.msp 338 | 339 | # Windows shortcuts 340 | *.lnk 341 | 342 | ### VisualStudio ### 343 | ## Ignore Visual Studio temporary files, build results, and 344 | ## files generated by popular Visual Studio add-ons. 345 | ## 346 | ## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore 347 | 348 | # User-specific files 349 | *.rsuser 350 | *.suo 351 | *.user 352 | *.userosscache 353 | *.sln.docstates 354 | 355 | # User-specific files (MonoDevelop/Xamarin Studio) 356 | *.userprefs 357 | 358 | # Mono auto generated files 359 | mono_crash.* 360 | 361 | # Build results 362 | [Dd]ebug/ 363 | [Dd]ebugPublic/ 364 | [Rr]elease/ 365 | [Rr]eleases/ 366 | x64/ 367 | x86/ 368 | [Ww][Ii][Nn]32/ 369 | [Aa][Rr][Mm]/ 370 | [Aa][Rr][Mm]64/ 371 | bld/ 372 | [Bb]in/ 373 | [Oo]bj/ 374 | [Ll]og/ 375 | [Ll]ogs/ 376 | 377 | # Visual Studio 2015/2017 cache/options directory 378 | .vs/ 379 | # Uncomment if you have tasks that create the project's static files in wwwroot 380 | #wwwroot/ 381 | 382 | # Visual Studio 2017 auto generated files 383 | Generated\ Files/ 384 | 385 | # MSTest test Results 386 | [Tt]est[Rr]esult*/ 387 | [Bb]uild[Ll]og.* 388 | 389 | # NUnit 390 | *.VisualState.xml 391 | TestResult.xml 392 | nunit-*.xml 393 | 394 | # Build Results of an ATL Project 395 | [Dd]ebugPS/ 396 | [Rr]eleasePS/ 397 | dlldata.c 398 | 399 | # Benchmark Results 400 | BenchmarkDotNet.Artifacts/ 401 | 402 | # .NET Core 403 | project.lock.json 404 | project.fragment.lock.json 405 | artifacts/ 406 | 407 | # ASP.NET Scaffolding 408 | ScaffoldingReadMe.txt 409 | 410 | # StyleCop 411 | StyleCopReport.xml 412 | 413 | # Files built by Visual Studio 414 | *_i.c 415 | *_p.c 416 | *_h.h 417 | *.ilk 418 | *.meta 419 | *.obj 420 | *.iobj 421 | *.pch 422 | *.pdb 423 | *.ipdb 424 | *.pgc 425 | *.pgd 426 | *.rsp 427 | *.sbr 428 | *.tlb 429 | *.tli 430 | *.tlh 431 | *.tmp 432 | *.tmp_proj 433 | *_wpftmp.csproj 434 | *.tlog 435 | *.vspscc 436 | *.vssscc 437 | .builds 438 | *.pidb 439 | *.svclog 440 | *.scc 441 | 442 | # Chutzpah Test 
files 443 | _Chutzpah* 444 | 445 | # Visual C++ cache files 446 | ipch/ 447 | *.aps 448 | *.ncb 449 | *.opendb 450 | *.opensdf 451 | *.sdf 452 | *.cachefile 453 | *.VC.db 454 | *.VC.VC.opendb 455 | 456 | # Visual Studio profiler 457 | *.psess 458 | *.vsp 459 | *.vspx 460 | *.sap 461 | 462 | # Visual Studio Trace Files 463 | *.e2e 464 | 465 | # TFS 2012 Local Workspace 466 | $tf/ 467 | 468 | # Guidance Automation Toolkit 469 | *.gpState 470 | 471 | # ReSharper is a .NET coding add-in 472 | _ReSharper*/ 473 | *.[Rr]e[Ss]harper 474 | *.DotSettings.user 475 | 476 | # TeamCity is a build add-in 477 | _TeamCity* 478 | 479 | # DotCover is a Code Coverage Tool 480 | *.dotCover 481 | 482 | # AxoCover is a Code Coverage Tool 483 | .axoCover/* 484 | !.axoCover/settings.json 485 | 486 | # Coverlet is a free, cross platform Code Coverage Tool 487 | coverage*.json 488 | coverage*.xml 489 | coverage*.info 490 | 491 | # Visual Studio code coverage results 492 | *.coverage 493 | *.coveragexml 494 | 495 | # NCrunch 496 | _NCrunch_* 497 | .*crunch*.local.xml 498 | nCrunchTemp_* 499 | 500 | # MightyMoose 501 | *.mm.* 502 | AutoTest.Net/ 503 | 504 | # Web workbench (sass) 505 | .sass-cache/ 506 | 507 | # Installshield output folder 508 | [Ee]xpress/ 509 | 510 | # DocProject is a documentation generator add-in 511 | DocProject/buildhelp/ 512 | DocProject/Help/*.HxT 513 | DocProject/Help/*.HxC 514 | DocProject/Help/*.hhc 515 | DocProject/Help/*.hhk 516 | DocProject/Help/*.hhp 517 | DocProject/Help/Html2 518 | DocProject/Help/html 519 | 520 | # Click-Once directory 521 | publish/ 522 | 523 | # Publish Web Output 524 | *.[Pp]ublish.xml 525 | *.azurePubxml 526 | # Note: Comment the next line if you want to checkin your web deploy settings, 527 | # but database connection strings (with potential passwords) will be unencrypted 528 | *.pubxml 529 | *.publishproj 530 | 531 | # Microsoft Azure Web App publish settings. Comment the next line if you want to 532 | # checkin your Azure Web App publish settings, but sensitive information contained 533 | # in these scripts will be unencrypted 534 | PublishScripts/ 535 | 536 | # NuGet Packages 537 | *.nupkg 538 | # NuGet Symbol Packages 539 | *.snupkg 540 | # The packages folder can be ignored because of Package Restore 541 | **/[Pp]ackages/* 542 | # except build/, which is used as an MSBuild target. 
543 | !**/[Pp]ackages/build/ 544 | # Uncomment if necessary however generally it will be regenerated when needed 545 | #!**/[Pp]ackages/repositories.config 546 | # NuGet v3's project.json files produces more ignorable files 547 | *.nuget.props 548 | *.nuget.targets 549 | 550 | # Nuget personal access tokens and Credentials 551 | nuget.config 552 | 553 | # Microsoft Azure Build Output 554 | csx/ 555 | *.build.csdef 556 | 557 | # Microsoft Azure Emulator 558 | ecf/ 559 | rcf/ 560 | 561 | # Windows Store app package directories and files 562 | AppPackages/ 563 | BundleArtifacts/ 564 | Package.StoreAssociation.xml 565 | _pkginfo.txt 566 | *.appx 567 | *.appxbundle 568 | *.appxupload 569 | 570 | # Visual Studio cache files 571 | # files ending in .cache can be ignored 572 | *.[Cc]ache 573 | # but keep track of directories ending in .cache 574 | !?*.[Cc]ache/ 575 | 576 | # Others 577 | ClientBin/ 578 | ~$* 579 | *.dbmdl 580 | *.dbproj.schemaview 581 | *.jfm 582 | *.pfx 583 | *.publishsettings 584 | orleans.codegen.cs 585 | 586 | # Including strong name files can present a security risk 587 | # (https://github.com/github/gitignore/pull/2483#issue-259490424) 588 | #*.snk 589 | 590 | # Since there are multiple workflows, uncomment next line to ignore bower_components 591 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) 592 | #bower_components/ 593 | 594 | # RIA/Silverlight projects 595 | Generated_Code/ 596 | 597 | # Backup & report files from converting an old project file 598 | # to a newer Visual Studio version. Backup files are not needed, 599 | # because we have git ;-) 600 | _UpgradeReport_Files/ 601 | Backup*/ 602 | UpgradeLog*.XML 603 | UpgradeLog*.htm 604 | ServiceFabricBackup/ 605 | *.rptproj.bak 606 | 607 | # SQL Server files 608 | *.mdf 609 | *.ldf 610 | *.ndf 611 | 612 | # Business Intelligence projects 613 | *.rdl.data 614 | *.bim.layout 615 | *.bim_*.settings 616 | *.rptproj.rsuser 617 | *- [Bb]ackup.rdl 618 | *- [Bb]ackup ([0-9]).rdl 619 | *- [Bb]ackup ([0-9][0-9]).rdl 620 | 621 | # Microsoft Fakes 622 | FakesAssemblies/ 623 | 624 | # GhostDoc plugin setting file 625 | *.GhostDoc.xml 626 | 627 | # Node.js Tools for Visual Studio 628 | .ntvs_analysis.dat 629 | node_modules/ 630 | 631 | # Visual Studio 6 build log 632 | *.plg 633 | 634 | # Visual Studio 6 workspace options file 635 | *.opt 636 | 637 | # Visual Studio 6 auto-generated workspace file (contains which files were open etc.) 
638 | *.vbw 639 | 640 | # Visual Studio LightSwitch build output 641 | **/*.HTMLClient/GeneratedArtifacts 642 | **/*.DesktopClient/GeneratedArtifacts 643 | **/*.DesktopClient/ModelManifest.xml 644 | **/*.Server/GeneratedArtifacts 645 | **/*.Server/ModelManifest.xml 646 | _Pvt_Extensions 647 | 648 | # Paket dependency manager 649 | .paket/paket.exe 650 | paket-files/ 651 | 652 | # FAKE - F# Make 653 | .fake/ 654 | 655 | # CodeRush personal settings 656 | .cr/personal 657 | 658 | # Python Tools for Visual Studio (PTVS) 659 | *.pyc 660 | 661 | # Cake - Uncomment if you are using it 662 | # tools/** 663 | # !tools/packages.config 664 | 665 | # Tabs Studio 666 | *.tss 667 | 668 | # Telerik's JustMock configuration file 669 | *.jmconfig 670 | 671 | # BizTalk build output 672 | *.btp.cs 673 | *.btm.cs 674 | *.odx.cs 675 | *.xsd.cs 676 | 677 | # OpenCover UI analysis results 678 | OpenCover/ 679 | 680 | # Azure Stream Analytics local run output 681 | ASALocalRun/ 682 | 683 | # MSBuild Binary and Structured Log 684 | *.binlog 685 | 686 | # NVidia Nsight GPU debugger configuration file 687 | *.nvuser 688 | 689 | # MFractors (Xamarin productivity tool) working folder 690 | .mfractor/ 691 | 692 | # Local History for Visual Studio 693 | .localhistory/ 694 | 695 | # BeatPulse healthcheck temp database 696 | healthchecksdb 697 | 698 | # Backup folder for Package Reference Convert tool in Visual Studio 2017 699 | MigrationBackup/ 700 | 701 | # Ionide (cross platform F# VS Code tools) working folder 702 | .ionide/ 703 | 704 | # Fody - auto-generated XML schema 705 | FodyWeavers.xsd 706 | 707 | # VS Code files for those working on multiple tools 708 | .vscode/* 709 | !.vscode/settings.json 710 | !.vscode/tasks.json 711 | !.vscode/launch.json 712 | !.vscode/extensions.json 713 | *.code-workspace 714 | 715 | # Local History for Visual Studio Code 716 | .history/ 717 | 718 | # Windows Installer files from build outputs 719 | 720 | # JetBrains Rider 721 | *.sln.iml 722 | 723 | ### VisualStudio Patch ### 724 | # Additional files built by Visual Studio 725 | 726 | # End of https://www.toptal.com/developers/gitignore/api/audio,python,pycharm+all,jupyternotebooks,windows,macos,vim,visualstudio 727 | 728 | # ignore everything inside of checkpoint directories 729 | *.json 730 | *.pt 731 | events.out.* 732 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Efficient Personalized Speech Enhancement through Self-Supervised Learning 2 | 3 | [Aswin Sivaraman](https://actuallyaswin.github.io/) and [Minje Kim](https://saige.sice.indiana.edu) (Indiana University) 4 | 5 | 6 | [![Paper](https://img.shields.io/badge/Web-Paper-blue)](https://ieeexplore.ieee.org/abstract/document/9794565/) 7 | 8 | ## Abstract 9 | This work presents self-supervised learning (SSL) methods for developing monaural speaker-specific (i.e., personalized) speech enhancement (SE) models. While generalist models must broadly address many speakers, specialist models can adapt their enhancement function towards a particular speaker's voice, expecting to solve a narrower problem. Hence, specialists are capable of achieving more optimal performance in addition to reducing computational complexity. However, naive personalization methods can require clean speech from the target user, which is inconvenient to acquire, e.g., due to subpar recording conditions. 
To this end, we pose personalization as either a zero-shot learning (ZSL) task, in which no additional clean speech of the target speaker is used for training, or a few-shot learning (FSL) task, in which the goal is to minimize the duration of the clean speech used for transfer learning. In this paper, we propose self-supervised learning methods as a solution to both zero- and few-shot personalization tasks. The proposed methods are designed to learn personalized speech features from unlabeled data (i.e., in-the-wild noisy recordings from the target user) without knowing the corresponding clean sources. Our experiments investigate three different self-supervised learning mechanisms. The results show that the self-supervised models achieve zero-shot and few-shot personalization using fewer model parameters and less clean data from the target user, achieving both the data-efficiency and model-compression goals. 10 | 11 |
12 | ![Overview of Self-Supervised PSE Methods](docs/images/pse_ssl_overview.png) 14 |
An overview of the baseline and proposed personalization methods. With the baseline, the SE model is first pretrained as a generalist using a large speaker-agnostic dataset and then finetuned using clean speech signals of the test user. This method relies entirely on the finetuning process for personalization. In contrast, the proposed methods provide various SSL options to pretrain the model using noisy but speaker-specific speech, which serves as a better initialization point for the subsequent finetuning process, leading to better SE performance. The pretrained models can also perform a certain level of SE on their own as ZSL models, while FSL-based finetuning tends to improve them further.
15 |
16 | 17 | 18 | ## Proposed Methods 19 | 20 | ### Pseudo Speech Enhancement (PseudoSE) 21 | ![Pseudo Speech Enhancement](docs/images/waveforms_pseudose.png) 22 | 23 | ### Contrastive Mixtures (CM) 24 | ![Contrastive Mixtures](docs/images/waveforms_cm.png) 25 | 26 | ### Data Purification (DP) 27 | Data Purification 28 | 29 | Note that DP may be applied to the loss functions of either PseudoSE or CM. 30 | 31 | ## Installation & Usage 32 | 33 | Use pip to install the necessary Python packages (e.g., [pytorch-lightning](https://pytorch-lightning.readthedocs.io/en/stable/), [ray[tune]](https://docs.ray.io/en/latest/tune/), and [asteroid](https://asteroid-team.github.io/)). 34 | 35 | ``` 36 | pip install -r requirements.txt 37 | ``` 38 | 39 | Additionally, the following datasets must be downloaded and unzipped: 40 | + [Librispeech](http://www.openslr.org/12/) 41 | + [FSD50K](https://zenodo.org/record/4060432) 42 | + [MUSAN](http://www.openslr.org/17/) 43 | 44 | We define a _generalist_ model as one which is speaker-agnostic; it is trained to enhance the voices of many different speakers. A _specialist_ model is one which is trained to enhance the voice of a single speaker. In this experiment, we train specialist models entirely using degraded recordings (sampling a single speaker from **Librispeech** and unseen noises from **FSD50K**). 45 | 46 | To train generalist models, first modify `code/conf_generalists.yaml` with the correct folder paths, then run: 47 | ``` 48 | python code/train_generalists.py 49 | ``` 50 | Similarly, to train specialist models, first modify `code/conf_specialists.yaml` with the correct folder paths, then run: 51 | ``` 52 | python code/train_specialists.py 53 | ``` 54 | 55 | Each YAML configuration file defines the experiment search space, and all values provided in a list expand the search space. For example, the provided `conf_generalists.yaml` will run four different experiments: 56 | 57 | 1. *{batch_size=64, model_name=convtasnet, model_size=tiny, distance_func=snr}* 58 | 2. *{batch_size=64, model_name=convtasnet, model_size=small, distance_func=snr}* 59 | 3. *{batch_size=64, model_name=convtasnet, model_size=medium, distance_func=snr}* 60 | 4. *{batch_size=64, model_name=convtasnet, model_size=large, distance_func=snr}* 61 | 62 | The experiments may be run across multiple GPUs and CPUs, which can be specified in the above YAML files. 63 | 64 | ## Citation 65 | 66 | ``` 67 | @article{SivaramanA2022jstsp, 68 | title={{Efficient Personalized Speech Enhancement Through Self-Supervised Learning}}, 69 | author={Sivaraman, Aswin and Kim, Minje}, 70 | journal={{IEEE Journal of Selected Topics in Signal Processing}}, 71 | year={2022}, 72 | volume={16}, 73 | number={6}, 74 | pages={1342--1356} 75 | } 76 | ``` 77 | -------------------------------------------------------------------------------- /code/conf_generalists.yaml: -------------------------------------------------------------------------------- 1 | # This YAML configuration file defines the generalists experiment search space. 2 | # All values provided in a list expand the search space. 3 | # 4 | # For example, if model_name is ["convtasnet", "grunet"] and batch_size is 5 | # [32, 64, 128], then there are six experiments total. As another example, if 6 | # model_name and batch_size are defined as before, and learning_rate is 7 | # [0.001, 0.01, 0.1] and distance_func is ['snr', 'mse'], then there would 8 | # be thirty-six experiments total to run.
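# To make that arithmetic explicit (using the illustrative values from the
# example above, not the defaults set below), the search space is the
# Cartesian product of all list-valued keys:
#   model_name    -> 2 choices
#   batch_size    -> 3 choices
#   learning_rate -> 3 choices
#   distance_func -> 2 choices
# i.e., 2 x 3 x 3 x 2 = 36 experiment configurations.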
9 | 10 | # ----------------------------------------------------------------------------- 11 | # ray configuration 12 | # ----------------------------------------------------------------------------- 13 | available_devices: '4,5,6,7' # (these should be GPU IDs, check `nvidia-smi`) 14 | num_gpus_per_experiment: 0.5 15 | num_cpus_per_experiment: 0.5 16 | output_folder: "~/ray_results/" 17 | 18 | # ----------------------------------------------------------------------------- 19 | # data configuration 20 | # ----------------------------------------------------------------------------- 21 | folder_librispeech: "/data/asivara/librispeech/" 22 | folder_musan: "/data/asivara/musan/" 23 | 24 | sample_rate: 16000 25 | example_duration: 4 # (in seconds) 26 | 27 | batch_size: # (should be specified based on available GPU memory) 28 | - 64 29 | 30 | # ----------------------------------------------------------------------------- 31 | # model configuration 32 | # ----------------------------------------------------------------------------- 33 | model_name: # (choices: "convtasnet", "grunet") 34 | - 'convtasnet' 35 | # - 'grunet' 36 | 37 | model_size: # (choices: "tiny", "small", "medium", "large") 38 | - 'tiny' 39 | - 'small' 40 | - 'medium' 41 | - 'large' 42 | 43 | learning_rate: 44 | - 0.001 45 | 46 | distance_func: # (distance function used to compare estimate & target signals) 47 | - 'snr' 48 | # - 'sisdr' 49 | # - 'mse' -------------------------------------------------------------------------------- /code/conf_specialists.yaml: -------------------------------------------------------------------------------- 1 | # This YAML configuration file defines the specialists experiment search space. 2 | # All values provided in a list expand the search space. 3 | # 4 | # For example, if model_name is ["convtasnet", "grunet"] and speaker_id is 5 | # [19, 26, 39], then there are six experiments total. As another example, if 6 | # model_name and speaker_id are defined as before, and use_loss_contrastive is 7 | # [False, True] and use_loss_purification is [False, True], then there would 8 | # be twenty-four experiments total to run. 9 | # 10 | # Most importantly, the speaker_id value designates the personalization target; 11 | # the self-supervised models are trained using already-noisy single-speaker 12 | # observations (e.g., 19 + FSD50K, 26 + FSD50K). 
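# To make that arithmetic explicit (using the illustrative values from the
# example above, not the full speaker list below), the search space is the
# Cartesian product of all list-valued keys:
#   model_name            -> 2 choices
#   speaker_id            -> 3 choices
#   use_loss_contrastive  -> 2 choices
#   use_loss_purification -> 2 choices
# i.e., 2 x 3 x 2 x 2 = 24 experiments, where the four loss-flag combinations
# correspond to the self-supervised variants described in the README:
# PseudoSE, PseudoSE + DP, CM, and CM + DP.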
13 | 14 | # ----------------------------------------------------------------------------- 15 | # ray configuration 16 | # ----------------------------------------------------------------------------- 17 | available_devices: '4,5,6,7' # (these should be GPU IDs, check `nvidia-smi`) 18 | num_gpus_per_experiment: 0.5 19 | num_cpus_per_experiment: 0.5 20 | output_folder: "~/ray_results/" 21 | 22 | # ----------------------------------------------------------------------------- 23 | # data configuration 24 | # ----------------------------------------------------------------------------- 25 | folder_librispeech: "/data/asivara/librispeech/" 26 | folder_fsd50k: "/data/asivara/fsd50k_16khz/" # (manually resampled to 16kHz) 27 | folder_musan: "/data/asivara/musan/" 28 | 29 | sample_rate: 16000 30 | example_duration: 4 # (in seconds) 31 | 32 | batch_size: # (should be specified based on available GPU memory) 33 | - 64 34 | 35 | speaker_id: # (Librispeech IDs / must be from "speakers/test.csv") 36 | - 19 37 | - 26 38 | - 39 39 | - 40 40 | - 78 41 | - 83 42 | - 87 43 | - 89 44 | - 118 45 | - 125 46 | - 163 47 | - 196 48 | - 198 49 | - 200 50 | - 201 51 | - 250 52 | - 254 53 | - 307 54 | - 405 55 | - 446 56 | 57 | # ----------------------------------------------------------------------------- 58 | # model configuration 59 | # ----------------------------------------------------------------------------- 60 | model_name: # (choices: "convtasnet", "grunet") 61 | - 'convtasnet' 62 | # - 'grunet' 63 | 64 | model_size: # (choices: "tiny", "small", "medium", "large") 65 | - 'tiny' 66 | - 'small' 67 | - 'medium' 68 | - 'large' 69 | 70 | learning_rate: 71 | - 0.001 72 | 73 | distance_func: # (distance function used to compare estimate & target signals) 74 | - 'snr' 75 | # - 'sisdr' 76 | # - 'mse' 77 | 78 | # ----------------------------------------------------------------------------- 79 | # self-supervised loss function configuration 80 | # ----------------------------------------------------------------------------- 81 | use_loss_contrastive: # enables CM training (choices: False, True) 82 | - False 83 | - True 84 | 85 | use_loss_purification: # enables DP training (choices: False, True) 86 | - False 87 | - True -------------------------------------------------------------------------------- /code/datasets/demand.csv: -------------------------------------------------------------------------------- 1 | category_id,location_id,filepath,duration,sparsity 2 | domestic,kitchen,DKITCHEN/ch01.wav,300.004,0.0029915303457528353 3 | domestic,living,DLIVING/ch01.wav,300.004,0.00037294832873158157 4 | domestic,washing,DWASHING/ch01.wav,300.004,0.005440954118967056 5 | nature,field,NFIELD/ch01.wav,300.004,0.002194696804508567 6 | nature,park,NPARK/ch01.wav,300.004,0.0009642631048336625 7 | nature,river,NRIVER/ch01.wav,300.004,0.006473191548138857 8 | office,hallway,OHALLWAY/ch01.wav,300.004,0.0022714261431246996 9 | office,meeting,OMEETING/ch01.wav,300.004,0.001002078759483993 10 | office,office,OOFFICE/ch01.wav,300.004,0.0047870515845716 11 | public,cafeter,PCAFETER/ch01.wav,300.004,0.0021682872902601957 12 | public,resto,PRESTO/ch01.wav,300.004,0.0015622287755832076 13 | public,station,PSTATION/ch01.wav,300.004,0.0021246219985187054 14 | street,psquare,SPSQUARE/ch01.wav,300.004,0.005154266022145748 15 | street,traffic,STRAFFIC/ch01.wav,300.004,0.007706298492848873 16 | transportation,bus,TBUS/ch01.wav,300.004,0.05816786736249924 17 | transportation,car,TCAR/ch01.wav,300.004,0.03642342612147331 18 | 
transportation,metro,TMETRO/ch01.wav,300.004,0.019693519920110703 19 | -------------------------------------------------------------------------------- /code/datasets/musan.csv: -------------------------------------------------------------------------------- 1 | split,filepath,duration,sparsity 2 | train,noise/free-sound/noise-free-sound-0000.wav,17.65875,0.1557888239622116 3 | train,noise/free-sound/noise-free-sound-0001.wav,40.5365,0.027538767084479332 4 | train,noise/free-sound/noise-free-sound-0002.wav,71.2175,0.07518414407968521 5 | train,noise/free-sound/noise-free-sound-0003.wav,14.2358125,0.02093704789876938 6 | train,noise/free-sound/noise-free-sound-0004.wav,2.9865,0.08528786152601242 7 | train,noise/free-sound/noise-free-sound-0005.wav,21.1853125,0.17761875689029694 8 | train,noise/free-sound/noise-free-sound-0006.wav,7.7701875,0.192704975605011 9 | train,noise/free-sound/noise-free-sound-0007.wav,16.613875,0.05709356814622879 10 | train,noise/free-sound/noise-free-sound-0008.wav,20.8050625,0.01302085630595684 11 | train,noise/free-sound/noise-free-sound-0009.wav,8.3866875,0.017926516011357307 12 | train,noise/free-sound/noise-free-sound-0010.wav,8.2415625,0.10532404482364655 13 | train,noise/free-sound/noise-free-sound-0011.wav,34.8501875,0.019042056053876877 14 | train,noise/free-sound/noise-free-sound-0012.wav,32.15625,0.1082201898097992 15 | train,noise/free-sound/noise-free-sound-0013.wav,1.5445,0.14064553380012512 16 | train,noise/free-sound/noise-free-sound-0014.wav,3.4159375,0.11936260014772415 17 | train,noise/free-sound/noise-free-sound-0015.wav,91.244875,0.031160425394773483 18 | train,noise/free-sound/noise-free-sound-0016.wav,4.0,0.21168480813503265 19 | train,noise/free-sound/noise-free-sound-0017.wav,7.8773125,0.03246782720088959 20 | train,noise/free-sound/noise-free-sound-0018.wav,14.5763125,0.15054874122142792 21 | train,noise/free-sound/noise-free-sound-0019.wav,11.55775,0.03360776603221893 22 | train,noise/free-sound/noise-free-sound-0020.wav,60.18575,0.005166677292436361 23 | train,noise/free-sound/noise-free-sound-0021.wav,18.0453125,0.04779457673430443 24 | train,noise/free-sound/noise-free-sound-0022.wav,24.832,0.12185768783092499 25 | train,noise/free-sound/noise-free-sound-0023.wav,11.24,0.11853642016649246 26 | train,noise/free-sound/noise-free-sound-0024.wav,19.7790625,0.028153477236628532 27 | train,noise/free-sound/noise-free-sound-0025.wav,2.3466875,0.09568151831626892 28 | train,noise/free-sound/noise-free-sound-0026.wav,3.7973125,0.19289067387580872 29 | train,noise/free-sound/noise-free-sound-0027.wav,23.04,0.02183876745402813 30 | train,noise/free-sound/noise-free-sound-0028.wav,19.6179375,0.1307886242866516 31 | train,noise/free-sound/noise-free-sound-0029.wav,8.90775,0.1712813675403595 32 | train,noise/free-sound/noise-free-sound-0030.wav,73.01225,0.14316979050636292 33 | train,noise/free-sound/noise-free-sound-0031.wav,35.4743125,0.06783615797758102 34 | train,noise/free-sound/noise-free-sound-0032.wav,61.42,0.007194836623966694 35 | train,noise/free-sound/noise-free-sound-0033.wav,5.5263125,0.015329566784203053 36 | train,noise/free-sound/noise-free-sound-0034.wav,20.689,0.022331789135932922 37 | train,noise/free-sound/noise-free-sound-0035.wav,31.9333125,0.04504496231675148 38 | train,noise/free-sound/noise-free-sound-0036.wav,4.57,0.21145282685756683 39 | train,noise/free-sound/noise-free-sound-0037.wav,0.94,0.2134968787431717 40 | train,noise/free-sound/noise-free-sound-0038.wav,0.9133125,0.15371179580688477 41 | 
train,noise/free-sound/noise-free-sound-0039.wav,3.4466875,0.21756970882415771 42 | train,noise/free-sound/noise-free-sound-0040.wav,16.73,0.21240685880184174 43 | train,noise/free-sound/noise-free-sound-0041.wav,57.782875,0.034967090934515 44 | train,noise/free-sound/noise-free-sound-0042.wav,319.5559375,0.006530998274683952 45 | train,noise/free-sound/noise-free-sound-0043.wav,1.5325,0.22383399307727814 46 | train,noise/free-sound/noise-free-sound-0044.wav,4.5975625,0.2571035325527191 47 | train,noise/free-sound/noise-free-sound-0045.wav,4.644,0.21450121700763702 48 | train,noise/free-sound/noise-free-sound-0046.wav,4.62075,0.21786445379257202 49 | train,noise/free-sound/noise-free-sound-0047.wav,5.2966875,0.1274796426296234 50 | train,noise/free-sound/noise-free-sound-0048.wav,494.6599375,0.010682842694222927 51 | train,noise/free-sound/noise-free-sound-0049.wav,3.4666875,0.24981054663658142 52 | train,noise/free-sound/noise-free-sound-0050.wav,4.133125,0.020578665658831596 53 | train,noise/free-sound/noise-free-sound-0051.wav,6.52,0.207366943359375 54 | train,noise/free-sound/noise-free-sound-0052.wav,5.6641875,0.009431295096874237 55 | train,noise/free-sound/noise-free-sound-0053.wav,5.2719375,0.02078789472579956 56 | train,noise/free-sound/noise-free-sound-0054.wav,10.053375,0.02097940258681774 57 | train,noise/free-sound/noise-free-sound-0055.wav,40.09775,0.05011861026287079 58 | train,noise/free-sound/noise-free-sound-0056.wav,9.74275,0.05332687869668007 59 | train,noise/free-sound/noise-free-sound-0057.wav,8.1304375,0.14586451649665833 60 | train,noise/free-sound/noise-free-sound-0058.wav,2.0,0.03634824976325035 61 | train,noise/free-sound/noise-free-sound-0059.wav,30.4428125,0.01942417398095131 62 | train,noise/free-sound/noise-free-sound-0060.wav,1.044875,0.056959543377161026 63 | train,noise/free-sound/noise-free-sound-0061.wav,11.0498125,0.13177379965782166 64 | train,noise/free-sound/noise-free-sound-0062.wav,6.0666875,0.10277925431728363 65 | train,noise/free-sound/noise-free-sound-0063.wav,6.2666875,0.09506578743457794 66 | train,noise/free-sound/noise-free-sound-0064.wav,100.1333125,0.039749082177877426 67 | train,noise/free-sound/noise-free-sound-0065.wav,6.9753125,0.11899636685848236 68 | train,noise/free-sound/noise-free-sound-0066.wav,60.9436875,0.042037464678287506 69 | train,noise/free-sound/noise-free-sound-0067.wav,39.889,0.014235257171094418 70 | train,noise/free-sound/noise-free-sound-0068.wav,20.075,0.041847988963127136 71 | train,noise/free-sound/noise-free-sound-0069.wav,2.5714375,0.1221093162894249 72 | train,noise/free-sound/noise-free-sound-0070.wav,17.98025,0.13005472719669342 73 | train,noise/free-sound/noise-free-sound-0071.wav,9.3919375,0.1472863107919693 74 | train,noise/free-sound/noise-free-sound-0072.wav,26.6225625,0.08104076981544495 75 | train,noise/free-sound/noise-free-sound-0073.wav,124.5685625,0.032078467309474945 76 | train,noise/free-sound/noise-free-sound-0074.wav,17.24,0.01517713163048029 77 | train,noise/free-sound/noise-free-sound-0075.wav,20.31125,0.007535384967923164 78 | train,noise/free-sound/noise-free-sound-0076.wav,80.0,0.04280521720647812 79 | train,noise/free-sound/noise-free-sound-0077.wav,11.7233125,0.017991654574871063 80 | train,noise/free-sound/noise-free-sound-0078.wav,10.370625,0.022274833172559738 81 | train,noise/free-sound/noise-free-sound-0079.wav,6.0005625,0.1618564873933792 82 | train,noise/free-sound/noise-free-sound-0080.wav,30.374625,0.06642832607030869 83 | 
train,noise/free-sound/noise-free-sound-0081.wav,21.5824375,0.08998014777898788 84 | train,noise/free-sound/noise-free-sound-0082.wav,20.20375,0.0831134244799614 85 | train,noise/free-sound/noise-free-sound-0083.wav,0.59125,0.13372854888439178 86 | train,noise/free-sound/noise-free-sound-0084.wav,19.4194375,0.07963409274816513 87 | train,noise/free-sound/noise-free-sound-0085.wav,4.671375,0.01780180260539055 88 | train,noise/free-sound/noise-free-sound-0086.wav,51.685,0.009196894243359566 89 | train,noise/free-sound/noise-free-sound-0087.wav,2.8,0.051390185952186584 90 | train,noise/free-sound/noise-free-sound-0088.wav,44.8666875,0.006017704494297504 91 | train,noise/free-sound/noise-free-sound-0089.wav,13.871,0.012411841191351414 92 | train,noise/free-sound/noise-free-sound-0090.wav,5.8441875,0.09183882921934128 93 | train,noise/free-sound/noise-free-sound-0091.wav,5.062125,0.04581494256854057 94 | train,noise/free-sound/noise-free-sound-0092.wav,15.079625,0.019245458766818047 95 | train,noise/free-sound/noise-free-sound-0093.wav,5.317375,0.03296150639653206 96 | train,noise/free-sound/noise-free-sound-0094.wav,5.5898125,0.03426990285515785 97 | train,noise/free-sound/noise-free-sound-0095.wav,4.068375,0.010393367148935795 98 | train,noise/free-sound/noise-free-sound-0096.wav,7.7841875,0.030091799795627594 99 | train,noise/free-sound/noise-free-sound-0097.wav,2.7369375,0.026996226981282234 100 | train,noise/free-sound/noise-free-sound-0098.wav,2.766,0.04624796658754349 101 | train,noise/free-sound/noise-free-sound-0099.wav,7.4946875,0.05058348551392555 102 | train,noise/free-sound/noise-free-sound-0100.wav,7.323125,0.044734448194503784 103 | train,noise/free-sound/noise-free-sound-0101.wav,112.90125,0.018048448488116264 104 | train,noise/free-sound/noise-free-sound-0102.wav,143.62125,0.010415861383080482 105 | train,noise/free-sound/noise-free-sound-0103.wav,5.2381875,0.05413604527711868 106 | train,noise/free-sound/noise-free-sound-0104.wav,15.888,0.030909907072782516 107 | train,noise/free-sound/noise-free-sound-0105.wav,306.4163125,0.012084255926311016 108 | train,noise/free-sound/noise-free-sound-0106.wav,153.952,0.027553599327802658 109 | train,noise/free-sound/noise-free-sound-0107.wav,11.766,0.16753751039505005 110 | train,noise/free-sound/noise-free-sound-0108.wav,19.3963125,0.026959596201777458 111 | train,noise/free-sound/noise-free-sound-0109.wav,37.101875,0.008607765659689903 112 | train,noise/free-sound/noise-free-sound-0110.wav,40.8931875,0.006380096077919006 113 | train,noise/free-sound/noise-free-sound-0111.wav,16.4451875,0.015219585970044136 114 | train,noise/free-sound/noise-free-sound-0112.wav,1.0144375,0.015558876097202301 115 | train,noise/free-sound/noise-free-sound-0113.wav,5.8461875,0.07099143415689468 116 | train,noise/free-sound/noise-free-sound-0114.wav,99.9298125,0.006937652360647917 117 | train,noise/free-sound/noise-free-sound-0115.wav,2.577125,0.05000652000308037 118 | train,noise/free-sound/noise-free-sound-0116.wav,8.82075,0.09594030678272247 119 | train,noise/free-sound/noise-free-sound-0117.wav,33.6573125,0.08519165962934494 120 | train,noise/free-sound/noise-free-sound-0118.wav,17.41,0.044342827051877975 121 | train,noise/free-sound/noise-free-sound-0119.wav,70.728,0.052567992359399796 122 | train,noise/free-sound/noise-free-sound-0120.wav,18.4344375,0.2627156674861908 123 | train,noise/free-sound/noise-free-sound-0121.wav,44.0,0.03234189376235008 124 | train,noise/free-sound/noise-free-sound-0122.wav,8.0,0.031088661402463913 125 | 
train,noise/free-sound/noise-free-sound-0123.wav,7.5,0.05381748080253601 126 | train,noise/free-sound/noise-free-sound-0124.wav,5.0,0.06208133324980736 127 | train,noise/free-sound/noise-free-sound-0125.wav,1.125,0.05549873784184456 128 | train,noise/free-sound/noise-free-sound-0126.wav,19.7823125,0.016480645164847374 129 | train,noise/free-sound/noise-free-sound-0127.wav,2.448,0.041250959038734436 130 | train,noise/free-sound/noise-free-sound-0128.wav,4.056,0.033089909702539444 131 | train,noise/free-sound/noise-free-sound-0129.wav,8.2939375,0.02219713293015957 132 | train,noise/free-sound/noise-free-sound-0130.wav,38.8701875,0.05674542486667633 133 | train,noise/free-sound/noise-free-sound-0131.wav,55.597375,0.06609814614057541 134 | train,noise/free-sound/noise-free-sound-0132.wav,0.7158125,0.0999051108956337 135 | train,noise/free-sound/noise-free-sound-0133.wav,0.5209375,0.11084580421447754 136 | train,noise/free-sound/noise-free-sound-0134.wav,2.513375,0.026915693655610085 137 | train,noise/free-sound/noise-free-sound-0135.wav,3.0045625,0.0799274742603302 138 | train,noise/free-sound/noise-free-sound-0136.wav,0.625,0.08511532843112946 139 | train,noise/free-sound/noise-free-sound-0137.wav,113.25,0.011948485858738422 140 | train,noise/free-sound/noise-free-sound-0138.wav,3.25,0.04840674623847008 141 | train,noise/free-sound/noise-free-sound-0139.wav,15.5146875,0.025845933705568314 142 | train,noise/free-sound/noise-free-sound-0140.wav,24.77475,0.012172553688287735 143 | train,noise/free-sound/noise-free-sound-0141.wav,135.0720625,0.04096035286784172 144 | train,noise/free-sound/noise-free-sound-0142.wav,38.99975,0.033781711012125015 145 | train,noise/free-sound/noise-free-sound-0143.wav,13.0430625,0.013805116526782513 146 | train,noise/free-sound/noise-free-sound-0144.wav,25.54775,0.136098712682724 147 | train,noise/free-sound/noise-free-sound-0145.wav,2.9121875,0.07779847830533981 148 | train,noise/free-sound/noise-free-sound-0146.wav,2.0775,0.026216337457299232 149 | train,noise/free-sound/noise-free-sound-0147.wav,8.6333125,0.20758233964443207 150 | train,noise/free-sound/noise-free-sound-0148.wav,2.214875,0.020685670897364616 151 | train,noise/free-sound/noise-free-sound-0149.wav,1.7618125,0.0842508003115654 152 | train,noise/free-sound/noise-free-sound-0150.wav,0.4504375,0.03692903742194176 153 | train,noise/free-sound/noise-free-sound-0151.wav,53.76,0.05299970135092735 154 | train,noise/free-sound/noise-free-sound-0152.wav,97.156,0.018531352281570435 155 | train,noise/free-sound/noise-free-sound-0153.wav,72.1493125,0.008794168010354042 156 | train,noise/free-sound/noise-free-sound-0154.wav,62.464,0.06217845529317856 157 | train,noise/free-sound/noise-free-sound-0155.wav,72.6613125,0.0456816628575325 158 | train,noise/free-sound/noise-free-sound-0156.wav,90.197375,0.018847698345780373 159 | train,noise/free-sound/noise-free-sound-0157.wav,3.0933125,0.15810731053352356 160 | train,noise/free-sound/noise-free-sound-0158.wav,64.884,0.007497943937778473 161 | train,noise/free-sound/noise-free-sound-0159.wav,5.216,0.07187561690807343 162 | train,noise/free-sound/noise-free-sound-0160.wav,34.325375,0.0507843978703022 163 | train,noise/free-sound/noise-free-sound-0161.wav,76.476,0.058606624603271484 164 | train,noise/free-sound/noise-free-sound-0162.wav,14.9333125,0.014961466193199158 165 | train,noise/free-sound/noise-free-sound-0163.wav,0.3626875,0.1304340958595276 166 | train,noise/free-sound/noise-free-sound-0164.wav,24.96,0.02270853891968727 167 | 
train,noise/free-sound/noise-free-sound-0165.wav,33.837375,0.020268714055418968 168 | train,noise/free-sound/noise-free-sound-0166.wav,0.768,0.12463682144880295 169 | train,noise/free-sound/noise-free-sound-0167.wav,2.005375,0.09784073382616043 170 | train,noise/free-sound/noise-free-sound-0168.wav,1.344,0.09780868887901306 171 | train,noise/free-sound/noise-free-sound-0169.wav,2.304,0.07819738984107971 172 | train,noise/free-sound/noise-free-sound-0170.wav,5.2906875,0.0410243459045887 173 | train,noise/free-sound/noise-free-sound-0171.wav,4.6293125,0.09066470712423325 174 | train,noise/free-sound/noise-free-sound-0172.wav,49.5543125,0.15253888070583344 175 | train,noise/free-sound/noise-free-sound-0173.wav,106.802,0.03660242632031441 176 | train,noise/free-sound/noise-free-sound-0174.wav,9.109125,0.03291758522391319 177 | train,noise/free-sound/noise-free-sound-0175.wav,4.7188125,0.030510850250720978 178 | train,noise/free-sound/noise-free-sound-0176.wav,2.9843125,0.07392294704914093 179 | train,noise/free-sound/noise-free-sound-0177.wav,106.737875,0.014649895019829273 180 | train,noise/free-sound/noise-free-sound-0178.wav,4.884625,0.07338564842939377 181 | train,noise/free-sound/noise-free-sound-0179.wav,2.813,0.026941245421767235 182 | train,noise/free-sound/noise-free-sound-0180.wav,15.5155,0.052562348544597626 183 | train,noise/free-sound/noise-free-sound-0181.wav,4.7964375,0.017637288197875023 184 | train,noise/free-sound/noise-free-sound-0182.wav,2.25225,0.05241338163614273 185 | train,noise/free-sound/noise-free-sound-0183.wav,8.0914375,0.04602668061852455 186 | train,noise/free-sound/noise-free-sound-0184.wav,32.407375,0.05794139578938484 187 | train,noise/free-sound/noise-free-sound-0185.wav,2.6693125,0.08157354593276978 188 | train,noise/free-sound/noise-free-sound-0186.wav,2.8361875,0.01683889329433441 189 | train,noise/free-sound/noise-free-sound-0187.wav,6.4230625,0.016764214262366295 190 | train,noise/free-sound/noise-free-sound-0188.wav,9.5511875,0.042392484843730927 191 | train,noise/free-sound/noise-free-sound-0189.wav,6.8401875,0.03030153177678585 192 | train,noise/free-sound/noise-free-sound-0190.wav,6.131125,0.033692434430122375 193 | train,noise/free-sound/noise-free-sound-0191.wav,3.9623125,0.04725954681634903 194 | train,noise/free-sound/noise-free-sound-0192.wav,6.6733125,0.03271656110882759 195 | train,noise/free-sound/noise-free-sound-0193.wav,4.004,0.02426316775381565 196 | train,noise/free-sound/noise-free-sound-0194.wav,7.882875,0.06913600116968155 197 | train,noise/free-sound/noise-free-sound-0195.wav,4.2959375,0.03338419273495674 198 | train,noise/free-sound/noise-free-sound-0196.wav,3.628625,0.03880972042679787 199 | train,noise/free-sound/noise-free-sound-0197.wav,10.5939375,0.06445953249931335 200 | train,noise/free-sound/noise-free-sound-0198.wav,9.134125,0.010551821440458298 201 | train,noise/free-sound/noise-free-sound-0199.wav,6.25625,0.04261793941259384 202 | train,noise/free-sound/noise-free-sound-0200.wav,8.75875,0.05165497586131096 203 | train,noise/free-sound/noise-free-sound-0201.wav,7.0486875,0.03031056560575962 204 | train,noise/free-sound/noise-free-sound-0202.wav,10.5521875,0.01994105614721775 205 | train,noise/free-sound/noise-free-sound-0203.wav,11.0944375,0.05353863909840584 206 | train,noise/free-sound/noise-free-sound-0204.wav,6.381375,0.034790318459272385 207 | train,noise/free-sound/noise-free-sound-0205.wav,14.389375,0.03819683939218521 208 | train,noise/free-sound/noise-free-sound-0206.wav,2.3356875,0.08835189044475555 209 | 
train,noise/free-sound/noise-free-sound-0207.wav,7.0904375,0.015486874617636204 210 | train,noise/free-sound/noise-free-sound-0208.wav,6.4648125,0.043124981224536896 211 | train,noise/free-sound/noise-free-sound-0209.wav,9.6763125,0.03408803790807724 212 | train,noise/free-sound/noise-free-sound-0210.wav,5.6723125,0.022591933608055115 213 | train,noise/free-sound/noise-free-sound-0211.wav,6.2145625,0.03797854855656624 214 | train,noise/free-sound/noise-free-sound-0212.wav,9.5511875,0.07562563568353653 215 | train,noise/free-sound/noise-free-sound-0213.wav,5.7140625,0.06729837507009506 216 | train,noise/free-sound/noise-free-sound-0214.wav,8.25825,0.04794452711939812 217 | train,noise/free-sound/noise-free-sound-0215.wav,6.4375,0.02876589260995388 218 | train,noise/free-sound/noise-free-sound-0216.wav,38.4588125,0.08392564207315445 219 | train,noise/free-sound/noise-free-sound-0217.wav,4.8381875,0.023469315841794014 220 | train,noise/free-sound/noise-free-sound-0218.wav,6.131125,0.06398402899503708 221 | train,noise/free-sound/noise-free-sound-0219.wav,6.5899375,0.042622439563274384 222 | train,noise/free-sound/noise-free-sound-0220.wav,8.4668125,0.03878623619675636 223 | train,noise/free-sound/noise-free-sound-0221.wav,10.9275625,0.02385111153125763 224 | train,noise/free-sound/noise-free-sound-0222.wav,1.2095625,0.08227390795946121 225 | train,noise/free-sound/noise-free-sound-0223.wav,3.5869375,0.0693453848361969 226 | train,noise/free-sound/noise-free-sound-0224.wav,12.8044375,0.054351672530174255 227 | train,noise/free-sound/noise-free-sound-0225.wav,12.452,0.015415950678288937 228 | train,noise/free-sound/noise-free-sound-0226.wav,26.1973125,0.10926525294780731 229 | train,noise/free-sound/noise-free-sound-0227.wav,31.1654375,0.011057702824473381 230 | train,noise/free-sound/noise-free-sound-0228.wav,16.6486875,0.042658109217882156 231 | train,noise/free-sound/noise-free-sound-0229.wav,5.3339375,0.11092111468315125 232 | train,noise/free-sound/noise-free-sound-0230.wav,41.592,0.05706547573208809 233 | train,noise/free-sound/noise-free-sound-0231.wav,22.33075,0.022546730935573578 234 | train,noise/free-sound/noise-free-sound-0232.wav,44.2890625,0.042499084025621414 235 | train,noise/free-sound/noise-free-sound-0233.wav,44.3495,0.05787660926580429 236 | train,noise/free-sound/noise-free-sound-0234.wav,75.0171875,0.03603597730398178 237 | train,noise/free-sound/noise-free-sound-0235.wav,200.7829375,0.038586996495723724 238 | train,noise/free-sound/noise-free-sound-0236.wav,5.5640625,0.1896110475063324 239 | train,noise/free-sound/noise-free-sound-0237.wav,5.302875,0.03839379549026489 240 | train,noise/free-sound/noise-free-sound-0238.wav,13.944,0.039737407118082047 241 | train,noise/free-sound/noise-free-sound-0239.wav,37.50525,0.08794654905796051 242 | train,noise/free-sound/noise-free-sound-0240.wav,26.2261875,0.03328998014330864 243 | train,noise/free-sound/noise-free-sound-0241.wav,52.6185,0.026958363130688667 244 | train,noise/free-sound/noise-free-sound-0242.wav,23.0038125,0.04528762027621269 245 | train,noise/free-sound/noise-free-sound-0243.wav,90.0005625,0.0357666015625 246 | train,noise/free-sound/noise-free-sound-0244.wav,118.8401875,0.026809563860297203 247 | train,noise/free-sound/noise-free-sound-0245.wav,29.562875,0.010990160517394543 248 | train,noise/free-sound/noise-free-sound-0246.wav,24.7246875,0.029057923704385757 249 | train,noise/free-sound/noise-free-sound-0247.wav,77.7345625,0.13703718781471252 250 | 
train,noise/free-sound/noise-free-sound-0248.wav,60.30275,0.06144831329584122 251 | train,noise/free-sound/noise-free-sound-0249.wav,40.684625,0.01766958460211754 252 | train,noise/free-sound/noise-free-sound-0250.wav,8.6204375,0.09253757447004318 253 | train,noise/free-sound/noise-free-sound-0251.wav,31.973875,0.10958096385002136 254 | train,noise/free-sound/noise-free-sound-0252.wav,20.9501875,0.037849124521017075 255 | train,noise/free-sound/noise-free-sound-0253.wav,0.9750625,0.03105640597641468 256 | train,noise/free-sound/noise-free-sound-0254.wav,64.142125,0.03622563183307648 257 | train,noise/free-sound/noise-free-sound-0255.wav,5.5375,0.09842009842395782 258 | train,noise/free-sound/noise-free-sound-0256.wav,37.5069375,0.012584677897393703 259 | train,noise/free-sound/noise-free-sound-0257.wav,5.0,0.004914766643196344 260 | train,noise/free-sound/noise-free-sound-0258.wav,25.7580625,0.2934410572052002 261 | train,noise/free-sound/noise-free-sound-0259.wav,14.7069375,0.023536955937743187 262 | train,noise/free-sound/noise-free-sound-0260.wav,14.4,0.08175045251846313 263 | train,noise/free-sound/noise-free-sound-0261.wav,88.0,0.044494520872831345 264 | train,noise/free-sound/noise-free-sound-0262.wav,35.0301875,0.014555398374795914 265 | train,noise/free-sound/noise-free-sound-0263.wav,0.9043125,0.006316532380878925 266 | train,noise/free-sound/noise-free-sound-0264.wav,29.631375,0.008275926113128662 267 | train,noise/free-sound/noise-free-sound-0265.wav,1.3325625,0.031494367867708206 268 | train,noise/free-sound/noise-free-sound-0266.wav,34.674625,0.052908796817064285 269 | train,noise/free-sound/noise-free-sound-0267.wav,36.1389375,0.009712645784020424 270 | train,noise/free-sound/noise-free-sound-0268.wav,22.710875,0.07346329838037491 271 | train,noise/free-sound/noise-free-sound-0269.wav,49.125125,0.05663708969950676 272 | train,noise/free-sound/noise-free-sound-0270.wav,398.0610625,0.04078911244869232 273 | train,noise/free-sound/noise-free-sound-0271.wav,29.6983125,0.026326850056648254 274 | train,noise/free-sound/noise-free-sound-0272.wav,41.058125,0.3127070665359497 275 | train,noise/free-sound/noise-free-sound-0273.wav,10.6433125,0.031103668734431267 276 | train,noise/free-sound/noise-free-sound-0274.wav,3.96775,0.037425946444272995 277 | train,noise/free-sound/noise-free-sound-0275.wav,12.096,0.034055352210998535 278 | train,noise/free-sound/noise-free-sound-0276.wav,2.8715,0.3022034466266632 279 | train,noise/free-sound/noise-free-sound-0277.wav,5.705125,0.0423748753964901 280 | train,noise/free-sound/noise-free-sound-0278.wav,37.59825,0.026329314336180687 281 | train,noise/free-sound/noise-free-sound-0279.wav,17.3333125,0.1291254162788391 282 | train,noise/free-sound/noise-free-sound-0280.wav,62.10325,0.08421215415000916 283 | train,noise/free-sound/noise-free-sound-0281.wav,0.1,0.0008713952847756445 284 | train,noise/free-sound/noise-free-sound-0282.wav,37.72675,0.03869716450572014 285 | train,noise/free-sound/noise-free-sound-0283.wav,22.9636875,0.005775699391961098 286 | train,noise/free-sound/noise-free-sound-0284.wav,2.786,0.14792244136333466 287 | train,noise/free-sound/noise-free-sound-0285.wav,4.0925,0.017943454906344414 288 | train,noise/free-sound/noise-free-sound-0286.wav,9.5603125,0.054936982691287994 289 | train,noise/free-sound/noise-free-sound-0287.wav,60.6563125,0.02655193768441677 290 | train,noise/free-sound/noise-free-sound-0288.wav,59.688,0.009608861990272999 291 | train,noise/free-sound/noise-free-sound-0289.wav,22.2821875,0.09163817763328552 292 | 
train,noise/free-sound/noise-free-sound-0290.wav,44.304,0.030345182865858078 293 | train,noise/free-sound/noise-free-sound-0291.wav,9.090625,0.2115832418203354 294 | train,noise/free-sound/noise-free-sound-0292.wav,17.97225,0.2524002492427826 295 | train,noise/free-sound/noise-free-sound-0293.wav,43.0381875,0.015832778066396713 296 | train,noise/free-sound/noise-free-sound-0294.wav,6.5875,0.05032056197524071 297 | train,noise/free-sound/noise-free-sound-0295.wav,6.8963125,0.07769472151994705 298 | train,noise/free-sound/noise-free-sound-0296.wav,1.8616875,0.08366407454013824 299 | train,noise/free-sound/noise-free-sound-0297.wav,1.799,0.1571785807609558 300 | train,noise/free-sound/noise-free-sound-0298.wav,32.0,0.1518697738647461 301 | train,noise/free-sound/noise-free-sound-0299.wav,0.212625,0.07357385754585266 302 | train,noise/free-sound/noise-free-sound-0300.wav,6.112625,0.12782976031303406 303 | train,noise/free-sound/noise-free-sound-0301.wav,180.9495625,0.03669818118214607 304 | train,noise/free-sound/noise-free-sound-0302.wav,17.914125,0.07714607566595078 305 | train,noise/free-sound/noise-free-sound-0303.wav,8.1955,0.02229280211031437 306 | train,noise/free-sound/noise-free-sound-0304.wav,62.0408125,0.07652237266302109 307 | train,noise/free-sound/noise-free-sound-0305.wav,29.5,0.024681456387043 308 | train,noise/free-sound/noise-free-sound-0306.wav,24.5,0.023795874789357185 309 | train,noise/free-sound/noise-free-sound-0307.wav,10.704,0.025687742978334427 310 | train,noise/free-sound/noise-free-sound-0308.wav,58.28175,0.06723242998123169 311 | train,noise/free-sound/noise-free-sound-0309.wav,3.3729375,0.07240822911262512 312 | train,noise/free-sound/noise-free-sound-0310.wav,22.03575,0.022913042455911636 313 | train,noise/free-sound/noise-free-sound-0311.wav,13.746375,0.03471088036894798 314 | train,noise/free-sound/noise-free-sound-0312.wav,1.513,0.026560822501778603 315 | train,noise/free-sound/noise-free-sound-0313.wav,2.7875,0.053717926144599915 316 | train,noise/free-sound/noise-free-sound-0314.wav,0.6109375,0.03753811493515968 317 | train,noise/free-sound/noise-free-sound-0315.wav,2.7838125,0.03004627861082554 318 | train,noise/free-sound/noise-free-sound-0316.wav,11.1455625,0.010907059535384178 319 | train,noise/free-sound/noise-free-sound-0317.wav,7.256875,0.050118256360292435 320 | train,noise/free-sound/noise-free-sound-0318.wav,0.40925,0.05898527055978775 321 | train,noise/free-sound/noise-free-sound-0319.wav,1.515125,0.0426362082362175 322 | train,noise/free-sound/noise-free-sound-0320.wav,6.7395625,0.07185203582048416 323 | train,noise/free-sound/noise-free-sound-0321.wav,113.0,0.028845947235822678 324 | train,noise/free-sound/noise-free-sound-0322.wav,31.0335,0.018543396145105362 325 | train,noise/free-sound/noise-free-sound-0323.wav,10.73175,0.1975315362215042 326 | train,noise/free-sound/noise-free-sound-0324.wav,36.2056875,0.12887051701545715 327 | train,noise/free-sound/noise-free-sound-0325.wav,1.6456875,0.08280790597200394 328 | train,noise/free-sound/noise-free-sound-0326.wav,16.0775,0.1114916056394577 329 | train,noise/free-sound/noise-free-sound-0327.wav,5.1596875,0.026619240641593933 330 | train,noise/free-sound/noise-free-sound-0328.wav,0.7181875,0.07777177542448044 331 | train,noise/free-sound/noise-free-sound-0329.wav,9.69225,0.025797756388783455 332 | train,noise/free-sound/noise-free-sound-0330.wav,8.4288125,0.022084051743149757 333 | train,noise/free-sound/noise-free-sound-0331.wav,6.375,0.05982063338160515 334 | 
train,noise/free-sound/noise-free-sound-0332.wav,3.625,0.07769813388586044 335 | train,noise/free-sound/noise-free-sound-0333.wav,15.6735,0.04349810257554054 336 | train,noise/free-sound/noise-free-sound-0334.wav,178.1493125,0.032667893916368484 337 | train,noise/free-sound/noise-free-sound-0335.wav,34.0375625,0.04825016111135483 338 | train,noise/free-sound/noise-free-sound-0336.wav,48.08275,0.13560190796852112 339 | train,noise/free-sound/noise-free-sound-0337.wav,1.4255,0.2234208583831787 340 | train,noise/free-sound/noise-free-sound-0338.wav,9.24,0.018683331087231636 341 | train,noise/free-sound/noise-free-sound-0339.wav,20.328,0.02289791963994503 342 | train,noise/free-sound/noise-free-sound-0340.wav,33.672,0.03894010931253433 343 | train,noise/free-sound/noise-free-sound-0341.wav,23.071875,0.023344693705439568 344 | train,noise/free-sound/noise-free-sound-0342.wav,13.806,0.09261415898799896 345 | train,noise/free-sound/noise-free-sound-0343.wav,2.809625,0.0704667717218399 346 | train,noise/free-sound/noise-free-sound-0344.wav,20.544,0.026964064687490463 347 | train,noise/free-sound/noise-free-sound-0345.wav,5.9024375,0.11662321537733078 348 | train,noise/free-sound/noise-free-sound-0346.wav,18.5469375,0.021898966282606125 349 | train,noise/free-sound/noise-free-sound-0347.wav,13.7801875,0.2138994336128235 350 | train,noise/free-sound/noise-free-sound-0348.wav,50.1140625,0.03524171933531761 351 | train,noise/free-sound/noise-free-sound-0349.wav,1.0,0.1690613180398941 352 | train,noise/free-sound/noise-free-sound-0350.wav,9.0,0.040681108832359314 353 | train,noise/free-sound/noise-free-sound-0351.wav,4.968,0.027849895879626274 354 | train,noise/free-sound/noise-free-sound-0352.wav,10.0,0.07022271305322647 355 | train,noise/free-sound/noise-free-sound-0353.wav,26.0963125,0.2605692744255066 356 | train,noise/free-sound/noise-free-sound-0354.wav,0.9999375,0.05305643379688263 357 | train,noise/free-sound/noise-free-sound-0355.wav,14.664,0.08156871050596237 358 | train,noise/free-sound/noise-free-sound-0356.wav,24.96,0.1574023813009262 359 | train,noise/free-sound/noise-free-sound-0357.wav,8.1763125,0.2569054663181305 360 | train,noise/free-sound/noise-free-sound-0358.wav,21.24625,0.035855066031217575 361 | train,noise/free-sound/noise-free-sound-0359.wav,18.3078125,0.014943092130124569 362 | train,noise/free-sound/noise-free-sound-0360.wav,13.89375,0.022115277126431465 363 | train,noise/free-sound/noise-free-sound-0361.wav,7.7390625,0.0052618165500462055 364 | train,noise/free-sound/noise-free-sound-0362.wav,12.875,0.025662187486886978 365 | train,noise/free-sound/noise-free-sound-0363.wav,4.721875,0.011223689652979374 366 | train,noise/free-sound/noise-free-sound-0364.wav,11.8400625,0.012878494337201118 367 | train,noise/free-sound/noise-free-sound-0365.wav,10.057125,0.21046742796897888 368 | train,noise/free-sound/noise-free-sound-0366.wav,4.049,0.33490365743637085 369 | train,noise/free-sound/noise-free-sound-0367.wav,1.044875,0.22696106135845184 370 | train,noise/free-sound/noise-free-sound-0368.wav,1.6456875,0.2907167971134186 371 | train,noise/free-sound/noise-free-sound-0369.wav,4.049,0.2372066080570221 372 | train,noise/free-sound/noise-free-sound-0370.wav,72.2546875,0.0368385836482048 373 | train,noise/free-sound/noise-free-sound-0371.wav,21.28,0.09028219431638718 374 | train,noise/free-sound/noise-free-sound-0372.wav,22.88875,0.04608670249581337 375 | train,noise/free-sound/noise-free-sound-0373.wav,3.234625,0.06457925587892532 376 | 
train,noise/free-sound/noise-free-sound-0374.wav,0.7430625,2.589870007341233e-07 377 | train,noise/free-sound/noise-free-sound-0375.wav,0.75,0.008004882372915745 378 | train,noise/free-sound/noise-free-sound-0376.wav,5.0866875,0.04294144734740257 379 | train,noise/free-sound/noise-free-sound-0377.wav,8.986125,0.0908549502491951 380 | train,noise/free-sound/noise-free-sound-0378.wav,22.2824375,0.046287212520837784 381 | train,noise/free-sound/noise-free-sound-0379.wav,6.9893125,0.09434410184621811 382 | train,noise/free-sound/noise-free-sound-0380.wav,0.144,0.03649836778640747 383 | train,noise/free-sound/noise-free-sound-0381.wav,44.2775625,0.10409745573997498 384 | train,noise/free-sound/noise-free-sound-0382.wav,60.552,0.034003183245658875 385 | train,noise/free-sound/noise-free-sound-0383.wav,32.904,0.07692724466323853 386 | train,noise/free-sound/noise-free-sound-0384.wav,178.211375,0.025643497705459595 387 | train,noise/free-sound/noise-free-sound-0385.wav,8.408375,0.05071987211704254 388 | train,noise/free-sound/noise-free-sound-0386.wav,26.7350625,0.031059708446264267 389 | train,noise/free-sound/noise-free-sound-0387.wav,5.96025,0.06439970433712006 390 | train,noise/free-sound/noise-free-sound-0388.wav,4.382375,0.04423951730132103 391 | train,noise/free-sound/noise-free-sound-0389.wav,13.7295,0.10568549484014511 392 | train,noise/free-sound/noise-free-sound-0390.wav,15.7635,0.04382310435175896 393 | train,noise/free-sound/noise-free-sound-0391.wav,23.391,0.018330859020352364 394 | train,noise/free-sound/noise-free-sound-0392.wav,13.221,0.013229451142251492 395 | train,noise/free-sound/noise-free-sound-0393.wav,5.5379375,0.03775392845273018 396 | train,noise/free-sound/noise-free-sound-0394.wav,23.72,0.05215062201023102 397 | train,noise/free-sound/noise-free-sound-0395.wav,7.73225,0.03044125624001026 398 | train,noise/free-sound/noise-free-sound-0396.wav,9.7959375,0.03589799255132675 399 | train,noise/free-sound/noise-free-sound-0397.wav,45.328,0.011843379586935043 400 | train,noise/free-sound/noise-free-sound-0398.wav,44.8800625,0.010618491098284721 401 | train,noise/free-sound/noise-free-sound-0399.wav,16.9025,0.03693331032991409 402 | train,noise/free-sound/noise-free-sound-0400.wav,14.86075,0.012356513179838657 403 | train,noise/free-sound/noise-free-sound-0401.wav,10.3515625,0.01837407611310482 404 | train,noise/free-sound/noise-free-sound-0402.wav,29.6705625,0.013565191999077797 405 | train,noise/free-sound/noise-free-sound-0403.wav,13.009,0.06084136664867401 406 | train,noise/free-sound/noise-free-sound-0404.wav,19.040375,0.10327192395925522 407 | train,noise/free-sound/noise-free-sound-0405.wav,2.306875,0.03524678200483322 408 | train,noise/free-sound/noise-free-sound-0406.wav,19.8315,0.05282257869839668 409 | train,noise/free-sound/noise-free-sound-0407.wav,2.9594375,0.044623784720897675 410 | train,noise/free-sound/noise-free-sound-0408.wav,15.0,0.05332208797335625 411 | train,noise/free-sound/noise-free-sound-0409.wav,8.6186875,0.10226306319236755 412 | train,noise/free-sound/noise-free-sound-0410.wav,1.1808125,0.038361839950084686 413 | train,noise/free-sound/noise-free-sound-0411.wav,2.3625625,0.0322076678276062 414 | train,noise/free-sound/noise-free-sound-0412.wav,2.624875,0.044953443109989166 415 | train,noise/free-sound/noise-free-sound-0413.wav,21.995125,0.03321641683578491 416 | train,noise/free-sound/noise-free-sound-0414.wav,28.3785,0.03101469576358795 417 | train,noise/free-sound/noise-free-sound-0415.wav,6.624,0.017043698579072952 418 | 
train,noise/free-sound/noise-free-sound-0416.wav,14.9700625,0.07003152370452881 419 | train,noise/free-sound/noise-free-sound-0417.wav,10.1516875,0.024113353341817856 420 | train,noise/free-sound/noise-free-sound-0418.wav,40.99575,0.019238855689764023 421 | train,noise/free-sound/noise-free-sound-0419.wav,11.9901875,0.09793315082788467 422 | train,noise/free-sound/noise-free-sound-0420.wav,0.8214375,0.08973070979118347 423 | train,noise/free-sound/noise-free-sound-0421.wav,3.3698125,0.020990706980228424 424 | train,noise/free-sound/noise-free-sound-0422.wav,9.1626875,0.014323156327009201 425 | train,noise/free-sound/noise-free-sound-0423.wav,12.25,0.02368086576461792 426 | train,noise/free-sound/noise-free-sound-0424.wav,16.666125,0.09960763156414032 427 | train,noise/free-sound/noise-free-sound-0425.wav,1.410625,0.25385206937789917 428 | train,noise/free-sound/noise-free-sound-0426.wav,3.89225,0.2450893521308899 429 | train,noise/free-sound/noise-free-sound-0427.wav,21.995125,0.22187045216560364 430 | train,noise/free-sound/noise-free-sound-0428.wav,18.622375,0.06734079867601395 431 | train,noise/free-sound/noise-free-sound-0429.wav,24.0,0.05017399042844772 432 | train,noise/free-sound/noise-free-sound-0430.wav,67.2,0.039679136127233505 433 | train,noise/free-sound/noise-free-sound-0431.wav,22.9355,0.04259328544139862 434 | train,noise/free-sound/noise-free-sound-0432.wav,22.2540625,0.04090152680873871 435 | train,noise/free-sound/noise-free-sound-0433.wav,42.2370625,0.059927552938461304 436 | train,noise/free-sound/noise-free-sound-0434.wav,21.528,0.051974814385175705 437 | train,noise/free-sound/noise-free-sound-0435.wav,5.27675,0.015150045044720173 438 | train,noise/free-sound/noise-free-sound-0436.wav,51.2120625,0.03175858035683632 439 | train,noise/free-sound/noise-free-sound-0437.wav,1.19,3.958451202379365e-07 440 | train,noise/free-sound/noise-free-sound-0438.wav,7.52325,0.0758250281214714 441 | train,noise/free-sound/noise-free-sound-0439.wav,30.7875625,0.05084805563092232 442 | train,noise/free-sound/noise-free-sound-0440.wav,4.1883125,0.02127326838672161 443 | train,noise/free-sound/noise-free-sound-0441.wav,45.568,0.060903094708919525 444 | train,noise/free-sound/noise-free-sound-0442.wav,76.0721875,0.036283113062381744 445 | train,noise/free-sound/noise-free-sound-0443.wav,42.24,0.016077127307653427 446 | train,noise/free-sound/noise-free-sound-0444.wav,20.2478125,0.022445961833000183 447 | train,noise/free-sound/noise-free-sound-0445.wav,18.3901875,0.049584317952394485 448 | train,noise/free-sound/noise-free-sound-0446.wav,31.416,0.011185670271515846 449 | train,noise/free-sound/noise-free-sound-0447.wav,28.5779375,0.01004602387547493 450 | train,noise/free-sound/noise-free-sound-0448.wav,0.574,0.11578892171382904 451 | train,noise/free-sound/noise-free-sound-0449.wav,4.7380625,0.04884331300854683 452 | train,noise/free-sound/noise-free-sound-0450.wav,16.1713125,0.193118616938591 453 | train,noise/free-sound/noise-free-sound-0451.wav,44.04,0.06223294511437416 454 | train,noise/free-sound/noise-free-sound-0452.wav,17.8445625,0.15768374502658844 455 | train,noise/free-sound/noise-free-sound-0453.wav,0.9404375,0.0542965829372406 456 | train,noise/free-sound/noise-free-sound-0454.wav,41.885,0.03289172798395157 457 | train,noise/free-sound/noise-free-sound-0455.wav,168.2695,0.03914259746670723 458 | train,noise/free-sound/noise-free-sound-0456.wav,28.96325,0.05199239030480385 459 | train,noise/free-sound/noise-free-sound-0457.wav,44.3666875,0.017017558217048645 460 | 
train,noise/free-sound/noise-free-sound-0458.wav,20.7055625,0.06016414240002632 461 | train,noise/free-sound/noise-free-sound-0459.wav,40.3069375,0.09874601662158966 462 | train,noise/free-sound/noise-free-sound-0460.wav,13.104,0.038429416716098785 463 | train,noise/free-sound/noise-free-sound-0461.wav,53.3406875,0.04417074844241142 464 | train,noise/free-sound/noise-free-sound-0462.wav,18.76375,0.07823855429887772 465 | train,noise/free-sound/noise-free-sound-0463.wav,15.1105625,0.05224224925041199 466 | train,noise/free-sound/noise-free-sound-0464.wav,8.0831875,0.03142475709319115 467 | train,noise/free-sound/noise-free-sound-0465.wav,9.007375,0.13380466401576996 468 | train,noise/free-sound/noise-free-sound-0466.wav,13.9530625,0.04733031988143921 469 | train,noise/free-sound/noise-free-sound-0467.wav,14.1738125,0.04154488444328308 470 | train,noise/free-sound/noise-free-sound-0468.wav,23.12375,0.053234294056892395 471 | train,noise/free-sound/noise-free-sound-0469.wav,10.7805,0.0436321459710598 472 | train,noise/free-sound/noise-free-sound-0470.wav,4.757375,0.1546269804239273 473 | train,noise/free-sound/noise-free-sound-0471.wav,7.915125,0.02925923280417919 474 | train,noise/free-sound/noise-free-sound-0472.wav,94.0,0.04926766827702522 475 | train,noise/free-sound/noise-free-sound-0473.wav,9.11675,0.09038642793893814 476 | train,noise/free-sound/noise-free-sound-0474.wav,35.935,0.030509065836668015 477 | train,noise/free-sound/noise-free-sound-0475.wav,36.3624375,0.058621712028980255 478 | train,noise/free-sound/noise-free-sound-0476.wav,9.5608125,0.050260499119758606 479 | train,noise/free-sound/noise-free-sound-0477.wav,15.124875,0.07204657047986984 480 | train,noise/free-sound/noise-free-sound-0478.wav,8.5195,0.048078786581754684 481 | train,noise/free-sound/noise-free-sound-0479.wav,3.715625,0.10714650899171829 482 | train,noise/free-sound/noise-free-sound-0480.wav,35.2898125,0.04612552002072334 483 | train,noise/free-sound/noise-free-sound-0481.wav,7.907875,0.14924266934394836 484 | train,noise/free-sound/noise-free-sound-0482.wav,13.5314375,0.02954801917076111 485 | train,noise/free-sound/noise-free-sound-0483.wav,22.5959375,0.022938815876841545 486 | train,noise/free-sound/noise-free-sound-0484.wav,10.6318125,0.030818503350019455 487 | train,noise/free-sound/noise-free-sound-0485.wav,10.8930625,0.05822751671075821 488 | train,noise/free-sound/noise-free-sound-0486.wav,6.4098125,0.01118223275989294 489 | train,noise/free-sound/noise-free-sound-0487.wav,1.9301875,0.031102990731596947 490 | train,noise/free-sound/noise-free-sound-0488.wav,5.656875,0.040080949664115906 491 | train,noise/free-sound/noise-free-sound-0489.wav,62.8613125,0.023358652368187904 492 | train,noise/free-sound/noise-free-sound-0490.wav,12.9799375,0.013502035290002823 493 | train,noise/free-sound/noise-free-sound-0491.wav,1.544125,0.031686265021562576 494 | train,noise/free-sound/noise-free-sound-0492.wav,20.0,0.05861775949597359 495 | train,noise/free-sound/noise-free-sound-0493.wav,14.02775,0.029563307762145996 496 | train,noise/free-sound/noise-free-sound-0494.wav,56.05875,0.060667529702186584 497 | train,noise/free-sound/noise-free-sound-0495.wav,18.2066875,0.08725643903017044 498 | train,noise/free-sound/noise-free-sound-0496.wav,5.97325,0.055248506367206573 499 | train,noise/free-sound/noise-free-sound-0497.wav,302.0,0.045467883348464966 500 | train,noise/free-sound/noise-free-sound-0498.wav,5.0,0.016401100903749466 501 | train,noise/free-sound/noise-free-sound-0499.wav,7.889,0.017813293263316154 502 | 
train,noise/free-sound/noise-free-sound-0500.wav,34.374875,0.07433386147022247 503 | train,noise/free-sound/noise-free-sound-0501.wav,12.0079375,0.08994701504707336 504 | train,noise/free-sound/noise-free-sound-0502.wav,73.2,0.03240280598402023 505 | train,noise/free-sound/noise-free-sound-0503.wav,5.0648125,0.046961430460214615 506 | train,noise/free-sound/noise-free-sound-0504.wav,58.0034375,0.04388689994812012 507 | train,noise/free-sound/noise-free-sound-0505.wav,9.373875,0.09661614149808884 508 | train,noise/free-sound/noise-free-sound-0506.wav,24.58125,0.026245592162013054 509 | train,noise/free-sound/noise-free-sound-0507.wav,1.1199375,0.017153136432170868 510 | train,noise/free-sound/noise-free-sound-0508.wav,7.319875,0.009689533151686192 511 | train,noise/free-sound/noise-free-sound-0509.wav,10.79725,0.16952723264694214 512 | train,noise/free-sound/noise-free-sound-0510.wav,19.845125,0.04474690556526184 513 | train,noise/free-sound/noise-free-sound-0511.wav,4.0,0.05655713379383087 514 | train,noise/free-sound/noise-free-sound-0512.wav,3.78775,0.026339855045080185 515 | train,noise/free-sound/noise-free-sound-0513.wav,10.7885625,0.012656833045184612 516 | train,noise/free-sound/noise-free-sound-0514.wav,5.329,0.00939034391194582 517 | train,noise/free-sound/noise-free-sound-0515.wav,4.075125,0.02830776944756508 518 | train,noise/free-sound/noise-free-sound-0516.wav,1.0991875,0.07844716310501099 519 | train,noise/free-sound/noise-free-sound-0517.wav,9.962,0.014107516035437584 520 | train,noise/free-sound/noise-free-sound-0518.wav,18.7820625,0.04062504321336746 521 | train,noise/free-sound/noise-free-sound-0519.wav,33.5935,0.05104810744524002 522 | train,noise/free-sound/noise-free-sound-0520.wav,12.5,0.030691981315612793 523 | train,noise/free-sound/noise-free-sound-0521.wav,7.0008125,0.03188765048980713 524 | train,noise/free-sound/noise-free-sound-0522.wav,79.2853125,0.017952265217900276 525 | train,noise/free-sound/noise-free-sound-0523.wav,8.424,0.03407513350248337 526 | train,noise/free-sound/noise-free-sound-0524.wav,22.6495,0.035302065312862396 527 | train,noise/free-sound/noise-free-sound-0525.wav,0.9583125,0.07954972237348557 528 | train,noise/free-sound/noise-free-sound-0526.wav,1.4186875,0.03291534259915352 529 | train,noise/free-sound/noise-free-sound-0527.wav,9.3093125,0.027845852077007294 530 | train,noise/free-sound/noise-free-sound-0528.wav,3.06975,0.03752041608095169 531 | train,noise/free-sound/noise-free-sound-0529.wav,1.8594375,0.11263688653707504 532 | train,noise/free-sound/noise-free-sound-0530.wav,7.7573125,0.13751307129859924 533 | train,noise/free-sound/noise-free-sound-0531.wav,46.7388125,0.16500982642173767 534 | train,noise/free-sound/noise-free-sound-0532.wav,6.0,0.01770815998315811 535 | train,noise/free-sound/noise-free-sound-0533.wav,37.91875,0.1319381147623062 536 | train,noise/free-sound/noise-free-sound-0534.wav,17.477625,0.028349682688713074 537 | train,noise/free-sound/noise-free-sound-0535.wav,18.137625,0.02680165506899357 538 | train,noise/free-sound/noise-free-sound-0536.wav,0.69275,0.16672660410404205 539 | train,noise/free-sound/noise-free-sound-0537.wav,12.1733125,0.019201423972845078 540 | train,noise/free-sound/noise-free-sound-0538.wav,0.5738125,0.02545657753944397 541 | train,noise/free-sound/noise-free-sound-0539.wav,1.515125,0.0866997167468071 542 | train,noise/free-sound/noise-free-sound-0540.wav,3.0040625,0.016370726749300957 543 | train,noise/free-sound/noise-free-sound-0541.wav,16.37875,0.012002836912870407 544 | 
train,noise/free-sound/noise-free-sound-0542.wav,8.224,0.08064471930265427 545 | train,noise/free-sound/noise-free-sound-0543.wav,4.356,0.019136013463139534 546 | train,noise/free-sound/noise-free-sound-0544.wav,107.064,0.014804407954216003 547 | train,noise/free-sound/noise-free-sound-0545.wav,31.07325,0.03191414847970009 548 | train,noise/free-sound/noise-free-sound-0546.wav,117.6240625,0.016521433368325233 549 | train,noise/free-sound/noise-free-sound-0547.wav,4.848875,0.1954786479473114 550 | train,noise/free-sound/noise-free-sound-0548.wav,8.2024375,0.013446212746202946 551 | train,noise/free-sound/noise-free-sound-0549.wav,5.832,0.03358636796474457 552 | train,noise/free-sound/noise-free-sound-0550.wav,6.373875,0.08931571245193481 553 | train,noise/free-sound/noise-free-sound-0551.wav,33.18525,0.06328275054693222 554 | train,noise/free-sound/noise-free-sound-0552.wav,77.736,0.010960646905004978 555 | train,noise/free-sound/noise-free-sound-0553.wav,16.431,0.0317048504948616 556 | train,noise/free-sound/noise-free-sound-0554.wav,0.7836875,0.1763930767774582 557 | train,noise/free-sound/noise-free-sound-0555.wav,21.577125,0.054776307195425034 558 | train,noise/free-sound/noise-free-sound-0556.wav,19.9053125,0.08226145803928375 559 | train,noise/free-sound/noise-free-sound-0557.wav,9.11675,0.050691552460193634 560 | train,noise/free-sound/noise-free-sound-0558.wav,8.38575,0.06994509696960449 561 | train,noise/free-sound/noise-free-sound-0559.wav,14.204625,0.07384513318538666 562 | train,noise/free-sound/noise-free-sound-0560.wav,269.544,0.03145440295338631 563 | train,noise/free-sound/noise-free-sound-0561.wav,1.1755,0.02490137331187725 564 | train,noise/free-sound/noise-free-sound-0562.wav,9.0645,0.018991192802786827 565 | train,noise/free-sound/noise-free-sound-0563.wav,24.331875,0.05708249658346176 566 | train,noise/free-sound/noise-free-sound-0564.wav,0.1035625,0.011895965784788132 567 | train,noise/free-sound/noise-free-sound-0565.wav,7.429375,0.0369032584130764 568 | train,noise/free-sound/noise-free-sound-0566.wav,6.1883125,0.03060462884604931 569 | train,noise/free-sound/noise-free-sound-0567.wav,98.88,0.04714568704366684 570 | train,noise/free-sound/noise-free-sound-0568.wav,113.6384375,0.097239650785923 571 | train,noise/free-sound/noise-free-sound-0569.wav,3.84,0.020919956266880035 572 | train,noise/free-sound/noise-free-sound-0570.wav,6.373875,0.04353569820523262 573 | train,noise/free-sound/noise-free-sound-0571.wav,7.1314375,0.010439659468829632 574 | train,noise/free-sound/noise-free-sound-0572.wav,123.4285625,0.04965611547231674 575 | train,noise/free-sound/noise-free-sound-0573.wav,0.391,0.04393092170357704 576 | train,noise/free-sound/noise-free-sound-0574.wav,20.8456875,0.018946876749396324 577 | train,noise/free-sound/noise-free-sound-0575.wav,6.635125,0.11531470715999603 578 | train,noise/free-sound/noise-free-sound-0576.wav,3.732625,0.025617213919758797 579 | train,noise/free-sound/noise-free-sound-0577.wav,13.4530625,0.0597815178334713 580 | train,noise/free-sound/noise-free-sound-0578.wav,19.35675,0.03867294639348984 581 | train,noise/free-sound/noise-free-sound-0579.wav,17.2660625,0.037487927824258804 582 | train,noise/free-sound/noise-free-sound-0580.wav,135.68,0.02141750231385231 583 | train,noise/free-sound/noise-free-sound-0581.wav,51.4863125,0.046505432575941086 584 | train,noise/free-sound/noise-free-sound-0582.wav,1.4744375,0.09398862719535828 585 | train,noise/free-sound/noise-free-sound-0583.wav,2.04375,0.09569674730300903 586 | 
train,noise/free-sound/noise-free-sound-0584.wav,15.7739375,0.054114747792482376 587 | train,noise/free-sound/noise-free-sound-0585.wav,0.9288125,0.027606800198554993 588 | train,noise/free-sound/noise-free-sound-0586.wav,10.8333125,0.050648726522922516 589 | train,noise/free-sound/noise-free-sound-0587.wav,6.1226875,0.0615115761756897 590 | train,noise/free-sound/noise-free-sound-0588.wav,5.8026875,0.007875349372625351 591 | train,noise/free-sound/noise-free-sound-0589.wav,2.2465,0.07835634797811508 592 | train,noise/free-sound/noise-free-sound-0590.wav,3.78775,0.04725419357419014 593 | train,noise/free-sound/noise-free-sound-0591.wav,12.256875,0.011887668631970882 594 | train,noise/free-sound/noise-free-sound-0592.wav,32.875125,0.044854626059532166 595 | train,noise/free-sound/noise-free-sound-0593.wav,61.649,0.020986465737223625 596 | train,noise/free-sound/noise-free-sound-0594.wav,141.1773125,0.004017080180346966 597 | train,noise/free-sound/noise-free-sound-0595.wav,26.898625,0.06854495406150818 598 | train,noise/free-sound/noise-free-sound-0596.wav,13.33925,0.04375113174319267 599 | train,noise/free-sound/noise-free-sound-0597.wav,6.24325,0.06470189988613129 600 | train,noise/free-sound/noise-free-sound-0598.wav,57.920125,0.06305752694606781 601 | train,noise/free-sound/noise-free-sound-0599.wav,3.84,0.04084300249814987 602 | train,noise/free-sound/noise-free-sound-0600.wav,45.395,0.04135194420814514 603 | train,noise/free-sound/noise-free-sound-0601.wav,4.239875,0.2734590768814087 604 | train,noise/free-sound/noise-free-sound-0602.wav,4.125,0.21610486507415771 605 | train,noise/free-sound/noise-free-sound-0603.wav,7.3740625,0.3005824685096741 606 | train,noise/free-sound/noise-free-sound-0604.wav,7.358125,0.08612485975027084 607 | train,noise/free-sound/noise-free-sound-0605.wav,32.0,0.22763755917549133 608 | train,noise/free-sound/noise-free-sound-0606.wav,88.84,0.01871025189757347 609 | train,noise/free-sound/noise-free-sound-0607.wav,20.0098125,0.0681547224521637 610 | train,noise/free-sound/noise-free-sound-0608.wav,3.5265,0.0683574452996254 611 | train,noise/free-sound/noise-free-sound-0609.wav,10.161625,0.02855943888425827 612 | train,noise/free-sound/noise-free-sound-0610.wav,11.1804375,0.06024652719497681 613 | train,noise/free-sound/noise-free-sound-0611.wav,0.127,0.023479890078306198 614 | train,noise/free-sound/noise-free-sound-0612.wav,11.0498125,0.1207055076956749 615 | train,noise/free-sound/noise-free-sound-0613.wav,28.5518125,0.033192068338394165 616 | train,noise/free-sound/noise-free-sound-0614.wav,30.0,0.01605299487709999 617 | train,noise/free-sound/noise-free-sound-0615.wav,10.082875,0.08860311657190323 618 | train,noise/free-sound/noise-free-sound-0616.wav,22.6481875,0.006513164844363928 619 | train,noise/free-sound/noise-free-sound-0617.wav,8.2026875,0.016903722658753395 620 | train,noise/free-sound/noise-free-sound-0618.wav,3.0205625,0.059680454432964325 621 | train,noise/free-sound/noise-free-sound-0619.wav,0.5454375,0.005135993007570505 622 | train,noise/free-sound/noise-free-sound-0620.wav,11.5664375,0.03831050917506218 623 | train,noise/free-sound/noise-free-sound-0621.wav,6.724375,0.03734799474477768 624 | train,noise/free-sound/noise-free-sound-0622.wav,11.9901875,0.0628824383020401 625 | train,noise/free-sound/noise-free-sound-0623.wav,34.9518125,0.05226227641105652 626 | train,noise/free-sound/noise-free-sound-0624.wav,0.104,0.09036702662706375 627 | train,noise/free-sound/noise-free-sound-0625.wav,20.8195625,0.027273673564195633 628 | 
train,noise/free-sound/noise-free-sound-0626.wav,1.3235625,0.047352369874715805 629 | train,noise/free-sound/noise-free-sound-0627.wav,0.3135,0.14606566727161407 630 | train,noise/free-sound/noise-free-sound-0628.wav,29.0220625,0.03407597541809082 631 | train,noise/free-sound/noise-free-sound-0629.wav,3.5,0.3172118663787842 632 | train,noise/free-sound/noise-free-sound-0630.wav,3.1085625,0.08142469823360443 633 | train,noise/free-sound/noise-free-sound-0631.wav,5.6,0.09101958572864532 634 | train,noise/free-sound/noise-free-sound-0632.wav,2.568,0.16587309539318085 635 | train,noise/free-sound/noise-free-sound-0633.wav,18.272625,0.03823813423514366 636 | train,noise/free-sound/noise-free-sound-0634.wav,64.0,0.005076223518699408 637 | train,noise/free-sound/noise-free-sound-0635.wav,16.876,0.039830684661865234 638 | train,noise/free-sound/noise-free-sound-0636.wav,31.3991875,0.05804016813635826 639 | train,noise/free-sound/noise-free-sound-0637.wav,14.6841875,0.02190512977540493 640 | train,noise/free-sound/noise-free-sound-0638.wav,5.5926875,0.037356846034526825 641 | train,noise/free-sound/noise-free-sound-0639.wav,6.20375,0.03894766792654991 642 | train,noise/free-sound/noise-free-sound-0640.wav,32.7836875,0.056162264198064804 643 | train,noise/free-sound/noise-free-sound-0641.wav,35.1346875,0.03040275163948536 644 | train,noise/free-sound/noise-free-sound-0642.wav,4.4930625,0.03574768826365471 645 | train,noise/free-sound/noise-free-sound-0643.wav,2.78425,0.04011024534702301 646 | train,noise/free-sound/noise-free-sound-0644.wav,94.5981875,0.011430583894252777 647 | train,noise/free-sound/noise-free-sound-0645.wav,1.7240625,0.022321103140711784 648 | train,noise/free-sound/noise-free-sound-0646.wav,35.813,0.015494233928620815 649 | train,noise/free-sound/noise-free-sound-0647.wav,60.103375,0.027890006080269814 650 | train,noise/free-sound/noise-free-sound-0648.wav,12.288,0.020221997052431107 651 | train,noise/free-sound/noise-free-sound-0649.wav,50.4279375,0.042251117527484894 652 | train,noise/free-sound/noise-free-sound-0650.wav,16.236,0.10205979645252228 653 | train,noise/free-sound/noise-free-sound-0651.wav,65.2095,0.0796578973531723 654 | train,noise/free-sound/noise-free-sound-0652.wav,5.69925,0.014021473005414009 655 | train,noise/free-sound/noise-free-sound-0653.wav,29.125,0.05897703766822815 656 | train,noise/free-sound/noise-free-sound-0654.wav,5.9820625,0.1681751012802124 657 | train,noise/free-sound/noise-free-sound-0655.wav,21.34,0.01833922602236271 658 | train,noise/free-sound/noise-free-sound-0656.wav,4.884875,0.015281392261385918 659 | train,noise/free-sound/noise-free-sound-0657.wav,9.6670625,0.015602482482790947 660 | train,noise/free-sound/noise-free-sound-0658.wav,19.0,0.010228785686194897 661 | train,noise/free-sound/noise-free-sound-0659.wav,11.1241875,0.03744412958621979 662 | train,noise/free-sound/noise-free-sound-0660.wav,41.8373125,0.0351371206343174 663 | train,noise/free-sound/noise-free-sound-0661.wav,35.5150625,0.0055358815006911755 664 | train,noise/free-sound/noise-free-sound-0662.wav,1.410625,0.08759284019470215 665 | train,noise/free-sound/noise-free-sound-0663.wav,29.3544375,0.016564222052693367 666 | train,noise/free-sound/noise-free-sound-0664.wav,15.647375,0.06327983736991882 667 | train,noise/free-sound/noise-free-sound-0665.wav,17.2669375,0.03821848705410957 668 | train,noise/free-sound/noise-free-sound-0666.wav,13.685625,0.028673436492681503 669 | train,noise/free-sound/noise-free-sound-0667.wav,13.7598125,0.02481307089328766 670 | 
train,noise/free-sound/noise-free-sound-0668.wav,4.022875,0.07184793055057526 671 | train,noise/free-sound/noise-free-sound-0669.wav,4.022875,0.030827142298221588 672 | train,noise/free-sound/noise-free-sound-0670.wav,4.7600625,0.023740291595458984 673 | train,noise/free-sound/noise-free-sound-0671.wav,70.191,0.05368071794509888 674 | train,noise/free-sound/noise-free-sound-0672.wav,6.2955,0.00962742604315281 675 | train,noise/free-sound/noise-free-sound-0673.wav,3.8298125,0.045326001942157745 676 | train,noise/free-sound/noise-free-sound-0674.wav,31.131875,0.040758490562438965 677 | train,noise/free-sound/noise-free-sound-0675.wav,8.7690625,0.017590992152690887 678 | train,noise/free-sound/noise-free-sound-0676.wav,2.1630625,0.20533037185668945 679 | train,noise/free-sound/noise-free-sound-0677.wav,107.76,0.03978710621595383 680 | train,noise/free-sound/noise-free-sound-0678.wav,2.0636875,0.06593439728021622 681 | train,noise/free-sound/noise-free-sound-0679.wav,22.0,0.05452584847807884 682 | train,noise/free-sound/noise-free-sound-0680.wav,14.568,0.07073165476322174 683 | train,noise/free-sound/noise-free-sound-0681.wav,96.7053125,0.03480824455618858 684 | train,noise/free-sound/noise-free-sound-0682.wav,18.18125,0.026480460539460182 685 | train,noise/free-sound/noise-free-sound-0683.wav,27.616,0.03949678689241409 686 | train,noise/free-sound/noise-free-sound-0684.wav,139.00625,0.03529800847172737 687 | train,noise/free-sound/noise-free-sound-0685.wav,43.8785625,0.027088863775134087 688 | train,noise/free-sound/noise-free-sound-0686.wav,15.6038125,0.019841305911540985 689 | train,noise/free-sound/noise-free-sound-0687.wav,20.8020625,0.01232278160750866 690 | train,noise/free-sound/noise-free-sound-0688.wav,19.25225,0.06801490485668182 691 | train,noise/free-sound/noise-free-sound-0689.wav,1.05,0.05468634516000748 692 | train,noise/free-sound/noise-free-sound-0690.wav,5.2053125,0.036664824932813644 693 | train,noise/free-sound/noise-free-sound-0691.wav,10.0693125,0.009726029820740223 694 | train,noise/free-sound/noise-free-sound-0692.wav,0.927875,0.1667458862066269 695 | train,noise/free-sound/noise-free-sound-0693.wav,1.410625,0.13827519118785858 696 | train,noise/free-sound/noise-free-sound-0694.wav,19.8465,0.0698181539773941 697 | train,noise/free-sound/noise-free-sound-0695.wav,9.4238125,0.14697526395320892 698 | train,noise/free-sound/noise-free-sound-0696.wav,29.570625,0.016864970326423645 699 | train,noise/free-sound/noise-free-sound-0697.wav,6.112625,0.015389485284686089 700 | train,noise/free-sound/noise-free-sound-0698.wav,76.168875,0.040011338889598846 701 | train,noise/free-sound/noise-free-sound-0699.wav,119.5101875,0.01451085601001978 702 | train,noise/free-sound/noise-free-sound-0700.wav,41.04,0.027945347130298615 703 | train,noise/free-sound/noise-free-sound-0701.wav,6.057625,0.11962121725082397 704 | train,noise/free-sound/noise-free-sound-0702.wav,21.81225,0.01318186242133379 705 | train,noise/free-sound/noise-free-sound-0703.wav,8.1930625,0.03435487672686577 706 | train,noise/free-sound/noise-free-sound-0704.wav,20.16,0.07279205322265625 707 | train,noise/free-sound/noise-free-sound-0705.wav,11.3306875,0.16383984684944153 708 | train,noise/free-sound/noise-free-sound-0706.wav,28.290625,0.011244518682360649 709 | train,noise/free-sound/noise-free-sound-0707.wav,40.0718125,0.06048770621418953 710 | train,noise/free-sound/noise-free-sound-0708.wav,63.0595625,0.028981540352106094 711 | train,noise/free-sound/noise-free-sound-0709.wav,31.85775,0.032681904733181 712 | 
train,noise/free-sound/noise-free-sound-0710.wav,45.7404375,0.06862396746873856 713 | train,noise/free-sound/noise-free-sound-0711.wav,0.3926875,0.031020980328321457 714 | train,noise/free-sound/noise-free-sound-0712.wav,2.847375,0.09257514029741287 715 | train,noise/free-sound/noise-free-sound-0713.wav,1.6979375,0.08049245923757553 716 | train,noise/free-sound/noise-free-sound-0714.wav,1.12325,0.03631926700472832 717 | train,noise/free-sound/noise-free-sound-0715.wav,60.0555,0.06517808884382248 718 | train,noise/free-sound/noise-free-sound-0716.wav,0.624,0.03566426411271095 719 | train,noise/free-sound/noise-free-sound-0717.wav,5.9820625,0.016297001391649246 720 | train,noise/free-sound/noise-free-sound-0718.wav,13.035125,0.08569561690092087 721 | train,noise/free-sound/noise-free-sound-0719.wav,7.0008125,0.012838777154684067 722 | train,noise/free-sound/noise-free-sound-0720.wav,10.7363125,0.0775250568985939 723 | train,noise/free-sound/noise-free-sound-0721.wav,9.03,0.09511198103427887 724 | train,noise/free-sound/noise-free-sound-0722.wav,7.77,0.0734415054321289 725 | train,noise/free-sound/noise-free-sound-0723.wav,89.721875,0.019564535468816757 726 | train,noise/free-sound/noise-free-sound-0724.wav,0.9109375,0.12772886455059052 727 | train,noise/free-sound/noise-free-sound-0725.wav,33.2295625,0.05877486243844032 728 | train,noise/free-sound/noise-free-sound-0726.wav,120.0138125,0.010473191738128662 729 | train,noise/free-sound/noise-free-sound-0727.wav,29.022,0.023159675300121307 730 | train,noise/free-sound/noise-free-sound-0728.wav,123.84,0.025120077654719353 731 | train,noise/free-sound/noise-free-sound-0729.wav,187.14125,0.031593095511198044 732 | train,noise/free-sound/noise-free-sound-0730.wav,1.489,0.07127058506011963 733 | train,noise/free-sound/noise-free-sound-0731.wav,1.43675,0.07526694238185883 734 | train,noise/free-sound/noise-free-sound-0732.wav,31.73875,0.026971112936735153 735 | train,noise/free-sound/noise-free-sound-0733.wav,31.94775,0.026688866317272186 736 | train,noise/free-sound/noise-free-sound-0734.wav,21.1591875,0.01646844856441021 737 | train,noise/free-sound/noise-free-sound-0735.wav,21.81225,0.0156633909791708 738 | train,noise/free-sound/noise-free-sound-0736.wav,0.9194375,0.12268686294555664 739 | train,noise/free-sound/noise-free-sound-0737.wav,4.8469375,0.07719245553016663 740 | train,noise/free-sound/noise-free-sound-0738.wav,27.89875,0.09538252651691437 741 | train,noise/free-sound/noise-free-sound-0739.wav,19.224,0.04355339705944061 742 | train,noise/free-sound/noise-free-sound-0740.wav,64.0714375,0.14594630897045135 743 | train,noise/free-sound/noise-free-sound-0741.wav,18.696,0.07057242840528488 744 | train,noise/free-sound/noise-free-sound-0742.wav,22.5436875,0.03319770097732544 745 | train,noise/free-sound/noise-free-sound-0743.wav,43.9640625,0.059158992022275925 746 | train,noise/free-sound/noise-free-sound-0744.wav,1.044875,0.11951413005590439 747 | train,noise/free-sound/noise-free-sound-0745.wav,4.7333125,0.057827286422252655 748 | train,noise/free-sound/noise-free-sound-0746.wav,0.227,0.011438189074397087 749 | train,noise/free-sound/noise-free-sound-0747.wav,17.4236875,0.023010853677988052 750 | train,noise/free-sound/noise-free-sound-0748.wav,5.64,0.0689081996679306 751 | train,noise/free-sound/noise-free-sound-0749.wav,1.0,0.06701701134443283 752 | train,noise/free-sound/noise-free-sound-0750.wav,5.4595625,0.028374891728162766 753 | train,noise/free-sound/noise-free-sound-0751.wav,1.446375,0.2519388496875763 754 | 
train,noise/free-sound/noise-free-sound-0752.wav,33.6083125,0.030209451913833618 755 | train,noise/free-sound/noise-free-sound-0753.wav,24.696,0.12444719672203064 756 | train,noise/free-sound/noise-free-sound-0754.wav,80.72325,0.04299146309494972 757 | train,noise/free-sound/noise-free-sound-0755.wav,14.4195625,0.04165901988744736 758 | train,noise/free-sound/noise-free-sound-0756.wav,4.15875,0.030746938660740852 759 | train,noise/free-sound/noise-free-sound-0757.wav,13.7665,0.10464916378259659 760 | train,noise/free-sound/noise-free-sound-0758.wav,12.644375,0.05036643147468567 761 | train,noise/free-sound/noise-free-sound-0759.wav,2.731375,0.06787645071744919 762 | train,noise/free-sound/noise-free-sound-0760.wav,0.6233125,0.09675282984972 763 | train,noise/free-sound/noise-free-sound-0761.wav,32.3990625,0.03686794266104698 764 | train,noise/free-sound/noise-free-sound-0762.wav,12.8,0.20819808542728424 765 | train,noise/free-sound/noise-free-sound-0763.wav,20.491625,0.10151830315589905 766 | train,noise/free-sound/noise-free-sound-0764.wav,0.0645625,0.0005920132389292121 767 | train,noise/free-sound/noise-free-sound-0765.wav,0.20275,0.0034669337328523397 768 | train,noise/free-sound/noise-free-sound-0766.wav,0.026125,0.0 769 | train,noise/free-sound/noise-free-sound-0767.wav,29.6385625,0.05864330753684044 770 | train,noise/free-sound/noise-free-sound-0768.wav,0.4229375,0.06983917951583862 771 | train,noise/free-sound/noise-free-sound-0769.wav,25.761625,0.05337362736463547 772 | train,noise/free-sound/noise-free-sound-0770.wav,55.5624375,0.02889862284064293 773 | train,noise/free-sound/noise-free-sound-0771.wav,3.1579375,0.1223643571138382 774 | train,noise/free-sound/noise-free-sound-0772.wav,4.6535625,0.02311071753501892 775 | train,noise/free-sound/noise-free-sound-0773.wav,9.045375,0.034549664705991745 776 | train,noise/free-sound/noise-free-sound-0774.wav,3.1471875,0.19626016914844513 777 | train,noise/free-sound/noise-free-sound-0775.wav,29.5101875,0.038521576672792435 778 | train,noise/free-sound/noise-free-sound-0776.wav,15.661875,0.06775304675102234 779 | train,noise/free-sound/noise-free-sound-0777.wav,6.9224375,0.09624522179365158 780 | train,noise/free-sound/noise-free-sound-0778.wav,18.2613125,0.025550562888383865 781 | train,noise/free-sound/noise-free-sound-0779.wav,12.512625,0.044095106422901154 782 | train,noise/free-sound/noise-free-sound-0780.wav,30.2759375,0.046052154153585434 783 | train,noise/free-sound/noise-free-sound-0781.wav,60.7200625,0.0475756973028183 784 | train,noise/free-sound/noise-free-sound-0782.wav,9.8986875,0.030712144449353218 785 | train,noise/free-sound/noise-free-sound-0783.wav,8.614625,0.14119373261928558 786 | train,noise/free-sound/noise-free-sound-0784.wav,35.8040625,0.008375727571547031 787 | train,noise/free-sound/noise-free-sound-0785.wav,26.919875,0.00985590647906065 788 | train,noise/free-sound/noise-free-sound-0786.wav,19.8,0.09170135855674744 789 | train,noise/free-sound/noise-free-sound-0787.wav,34.9853125,0.05825154483318329 790 | train,noise/free-sound/noise-free-sound-0788.wav,34.8718125,0.0597430057823658 791 | train,noise/free-sound/noise-free-sound-0789.wav,8.8515,0.17519313097000122 792 | train,noise/free-sound/noise-free-sound-0790.wav,16.138125,0.03609300032258034 793 | train,noise/free-sound/noise-free-sound-0791.wav,9.0,0.021120132878422737 794 | train,noise/free-sound/noise-free-sound-0792.wav,41.5869375,0.05190187692642212 795 | train,noise/free-sound/noise-free-sound-0793.wav,9.4301875,0.0664939358830452 796 | 
train,noise/free-sound/noise-free-sound-0794.wav,4.434125,0.046769555658102036 797 | train,noise/free-sound/noise-free-sound-0795.wav,45.814125,0.031657714396715164 798 | train,noise/free-sound/noise-free-sound-0796.wav,10.24,0.09117131680250168 799 | train,noise/free-sound/noise-free-sound-0797.wav,16.384,0.1648857593536377 800 | train,noise/free-sound/noise-free-sound-0798.wav,12.9706875,0.18368032574653625 801 | train,noise/free-sound/noise-free-sound-0799.wav,5.6319375,0.03792005777359009 802 | train,noise/free-sound/noise-free-sound-0800.wav,9.6,0.018907127901911736 803 | train,noise/free-sound/noise-free-sound-0801.wav,6.01075,0.0029609836637973785 804 | train,noise/free-sound/noise-free-sound-0802.wav,5.1245,0.0005925616133026779 805 | train,noise/free-sound/noise-free-sound-0803.wav,14.02775,0.09098536521196365 806 | train,noise/free-sound/noise-free-sound-0804.wav,59.6985,0.006640758831053972 807 | train,noise/free-sound/noise-free-sound-0805.wav,15.16,0.05793767049908638 808 | train,noise/free-sound/noise-free-sound-0806.wav,11.78125,0.028702983632683754 809 | train,noise/free-sound/noise-free-sound-0807.wav,4.10125,0.03980659693479538 810 | train,noise/free-sound/noise-free-sound-0808.wav,1.6979375,0.1011076346039772 811 | train,noise/free-sound/noise-free-sound-0809.wav,25.1298125,0.09053251892328262 812 | train,noise/free-sound/noise-free-sound-0810.wav,4.008375,0.26876741647720337 813 | train,noise/free-sound/noise-free-sound-0811.wav,7.9760625,0.28619760274887085 814 | train,noise/free-sound/noise-free-sound-0812.wav,12.198,0.06944070011377335 815 | train,noise/free-sound/noise-free-sound-0813.wav,78.8018125,0.18343234062194824 816 | train,noise/free-sound/noise-free-sound-0814.wav,15.661625,0.09769771993160248 817 | train,noise/free-sound/noise-free-sound-0815.wav,2.4718125,0.12637928128242493 818 | train,noise/free-sound/noise-free-sound-0816.wav,1.324625,0.08751863986253738 819 | train,noise/free-sound/noise-free-sound-0817.wav,1.6123125,0.10826314985752106 820 | train,noise/free-sound/noise-free-sound-0818.wav,57.678375,0.015582166612148285 821 | train,noise/free-sound/noise-free-sound-0819.wav,109.9043125,0.013354653492569923 822 | train,noise/free-sound/noise-free-sound-0820.wav,18.450875,0.02505454793572426 823 | train,noise/free-sound/noise-free-sound-0821.wav,3.3436875,0.06855987757444382 824 | train,noise/free-sound/noise-free-sound-0822.wav,4.989375,0.1806216686964035 825 | train,noise/free-sound/noise-free-sound-0823.wav,2.749125,0.05665133148431778 826 | train,noise/free-sound/noise-free-sound-0824.wav,33.12325,0.04190948233008385 827 | train,noise/free-sound/noise-free-sound-0825.wav,2.6735625,0.07563027739524841 828 | train,noise/free-sound/noise-free-sound-0826.wav,11.18175,0.07534576207399368 829 | train,noise/free-sound/noise-free-sound-0827.wav,0.548,0.05151750519871712 830 | train,noise/free-sound/noise-free-sound-0828.wav,60.081625,0.011978531256318092 831 | train,noise/free-sound/noise-free-sound-0829.wav,6.7504375,0.06023939326405525 832 | train,noise/free-sound/noise-free-sound-0830.wav,0.9581875,0.06875304877758026 833 | train,noise/free-sound/noise-free-sound-0831.wav,11.441625,0.03155514597892761 834 | train,noise/free-sound/noise-free-sound-0832.wav,14.9420625,0.027190793305635452 835 | train,noise/free-sound/noise-free-sound-0833.wav,4.97375,0.07724756002426147 836 | train,noise/free-sound/noise-free-sound-0834.wav,17.0666875,0.0489436499774456 837 | train,noise/free-sound/noise-free-sound-0835.wav,5.0515625,0.13070523738861084 838 | 
train,noise/free-sound/noise-free-sound-0836.wav,64.896,0.019817186519503593 839 | train,noise/free-sound/noise-free-sound-0837.wav,13.6533125,0.010230294428765774 840 | train,noise/free-sound/noise-free-sound-0838.wav,1.0029375,0.0714196264743805 841 | train,noise/free-sound/noise-free-sound-0839.wav,2.1420625,0.1818079650402069 842 | train,noise/free-sound/noise-free-sound-0840.wav,70.191,0.05368071049451828 843 | train,noise/free-sound/noise-free-sound-0841.wav,90.0,0.015552169643342495 844 | train,noise/free-sound/noise-free-sound-0842.wav,300.0,0.0035129054449498653 845 | test,noise/sound-bible/noise-sound-bible-0000.wav,6.7499375,0.1441386342048645 846 | test,noise/sound-bible/noise-sound-bible-0001.wav,2.063,0.25420311093330383 847 | test,noise/sound-bible/noise-sound-bible-0002.wav,1.044,0.019799862056970596 848 | test,noise/sound-bible/noise-sound-bible-0003.wav,6.55675,0.06503631919622421 849 | test,noise/sound-bible/noise-sound-bible-0004.wav,11.049,0.05913013219833374 850 | test,noise/sound-bible/noise-sound-bible-0005.wav,12.408,0.042438969016075134 851 | test,noise/sound-bible/noise-sound-bible-0006.wav,0.768,0.04820356145501137 852 | test,noise/sound-bible/noise-sound-bible-0007.wav,21.06,0.1836613565683365 853 | test,noise/sound-bible/noise-sound-bible-0008.wav,9.247,0.19213145971298218 854 | test,noise/sound-bible/noise-sound-bible-0009.wav,2.115,0.0920131504535675 855 | test,noise/sound-bible/noise-sound-bible-0010.wav,7.497,0.020488228648900986 856 | test,noise/sound-bible/noise-sound-bible-0011.wav,5.093,0.06959190219640732 857 | test,noise/sound-bible/noise-sound-bible-0012.wav,3.134,0.14128883183002472 858 | test,noise/sound-bible/noise-sound-bible-0013.wav,5.12,0.08152676373720169 859 | test,noise/sound-bible/noise-sound-bible-0014.wav,27.506,0.05008871480822563 860 | test,noise/sound-bible/noise-sound-bible-0015.wav,22.23,0.05889064073562622 861 | test,noise/sound-bible/noise-sound-bible-0016.wav,7.026,0.040081411600112915 862 | test,noise/sound-bible/noise-sound-bible-0017.wav,3.2391875,0.0472022108733654 863 | test,noise/sound-bible/noise-sound-bible-0018.wav,3.683,0.07156568765640259 864 | test,noise/sound-bible/noise-sound-bible-0019.wav,6.922,0.0659433975815773 865 | test,noise/sound-bible/noise-sound-bible-0020.wav,4.336,0.08530513942241669 866 | test,noise/sound-bible/noise-sound-bible-0021.wav,12.721,0.03518088907003403 867 | test,noise/sound-bible/noise-sound-bible-0022.wav,4.153,0.11733922362327576 868 | test,noise/sound-bible/noise-sound-bible-0023.wav,2.56,0.036052703857421875 869 | test,noise/sound-bible/noise-sound-bible-0024.wav,2.455,0.07451076060533524 870 | test,noise/sound-bible/noise-sound-bible-0025.wav,5.537,0.1307377815246582 871 | test,noise/sound-bible/noise-sound-bible-0026.wav,0.653,0.16861580312252045 872 | test,noise/sound-bible/noise-sound-bible-0027.wav,3.004,0.09237220138311386 873 | test,noise/sound-bible/noise-sound-bible-0028.wav,12.617,0.09574437886476517 874 | test,noise/sound-bible/noise-sound-bible-0029.wav,3.0040625,0.013525698333978653 875 | test,noise/sound-bible/noise-sound-bible-0030.wav,14.889,0.03332362323999405 876 | test,noise/sound-bible/noise-sound-bible-0031.wav,9.769,0.035738807171583176 877 | test,noise/sound-bible/noise-sound-bible-0032.wav,25.152,0.010232904925942421 878 | test,noise/sound-bible/noise-sound-bible-0033.wav,6.687,0.02844083495438099 879 | test,noise/sound-bible/noise-sound-bible-0034.wav,20.21875,0.1358114778995514 880 | test,noise/sound-bible/noise-sound-bible-0035.wav,6.5045,0.04590274393558502 
881 | test,noise/sound-bible/noise-sound-bible-0036.wav,20.035,0.10746238380670547 882 | test,noise/sound-bible/noise-sound-bible-0037.wav,1.044,0.03590782731771469 883 | test,noise/sound-bible/noise-sound-bible-0038.wav,16.248,0.1472569853067398 884 | test,noise/sound-bible/noise-sound-bible-0039.wav,4.048,0.03187888488173485 885 | test,noise/sound-bible/noise-sound-bible-0040.wav,9.142,0.026945974677801132 886 | test,noise/sound-bible/noise-sound-bible-0041.wav,2.638,0.05780833214521408 887 | test,noise/sound-bible/noise-sound-bible-0042.wav,6.269,0.14264912903308868 888 | test,noise/sound-bible/noise-sound-bible-0043.wav,23.928,0.050584856420755386 889 | test,noise/sound-bible/noise-sound-bible-0044.wav,30.0408125,0.002178351627662778 890 | test,noise/sound-bible/noise-sound-bible-0045.wav,61.413875,0.08873486518859863 891 | test,noise/sound-bible/noise-sound-bible-0046.wav,0.8098125,0.15509864687919617 892 | test,noise/sound-bible/noise-sound-bible-0047.wav,6.164,0.09232277423143387 893 | test,noise/sound-bible/noise-sound-bible-0048.wav,62.641625,0.011549966409802437 894 | test,noise/sound-bible/noise-sound-bible-0049.wav,49.3453125,0.07298757880926132 895 | test,noise/sound-bible/noise-sound-bible-0050.wav,1.056,0.042787592858076096 896 | test,noise/sound-bible/noise-sound-bible-0051.wav,2.063,0.01880437508225441 897 | test,noise/sound-bible/noise-sound-bible-0052.wav,6.765,0.2076394259929657 898 | test,noise/sound-bible/noise-sound-bible-0053.wav,12.878,0.05402258038520813 899 | test,noise/sound-bible/noise-sound-bible-0054.wav,37.172,0.31475576758384705 900 | test,noise/sound-bible/noise-sound-bible-0055.wav,18.102,0.07953961193561554 901 | test,noise/sound-bible/noise-sound-bible-0056.wav,7.888,0.274869441986084 902 | test,noise/sound-bible/noise-sound-bible-0057.wav,4.1795625,0.0715896338224411 903 | test,noise/sound-bible/noise-sound-bible-0058.wav,1.332,0.018020672723650932 904 | test,noise/sound-bible/noise-sound-bible-0059.wav,49.528,0.051448073238134384 905 | test,noise/sound-bible/noise-sound-bible-0060.wav,45.139,0.03268921747803688 906 | test,noise/sound-bible/noise-sound-bible-0061.wav,19.464,0.00734285730868578 907 | test,noise/sound-bible/noise-sound-bible-0062.wav,15.908,0.06429439783096313 908 | test,noise/sound-bible/noise-sound-bible-0063.wav,14.3935,0.03714064508676529 909 | test,noise/sound-bible/noise-sound-bible-0064.wav,2.847,0.10039804875850677 910 | test,noise/sound-bible/noise-sound-bible-0065.wav,7.235,0.10588019341230392 911 | test,noise/sound-bible/noise-sound-bible-0066.wav,2.716,0.05088731274008751 912 | test,noise/sound-bible/noise-sound-bible-0067.wav,27.7420625,0.08898525685071945 913 | test,noise/sound-bible/noise-sound-bible-0068.wav,5.784,0.03145468607544899 914 | test,noise/sound-bible/noise-sound-bible-0069.wav,1.175,0.10087023675441742 915 | test,noise/sound-bible/noise-sound-bible-0070.wav,2.063,0.21097871661186218 916 | test,noise/sound-bible/noise-sound-bible-0071.wav,4.049,0.08047668635845184 917 | test,noise/sound-bible/noise-sound-bible-0072.wav,4.832,0.06273679435253143 918 | test,noise/sound-bible/noise-sound-bible-0073.wav,7.896,0.08007873594760895 919 | test,noise/sound-bible/noise-sound-bible-0074.wav,2.063,0.018543394282460213 920 | test,noise/sound-bible/noise-sound-bible-0075.wav,32.679,0.06352479755878448 921 | test,noise/sound-bible/noise-sound-bible-0076.wav,57.704,0.09347757697105408 922 | test,noise/sound-bible/noise-sound-bible-0077.wav,5.146,0.19708338379859924 923 | 
test,noise/sound-bible/noise-sound-bible-0078.wav,23.4056875,0.09098900854587555
924 | test,noise/sound-bible/noise-sound-bible-0079.wav,21.394,0.08347520977258682
925 | test,noise/sound-bible/noise-sound-bible-0080.wav,4.702,0.08633428066968918
926 | test,noise/sound-bible/noise-sound-bible-0081.wav,16.848,0.07527593523263931
927 | test,noise/sound-bible/noise-sound-bible-0082.wav,3.604,0.06656266748905182
928 | test,noise/sound-bible/noise-sound-bible-0083.wav,58.61875,0.10153720527887344
929 | test,noise/sound-bible/noise-sound-bible-0084.wav,8.045,0.0457281619310379
930 | test,noise/sound-bible/noise-sound-bible-0085.wav,2.063,0.12232034653425217
931 | test,noise/sound-bible/noise-sound-bible-0086.wav,0.552,0.0682542473077774
932 | 
--------------------------------------------------------------------------------
/code/exp_data.py:
--------------------------------------------------------------------------------
import os
import sys
import pathlib
from collections import namedtuple
from typing import List, Optional, Sequence, Tuple, Union, Callable

import json
import librosa
import numpy as np
import pandas as pd
import socket
import soundfile as sf
import torch
from asteroid.losses.sdr import singlesrc_neg_sisdr
from asteroid.losses.sdr import singlesrc_neg_snr
from numpy.random import Generator
from scipy.signal import convolve

from exp_utils import ExperimentError

ROOT_DIR = os.path.dirname(os.path.realpath(__file__))

example_duration: float = 4
sample_rate: int = 16000
example_length: int = int(sample_rate * example_duration)

_eps: float = 1e-8
_rng = np.random.default_rng(0)

Batch = namedtuple(
    'Batch', ('inputs','targets','pre_snrs','post_snrs'))
ContrastiveBatch = namedtuple(
    'ContrastiveBatch', ('inputs_1','targets_1','inputs_2','targets_2',
                         'labels','pre_snrs','post_snrs'))


def _make_2d(x: torch.Tensor):
    """Normalize shape of `x` to two dimensions: [batch, time]."""
    if isinstance(x, np.ndarray):
        x = torch.from_numpy(x)
    if x.ndim == 1:
        return x.reshape(1, -1)
    elif x.ndim == 3:
        return x.squeeze(1)
    else:
        if x.ndim != 2: raise ValueError('Could not force 2d.')
        return x


def _make_3d(x: torch.Tensor):
    """Normalize shape of `x` to three dimensions: [batch, n_chan, time]."""
    if isinstance(x, np.ndarray):
        x = torch.from_numpy(x)
    if x.ndim == 1:
        return x.reshape(1, 1, -1)
    elif x.ndim == 2:
        return x.unsqueeze(1)
    else:
        if x.ndim != 3: raise ValueError('Could not force 3d.')
        return x


def mix_signals(
    source: np.ndarray,
    noise: np.ndarray,
    snr_db: Union[float, np.ndarray]
) -> np.ndarray:
    """Mixes a source signal with a noise signal at a desired SNR.

    Args:
        source (np.ndarray): source signal
        noise (np.ndarray): noise signal
        snr_db (float or np.ndarray): desired mixture SNR in decibels
            (scales the noise)

    Returns:
        mixture (np.ndarray): mixture signal
    """
    energy_s = np.sum(source ** 2, axis=-1, keepdims=True)
    energy_n = np.sum(noise ** 2, axis=-1, keepdims=True)
    b = np.sqrt((energy_s / energy_n) * (10 ** (-snr_db / 10.)))
    return source + b * noise
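
# A quick sanity check of the SNR convention above (a minimal sketch, kept
# as a comment so that importing this module stays side-effect free): the
# noise gain is b = sqrt((E_s / E_n) * 10**(-snr_db / 10)), so the mixture
# `source + b * noise` has a source-to-noise energy ratio of snr_db dB.
#
#     s = _rng.standard_normal(example_length)
#     n = _rng.standard_normal(example_length)
#     m = mix_signals(s, n, snr_db=5.0)
#     snr = 10 * np.log10(np.sum(s ** 2) / np.sum((m - s) ** 2))
#     assert abs(snr - 5.0) < 1e-5
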
89 | """ 90 | return float(np.std(librosa.feature.rms(y=signal).reshape(-1))) 91 | 92 | 93 | def wav_read( 94 | filepath: Union[str, os.PathLike] 95 | ) -> Tuple[np.ndarray, float]: 96 | """Reads mono audio from WAV. 97 | """ 98 | y, sr = sf.read(filepath, dtype='float32', always_2d=True) 99 | if sr != sample_rate: 100 | raise IOError(f'Expected sample_rate={sample_rate}, got {sr}.') 101 | # always pick up the first channel 102 | y = np.array(y[..., 0]) 103 | return y, float(len(y) / sample_rate) 104 | 105 | 106 | def wav_write( 107 | filepath: Union[str, os.PathLike], 108 | array: np.ndarray 109 | ): 110 | sf.write(filepath, array, samplerate=sample_rate) 111 | return 112 | 113 | 114 | def wav_read_multiple( 115 | filepaths: Sequence[Union[str, os.PathLike]], 116 | concatenate: bool = False, 117 | randomly_offset: bool = True, 118 | seed: Optional[int] = None 119 | ) -> np.ndarray: 120 | """Loads multiple audio signals from file; may be batched or concatenated. 121 | """ 122 | rng = np.random.default_rng(seed) 123 | signals = [] 124 | collate_fn: Callable = np.concatenate if concatenate else np.stack 125 | for filepath in filepaths: 126 | s, duration = wav_read(filepath) 127 | if not concatenate: 128 | # pad shorter signals up to expected length 129 | if len(s) < example_length: 130 | lengths = [(0, 0)] * s.ndim 131 | lengths[-1] = (0, example_length - len(s)) 132 | s = np.pad(s, lengths, mode='constant') 133 | 134 | # randomly offset longer signals if desired 135 | offset: int = 0 136 | remainder: int = len(s) - example_length 137 | if randomly_offset and remainder > 0: 138 | offset = rng.integers(0, remainder) 139 | 140 | # trim exactly to the expected length 141 | s = s[offset:offset + example_length] 142 | signals.append(s) 143 | return collate_fn(signals, axis=0) 144 | 145 | 146 | def wav_sample( 147 | data: np.ndarray, 148 | num_clips: int, 149 | seed: Optional[int] = None 150 | ) -> np.ndarray: 151 | rng = np.random.default_rng(seed) 152 | start_indices = rng.integers(0, len(data) - example_length - 1, num_clips) 153 | signals = [data[i:i+example_length] for i in start_indices] 154 | return np.stack(signals, axis=0) 155 | 156 | def sisdr( 157 | estimate: torch.Tensor, 158 | target: torch.Tensor, 159 | reduction: Optional[str] = None 160 | ) -> torch.Tensor: 161 | """Calculate single source SI-SDR.""" 162 | return sdr(estimate, target, reduction=reduction, scale_invariant=True) 163 | 164 | 165 | def sisdr_improvement( 166 | estimate: torch.Tensor, 167 | target: torch.Tensor, 168 | mixture: torch.Tensor, 169 | reduction: Optional[str] = None 170 | ) -> torch.Tensor: 171 | """Calculate estimate to target SI-SDR improvement relative to mixture. 
172 | """ 173 | return sdr_improvement( 174 | estimate, target, mixture, reduction=reduction, scale_invariant=True) 175 | 176 | 177 | def sdr( 178 | estimate: torch.Tensor, 179 | target: torch.Tensor, 180 | scale_invariant: bool = False, 181 | reduction: Optional[str] = None 182 | ) -> torch.Tensor: 183 | """Calculate single source SDR.""" 184 | ml = min(estimate.shape[-1], target.shape[-1]) 185 | estimate = _make_2d(estimate)[..., :ml] 186 | target = _make_2d(target)[..., :ml] 187 | if scale_invariant: 188 | output = -1 * singlesrc_neg_sisdr(estimate, target) 189 | else: 190 | output = -1 * singlesrc_neg_snr(estimate, target) 191 | if reduction == 'mean': 192 | output = torch.mean(output) 193 | return output 194 | 195 | 196 | def sdr_improvement( 197 | estimate: torch.Tensor, 198 | target: torch.Tensor, 199 | mixture: torch.Tensor, 200 | reduction: Optional[str] = None, 201 | scale_invariant: bool = False 202 | ) -> torch.Tensor: 203 | """Calculate estimate to target SDR improvement relative to mixture. 204 | """ 205 | output = ( 206 | sdr(estimate, target, scale_invariant=scale_invariant) 207 | - sdr(mixture, target, scale_invariant=scale_invariant) 208 | ) 209 | if reduction == 'mean': 210 | output = torch.mean(output) 211 | return output 212 | 213 | 214 | def dataframe_librispeech( 215 | dataset_directory: Optional[Union[str, os.PathLike]] = None, 216 | omit_clipped: bool = False 217 | ) -> pd.DataFrame: 218 | """Creates a Pandas DataFrame with files from the LibriSpeech corpus. 219 | Root directory should mimic archive-extracted folder structure. 220 | Dataset may be downloaded at ``_. 221 | """ 222 | columns = [ 223 | 'subset_id', 224 | 'speaker_id', 225 | 'chapter_id', 226 | 'utterance_id', 227 | 'filepath', 228 | 'duration', 229 | 'sparsity' 230 | ] 231 | if dataset_directory is None: 232 | return pd.DataFrame(columns=columns) 233 | dataset_directory = pathlib.Path(dataset_directory) 234 | dataset_dataframe = pathlib.Path(ROOT_DIR).joinpath('datasets', 'librispeech.csv') 235 | if not dataset_directory.exists(): 236 | raise ValueError(f'{dataset_directory} does not exist.') 237 | valid_subsets = [ 238 | 'train-clean-100', 239 | 'train-clean-360', 240 | 'dev-clean', 241 | 'test-clean' 242 | ] 243 | if not dataset_dataframe.exists(): 244 | raise ValueError(f'{dataset_dataframe} does not exist') 245 | rows = [] 246 | for filepath in dataset_directory.rglob('*.wav'): 247 | try: 248 | subset_id = [_ for _ in valid_subsets if _ in str(filepath)][0] 249 | speaker_id, chapter_id, utterance_id = filepath.stem.split('-') 250 | except ValueError: 251 | continue 252 | y, duration = wav_read(filepath) 253 | sparsity = sparsity_index(y) 254 | rows.append((subset_id, speaker_id, chapter_id, 255 | utterance_id, str(filepath), duration, sparsity)) 256 | if not len(rows): 257 | raise ValueError(f'Could not find any .WAV files within ' 258 | f'{dataset_directory}.') 259 | df = pd.DataFrame(rows, columns=columns) 260 | df.to_csv(dataset_dataframe, 261 | header=columns, 262 | index=False, 263 | index_label=False) 264 | else: 265 | df = pd.read_csv(dataset_dataframe) 266 | 267 | dataset_directory = str(dataset_directory) 268 | df = df.sort_values('filepath', ascending=True).reset_index(drop=True) 269 | df['filepath'] = df['filepath'].apply(lambda f: os.path.join(dataset_directory, f)) 270 | 271 | if omit_clipped: 272 | # discard recordings from speakers who possess clipped recordings 273 | # (manually found using SoX, where 'volume adjustment' == 1.000) 274 | clipped_speakers = [ 275 | '101', '1069', 
275 |             '101', '1069', '1175', '118', '1290', '1379', '1456', '1552',
276 |             '1578', '1629', '1754', '1933', '1943', '1963', '198', '204',
277 |             '2094', '2113', '2149', '22', '2269', '2618', '2751', '307',
278 |             '3168', '323', '3294', '3374', '345', '3486', '3490', '3615',
279 |             '3738', '380', '4148', '446', '459', '4734', '481', '5002',
280 |             '5012', '5333', '549', '5561', '5588', '559', '5678', '5740',
281 |             '576', '593', '6295', '6673', '7139', '716', '7434', '7800',
282 |             '781', '8329', '8347', '882'
283 |         ]
284 |         df = df[~df['speaker_id'].isin(clipped_speakers)]
285 | 
286 |     # omit recordings which are smaller than an example
287 |     df = df.query('duration >= @example_duration')
288 | 
289 |     # organize by split
290 |     def assign_split_per_speaker(
291 |             sgroup,
292 |             duration_s_test: int = 30,
293 |             duration_s_validation: int = 30,
294 |             duration_s_train: int = 60,
295 |             duration_s_prevalidation: int = 30,
296 |     ):
297 |         # designate partition indices based on the nearest cumulative duration
298 |         sp_id = set(sgroup['speaker_id']).pop()
299 |         cs = sgroup['duration'].cumsum()
300 |         offset = min(sgroup.index)
301 |         _d = duration_s_test
302 |         split_te = (cs - _d).abs().idxmin() - offset
303 |         _d += duration_s_validation
304 |         split_vl = (cs - _d).abs().idxmin() - offset
305 |         if split_vl == split_te: split_vl += 1
306 |         _d += duration_s_train
307 |         split_tr = (cs - _d).abs().idxmin() - offset
308 |         if split_tr == split_vl: split_tr += 1
309 |         _d += duration_s_prevalidation
310 |         split_pvl = (cs - _d).abs().idxmin() - offset
311 |         if split_pvl == split_tr: split_pvl += 1
312 | 
313 |         assert (split_te != split_vl), (sp_id, split_te, split_vl)
314 |         assert (split_vl != split_tr), (sp_id, split_vl, split_tr)
315 |         assert (split_tr != split_pvl), (sp_id, split_tr, split_pvl)
316 | 
317 |         sgroup = sgroup.reset_index(drop=True)
318 | 
319 |         # assign split
320 |         for i in range(0, split_te):
321 |             sgroup.at[i, 'split'] = 'test'
322 |         for i in range(split_te, split_vl):
323 |             sgroup.at[i, 'split'] = 'val'
324 |         for i in range(split_vl, split_tr):
325 |             sgroup.at[i, 'split'] = 'train'
326 |         for i in range(split_tr, split_pvl):
327 |             sgroup.at[i, 'split'] = 'preval'
328 | 
329 |         # return the modified speaker group
330 |         return sgroup
331 | 
332 |     df = df.assign(split='pretrain').sort_values(['speaker_id', 'duration'])
333 |     g = df.reset_index(drop=True).groupby('speaker_id')
334 |     df = g.apply(assign_split_per_speaker)
335 | 
336 |     # shuffle the recordings
337 |     df = df.sample(frac=1, random_state=0)
338 | 
339 |     # organize by subset and split
340 |     df['subset_id'] = pd.Categorical(df['subset_id'], valid_subsets)
341 |     df['split'] = pd.Categorical(df['split'], ['pretrain', 'preval', 'train',
342 |                                                'val', 'test'])
343 |     df = df.sort_values(['subset_id', 'split'])
344 | 
345 |     # ensure that all the audio files exist
346 |     if not all(os.path.isfile(f) for f in df.filepath):
347 |         raise ValueError(f'Audio files missing, check {dataset_directory}.')
348 | 
349 |     # reindex and name the dataframe
350 |     df = df[['filepath', 'subset_id', 'speaker_id',
351 |              'split', 'duration', 'sparsity']]
352 |     df = df.reset_index(drop=True)
353 |     df.index.name = 'LIBRISPEECH'
354 |     return df
355 | 
356 | 
357 | def dataframe_demand(
358 |         dataset_directory: Optional[Union[str, os.PathLike]] = None
359 | ) -> pd.DataFrame:
360 |     """Creates a Pandas DataFrame with files from the DEMAND corpus.
361 |     Root directory should mimic archive-extracted folder structure.
362 |     Dataset may be downloaded at ``_.
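    Only the first channel (``ch01``) of each multi-channel recording is
    indexed. Example (illustrative; the directory path is hypothetical)::

        >>> df = dataframe_demand('/data/DEMAND')   # doctest: +SKIP
        >>> list(df.columns)                        # doctest: +SKIP
        ['filepath', 'duration', 'sparsity']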
363 | """ 364 | columns = [ 365 | 'category_id', 366 | 'location_id', 367 | 'filepath', 368 | 'duration', 369 | 'sparsity' 370 | ] 371 | if dataset_directory is None: 372 | return pd.DataFrame(columns=columns) 373 | dataset_directory = pathlib.Path(dataset_directory) 374 | dataset_dataframe = pathlib.Path(ROOT_DIR).joinpath('datasets', 'demand.csv') 375 | if not dataset_directory.exists(): 376 | raise ValueError(f'{dataset_directory} does not exist.') 377 | valid_categories = [ 378 | 'domestic', 379 | 'nature', 380 | 'office', 381 | 'public', 382 | 'street', 383 | 'transportation' 384 | ] 385 | valid_locations = [ 386 | 'kitchen', 387 | 'washing', 388 | 'park', 389 | 'hallway', 390 | 'office', 391 | 'resto', 392 | 'psquare', 393 | 'bus', 394 | 'metro', 395 | 'living', 396 | 'field', 397 | 'river', 398 | 'meeting', 399 | 'cafeter', 400 | 'station', 401 | 'traffic', 402 | 'car' 403 | ] 404 | if not dataset_dataframe.exists(): 405 | raise ValueError(f'{dataset_dataframe} does not exist') 406 | rows = [] 407 | for filepath in sorted(dataset_directory.rglob('*.wav')): 408 | if 'ch01' not in filepath.stem: 409 | continue 410 | category_id = [_ for _ in valid_categories if 411 | _[0].upper() == filepath.parent.stem[0].upper()][0] 412 | location_id = [_ for _ in valid_locations if 413 | filepath.parent.stem[1:].upper() in _.upper()][0] 414 | y, duration = wav_read(filepath) 415 | sparsity = sparsity_index(y) 416 | rows.append(( 417 | category_id, 418 | location_id, 419 | str(filepath), 420 | duration, 421 | sparsity)) 422 | if not len(rows): 423 | raise ValueError(f'Could not find any .WAV files within ' 424 | f'{dataset_directory}.') 425 | df = pd.DataFrame(rows, columns=columns) 426 | df.to_csv(dataset_dataframe, 427 | header=columns, 428 | index=False, 429 | index_label=False) 430 | else: 431 | df = pd.read_csv(dataset_dataframe) 432 | 433 | dataset_directory = str(dataset_directory) 434 | df = df.sort_values('filepath', ascending=True).reset_index(drop=True) 435 | df['filepath'] = df['filepath'].apply(lambda f: os.path.join(dataset_directory, f)) 436 | 437 | # shuffle the recordings 438 | df = df.sample(frac=1, random_state=0) 439 | 440 | # ensure that all the audio files exist 441 | if not all([f for f in df.filepath if os.path.isfile(f)]): 442 | raise ValueError(f'Audio files missing, check {dataset_directory}.') 443 | 444 | # reindex and name the dataframe 445 | df = df[['filepath', 'duration', 'sparsity']] 446 | df = df.reset_index(drop=True) 447 | df.index.name = 'DEMAND' 448 | return df 449 | 450 | 451 | def dataframe_fsd50k( 452 | dataset_directory: Optional[Union[str, os.PathLike]] = None, 453 | ) -> pd.DataFrame: 454 | """Creates a Pandas DataFrame with files from the FSD50K corpus. 455 | Root directory should mimic archive-extracted folder structure. 456 | Dataset may be downloaded at ``_. 
457 | """ 458 | columns = [ 459 | 'fname', 460 | 'labels', 461 | 'mids', 462 | 'split', 463 | 'filepath', 464 | 'duration', 465 | 'sparsity' 466 | ] 467 | if dataset_directory is None: 468 | return pd.DataFrame(columns=columns) 469 | dataset_directory = pathlib.Path(dataset_directory) 470 | dataset_dataframe = pathlib.Path(ROOT_DIR).joinpath('datasets', 'fsd50k.csv') 471 | if not dataset_directory.exists(): 472 | raise ValueError(f'{dataset_directory} does not exist.') 473 | if not dataset_dataframe.exists(): 474 | raise ValueError(f'{dataset_dataframe} does not exist') 475 | 476 | # merge separate dev and eval sets into one big table 477 | df1 = pd.read_csv(next(dataset_directory.rglob('dev.csv'))) 478 | df2 = pd.read_csv(next(dataset_directory.rglob('eval.csv'))) 479 | df2['split'] = 'test' 480 | df = pd.concat([df1, df2]) 481 | 482 | durations, filepaths, sparsities = [], [], [] 483 | for row in df.itertuples(): 484 | subdir = ('FSD50K.eval_audio' if row.split == 'test' 485 | else 'FSD50K.dev_audio') 486 | filepath = dataset_directory.joinpath(subdir, str(row.fname) + '.wav') 487 | if not filepath.exists(): 488 | raise ValueError(f'{filepath} does not exist.') 489 | y, duration = wav_read(filepath) 490 | sparsity = sparsity_index(y) 491 | durations.append(duration) 492 | sparsities.append(sparsity) 493 | filepaths.append(filepath) 494 | df['filepath'] = filepaths 495 | df['duration'] = durations 496 | df['sparsity'] = sparsities 497 | if not len(filepaths): 498 | raise ValueError(f'Could not find any .WAV files within ' 499 | f'{dataset_directory}.') 500 | df.to_csv(dataset_dataframe, 501 | header=columns, 502 | index=False, 503 | index_label=False) 504 | else: 505 | df = pd.read_csv(dataset_dataframe) 506 | 507 | dataset_directory = str(dataset_directory) 508 | df = df.sort_values('filepath', ascending=True).reset_index(drop=True) 509 | df['filepath'] = df['filepath'].apply(lambda f: os.path.join(dataset_directory, f)) 510 | 511 | # omit sounds labeled as containing speech or music 512 | df['labels'] = df['labels'].apply(str.lower) 513 | df = df[~df['labels'].str.contains('speech')] 514 | df = df[~df['labels'].str.contains('music')] 515 | 516 | # omit recordings which are smaller than an example 517 | df = df.query('duration >= @example_duration') 518 | 519 | # shuffle the recordings 520 | df = df.sample(frac=1, random_state=0) 521 | 522 | # organize by split 523 | df['split'] = pd.Categorical(df['split'], ['train', 'val', 'test']) 524 | df = df.sort_values('split') 525 | 526 | # ensure that all the audio files exist 527 | if not all([f for f in df.filepath if os.path.isfile(f)]): 528 | raise ValueError(f'Audio files missing, check {dataset_directory}.') 529 | 530 | # reindex and name the dataframe 531 | df = df[['filepath', 'split', 'duration', 'labels', 'sparsity']] 532 | df = df.reset_index(drop=True) 533 | df.index.name = 'FSD50K' 534 | return df 535 | 536 | 537 | def dataframe_musan( 538 | dataset_directory: Optional[Union[str, os.PathLike]] = None, 539 | ) -> pd.DataFrame: 540 | """Creates a Pandas DataFrame with files from the MUSAN corpus. 541 | Root directory should mimic archive-extracted folder structure. 542 | Dataset may be downloaded at ``_. 
543 | """ 544 | columns = [ 545 | 'split', 546 | 'filepath', 547 | 'duration', 548 | 'sparsity' 549 | ] 550 | if dataset_directory is None: 551 | return pd.DataFrame(columns=columns) 552 | dataset_directory = pathlib.Path(dataset_directory) 553 | dataset_dataframe = pathlib.Path(ROOT_DIR).joinpath('datasets', 'musan.csv') 554 | if not dataset_directory.exists(): 555 | raise ValueError(f'{dataset_directory} does not exist.') 556 | if not dataset_dataframe.exists(): 557 | raise ValueError(f'{dataset_dataframe} does not exist') 558 | rows = [] 559 | for filepath in sorted(dataset_directory.rglob('*.wav')): 560 | is_train = bool('FREE-SOUND' in str(filepath).upper()) 561 | is_test = bool('SOUND-BIBLE' in str(filepath).upper()) 562 | if not (is_train or is_test): 563 | continue 564 | split_id = 'train' if is_train else 'test' 565 | y, duration = wav_read(filepath) 566 | sparsity = sparsity_index(y) 567 | rows.append((split_id, str(filepath), duration, sparsity)) 568 | if not len(rows): 569 | raise ValueError(f'Could not find any .WAV files within ' 570 | f'{dataset_directory}.') 571 | df = pd.DataFrame(rows, columns=columns) 572 | df.to_csv(dataset_dataframe, 573 | header=columns, 574 | index=False, 575 | index_label=False) 576 | else: 577 | df = pd.read_csv(dataset_dataframe) 578 | 579 | dataset_directory = str(dataset_directory) 580 | df = df.sort_values('filepath', ascending=True).reset_index(drop=True) 581 | df['filepath'] = df['filepath'].apply(lambda f: os.path.join(dataset_directory, f)) 582 | 583 | # omit recordings which are smaller than an example 584 | df = df.query('duration >= @example_duration') 585 | 586 | # set aside the last sixty training signals for validation 587 | val_indices = df.query('split == "train"').iloc[-60:].index 588 | df.loc[val_indices, 'split'] = 'val' 589 | 590 | # organize by subset and split 591 | df['split'] = pd.Categorical(df['split'], ['train', 'val', 'test']) 592 | df = df.sort_values(['split']) 593 | 594 | # shuffle the recordings 595 | df = df.sample(frac=1, random_state=0) 596 | 597 | # ensure that all the audio files exist 598 | if not all([f for f in df.filepath if os.path.isfile(f)]): 599 | raise ValueError(f'Audio files missing, check {dataset_directory}.') 600 | 601 | # reindex and name the dataframe 602 | df = df[['filepath', 'split', 'duration', 'sparsity']] 603 | df = df.reset_index(drop=True) 604 | df.index.name = 'MUSAN' 605 | return df 606 | 607 | 608 | class Mixtures: 609 | """Dataset for noisy speech signals. 
610 | """ 611 | 612 | def __init__( 613 | self, 614 | speaker_id_or_ids: Union[int, Sequence[int]], 615 | folder_librispeech: Optional[str] = None, 616 | folder_fsd50k: Optional[str] = None, 617 | folder_musan: Optional[str] = None, 618 | split_speech: Optional[str] = 'all', 619 | split_premixture: Optional[str] = 'train', 620 | split_mixture: Optional[str] = 'train', 621 | split_reverb: Optional[str] = None, 622 | frac_speech: Optional[float] = 1., 623 | snr_premixture: Optional[Union[float, Tuple[float, float]]] = None, 624 | snr_mixture: Optional[Union[float, Tuple[float, float]]] = None, 625 | dataset_duration: Union[int, float] = 0 626 | ): 627 | # verify speaker ID(s) 628 | if isinstance(speaker_id_or_ids, int): 629 | speaker_id_or_ids = [speaker_id_or_ids] 630 | elif not isinstance(speaker_id_or_ids, (list, set)): 631 | raise ValueError('Expected one or a sequence of speaker IDs.') 632 | if len(speaker_id_or_ids) < 1: 633 | raise ValueError('Expected one or more speaker IDs.') 634 | self.speaker_ids = speaker_id_or_ids 635 | self.frac_speech = frac_speech 636 | self.speaker_ids_repr = repr(self.speaker_ids) 637 | 638 | # missing pairs of arguments 639 | if not split_premixture: 640 | if snr_premixture is not None: 641 | raise ValueError('Missing argument `split_premixture`.') 642 | if not split_mixture: 643 | if snr_mixture is not None: 644 | raise ValueError('Missing argument `split_mixture`.') 645 | 646 | # unpack mixture SNR values 647 | if isinstance(snr_premixture, tuple): 648 | snr_premixture_min = float(min(snr_premixture)) 649 | snr_premixture_max = float(max(snr_premixture)) 650 | elif isinstance(snr_premixture, (float, int)): 651 | snr_premixture_min = float(snr_premixture) 652 | snr_premixture_max = float(snr_premixture) 653 | elif snr_premixture is None: 654 | snr_premixture_min = None 655 | snr_premixture_max = None 656 | else: 657 | raise ValueError('Expected `snr_premixture` to be a float type or ' 658 | 'a tuple of floats.') 659 | if isinstance(snr_mixture, tuple): 660 | snr_mixture_min = float(min(snr_mixture)) 661 | snr_mixture_max = float(max(snr_mixture)) 662 | elif isinstance(snr_mixture, (float, int)): 663 | snr_mixture_min = float(snr_mixture) 664 | snr_mixture_max = float(snr_mixture) 665 | elif snr_mixture is None: 666 | snr_mixture_min = None 667 | snr_mixture_max = None 668 | else: 669 | raise ValueError('Expected `snr_mixture` to be a float type or ' 670 | 'a tuple of floats.') 671 | self.snr_premixture_min = snr_premixture_min 672 | self.snr_premixture_max = snr_premixture_max 673 | self.snr_mixture_min = snr_mixture_min 674 | self.snr_mixture_max = snr_mixture_max 675 | 676 | # verify corpus partitions 677 | if not (split_speech in 678 | ('all', 'pretrain', 'preval', 'train', 'val', 'test')): 679 | raise ValueError('Expected `split_speech` to be either "all", ' 680 | '"pretrain", "preval", "train", "val", or "test".') 681 | if snr_premixture is not None: 682 | if not (split_premixture in ('train', 'val', 'test')): 683 | raise ValueError('Expected `split_premixture` to be either ' 684 | '"train", "val", or "test".') 685 | if snr_mixture is not None: 686 | if not (split_mixture in ('train', 'val', 'test')): 687 | raise ValueError('Expected `split_mixture` to be either ' 688 | '"train", "val", or "test".') 689 | if split_reverb is not None: 690 | if not (split_reverb in ('train', 'val', 'test')): 691 | raise ValueError('Expected `split_reverb` to be either ' 692 | '"train", "val", or "test".') 693 | self.split_speech = split_speech 694 | 
self.split_premixture = split_premixture or '' 695 | self.split_mixture = split_mixture or '' 696 | self.split_reverb = split_reverb or '' 697 | 698 | # verify dataset duration 699 | if not isinstance(dataset_duration, (int, float, type(None))): 700 | raise ValueError('Expected `dataset_duration` to be a number.') 701 | self.dataset_duration = int(dataset_duration or 0) 702 | 703 | self.index = 0 704 | self.example_duration = example_duration 705 | 706 | # instantiate corpora 707 | self.instantiate_corpora( 708 | folder_librispeech, 709 | folder_fsd50k, 710 | folder_musan, 711 | ) 712 | 713 | # calculate maximum random offset for all utterances 714 | max_offset_func = lambda d: d.assign(max_offset=( 715 | sample_rate * d['duration'] - example_length)).astype({ 716 | 'max_offset': int}) 717 | self.corpus_s = max_offset_func(self.corpus_s) 718 | self.corpus_m = max_offset_func(self.corpus_m) 719 | self.corpus_n = max_offset_func(self.corpus_n) 720 | 721 | # keep track of the number of utterances, premixture noises, 722 | # and injected noises 723 | self.len_s = len(self.corpus_s) 724 | self.len_m = len(self.corpus_m) 725 | self.len_n = len(self.corpus_n) 726 | self.len_r = len(self.corpus_r) 727 | if self.len_s < 1: 728 | raise ValueError('Invalid speaker_id') 729 | 730 | # if a dataset duration is provided, 731 | # truncate the audio data to the expected size 732 | self.speech_data = np.array([]) 733 | if self.dataset_duration: 734 | self.speech_data = wav_read_multiple( 735 | self.corpus_s.filepath, concatenate=True) 736 | self.speech_data = self.speech_data[:( 737 | self.dataset_duration * sample_rate)] 738 | 739 | # define flags 740 | self.is_personalized = bool(len(self.speaker_ids) == 1) 741 | self.add_premixture_noise = bool( 742 | (snr_premixture is not None) and (self.len_m > 0)) 743 | self.add_noise = bool( 744 | (snr_mixture is not None) and (self.len_n > 0)) 745 | self.add_reverb = bool(self.len_r > 0) 746 | 747 | if not self.is_personalized and self.add_premixture_noise: 748 | raise ExperimentError('Non-personalized dataset contains ' 749 | 'premixture noise.') 750 | 751 | if self.dataset_duration and self.add_premixture_noise: 752 | raise ExperimentError('Fine-tuning dataset contains ' 753 | 'premixture noise.') 754 | 755 | def instantiate_corpora(self, folder_librispeech, folder_fsd50k, folder_musan): 756 | 757 | self.corpus_s = dataframe_librispeech(folder_librispeech).query( 758 | f'speaker_id in {self.speaker_ids}') 759 | if self.split_speech != 'all': 760 | self.corpus_s = self.corpus_s.query( 761 | f'split == "{self.split_speech}"') 762 | if 0 < self.frac_speech < 1: 763 | self.corpus_s = self.corpus_s.sample( 764 | frac=self.frac_speech, random_state=0) 765 | print('Length of subsampled dataset:', len(self.corpus_s)) 766 | 767 | self.corpus_m = dataframe_fsd50k(folder_fsd50k).query( 768 | f'split == "{self.split_premixture}"') 769 | 770 | self.corpus_n = dataframe_musan(folder_musan).query( 771 | f'split == "{self.split_mixture}"') 772 | 773 | self.corpus_r = pd.DataFrame() # disable support for reverb 774 | # self.corpus_r = df_irsurvey.query( 775 | # f'split == "{self.split_reverb}"') 776 | return 777 | 778 | def __dict__(self): 779 | return { 780 | 'flags': { 781 | 'is_personalized': self.is_personalized, 782 | 'add_premixture_noise': self.add_premixture_noise, 783 | 'add_noise': self.add_noise, 784 | }, 785 | 'speaker_ids': self.speaker_ids_repr, 786 | 'snr_premixture_min': self.snr_premixture_min, 787 | 'snr_premixture_max': self.snr_premixture_max, 788 | 
'snr_mixture_min': self.snr_mixture_min, 789 | 'snr_mixture_max': self.snr_mixture_max, 790 | 'split_speech': self.split_speech, 791 | 'split_premixture': self.split_premixture, 792 | 'split_mixture': self.split_mixture, 793 | 'dataset_duration': self.dataset_duration 794 | } 795 | 796 | def __repr__(self): 797 | return json.dumps(self.__dict__(), indent=2, sort_keys=True) 798 | 799 | def __call__(self, batch_size: int, seed: Optional[int] = None): 800 | 801 | if batch_size < 1: 802 | raise ValueError('batch_size must be at least 1.') 803 | 804 | if seed is None: self.index += 1 805 | tmp_index: int = 0 if seed is not None else self.index 806 | tmp_rng: Generator = np.random.default_rng(tmp_index) 807 | 808 | indices = np.arange(batch_size * tmp_index, 809 | batch_size * (tmp_index + 1)) 810 | s_filepaths = (list(self.corpus_s.filepath.iloc[indices % self.len_s]) 811 | if self.len_s else []) 812 | m_filepaths = (list(self.corpus_m.filepath.iloc[indices % self.len_m]) 813 | if self.len_m else []) 814 | n_filepaths = (list(self.corpus_n.filepath.iloc[indices % self.len_n]) 815 | if self.len_n else []) 816 | r_filepaths = (list(self.corpus_r.filepath.iloc[indices % self.len_r]) 817 | if self.len_r else []) 818 | 819 | if self.speech_data.size > 0: 820 | s = wav_sample(self.speech_data, batch_size, seed=seed) 821 | else: 822 | s = wav_read_multiple(s_filepaths, seed=seed) 823 | x = p = s 824 | 825 | pre_snrs = np.array([]) 826 | if self.add_premixture_noise: 827 | m = wav_read_multiple(m_filepaths, seed=seed) 828 | pre_snrs = tmp_rng.uniform( 829 | self.snr_premixture_min, self.snr_premixture_max, 830 | (batch_size, 1)) 831 | x = p = mix_signals(s, m, pre_snrs) 832 | 833 | if self.add_reverb: 834 | r = wav_read_multiple(r_filepaths, randomly_offset=False, seed=seed) 835 | p_rev = np.empty_like(p) 836 | p_len = p.shape[-1] 837 | for i, filt in enumerate(r): 838 | p_rev[i] = convolve(p[i], filt, mode='full')[:p_len] 839 | x = p = p_rev 840 | 841 | post_snrs = np.array([]) 842 | if self.add_noise: 843 | n = wav_read_multiple(n_filepaths, seed=seed) 844 | post_snrs = tmp_rng.uniform( 845 | self.snr_mixture_min, self.snr_mixture_max, 846 | (batch_size, 1)) 847 | x = mix_signals(p, n, post_snrs) 848 | 849 | scale_factor = float(np.abs(x).max() + _eps) 850 | return Batch( 851 | inputs=torch.cuda.FloatTensor(x) / scale_factor, 852 | targets=torch.cuda.FloatTensor(p) / scale_factor, 853 | pre_snrs=torch.cuda.FloatTensor(pre_snrs), 854 | post_snrs=torch.cuda.FloatTensor(post_snrs) 855 | ) 856 | 857 | 858 | class ContrastiveMixtures(Mixtures): 859 | 860 | def __call__( 861 | self, 862 | batch_size: int, 863 | ratio_positive: float = 0.5, 864 | seed: Optional[int] = None 865 | ): 866 | if not (0 <= ratio_positive <= 1): 867 | raise ValueError('ratio_positive should be between 0 and 1.') 868 | if batch_size < 2: 869 | raise ValueError('batch_size must be at least 2.') 870 | if batch_size % 2: 871 | raise ValueError('batch_size must be an even number.') 872 | 873 | if seed is None: self.index += 1 874 | tmp_index: int = 0 if seed is not None else self.index 875 | tmp_rng: Generator = np.random.default_rng(tmp_index) 876 | 877 | indices = np.arange(batch_size * tmp_index, 878 | batch_size * (tmp_index + 1)) 879 | s_filepaths = (list(self.corpus_s.filepath.iloc[indices % self.len_s]) 880 | if self.len_s else []) 881 | m_filepaths = (list(self.corpus_m.filepath.iloc[indices % self.len_m]) 882 | if self.len_m else []) 883 | n_filepaths = (list(self.corpus_n.filepath.iloc[indices % self.len_n]) 884 | if 
self.len_n else [])
885 |         r_filepaths = (list(self.corpus_r.filepath.iloc[indices % self.len_r])
886 |                        if self.len_r else [])
887 | 
888 |         ordering = tmp_rng.permutation(batch_size//2)
889 |         num_positive = int(batch_size//2 * ratio_positive)
890 |         num_negative = batch_size//2 - num_positive
891 |         labels = np.array([1]*num_positive + [0]*num_negative)
892 | 
893 |         bx_1, bx_2, bp_1, bp_2, bs_1, bs_2 = [], [], [], [], [], []
894 |         bpre_snrs, bpost_snrs = [], []
895 | 
896 |         # generate pairs
897 |         for i in range(0, batch_size, 2):
898 | 
899 |             is_positive = bool(i/2 < num_positive)
900 | 
901 |             if self.speech_data.size > 0:
902 |                 if is_positive:
903 |                     s_1 = s_2 = wav_sample(self.speech_data, 1, seed=seed)
904 |                 else:
905 |                     s_1, s_2 = wav_sample(self.speech_data, 2, seed=seed)
906 |             else:
907 |                 if is_positive:
908 |                     s_1 = s_2 = wav_read_multiple([s_filepaths[i]], seed=seed)
909 |                 else:
910 |                     s_1, s_2 = wav_read_multiple(s_filepaths[i:i+2], seed=seed)
911 | 
912 |             s_1, s_2 = s_1.reshape(-1), s_2.reshape(-1)
913 | 
914 |             p_1, p_2 = s_1, s_2
915 |             pre_snr = [None, None]
916 |             if self.add_premixture_noise:
917 |                 if is_positive:
918 |                     m_1 = m_2 = wav_read_multiple([m_filepaths[i]], seed=seed)
919 |                     pre_snr = [tmp_rng.uniform(
920 |                         self.snr_premixture_min, self.snr_premixture_max)] * 2
921 |                 else:
922 |                     m_1, m_2 = wav_read_multiple(m_filepaths[i:i+2], seed=seed)
923 |                     pre_snr = tmp_rng.uniform(
924 |                         self.snr_premixture_min, self.snr_premixture_max, 2)
925 |                 m_1, m_2 = m_1.reshape(-1), m_2.reshape(-1)
926 |                 p_1 = mix_signals(s_1, m_1, pre_snr[0])
927 |                 p_2 = mix_signals(s_2, m_2, pre_snr[1])
928 | 
929 |             if self.add_reverb:
930 |                 if is_positive:
931 |                     r_1 = r_2 = wav_read_multiple([r_filepaths[i]], seed=seed)
932 |                 else:
933 |                     r_1, r_2 = wav_read_multiple(r_filepaths[i:i+2], seed=seed)
934 |                 r_1, r_2 = r_1.reshape(-1), r_2.reshape(-1)
935 |                 p_len = p_1.shape[-1]
936 |                 p_1 = convolve(p_1, r_1, mode='full')[:p_len]
937 |                 p_2 = convolve(p_2, r_2, mode='full')[:p_len]
938 | 
939 |             x_1, x_2 = p_1, p_2
940 |             post_snr = [None, None]
941 |             if self.add_noise:
942 |                 if not is_positive:
943 |                     n_1 = n_2 = wav_read_multiple([n_filepaths[i]], seed=seed)
944 |                     post_snr = [tmp_rng.uniform(
945 |                         self.snr_mixture_min, self.snr_mixture_max)] * 2
946 |                 else:
947 |                     n_1, n_2 = wav_read_multiple(n_filepaths[i:i+2], seed=seed)
948 |                     post_snr = tmp_rng.uniform(
949 |                         self.snr_mixture_min, self.snr_mixture_max, 2)
950 |                 n_1, n_2 = n_1.reshape(-1), n_2.reshape(-1)
951 |                 x_1 = mix_signals(p_1, n_1, post_snr[0])
952 |                 x_2 = mix_signals(p_2, n_2, post_snr[1])
953 | 
954 |             bp_1.append(p_1)
955 |             bp_2.append(p_2)
956 |             bx_1.append(x_1)
957 |             bx_2.append(x_2)
958 |             if pre_snr[0] is not None:  # a 0 dB SNR must not be dropped
959 |                 bpre_snrs.append(pre_snr)
960 |             if post_snr[0] is not None:
961 |                 bpost_snrs.append(post_snr)
962 | 
963 |         # stack and shuffle all the data in the right order
964 |         bp_1 = np.stack(bp_1)[ordering]
965 |         bp_2 = np.stack(bp_2)[ordering]
966 |         bx_1 = np.stack(bx_1)[ordering]
967 |         bx_2 = np.stack(bx_2)[ordering]
968 |         if bpre_snrs:
969 |             bpre_snrs = np.stack(bpre_snrs)[ordering]
970 |         if bpost_snrs:
971 |             bpost_snrs = np.stack(bpost_snrs)[ordering]
972 |         labels = labels[ordering]
973 | 
974 |         scale_factor_1 = float(np.abs(bx_1).max() + _eps)
975 |         scale_factor_2 = float(np.abs(bx_2).max() + _eps)
976 |         scale_factor = max([scale_factor_1, scale_factor_2])
977 |         return ContrastiveBatch(
978 |             inputs_1=torch.cuda.FloatTensor(bx_1) / scale_factor,
979 |             inputs_2=torch.cuda.FloatTensor(bx_2) / scale_factor,
980 |             targets_1=torch.cuda.FloatTensor(bp_1) / scale_factor,
981 |
targets_2=torch.cuda.FloatTensor(bp_2) / scale_factor, 982 | labels=torch.cuda.BoolTensor(labels), 983 | pre_snrs=torch.cuda.FloatTensor(bpre_snrs), 984 | post_snrs=torch.cuda.FloatTensor(bpost_snrs) 985 | ) 986 | -------------------------------------------------------------------------------- /code/exp_models.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import pathlib 4 | from contextlib import suppress 5 | from typing import Any, Optional, Union, Sequence, Tuple, Dict, Callable 6 | 7 | import asteroid.models 8 | import torch 9 | import torch.nn.functional as tf 10 | from torch.nn.modules.loss import _Loss 11 | 12 | from exp_data import Mixtures, sample_rate, sisdr_improvement, sdr, sisdr, wav_write 13 | from exp_utils import make_2d, make_3d, pad_x_to_y, shape_reconstructed 14 | 15 | 16 | _fft_size: int = 1024 17 | _hop_size: int = 256 18 | _eps: float = 1e-8 19 | _recover_noise: bool = False 20 | _window: torch.Tensor = torch.hann_window(_fft_size) 21 | try: 22 | from pesq import pesq 23 | except ImportError: 24 | pesq = lambda *a, **k: 0 25 | print('Module `pesq` not installed, this metric will be a no-op.') 26 | try: 27 | from pystoi import stoi 28 | except ImportError: 29 | stoi = lambda *a, **k: 0 30 | print('Module `pystoi` not installed, this metric will be a no-op.') 31 | 32 | 33 | def _forward_single_mask(self, waveform: torch.Tensor): 34 | """Custom forward function to do single-mask two-source estimation. 35 | """ 36 | # Remember shape to shape reconstruction 37 | shape = torch.tensor(waveform.shape) 38 | 39 | # Reshape to (batch, n_mix, time) 40 | waveform = make_3d(waveform) 41 | 42 | # Real forward 43 | tf_rep = self.forward_encoder(waveform) 44 | est_masks = self.forward_masker(tf_rep) 45 | est_masks = est_masks.repeat(1, 2, 1, 1) 46 | est_masks[:, 1] = 1 - est_masks[:, 1] 47 | masked_tf_rep = self.apply_masks(tf_rep, est_masks) 48 | decoded = self.forward_decoder(masked_tf_rep) 49 | 50 | reconstructed = pad_x_to_y(decoded, waveform) 51 | return shape_reconstructed(reconstructed, shape) 52 | 53 | 54 | def _logistic(v, beta: float = 1., offset: float = 0.): 55 | return 1 / (1 + torch.exp(-beta * (v - offset))) 56 | 57 | 58 | def _stft(waveform: torch.Tensor): 59 | """Calculates the Short-time Fourier transform (STFT).""" 60 | 61 | # perform the short-time Fourier transform 62 | spectrogram = torch.stft( 63 | waveform, _fft_size, _hop_size, 64 | window=_window.to(waveform.device), 65 | return_complex=False 66 | ) 67 | 68 | # swap seq_len & feature_dim of the spectrogram (for RNN processing) 69 | spectrogram = spectrogram.permute(0, 2, 1, 3) 70 | 71 | # calculate the magnitude spectrogram 72 | magnitude_spectrogram = torch.sqrt(spectrogram[..., 0] ** 2 + 73 | spectrogram[..., 1] ** 2) 74 | 75 | return spectrogram, magnitude_spectrogram 76 | 77 | 78 | def _istft(spectrogram: torch.Tensor, mask: Optional[torch.Tensor] = None): 79 | """Calculates the inverse Short-time Fourier transform (ISTFT).""" 80 | 81 | # apply a time-frequency mask if provided 82 | if mask is not None: 83 | spectrogram[..., 0] *= mask 84 | spectrogram[..., 1] *= mask 85 | 86 | # swap seq_len & feature_dim of the spectrogram (undo RNN processing) 87 | spectrogram = spectrogram.permute(0, 2, 1, 3) 88 | 89 | # perform the inverse short-time Fourier transform 90 | waveform = torch.istft( 91 | spectrogram, _fft_size, _hop_size, 92 | window=_window.to(spectrogram.device), 93 | return_complex=False 94 | ) 95 | 96 | return waveform 97 | 98 | 
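# A minimal round-trip sketch (illustrative only): a spectrogram produced by
# `_stft` can be masked and resynthesized by `_istft`, which is how `GRUNet`
# below performs enhancement. An all-ones mask leaves the magnitudes
# untouched, so the output approximates the input up to framing at the edges:
#
#     waveform = torch.randn(1, sample_rate)
#     spectrogram, magnitude = _stft(waveform)
#     mask = torch.ones_like(magnitude)
#     resynthesized = _istft(spectrogram, mask=mask)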
99 | class ConvTasNet(asteroid.models.ConvTasNet): 100 | if _recover_noise: 101 | forward = _forward_single_mask 102 | 103 | 104 | class DPRNNTasNet(asteroid.models.DPRNNTasNet): 105 | if _recover_noise: 106 | forward = _forward_single_mask 107 | 108 | 109 | class DPTNet(asteroid.models.DPTNet): 110 | if _recover_noise: 111 | forward = _forward_single_mask 112 | 113 | 114 | class GRUNet(torch.nn.Module): 115 | 116 | def __init__(self, hidden_size: int, num_layers: int = 2, 117 | bidirectional: bool = False): 118 | super().__init__() 119 | self.hidden_size = hidden_size 120 | self.num_layers = num_layers 121 | self.bidirectional = bidirectional 122 | 123 | # create a neural network which predicts a TF binary ratio mask 124 | self.rnn = torch.nn.GRU( 125 | input_size=int(_fft_size // 2 + 1), 126 | hidden_size=self.hidden_size, 127 | num_layers=self.num_layers, 128 | bidirectional=self.bidirectional, 129 | batch_first=True 130 | ) 131 | self.dnn = torch.nn.Sequential( 132 | torch.nn.Linear( 133 | in_features=self.hidden_size * (1+self.bidirectional), 134 | out_features=int(_fft_size // 2 + 1) 135 | ), 136 | torch.nn.Sigmoid() 137 | ) 138 | 139 | def forward(self, waveform: torch.Tensor): 140 | # convert waveform to spectrogram 141 | (x, x_magnitude) = _stft(waveform) 142 | 143 | # generate a time-frequency mask 144 | h = self.rnn(x_magnitude)[0] 145 | y = self.dnn(h) 146 | y = y.reshape_as(x_magnitude) 147 | 148 | # convert masked spectrogram back to waveform 149 | denoised = _istft(x, mask=y) 150 | 151 | return denoised 152 | 153 | 154 | class SNRPredictor(torch.nn.Module): 155 | 156 | def __init__(self, hidden_size: int = 1024, num_layers: int = 3): 157 | super().__init__() 158 | self.hidden_size: int = hidden_size 159 | self.num_layers: int = num_layers 160 | 161 | # layers 162 | self.rnn = torch.nn.GRU( 163 | input_size=int(_fft_size // 2 + 1), 164 | hidden_size=self.hidden_size, 165 | num_layers=self.num_layers, 166 | batch_first=True 167 | ) 168 | self.dnn = torch.nn.Linear( 169 | in_features=self.hidden_size, 170 | out_features=1 171 | ) 172 | 173 | def forward(self, waveform: torch.Tensor): 174 | 175 | # convert to time-frequency domain 176 | (_, X_magnitude) = _stft(waveform) 177 | 178 | # generate frame-by-frame SNR predictions 179 | predicted_snrs = self.dnn(self.rnn(X_magnitude)[0]).reshape( 180 | -1, X_magnitude.shape[1]).detach() 181 | 182 | return predicted_snrs if self.training else _logistic(predicted_snrs) 183 | 184 | def load(self): 185 | self.load_state_dict(torch.load('snr_predictor'), strict=False) 186 | 187 | 188 | class SegmentalLoss(_Loss): 189 | """Loss function applied to audio segmented frame by frame.""" 190 | 191 | def __init__( 192 | self, 193 | loss_type: str = 'sisdr', 194 | reduction: str = 'none', 195 | segment_size: int = 1024, 196 | hop_length: int = 256, 197 | windowing: bool = True, 198 | centering: bool = True, 199 | pad_mode: str = 'reflect' 200 | ): 201 | super().__init__(reduction=reduction) 202 | assert loss_type in ('mse', 'snr', 'sisdr', 'sdsdr') 203 | assert pad_mode in ('constant', 'reflect') 204 | assert isinstance(centering, bool) 205 | assert isinstance(windowing, bool) 206 | assert segment_size > hop_length > 0 207 | 208 | self.loss_type = loss_type 209 | self.segment_size = segment_size 210 | self.hop_length = hop_length 211 | self.pad_mode = pad_mode 212 | 213 | self.centering = centering 214 | self.windowing = windowing 215 | 216 | self.unfold = torch.nn.Unfold( 217 | kernel_size=(1, segment_size), 218 | stride=(1, hop_length) 219 | ) 220 
| self.window = torch.hann_window(self.segment_size).view(1, 1, -1) 221 | 222 | def forward( 223 | self, 224 | estimate: torch.Tensor, 225 | target: torch.Tensor, 226 | weights: Optional[torch.Tensor] = None, 227 | ): 228 | assert target.size() == estimate.size() 229 | assert target.ndim == 2 230 | assert self.segment_size < target.size()[-1] 231 | 232 | # subtract signal means 233 | target -= torch.mean(target, dim=1, keepdim=True) 234 | estimate -= torch.mean(estimate, dim=1, keepdim=True) 235 | 236 | # center the signals using padding 237 | if self.centering: 238 | signal_dim = target.dim() 239 | ext_shape = [1] * (3 - signal_dim) + list(target.size()) 240 | p = int(self.segment_size // 2) 241 | target = tf.pad(target.view(ext_shape), [p, p], self.pad_mode) 242 | target = target.view(target.shape[-signal_dim:]) 243 | estimate = tf.pad(estimate.view(ext_shape), [p, p], self.pad_mode) 244 | estimate = estimate.view(estimate.shape[-signal_dim:]) 245 | 246 | # use unfold to construct overlapping frames out of inputs 247 | n_batch = target.size()[0] 248 | target = self.unfold(target.view(n_batch,1,1,-1)).permute(0,2,1) 249 | estimate = self.unfold(estimate.view(n_batch,1,1,-1)).permute(0,2,1) 250 | losses: torch.Tensor 251 | 252 | # window all the frames 253 | if self.windowing: 254 | self.window = self.window.to(target.device) 255 | target = torch.multiply(target, self.window) 256 | estimate = torch.multiply(estimate, self.window) 257 | 258 | # MSE loss 259 | if self.loss_type == 'mse': 260 | losses = ((target - estimate)**2).sum(dim=2) 261 | losses /= self.segment_size 262 | 263 | # SDR based loss 264 | else: 265 | 266 | if self.loss_type == 'snr': 267 | scaled_target = target 268 | else: 269 | dot = (estimate * target).sum(dim=2, keepdim=True) 270 | s_target_energy = (target ** 2).sum(dim=2, keepdim=True) + _eps 271 | scaled_target = dot * target / s_target_energy 272 | 273 | if self.loss_type == 'sisdr': 274 | e_noise = estimate - scaled_target 275 | else: 276 | e_noise = estimate - target 277 | 278 | losses = (scaled_target ** 2).sum(dim=2) 279 | losses = losses / ((e_noise ** 2).sum(dim=2) + _eps) 280 | losses += _eps 281 | losses = torch.log10(losses) 282 | losses *= -10 283 | 284 | # apply weighting (if provided) 285 | if weights is not None: 286 | assert losses.size() == weights.size() 287 | weights = weights.detach() 288 | losses = torch.multiply(losses, weights).mean(dim=1) 289 | 290 | if self.reduction == 'mean': 291 | losses = losses.mean() 292 | 293 | return losses 294 | 295 | 296 | def feedforward( 297 | inputs: torch.Tensor, 298 | targets: torch.Tensor, 299 | model: torch.nn.Module, 300 | loss_reg: Callable, 301 | loss_segm: Callable, 302 | weights: Optional[torch.Tensor] = None, 303 | accumulation: bool = False, 304 | test: bool = False, 305 | skip_input_metrics: bool = False, 306 | num_examples_to_save: int = 0 307 | ) -> Dict[str, float]: 308 | """Runs a feedforward pass through a model by unraveling batched data. 
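    Examples are processed one at a time so that gradients are accumulated
    per example rather than per batch; the perceptual metrics (PESQ and
    STOI) are only computed when ``test`` is True.

    Example (illustrative; the model, batch, and loss objects are
    hypothetical)::

        >>> metrics = feedforward(batch.inputs, batch.targets, model,
        ...                       loss_reg=distfunc_reg,
        ...                       loss_segm=distfunc_segm)  # doctest: +SKIP
        >>> metrics['sisdri']                               # doctest: +SKIP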
309 | """ 310 | batch_size = inputs.shape[0] 311 | validation = not bool(model.training) 312 | context = torch.no_grad() if (validation or test) else suppress() 313 | r_sisdr_inp: float = 0 314 | r_sisdr_enh: float = 0 315 | r_sdr_inp: float = 0 316 | r_sdr_enh: float = 0 317 | r_pesq_inp: float = 0 318 | r_pesq_enh: float = 0 319 | r_stoi_inp: float = 0 320 | r_stoi_enh: float = 0 321 | r_loss: float = 0 322 | 323 | with context: 324 | for i in range(batch_size): 325 | 326 | # unravel batch 327 | x = inputs[i].unsqueeze(0).cuda() 328 | t = targets[i].unsqueeze(0).cuda() 329 | 330 | # forward pass 331 | y = make_2d(model(x)) 332 | if 0 <= i < num_examples_to_save: 333 | wav_write(f'example_{i:02d}.wav', 334 | y.reshape(-1).detach().cpu().numpy()) 335 | 336 | # backwards pass 337 | if not test: 338 | if weights is not None: 339 | w = weights[i].unsqueeze(0) 340 | loss_tensor = torch.mean( 341 | loss_segm(y, t, w)) 342 | else: 343 | loss_tensor = torch.mean( 344 | loss_reg(y, t)) 345 | loss_tensor /= batch_size 346 | r_loss += float(loss_tensor) 347 | if not (validation or test): 348 | loss_tensor.backward() 349 | 350 | # compute PESQ and STOI (perceptual scores) only during testing 351 | if test: 352 | _x = x.detach().cpu().numpy().squeeze() 353 | _y = y.detach().cpu().numpy().squeeze() 354 | _t = t.detach().cpu().numpy().squeeze() 355 | if not skip_input_metrics: 356 | r_pesq_inp += pesq(sample_rate, _t, _x, 'wb') 357 | r_pesq_enh += pesq(sample_rate, _t, _y, 'wb') 358 | if not skip_input_metrics: 359 | r_stoi_inp += stoi(_t, _x, sample_rate, True) 360 | r_stoi_enh += stoi(_t, _y, sample_rate, True) 361 | 362 | # calculate signal improvement 363 | if not skip_input_metrics: 364 | r_sdr_inp += float(sdr(x, t, reduction='mean')) 365 | r_sdr_enh += float(sdr(y, t, reduction='mean')) 366 | r_sisdr_inp += float(sisdr(x, t, reduction='mean')) 367 | r_sisdr_enh += float(sisdr(y, t, reduction='mean')) 368 | 369 | r_sdr_inp /= batch_size 370 | r_sdr_enh /= batch_size 371 | r_sisdr_inp /= batch_size 372 | r_sisdr_enh /= batch_size 373 | r_pesq_inp /= batch_size 374 | r_pesq_enh /= batch_size 375 | r_stoi_inp /= batch_size 376 | r_stoi_enh /= batch_size 377 | 378 | return dict(loss=r_loss, 379 | sdr_inp=r_sdr_inp, 380 | sdr_enh=r_sdr_enh, 381 | sisdri=(r_sisdr_enh-r_sisdr_inp), 382 | sisdr_inp=r_sisdr_inp, 383 | sisdr_enh=r_sisdr_enh, 384 | pesq_inp=r_pesq_inp, 385 | pesq_enh=r_pesq_enh, 386 | stoi_inp=r_stoi_inp, 387 | stoi_enh=r_stoi_enh, 388 | ) 389 | 390 | 391 | def contrastive_negative_term(ly, lt, term_type: str = 'max'): 392 | if term_type == 'max': 393 | return torch.mean(torch.max(ly, lt)) 394 | elif term_type == 'abs': 395 | return torch.mean(torch.abs(ly - lt)) 396 | else: 397 | return torch.mean(torch.pow(ly - lt, 2)) 398 | 399 | 400 | def contrastive_feedforward( 401 | inputs_1: torch.Tensor, 402 | inputs_2: torch.Tensor, 403 | targets_1: torch.Tensor, 404 | targets_2: torch.Tensor, 405 | labels: torch.BoolTensor, 406 | loss_reg: Callable, 407 | loss_segm: Callable, 408 | lambda_positive: float, 409 | lambda_negative: float, 410 | model: torch.nn.Module, 411 | weights_1: Optional[torch.Tensor] = None, 412 | weights_2: Optional[torch.Tensor] = None, 413 | negative_term_type: str = 'max', 414 | accumulation: bool = False, 415 | validation: bool = False, 416 | test: bool = False 417 | ) -> Dict[str, float]: 418 | """Runs a feedforward pass through a model by unraveling batched data. 
419 | """ 420 | labels = labels.bool() 421 | batch_size = inputs_1.shape[0] 422 | context = torch.no_grad() if validation else suppress() 423 | ratio_pos = float(sum(labels) / batch_size) 424 | ratio_neg = float(sum(~labels) / batch_size) 425 | use_dp = bool(weights_1 is not None) and bool(weights_2 is not None) 426 | r_sisdri: float = 0 427 | r_loss: float = 0 428 | r_loss_sig: float = 0 429 | r_loss_pos: float = 0 430 | r_loss_neg: float = 0 431 | 432 | with context: 433 | for i in range(batch_size): 434 | 435 | loss_tensor_sig, loss_tensor_pos, loss_tensor_neg = 0, 0, 0 436 | 437 | # unravel batch 438 | x_1 = inputs_1[i].unsqueeze(0).cuda() 439 | x_2 = inputs_2[i].unsqueeze(0).cuda() 440 | t_1 = targets_1[i].unsqueeze(0).cuda() 441 | t_2 = targets_2[i].unsqueeze(0).cuda() 442 | 443 | # forward pass 444 | y_1 = make_2d(model(x_1)) 445 | y_2 = make_2d(model(x_2)) 446 | 447 | # stack for batchwise loss 448 | x = torch.cat([x_1, x_2], dim=0) 449 | t = torch.cat([t_1, t_2], dim=0) 450 | y = torch.cat([y_1, y_2], dim=0) 451 | 452 | # calculate loss 453 | if use_dp: 454 | w_1 = weights_1[i].unsqueeze(0).cuda() 455 | w_2 = weights_2[i].unsqueeze(0).cuda() 456 | w_p = w_1 * w_2 457 | w = torch.cat([w_1, w_2], dim=0) 458 | loss_tensor_sig = torch.mean(loss_segm(y, t, w)) 459 | if labels[i]: 460 | loss_tensor_pos = torch.mean( 461 | loss_segm(y_1, y_2, w_1)) 462 | else: 463 | loss_tensor_neg = contrastive_negative_term( 464 | loss_segm(y_1, y_2, w_p), loss_segm(t_1, t_2, w_p)) 465 | else: 466 | loss_tensor_sig = torch.mean(loss_reg(y, t)) 467 | if labels[i]: 468 | loss_tensor_pos = torch.mean( 469 | loss_reg(y_1, y_2)) 470 | else: 471 | loss_tensor_neg = contrastive_negative_term( 472 | loss_reg(y_1, y_2), loss_reg(t_1, t_2)) 473 | 474 | loss_tensor_sig /= batch_size 475 | loss_tensor_pos *= lambda_positive / (batch_size / 2) 476 | loss_tensor_neg *= lambda_negative / (batch_size / 2) 477 | loss_tensor_total = ( 478 | loss_tensor_sig + loss_tensor_pos + loss_tensor_neg) 479 | 480 | r_loss += float(loss_tensor_total) 481 | r_loss_sig += float(loss_tensor_sig) 482 | r_loss_pos += float(loss_tensor_pos) 483 | r_loss_neg += float(loss_tensor_neg) 484 | 485 | # backwards pass 486 | if not validation: 487 | loss_tensor_total.backward() 488 | 489 | # calculate signal improvement 490 | r_sisdri += float(sisdr_improvement(y, t, x, 'mean')) 491 | 492 | r_sisdri /= batch_size 493 | 494 | return dict(loss=r_loss, 495 | loss_sig=r_loss_sig, 496 | loss_pos=r_loss_pos, 497 | loss_neg=r_loss_neg, 498 | sisdri=r_sisdri) 499 | 500 | 501 | def init_ctn(N=512, L=16, B=128, H=512, Sc=128, P=3, X=8, R=3, causal=False): 502 | model_config = locals() 503 | return (ConvTasNet( 504 | n_src=1, 505 | sample_rate=sample_rate, 506 | n_filters=N, 507 | kernel_size=L, 508 | bn_chan=B, 509 | hid_chan=H, 510 | skip_chan=Sc, 511 | conv_kernel_size=P, 512 | n_blocks=X, 513 | n_repeats=R, 514 | causal=causal 515 | ), model_config) 516 | 517 | 518 | def init_dprnn(N=64, L=2, B=128, H=128, R=6, K=250, T='lstm', causal=False): 519 | model_config = locals() 520 | return (DPRNNTasNet( 521 | n_src=1, 522 | sample_rate=sample_rate, 523 | n_filters=N, 524 | kernel_size=L, 525 | bn_chan=B, 526 | hid_size=H, 527 | n_repeats=R, 528 | chunk_size=K, 529 | rnn_type=T, 530 | bidirectional=(not causal) 531 | ), model_config) 532 | 533 | 534 | def init_gru(hidden_size=64, num_layers=2, bidirectional=True): 535 | model_config = locals() 536 | return (GRUNet( 537 | hidden_size=hidden_size, 538 | num_layers=num_layers, 539 | bidirectional=bidirectional 
540 |     ), model_config)
541 | 
542 | 
543 | def init_model(
544 |         model_name: str,
545 |         model_size: Optional[str] = None,
546 |         model_config: Optional[dict] = None
547 | ) -> Tuple[torch.nn.Module, int, dict]:
548 |     """Instantiates model based on name and size.
549 |     """
550 |     # instantiate network
551 |     model: torch.nn.Module
552 |     model_config: dict = model_config or {}
553 |     if not bool(model_size or model_config):
554 |         raise ValueError('Expected either `model_size` or `model_config`.')
555 |     if model_size is not None and model_size not in {'tiny', 'small', 'medium', 'large'}:
556 |         raise ValueError('Size must be either "tiny", "small", "medium", or "large".')
557 |     if model_name == 'convtasnet':
558 |         if model_config:
559 |             model, model_config = init_ctn(**model_config)
560 |         else:
561 |             model, model_config = init_ctn(**{
562 |                 'tiny': dict(H=32, B=8, X=7, R=2),
563 |                 'small': dict(H=64, B=16, X=7, R=2),
564 |                 'medium': dict(H=128, B=32, X=7, R=2),
565 |                 'large': dict(H=256, B=64, X=7, R=2),
566 |             }.get(model_size))
567 |     elif model_name == 'grunet':
568 |         if model_config:
569 |             model, model_config = init_gru(**model_config)
570 |         else:
571 |             model, model_config = init_gru(**{
572 |                 'tiny': dict(hidden_size=32, num_layers=2),
573 |                 'small': dict(hidden_size=64, num_layers=2),
574 |                 'medium': dict(hidden_size=128, num_layers=2),
575 |                 'large': dict(hidden_size=256, num_layers=2)
576 |             }.get(model_size))
577 |     else:
578 |         raise ValueError(f'Unsupported model name: "{model_name}".')
579 |     model_nparams: int = count_parameters(model)
580 | 
581 |     return model, model_nparams, model_config
582 | 
583 | 
584 | def load_checkpoint(
585 |         path: Union[str, os.PathLike]
586 | ) -> Tuple[torch.nn.Module, dict, int]:
587 | 
588 |     input_path = pathlib.Path(path)
589 |     print(input_path)
590 | 
591 |     # If the path suffix is the PyTorch file extension,
592 |     # then it's already a checkpoint
593 |     if input_path.is_file() and input_path.suffix == '.pt':
594 |         checkpoint_path = str(input_path)
595 | 
596 |     # If it's a directory, get the latest checkpoint
597 |     # from that folder.
598 |     elif input_path.is_dir():
599 |         try:
600 |             m = {
601 |                 input_path.joinpath('ckpt_best.pt'),
602 |                 input_path.joinpath('ckpt_last.pt')
603 |             }
604 |             checkpoints = set(input_path.glob('*.pt'))
605 |             if m.issubset(checkpoints):
606 |                 checkpoints.remove(input_path.joinpath('ckpt_last.pt'))
607 |             checkpoint_path = str(max(checkpoints, key=os.path.getctime))
608 |         except ValueError:
609 |             raise IOError(f'Input directory {str(input_path)} does not contain '
610 |                           f'checkpoints.')
611 |     else:
612 |         raise IOError(f'{str(input_path)} is not a checkpoint or directory.')
613 | 
614 |     # Get the appropriate config file.
615 |     config_path = pathlib.Path(checkpoint_path).with_name('config.json')
616 |     if not config_path.is_file():
617 |         raise IOError(f'Missing config file at {str(input_path)}.')
618 | 
619 |     # Load the config file.
620 |     with open(config_path, 'r') as fp:
621 |         config: dict = json.load(fp)
622 | 
623 |     # Initialize the model
624 |     model = init_model(model_name=config.get('model_name'),
625 |                        model_size=config.get('model_size'))[0]
626 |     ckpt = torch.load(checkpoint_path)
627 |     num_examples: int = ckpt.get('num_examples')
628 |     try:
629 |         model.load_state_dict(ckpt.get('model_state_dict'), strict=True)
630 |     except RuntimeError as e:
631 |         if 'state_dict' not in str(e): raise  # don't swallow other errors
632 |         raise RuntimeError(f'{str(checkpoint_path)} is a mismatched model.')
633 |     model.cuda()
634 | 
635 |     return model, config, num_examples
636 | 
637 | 
638 | def count_parameters(network: Any) -> int:
639 |     return sum(p.numel() for p in network.parameters() if p.requires_grad)
640 | 
641 | 
642 | 
643 | def test_denoiser_with_speaker(
644 |         model: torch.nn.Module,
645 |         speaker_id: int = 200,
646 |         num_examples_to_save: int = 0
647 | ) -> dict:
648 |     no_op_loss = lambda *a, **k: 0
649 |     dataset = Mixtures(speaker_id, split_speech='test', split_mixture='test',
650 |                        snr_mixture=(-5, 5))
651 |     batch = dataset(100, seed=0)
652 |     results = feedforward(batch.inputs, batch.targets, model,
653 |                           weights=None, accumulation=True,
654 |                           loss_reg=no_op_loss, loss_segm=no_op_loss,
655 |                           test=True, num_examples_to_save=num_examples_to_save)
656 |     return results
657 | 
658 | 
659 | @torch.no_grad()
660 | def test_denoiser_from_module(
661 |         model: torch.nn.Module,
662 |         data_te: Union[Mixtures, Sequence[Mixtures]],
663 |         accumulation: bool = False
664 | ) -> dict:
665 |     """Evaluates speech enhancement model using provided dataset.
666 |     """
667 |     no_op_loss = lambda *a, **k: 0
668 |     if not isinstance(data_te, (list, tuple)):
669 |         data_te = [data_te]
670 |     results = {}
671 |     for dataset in data_te:
672 |         batch = dataset(100, seed=0)
673 |         key = dataset.speaker_ids_repr
674 |         results[key] = feedforward(
675 |             batch.inputs, batch.targets,
676 |             model, weights=None, accumulation=accumulation, test=True,
677 |             loss_reg=no_op_loss, loss_segm=no_op_loss
678 |         )
679 |     return results
680 | 
681 | 
682 | @torch.no_grad()
683 | def test_denoiser_from_file(
684 |         checkpoint_path: Union[str, os.PathLike],
685 |         data_te: Union[Mixtures, Sequence[Mixtures]],
686 |         accumulation: bool = False
687 | ) -> dict:
688 |     """Evaluates speech enhancement model checkpoint using provided dataset.
689 |     """
690 |     # load the config file (JSON), which should be in the same location
691 |     config_file = pathlib.Path(checkpoint_path).with_name('config.json')
692 |     if not config_file.exists():
693 |         raise ValueError(f'Could not find {str(config_file)}.')
694 |     with open(config_file, 'r') as fp:
695 |         config: dict = json.load(fp)
696 | 
697 |     model = init_model(model_name=config.get('model_name'),
698 |                        model_size=config.get('model_size'))[0]
699 |     ckpt = torch.load(checkpoint_path)
700 |     num_examples: int = ckpt.get('num_examples')
701 |     model.load_state_dict(ckpt.get('model_state_dict'), strict=True)
702 |     model.cuda()
703 | 
704 |     results = test_denoiser_from_module(model, data_te, accumulation)
705 |     results['num_examples'] = num_examples
706 | 
707 |     return results
708 | 
709 | 
710 | @torch.no_grad()
711 | def test_denoiser_from_folder(
712 |         checkpoint_folder: Union[str, os.PathLike],
713 |         data_te: Union[Mixtures, Sequence[Mixtures]],
714 |         accumulation: bool = False,
715 |         finetune: int = 0,
716 |         use_last: bool = False
717 | ):
718 |     """Selects speech enhancement model checkpoint from folder, and then
719 |     evaluates using provided dataset.
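    Checkpoint preference order: ``ckpt_last.pt`` when ``use_last`` is set,
    then ``ckpt_best<suffix>.pt``, then the step recorded in a ``best_step*``
    text file. Example (illustrative; the folder path is hypothetical)::

        >>> results = test_denoiser_from_folder(
        ...     'trials_host/Jan01_00-00-00_run', data_te)  # doctest: +SKIP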
720 | """ 721 | finetune_suffix = f'_ft_{int(finetune):02d}' if finetune else '' 722 | # identify the best checkpoint using saved text file 723 | checkpoint_folder = pathlib.Path(checkpoint_folder) 724 | if use_last: 725 | checkpoint_path = checkpoint_folder.joinpath(f'ckpt_last.pt') 726 | elif checkpoint_folder.joinpath(f'ckpt_best{finetune_suffix}.pt').exists(): 727 | checkpoint_path = checkpoint_folder.joinpath( 728 | f'ckpt_best{finetune_suffix}.pt') 729 | else: 730 | best_step_file = next(checkpoint_folder.glob('best_step*')) 731 | if not best_step_file.exists(): 732 | raise ValueError(f'Could not find {str(best_step_file)}.') 733 | with open(best_step_file, 'r') as fp: 734 | best_step = int(fp.readline()) 735 | checkpoint_path = checkpoint_folder.joinpath( 736 | f'ckpt_{best_step:08}{finetune_suffix}.pt') 737 | if not checkpoint_path.exists(): 738 | raise IOError(f'{str(checkpoint_path)} does not exist.') 739 | 740 | return test_denoiser_from_file(checkpoint_path, data_te, accumulation) 741 | 742 | 743 | @torch.no_grad() 744 | def test_denoiser( 745 | model: Union[str, os.PathLike, torch.nn.Module], 746 | data_te: Union[Mixtures, Sequence[Mixtures]], 747 | accumulation: bool = False, 748 | finetune: int = 0, 749 | use_last: bool = False 750 | ): 751 | if isinstance(model, torch.nn.Module): 752 | return test_denoiser_from_module(model, data_te, accumulation) 753 | elif isinstance(model, (str, os.PathLike)): 754 | path = pathlib.Path(model) 755 | if path.is_dir(): 756 | return test_denoiser_from_folder(path, data_te, accumulation, 757 | finetune, use_last) 758 | elif path.is_file(): 759 | return test_denoiser_from_file(path, data_te, accumulation) 760 | else: 761 | raise ValueError(f'{str(path)} does not exist.') 762 | else: 763 | raise ValueError('Expected input to be PyTorch model or filepath.') 764 | -------------------------------------------------------------------------------- /code/exp_utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import torch 4 | import torch.nn.functional 5 | import yaml 6 | 7 | 8 | class EarlyStopping(Exception): 9 | pass 10 | 11 | 12 | class SmokeTest(Exception): 13 | pass 14 | 15 | 16 | class ExperimentError(Exception): 17 | pass 18 | 19 | 20 | def make_2d(x: torch.Tensor): 21 | """Normalize shape of `x` to two dimensions: [batch, time].""" 22 | if isinstance(x, np.ndarray): 23 | x = torch.from_numpy(x) 24 | if x.ndim == 1: 25 | return x.reshape(1, -1) 26 | elif x.ndim == 3: 27 | return x.squeeze(1) 28 | else: 29 | if x.ndim != 2: raise ValueError('Could not force 2d.') 30 | return x 31 | 32 | 33 | def make_3d(x: torch.Tensor): 34 | """Normalize shape of `x` to three dimensions: [batch, n_chan, time].""" 35 | if isinstance(x, np.ndarray): 36 | x = torch.from_numpy(x) 37 | if x.ndim == 1: 38 | return x.reshape(1, 1, -1) 39 | elif x.ndim == 2: 40 | return x.unsqueeze(1) 41 | else: 42 | if x.ndim != 3: raise ValueError('Could not force 3d.') 43 | return x 44 | 45 | 46 | def pad_x_to_y(x: torch.Tensor, y: torch.Tensor, axis: int = -1): 47 | """Right-pad or right-trim first argument to have same size as second argument 48 | Args: 49 | x (torch.Tensor): Tensor to be padded. 50 | y (torch.Tensor): Tensor to pad `x` to. 51 | axis (int): Axis to pad on. 52 | Returns: 53 | torch.Tensor, `x` padded to match `y`'s shape. 
54 | """ 55 | if axis != -1: 56 | raise NotImplementedError 57 | inp_len = y.shape[axis] 58 | output_len = x.shape[axis] 59 | return torch.nn.functional.pad(x, [0, inp_len - output_len]) 60 | 61 | 62 | def shape_reconstructed(reconstructed: torch.Tensor, size: torch.Tensor): 63 | """Reshape `reconstructed` to have same size as `size` 64 | Args: 65 | reconstructed (torch.Tensor): Reconstructed waveform 66 | size (torch.Tensor): Size of desired waveform 67 | Returns: 68 | torch.Tensor: Reshaped waveform 69 | """ 70 | if len(size) == 1: 71 | return reconstructed.squeeze(0) 72 | return reconstructed 73 | 74 | 75 | def get_config_from_yaml(yaml_filepath: str): 76 | 77 | if not os.path.exists(yaml_filepath): 78 | raise OSError(f'{yaml_filepath} not found') 79 | 80 | config = {} 81 | with open(yaml_filepath) as fp: 82 | config = yaml.safe_load(fp) 83 | nonlist_keys = ( 84 | 'available_devices', 85 | 'num_gpus_per_experiment', 86 | 'num_cpus_per_experiment', 87 | 'output_folder', 88 | 'folder_librispeech', 89 | 'folder_fsd50k', 90 | 'folder_musan', 91 | 'sample_rate', 92 | 'example_duration', 93 | ) 94 | for k in config.keys(): 95 | if k not in nonlist_keys: 96 | if not isinstance(config[k], list): 97 | config[k] = [config[k],] 98 | 99 | return config 100 | -------------------------------------------------------------------------------- /code/finetune.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from run import finetune_denoiser 3 | 4 | if __name__ == '__main__': 5 | finetune_denoiser(dataset_duration=float(sys.argv[1]), 6 | checkpoint_locations=sys.argv[2:]) 7 | 8 | -------------------------------------------------------------------------------- /code/requirements.txt: -------------------------------------------------------------------------------- 1 | asteroid==0.6.0 2 | librosa==0.10.0 3 | numpy==1.23.5 4 | pandas==1.5.3 5 | pesq==0.0.4 6 | pystoi==0.3.3 7 | pytorch_lightning==1.9.3 8 | PyYAML==6.0 9 | ray==2.3.0 10 | scipy==1.10.1 11 | soundfile==0.12.1 12 | torch==1.13.1 13 | -------------------------------------------------------------------------------- /code/run.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import copy 3 | import itertools 4 | import json 5 | import os 6 | import socket 7 | import sys 8 | import time 9 | import warnings 10 | from ast import literal_eval 11 | from datetime import datetime 12 | from math import ceil 13 | from pathlib import Path 14 | from typing import Optional, List, Sequence 15 | from typing import Union 16 | 17 | import asteroid.losses 18 | import numpy as np 19 | import torch 20 | import yaml 21 | from pytorch_lightning import seed_everything 22 | from ray import tune 23 | from torch.utils.tensorboard import SummaryWriter 24 | 25 | from exp_data import ContrastiveMixtures, Mixtures 26 | from exp_data import example_duration, sample_rate 27 | from exp_models import SegmentalLoss, SNRPredictor, init_model, load_checkpoint 28 | from exp_models import contrastive_feedforward, feedforward 29 | from exp_utils import EarlyStopping, ExperimentError, SmokeTest 30 | 31 | warnings.filterwarnings('ignore') 32 | torch.backends.cuda.matmul.allow_tf32 = False 33 | torch.backends.cudnn.allow_tf32 = False 34 | 35 | _host = str(socket.gethostname().split('.')[-3:].pop(0)) 36 | _snrp_path = Path(__file__).resolve().parent.joinpath('snr_predictor') 37 | _tune_kwargs = dict( 38 | reuse_actors=True, 39 | log_to_file=True, 40 | local_dir='.', 41 | 
fail_fast=True, 42 | verbose=1 43 | ) 44 | 45 | def save_config( 46 | output_directory: Union[str, os.PathLike], 47 | config: dict 48 | ): 49 | """Saves the config dict to file.""" 50 | output_directory = Path(output_directory) 51 | with open(output_directory.joinpath('config.json'), 'w', 52 | encoding='utf-8') as fp: 53 | json.dump(config, fp, indent=2, sort_keys=True) 54 | print(yaml.safe_dump(config, default_flow_style=False)) 55 | 56 | 57 | # noinspection PyTypeChecker 58 | def train_denoiser( 59 | model_name: str, 60 | model_size: str, 61 | data_tr: Mixtures, 62 | data_vl: Mixtures, 63 | use_loss_purification: bool = False, 64 | lambda_p: float = 1., 65 | lambda_n: float = 1., 66 | learning_rate: float = 1e-3, 67 | batch_size: int = 64, 68 | checkpoint_path: Optional[str] = None, 69 | num_examples_validation: int = 1000, 70 | num_examples_minimum: int = 100000, 71 | num_examples_earlystopping: int = 100000, 72 | trial_name: Optional[str] = None, 73 | output_folder: Union[str, os.PathLike] = f'trials_{_host}', 74 | early_stopping_metric: str = 'sisdri', 75 | distance_func: str = 'mse', 76 | called_by_ray: bool = False, 77 | run_smoke_test: bool = False 78 | ) -> str: 79 | 80 | seed_everything(0) 81 | 82 | # prepare model, optimizer, and loss function 83 | current_time = datetime.now().strftime('%b%d_%H-%M-%S') 84 | model, nparams, model_config = init_model(model_name, model_size) 85 | model = model.cuda() 86 | optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate) 87 | predictor = torch.nn.Identity() 88 | if use_loss_purification: 89 | predictor = SNRPredictor() 90 | predictor.load_state_dict(torch.load(str(_snrp_path)), strict=False) 91 | predictor.cuda() 92 | predictor.eval() 93 | 94 | use_loss_contrastive: bool = bool(isinstance(data_tr, ContrastiveMixtures)) 95 | if not type(data_tr) is type(data_vl): 96 | raise ValueError('`data_tr` and `data_vl` should be the same type.') 97 | 98 | # load a previous checkpoint if provided 99 | init_num_examples = 0 100 | output_directory: Optional[Path] = None 101 | is_finetuning = bool(data_tr.dataset_duration or 0) 102 | if checkpoint_path: 103 | # reuse output directory (to pick up experiment where left off) 104 | output_directory = Path(checkpoint_path).parent 105 | ckpt = torch.load(checkpoint_path) 106 | model.load_state_dict(ckpt['model_state_dict']) 107 | # if finetuning a generalist, make a subdirectory 108 | if is_finetuning: 109 | output_directory = output_directory.joinpath( 110 | current_time + '_ft_' + trial_name) 111 | # otherwise, resuming training so reuse the old optimizer 112 | else: 113 | optimizer.load_state_dict(ckpt['optimizer_state_dict']) 114 | init_num_examples = ckpt['num_examples'] 115 | 116 | # define experiment configuration 117 | config = { 118 | 'batch_size': batch_size, 119 | 'checkpoint_path': str(checkpoint_path or ''), 120 | 'data_tr': data_tr.__dict__(), 121 | 'data_vl': data_vl.__dict__(), 122 | 'distance_func': distance_func, 123 | 'example_duration': example_duration, 124 | 'lambda_p': lambda_p, 125 | 'lambda_n': lambda_n, 126 | 'learning_rate': learning_rate, 127 | 'model_config': model_config, 128 | 'model_name': model_name, 129 | 'model_nparams': nparams, 130 | 'model_size': model_size, 131 | 'num_examples_minimum': num_examples_minimum, 132 | 'num_examples_earlystopping': num_examples_earlystopping, 133 | 'num_examples_validation': num_examples_validation, 134 | 'sample_rate': sample_rate, 135 | 'speaker_ids': data_tr.speaker_ids_repr, 136 | 'use_loss_contrastive': 
use_loss_contrastive, 137 | 'use_loss_purification': use_loss_purification, 138 | 'early_stopping_metric': early_stopping_metric, 139 | 'is_finetuning': is_finetuning 140 | } 141 | 142 | # instantiate tensorboard 143 | if called_by_ray: 144 | trial_name = tune.get_trial_name() 145 | if output_directory is None: 146 | output_directory = Path(output_folder).joinpath( 147 | current_time + '_' + trial_name) 148 | writer = SummaryWriter(str(output_directory)) 149 | save_config(output_directory, config) 150 | 151 | # begin training (use gradient accumulation for TasNet models) 152 | num_examples: int = init_num_examples 153 | num_validations: int = ceil(num_examples / num_examples_validation) 154 | best_score: float = np.inf * (1 if early_stopping_metric == 'loss' else -1) 155 | best_score_step: int = init_num_examples 156 | use_gradient_accumulation: bool = not bool('grunet' in model_name) 157 | print(f'Output Directory: {str(output_directory)}') 158 | 159 | # define the distance function 160 | if distance_func == 'snr': 161 | distfunc_reg = asteroid.losses.sdr.SingleSrcNegSDR('snr') 162 | distfunc_segm = SegmentalLoss('snr', reduction='none') 163 | elif distance_func == 'sisdr': 164 | distfunc_reg = asteroid.losses.sdr.SingleSrcNegSDR('sisdr') 165 | distfunc_segm = SegmentalLoss('sisdr', reduction='none') 166 | else: 167 | distfunc_reg = torch.nn.MSELoss(reduction='none') 168 | distfunc_segm = SegmentalLoss('mse', reduction='none') 169 | 170 | try: 171 | for num_examples in itertools.count(start=init_num_examples, 172 | step=batch_size): 173 | model.train() 174 | if use_loss_contrastive: 175 | 176 | # pick up a training batch 177 | batch = data_tr(batch_size) 178 | x_1 = batch.inputs_1.cuda() 179 | x_2 = batch.inputs_2.cuda() 180 | p_1 = batch.targets_1.cuda() 181 | p_2 = batch.targets_2.cuda() 182 | 183 | # estimate data purification weights 184 | w_1 = predictor(p_1) if use_loss_purification else None 185 | w_2 = predictor(p_2) if use_loss_purification else None 186 | 187 | # forward propagation 188 | metrics_tr = contrastive_feedforward( 189 | inputs_1=x_1, inputs_2=x_2, 190 | targets_1=p_1, targets_2=p_2, 191 | weights_1=w_1, weights_2=w_2, 192 | lambda_positive=lambda_p, lambda_negative=lambda_n, 193 | loss_reg=distfunc_reg, loss_segm=distfunc_segm, 194 | labels=batch.labels.cuda(), 195 | model=model.cuda(), 196 | accumulation=use_gradient_accumulation, 197 | validation=False) 198 | 199 | else: 200 | 201 | # pick up a training batch 202 | batch = data_tr(batch_size) 203 | x = batch.inputs.cuda() 204 | p = batch.targets.cuda() 205 | 206 | # estimate data purification weights 207 | w = predictor(p) if use_loss_purification else None 208 | 209 | # forward propagation 210 | metrics_tr = feedforward( 211 | inputs=x, targets=p, model=model.train(), 212 | loss_reg=distfunc_reg, loss_segm=distfunc_segm, 213 | weights=w, accumulation=use_gradient_accumulation) 214 | 215 | # update parameters 216 | optimizer.step() 217 | optimizer.zero_grad(set_to_none=True) 218 | 219 | if num_examples < (num_validations * num_examples_validation): 220 | continue 221 | 222 | num_validations += 1 223 | model.eval() 224 | 225 | validation_time: float = 0 226 | if run_smoke_test: 227 | validation_time = time.time() 228 | 229 | with torch.no_grad(): 230 | 231 | if use_loss_contrastive: 232 | 233 | # pick up a validation batch 234 | batch = data_vl(batch_size, seed=0) 235 | x_1 = batch.inputs_1.cuda() 236 | x_2 = batch.inputs_2.cuda() 237 | p_1 = batch.targets_1.cuda() 238 | p_2 = batch.targets_2.cuda() 239 | 240 | # 
estimate data purification weights 241 | w_1 = predictor(p_1) if use_loss_purification else None 242 | w_2 = predictor(p_2) if use_loss_purification else None 243 | 244 | # forward propagation 245 | metrics_vl = contrastive_feedforward( 246 | inputs_1=x_1, inputs_2=x_2, 247 | targets_1=p_1, targets_2=p_2, 248 | weights_1=w_1, weights_2=w_2, 249 | lambda_positive=lambda_p, lambda_negative=lambda_n, 250 | loss_reg=distfunc_reg, loss_segm=distfunc_segm, 251 | labels=batch.labels.cuda(), 252 | model=model.cuda(), 253 | accumulation=use_gradient_accumulation, 254 | validation=True) 255 | 256 | else: 257 | 258 | # pick up a validation batch 259 | batch = data_vl(batch_size, seed=0) 260 | x = batch.inputs.cuda() 261 | p = batch.targets.cuda() 262 | 263 | # estimate data purification weights 264 | w = predictor(p) if use_loss_purification else None 265 | 266 | # forward propagation 267 | metrics_vl = feedforward( 268 | inputs=x, targets=p, model=model.eval(), 269 | loss_reg=distfunc_reg, loss_segm=distfunc_segm, 270 | weights=w, accumulation=use_gradient_accumulation) 271 | 272 | # checkpoint whenever validation score improves 273 | if early_stopping_metric == 'loss': 274 | save_ckpt = bool(metrics_vl['loss']<=best_score) 275 | else: 276 | save_ckpt = bool(metrics_vl['sisdri']>=best_score) 277 | 278 | if save_ckpt: 279 | best_score = metrics_vl[early_stopping_metric] 280 | best_score_step = num_examples 281 | best_state_dict = model.state_dict() 282 | ckpt_path = output_directory.joinpath('ckpt_best.pt') 283 | torch.save({ 284 | 'num_examples': num_examples, 285 | 'model_name': model_name, 286 | 'model_config': config, 287 | 'model_state_dict': best_state_dict, 288 | 'optimizer_state_dict': optimizer.state_dict() 289 | }, ckpt_path) 290 | if not called_by_ray: 291 | print(f'Examples: {num_examples:>10},\t' 292 | 'Validation SI-SDRi: '+str(metrics_vl['sisdri'])) 293 | step_path = output_directory.joinpath('best_step.txt') 294 | with open(step_path, 'w') as fp: 295 | print(num_examples, file=fp) 296 | 297 | # write summaries 298 | for (k, v) in metrics_tr.items(): 299 | if ('_inp' not in k) and ('_enh' not in k): 300 | writer.add_scalar( 301 | f'train/{k}', float(v), num_examples) 302 | for (k, v) in metrics_vl.items(): 303 | if ('_inp' not in k) and ('_enh' not in k): 304 | writer.add_scalar( 305 | f'validation/{k}', float(v), num_examples) 306 | writer.add_scalar( 307 | f'validation/vl_score', best_score, num_examples) 308 | if called_by_ray: 309 | _e = early_stopping_metric 310 | tune.report(**{ 311 | 'num_examples': num_examples, 312 | f'vl_{_e}': metrics_vl[_e], 313 | f'vl_score': best_score 314 | }) 315 | 316 | if num_examples > num_examples_minimum: 317 | if num_examples - best_score_step > num_examples_earlystopping: 318 | raise EarlyStopping() 319 | 320 | if run_smoke_test: 321 | validation_time = time.time() - validation_time 322 | smoke_path = output_directory.joinpath(f'smoke_test.txt') 323 | with open(smoke_path, 'w') as fp: 324 | print('Validation Run-Time (in seconds):' 325 | f' {validation_time}', file=fp) 326 | raise SmokeTest() 327 | 328 | except EarlyStopping: 329 | step_path = output_directory.joinpath(f'early_stopping.txt') 330 | with open(step_path, 'w') as fp: 331 | print(f'{num_examples}\n{best_score_step}\n{best_score}', file=fp) 332 | print(f'Automatically exited after {num_examples_earlystopping} ' 333 | f'examples; best model saw {best_score_step} examples.') 334 | 335 | except SmokeTest: 336 | print(f'Exiting due to smoke test.') 337 | 338 | except KeyboardInterrupt: 339 
| print(f'Manually exited at {num_examples} examples; best model saw ' 340 | f'{best_score_step} examples.') 341 | raise  # re-raise the interrupt to exit with the original traceback 342 | 343 | torch.save({ 344 | 'num_examples': num_examples, 345 | 'model_name': model_name, 346 | 'model_config': model_config, 347 | 'model_state_dict': model.state_dict(), 348 | 'optimizer_state_dict': optimizer.state_dict() 349 | }, output_directory.joinpath('ckpt_last.pt')) 350 | 351 | # close the summary 352 | writer.close() 353 | print(f'Output Directory: {str(output_directory)}') 354 | 355 | # exit the trainer 356 | return str(output_directory) 357 | 358 | 359 | def finetune_denoiser( 360 | dataset_duration: float, 361 | checkpoint_locations: Sequence[Union[str, os.PathLike]], 362 | learning_rate: float = 1e-4, 363 | num_examples_validation: int = 1000, 364 | num_examples_earlystopping: int = 10000, 365 | output_folder: Union[str, os.PathLike] = f'finetuning_{_host}', 366 | early_stopping_metric: str = 'sisdri', 367 | distance_func: str = 'mse' 368 | ): 369 | """Finetunes a denoiser, given a checkpoint and a dataset size. 370 | """ 371 | if isinstance(checkpoint_locations, (str, os.PathLike)): 372 | checkpoint_locations = [checkpoint_locations] 373 | for checkpoint_location in checkpoint_locations: 374 | 375 | # Load checkpoint and previous settings from file. 376 | checkpoint_location = checkpoint_location.replace( 377 | 'early_stopping.txt', '') 378 | base_model, config = load_checkpoint(checkpoint_location) 379 | model_name = config.get('model_name') 380 | model_size = config.get('model_size') 381 | batch_size = config.get('batch_size') 382 | config['is_finetuning'] = True 383 | config['dataset_duration'] = dataset_duration 384 | config['learning_rate'] = learning_rate 385 | config['num_examples_validation'] = num_examples_validation 386 | config['num_examples_earlystopping'] = num_examples_earlystopping 387 | config['output_folder'] = output_folder 388 | config['early_stopping_metric'] = early_stopping_metric 389 | config['distance_func'] = distance_func 390 | 391 | # define the distance function 392 | if distance_func == 'snr': 393 | distfunc_reg = asteroid.losses.sdr.SingleSrcNegSDR('snr') 394 | distfunc_segm = SegmentalLoss('snr', reduction='none') 395 | elif distance_func == 'sisdr': 396 | distfunc_reg = asteroid.losses.sdr.SingleSrcNegSDR('sisdr') 397 | distfunc_segm = SegmentalLoss('sisdr', reduction='none') 398 | else: 399 | distfunc_reg = torch.nn.MSELoss(reduction='none') 400 | distfunc_segm = SegmentalLoss('mse', reduction='none') 401 | 402 | # If this is a generalist, loop through all the personalization targets. 403 | # Else, if it is a specialist, this loop will only run once. 
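        # Note on the parsing below: a specialist checkpoint stores
        # `config['speaker_ids']` as the string repr of a Python list, so the
        # strip/split simply inverts that repr. A hypothetical round-trip,
        # for illustration only (the sample IDs are made up):
        #
        #     ids = sorted(map(int, '[19, 26, 39]'.strip('][').split(', ')))
        #     # -> [19, 26, 39]
        #
        # A generalist checkpoint stores a repr that does not parse as a list
        # of ints, so int() raises ValueError and the except-branch falls
        # back to the full test-speaker list (`speaker_ids_te`).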
404 | try: 405 | speaker_ids = sorted(map( 406 | int, config.get('speaker_ids').strip('][').split(', '))) 407 | config['is_generalist'] = False 408 | except ValueError: 409 | speaker_ids = speaker_ids_te 410 | config['is_generalist'] = True 411 | for speaker_id in speaker_ids: 412 | 413 | current_time = datetime.now().strftime('%b%d_%H-%M-%S') 414 | model = copy.deepcopy(base_model).cuda() 415 | optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) 416 | 417 | data_tr = Mixtures( 418 | speaker_id, split_speech='train', split_mixture='train', 419 | snr_mixture=(-5, 5), dataset_duration=dataset_duration) 420 | data_vl = Mixtures( 421 | speaker_id, split_speech='val', split_mixture='val', 422 | snr_mixture=(-5, 5), dataset_duration=dataset_duration) 423 | config['data_tr'] = data_tr.__dict__() 424 | config['data_vl'] = data_vl.__dict__() 425 | config['speaker_ids'] = data_tr.speaker_ids_repr 426 | 427 | # Instantiate tensorboard 428 | trial_name = '{}_{}_{}p_{}c_{}{:03}_ft{:02}'.format( 429 | model_name, model_size, 430 | 'y' if config.get('use_loss_purification') else 'n', 431 | 'y' if config.get('use_loss_contrastive') else 'n', 432 | 'ge' if config.get('is_generalist') else 'sp', 433 | speaker_id, int(dataset_duration) 434 | ) 435 | output_directory = Path(output_folder).joinpath( 436 | current_time + '_' + trial_name) 437 | writer = SummaryWriter(str(output_directory)) 438 | save_config(output_directory, config) 439 | 440 | # Begin training 441 | num_examples: int = 0 442 | num_validations: int = 0 443 | best_score: float = np.inf * (1 if early_stopping_metric == 'loss' 444 | else 445 | -1) 446 | best_score_step: int = 0 447 | use_gradient_accumulation: bool = not bool('grunet' in model_name) 448 | print(f'Output Directory: {str(output_directory)}') 449 | 450 | try: 451 | for num_examples in itertools.count(start=0, step=batch_size): 452 | 453 | batch = data_tr(batch_size) 454 | 455 | metrics_tr = feedforward( 456 | batch.inputs, batch.targets, model.train(), 457 | loss_reg=distfunc_reg, loss_segm=distfunc_segm, 458 | accumulation=use_gradient_accumulation) 459 | optimizer.step() 460 | optimizer.zero_grad(set_to_none=True) 461 | 462 | if num_examples < (num_validations*num_examples_validation): 463 | continue 464 | 465 | num_validations += 1 466 | batch = data_vl(batch_size, seed=0) 467 | metrics_vl = feedforward( 468 | batch.inputs, batch.targets, model.eval(), 469 | loss_reg=distfunc_reg, loss_segm=distfunc_segm, 470 | accumulation=use_gradient_accumulation) 471 | 472 | # write summaries 473 | for (k, v) in metrics_tr.items(): 474 | if ('_inp' not in k) and ('_enh' not in k): 475 | writer.add_scalar( 476 | f'train/{k}', float(v), num_examples) 477 | for (k, v) in metrics_vl.items(): 478 | if ('_inp' not in k) and ('_enh' not in k): 479 | writer.add_scalar( 480 | f'validation/{k}', float(v), num_examples) 481 | 482 | do_save_checkpoint = { 483 | 'loss': bool(metrics_vl['loss'] <= best_score), 484 | 'sisdri': bool(metrics_vl['sisdri'] >= best_score) 485 | }.get(early_stopping_metric, False) 486 | 487 | if do_save_checkpoint: 488 | best_score = { 489 | 'loss': metrics_vl['loss'], 490 | 'sisdri': metrics_vl['sisdri'] 491 | }.get(early_stopping_metric, 0) 492 | best_score_step = num_examples 493 | ckpt_path = output_directory.joinpath('ckpt_best.pt') 494 | torch.save({ 495 | 'num_examples': num_examples, 496 | 'model_name': model_name, 497 | 'model_config': config, 498 | 'model_state_dict': model.state_dict(), 499 | 'optimizer_state_dict': optimizer.state_dict() 500 | }, 
ckpt_path) 501 | print(f'Examples: {num_examples:>10},\t' 502 | 'Validation SI-SDRi: '+str(metrics_vl['sisdri'])) 503 | step_path = output_directory.joinpath('best_step.txt') 504 | with open(step_path, 'w') as fp: 505 | print(num_examples, file=fp) 506 | 507 | if (num_examples - best_score_step > 508 | num_examples_earlystopping): 509 | raise EarlyStopping() 510 | 511 | except EarlyStopping: 512 | step_path = output_directory.joinpath('early_stopping.txt') 513 | with open(step_path, 'w') as fp: 514 | print(f'{num_examples},{best_score_step}', file=fp) 515 | print(f'Automatically exited after {num_examples_earlystopping}' 516 | f' examples; best model saw {best_score_step} examples.') 517 | 518 | writer.close() 519 | print(f'Output Directory: {str(output_directory)}') 520 | 521 | return 522 | 523 | 524 | def parse_arguments( 525 | arg_list: Optional[List[str]] = None 526 | ) -> argparse.Namespace: 527 | """Parses arguments from a list.""" 528 | # use system default arguments 529 | if arg_list is None: arg_list = sys.argv[1:] 530 | abs_path = lambda p: Path(p).absolute() 531 | 532 | def t_mixture_snr(string): 533 | try: 534 | return_val = float(string) 535 | except ValueError: 536 | return_val = literal_eval(string) 537 | return return_val 538 | 539 | parser = argparse.ArgumentParser() 540 | parser.add_argument('model_name', type=str) 541 | parser.add_argument('model_size', type=str, 542 | choices={'tiny', 'small', 'medium', 'large'}) 543 | parser.add_argument('--speaker_id', type=int, nargs='+', required=False) 544 | parser.add_argument('-b', '--batch_size', type=int, default=64) 545 | parser.add_argument('-l', '--learning_rate', type=float, default=1e-3) 546 | parser.add_argument('--use_loss_purification', action='store_true') 547 | parser.add_argument('--use_loss_contrastive', action='store_true') 548 | parser.add_argument('--lambda_p', type=float, default=1.) 549 | parser.add_argument('--lambda_n', type=float, default=1.) 550 | parser.add_argument('--generalist_frac', type=float, default=1.) 551 | parser.add_argument('--distance_func', type=str, 552 | choices={'mse', 'snr', 'sisdr'}, required=True) 553 | parser.add_argument('--early_stopping_metric', type=str, 554 | choices={'loss', 'sisdri'}, default='sisdri') 555 | parser.add_argument("--premixture_snr", 556 | type=t_mixture_snr, default='(0, 15)') 557 | parser.add_argument("--mixture_snr", 558 | type=t_mixture_snr, default='(-5, 5)') 559 | parser.add_argument('--warm_start', type=abs_path) 560 | parser.add_argument('--trial_suffix', type=str, default='') 561 | parser.add_argument('--output_folder', type=abs_path, 562 | default=abs_path(__file__).parent / f'runs_{_host}') 563 | args = parser.parse_args(arg_list) 564 | 565 | # validate warm start argument 566 | if args.warm_start: 567 | if Path(args.warm_start).suffix != '.pt': 568 | raise IOError('Warm start checkpoint should have extension ".pt".') 569 | if not Path(args.warm_start).is_file(): 570 | raise IOError('Warm start checkpoint does not exist.') 571 | args.warm_start = str(args.warm_start) 572 | 573 | # validate speaker IDs 574 | if args.speaker_id: 575 | # check that speaker IDs are valid for personalization experiments 576 | if not set(args.speaker_id).issubset(set(speaker_ids_te)): 577 | raise ExperimentError( 578 | 'Please choose speaker IDs specified in "speakers/test.csv". 
' 579 | 'Allowed values are: {}.'.format(speaker_ids_te)) 580 | return args 581 | 582 | 583 | def hparam_search_cm( 584 | speaker_id_or_ids: Union[int, Sequence[int]] = 200, 585 | num_cpus: int = 1, 586 | num_gpus: int = 1 587 | ): 588 | # define the hyperparameter search space 589 | search_space = { 590 | 'distance_func': tune.grid_search(['snr',]), 591 | 'use_loss_purification': tune.grid_search([False, True]), 592 | 'lambda_p': tune.grid_search([0, 0.0001, 0.0005, 0.001, 0.005, 593 | 0.01, 0.05, 0.1, 0.5, 1]), 594 | 'lambda_n': tune.grid_search([0, 0.0001, 0.0005, 0.001, 0.005, 595 | 0.01, 0.05, 0.1, 0.5, 1]), 596 | } 597 | 598 | def ray_search_cm(config): 599 | d_tr = ContrastiveMixtures( 600 | speaker_id_or_ids, split_speech='pretrain', 601 | split_premixture='train', snr_premixture=(0, 15), 602 | split_mixture='train', snr_mixture=(-5, 5)) 603 | d_vl = ContrastiveMixtures( 604 | speaker_id_or_ids, split_speech='preval', 605 | split_premixture='val', snr_premixture=(0, 15), 606 | split_mixture='val', snr_mixture=(-5, 5)) 607 | train_denoiser( 608 | model_name='convtasnet', 609 | model_size='small', 610 | data_tr=d_tr, 611 | data_vl=d_vl, 612 | use_loss_purification=config['use_loss_purification'], 613 | lambda_p=config['lambda_p'], 614 | lambda_n=config['lambda_n'], 615 | output_folder='.', 616 | distance_func=config['distance_func'], 617 | called_by_ray=True, 618 | ) 619 | return 620 | 621 | analysis = tune.run( 622 | ray_search_cm, 623 | name='ray_search_cm', 624 | config=search_space, 625 | resources_per_trial={'cpu': num_cpus, 'gpu': num_gpus}, 626 | reuse_actors=True, 627 | log_to_file=True, 628 | local_dir='.', 629 | fail_fast=True, 630 | verbose=1 631 | ) 632 | ts = datetime.now().strftime('%b%d_%H-%M-%S') 633 | analysis.results_df.to_csv(f'ray_search_cm/results_{ts}.csv') 634 | return 635 | 636 | 637 | def hparam_search_df( 638 | speaker_id_or_ids: Union[int, Sequence[int]] = 200, 639 | num_cpus: int = 1, 640 | num_gpus: int = 1 641 | ): 642 | # define the hyperparameter search space 643 | search_space = { 644 | 'model_size': tune.grid_search(['tiny', 'small', 'medium', 'large']), 645 | 'distance_func': tune.grid_search(['mse', 'snr', 'sisdr']), 646 | 'use_loss_purification': tune.grid_search([False, True]), 647 | } 648 | 649 | def ray_search_distance_func(config): 650 | d_tr = Mixtures( 651 | speaker_id_or_ids, split_speech='pretrain', 652 | split_premixture='train', snr_premixture=(0, 15), 653 | split_mixture='train', snr_mixture=(-5, 5)) 654 | d_vl = Mixtures( 655 | speaker_id_or_ids, split_speech='preval', 656 | split_premixture='val', snr_premixture=(0, 15), 657 | split_mixture='val', snr_mixture=(-5, 5)) 658 | train_denoiser( 659 | model_name='convtasnet', 660 | model_size=config['model_size'], 661 | data_tr=d_tr, 662 | data_vl=d_vl, 663 | use_loss_purification=config['use_loss_purification'], 664 | output_folder='.', 665 | distance_func=config['distance_func'], 666 | called_by_ray=True, 667 | ) 668 | return 669 | 670 | analysis = tune.run( 671 | ray_search_distance_func, 672 | name='ray_search_distance_func', 673 | config=search_space, 674 | resources_per_trial={'cpu': num_cpus, 'gpu': num_gpus}, 675 | reuse_actors=True, 676 | log_to_file=True, 677 | local_dir='.', 678 | fail_fast=True, 679 | verbose=1 680 | ) 681 | ts = datetime.now().strftime('%b%d_%H-%M-%S') 682 | analysis.results_df.to_csv(f'ray_search_distance_func/results_{ts}.csv') 683 | return 684 | 685 | -------------------------------------------------------------------------------- /code/snr_predictor: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/IU-SAIGE/pse/e9bf8f73f0d4a9a53dc1c0d19dd9b9b3d8979ce8/code/snr_predictor -------------------------------------------------------------------------------- /code/speakers/test.csv: -------------------------------------------------------------------------------- 1 | speaker_id,speaker_name,gender 2 | 19,Kara Shallenberg,F 3 | 26,Denny Sayers,M 4 | 39,Sherry Crowther,F 5 | 40,Vicki Barbour,F 6 | 78,Hugh McGuire,M 7 | 83,Catharine Eastman,F 8 | 87,Rosalind Wills,F 9 | 89,Kristen McQuillin,F 10 | 118,Alex Buie,M 11 | 125,Claire Goget,F 12 | 163,Andrew Miller,M 13 | 196,Stewart Wills,M 14 | 198,Heather Barnett,F 15 | 200,Maureen S. O'Brien,F 16 | 201,Joplin James,M 17 | 250,Mary Reagan,F 18 | 254,Alan Davis Drake,M 19 | 307,Randy Phillips,M 20 | 405,Eric Dennison,M 21 | 446,Steve Hartzog,M 22 | 23 | -------------------------------------------------------------------------------- /code/speakers/train.csv: -------------------------------------------------------------------------------- 1 | speaker_id,speaker_name,gender 2 | 27,Sean McKinley,M 3 | 32,Betsie Bush,F 4 | 60,|CBW|Simon,M 5 | 103,Karen Savage,F 6 | 150,Fox in the Stars,F 7 | 211,shanda_w,F 8 | 226,Deb Bacon-Ziegler,F 9 | 229,carnright,M 10 | 233,Steve Karafit,M 11 | 248,Becky Miller,F 12 | 289,Barbara Wedge,F 13 | 311,deadwhitemales,M 14 | 328,Elizabeth Palmer,F 15 | 332,Aaron Teiser,M 16 | 374,kumarei,M 17 | 403,Nocturna,F 18 | 412,Brian Roberg,M 19 | 441,Sandra,F 20 | 445,Dave Foss,M 21 | 458,Scott Splavec,M 22 | 460,Dave Ranson,M 23 | 481,Neal Foley,M 24 | 625,toriasuncle,M 25 | 669,Anne,F 26 | 696,Tamara R. Schwartz,F 27 | 831,Nick Gallant,M 28 | 839,rovert405,M 29 | 909,Greg Bryant,M 30 | 911,frankjf,M 31 | 1034,Kevin O'Coin,M 32 | 1040,John Garvin,M 33 | 1069,Dawn,F 34 | 1081,Fracture,M 35 | 1088,Christabel,F 36 | 1098,Merryb,F 37 | 1183,roolynninms,F 38 | 1246,Sandra,F 39 | 1334,John Schell,M 40 | 1355,Chris Gladis,M 41 | 1363,Tammy Sanders,F 42 | 1447,Luigina,F 43 | 1455,webslog,M 44 | 1553,Mim Ritty,F 45 | 1578,Lorelle Anderson,F 46 | 1624,Daniel Shorten,M 47 | 1737,Erin Hastings,F 48 | 1743,Bryan Ness,M 49 | 1841,Laura Caldwell,F 50 | 1898,Jennifer,F 51 | 1926,Nikki Sullivan,F 52 | 1963,Belinda Brown,F 53 | 1970,Dawn Larsen,F 54 | 1992,Michelle White,F 55 | 2002,Larry Maddocks,M 56 | 2007,Sheila Morton,F 57 | 2092,Elaine Hamby,F 58 | 2136,Great Plains,M 59 | 2182,Susan Umpleby,F 60 | 2196,Andrea Fiore,F 61 | 2384,Ger,M 62 | 2391,treefingers,F 63 | 2416,Julia Albath,F 64 | 2514,S. Young,M 65 | 2691,Donna Stewart,F 66 | 2764,Piper Hale,F 67 | 2817,Catherine Millward,F 68 | 2836,Linda McDaniel,F 69 | 2843,ricell,M 70 | 2893,Ryan Sutter,M 71 | 2910,Janna,F 72 | 2989,Jamie Strassenburg,F 73 | 3112,Jessica Louise,F 74 | 3168,David Anton,M 75 | 3214,fourteatoo,M 76 | 3235,Karen Commins,F 77 | 3240,flakker,M 78 | 3242,peac,M 79 | 3259,Kate West,F 80 | 3374,Craig Campbell,M 81 | 3436,Anders Lankford,M 82 | 3440,Heidi Will,F 83 | 3486,Robin Balmer,M 84 | 3526,Bereni,F 85 | 3607,Richard Wallis,M 86 | 3664,Barry Eads,M 87 | 3699,Bruce Pirie,M 88 | 3723,Kevin Lavin,M 89 | 3807,Jesse Noar,M 90 | 3830,rymd80,M 91 | 3857,Epistomolus,M 92 | 3879,Keneva,F 93 | 3947,johnell,F 94 | 3982,Kate Adams,F 95 | 3983,lavocedorata,F 96 | 4014,Tom Clifton,M 97 | 4018,Nicholas Clifford,M 98 | 4051,Liz Devens,F 99 | 4088,Blazin48,F 100 | 4137,Sarah LuAnn,F 101 | 4160,Rosie,F 102 | 4195,bj,F 103 | 4214,A. 
Janelle Risa,F 104 | 4267,Ric F,M 105 | 4297,Tina Horning,F 106 | 4340,kiwafruit,F 107 | 4362,Michelle Montano,F 108 | 4397,John Dennison,M 109 | 4406,Matthew Scott Surprenant,M 110 | 4441,William Peck,M 111 | 4481,margo zinberg,F 112 | 4640,Karen Mason,F 113 | 4680,pachayes,F 114 | 4788,Bill Boerst,M 115 | 4813,Steve Mattern,M 116 | 4830,George Aalto,M 117 | 4853,Barbara Derksen,F 118 | 4859,nathank,M 119 | 4898,greatbasinrain,M 120 | 5022,Kathleen Costa,F 121 | 5049,Bradley Smith,M 122 | 5104,Chuck Burke,M 123 | 5163,LilyAnne,F 124 | 5192,Jason Esteves,M 125 | 5322,Jay Bidal,M 126 | 5339,Lauren McCullough,F 127 | 5390,Charles Bice,M 128 | 5393,Amy Hengst,F 129 | 5456,e_scarab,M 130 | 5463,GLM,M 131 | 5514,Ella Jane Quentin,F 132 | 5561,Ellen Jones,F 133 | 5652,amicrazy2u,F 134 | 5678,jgoffena,M 135 | 5688,Jennifer Dionne,F 136 | 5703,Garth Comira,M 137 | 5750,laurencetrask,M 138 | 5778,Laura Victoria,F 139 | 5789,Kirsten Wever,F 140 | 5808,jeandelfrio,M 141 | 5867,Sharon Omi,F 142 | 6000,MissRose,F 143 | 6019,DerekP,M 144 | 6064,Deborah Knight,F 145 | 6078,dobsonfly,F 146 | 6081,Lazuli,M 147 | 6147,Liberty Stump,F 148 | 6181,Mike,M 149 | 6209,deckerteach,M 150 | 6272,jlenardon,F 151 | 6367,Vince Dee,M 152 | 6385,Novella Serena,F 153 | 6415,Daryl Wor,F 154 | 6437,John Hoerr,M 155 | 6454,David Wales,M 156 | 6476,Viridian,F 157 | 6529,Fred DeBerardinis,M 158 | 6531,janesandberg,F 159 | 6563,William Tomcho,M 160 | 6818,beckyboyd,F 161 | 6836,John,M 162 | 6848,KarlHenning,M 163 | 6880,Capybara,M 164 | 6925,Thomas Meaney,M 165 | 7059,Joannemmp,F 166 | 7067,Matthew Wall,M 167 | 7078,Mary in Arkansas,F 168 | 7113,Sukaina Jaffer,F 169 | 7148,Vickie Ranz,F 170 | 7178,J.K. Neely,F 171 | 7190,Tony Posante,M 172 | 7226,Jonathan Moore,M 173 | 7264,Sean McClain,M 174 | 7278,Jon Smith,M 175 | 7302,Asta1234,F 176 | 7312,nkneer,M 177 | 7367,NIneFive83,M 178 | 7402,Canby Ibarra,M 179 | 7447,dasbury,M 180 | 7505,Ron Lockhart,M 181 | 7511,Sherri Vance,F 182 | 7517,Raz Mason,F 183 | 7635,Judy Guinan,F 184 | 7780,tazzle,F 185 | 7794,mlcui,F 186 | 7800,Arie,F 187 | 7859,xinamarieuhl,F 188 | 8014,constatine,F 189 | 8051,Maria Kasper,F 190 | 8063,Robert Snoza,M 191 | 8088,Jason Bolestridge,M 192 | 8095,Theodulf,M 193 | 8098,Arnold,M 194 | 8108,drakaunus,M 195 | 8123,Sheila Wood,F 196 | 8226,Adam Picot,M 197 | 8238,Madam Fickle,F 198 | 8312,Jaimie Noy,F 199 | 8324,Kathy Wright,F 200 | 8419,Jon Kissack,M 201 | 8425,Larry Wilson,M 202 | 8465,TinaNygard2,F 203 | 8468,Jennifer Dorr,F 204 | 8580,Gary Dana,M 205 | 8609,noblesavage,M 206 | 8629,Shivansh Dhar,M 207 | 8630,Eduardo,M 208 | 8747,DeanOBuchanan,M 209 | 8770,Paul Simonin,M 210 | 8797,Sean Grabosky,M 211 | 8838,Kevin Owens,M 212 | 8975,Daisy Flaim,F -------------------------------------------------------------------------------- /code/speakers/validation.csv: -------------------------------------------------------------------------------- 1 | speaker_id,speaker_name,gender 2 | 298,Caroline Morse,F 3 | 302,Chris Peterson,F 4 | 322,Elisabeth Shields,F 5 | 426,Norah Piehl,F 6 | 587,Joy Scaglione,F 7 | 730,Karen Labenz,F 8 | 887,Lana Taylor,F 9 | 1116,Megan Stemm-Wade,F 10 | 1235,Tim Gregory,M 11 | 1263,Leonie Rose,F 12 | 1502,Ann Boyer,F 13 | 1594,Jon Scott Jones,M 14 | 1723,Rob Whelan,M 15 | 1867,Rowdy Delaney,M 16 | 2159,Matthew Westra,M 17 | 2289,David Kleparek,M 18 | 2436,Seth Adam Sher,M 19 | 2518,Rob Powell,M 20 | 2911,David Lawrence,M 21 | 2952,Scott Carpenter,M 22 | -------------------------------------------------------------------------------- 
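The three CSV files above define the disjoint LibriSpeech speaker splits used by the experiments, and the test split doubles as the set of valid personalization targets accepted by run.py's --speaker_id flag. A minimal sketch of reading them, mirroring the pandas usage in train_generalists.py below (running from inside the code/ directory is an assumption):

import pandas as pd

# Held-out speakers reserved for personalization experiments; only these
# IDs pass the validation check in run.py's parse_arguments().
speaker_ids_te = pd.read_csv('speakers/test.csv')['speaker_id'].to_list()
print(speaker_ids_te[:5])  # [19, 26, 39, 40, 78]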
/code/test.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | import re 3 | import sys 4 | from datetime import datetime 5 | 6 | import torch 7 | from ray import tune 8 | 9 | from exp_data import Mixtures 10 | from exp_models import load_checkpoint, feedforward 11 | 12 | 13 | def get_timestamp() -> str: 14 | # format_str = "%A, %d %b %Y %H:%M:%S %p" 15 | format_str = "%Y_%b_%d" 16 | result = str(datetime.now().strftime(format_str)) 17 | return result 18 | 19 | 20 | def no_op_loss(*args, **kwargs): 21 | return 0 22 | 23 | 24 | @torch.no_grad() 25 | def test_function( 26 | filepath: str, 27 | print_to_console: bool = True, 28 | write_to_file: bool = True, 29 | called_by_tune: bool = True 30 | ): 31 | use_gradient_accumulation = not bool('grunet' in filepath) 32 | filepath = pathlib.Path(filepath.strip().replace('early_stopping.txt', '')) 33 | 34 | # load the experiment configuration (should be in the same directory) 35 | model, config, num_examples = load_checkpoint(filepath) 36 | 37 | # identify the personalization target (if there is one) 38 | # and prepare the speaker-specific test sets 39 | speaker_id = 200 40 | if 'ray' not in str(filepath): 41 | try: 42 | match = re.match(r'.*_(sp|ge)(\d\d\d).*', str(filepath)) 43 | speaker_id = int(match.group(2)) 44 | except AttributeError: 45 | raise NotImplementedError('need to add support for generalists') 46 | dataset = Mixtures(speaker_id, 47 | split_speech='test', 48 | split_mixture='test', 49 | snr_mixture=(-5, 5)) 50 | 51 | # run the test 52 | batch = dataset(100, seed=0) 53 | results = feedforward(batch.inputs, batch.targets, model, 54 | weights=None, accumulation=use_gradient_accumulation, 55 | test=True, loss_reg=no_op_loss, loss_segm=no_op_loss) 56 | 57 | if print_to_console: 58 | print(f'{filepath} ({num_examples}),{results}') 59 | if write_to_file: 60 | with open('log.txt', 'a') as op: 61 | print(f'{filepath} ({num_examples}),{results}', file=op) 62 | if called_by_tune: 63 | tune.report(**results) 64 | return 65 | else: 66 | return results 67 | 68 | 69 | def main(use_tune: bool = False): 70 | 71 | if len(sys.argv) > 1: 72 | folders = sys.argv[1:] 73 | else: 74 | p = pathlib.Path('/N/u/asivara/2022-jstsp/0408_hparams_cm').rglob( 75 | 'ckpt_best.pt') 76 | folders = [str(f) for f in p if '619b6' in str(f)] 77 | # else: 78 | # raise ValueError('Expected subsequent arguments to be checkpoint paths ' 79 | # 'or directory.') 80 | 81 | def test_wrapper(config): 82 | return test_function( 83 | filepath=config['filepath'], 84 | called_by_tune=True 85 | ) 86 | 87 | if use_tune: 88 | tune.run( 89 | test_wrapper, 90 | config={ 91 | 'filepath': tune.grid_search(folders) 92 | }, 93 | resources_per_trial={'cpu': 1, 'gpu': 0.25}, 94 | local_dir=f'test_results-({get_timestamp()})' 95 | ) 96 | 97 | else: 98 | for f in folders: 99 | test_function(f, called_by_tune=False) 100 | 101 | 102 | if __name__ == '__main__': 103 | main(False) 104 | -------------------------------------------------------------------------------- /code/train_generalists.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import sys 4 | import pandas as pd 5 | from ray import tune 6 | 7 | from datetime import datetime 8 | from exp_data import Mixtures 9 | from exp_utils import get_config_from_yaml 10 | from run import train_denoiser 11 | 12 | 13 | ROOT_DIR = os.path.dirname(os.path.realpath(sys.argv[0])) 14 | 15 | 16 | def train_generalist(config: dict): 
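    """Trains a single generalist denoiser for one Ray Tune trial.

    `config` is assembled in the __main__ block below from
    conf_generalists.yaml: the grid-searched keys (model_name, model_size,
    distance_func, batch_size, learning_rate) arrive resolved to single
    values, alongside the shared dataset folders and the smoke-test flag.
    """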
17 | 18 | speaker_ids_tr = pd.read_csv( 19 | ROOT_DIR+'/speakers/train.csv')['speaker_id'].to_list() 20 | speaker_ids_vl = pd.read_csv( 21 | ROOT_DIR+'/speakers/validation.csv')['speaker_id'].to_list() 22 | 23 | data_train = Mixtures( 24 | speaker_ids_tr, 25 | config['folder_librispeech'], 26 | None, 27 | config['folder_musan'], 28 | frac_speech=config.get('generalist_frac', 1), 29 | split_mixture='train', 30 | snr_mixture=(-5, 5) 31 | ) 32 | data_validation = Mixtures( 33 | speaker_ids_vl, 34 | config['folder_librispeech'], 35 | None, 36 | config['folder_musan'], 37 | split_mixture='val', 38 | snr_mixture=(-5, 5) 39 | ) 40 | 41 | train_denoiser( 42 | model_name=config['model_name'], 43 | model_size=config['model_size'], 44 | distance_func=config['distance_func'], 45 | data_tr=data_train, 46 | data_vl=data_validation, 47 | learning_rate=config['learning_rate'], 48 | batch_size=config['batch_size'], 49 | output_folder=config['output_folder'], 50 | called_by_ray=True, 51 | run_smoke_test=config['run_smoke_test'] 52 | ) 53 | 54 | return 55 | 56 | 57 | if __name__ == '__main__': 58 | 59 | parser = argparse.ArgumentParser() 60 | parser.add_argument("-t", "--run_smoke_test", 61 | help="check if a single training iteration runs successfully", 62 | action="store_true") 63 | args = parser.parse_args() 64 | 65 | config = get_config_from_yaml(ROOT_DIR+'/conf_generalists.yaml') 66 | os.environ['CUDA_VISIBLE_DEVICES'] = config['available_devices'] 67 | 68 | analysis = tune.run( 69 | train_generalist, 70 | name='train_generalist', 71 | config={ 72 | 'model_name': tune.grid_search(config['model_name']), 73 | 'model_size': tune.grid_search(config['model_size']), 74 | 'distance_func': tune.grid_search(config['distance_func']), 75 | 'batch_size': tune.grid_search(config['batch_size']), 76 | 'learning_rate': tune.grid_search(config['learning_rate']), 77 | 'folder_librispeech': config['folder_librispeech'], 78 | 'folder_musan': config['folder_musan'], 79 | 'sample_rate': config['sample_rate'], 80 | 'example_duration': config['example_duration'], 81 | 'output_folder': config['output_folder'], 82 | 'run_smoke_test': args.run_smoke_test 83 | }, 84 | resources_per_trial={ 85 | 'cpu': config['num_cpus_per_experiment'], 86 | 'gpu': config['num_gpus_per_experiment'] 87 | }, 88 | reuse_actors=True, 89 | log_to_file=True, 90 | local_dir=config['output_folder'], 91 | fail_fast=True, 92 | verbose=3 93 | ) 94 | ts = datetime.now().strftime('%b%d_%H-%M-%S') 95 | output_filepath = os.path.join( 96 | config['output_folder'], f'train_generalist/results_{ts}.csv') 97 | analysis.results_df.to_csv(output_filepath) 98 | print('Completed training generalist(s).') 99 | 100 | -------------------------------------------------------------------------------- /code/train_specialists.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import sys 4 | from ray import tune 5 | 6 | from datetime import datetime 7 | from exp_data import ContrastiveMixtures, Mixtures 8 | from exp_utils import get_config_from_yaml 9 | from run import train_denoiser 10 | 11 | 12 | ROOT_DIR = os.path.dirname(os.path.realpath(sys.argv[0])) 13 | 14 | 15 | def train_specialist(config: dict): 16 | 17 | data_class = Mixtures 18 | if config.get('use_loss_contrastive', False): 19 | data_class = ContrastiveMixtures 20 | 21 | data_train = data_class( 22 | config['speaker_id'], 23 | config['folder_librispeech'], 24 | config['folder_fsd50k'], 25 | config['folder_musan'], 26 | split_speech='pretrain', 27 
| split_premixture='train', 28 | snr_premixture=(0, 15), 29 | split_mixture='train', 30 | snr_mixture=(-5, 5) 31 | ) 32 | data_validation = data_class( 33 | config['speaker_id'], 34 | config['folder_librispeech'], 35 | config['folder_fsd50k'], 36 | config['folder_musan'], 37 | split_speech='preval', 38 | split_premixture='val', 39 | snr_premixture=(0, 15), 40 | split_mixture='val', 41 | snr_mixture=(-5, 5) 42 | ) 43 | 44 | train_denoiser( 45 | model_name=config['model_name'], 46 | model_size=config['model_size'], 47 | distance_func=config['distance_func'], 48 | data_tr=data_train, 49 | data_vl=data_validation, 50 | learning_rate=config['learning_rate'], 51 | use_loss_purification=config['use_loss_purification'], 52 | batch_size=config['batch_size'], 53 | output_folder=config['output_folder'], 54 | called_by_ray=True, 55 | run_smoke_test=config['run_smoke_test'] 56 | ) 57 | 58 | return 59 | 60 | 61 | if __name__ == '__main__': 62 | 63 | parser = argparse.ArgumentParser() 64 | parser.add_argument("-t", "--run_smoke_test", 65 | help="check if a single training iteration runs successfully", 66 | action="store_true") 67 | args = parser.parse_args() 68 | 69 | config = get_config_from_yaml(ROOT_DIR+'/conf_specialists.yaml') 70 | os.environ['CUDA_VISIBLE_DEVICES'] = config['available_devices'] 71 | 72 | analysis = tune.run( 73 | train_specialist, 74 | name='train_specialist', 75 | config={ 76 | 'model_name': tune.grid_search(config['model_name']), 77 | 'model_size': tune.grid_search(config['model_size']), 78 | 'distance_func': tune.grid_search(config['distance_func']), 79 | 'speaker_id': tune.grid_search(config['speaker_id']), 80 | 'use_loss_contrastive': tune.grid_search( 81 | config['use_loss_contrastive']), 82 | 'use_loss_purification': tune.grid_search( 83 | config['use_loss_purification']), 84 | 'batch_size': tune.grid_search(config['batch_size']), 85 | 'learning_rate': tune.grid_search(config['learning_rate']), 86 | 'folder_librispeech': config['folder_librispeech'], 87 | 'folder_fsd50k': config['folder_fsd50k'], 88 | 'folder_musan': config['folder_musan'], 89 | 'sample_rate': config['sample_rate'], 90 | 'example_duration': config['example_duration'], 91 | 'output_folder': config['output_folder'], 92 | 'run_smoke_test': args.run_smoke_test 93 | }, 94 | resources_per_trial={ 95 | 'cpu': config['num_cpus_per_experiment'], 96 | 'gpu': config['num_gpus_per_experiment'] 97 | }, 98 | reuse_actors=True, 99 | log_to_file=True, 100 | local_dir=config['output_folder'], 101 | fail_fast=True, 102 | verbose=3 103 | ) 104 | ts = datetime.now().strftime('%b%d_%H-%M-%S') 105 | output_filepath = os.path.join( 106 | config['output_folder'], f'train_specialist/results_{ts}.csv') 107 | analysis.results_df.to_csv(output_filepath) 108 | print('Completed training specialist(s).') 109 | 110 | -------------------------------------------------------------------------------- /docs/images/pse_ssl_overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IU-SAIGE/pse/e9bf8f73f0d4a9a53dc1c0d19dd9b9b3d8979ce8/docs/images/pse_ssl_overview.png -------------------------------------------------------------------------------- /docs/images/waveforms_cm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IU-SAIGE/pse/e9bf8f73f0d4a9a53dc1c0d19dd9b9b3d8979ce8/docs/images/waveforms_cm.png -------------------------------------------------------------------------------- 
/docs/images/waveforms_pseudose.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IU-SAIGE/pse/e9bf8f73f0d4a9a53dc1c0d19dd9b9b3d8979ce8/docs/images/waveforms_pseudose.png --------------------------------------------------------------------------------
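Taken together, the scripts form a three-stage workflow: train_generalists.py and train_specialists.py launch Ray Tune grids over the YAML configs, finetune.py adapts a saved checkpoint to individual test speakers, and test.py scores checkpoints on speaker-specific test mixtures. A hypothetical end-to-end sequence using the module-level entry points (run from inside code/; the checkpoint paths and the dataset-duration budget of 60 are placeholders, the duration being in whatever units Mixtures expects):

from run import finetune_denoiser
from test import test_function

# Personalize a trained model on a small, fixed-duration dataset;
# finetune.py exposes exactly this call on the command line.
finetune_denoiser(
    dataset_duration=60.0,
    checkpoint_locations=['trials_myhost/Oct01_12-00-00_mytrial/ckpt_best.pt'])

# Score the resulting checkpoint on the held-out test mixtures; the trial
# name encodes the speaker ID (sp200), which test.py parses back out.
test_function(
    'finetuning_myhost/Oct02_09-00-00_convtasnet_small_np_nc_sp200_ft60/ckpt_best.pt',
    called_by_tune=False)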