├── .gitmodules
├── tools
│   ├── __init__.py
│   ├── setup_helpers
│   │   └── __init__.py
│   └── travis
│       └── test_script.sh
├── version.txt
├── src
│   ├── torio
│   │   ├── lib
│   │   │   └── __init__.py
│   │   ├── utils
│   │   │   └── __init__.py
│   │   ├── __init__.py
│   │   ├── io
│   │   │   └── __init__.py
│   │   └── _extension
│   │       └── __init__.py
│   ├── torchaudio
│   │   ├── lib
│   │   │   └── __init__.py
│   │   ├── prototype
│   │   │   ├── __init__.py
│   │   │   ├── datasets
│   │   │   │   └── __init__.py
│   │   │   ├── pipelines
│   │   │   │   ├── _vggish
│   │   │   │   │   └── __init__.py
│   │   │   │   └── __init__.py
│   │   │   ├── transforms
│   │   │   │   └── __init__.py
│   │   │   ├── functional
│   │   │   │   └── __init__.py
│   │   │   └── models
│   │   │       └── __init__.py
│   │   ├── pipelines
│   │   │   ├── _wav2vec2
│   │   │   │   └── __init__.py
│   │   │   └── _tts
│   │   │       └── __init__.py
│   │   ├── compliance
│   │   │   └── __init__.py
│   │   ├── models
│   │   │   ├── wav2vec2
│   │   │   │   ├── utils
│   │   │   │   │   └── __init__.py
│   │   │   │   └── __init__.py
│   │   │   ├── squim
│   │   │   │   └── __init__.py
│   │   │   └── decoder
│   │   │       └── __init__.py
│   │   ├── utils
│   │   │   ├── __init__.py
│   │   │   └── ffmpeg_utils.py
│   │   ├── _internal
│   │   │   └── __init__.py
│   │   ├── backend
│   │   │   ├── __init__.py
│   │   │   ├── common.py
│   │   │   ├── no_backend.py
│   │   │   ├── sox_io_backend.py
│   │   │   ├── soundfile_backend.py
│   │   │   └── _no_backend.py
│   │   ├── sox_effects
│   │   │   └── __init__.py
│   │   ├── io
│   │   │   └── __init__.py
│   │   ├── __init__.py
│   │   ├── datasets
│   │   │   └── __init__.py
│   │   └── transforms
│   │       └── __init__.py
│   ├── libtorchaudio
│   │   ├── iir_cuda.h
│   │   ├── utils.h
│   │   ├── rnnt
│   │   │   ├── compute_alphas.cpp
│   │   │   ├── compute_betas.cpp
│   │   │   ├── compute.h
│   │   │   ├── macros.h
│   │   │   ├── types.h
│   │   │   ├── gpu
│   │   │   │   ├── half.cuh
│   │   │   │   └── math.cuh
│   │   │   ├── cpu
│   │   │   │   └── math.h
│   │   │   └── compute.cpp
│   │   ├── forced_align
│   │   │   ├── compute.h
│   │   │   └── compute.cpp
│   │   ├── pybind
│   │   │   └── pybind.cpp
│   │   ├── sox
│   │   │   ├── CMakeLists.txt
│   │   │   ├── effects.h
│   │   │   ├── types.h
│   │   │   ├── io.h
│   │   │   └── pybind
│   │   │       └── pybind.cpp
│   │   ├── utils.cpp
│   │   └── cuctc
│   │       ├── CMakeLists.txt
│   │       └── LICENSE
│   └── libtorio
│       └── ffmpeg
│           ├── hw_context.h
│           ├── stream_reader
│           │   ├── packet_buffer.h
│           │   ├── buffer
│           │   │   ├── unchunked_buffer.h
│           │   │   ├── unchunked_buffer.cpp
│           │   │   └── chunked_buffer.h
│           │   ├── packet_buffer.cpp
│           │   └── post_process.h
│           ├── stream_writer
│           │   ├── packet_writer.h
│           │   ├── types.h
│           │   ├── encoder.h
│           │   └── packet_writer.cpp
│           └── hw_context.cpp
├── test
│   ├── integration_tests
│   │   ├── __init__.py
│   │   ├── prototype
│   │   │   └── vggish_pipeline_test.py
│   │   ├── ctc_decoder_integration_test.py
│   │   ├── tacotron2_pipeline_test.py
│   │   └── rnnt_pipeline_test.py
│   ├── torchaudio_unittest
│   │   ├── io
│   │   │   ├── __init__.py
│   │   │   └── common.py
│   │   ├── models
│   │   │   ├── __init__.py
│   │   │   ├── rnnt
│   │   │   │   ├── __init__.py
│   │   │   │   ├── rnnt_cpu_test.py
│   │   │   │   └── rnnt_gpu_test.py
│   │   │   ├── conformer
│   │   │   │   ├── __init__.py
│   │   │   │   ├── conformer_cpu_test.py
│   │   │   │   └── conformer_gpu_test.py
│   │   │   ├── decoder
│   │   │   │   └── __init__.py
│   │   │   ├── emformer
│   │   │   │   ├── __init__.py
│   │   │   │   ├── emformer_cpu_test.py
│   │   │   │   └── emformer_gpu_test.py
│   │   │   ├── hdemucs
│   │   │   │   ├── __init__.py
│   │   │   │   ├── hdemucs_cpu_test.py
│   │   │   │   └── hdemucs_gpu_test.py
│   │   │   ├── squim
│   │   │   │   └── __init__.py
│   │   │   ├── tacotron2
│   │   │   │   ├── __init__.py
│   │   │   │   ├── model_test_cpu_test.py
│   │   │   │   └── model_test_gpu_test.py
│   │   │   ├── wav2vec2
│   │   │   │   └── __init__.py
│   │   │   └── rnnt_decoder
│   │   │       ├── __init__.py
│   │   │       ├── rnnt_decoder_cpu_test.py
│   │   │       └── rnnt_decoder_gpu_test.py
│   │   ├── utils
│   │   │   └── __init__.py
│   │   ├── backend
│   │   │   ├── __init__.py
│   │   │   ├── sox_io
│   │   │   │   ├── __init__.py
│   │   │   │   └── common.py
│   │   │   ├── dispatcher
│   │   │   │   ├── __init__.py
│   │   │   │   ├── ffmpeg
│   │   │   │   │   └── __init__.py
│   │   │   │   ├── sox
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   └── common.py
│   │   │   │   └── soundfile
│   │   │   │       └── __init__.py
│   │   │   ├── soundfile
│   │   │   │   └── __init__.py
│   │   │   └── common.py
│   │   ├── compliance
│   │   │   ├── __init__.py
│   │   │   └── kaldi
│   │   │       ├── __init__.py
│   │   │       ├── kaldi_compatibility_cpu_test.py
│   │   │       └── kaldi_compatibility_cuda_test.py
│   │   ├── datasets
│   │   │   ├── __init__.py
│   │   │   └── librispeech_test.py
│   │   ├── functional
│   │   │   ├── __init__.py
│   │   │   ├── librosa_compatibility_cpu_test.py
│   │   │   ├── kaldi_compatibility_cpu_test.py
│   │   │   ├── librosa_compatibility_cuda_test.py
│   │   │   ├── autograd_cpu_test.py
│   │   │   ├── kaldi_compatibility_cuda_test.py
│   │   │   ├── torchscript_consistency_cpu_test.py
│   │   │   ├── autograd_cuda_test.py
│   │   │   ├── torchscript_consistency_cuda_test.py
│   │   │   ├── functional_cuda_test.py
│   │   │   └── kaldi_compatibility_test_impl.py
│   │   ├── prototype
│   │   │   ├── __init__.py
│   │   │   ├── datasets
│   │   │   │   └── __init__.py
│   │   │   ├── functional
│   │   │   │   ├── __init__.py
│   │   │   │   ├── librosa_compatibility_cpu_test.py
│   │   │   │   ├── librosa_compatibility_cuda_test.py
│   │   │   │   ├── autograd_cpu_test.py
│   │   │   │   ├── autograd_cuda_test.py
│   │   │   │   ├── torchscript_consistency_cuda_test.py
│   │   │   │   ├── functional_cpu_test.py
│   │   │   │   ├── functional_cuda_test.py
│   │   │   │   └── torchscript_consistency_cpu_test.py
│   │   │   ├── hifi_gan
│   │   │   │   ├── __init__.py
│   │   │   │   ├── original
│   │   │   │   │   ├── env.py
│   │   │   │   │   └── utils.py
│   │   │   │   ├── hifi_gan_cpu_test.py
│   │   │   │   └── hifi_gan_gpu_test.py
│   │   │   ├── transforms
│   │   │   │   ├── __init__.py
│   │   │   │   ├── autograd_cpu_test.py
│   │   │   │   ├── autograd_cuda_test.py
│   │   │   │   ├── librosa_compatibility_cpu_test.py
│   │   │   │   ├── librosa_compatibility_cuda_test.py
│   │   │   │   ├── transforms_cpu_test.py
│   │   │   │   └── transforms_cuda_test.py
│   │   │   ├── rnnt_cpu_test.py
│   │   │   ├── conv_emformer_cpu_test.py
│   │   │   ├── rnnt_gpu_test.py
│   │   │   ├── conv_emformer_gpu_test.py
│   │   │   └── conv_emformer_test_impl.py
│   │   ├── sox_effect
│   │   │   ├── __init__.py
│   │   │   └── common.py
│   │   ├── transforms
│   │   │   ├── __init__.py
│   │   │   ├── librosa_compatibility_cpu_test.py
│   │   │   ├── autograd_cpu_test.py
│   │   │   ├── librosa_compatibility_cuda_test.py
│   │   │   ├── autograd_cuda_test.py
│   │   │   ├── transforms_cpu_test.py
│   │   │   ├── transforms_cuda_test.py
│   │   │   ├── torchscript_consistency_cpu_test.py
│   │   │   └── torchscript_consistency_cuda_test.py
│   │   ├── example
│   │   │   ├── tacotron2
│   │   │   │   ├── __init__.py
│   │   │   │   ├── tacotron2_loss_cpu_test.py
│   │   │   │   └── tacotron2_loss_gpu_test.py
│   │   │   ├── souce_sepration
│   │   │   │   ├── __init__.py
│   │   │   │   └── metrics_test.py
│   │   │   ├── __init__.py
│   │   │   ├── hubert
│   │   │   │   └── __init__.py
│   │   │   └── emformer_rnnt
│   │   │       ├── __init__.py
│   │   │       └── utils.py
│   │   ├── assets
│   │   │   ├── VCTK-Corpus
│   │   │   │   ├── txt
│   │   │   │   │   └── p224
│   │   │   │   │       └── p224_002.txt
│   │   │   │   └── wav48
│   │   │   │       └── p224
│   │   │   │           └── p224_002.wav
│   │   │   ├── decoder
│   │   │   │   ├── tokens.txt
│   │   │   │   ├── nnlm_lex_dict.txt
│   │   │   │   ├── lexicon.txt
│   │   │   │   ├── nnlm_lexfree_dict.txt
│   │   │   │   ├── kenlm.arpa
│   │   │   │   └── kenlm_char.arpa
│   │   │   ├── sox_effect_test_fir_coeffs.txt
│   │   │   ├── mat.ark
│   │   │   ├── vec_int.ark
│   │   │   ├── sinewave.wav
│   │   │   ├── testsrc.hevc
│   │   │   ├── vec_flt.ark
│   │   │   ├── kaldi_file.wav
│   │   │   ├── mp3_without_ext
│   │   │   ├── nasa_13013.avi
│   │   │   ├── nasa_13013.mp4
│   │   │   ├── io
│   │   │   │   ├── 96k_0_1ch.opus
│   │   │   │   ├── 96k_0_2ch.opus
│   │   │   │   ├── 96k_10_1ch.opus
│   │   │   │   ├── 96k_10_2ch.opus
│   │   │   │   ├── 96k_5_1ch.opus
│   │   │   │   └── 96k_5_2ch.opus
│   │   │   ├── kaldi_file_8000.wav
│   │   │   ├── nasa_13013_no_audio.mp4
│   │   │   ├── nasa_13013_no_video.mp4
│   │   │   ├── vad-go-mono-32000.wav
│   │   │   ├── vad-go-stereo-44100.wav
│   │   │   ├── RATRACE_wave_f_nm_np1_fr_goo_37.avi
│   │   │   ├── steam-train-whistle-daniel_simon.mp3
│   │   │   ├── steam-train-whistle-daniel_simon.wav
│   │   │   ├── kaldi_test_pitch_args.jsonl
│   │   │   ├── README.md
│   │   │   └── wav2vec2
│   │   │       ├── huggingface
│   │   │       │   └── generate_huggingface_model_config.py
│   │   │       └── fairseq
│   │   │           └── xlsr_53_56k.json
│   │   ├── __init__.py
│   │   ├── common_utils
│   │   │   ├── func_utils.py
│   │   │   ├── psd_utils.py
│   │   │   ├── autograd_utils.py
│   │   │   └── kaldi_utils.py
│   │   └── kaldi_io_test.py
│   ├── smoke_test
│   │   └── smoke_test_no_ffmpeg.py
│   └── cpp
│       └── CMakeLists.txt
├── examples
│   ├── pipeline_tacotron2
│   │   └── text
│   │       └── __init__.py
│   ├── tutorials
│   │   └── README.rst
│   ├── libtorchaudio
│   │   ├── .gitignore
│   │   ├── data
│   │   │   ├── rir.wav
│   │   │   ├── input.wav
│   │   │   └── README.md
│   │   ├── augmentation
│   │   │   ├── CMakeLists.txt
│   │   │   ├── main.cpp
│   │   │   └── README.md
│   │   ├── speech_recognition
│   │   │   ├── CMakeLists.txt
│   │   │   ├── greedy_decoder.py
│   │   │   └── transcribe.cpp
│   │   ├── build.sh
│   │   ├── CMakeLists.txt
│   │   └── README.md
│   ├── avsr
│   │   ├── data_prep
│   │   │   ├── requirements.txt
│   │   │   ├── detectors
│   │   │   │   └── retinaface
│   │   │   │       └── detector.py
│   │   │   └── tools
│   │   │       └── README.md
│   │   ├── models
│   │   │   ├── conformer_rnnt.py
│   │   │   ├── emformer_rnnt.py
│   │   │   └── fusion.py
│   │   ├── average_checkpoints.py
│   │   └── schedulers.py
│   ├── hubert
│   │   ├── loss
│   │   │   └── __init__.py
│   │   ├── utils
│   │   │   └── __init__.py
│   │   └── dataset
│   │       └── __init__.py
│   ├── source_separation
│   │   ├── conv_tasnet
│   │   │   └── __init__.py
│   │   └── utils
│   │       ├── dataset
│   │       │   └── __init__.py
│   │       └── __init__.py
│   ├── self_supervised_learning
│   │   ├── losses
│   │   │   └── __init__.py
│   │   ├── lr_schedulers
│   │   │   ├── __init__.py
│   │   │   └── _linear_decay.py
│   │   ├── data_modules
│   │   │   └── __init__.py
│   │   └── README.md
│   ├── pipeline_wav2letter
│   │   ├── transforms.py
│   │   ├── ctc_decoders.py
│   │   ├── languagemodels.py
│   │   └── utils.py
│   ├── asr
│   │   └── librispeech_conformer_rnnt_biasing
│   │       ├── score.sh
│   │       └── blists
│   │           └── README.md
│   └── pipeline_wavernn
│       └── processing.py
├── .github
│   ├── pytorch-probot.yml
│   ├── scripts
│   │   ├── unittest-windows
│   │   │   ├── install_conda.bat
│   │   │   ├── environment.yml
│   │   │   ├── run_test.sh
│   │   │   ├── set_cuda_envs.sh
│   │   │   └── setup_env.sh
│   │   ├── ffmpeg
│   │   │   └── build.bat
│   │   └── unittest-linux
│   │       └── run_test.sh
│   ├── pull_request_template.md
│   ├── ISSUE_TEMPLATE
│   │   ├── config.yml
│   │   ├── documentation.yml
│   │   └── feature-request.yml
│   └── workflows
│       ├── bandit.yml
│       ├── pr-labels.yml
│       ├── integration-test.yml
│       └── lint.yml
├── docs
│   ├── source
│   │   ├── references.rst
│   │   ├── _static
│   │   │   └── img
│   │   │       ├── logo.png
│   │   │       └── favicon.ico
│   │   ├── _templates
│   │   │   └── autosummary
│   │   │       ├── class.rst
│   │   │       ├── bundle_data.rst
│   │   │       ├── io.rst
│   │   │       ├── utils.rst
│   │   │       ├── dataset_class.rst
│   │   │       ├── cuda_ctc_decoder_class.rst
│   │   │       ├── ctc_decoder_class.rst
│   │   │       └── io_class.rst
│   │   ├── libtorio.rst
│   │   ├── prototype.datasets.rst
│   │   ├── torio.utils.rst
│   │   ├── prototype.transforms.rst
│   │   ├── utils.rst
│   │   ├── io.rst
│   │   ├── torio.io.rst
│   │   ├── compliance.kaldi.rst
│   │   ├── sox_effects.rst
│   │   ├── torio.rst
│   │   ├── prototype.rst
│   │   ├── prototype.functional.rst
│   │   ├── models.decoder.rst
│   │   ├── models.rst
│   │   ├── kaldi_io.rst
│   │   ├── feature_classifications.rst
│   │   ├── datasets.rst
│   │   └── prototype.models.rst
│   ├── requirements-tutorials.txt
│   ├── post_process_dispatcher.py
│   ├── make.bat
│   ├── requirements.txt
│   └── Makefile
├── mypy.ini
├── setup.cfg
├── .gitattributes
├── packaging
│   ├── torchaudio
│   │   ├── bld.bat
│   │   └── build.sh
│   ├── vs2019
│   │   ├── meta.yaml
│   │   ├── conda_build_config.yaml
│   │   └── activate.bat
│   ├── windows
│   │   └── internal
│   │       └── driver_update.bat
│   ├── vc_env_helper.bat
│   └── cut_release.sh
├── requirements.txt
├── pyproject.toml
├── .flake8
├── CODEOWNERS
├── third_party
│   ├── LICENSES_BUNDLED.txt
│   ├── sox
│   │   └── CMakeLists.txt
│   └── ffmpeg
│       └── single
│           └── CMakeLists.txt
├── CITATION
├── .clang-tidy
├── .pre-commit-config.yaml
└── LICENSE

/.gitmodules:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tools/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/version.txt:
--------------------------------------------------------------------------------
1 | 2.2.0a0
2 |
--------------------------------------------------------------------------------
/src/torio/lib/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/torchaudio/lib/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/torchaudio/prototype/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/integration_tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/io/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/models/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/utils/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/examples/pipeline_tacotron2/text/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/torchaudio/pipelines/_wav2vec2/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/backend/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/compliance/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/datasets/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/functional/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/models/rnnt/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/prototype/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/sox_effect/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/transforms/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/.github/pytorch-probot.yml:
--------------------------------------------------------------------------------
1 | tracking_issue: 736
2 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/backend/sox_io/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/compliance/kaldi/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/models/conformer/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/models/decoder/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/models/emformer/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/models/hdemucs/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/models/squim/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/models/tacotron2/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/models/wav2vec2/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/examples/tutorials/README.rst:
--------------------------------------------------------------------------------
1 | Tutorials
2 | =========
3 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/backend/dispatcher/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/backend/soundfile/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/example/tacotron2/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/models/rnnt_decoder/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/prototype/datasets/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/prototype/functional/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/prototype/hifi_gan/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/prototype/transforms/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/backend/dispatcher/ffmpeg/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/backend/dispatcher/sox/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/example/souce_sepration/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/backend/dispatcher/soundfile/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tools/setup_helpers/__init__.py:
--------------------------------------------------------------------------------
1 | from .extension import *  # noqa
2 |
--------------------------------------------------------------------------------
/docs/source/references.rst:
--------------------------------------------------------------------------------
1 | References
2 | ----------
3 | 
4 | .. bibliography::
5 |
--------------------------------------------------------------------------------
/mypy.ini:
--------------------------------------------------------------------------------
1 | [mypy]
2 | allow_redefinition = True
3 | ignore_missing_imports = True
4 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/assets/VCTK-Corpus/txt/p224/p224_002.txt:
--------------------------------------------------------------------------------
1 | VCTK Test.
2 |
--------------------------------------------------------------------------------
/examples/libtorchaudio/.gitignore:
--------------------------------------------------------------------------------
1 | build
2 | data/output.wav
3 | *.zip
4 | output
5 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/assets/decoder/tokens.txt:
--------------------------------------------------------------------------------
1 | -
2 | |
3 | f
4 | o
5 | b
6 | a
7 | r
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [pydocstyle]
2 | select = D417 # Missing argument descriptions in the docstring
--------------------------------------------------------------------------------
/examples/avsr/data_prep/requirements.txt:
--------------------------------------------------------------------------------
1 | tqdm
2 | scikit-image
3 | opencv-python
4 | ffmpeg-python
5 |
--------------------------------------------------------------------------------
/src/torio/utils/__init__.py:
--------------------------------------------------------------------------------
1 | from . import ffmpeg_utils
2 | 
3 | 
4 | __all__ = ["ffmpeg_utils"]
5 |
--------------------------------------------------------------------------------
/test/smoke_test/smoke_test_no_ffmpeg.py:
--------------------------------------------------------------------------------
1 | from smoke_test import main
2 | 
3 | main(["--no-ffmpeg"])
4 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/assets/decoder/nnlm_lex_dict.txt:
--------------------------------------------------------------------------------
1 | |
2 | foo
3 | bar
4 | foobar
5 |
6 |
--------------------------------------------------------------------------------
/src/torchaudio/compliance/__init__.py:
--------------------------------------------------------------------------------
1 | from . import kaldi
2 | 
3 | __all__ = [
4 |     "kaldi",
5 | ]
6 |
--------------------------------------------------------------------------------
/src/torchaudio/prototype/datasets/__init__.py:
--------------------------------------------------------------------------------
1 | from .musan import Musan
2 | 
3 | 
4 | __all__ = ["Musan"]
5 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/assets/decoder/lexicon.txt:
--------------------------------------------------------------------------------
1 | foo f o o |
2 | bar b a r |
3 | foobar f o o b a r |
--------------------------------------------------------------------------------
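
The decoder assets above use the lexicon format consumed by lexicon-based CTC beam-search decoders: each line maps a word to its token-level spelling, terminated by the word-boundary token `|`. A minimal parsing sketch (illustrative only — this is not the decoder's actual loader):

    # Build {word: [tokens]} from a lexicon file such as assets/decoder/lexicon.txt.
    lexicon = {}
    with open("test/torchaudio_unittest/assets/decoder/lexicon.txt") as f:
        for line in f:
            if line.strip():
                word, *tokens = line.split()
                lexicon[word] = tokens  # e.g. "foo" -> ["f", "o", "o", "|"]
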
/test/torchaudio_unittest/assets/sox_effect_test_fir_coeffs.txt:
--------------------------------------------------------------------------------
1 | 0.0195 -0.082 0.234 0.891 -0.145 0.043
2 |
--------------------------------------------------------------------------------
/docs/source/_static/img/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wetdog/audio/main/docs/source/_static/img/logo.png
--------------------------------------------------------------------------------
/test/torchaudio_unittest/__init__.py:
--------------------------------------------------------------------------------
1 | try:
2 |     from . import fb  # noqa
3 | except Exception:
4 |     pass
5 |
--------------------------------------------------------------------------------
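
The `test/torchaudio_unittest/__init__.py` above guards its import so that an internal-only `fb` module is picked up when present and silently skipped otherwise. The same idiom in generic form (the module name `extra_hooks` is hypothetical):

    # Optional-dependency import: degrade gracefully when the module is absent.
    try:
        import extra_hooks  # internal add-on; not present in public checkouts
    except Exception:
        extra_hooks = None  # callers check for None before using it
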
/test/torchaudio_unittest/assets/decoder/nnlm_lexfree_dict.txt:
--------------------------------------------------------------------------------
1 | -
2 | |
3 | f
4 | o
5 | b
6 | a
7 | r
8 |
9 |
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | # To exclude autogenerated files from code reviews
2 | .circleci/config.yml linguist-generated=true
3 |
--------------------------------------------------------------------------------
/docs/source/_static/img/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wetdog/audio/main/docs/source/_static/img/favicon.ico
--------------------------------------------------------------------------------
/examples/libtorchaudio/data/rir.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wetdog/audio/main/examples/libtorchaudio/data/rir.wav
--------------------------------------------------------------------------------
/examples/hubert/loss/__init__.py:
--------------------------------------------------------------------------------
1 | from .hubert_loss import hubert_loss
2 | 
3 | __all__ = [
4 |     "hubert_loss",
5 | ]
6 |
--------------------------------------------------------------------------------
/examples/libtorchaudio/data/input.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wetdog/audio/main/examples/libtorchaudio/data/input.wav
--------------------------------------------------------------------------------
/examples/source_separation/conv_tasnet/__init__.py:
--------------------------------------------------------------------------------
1 | from . import train, trainer
2 | 
3 | __all__ = ["train", "trainer"]
4 |
--------------------------------------------------------------------------------
/examples/source_separation/utils/dataset/__init__.py:
--------------------------------------------------------------------------------
1 | from . import utils, wsj0mix
2 | 
3 | __all__ = ["utils", "wsj0mix"]
4 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/assets/mat.ark:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wetdog/audio/main/test/torchaudio_unittest/assets/mat.ark
--------------------------------------------------------------------------------
/test/torchaudio_unittest/assets/vec_int.ark:
--------------------------------------------------------------------------------
1 | key1 Bkey2 B   key3 B
--------------------------------------------------------------------------------
/test/torchaudio_unittest/assets/sinewave.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wetdog/audio/main/test/torchaudio_unittest/assets/sinewave.wav
--------------------------------------------------------------------------------
/test/torchaudio_unittest/assets/testsrc.hevc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wetdog/audio/main/test/torchaudio_unittest/assets/testsrc.hevc
--------------------------------------------------------------------------------
/test/torchaudio_unittest/assets/vec_flt.ark:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wetdog/audio/main/test/torchaudio_unittest/assets/vec_flt.ark
--------------------------------------------------------------------------------
/test/torchaudio_unittest/assets/kaldi_file.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wetdog/audio/main/test/torchaudio_unittest/assets/kaldi_file.wav
--------------------------------------------------------------------------------
/test/torchaudio_unittest/assets/mp3_without_ext:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wetdog/audio/main/test/torchaudio_unittest/assets/mp3_without_ext
--------------------------------------------------------------------------------
/test/torchaudio_unittest/assets/nasa_13013.avi:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wetdog/audio/main/test/torchaudio_unittest/assets/nasa_13013.avi
--------------------------------------------------------------------------------
/test/torchaudio_unittest/assets/nasa_13013.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wetdog/audio/main/test/torchaudio_unittest/assets/nasa_13013.mp4
--------------------------------------------------------------------------------
/examples/source_separation/utils/__init__.py:
--------------------------------------------------------------------------------
1 | from . import dataset, dist_utils, metrics
2 | 
3 | __all__ = ["dataset", "dist_utils", "metrics"]
4 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/assets/io/96k_0_1ch.opus:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wetdog/audio/main/test/torchaudio_unittest/assets/io/96k_0_1ch.opus
--------------------------------------------------------------------------------
/test/torchaudio_unittest/assets/io/96k_0_2ch.opus:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wetdog/audio/main/test/torchaudio_unittest/assets/io/96k_0_2ch.opus
--------------------------------------------------------------------------------
/test/torchaudio_unittest/assets/io/96k_10_1ch.opus:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wetdog/audio/main/test/torchaudio_unittest/assets/io/96k_10_1ch.opus
--------------------------------------------------------------------------------
/test/torchaudio_unittest/assets/io/96k_10_2ch.opus:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wetdog/audio/main/test/torchaudio_unittest/assets/io/96k_10_2ch.opus
--------------------------------------------------------------------------------
/test/torchaudio_unittest/assets/io/96k_5_1ch.opus:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wetdog/audio/main/test/torchaudio_unittest/assets/io/96k_5_1ch.opus
--------------------------------------------------------------------------------
/test/torchaudio_unittest/assets/io/96k_5_2ch.opus:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wetdog/audio/main/test/torchaudio_unittest/assets/io/96k_5_2ch.opus
--------------------------------------------------------------------------------
/test/torchaudio_unittest/assets/kaldi_file_8000.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wetdog/audio/main/test/torchaudio_unittest/assets/kaldi_file_8000.wav
--------------------------------------------------------------------------------
/packaging/torchaudio/bld.bat:
--------------------------------------------------------------------------------
1 | @echo off
2 | 
3 | set IS_CONDA=1
4 | 
5 | python setup.py install --single-version-externally-managed --record=record.txt
6 |
--------------------------------------------------------------------------------
/src/torchaudio/prototype/pipelines/_vggish/__init__.py:
--------------------------------------------------------------------------------
1 | from ._vggish_pipeline import VGGISH, VGGishBundle
2 | 
3 | __all__ = ["VGGISH", "VGGishBundle"]
4 |
--------------------------------------------------------------------------------
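
The `_vggish` package re-exports the `VGGISH` bundle and its `VGGishBundle` class. A hedged usage sketch, assuming the bundle follows the accessor shape of other torchaudio pipeline bundles (a `get_model()` method); consult the pipeline documentation for the exact interface:

    from torchaudio.prototype.pipelines import VGGISH

    model = VGGISH.get_model()  # assumption: get_model() as on other torchaudio bundles
    model.eval()
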
/test/torchaudio_unittest/assets/nasa_13013_no_audio.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wetdog/audio/main/test/torchaudio_unittest/assets/nasa_13013_no_audio.mp4
--------------------------------------------------------------------------------
/test/torchaudio_unittest/assets/nasa_13013_no_video.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wetdog/audio/main/test/torchaudio_unittest/assets/nasa_13013_no_video.mp4
--------------------------------------------------------------------------------
/test/torchaudio_unittest/assets/vad-go-mono-32000.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wetdog/audio/main/test/torchaudio_unittest/assets/vad-go-mono-32000.wav
--------------------------------------------------------------------------------
/test/torchaudio_unittest/assets/vad-go-stereo-44100.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wetdog/audio/main/test/torchaudio_unittest/assets/vad-go-stereo-44100.wav
--------------------------------------------------------------------------------
/.github/scripts/unittest-windows/install_conda.bat:
--------------------------------------------------------------------------------
1 | start /wait "" "%miniconda_exe%" /S /InstallationType=JustMe /RegisterPython=0 /AddToPath=0 /D=%tmp_conda%
2 |
--------------------------------------------------------------------------------
/src/torio/__init__.py:
--------------------------------------------------------------------------------
1 | from . import _extension  # noqa  # usort: skip
2 | from . import io, utils
3 | 
4 | 
5 | __all__ = [
6 |     "io",
7 |     "utils",
8 | ]
9 |
--------------------------------------------------------------------------------
/examples/self_supervised_learning/losses/__init__.py:
--------------------------------------------------------------------------------
1 | from ._hubert_loss import hubert_loss
2 | 
3 | __all__ = [
4 |     "hubert_loss",
5 |     "wav2vec2_loss",
6 | ]
7 |
--------------------------------------------------------------------------------
/examples/self_supervised_learning/lr_schedulers/__init__.py:
--------------------------------------------------------------------------------
1 | from ._linear_decay import LinearDecayLRScheduler
2 | 
3 | __all__ = [
4 |     "LinearDecayLRScheduler",
5 | ]
6 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/example/__init__.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | 
4 | 
5 | sys.path.append(os.path.join(os.path.dirname(__file__), "..", "..", "..", "examples"))
6 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/assets/RATRACE_wave_f_nm_np1_fr_goo_37.avi:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wetdog/audio/main/test/torchaudio_unittest/assets/RATRACE_wave_f_nm_np1_fr_goo_37.avi
--------------------------------------------------------------------------------
/test/torchaudio_unittest/assets/VCTK-Corpus/wav48/p224/p224_002.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wetdog/audio/main/test/torchaudio_unittest/assets/VCTK-Corpus/wav48/p224/p224_002.wav
--------------------------------------------------------------------------------
/test/torchaudio_unittest/assets/steam-train-whistle-daniel_simon.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wetdog/audio/main/test/torchaudio_unittest/assets/steam-train-whistle-daniel_simon.mp3
--------------------------------------------------------------------------------
/test/torchaudio_unittest/assets/steam-train-whistle-daniel_simon.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wetdog/audio/main/test/torchaudio_unittest/assets/steam-train-whistle-daniel_simon.wav
--------------------------------------------------------------------------------
/docs/requirements-tutorials.txt:
--------------------------------------------------------------------------------
1 | IPython
2 | deep-phonemizer
3 | boto3
4 | cython
5 | pandas
6 | librosa==0.10.0
7 | sentencepiece
8 | pandoc
9 | mir_eval
10 | pesq
11 | pystoi
--------------------------------------------------------------------------------
/examples/self_supervised_learning/data_modules/__init__.py:
--------------------------------------------------------------------------------
1 | from ._hubert_datamodule import HuBERTDataModule
2 | 
3 | __all__ = [
4 |     "HuBERTDataModule",
5 |     "Wav2Vec2DataModule",
6 | ]
7 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/example/hubert/__init__.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | 
4 | 
5 | sys.path.append(os.path.join(os.path.dirname(__file__), "..", "..", "..", "..", "examples", "hubert"))
6 |
--------------------------------------------------------------------------------
/docs/source/_templates/autosummary/class.rst:
--------------------------------------------------------------------------------
1 | ..
2 |    autogenerated from source/_templates/autosummary/class.rst
3 | 
4 | {{ name | underline }}
5 | 
6 | .. autoclass:: {{ fullname }}
7 |    :members:
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | # Minimum runtime dependencies
2 | torch
3 | 
4 | # Optional runtime dependencies
5 | kaldi_io
6 | SoundFile
7 | 
8 | # For build and test-time dependencies please refer to CONTRIBUTING.md
--------------------------------------------------------------------------------
/.github/pull_request_template.md:
--------------------------------------------------------------------------------
1 | PLEASE NOTE THAT THE TORCHAUDIO REPOSITORY IS NO LONGER ACTIVELY MONITORED. You may not get a response. For open discussions, visit https://discuss.pytorch.org/.
2 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/example/emformer_rnnt/__init__.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | 
4 | 
5 | sys.path.append(os.path.join(os.path.dirname(__file__), "..", "..", "..", "..", "examples", "asr", "emformer_rnnt"))
6 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/prototype/hifi_gan/original/env.py:
--------------------------------------------------------------------------------
1 | class AttrDict(dict):
2 |     def __init__(self, *args, **kwargs):
3 |         super(AttrDict, self).__init__(*args, **kwargs)
4 |         self.__dict__ = self
5 |
--------------------------------------------------------------------------------
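
`env.py` makes dict keys readable as attributes by pointing the instance's `__dict__` at the dict itself; the upstream HiFi-GAN training code reads its JSON config through this class. A small usage sketch (the config keys here are invented for illustration):

    config = AttrDict({"learning_rate": 2e-4, "num_mels": 80})
    assert config.num_mels == config["num_mels"]  # attribute and key access share one store
    config.batch_size = 16  # new attributes land in the underlying dict as well
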
/examples/libtorchaudio/augmentation/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | add_executable(augment main.cpp)
2 | target_link_libraries(augment "${TORCH_LIBRARIES}" "${TORCHAUDIO_LIBRARY}")
3 | set_property(TARGET augment PROPERTY CXX_STANDARD 14)
4 |
--------------------------------------------------------------------------------
/src/libtorchaudio/iir_cuda.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include 
4 | 
5 | void cuda_lfilter_core_loop(
6 |     const torch::Tensor& in,
7 |     const torch::Tensor& a_flipped,
8 |     torch::Tensor& padded_out);
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/config.yml:
--------------------------------------------------------------------------------
1 | blank_issues_enabled: true
2 | contact_links:
3 |   - name: Usage questions
4 |     url: https://discuss.pytorch.org/
5 |     about: Ask questions and discuss with other torchaudio community members
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.usort]
2 | 
3 | first_party_detection = false
4 | 
5 | [tool.black]
6 | 
7 | line-length = 120
8 | target-version = ["py38"]
9 | 
10 | [tool.ufmt]
11 | excludes = [
12 |     "examples/tutorials/",
13 | ]
14 |
--------------------------------------------------------------------------------
/src/libtorchaudio/utils.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | #include 
3 | 
4 | namespace torchaudio {
5 | bool is_rir_available();
6 | bool is_align_available();
7 | c10::optional<int64_t> cuda_version();
8 | } // namespace torchaudio
--------------------------------------------------------------------------------
/src/torchaudio/models/wav2vec2/utils/__init__.py:
--------------------------------------------------------------------------------
1 | from .import_fairseq import import_fairseq_model
2 | from .import_huggingface import import_huggingface_model
3 | 
4 | __all__ = [
5 |     "import_huggingface_model",
6 |     "import_fairseq_model",
7 | ]
8 |
--------------------------------------------------------------------------------
/src/torchaudio/utils/__init__.py:
--------------------------------------------------------------------------------
1 | from torio.utils import ffmpeg_utils
2 | 
3 | from . import sox_utils
4 | from .download import download_asset
5 | 
6 | 
7 | __all__ = [
8 |     "download_asset",
9 |     "sox_utils",
10 |     "ffmpeg_utils",
11 | ]
12 |
--------------------------------------------------------------------------------
/src/libtorio/ffmpeg/hw_context.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include 
4 | 
5 | namespace torio::io {
6 | 
7 | AVBufferRef* get_cuda_context(int index);
8 | 
9 | void clear_cuda_context_cache();
10 | 
11 | } // namespace torio::io
--------------------------------------------------------------------------------
/test/torchaudio_unittest/prototype/transforms/autograd_cpu_test.py:
--------------------------------------------------------------------------------
1 | from torchaudio_unittest.common_utils import PytorchTestCase
2 | 
3 | from .autograd_test_impl import Autograd
4 | 
5 | 
6 | class AutogradCPUTest(Autograd, PytorchTestCase):
7 |     device = "cpu"
8 |
--------------------------------------------------------------------------------
/docs/source/_templates/autosummary/bundle_data.rst:
--------------------------------------------------------------------------------
1 | ..
2 |    autogenerated from source/_templates/autosummary/bundle_data.rst
3 | 
4 | {{ name | underline }}
5 | 
6 | .. container:: py attribute
7 | 
8 |    .. autodata:: {{ fullname }}
9 |       :no-value:
--------------------------------------------------------------------------------
/docs/source/libtorio.rst:
--------------------------------------------------------------------------------
1 | libtorio
2 | ========
3 | 
4 | .. warning::
5 |    TorchAudio's C++ API is a prototype feature.
6 |    API/ABI backward compatibility is not guaranteed.
7 | 
8 | .. toctree::
9 |    libtorio.stream_reader
10 |    libtorio.stream_writer
--------------------------------------------------------------------------------
/test/torchaudio_unittest/assets/kaldi_test_pitch_args.jsonl:
--------------------------------------------------------------------------------
1 | {"sample_rate": 8000}
2 | {"sample_rate": 8000, "frames_per_chunk": 200}
3 | {"sample_rate": 8000, "frames_per_chunk": 200, "simulate_first_pass_online": true}
4 | {"sample_rate": 16000}
5 | {"sample_rate": 44100}
--------------------------------------------------------------------------------
/src/torio/io/__init__.py:
--------------------------------------------------------------------------------
1 | from ._streaming_media_decoder import StreamingMediaDecoder
2 | from ._streaming_media_encoder import CodecConfig, StreamingMediaEncoder
3 | 
4 | 
5 | __all__ = [
6 |     "StreamingMediaDecoder",
7 |     "CodecConfig",
8 |     "StreamingMediaEncoder",
9 | ]
10 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/prototype/functional/librosa_compatibility_cpu_test.py:
--------------------------------------------------------------------------------
1 | from torchaudio_unittest.common_utils import PytorchTestCase
2 | 
3 | from .librosa_compatibility_test_impl import Functional
4 | 
5 | 
6 | class TestFunctionalCPU(Functional, PytorchTestCase):
7 |     device = "cpu"
8 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/prototype/transforms/autograd_cuda_test.py:
--------------------------------------------------------------------------------
1 | from torchaudio_unittest.common_utils import PytorchTestCase, skipIfNoCuda
2 | 
3 | from .autograd_test_impl import Autograd
4 | 
5 | 
6 | @skipIfNoCuda
7 | class AutogradCUDATest(Autograd, PytorchTestCase):
8 |     device = "cuda"
9 |
--------------------------------------------------------------------------------
/src/torchaudio/prototype/transforms/__init__.py:
--------------------------------------------------------------------------------
1 | from ._transforms import BarkScale, BarkSpectrogram, ChromaScale, ChromaSpectrogram, InverseBarkScale
2 | 
3 | __all__ = [
4 |     "BarkScale",
5 |     "BarkSpectrogram",
6 |     "ChromaScale",
7 |     "ChromaSpectrogram",
8 |     "InverseBarkScale",
9 | ]
10 |
--------------------------------------------------------------------------------
/src/torchaudio/_internal/__init__.py:
--------------------------------------------------------------------------------
1 | try:
2 |     from .fb import download_url_to_file, load_state_dict_from_url
3 | except ImportError:
4 |     from torch.hub import download_url_to_file, load_state_dict_from_url
5 | 
6 | 
7 | __all__ = [
8 |     "load_state_dict_from_url",
9 |     "download_url_to_file",
10 | ]
11 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/common_utils/func_utils.py:
--------------------------------------------------------------------------------
1 | import io
2 | 
3 | import torch
4 | 
5 | 
6 | def torch_script(obj):
7 |     """TorchScript the given function or Module"""
8 |     buffer = io.BytesIO()
9 |     torch.jit.save(torch.jit.script(obj), buffer)
10 |     buffer.seek(0)
11 |     return torch.jit.load(buffer)
12 |
--------------------------------------------------------------------------------
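
`torch_script` round-trips an object through `torch.jit.save`/`torch.jit.load` on an in-memory buffer, so tests exercise serialization in addition to scripting. A usage sketch with a throwaway module:

    import torch

    class Doubler(torch.nn.Module):
        def forward(self, x):
            return x * 2

    scripted = torch_script(Doubler())  # scripted, serialized, and reloaded in one call
    assert torch.equal(scripted(torch.ones(3)), torch.full((3,), 2.0))
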
/test/torchaudio_unittest/prototype/functional/librosa_compatibility_cuda_test.py:
--------------------------------------------------------------------------------
1 | from torchaudio_unittest.common_utils import PytorchTestCase, skipIfNoCuda
2 | 
3 | from .librosa_compatibility_test_impl import Functional
4 | 
5 | 
6 | @skipIfNoCuda
7 | class TestFunctionalCUDA(Functional, PytorchTestCase):
8 |     device = "cuda"
9 |
--------------------------------------------------------------------------------
/src/torchaudio/backend/__init__.py:
--------------------------------------------------------------------------------
1 | # NOTE:
2 | # The entire `torchaudio.backend` module is deprecated.
3 | # New things should be added to `torchaudio._backend`.
4 | # Only things related to backward compatibility should be placed here.
5 | 
6 | from . import common, no_backend, soundfile_backend, sox_io_backend  # noqa
7 | 
8 | __all__ = []
9 |
--------------------------------------------------------------------------------
/.github/scripts/unittest-windows/environment.yml:
--------------------------------------------------------------------------------
1 | channels:
2 |   - defaults
3 | dependencies:
4 |   - flake8
5 |   - pytest
6 |   - pytest-cov
7 |   - codecov
8 |   - scipy >= 1.4.1
9 |   - pip
10 |   - pip:
11 |       - kaldi-io
12 |       - PySoundFile
13 |       - future
14 |       - parameterized
15 |       - dataclasses
16 |       - expecttest
17 |
--------------------------------------------------------------------------------
/src/libtorchaudio/rnnt/compute_alphas.cpp:
--------------------------------------------------------------------------------
1 | #include 
2 | 
3 | TORCH_LIBRARY_FRAGMENT(torchaudio, m) {
4 |   m.def(
5 |       "rnnt_loss_alphas(Tensor logits,"
6 |       "Tensor targets,"
7 |       "Tensor logit_lengths,"
8 |       "Tensor target_lengths,"
9 |       "int blank,"
10 |       "float clamp) -> Tensor");
11 | }
12 |
--------------------------------------------------------------------------------
/src/libtorchaudio/rnnt/compute_betas.cpp:
--------------------------------------------------------------------------------
1 | #include 
2 | 
3 | TORCH_LIBRARY_FRAGMENT(torchaudio, m) {
4 |   m.def(
5 |       "rnnt_loss_betas(Tensor logits,"
6 |       "Tensor targets,"
7 |       "Tensor logit_lengths,"
8 |       "Tensor target_lengths,"
9 |       "int blank,"
10 |       "float clamp) -> Tensor");
11 | }
12 |
--------------------------------------------------------------------------------
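
`compute_alphas.cpp` and `compute_betas.cpp` only declare operator schemas via `TORCH_LIBRARY_FRAGMENT`; the CPU/CUDA kernels are registered elsewhere in libtorchaudio. Once the compiled extension is loaded (importing torchaudio triggers this), the declared names resolve on the `torch.ops` namespace. A sketch:

    import torch
    import torchaudio  # loads the extension that registers kernels for these schemas

    op = torch.ops.torchaudio.rnnt_loss_alphas  # bound to the schema declared above
    print(op)  # calling it dispatches to the registered CPU or CUDA kernel
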
/src/torchaudio/sox_effects/__init__.py:
--------------------------------------------------------------------------------
1 | from .sox_effects import apply_effects_file, apply_effects_tensor, effect_names, init_sox_effects, shutdown_sox_effects
2 | 
3 | 
4 | __all__ = [
5 |     "init_sox_effects",
6 |     "shutdown_sox_effects",
7 |     "effect_names",
8 |     "apply_effects_tensor",
9 |     "apply_effects_file",
10 | ]
11 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/prototype/hifi_gan/original/utils.py:
--------------------------------------------------------------------------------
1 | def init_weights(m, mean=0.0, std=0.01):
2 |     classname = m.__class__.__name__
3 |     if classname.find("Conv") != -1:
4 |         m.weight.data.normal_(mean, std)
5 | 
6 | 
7 | def get_padding(kernel_size, dilation=1):
8 |     return int((kernel_size * dilation - dilation) / 2)
9 |
--------------------------------------------------------------------------------
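
`get_padding` computes `dilation * (kernel_size - 1) / 2`, the padding that keeps a convolution's output length equal to its input length for odd kernel sizes. A quick check of that property (a sketch using torch):

    import torch

    pad = get_padding(7, dilation=2)  # int((7 * 2 - 2) / 2) = 6
    conv = torch.nn.Conv1d(1, 1, kernel_size=7, dilation=2, padding=pad)
    x = torch.randn(1, 1, 100)
    assert conv(x).shape[-1] == 100  # effective kernel 13, padded 6 on each side
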
/src/libtorchaudio/forced_align/compute.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include 
4 | 
5 | std::tuple<torch::Tensor, torch::Tensor> forced_align(
6 |     const torch::Tensor& logProbs,
7 |     const torch::Tensor& targets,
8 |     const torch::Tensor& inputLengths,
9 |     const torch::Tensor& targetLengths,
10 |     const int64_t blank);
--------------------------------------------------------------------------------
/test/torchaudio_unittest/prototype/functional/autograd_cpu_test.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torchaudio_unittest.common_utils import PytorchTestCase
3 | 
4 | from .autograd_test_impl import AutogradTestImpl
5 | 
6 | 
7 | class TestAutogradCPUFloat64(AutogradTestImpl, PytorchTestCase):
8 |     dtype = torch.float64
9 |     device = torch.device("cpu")
10 |
--------------------------------------------------------------------------------
/docs/source/prototype.datasets.rst:
--------------------------------------------------------------------------------
1 | .. py:module:: torchaudio.prototype.datasets
2 | 
3 | torchaudio.prototype.datasets
4 | =============================
5 | 
6 | .. currentmodule:: torchaudio.prototype.datasets
7 | 
8 | .. autosummary::
9 |    :toctree: generated
10 |    :nosignatures:
11 |    :template: autosummary/dataset_class.rst
12 | 
13 |    Musan
--------------------------------------------------------------------------------
/examples/pipeline_wav2letter/transforms.py:
--------------------------------------------------------------------------------
1 | import torch
2 | 
3 | 
4 | class Normalize(torch.nn.Module):
5 |     def forward(self, tensor):
6 |         return (tensor - tensor.mean(-1, keepdim=True)) / tensor.std(-1, keepdim=True)
7 | 
8 | 
9 | class UnsqueezeFirst(torch.nn.Module):
10 |     def forward(self, tensor):
11 |         return tensor.unsqueeze(0)
12 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/transforms/librosa_compatibility_cpu_test.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torchaudio_unittest.common_utils import PytorchTestCase
3 | 
4 | from .librosa_compatibility_test_impl import TransformsTestBase
5 | 
6 | 
7 | class TestTransforms(TransformsTestBase, PytorchTestCase):
8 |     dtype = torch.float64
9 |     device = torch.device("cpu")
10 |
--------------------------------------------------------------------------------
/.flake8:
--------------------------------------------------------------------------------
1 | [flake8]
2 | # Note: it's recommended to use `pre-commit run -a flake8`
3 | 
4 | max-line-length = 120
5 | ignore = E203,E402,E741,W503
6 | 
7 | # Note: exclude is not honored when flake8 is executed from pre-commit.
8 | # pre-commit has a separate config
9 | exclude = build,docs/src,third_party
10 | 
11 | per-file-ignores =
12 |     examples/tutorials/*.py: E501
13 |
--------------------------------------------------------------------------------
/examples/hubert/utils/__init__.py:
--------------------------------------------------------------------------------
1 | from .common_utils import _get_id2label, _get_label2id, create_tsv
2 | from .feature_utils import dump_features
3 | from .kmeans import get_km_label, learn_kmeans
4 | 
5 | __all__ = [
6 |     "create_tsv",
7 |     "_get_id2label",
8 |     "_get_label2id",
9 |     "dump_features",
10 |     "learn_kmeans",
11 |     "get_km_label",
12 | ]
13 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/datasets/librispeech_test.py:
--------------------------------------------------------------------------------
1 | from torchaudio.datasets import librispeech
2 | from torchaudio_unittest.common_utils import TorchaudioTestCase
3 | from torchaudio_unittest.datasets.librispeech_test_impl import LibriSpeechTestMixin
4 | 
5 | 
6 | class TestLibriSpeech(LibriSpeechTestMixin, TorchaudioTestCase):
7 |     librispeech_cls = librispeech.LIBRISPEECH
8 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/prototype/transforms/librosa_compatibility_cpu_test.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torchaudio_unittest.common_utils import PytorchTestCase
3 | 
4 | from .librosa_compatibility_test_impl import TransformsTestBase
5 | 
6 | 
7 | class TestTransforms(TransformsTestBase, PytorchTestCase):
8 |     dtype = torch.float64
9 |     device = torch.device("cpu")
10 |
--------------------------------------------------------------------------------
/src/torchaudio/io/__init__.py:
--------------------------------------------------------------------------------
1 | from torio.io import CodecConfig, StreamingMediaDecoder as StreamReader, StreamingMediaEncoder as StreamWriter
2 | 
3 | from ._effector import AudioEffector
4 | from ._playback import play_audio
5 | 
6 | 
7 | __all__ = [
8 |     "AudioEffector",
9 |     "StreamReader",
10 |     "StreamWriter",
11 |     "CodecConfig",
12 |     "play_audio",
13 | ]
14 |
--------------------------------------------------------------------------------
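
`src/torchaudio/io/__init__.py` keeps the legacy names `StreamReader` and `StreamWriter` as aliases of the torio classes, so both namespaces expose the same objects. A sketch verifying the aliasing (requires a build with the torio package laid out as in this repo):

    import torio.io
    from torchaudio.io import StreamReader, StreamWriter

    assert StreamReader is torio.io.StreamingMediaDecoder  # same class, legacy name
    assert StreamWriter is torio.io.StreamingMediaEncoder
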
6 | -------------------------------------------------------------------------------- /docs/source/_templates/autosummary/io.rst: -------------------------------------------------------------------------------- 1 | .. 2 | autogenerated from source/_templates/autosummary/io.rst 3 | 4 | {{ fullname | underline }} 5 | 6 | .. autofunction:: {{ fullname }} 7 | 8 | 9 | {%- if name == "info" %} 10 | 11 | Support Structure 12 | ----------------- 13 | 14 | AudioMetaData 15 | ~~~~~~~~~~~~~ 16 | 17 | .. autoclass:: torchaudio.AudioMetaData 18 | 19 | {%- endif %} 20 | -------------------------------------------------------------------------------- /src/libtorchaudio/rnnt/compute.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include <torch/script.h> 4 | 5 | std::tuple<torch::Tensor, c10::optional<torch::Tensor>> rnnt_loss( 6 | torch::Tensor& logits, 7 | const torch::Tensor& targets, 8 | const torch::Tensor& logit_lengths, 9 | const torch::Tensor& target_lengths, 10 | int64_t blank, 11 | double clamp, 12 | bool fused_log_softmax); 13 | -------------------------------------------------------------------------------- /src/torchaudio/utils/ffmpeg_utils.py: -------------------------------------------------------------------------------- 1 | """Module to change the configuration of FFmpeg libraries (such as libavformat). 2 | 3 | It affects functionalities in :py:mod:`torchaudio.io` (and indirectly :py:func:`torchaudio.load`). 4 | """ 5 | 6 | 7 | # This file is just for BC. 8 | def __getattr__(item): 9 | from torio.utils import ffmpeg_utils 10 | 11 | return getattr(ffmpeg_utils, item) 12 | -------------------------------------------------------------------------------- /src/torio/_extension/__init__.py: -------------------------------------------------------------------------------- 1 | from .utils import _init_ffmpeg, _LazyImporter 2 | 3 | 4 | _FFMPEG_EXT = None 5 | 6 | 7 | def lazy_import_ffmpeg_ext(): 8 | """Load the FFmpeg integration in a lazy manner, based on availability.""" 9 | 10 | global _FFMPEG_EXT 11 | if _FFMPEG_EXT is None: 12 | _FFMPEG_EXT = _LazyImporter("_torio_ffmpeg", _init_ffmpeg) 13 | return _FFMPEG_EXT 14 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/models/hdemucs/hdemucs_cpu_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchaudio_unittest.common_utils import PytorchTestCase 3 | from torchaudio_unittest.models.hdemucs.hdemucs_test_impl import CompareHDemucsOriginal, HDemucsTests 4 | 5 | 6 | class HDemucsFloat32CPUTest(HDemucsTests, CompareHDemucsOriginal, PytorchTestCase): 7 | dtype = torch.float32 8 | device = torch.device("cpu") 9 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/transforms/autograd_cpu_test.py: -------------------------------------------------------------------------------- 1 | from torchaudio_unittest.common_utils import PytorchTestCase 2 | 3 | from .autograd_test_impl import AutogradTestFloat32, AutogradTestMixin 4 | 5 | 6 | class AutogradCPUTest(AutogradTestMixin, PytorchTestCase): 7 | device = "cpu" 8 | 9 | 10 | class AutogradRNNTCPUTest(AutogradTestFloat32, PytorchTestCase): 11 | device = "cpu" 12 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/transforms/librosa_compatibility_cuda_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from 
torchaudio_unittest.common_utils import PytorchTestCase, skipIfNoCuda 3 | 4 | from .librosa_compatibility_test_impl import TransformsTestBase 5 | 6 | 7 | @skipIfNoCuda 8 | class TestTransforms(TransformsTestBase, PytorchTestCase): 9 | dtype = torch.float64 10 | device = torch.device("cuda") 11 | -------------------------------------------------------------------------------- /docs/source/torio.utils.rst: -------------------------------------------------------------------------------- 1 | .. py:module:: torio.utils 2 | 3 | torio.utils 4 | =========== 5 | 6 | The ``torio.utils`` module contains utility functions to query and configure the global state of third-party libraries. 7 | 8 | .. currentmodule:: torio.utils 9 | 10 | .. autosummary:: 11 | :toctree: generated 12 | :nosignatures: 13 | :template: autosummary/utils.rst 14 | 15 | ffmpeg_utils 16 | -------------------------------------------------------------------------------- /src/libtorchaudio/rnnt/macros.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #ifdef USE_CUDA 4 | #define WARP_SIZE 32 5 | #define MAX_THREADS_PER_BLOCK 1024 6 | #define REDUCE_THREADS 256 7 | #define HOST_AND_DEVICE __host__ __device__ 8 | #define FORCE_INLINE __forceinline__ 9 | #include 10 | #include 11 | #else 12 | #define HOST_AND_DEVICE 13 | #define FORCE_INLINE inline 14 | #endif // USE_CUDA 15 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/prototype/transforms/librosa_compatibility_cuda_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchaudio_unittest.common_utils import PytorchTestCase, skipIfNoCuda 3 | 4 | from .librosa_compatibility_test_impl import TransformsTestBase 5 | 6 | 7 | @skipIfNoCuda 8 | class TestTransforms(TransformsTestBase, PytorchTestCase): 9 | dtype = torch.float64 10 | device = torch.device("cuda") 11 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/functional/librosa_compatibility_cpu_test.py: -------------------------------------------------------------------------------- 1 | from torchaudio_unittest.common_utils import PytorchTestCase 2 | 3 | from .librosa_compatibility_test_impl import Functional, FunctionalComplex 4 | 5 | 6 | class TestFunctionalCPU(Functional, PytorchTestCase): 7 | device = "cpu" 8 | 9 | 10 | class TestFunctionalComplexCPU(FunctionalComplex, PytorchTestCase): 11 | device = "cpu" 12 | -------------------------------------------------------------------------------- /src/libtorio/ffmpeg/stream_reader/packet_buffer.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include <libtorio/ffmpeg/ffmpeg.h> 3 | 4 | namespace torio { 5 | namespace io { 6 | class PacketBuffer { 7 | public: 8 | void push_packet(AVPacket* packet); 9 | std::vector<AVPacketPtr> pop_packets(); 10 | bool has_packets(); 11 | 12 | private: 13 | std::deque<AVPacketPtr> packets; 14 | }; 15 | } // namespace io 16 | } // namespace torio 17 | -------------------------------------------------------------------------------- /docs/source/prototype.transforms.rst: -------------------------------------------------------------------------------- 1 | .. py:module:: torchaudio.prototype.transforms 2 | 3 | torchaudio.prototype.transforms 4 | =============================== 5 | 6 | .. currentmodule:: torchaudio.prototype.transforms 7 | 8 | .. 
autosummary:: 9 | :toctree: generated 10 | :nosignatures: 11 | 12 | BarkScale 13 | BarkSpectrogram 14 | ChromaScale 15 | ChromaSpectrogram 16 | InverseBarkScale 17 | -------------------------------------------------------------------------------- /src/libtorchaudio/pybind/pybind.cpp: -------------------------------------------------------------------------------- 1 | #include <libtorchaudio/utils.h> 2 | #include <torch/extension.h> 3 | 4 | namespace torchaudio { 5 | namespace { 6 | 7 | PYBIND11_MODULE(_torchaudio, m) { 8 | m.def("is_rir_available", &is_rir_available, ""); 9 | m.def("is_align_available", &is_align_available, ""); 10 | m.def("cuda_version", &cuda_version, ""); 11 | } 12 | 13 | } // namespace 14 | } // namespace torchaudio 15 | -------------------------------------------------------------------------------- /docs/source/_templates/autosummary/utils.rst: -------------------------------------------------------------------------------- 1 | .. 2 | autogenerated from source/_templates/autosummary/utils.rst 3 | 4 | .. py:module:: {{ fullname }} 5 | 6 | {{ name | underline }} 7 | 8 | .. automodule:: {{fullname}} 9 | :noindex: 10 | 11 | .. currentmodule:: {{ fullname }} 12 | 13 | {%- for func in functions %} 14 | 15 | {{ func | underline("-") }} 16 | 17 | .. autofunction:: {{ func }} 18 | 19 | {%- endfor %} 20 | -------------------------------------------------------------------------------- /docs/source/utils.rst: -------------------------------------------------------------------------------- 1 | .. py:module:: torchaudio.utils 2 | 3 | torchaudio.utils 4 | ================ 5 | 6 | The ``torchaudio.utils`` module contains utility functions to configure the global state of third-party libraries. 7 | 8 | .. currentmodule:: torchaudio.utils 9 | 10 | .. autosummary:: 11 | :toctree: generated 12 | :nosignatures: 13 | :template: autosummary/utils.rst 14 | 15 | sox_utils 16 | ffmpeg_utils 17 | -------------------------------------------------------------------------------- /examples/asr/librispeech_conformer_rnnt_biasing/score.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | if [ $# -ne 1 ]; then 4 | echo "Usage: $0 <decode-dir>" 5 | exit 1 6 | fi 7 | 8 | dir=$1 # the path to the decoding dir, e.g. 
experiments/librispeech_clean100_suffix600_tcpgen500_sche30_nodrop/decode_test_clean_b10_KB1000/ 9 | sclite -r "${dir}/ref.trn.txt" trn -h "${dir}/hyp.trn.txt" trn -i rm -o all stdout > "${dir}/result.wrd.txt" 10 | -------------------------------------------------------------------------------- /src/torchaudio/models/squim/__init__.py: -------------------------------------------------------------------------------- 1 | from .objective import squim_objective_base, squim_objective_model, SquimObjective 2 | from .subjective import squim_subjective_base, squim_subjective_model, SquimSubjective 3 | 4 | __all__ = [ 5 | "squim_objective_base", 6 | "squim_objective_model", 7 | "squim_subjective_base", 8 | "squim_subjective_model", 9 | "SquimObjective", 10 | "SquimSubjective", 11 | ] 12 | -------------------------------------------------------------------------------- /CODEOWNERS: -------------------------------------------------------------------------------- 1 | # Fallback 2 | * @pytorch/team-audio-core 3 | 4 | /examples/avsr @mpc001 5 | /examples/asr @hwangjeff 6 | /examples/self_supervised_learning @nateanl 7 | /examples/dnn_beamformer @nateanl 8 | /examples/hubert @nateanl 9 | /examples/tutorials @mthrok 10 | /torchaudio @mthrok 11 | -------------------------------------------------------------------------------- /docs/source/io.rst: -------------------------------------------------------------------------------- 1 | .. py:module:: torchaudio.io 2 | 3 | torchaudio.io 4 | ============= 5 | 6 | .. currentmodule:: torchaudio.io 7 | 8 | .. autosummary:: 9 | :toctree: generated 10 | :nosignatures: 11 | :template: autosummary/io_class.rst 12 | 13 | StreamReader 14 | StreamWriter 15 | AudioEffector 16 | play_audio 17 | 18 | .. rubric:: Tutorials using ``torchaudio.io`` 19 | 20 | .. minigallery:: torchaudio.io 21 | -------------------------------------------------------------------------------- /docs/source/torio.io.rst: -------------------------------------------------------------------------------- 1 | .. py:module:: torio.io 2 | 3 | torio.io 4 | ======== 5 | 6 | .. currentmodule:: torio.io 7 | 8 | .. autosummary:: 9 | :toctree: generated 10 | :nosignatures: 11 | :template: autosummary/torio_io_class.rst 12 | 13 | StreamingMediaDecoder 14 | StreamingMediaEncoder 15 | 16 | .. rubric:: Tutorials using ``torio.io`` 17 | 18 | .. minigallery:: torio.io 19 | 20 | .. 
minigallery:: torchaudio.io 21 | -------------------------------------------------------------------------------- /examples/libtorchaudio/speech_recognition/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(transcribe transcribe.cpp) 2 | add_executable(transcribe_list transcribe_list.cpp) 3 | target_link_libraries(transcribe "${TORCH_LIBRARIES}" "${TORCHAUDIO_LIBRARY}") 4 | target_link_libraries(transcribe_list "${TORCH_LIBRARIES}" "${TORCHAUDIO_LIBRARY}") 5 | set_property(TARGET transcribe PROPERTY CXX_STANDARD 14) 6 | set_property(TARGET transcribe_list PROPERTY CXX_STANDARD 14) 7 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/models/hdemucs/hdemucs_gpu_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchaudio_unittest.common_utils import PytorchTestCase, skipIfNoCuda 3 | from torchaudio_unittest.models.hdemucs.hdemucs_test_impl import CompareHDemucsOriginal, HDemucsTests 4 | 5 | 6 | @skipIfNoCuda 7 | class HDemucsFloat32GPUTest(HDemucsTests, CompareHDemucsOriginal, PytorchTestCase): 8 | dtype = torch.float32 9 | device = torch.device("cuda") 10 | -------------------------------------------------------------------------------- /examples/libtorchaudio/data/README.md: -------------------------------------------------------------------------------- 1 | The files in this directory originate from the [VOiCES](https://iqtlabs.github.io/voices/) dataset, which is licensed under Creative Commons BY 4.0. They have been modified to fit the tutorial. 2 | 3 | * `input.wav`: `VOiCES_devkit/source-16k/train/sp0307/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav` 4 | 5 | * `rir.wav`: `VOiCES_devkit/distant-16k/room-response/rm1/impulse/Lab41-SRI-VOiCES-rm1-impulse-mc01-stu-clo.wav` 6 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/transforms/autograd_cuda_test.py: -------------------------------------------------------------------------------- 1 | from torchaudio_unittest.common_utils import PytorchTestCase, skipIfNoCuda 2 | 3 | from .autograd_test_impl import AutogradTestFloat32, AutogradTestMixin 4 | 5 | 6 | @skipIfNoCuda 7 | class AutogradCUDATest(AutogradTestMixin, PytorchTestCase): 8 | device = "cuda" 9 | 10 | 11 | @skipIfNoCuda 12 | class AutogradRNNTCUDATest(AutogradTestFloat32, PytorchTestCase): 13 | device = "cuda" 14 | -------------------------------------------------------------------------------- /.github/scripts/ffmpeg/build.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | 3 | set PROJ_FOLDER=%cd% 4 | 5 | choco install -y --no-progress msys2 --package-parameters "/NoUpdate" 6 | C:\tools\msys64\usr\bin\env MSYSTEM=MINGW64 /bin/bash -l -c "pacman -S --noconfirm --needed base-devel mingw-w64-x86_64-toolchain diffutils" 7 | C:\tools\msys64\usr\bin\env MSYSTEM=MINGW64 /bin/bash -l -c "cd ${PROJ_FOLDER} && packaging/vc_env_helper.bat bash .github/scripts/ffmpeg/build.sh" 8 | 9 | :end 10 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/functional/kaldi_compatibility_cpu_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchaudio_unittest.common_utils import PytorchTestCase 3 | 4 | from .kaldi_compatibility_test_impl import Kaldi 5 | 6 | 7 | class 
TestKaldiFloat32(Kaldi, PytorchTestCase): 8 | dtype = torch.float32 9 | device = torch.device("cpu") 10 | 11 | 12 | class TestKaldiFloat64(Kaldi, PytorchTestCase): 13 | dtype = torch.float64 14 | device = torch.device("cpu") 15 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/functional/librosa_compatibility_cuda_test.py: -------------------------------------------------------------------------------- 1 | from torchaudio_unittest.common_utils import PytorchTestCase, skipIfNoCuda 2 | 3 | from .librosa_compatibility_test_impl import Functional, FunctionalComplex 4 | 5 | 6 | @skipIfNoCuda 7 | class TestFunctionalCUDA(Functional, PytorchTestCase): 8 | device = "cuda" 9 | 10 | 11 | @skipIfNoCuda 12 | class TestFunctionalComplexCUDA(FunctionalComplex, PytorchTestCase): 13 | device = "cuda" 14 | -------------------------------------------------------------------------------- /src/libtorio/ffmpeg/stream_writer/packet_writer.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include <libtorio/ffmpeg/ffmpeg.h> 3 | 4 | namespace torio::io { 5 | class PacketWriter { 6 | AVFormatContext* format_ctx; 7 | AVStream* stream; 8 | AVRational original_time_base; 9 | 10 | public: 11 | PacketWriter( 12 | AVFormatContext* format_ctx_, 13 | const StreamParams& stream_params_); 14 | void write_packet(const AVPacketPtr& packet); 15 | }; 16 | } // namespace torio::io 17 | -------------------------------------------------------------------------------- /src/libtorchaudio/sox/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set( 2 | sources 3 | io.cpp 4 | utils.cpp 5 | effects.cpp 6 | effects_chain.cpp 7 | types.cpp 8 | ) 9 | torchaudio_library( 10 | libtorchaudio_sox 11 | "${sources}" 12 | "" 13 | "torch;sox" 14 | "" 15 | ) 16 | 17 | if (BUILD_TORCHAUDIO_PYTHON_EXTENSION) 18 | torchaudio_extension( 19 | _torchaudio_sox 20 | "pybind/pybind.cpp;" 21 | "" 22 | "libtorchaudio_sox" 23 | "" 24 | ) 25 | endif() 26 | -------------------------------------------------------------------------------- /src/torchaudio/prototype/pipelines/__init__.py: -------------------------------------------------------------------------------- 1 | from ._vggish import VGGISH, VGGishBundle 2 | from .hifigan_pipeline import HIFIGAN_VOCODER_V3_LJSPEECH, HiFiGANVocoderBundle 3 | from .rnnt_pipeline import EMFORMER_RNNT_BASE_MUSTC, EMFORMER_RNNT_BASE_TEDLIUM3 4 | 5 | __all__ = [ 6 | "EMFORMER_RNNT_BASE_MUSTC", 7 | "EMFORMER_RNNT_BASE_TEDLIUM3", 8 | "HIFIGAN_VOCODER_V3_LJSPEECH", 9 | "HiFiGANVocoderBundle", 10 | "VGGISH", 11 | "VGGishBundle", 12 | ] 13 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/compliance/kaldi/kaldi_compatibility_cpu_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchaudio_unittest import common_utils 3 | 4 | from .kaldi_compatibility_impl import Kaldi 5 | 6 | 7 | class TestKaldiFloat32(Kaldi, common_utils.PytorchTestCase): 8 | dtype = torch.float32 9 | device = torch.device("cpu") 10 | 11 | 12 | class TestKaldiFloat64(Kaldi, common_utils.PytorchTestCase): 13 | dtype = torch.float64 14 | device = torch.device("cpu") 15 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/transforms/transforms_cpu_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 
from torchaudio_unittest.common_utils import PytorchTestCase 3 | 4 | from .transforms_test_impl import TransformsTestBase 5 | 6 | 7 | class TransformsCPUFloat32Test(TransformsTestBase, PytorchTestCase): 8 | device = "cpu" 9 | dtype = torch.float32 10 | 11 | 12 | class TransformsCPUFloat64Test(TransformsTestBase, PytorchTestCase): 13 | device = "cpu" 14 | dtype = torch.float64 15 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/models/rnnt/rnnt_cpu_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchaudio_unittest.common_utils import PytorchTestCase 3 | from torchaudio_unittest.models.rnnt.rnnt_test_impl import RNNTTestImpl 4 | 5 | 6 | class RNNTFloat32CPUTest(RNNTTestImpl, PytorchTestCase): 7 | dtype = torch.float32 8 | device = torch.device("cpu") 9 | 10 | 11 | class RNNTFloat64CPUTest(RNNTTestImpl, PytorchTestCase): 12 | dtype = torch.float64 13 | device = torch.device("cpu") 14 | -------------------------------------------------------------------------------- /third_party/LICENSES_BUNDLED.txt: -------------------------------------------------------------------------------- 1 | The Torchaudio repository and source distributions bundle several libraries that are 2 | compatibly licensed. We list some here. 3 | 4 | Name: cuctc 5 | License: BSD-2-Clause (Files without specific notes) 6 | BSD-3-Clause File: 7 | torchaudio/csrc/cuctc/src/ctc_fast_divmod.cuh, 8 | Apache 2.0 Files: 9 | torchaudio/csrc/cuctc/src/bitonic_topk 10 | For details, see: cuctc/LICENSE, 11 | torchaudio/csrc/cuctc/src/bitonic_topk/LICENSE 12 | -------------------------------------------------------------------------------- /examples/libtorchaudio/build.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -eux 4 | 5 | this_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" 6 | build_dir="${this_dir}/build" 7 | 8 | mkdir -p "${build_dir}" 9 | cd "${build_dir}" 10 | 11 | git submodule update 12 | cmake -GNinja \ 13 | -DCMAKE_PREFIX_PATH="$(python -c 'import torch;print(torch.utils.cmake_prefix_path)')" \ 14 | -DBUILD_SOX=ON \ 15 | -DBUILD_KALDI=OFF \ 16 | .. 17 | cmake --build . 
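For context on the `-DCMAKE_PREFIX_PATH` flag used in the build script above: the inline `python -c` invocation resolves to PyTorch's bundled CMake package directory. A minimal sketch of what it prints (the exact path varies per installation):

```python
# Sketch: the value the build script passes to -DCMAKE_PREFIX_PATH.
import torch

# Typically resolves to <site-packages>/torch/share/cmake, which contains
# TorchConfig.cmake so that CMake's find_package(Torch) succeeds.
print(torch.utils.cmake_prefix_path)
```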
18 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/functional/autograd_cpu_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchaudio_unittest import common_utils 3 | 4 | from .autograd_impl import Autograd, AutogradFloat32 5 | 6 | 7 | class TestAutogradLfilterCPU(Autograd, common_utils.PytorchTestCase): 8 | dtype = torch.float64 9 | device = torch.device("cpu") 10 | 11 | 12 | class TestAutogradRNNTCPU(AutogradFloat32, common_utils.PytorchTestCase): 13 | dtype = torch.float32 14 | device = torch.device("cpu") 15 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/prototype/hifi_gan/hifi_gan_cpu_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchaudio_unittest.common_utils import PytorchTestCase 3 | 4 | from .hifi_gan_test_impl import HiFiGANTestImpl 5 | 6 | 7 | class HiFiGANFloat32CPUTest(HiFiGANTestImpl, PytorchTestCase): 8 | dtype = torch.float32 9 | device = torch.device("cpu") 10 | 11 | 12 | class HiFiGANFloat64CPUTest(HiFiGANTestImpl, PytorchTestCase): 13 | dtype = torch.float64 14 | device = torch.device("cpu") 15 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/backend/sox_io/common.py: -------------------------------------------------------------------------------- 1 | def name_func(func, _, params): 2 | return f'{func.__name__}_{"_".join(str(arg) for arg in params.args)}' 3 | 4 | 5 | def get_enc_params(dtype): 6 | if dtype == "float32": 7 | return "PCM_F", 32 8 | if dtype == "int32": 9 | return "PCM_S", 32 10 | if dtype == "int16": 11 | return "PCM_S", 16 12 | if dtype == "uint8": 13 | return "PCM_U", 8 14 | raise ValueError(f"Unexpected dtype: {dtype}") 15 | -------------------------------------------------------------------------------- /src/libtorchaudio/rnnt/types.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | namespace torchaudio { 4 | namespace rnnt { 5 | 6 | enum status_t { 7 | SUCCESS = 0, 8 | FAILURE = 1, 9 | COMPUTE_DENOMINATOR_REDUCE_MAX_FAILED = 2, 10 | COMPUTE_DENOMINATOR_REDUCE_SUM_FAILED = 3, 11 | COMPUTE_LOG_PROBS_FAILED = 4, 12 | COMPUTE_ALPHAS_BETAS_COSTS_FAILED = 5, 13 | COMPUTE_GRADIENTS_FAILED = 6 14 | }; 15 | 16 | enum device_t { UNDEFINED = 0, CPU = 1, GPU = 2 }; 17 | 18 | } // namespace rnnt 19 | } // namespace torchaudio 20 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/backend/dispatcher/sox/common.py: -------------------------------------------------------------------------------- 1 | def name_func(func, _, params): 2 | return f'{func.__name__}_{"_".join(str(arg) for arg in params.args)}' 3 | 4 | 5 | def get_enc_params(dtype): 6 | if dtype == "float32": 7 | return "PCM_F", 32 8 | if dtype == "int32": 9 | return "PCM_S", 32 10 | if dtype == "int16": 11 | return "PCM_S", 16 12 | if dtype == "uint8": 13 | return "PCM_U", 8 14 | raise ValueError(f"Unexpected dtype: {dtype}") 15 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/prototype/transforms/transforms_cpu_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchaudio_unittest.common_utils import PytorchTestCase 3 | 4 | from .transforms_test_impl 
import TransformsTestImpl 5 | 6 | 7 | class TransformsFloat32CPUTest(TransformsTestImpl, PytorchTestCase): 8 | dtype = torch.float32 9 | device = torch.device("cpu") 10 | 11 | 12 | class TransformsFloat64CPUTest(TransformsTestImpl, PytorchTestCase): 13 | dtype = torch.float64 14 | device = torch.device("cpu") 15 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/functional/kaldi_compatibility_cuda_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchaudio_unittest.common_utils import PytorchTestCase, skipIfNoCuda 3 | 4 | from .kaldi_compatibility_test_impl import Kaldi 5 | 6 | 7 | @skipIfNoCuda 8 | class TestKaldiFloat32(Kaldi, PytorchTestCase): 9 | dtype = torch.float32 10 | device = torch.device("cuda") 11 | 12 | 13 | @skipIfNoCuda 14 | class TestKaldiFloat64(Kaldi, PytorchTestCase): 15 | dtype = torch.float64 16 | device = torch.device("cuda") 17 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/models/emformer/emformer_cpu_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchaudio_unittest.common_utils import PytorchTestCase 3 | from torchaudio_unittest.models.emformer.emformer_test_impl import EmformerTestImpl 4 | 5 | 6 | class EmformerFloat32CPUTest(EmformerTestImpl, PytorchTestCase): 7 | dtype = torch.float32 8 | device = torch.device("cpu") 9 | 10 | 11 | class EmformerFloat64CPUTest(EmformerTestImpl, PytorchTestCase): 12 | dtype = torch.float64 13 | device = torch.device("cpu") 14 | -------------------------------------------------------------------------------- /src/torchaudio/backend/common.py: -------------------------------------------------------------------------------- 1 | def __getattr__(name: str): 2 | if name == "AudioMetaData": 3 | import warnings 4 | 5 | warnings.warn( 6 | "`torchaudio.backend.common.AudioMetaData` has been moved to " 7 | "`torchaudio.AudioMetaData`. 
Please update the import path.", 8 | stacklevel=2, 9 | ) 10 | from torchaudio import AudioMetaData 11 | 12 | return AudioMetaData 13 | raise AttributeError(f"module {__name__!r} has no attribute {name!r}") 14 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/prototype/rnnt_cpu_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchaudio_unittest.common_utils import PytorchTestCase 3 | from torchaudio_unittest.prototype.rnnt_test_impl import ConformerRNNTTestImpl 4 | 5 | 6 | class ConformerRNNTFloat32CPUTest(ConformerRNNTTestImpl, PytorchTestCase): 7 | dtype = torch.float32 8 | device = torch.device("cpu") 9 | 10 | 11 | class ConformerRNNTFloat64CPUTest(ConformerRNNTTestImpl, PytorchTestCase): 12 | dtype = torch.float64 13 | device = torch.device("cpu") 14 | -------------------------------------------------------------------------------- /src/torchaudio/pipelines/_tts/__init__.py: -------------------------------------------------------------------------------- 1 | from .impl import ( 2 | TACOTRON2_GRIFFINLIM_CHAR_LJSPEECH, 3 | TACOTRON2_GRIFFINLIM_PHONE_LJSPEECH, 4 | TACOTRON2_WAVERNN_CHAR_LJSPEECH, 5 | TACOTRON2_WAVERNN_PHONE_LJSPEECH, 6 | ) 7 | from .interface import Tacotron2TTSBundle 8 | 9 | 10 | __all__ = [ 11 | "Tacotron2TTSBundle", 12 | "TACOTRON2_GRIFFINLIM_CHAR_LJSPEECH", 13 | "TACOTRON2_GRIFFINLIM_PHONE_LJSPEECH", 14 | "TACOTRON2_WAVERNN_CHAR_LJSPEECH", 15 | "TACOTRON2_WAVERNN_PHONE_LJSPEECH", 16 | ] 17 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/models/conformer/conformer_cpu_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchaudio_unittest.common_utils import PytorchTestCase 3 | from torchaudio_unittest.models.conformer.conformer_test_impl import ConformerTestImpl 4 | 5 | 6 | class ConformerFloat32CPUTest(ConformerTestImpl, PytorchTestCase): 7 | dtype = torch.float32 8 | device = torch.device("cpu") 9 | 10 | 11 | class ConformerFloat64CPUTest(ConformerTestImpl, PytorchTestCase): 12 | dtype = torch.float64 13 | device = torch.device("cpu") 14 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/transforms/transforms_cuda_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchaudio_unittest.common_utils import PytorchTestCase, skipIfNoCuda 3 | 4 | from .transforms_test_impl import TransformsTestBase 5 | 6 | 7 | @skipIfNoCuda 8 | class TransformsCUDAFloat32Test(TransformsTestBase, PytorchTestCase): 9 | device = "cuda" 10 | dtype = torch.float32 11 | 12 | 13 | @skipIfNoCuda 14 | class TransformsCUDAFloat64Test(TransformsTestBase, PytorchTestCase): 15 | device = "cuda" 16 | dtype = torch.float64 17 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/compliance/kaldi/kaldi_compatibility_cuda_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchaudio_unittest import common_utils 3 | 4 | from .kaldi_compatibility_impl import Kaldi 5 | 6 | 7 | @common_utils.skipIfNoCuda 8 | class TestKaldiFloat32(Kaldi, common_utils.PytorchTestCase): 9 | dtype = torch.float32 10 | device = torch.device("cuda") 11 | 12 | 13 | @common_utils.skipIfNoCuda 14 | class TestKaldiFloat64(Kaldi, 
common_utils.PytorchTestCase): 15 | dtype = torch.float64 16 | device = torch.device("cuda") 17 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/functional/torchscript_consistency_cpu_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchaudio_unittest.common_utils import PytorchTestCase 3 | 4 | from .torchscript_consistency_impl import Functional, FunctionalFloat32Only 5 | 6 | 7 | class TestFunctionalFloat32(Functional, FunctionalFloat32Only, PytorchTestCase): 8 | dtype = torch.float32 9 | device = torch.device("cpu") 10 | 11 | 12 | class TestFunctionalFloat64(Functional, PytorchTestCase): 13 | dtype = torch.float64 14 | device = torch.device("cpu") 15 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/models/rnnt/rnnt_gpu_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchaudio_unittest.common_utils import PytorchTestCase, skipIfNoCuda 3 | from torchaudio_unittest.models.rnnt.rnnt_test_impl import RNNTTestImpl 4 | 5 | 6 | @skipIfNoCuda 7 | class RNNTFloat32GPUTest(RNNTTestImpl, PytorchTestCase): 8 | dtype = torch.float32 9 | device = torch.device("cuda") 10 | 11 | 12 | @skipIfNoCuda 13 | class RNNTFloat64GPUTest(RNNTTestImpl, PytorchTestCase): 14 | dtype = torch.float64 15 | device = torch.device("cuda") 16 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/prototype/conv_emformer_cpu_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchaudio_unittest.common_utils import PytorchTestCase 3 | from torchaudio_unittest.prototype.conv_emformer_test_impl import ConvEmformerTestImpl 4 | 5 | 6 | class ConvEmformerFloat32CPUTest(ConvEmformerTestImpl, PytorchTestCase): 7 | dtype = torch.float32 8 | device = torch.device("cpu") 9 | 10 | 11 | class ConvEmformerFloat64CPUTest(ConvEmformerTestImpl, PytorchTestCase): 12 | dtype = torch.float64 13 | device = torch.device("cpu") 14 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/transforms/torchscript_consistency_cpu_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchaudio_unittest.common_utils import PytorchTestCase 3 | 4 | from .torchscript_consistency_impl import Transforms, TransformsFloat32Only 5 | 6 | 7 | class TestTransformsFloat32(Transforms, TransformsFloat32Only, PytorchTestCase): 8 | dtype = torch.float32 9 | device = torch.device("cpu") 10 | 11 | 12 | class TestTransformsFloat64(Transforms, PytorchTestCase): 13 | dtype = torch.float64 14 | device = torch.device("cpu") 15 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/prototype/hifi_gan/hifi_gan_gpu_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchaudio_unittest.common_utils import PytorchTestCase, skipIfNoCuda 3 | 4 | from .hifi_gan_test_impl import HiFiGANTestImpl 5 | 6 | 7 | @skipIfNoCuda 8 | class HiFiGANFloat32GPUTest(HiFiGANTestImpl, PytorchTestCase): 9 | dtype = torch.float32 10 | device = torch.device("cuda") 11 | 12 | 13 | @skipIfNoCuda 14 | class HiFiGANFloat64GPUTest(HiFiGANTestImpl, PytorchTestCase): 15 | dtype = torch.float64 16 | device = 
torch.device("cuda") 17 | -------------------------------------------------------------------------------- /examples/asr/librispeech_conformer_rnnt_biasing/blists/README.md: -------------------------------------------------------------------------------- 1 | This is the default directory where rare word list files should be found. 2 | 3 | To train or evaluate a model, please download the following files, and save them here. 4 | 5 | - [`rareword_f15.txt`](https://download.pytorch.org/torchaudio/pipeline-assets/tcpgen/rareword_f15.txt) 6 | - [`rareword_f30.txt`](https://download.pytorch.org/torchaudio/pipeline-assets/tcpgen/rareword_f30.txt) 7 | - [`all_rare_words.txt`](https://download.pytorch.org/torchaudio/pipeline-assets/tcpgen/all_rare_words.txt) 8 | -------------------------------------------------------------------------------- /examples/pipeline_wav2letter/ctc_decoders.py: -------------------------------------------------------------------------------- 1 | from torch import topk 2 | 3 | 4 | class GreedyDecoder: 5 | def __call__(self, outputs): 6 | """Greedy Decoder. Returns highest probability of class labels for each timestep 7 | 8 | Args: 9 | outputs (torch.Tensor): shape (input length, batch size, number of classes (including blank)) 10 | 11 | Returns: 12 | torch.Tensor: class labels per time step. 13 | """ 14 | _, indices = topk(outputs, k=1, dim=-1) 15 | return indices[..., 0] 16 | -------------------------------------------------------------------------------- /src/torchaudio/backend/no_backend.py: -------------------------------------------------------------------------------- 1 | def __getattr__(name: str): 2 | import warnings 3 | 4 | warnings.warn( 5 | "Torchaudio's I/O functions now support par-call bakcend dispatch. " 6 | "Importing backend implementation directly is no longer guaranteed to work. " 7 | "Please use `backend` keyword with load/save/info function, instead of " 8 | "calling the udnerlying implementation directly.", 9 | stacklevel=2, 10 | ) 11 | 12 | from . 
import _no_backend 13 | 14 | return getattr(_no_backend, name) 15 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/functional/autograd_cuda_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchaudio_unittest import common_utils 3 | 4 | from .autograd_impl import Autograd, AutogradFloat32 5 | 6 | 7 | @common_utils.skipIfNoCuda 8 | class TestAutogradLfilterCUDA(Autograd, common_utils.PytorchTestCase): 9 | dtype = torch.float64 10 | device = torch.device("cuda") 11 | 12 | 13 | @common_utils.skipIfNoCuda 14 | class TestAutogradRNNTCUDA(AutogradFloat32, common_utils.PytorchTestCase): 15 | dtype = torch.float32 16 | device = torch.device("cuda") 17 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/models/rnnt_decoder/rnnt_decoder_cpu_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchaudio_unittest.common_utils import PytorchTestCase 3 | from torchaudio_unittest.models.rnnt_decoder.rnnt_decoder_test_impl import RNNTBeamSearchTestImpl 4 | 5 | 6 | class RNNTBeamSearchFloat32CPUTest(RNNTBeamSearchTestImpl, PytorchTestCase): 7 | dtype = torch.float32 8 | device = torch.device("cpu") 9 | 10 | 11 | class RNNTBeamSearchFloat64CPUTest(RNNTBeamSearchTestImpl, PytorchTestCase): 12 | dtype = torch.float64 13 | device = torch.device("cpu") 14 | -------------------------------------------------------------------------------- /.github/scripts/unittest-windows/run_test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -euxo pipefail 4 | 5 | eval "$(./conda/Scripts/conda.exe 'shell.bash' 'hook')" 6 | conda activate ./env 7 | 8 | this_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" 9 | source "$this_dir/set_cuda_envs.sh" 10 | 11 | python -m torch.utils.collect_env 12 | env | grep TORCHAUDIO || true 13 | 14 | cd test 15 | pytest --continue-on-collection-errors --cov=torchaudio --junitxml=${RUNNER_TEST_RESULTS_DIR}/junit.xml -v --durations 20 torchaudio_unittest 16 | coverage html 17 | -------------------------------------------------------------------------------- /examples/hubert/dataset/__init__.py: -------------------------------------------------------------------------------- 1 | from .hubert_dataset import ( 2 | _get_lengths_librilightlimited, 3 | _get_lengths_librispeech, 4 | BucketizeBatchSampler, 5 | CollateFnHubert, 6 | CollateFnLibriLightLimited, 7 | DistributedBatchSampler, 8 | HuBERTDataSet, 9 | ) 10 | 11 | 12 | __all__ = [ 13 | "_get_lengths_librilightlimited", 14 | "_get_lengths_librispeech", 15 | "BucketizeBatchSampler", 16 | "CollateFnHubert", 17 | "CollateFnLibriLightLimited", 18 | "DistributedBatchSampler", 19 | "HuBERTDataSet", 20 | ] 21 | -------------------------------------------------------------------------------- /src/torchaudio/backend/sox_io_backend.py: -------------------------------------------------------------------------------- 1 | def __getattr__(name: str): 2 | import warnings 3 | 4 | warnings.warn( 5 | "Torchaudio's I/O functions now support per-call backend dispatch. " 6 | "Importing backend implementation directly is no longer guaranteed to work. " 7 | "Please use the `backend` keyword with the load/save/info functions, instead of " 8 | "calling the underlying implementation directly.", 9 | stacklevel=2, 10 | ) 11 | 12 | from . 
import _sox_io_backend 13 | 14 | return getattr(_sox_io_backend, name) 15 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/prototype/transforms/transforms_cuda_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchaudio_unittest.common_utils import PytorchTestCase, skipIfNoCuda 3 | 4 | from .transforms_test_impl import TransformsTestImpl 5 | 6 | 7 | @skipIfNoCuda 8 | class TransformsFloat32CUDATest(TransformsTestImpl, PytorchTestCase): 9 | dtype = torch.float32 10 | device = torch.device("cuda") 11 | 12 | 13 | @skipIfNoCuda 14 | class TransformsFloat64CUDATest(TransformsTestImpl, PytorchTestCase): 15 | dtype = torch.float64 16 | device = torch.device("cuda") 17 | -------------------------------------------------------------------------------- /src/libtorio/ffmpeg/stream_writer/types.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | namespace torio::io { 3 | 4 | struct CodecConfig { 5 | int bit_rate = -1; 6 | int compression_level = -1; 7 | 8 | // qscale corresponds to ffmpeg CLI's qscale. 9 | // Example: MP3 10 | // https://trac.ffmpeg.org/wiki/Encode/MP3 11 | // This should be set like 12 | // https://github.com/FFmpeg/FFmpeg/blob/n4.3.2/fftools/ffmpeg_opt.c#L1550 13 | const c10::optional<int> qscale = -1; 14 | 15 | // video 16 | int gop_size = -1; 17 | int max_b_frames = -1; 18 | }; 19 | } // namespace torio::io 20 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/models/emformer/emformer_gpu_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchaudio_unittest.common_utils import PytorchTestCase, skipIfNoCuda 3 | from torchaudio_unittest.models.emformer.emformer_test_impl import EmformerTestImpl 4 | 5 | 6 | @skipIfNoCuda 7 | class EmformerFloat32GPUTest(EmformerTestImpl, PytorchTestCase): 8 | dtype = torch.float32 9 | device = torch.device("cuda") 10 | 11 | 12 | @skipIfNoCuda 13 | class EmformerFloat64GPUTest(EmformerTestImpl, PytorchTestCase): 14 | dtype = torch.float64 15 | device = torch.device("cuda") 16 | -------------------------------------------------------------------------------- /.github/workflows/bandit.yml: -------------------------------------------------------------------------------- 1 | # GitHub Actions Bandit Workflow 2 | 3 | name: Bandit 4 | 5 | on: 6 | pull_request: 7 | branches: [ main ] 8 | 9 | workflow_dispatch: 10 | 11 | jobs: 12 | build: 13 | runs-on: ubuntu-latest 14 | 15 | steps: 16 | - uses: actions/checkout@v2 17 | 18 | # Task will fail if any high-severity issues are found 19 | # Ignoring submodules 20 | - name: Run Bandit Security Analysis 21 | run: | 22 | python -m pip install bandit 23 | python -m bandit -r . 
-x ./third_party -lll 24 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/prototype/rnnt_gpu_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchaudio_unittest.common_utils import PytorchTestCase, skipIfNoCuda 3 | from torchaudio_unittest.prototype.rnnt_test_impl import ConformerRNNTTestImpl 4 | 5 | 6 | @skipIfNoCuda 7 | class ConformerRNNTFloat32GPUTest(ConformerRNNTTestImpl, PytorchTestCase): 8 | dtype = torch.float32 9 | device = torch.device("cuda") 10 | 11 | 12 | @skipIfNoCuda 13 | class ConformerRNNTFloat64GPUTest(ConformerRNNTTestImpl, PytorchTestCase): 14 | dtype = torch.float64 15 | device = torch.device("cuda") 16 | -------------------------------------------------------------------------------- /docs/source/compliance.kaldi.rst: -------------------------------------------------------------------------------- 1 | .. py:module:: torchaudio.compliance.kaldi 2 | 3 | torchaudio.compliance.kaldi 4 | =========================== 5 | 6 | .. currentmodule:: torchaudio.compliance.kaldi 7 | 8 | The useful processing operations of kaldi_ can be performed with torchaudio. 9 | Various functions with identical parameters are given so that torchaudio can 10 | produce similar outputs. 11 | 12 | .. _kaldi: https://github.com/kaldi-asr/kaldi 13 | 14 | .. autosummary:: 15 | :toctree: generated 16 | :nosignatures: 17 | 18 | spectrogram 19 | fbank 20 | mfcc 21 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/models/conformer/conformer_gpu_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchaudio_unittest.common_utils import PytorchTestCase, skipIfNoCuda 3 | from torchaudio_unittest.models.conformer.conformer_test_impl import ConformerTestImpl 4 | 5 | 6 | @skipIfNoCuda 7 | class ConformerFloat32GPUTest(ConformerTestImpl, PytorchTestCase): 8 | dtype = torch.float32 9 | device = torch.device("cuda") 10 | 11 | 12 | @skipIfNoCuda 13 | class ConformerFloat64GPUTest(ConformerTestImpl, PytorchTestCase): 14 | dtype = torch.float64 15 | device = torch.device("cuda") 16 | -------------------------------------------------------------------------------- /src/torchaudio/backend/soundfile_backend.py: -------------------------------------------------------------------------------- 1 | def __getattr__(name: str): 2 | import warnings 3 | 4 | warnings.warn( 5 | "Torchaudio's I/O functions now support per-call backend dispatch. " 6 | "Importing backend implementation directly is no longer guaranteed to work. 
" 7 | "Please use `backend` keyword with load/save/info function, instead of " 8 | "calling the udnerlying implementation directly.", 9 | stacklevel=2, 10 | ) 11 | 12 | from torchaudio._backend import soundfile_backend 13 | 14 | return getattr(soundfile_backend, name) 15 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/functional/torchscript_consistency_cuda_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchaudio_unittest.common_utils import PytorchTestCase, skipIfNoCuda 3 | 4 | from .torchscript_consistency_impl import Functional, FunctionalFloat32Only 5 | 6 | 7 | @skipIfNoCuda 8 | class TestFunctionalFloat32(Functional, FunctionalFloat32Only, PytorchTestCase): 9 | dtype = torch.float32 10 | device = torch.device("cuda") 11 | 12 | 13 | @skipIfNoCuda 14 | class TestFunctionalFloat64(Functional, PytorchTestCase): 15 | dtype = torch.float64 16 | device = torch.device("cuda") 17 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/prototype/conv_emformer_gpu_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchaudio_unittest.common_utils import PytorchTestCase, skipIfNoCuda 3 | from torchaudio_unittest.prototype.conv_emformer_test_impl import ConvEmformerTestImpl 4 | 5 | 6 | @skipIfNoCuda 7 | class ConvEmformerFloat32GPUTest(ConvEmformerTestImpl, PytorchTestCase): 8 | dtype = torch.float32 9 | device = torch.device("cuda") 10 | 11 | 12 | @skipIfNoCuda 13 | class ConvEmformerFloat64GPUTest(ConvEmformerTestImpl, PytorchTestCase): 14 | dtype = torch.float64 15 | device = torch.device("cuda") 16 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/transforms/torchscript_consistency_cuda_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchaudio_unittest.common_utils import PytorchTestCase, skipIfNoCuda 3 | 4 | from .torchscript_consistency_impl import Transforms, TransformsFloat32Only 5 | 6 | 7 | @skipIfNoCuda 8 | class TestTransformsFloat32(Transforms, TransformsFloat32Only, PytorchTestCase): 9 | dtype = torch.float32 10 | device = torch.device("cuda") 11 | 12 | 13 | @skipIfNoCuda 14 | class TestTransformsFloat64(Transforms, PytorchTestCase): 15 | dtype = torch.float64 16 | device = torch.device("cuda") 17 | -------------------------------------------------------------------------------- /packaging/torchaudio/build.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -ex 3 | 4 | torch_cuda_version=$(python -c "import torch; print(torch.version.cuda)") 5 | echo "torch.cuda.version is $torch_cuda_version" 6 | 7 | echo USE_CUDA is "$USE_CUDA" 8 | 9 | shopt -s nocasematch 10 | if [ "${USE_CUDA}" == "1" ] ; then 11 | if [ "$torch_cuda_version" == "None" ]; then 12 | echo "We want to build torch auido with cuda but the installed pytorch isn't with cuda" 13 | exit 1 14 | fi 15 | fi 16 | shopt -u nocasematch 17 | python setup.py install --single-version-externally-managed --record=record.txt 18 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/models/rnnt_decoder/rnnt_decoder_gpu_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from 
torchaudio_unittest.common_utils import PytorchTestCase, skipIfNoCuda 3 | from torchaudio_unittest.models.rnnt_decoder.rnnt_decoder_test_impl import RNNTBeamSearchTestImpl 4 | 5 | 6 | @skipIfNoCuda 7 | class RNNTBeamSearchFloat32GPUTest(RNNTBeamSearchTestImpl, PytorchTestCase): 8 | dtype = torch.float32 9 | device = torch.device("cuda") 10 | 11 | 12 | @skipIfNoCuda 13 | class RNNTBeamSearchFloat64GPUTest(RNNTBeamSearchTestImpl, PytorchTestCase): 14 | dtype = torch.float64 15 | device = torch.device("cuda") 16 | -------------------------------------------------------------------------------- /src/libtorchaudio/utils.cpp: -------------------------------------------------------------------------------- 1 | #include <libtorchaudio/utils.h> 2 | #include <torch/script.h> 3 | 4 | #ifdef USE_CUDA 5 | #include <cuda.h> 6 | #endif 7 | 8 | namespace torchaudio { 9 | 10 | bool is_rir_available() { 11 | #ifdef INCLUDE_RIR 12 | return true; 13 | #else 14 | return false; 15 | #endif 16 | } 17 | 18 | bool is_align_available() { 19 | #ifdef INCLUDE_ALIGN 20 | return true; 21 | #else 22 | return false; 23 | #endif 24 | } 25 | 26 | c10::optional<int64_t> cuda_version() { 27 | #ifdef USE_CUDA 28 | return CUDA_VERSION; 29 | #else 30 | return {}; 31 | #endif 32 | } 33 | 34 | } // namespace torchaudio 35 | -------------------------------------------------------------------------------- /docs/post_process_dispatcher.py: -------------------------------------------------------------------------------- 1 | """Replaces every instance of 'torchaudio._backend' with 'torchaudio' in torchaudio.html. 2 | Temporary hack while we maintain both the existing set of info/load/save functions and the 3 | new ones backed by the backend dispatcher in torchaudio._backend. 4 | """ 5 | import sys 6 | 7 | if __name__ == "__main__": 8 | build_dir = sys.argv[1] 9 | filepath = f"{build_dir}/html/torchaudio.html" 10 | 11 | with open(filepath, "r") as f: 12 | text = f.read() 13 | text = text.replace("torchaudio._backend", "torchaudio") 14 | 15 | with open(filepath, "w") as f: 16 | f.write(text) 17 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/io/common.py: -------------------------------------------------------------------------------- 1 | import torchaudio 2 | 3 | 4 | # If FFmpeg is 4.1 or older, 5 | # tests that check the number of output samples from OPUS fail. 6 | # They work on 4.2+. 7 | # Probably this commit fixed it: 8 | # https://github.com/FFmpeg/FFmpeg/commit/18aea7bdd96b320a40573bccabea56afeccdd91c 9 | def lt42(): 10 | ver = torchaudio.utils.ffmpeg_utils.get_versions()["libavcodec"] 11 | # 5.1 libavcodec 59. 18.100 12 | # 4.4 libavcodec 58.134.100 13 | # 4.3 libavcodec 58. 91.100 14 | # 4.2 libavcodec 58. 54.100 15 | # 4.1 libavcodec 58. 
35.100 16 | return ver[0] < 59 and ver[1] < 54 17 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/prototype/functional/torchscript_consistency_cuda_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchaudio_unittest.common_utils import PytorchTestCase, skipIfNoCuda 3 | 4 | from .torchscript_consistency_test_impl import TorchScriptConsistencyTestImpl 5 | 6 | 7 | @skipIfNoCuda 8 | class TorchScriptConsistencyCUDAFloat32Test(TorchScriptConsistencyTestImpl, PytorchTestCase): 9 | dtype = torch.float32 10 | device = torch.device("cuda") 11 | 12 | 13 | @skipIfNoCuda 14 | class TorchScriptConsistencyCUDAFloat64Test(TorchScriptConsistencyTestImpl, PytorchTestCase): 15 | dtype = torch.float64 16 | device = torch.device("cuda") 17 | -------------------------------------------------------------------------------- /docs/source/sox_effects.rst: -------------------------------------------------------------------------------- 1 | .. py:module:: torchaudio.sox_effects 2 | 3 | torchaudio.sox_effects 4 | ====================== 5 | 6 | .. currentmodule:: torchaudio.sox_effects 7 | 8 | Applying effects 9 | ---------------- 10 | 11 | Apply a SoX effects chain to a torch.Tensor, or to a file, loading the result as a torch.Tensor. 12 | 13 | .. autosummary:: 14 | :toctree: generated 15 | :nosignatures: 16 | 17 | apply_effects_tensor 18 | apply_effects_file 19 | 20 | .. minigallery:: torchaudio.sox_effects.apply_effects_tensor 21 | 22 | Utilities 23 | --------- 24 | 25 | .. autosummary:: 26 | :toctree: generated 27 | :nosignatures: 28 | 29 | effect_names 30 | -------------------------------------------------------------------------------- /docs/source/torio.rst: -------------------------------------------------------------------------------- 1 | .. py:module:: torio 2 | 3 | torio 4 | ===== 5 | 6 | .. currentmodule:: torio.io 7 | 8 | ``torio`` is an alternative top-level module for I/O features. It is an extraction of the core I/O implementation of ``torchaudio``. 9 | 10 | If you want to use the multimedia processing features, but do not want to depend on the entire ``torchaudio`` package, you can use ``torio``. 11 | 12 | .. note:: 13 | 14 | Currently, ``torio`` is distributed alongside ``torchaudio``, and there is no stand-alone 15 | procedure to install ``torio`` only. Please refer to https://pytorch.org/get-started/locally/ 16 | for the installation of ``torchaudio``. 
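As a hedged sketch of what depending on ``torio`` alone might look like — the input path is hypothetical, and ``StreamingMediaDecoder`` is the class that ``torchaudio.io`` re-exports as ``StreamReader``:

```python
# Sketch: decoding audio with torio directly, without importing torchaudio.
from torio.io import StreamingMediaDecoder

decoder = StreamingMediaDecoder("example.wav")  # hypothetical input file
decoder.add_basic_audio_stream(frames_per_chunk=4096)
for (chunk,) in decoder.stream():
    # Each chunk is a tensor of shape (frames, channels).
    print(chunk.shape)
```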
17 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/backend/common.py: -------------------------------------------------------------------------------- 1 | from torchaudio_unittest.common_utils import sox_utils 2 | 3 | 4 | def get_encoding(ext, dtype): 5 | exts = { 6 | "mp3", 7 | "flac", 8 | "vorbis", 9 | } 10 | encodings = { 11 | "float32": "PCM_F", 12 | "int32": "PCM_S", 13 | "int16": "PCM_S", 14 | "uint8": "PCM_U", 15 | } 16 | return ext.upper() if ext in exts else encodings[dtype] 17 | 18 | 19 | def get_bits_per_sample(ext, dtype): 20 | bits_per_samples = { 21 | "flac": 24, 22 | "mp3": 0, 23 | "vorbis": 0, 24 | } 25 | return bits_per_samples.get(ext, sox_utils.get_bit_depth(dtype)) 26 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/prototype/functional/functional_cpu_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchaudio_unittest.common_utils import PytorchTestCase 3 | 4 | from .functional_test_impl import Functional64OnlyTestImpl, FunctionalTestImpl 5 | 6 | 7 | class FunctionalFloat32CPUTest(FunctionalTestImpl, PytorchTestCase): 8 | dtype = torch.float32 9 | device = torch.device("cpu") 10 | 11 | 12 | class FunctionalFloat64CPUTest(FunctionalTestImpl, PytorchTestCase): 13 | dtype = torch.float64 14 | device = torch.device("cpu") 15 | 16 | 17 | class FunctionalFloat64OnlyCPUTest(Functional64OnlyTestImpl, PytorchTestCase): 18 | dtype = torch.float64 19 | device = torch.device("cpu") 20 | -------------------------------------------------------------------------------- /CITATION: -------------------------------------------------------------------------------- 1 | @misc{hwang2023torchaudio, 2 | title={TorchAudio 2.1: Advancing speech recognition, self-supervised learning, and audio processing components for PyTorch}, 3 | author={Jeff Hwang and Moto Hira and Caroline Chen and Xiaohui Zhang and Zhaoheng Ni and Guangzhi Sun and Pingchuan Ma and Ruizhe Huang and Vineel Pratap and Yuekai Zhang and Anurag Kumar and Chin-Yun Yu and Chuang Zhu and Chunxi Liu and Jacob Kahn and Mirco Ravanelli and Peng Sun and Shinji Watanabe and Yangyang Shi and Yumeng Tao and Robin Scheibler and Samuele Cornell and Sean Kim and Stavros Petridis}, 4 | year={2023}, 5 | eprint={2310.17864}, 6 | archivePrefix={arXiv}, 7 | primaryClass={eess.AS} 8 | } 9 | -------------------------------------------------------------------------------- /examples/libtorchaudio/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.5) 2 | 3 | project(libtorchaudio-cpp-example) 4 | 5 | SET(BUILD_SOX ON CACHE BOOL "Build libsox into libtorchaudio") 6 | 7 | SET(BUILD_KALDI OFF CACHE BOOL "Build Kaldi into libtorchaudio") 8 | SET(BUILD_RNNT ON CACHE BOOL "Build RNN transducer into libtorchaudio") 9 | SET(BUILD_TORCHAUDIO_PYTHON_EXTENSION OFF CACHE BOOL "Build Python binding") 10 | 11 | find_package(Torch REQUIRED) 12 | message("libtorchaudio CMakeLists: ${TORCH_CXX_FLAGS}") 13 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS}") 14 | 15 | add_subdirectory(../.. 
libtorchaudio) 16 | add_subdirectory(augmentation) 17 | add_subdirectory(speech_recognition) 18 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/models/tacotron2/model_test_cpu_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchaudio_unittest.common_utils import PytorchTestCase 3 | 4 | from .model_test_impl import Tacotron2DecoderTests, Tacotron2EncoderTests, Tacotron2Tests 5 | 6 | 7 | class TestTacotron2EncoderFloat32CPU(Tacotron2EncoderTests, PytorchTestCase): 8 | dtype = torch.float32 9 | device = torch.device("cpu") 10 | 11 | 12 | class TestTacotron2DecoderFloat32CPU(Tacotron2DecoderTests, PytorchTestCase): 13 | dtype = torch.float32 14 | device = torch.device("cpu") 15 | 16 | 17 | class TestTacotron2Float32CPU(Tacotron2Tests, PytorchTestCase): 18 | dtype = torch.float32 19 | device = torch.device("cpu") 20 | -------------------------------------------------------------------------------- /src/libtorio/ffmpeg/stream_reader/buffer/unchunked_buffer.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | namespace torio::io::detail { 8 | 9 | class UnchunkedBuffer { 10 | // Each AVFrame is converted to a Tensor and stored here. 11 | std::deque<torch::Tensor> chunks; 12 | double pts = -1.; 13 | AVRational time_base; 14 | 15 | public: 16 | explicit UnchunkedBuffer(AVRational time_base); 17 | bool is_ready() const; 18 | void push_frame(torch::Tensor frame, int64_t pts_); 19 | c10::optional<Chunk> pop_chunk(); 20 | void flush(); 21 | }; 22 | 23 | } // namespace torio::io::detail 24 | -------------------------------------------------------------------------------- /src/torchaudio/prototype/functional/__init__.py: -------------------------------------------------------------------------------- 1 | from ._dsp import ( 2 | adsr_envelope, 3 | exp_sigmoid, 4 | extend_pitch, 5 | filter_waveform, 6 | frequency_impulse_response, 7 | oscillator_bank, 8 | sinc_impulse_response, 9 | ) 10 | from ._rir import ray_tracing, simulate_rir_ism 11 | from .functional import barkscale_fbanks, chroma_filterbank 12 | 13 | 14 | __all__ = [ 15 | "adsr_envelope", 16 | "exp_sigmoid", 17 | "barkscale_fbanks", 18 | "chroma_filterbank", 19 | "extend_pitch", 20 | "filter_waveform", 21 | "frequency_impulse_response", 22 | "oscillator_bank", 23 | "ray_tracing", 24 | "sinc_impulse_response", 25 | "simulate_rir_ism", 26 | ] 27 | -------------------------------------------------------------------------------- /test/integration_tests/prototype/vggish_pipeline_test.py: -------------------------------------------------------------------------------- 1 | import torchaudio 2 | from torchaudio.prototype.pipelines import VGGISH 3 | 4 | 5 | def test_vggish(): 6 | input_sr = VGGISH.sample_rate 7 | input_proc = VGGISH.get_input_processor() 8 | model = VGGISH.get_model() 9 | path = torchaudio.utils.download_asset("test-assets/Chopin_Ballade_-1_In_G_Minor,_Op._23_excerpt.mp3") 10 | waveform, sr = torchaudio.load(path, backend="ffmpeg") 11 | waveform = waveform.mean(axis=0) 12 | waveform = torchaudio.functional.resample(waveform, sr, input_sr) 13 | batch = input_proc(waveform) 14 | assert batch.shape == (62, 1, 96, 64) 15 | output = model(batch) 16 | assert output.shape == (62, 128) 17 | -------------------------------------------------------------------------------- /test/cpp/CMakeLists.txt:
-------------------------------------------------------------------------------- 1 | include(FetchContent) 2 | FetchContent_Declare( 3 | googletest 4 | URL https://github.com/google/googletest/archive/refs/tags/v1.14.0.zip 5 | ) 6 | 7 | # For Windows: Prevent overriding the parent project's compiler/linker settings 8 | set(gtest_force_shared_crt ON CACHE BOOL "" FORCE) 9 | FetchContent_MakeAvailable(googletest) 10 | 11 | enable_testing() 12 | 13 | add_executable( 14 | wall_collision 15 | rir/wall_collision.cpp 16 | ) 17 | target_link_libraries( 18 | wall_collision 19 | torch 20 | GTest::gtest_main 21 | ) 22 | target_include_directories( 23 | wall_collision 24 | PRIVATE 25 | "${PROJECT_SOURCE_DIR}/src" 26 | ) 27 | add_test(NAME wall_collision_test COMMAND wall_collision) 28 | -------------------------------------------------------------------------------- /docs/source/prototype.rst: -------------------------------------------------------------------------------- 1 | torchaudio.prototype 2 | ==================== 3 | 4 | ``torchaudio.prototype`` provides prototype features; 5 | they are at an early stage for feedback and testing. 6 | Their interfaces might be changed without prior notice. 7 | 8 | Most prototype modules are excluded from releases. 9 | Please refer to `here `_ for 10 | more information on prototype features. 11 | 12 | The modules under ``torchaudio.prototype`` must be 13 | imported explicitly, e.g. 14 | 15 | .. code-block:: python 16 | 17 | import torchaudio.prototype.models 18 | 19 | .. toctree:: 20 | prototype.datasets 21 | prototype.functional 22 | prototype.models 23 | prototype.pipelines 24 | prototype.transforms 25 | -------------------------------------------------------------------------------- /packaging/vs2019/meta.yaml: -------------------------------------------------------------------------------- 1 | {% set vcver="14.2" %} 2 | {% set vcfeature="14" %} 3 | {% set vsyear="2019" %} 4 | {% set fullver="15.4.27004.2010" %} 5 | 6 | package: 7 | name: vs{{ vsyear }} 8 | version: {{ fullver }} 9 | 10 | build: 11 | skip: True  # [not win] 12 | script_env: 13 | - VSDEVCMD_ARGS # [win] 14 | 15 | outputs: 16 | - name: vs{{ vsyear }}_{{ cross_compiler_target_platform }} 17 | script: install_activate.bat 18 | track_features: 19 | # VS 2019 is binary-compatible with VS 2017/vc 14.1 and 2015/vc14. Tools are "v142".
20 | strong: 21 | - vc{{ vcfeature }} 22 | about: 23 | summary: Activation and version verification of MSVC {{ vcver }} (VS {{ vsyear }}) compiler 24 | license: BSD 3-clause 25 | -------------------------------------------------------------------------------- /src/libtorio/ffmpeg/stream_reader/packet_buffer.cpp: -------------------------------------------------------------------------------- 1 | #include <libtorio/ffmpeg/stream_reader/packet_buffer.h> 2 | 3 | namespace torio::io { 4 | void PacketBuffer::push_packet(AVPacket* packet) { 5 | TORCH_INTERNAL_ASSERT_DEBUG_ONLY(packet, "Packet is null."); 6 | AVPacket* p = av_packet_clone(packet); 7 | TORCH_INTERNAL_ASSERT(p, "Failed to clone packet."); 8 | packets.emplace_back(p); 9 | } 10 | std::vector<AVPacketPtr> PacketBuffer::pop_packets() { 11 | std::vector<AVPacketPtr> ret{ 12 | std::make_move_iterator(packets.begin()), 13 | std::make_move_iterator(packets.end())}; 14 | packets.clear(); 15 | return ret; 16 | } 17 | bool PacketBuffer::has_packets() { 18 | return packets.size() > 0; 19 | } 20 | } // namespace torio::io 21 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/common_utils/psd_utils.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | import numpy as np 4 | import torch 5 | 6 | 7 | def psd_numpy( 8 | X: np.array, mask: Optional[np.array], multi_mask: bool = False, normalize: bool = True, eps: float = 1e-15 9 | ) -> np.array: 10 | X_conj = np.conj(X) 11 | psd_X = np.einsum("...cft,...eft->...ftce", X, X_conj) 12 | if mask is not None: 13 | if multi_mask: 14 | mask = mask.mean(axis=-3) 15 | if normalize: 16 | mask = mask / (mask.sum(axis=-1, keepdims=True) + eps) 17 | psd = psd_X * mask[..., None, None] 18 | else: 19 | psd = psd_X 20 | 21 | psd = psd.sum(axis=-3) 22 | 23 | return torch.tensor(psd, dtype=torch.cdouble) 24 | -------------------------------------------------------------------------------- /.github/scripts/unittest-linux/run_test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | eval "$($(which conda) shell.bash hook)" 6 | 7 | conda activate ci 8 | 9 | python -m torch.utils.collect_env 10 | env | grep TORCHAUDIO || true 11 | 12 | export PATH="${PWD}/third_party/install/bin/:${PATH}" 13 | 14 | declare -a args=( 15 | '--continue-on-collection-errors' 16 | '-v' 17 | '--cov=torchaudio' 18 | "--junitxml=${RUNNER_TEST_RESULTS_DIR}/junit.xml" 19 | '--durations' '20' 20 | ) 21 | 22 | if [[ "${CUDA_TESTS_ONLY}" = "1" ]]; then 23 | args+=('-k' 'cuda or gpu') 24 | fi 25 | 26 | ( 27 | cd build/temp*/test/cpp 28 | ctest 29 | ) 30 | 31 | ( 32 | cd test 33 | pytest "${args[@]}" torchaudio_unittest 34 | coverage html 35 | ) 36 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/prototype/functional/functional_cuda_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchaudio_unittest.common_utils import PytorchTestCase, skipIfNoCuda 3 | 4 | from .functional_test_impl import Functional64OnlyTestImpl, FunctionalTestImpl 5 | 6 | 7 | @skipIfNoCuda 8 | class FunctionalFloat32CUDATest(FunctionalTestImpl, PytorchTestCase): 9 | dtype = torch.float32 10 | device = torch.device("cuda", 0) 11 | 12 | 13 | @skipIfNoCuda 14 | class FunctionalFloat64CUDATest(FunctionalTestImpl, PytorchTestCase): 15 | dtype = torch.float64 16 | device = torch.device("cuda", 0) 17 | 18 | 19 | @skipIfNoCuda 20 |
class FunctionalFloat64OnlyCUDATest(Functional64OnlyTestImpl, PytorchTestCase): 21 | dtype = torch.float64 22 | device = torch.device("cuda") 23 | -------------------------------------------------------------------------------- /test/integration_tests/ctc_decoder_integration_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.mark.parametrize( 5 | "model,expected", 6 | [ 7 | ("librispeech", ["the", "captain", "shook", "his", "head"]), 8 | ("librispeech-3-gram", ["the", "captain", "shook", "his", "head"]), 9 | ], 10 | ) 11 | def test_decoder_from_pretrained(model, expected, emissions): 12 | from torchaudio.models.decoder import ctc_decoder, download_pretrained_files 13 | 14 | pretrained_files = download_pretrained_files(model) 15 | decoder = ctc_decoder( 16 | lexicon=pretrained_files.lexicon, 17 | tokens=pretrained_files.tokens, 18 | lm=pretrained_files.lm, 19 | ) 20 | result = decoder(emissions) 21 | assert result[0][0].words == expected 22 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/models/tacotron2/model_test_gpu_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchaudio_unittest.common_utils import PytorchTestCase, skipIfNoCuda 3 | 4 | from .model_test_impl import Tacotron2DecoderTests, Tacotron2EncoderTests, Tacotron2Tests 5 | 6 | 7 | @skipIfNoCuda 8 | class TestTacotron2EncoderFloat32CUDA(Tacotron2EncoderTests, PytorchTestCase): 9 | dtype = torch.float32 10 | device = torch.device("cuda") 11 | 12 | 13 | @skipIfNoCuda 14 | class TestTacotron2DecoderFloat32CUDA(Tacotron2DecoderTests, PytorchTestCase): 15 | dtype = torch.float32 16 | device = torch.device("cuda") 17 | 18 | 19 | @skipIfNoCuda 20 | class TestTacotron2Float32CUDA(Tacotron2Tests, PytorchTestCase): 21 | dtype = torch.float32 22 | device = torch.device("cuda") 23 | -------------------------------------------------------------------------------- /examples/libtorchaudio/augmentation/main.cpp: -------------------------------------------------------------------------------- 1 | #include <torch/script.h> 2 | 3 | int main(int argc, char* argv[]) { 4 | if (argc != 4) { 5 | std::cerr << "Usage: " << argv[0] 6 | << " <path to module> <input file> <output file>" << std::endl; 7 | return -1; 8 | } 9 | 10 | torch::jit::script::Module module; 11 | std::cout << "Loading module from: " << argv[1] << std::endl; 12 | try { 13 | module = torch::jit::load(argv[1]); 14 | } catch (const c10::Error& error) { 15 | std::cerr << "Failed to load the module:" << error.what() << std::endl; 16 | return -1; 17 | } 18 | 19 | std::cout << "Performing the process ..." << std::endl; 20 | module.forward({c10::IValue(argv[2]), c10::IValue(argv[3])}); 21 | std::cout << "Done." << std::endl; 22 | } 23 | -------------------------------------------------------------------------------- /third_party/sox/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(FetchContent) 2 | 3 | FetchContent_Declare( 4 | sox_src 5 | URL https://downloads.sourceforge.net/project/sox/sox/14.4.2/sox-14.4.2.tar.bz2 6 | URL_HASH SHA256=81a6956d4330e75b5827316e44ae381e6f1e8928003c6aa45896da9041ea149c 7 | PATCH_COMMAND "" 8 | CONFIGURE_COMMAND "" 9 | BUILD_COMMAND "" 10 | ) 11 | # FetchContent_MakeAvailable will parse the downloaded content and setup the targets. 12 | # We want to only download and not build, so we run Populate manually.
13 | if(NOT sox_src_POPULATED) 14 | FetchContent_Populate(sox_src) 15 | endif() 16 | 17 | add_library(sox SHARED stub.c) 18 | if(APPLE) 19 | set_target_properties(sox PROPERTIES SUFFIX .dylib) 20 | endif(APPLE) 21 | target_include_directories(sox PUBLIC ${sox_src_SOURCE_DIR}/src) 22 | -------------------------------------------------------------------------------- /packaging/vs2019/conda_build_config.yaml: -------------------------------------------------------------------------------- 1 | c_compiler: 2 | - vs2019 # [win] 3 | cxx_compiler: 4 | - vs2019 # [win] 5 | python: 6 | - 3.8 7 | # This differs from target_platform in that it determines what subdir the compiler 8 | # will target, not what subdir the compiler package will be itself. 9 | # For example, we need a win-64 vs2008_win-32 package, so that we compile win-32 10 | # code on win-64 miniconda. 11 | cross_compiler_target_platform: 12 | - win-64 # [win] 13 | target_platform: 14 | - win-64 # [win] 15 | vc: 16 | - 14 17 | zip_keys: 18 | - # [win] 19 | - vc # [win] 20 | - c_compiler # [win] 21 | - cxx_compiler # [win] 22 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/example/tacotron2/tacotron2_loss_cpu_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchaudio_unittest.common_utils import PytorchTestCase 3 | 4 | from .tacotron2_loss_impl import Tacotron2LossGradcheckTests, Tacotron2LossShapeTests, Tacotron2LossTorchscriptTests 5 | 6 | 7 | class TestTacotron2LossShapeFloat32CPU(Tacotron2LossShapeTests, PytorchTestCase): 8 | dtype = torch.float32 9 | device = torch.device("cpu") 10 | 11 | 12 | class TestTacotron2TorchsciptFloat32CPU(Tacotron2LossTorchscriptTests, PytorchTestCase): 13 | dtype = torch.float32 14 | device = torch.device("cpu") 15 | 16 | 17 | class TestTacotron2GradcheckFloat64CPU(Tacotron2LossGradcheckTests, PytorchTestCase): 18 | dtype = torch.float64 # gradcheck needs a higher numerical accuracy 19 | device = torch.device("cpu") 20 | -------------------------------------------------------------------------------- /docs/source/prototype.functional.rst: -------------------------------------------------------------------------------- 1 | torchaudio.prototype.functional 2 | =============================== 3 | 4 | .. py:module:: torchaudio.prototype.functional 5 | .. currentmodule:: torchaudio.prototype.functional 6 | 7 | Utility 8 | ~~~~~~~ 9 | 10 | .. autosummary:: 11 | :toctree: generated 12 | :nosignatures: 13 | 14 | barkscale_fbanks 15 | chroma_filterbank 16 | 17 | DSP 18 | ~~~ 19 | 20 | .. autosummary:: 21 | :toctree: generated 22 | :nosignatures: 23 | 24 | adsr_envelope 25 | filter_waveform 26 | extend_pitch 27 | oscillator_bank 28 | sinc_impulse_response 29 | frequency_impulse_response 30 | 31 | Room Impulse Response Simulation 32 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 33 | 34 | .. 
autosummary:: 35 | :toctree: generated 36 | :nosignatures: 37 | 38 | ray_tracing 39 | simulate_rir_ism 40 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/assets/decoder/kenlm.arpa: -------------------------------------------------------------------------------- 1 | \data\ 2 | ngram 1=6 3 | ngram 2=9 4 | ngram 3=8 5 | 6 | \1-grams: 7 | -0.8515802 0 8 | 0 -0.30103 9 | -0.8515802 0 10 | -0.8515802 foo -0.30103 11 | -0.44013768 bar -0.30103 12 | -0.6679358 foobar -0.30103 13 | 14 | \2-grams: 15 | -0.7091413 foo 0 16 | -0.6251838 bar 0 17 | -0.24384303 foobar 0 18 | -0.6251838 foo -0.30103 19 | -0.49434766 foo foo -0.30103 20 | -0.39393726 bar foo -0.30103 21 | -0.4582359 bar -0.30103 22 | -0.51359576 foo bar -0.30103 23 | -0.56213206 foobar -0.30103 24 | 25 | \3-grams: 26 | -0.45881382 bar foo 27 | -0.43354067 foo bar 28 | -0.105027884 foobar 29 | -0.18033421 foo foo 30 | -0.38702002 bar foo foo 31 | -0.15375455 bar foo 32 | -0.34500393 foo bar foo 33 | -0.18492673 foo foo bar 34 | 35 | \end\ 36 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/common_utils/autograd_utils.py: -------------------------------------------------------------------------------- 1 | import contextlib 2 | 3 | import torch 4 | 5 | 6 | @contextlib.contextmanager 7 | def use_deterministic_algorithms(mode: bool, warn_only: bool): 8 | r""" 9 | This context manager can be used to temporarily enable or disable deterministic algorithms. 10 | Upon exiting the context manager, the previous state of the flag will be restored. 11 | """ 12 | previous_mode: bool = torch.are_deterministic_algorithms_enabled() 13 | previous_warn_only: bool = torch.is_deterministic_algorithms_warn_only_enabled() 14 | try: 15 | torch.use_deterministic_algorithms(mode, warn_only=warn_only) 16 | yield {} 17 | except RuntimeError as err: 18 | raise err 19 | finally: 20 | torch.use_deterministic_algorithms(previous_mode, warn_only=previous_warn_only) 21 | -------------------------------------------------------------------------------- /src/libtorchaudio/forced_align/compute.cpp: -------------------------------------------------------------------------------- 1 | #include <libtorchaudio/forced_align/compute.h> 2 | #include <torch/script.h> 3 | 4 | std::tuple<torch::Tensor, torch::Tensor> forced_align( 5 | const torch::Tensor& logProbs, 6 | const torch::Tensor& targets, 7 | const torch::Tensor& inputLengths, 8 | const torch::Tensor& targetLengths, 9 | const int64_t blank) { 10 | static auto op = torch::Dispatcher::singleton() 11 | .findSchemaOrThrow("torchaudio::forced_align", "") 12 | .typed<decltype(forced_align)>(); 13 | return op.call(logProbs, targets, inputLengths, targetLengths, blank); 14 | } 15 | 16 | TORCH_LIBRARY_FRAGMENT(torchaudio, m) { 17 | m.def( 18 | "forced_align(Tensor log_probs, Tensor targets, Tensor input_lengths, Tensor target_lengths, int blank) -> (Tensor, Tensor)"); 19 | } 20 | -------------------------------------------------------------------------------- /src/libtorio/ffmpeg/stream_writer/encoder.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | namespace torio::io { 8 | 9 | // Encoder + Muxer 10 | class Encoder { 11 | // Reference to the AVFormatContext (muxer) 12 | AVFormatContext* format_ctx; 13 | // Reference to codec context (encoder) 14 | AVCodecContext* codec_ctx; 15 | // Stream object as reference. Owned by AVFormatContext.
16 | AVStream* stream; 17 | // Temporary object used during the encoding 18 | // Encoder owns it. 19 | AVPacketPtr packet{alloc_avpacket()}; 20 | 21 | public: 22 | Encoder( 23 | AVFormatContext* format_ctx, 24 | AVCodecContext* codec_ctx, 25 | AVStream* stream) noexcept; 26 | 27 | void encode(AVFrame* frame); 28 | }; 29 | 30 | } // namespace torio::io 31 | -------------------------------------------------------------------------------- /src/libtorchaudio/sox/effects.h: -------------------------------------------------------------------------------- 1 | #ifndef TORCHAUDIO_SOX_EFFECTS_H 2 | #define TORCHAUDIO_SOX_EFFECTS_H 3 | 4 | #include 5 | #include 6 | 7 | namespace torchaudio::sox { 8 | 9 | void initialize_sox_effects(); 10 | 11 | void shutdown_sox_effects(); 12 | 13 | auto apply_effects_tensor( 14 | torch::Tensor waveform, 15 | int64_t sample_rate, 16 | const std::vector<std::vector<std::string>>& effects, 17 | bool channels_first) -> std::tuple<torch::Tensor, int64_t>; 18 | 19 | auto apply_effects_file( 20 | const std::string& path, 21 | const std::vector<std::vector<std::string>>& effects, 22 | c10::optional<bool> normalize, 23 | c10::optional<bool> channels_first, 24 | const c10::optional<std::string>& format) 25 | -> std::tuple<torch::Tensor, int64_t>; 26 | 27 | } // namespace torchaudio::sox 28 | 29 | #endif 30 | -------------------------------------------------------------------------------- /.github/workflows/pr-labels.yml: -------------------------------------------------------------------------------- 1 | name: pr-labels 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | 8 | jobs: 9 | is-properly-labeled: 10 | runs-on: ubuntu-latest 11 | 12 | steps: 13 | - name: Set up python 14 | uses: actions/setup-python@v2 15 | 16 | - name: Install requests 17 | run: pip install requests 18 | 19 | - name: Checkout repository 20 | uses: actions/checkout@v2 21 | 22 | - name: Process commit and find merger responsible for labeling 23 | id: commit 24 | env: 25 | SHA1: ${{ github.sha }} 26 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 27 | run: python .github/process_commit.py "${SHA1}" 28 | 29 | concurrency: 30 | group: pr-labels-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} 31 | cancel-in-progress: true 32 | -------------------------------------------------------------------------------- /src/torchaudio/backend/_no_backend.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import Callable, Optional, Tuple, Union 3 | 4 | from torch import Tensor 5 | from torchaudio import AudioMetaData 6 | 7 | 8 | def load( 9 | filepath: Union[str, Path], 10 | out: Optional[Tensor] = None, 11 | normalization: Union[bool, float, Callable] = True, 12 | channels_first: bool = True, 13 | num_frames: int = 0, 14 | offset: int = 0, 15 | filetype: Optional[str] = None, 16 | ) -> Tuple[Tensor, int]: 17 | raise RuntimeError("No audio I/O backend is available.") 18 | 19 | 20 | def save(filepath: str, src: Tensor, sample_rate: int, precision: int = 16, channels_first: bool = True) -> None: 21 | raise RuntimeError("No audio I/O backend is available.") 22 | 23 | 24 | def info(filepath: str) -> AudioMetaData: 25 | raise RuntimeError("No audio I/O backend is available.") 26 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/example/tacotron2/tacotron2_loss_gpu_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchaudio_unittest.common_utils import PytorchTestCase, skipIfNoCuda 3 | 4 | from
.tacotron2_loss_impl import Tacotron2LossGradcheckTests, Tacotron2LossShapeTests, Tacotron2LossTorchscriptTests 5 | 6 | 7 | @skipIfNoCuda 8 | class TestTacotron2LossShapeFloat32CUDA(PytorchTestCase, Tacotron2LossShapeTests): 9 | dtype = torch.float32 10 | device = torch.device("cuda") 11 | 12 | 13 | @skipIfNoCuda 14 | class TestTacotron2TorchsciptFloat32CUDA(PytorchTestCase, Tacotron2LossTorchscriptTests): 15 | dtype = torch.float32 16 | device = torch.device("cuda") 17 | 18 | 19 | @skipIfNoCuda 20 | class TestTacotron2GradcheckFloat64CUDA(PytorchTestCase, Tacotron2LossGradcheckTests): 21 | dtype = torch.float64 # gradcheck needs a higher numerical accuracy 22 | device = torch.device("cuda") 23 | -------------------------------------------------------------------------------- /examples/avsr/models/conformer_rnnt.py: -------------------------------------------------------------------------------- 1 | from torchaudio.prototype.models import conformer_rnnt_model 2 | 3 | # https://pytorch.org/audio/master/_modules/torchaudio/prototype/models/rnnt.html#conformer_rnnt_model 4 | 5 | 6 | def conformer_rnnt(): 7 | return conformer_rnnt_model( 8 | input_dim=512, 9 | encoding_dim=1024, 10 | time_reduction_stride=1, 11 | conformer_input_dim=256, 12 | conformer_ffn_dim=1024, 13 | conformer_num_layers=16, 14 | conformer_num_heads=4, 15 | conformer_depthwise_conv_kernel_size=31, 16 | conformer_dropout=0.1, 17 | num_symbols=1024, 18 | symbol_embedding_dim=256, 19 | num_lstm_layers=2, 20 | lstm_hidden_dim=512, 21 | lstm_layer_norm=True, 22 | lstm_layer_norm_epsilon=1e-5, 23 | lstm_dropout=0.3, 24 | joiner_activation="tanh", 25 | ) 26 | -------------------------------------------------------------------------------- /src/libtorchaudio/rnnt/gpu/half.cuh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #ifdef USE_C10_HALF 4 | #include "c10/util/Half.h" 5 | #endif // USE_C10_HALF 6 | 7 | #include <cuda_fp16.h> 8 | 9 | namespace torchaudio { 10 | namespace rnnt { 11 | 12 | struct alignas(sizeof(__half)) Half { 13 | __half x; 14 | 15 | HOST_AND_DEVICE Half() = default; 16 | 17 | FORCE_INLINE HOST_AND_DEVICE Half(float f) { 18 | x = __float2half_rn(f); 19 | if (isinf(__half2float(x))) { 20 | x = __float2half_rz(f); // round toward 0. 21 | } 22 | } 23 | 24 | FORCE_INLINE HOST_AND_DEVICE operator float() const { 25 | return __half2float(x); 26 | } 27 | 28 | FORCE_INLINE HOST_AND_DEVICE Half(__half f) { 29 | x = f; 30 | } 31 | 32 | FORCE_INLINE HOST_AND_DEVICE operator __half() const { 33 | return x; 34 | } 35 | }; 36 | 37 | } // namespace rnnt 38 | } // namespace torchaudio 39 | -------------------------------------------------------------------------------- /docs/source/models.decoder.rst: -------------------------------------------------------------------------------- 1 | .. py:module:: torchaudio.models.decoder 2 | 3 | torchaudio.models.decoder 4 | ========================= 5 | 6 | .. currentmodule:: torchaudio.models.decoder 7 | 8 | CTC Decoder 9 | ----------- 10 | 11 | .. autosummary:: 12 | :toctree: generated 13 | :nosignatures: 14 | :template: autosummary/ctc_decoder_class.rst 15 | 16 | CTCDecoder 17 | ctc_decoder 18 | download_pretrained_files 19 | 20 | .. rubric:: Tutorials using CTC Decoder 21 | 22 | .. minigallery:: torchaudio.models.decoder.CTCDecoder 23 | 24 | CUDA CTC Decoder 25 | ---------------- 26 | 27 | ..
autosummary:: 28 | :toctree: generated 29 | :nosignatures: 30 | :template: autosummary/cuda_ctc_decoder_class.rst 31 | 32 | CUCTCDecoder 33 | cuda_ctc_decoder 34 | 35 | 36 | .. rubric:: Tutorials using CUDA CTC Decoder 37 | 38 | .. minigallery:: torchaudio.models.decoder.CUCTCDecoder 39 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/sox_effect/common.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from parameterized import param 4 | from torchaudio_unittest.common_utils import get_asset_path 5 | 6 | 7 | def name_func(func, _, params): 8 | if isinstance(params.args[0], str): 9 | args = "_".join([str(arg) for arg in params.args]) 10 | else: 11 | args = "_".join([str(arg) for arg in params.args[0]]) 12 | return f"{func.__name__}_{args}" 13 | 14 | 15 | def load_params(*paths): 16 | params = [] 17 | with open(get_asset_path(*paths), "r") as file: 18 | for line in file: 19 | data = json.loads(line) 20 | for effect in data["effects"]: 21 | for i, arg in enumerate(effect): 22 | if arg.startswith("<ASSET_DIR>"): 23 | effect[i] = arg.replace("<ASSET_DIR>", get_asset_path()) 24 | params.append(param(data)) 25 | return params 26 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | set SPHINXPROJ=torchaudio 13 | 14 | if "%1" == "" goto help 15 | 16 | %SPHINXBUILD% >NUL 2>NUL 17 | if errorlevel 9009 ( 18 | echo. 19 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 20 | echo.installed, then set the SPHINXBUILD environment variable to point 21 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 22 | echo.may add the Sphinx directory to PATH. 23 | echo.
24 | echo.If you don't have Sphinx installed, grab it from 25 | echo.http://sphinx-doc.org/ 26 | exit /b 1 27 | ) 28 | 29 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 30 | goto end 31 | 32 | :help 33 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 34 | 35 | :end 36 | popd 37 | -------------------------------------------------------------------------------- /src/libtorio/ffmpeg/stream_reader/buffer/unchunked_buffer.cpp: -------------------------------------------------------------------------------- 1 | #include <libtorio/ffmpeg/stream_reader/buffer/unchunked_buffer.h> 2 | 3 | namespace torio::io::detail { 4 | 5 | UnchunkedBuffer::UnchunkedBuffer(AVRational time_base) : time_base(time_base){}; 6 | 7 | bool UnchunkedBuffer::is_ready() const { 8 | return chunks.size() > 0; 9 | } 10 | 11 | void UnchunkedBuffer::push_frame(torch::Tensor frame, int64_t pts_) { 12 | if (chunks.size() == 0) { 13 | pts = double(pts_) * time_base.num / time_base.den; 14 | } 15 | chunks.push_back(frame); 16 | } 17 | 18 | c10::optional<Chunk> UnchunkedBuffer::pop_chunk() { 19 | if (chunks.size() == 0) { 20 | return {}; 21 | } 22 | 23 | auto frames = 24 | torch::cat(std::vector<torch::Tensor>{chunks.begin(), chunks.end()}, 0); 25 | chunks.clear(); 26 | return {Chunk{frames, pts}}; 27 | } 28 | 29 | void UnchunkedBuffer::flush() { 30 | chunks.clear(); 31 | } 32 | 33 | } // namespace torio::io::detail 34 | -------------------------------------------------------------------------------- /examples/avsr/average_checkpoints.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import torch 4 | 5 | 6 | def average_checkpoints(last): 7 | avg = None 8 | for path in last: 9 | states = torch.load(path, map_location=lambda storage, loc: storage)["state_dict"] 10 | if avg is None: 11 | avg = states 12 | else: 13 | for k in avg.keys(): 14 | avg[k] += states[k] 15 | # average 16 | for k in avg.keys(): 17 | if avg[k] is not None: 18 | if avg[k].is_floating_point(): 19 | avg[k] /= len(last) 20 | else: 21 | avg[k] //= len(last) 22 | return avg 23 | 24 | 25 | def ensemble(args): 26 | last = [os.path.join(args.exp_dir, args.exp_name, f"epoch={n}.ckpt") for n in range(args.epochs - 10, args.epochs)] 27 | model_path = os.path.join(args.exp_dir, args.exp_name, "model_avg_10.pth") 28 | torch.save({"state_dict": average_checkpoints(last)}, model_path) 29 | -------------------------------------------------------------------------------- /examples/libtorchaudio/speech_recognition/greedy_decoder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | class Decoder(torch.nn.Module): 5 | def __init__(self, labels): 6 | super().__init__() 7 | self.labels = labels 8 | 9 | def forward(self, logits: torch.Tensor) -> str: 10 | """Given a sequence of logits over labels, get the best path string 11 | 12 | Args: 13 | logits (Tensor): Logit tensors. Shape `[num_seq, num_label]`.
14 | 15 | Returns: 16 | str: The resulting transcript 17 | """ 18 | best_path = torch.argmax(logits, dim=-1) # [num_seq,] 19 | best_path = torch.unique_consecutive(best_path, dim=-1) 20 | hypothesis = "" 21 | for i in best_path: 22 | char = self.labels[i] 23 | if char in ["<s>", "<pad>"]: # skip non-content tokens 24 | continue 25 | if char == "|": 26 | char = " " 27 | hypothesis += char 28 | return hypothesis 29 | -------------------------------------------------------------------------------- /test/integration_tests/tacotron2_pipeline_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from torchaudio.pipelines import ( 3 | TACOTRON2_GRIFFINLIM_CHAR_LJSPEECH, 4 | TACOTRON2_GRIFFINLIM_PHONE_LJSPEECH, 5 | TACOTRON2_WAVERNN_CHAR_LJSPEECH, 6 | TACOTRON2_WAVERNN_PHONE_LJSPEECH, 7 | ) 8 | 9 | 10 | @pytest.mark.parametrize( 11 | "bundle", 12 | [ 13 | TACOTRON2_GRIFFINLIM_CHAR_LJSPEECH, 14 | TACOTRON2_GRIFFINLIM_PHONE_LJSPEECH, 15 | TACOTRON2_WAVERNN_CHAR_LJSPEECH, 16 | TACOTRON2_WAVERNN_PHONE_LJSPEECH, 17 | ], 18 | ) 19 | def test_tts_models(bundle): 20 | """Smoke test of TTS pipeline""" 21 | text = "Hello world! Text to Speech!" 22 | 23 | processor = bundle.get_text_processor() 24 | tacotron2 = bundle.get_tacotron2() 25 | vocoder = bundle.get_vocoder() 26 | processed, lengths = processor(text) 27 | mel_spec, lengths, _ = tacotron2.infer(processed, lengths) 28 | waveforms, lengths = vocoder(mel_spec, lengths) 29 | -------------------------------------------------------------------------------- /docs/source/models.rst: -------------------------------------------------------------------------------- 1 | .. py:module:: torchaudio.models 2 | 3 | torchaudio.models 4 | ================= 5 | 6 | .. currentmodule:: torchaudio.models 7 | 8 | The ``torchaudio.models`` subpackage contains definitions of models for addressing common audio tasks. 9 | 10 | .. note:: 11 | For models with pre-trained parameters, please refer to the :mod:`torchaudio.pipelines` module. 12 | 13 | Model definitions are responsible for constructing computation graphs and executing them. 14 | 15 | Some models have complex structures and variations. 16 | For such models, factory functions are provided. 17 | 18 | ..
autosummary:: 19 | :toctree: generated 20 | :nosignatures: 21 | :template: autosummary/model_class.rst 22 | 23 | Conformer 24 | ConvTasNet 25 | DeepSpeech 26 | Emformer 27 | HDemucs 28 | HuBERTPretrainModel 29 | RNNT 30 | RNNTBeamSearch 31 | SquimObjective 32 | SquimSubjective 33 | Tacotron2 34 | Wav2Letter 35 | Wav2Vec2Model 36 | WaveRNN 37 | -------------------------------------------------------------------------------- /src/libtorchaudio/rnnt/cpu/math.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | namespace torchaudio { 7 | namespace rnnt { 8 | 9 | namespace math { 10 | 11 | template <typename DTYPE> 12 | FORCE_INLINE HOST_AND_DEVICE DTYPE max(DTYPE x, DTYPE y) { 13 | if (x > y) { 14 | return x; 15 | } else { 16 | return y; 17 | } 18 | } 19 | 20 | template <typename DTYPE> 21 | FORCE_INLINE HOST_AND_DEVICE DTYPE min(DTYPE x, DTYPE y) { 22 | if (x > y) { 23 | return y; 24 | } else { 25 | return x; 26 | } 27 | } 28 | 29 | // log_sum_exp 30 | template <typename DTYPE> 31 | FORCE_INLINE HOST_AND_DEVICE DTYPE lse(DTYPE x, DTYPE y); 32 | 33 | template <> 34 | FORCE_INLINE HOST_AND_DEVICE float lse(float x, float y) { 35 | if (y > x) { 36 | return y + log1pf(expf(x - y)); 37 | } else { 38 | return x + log1pf(expf(y - x)); 39 | } 40 | } 41 | 42 | } // namespace math 43 | 44 | } // namespace rnnt 45 | } // namespace torchaudio 46 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/prototype/functional/torchscript_consistency_cpu_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchaudio_unittest.common_utils import PytorchTestCase 3 | 4 | from .torchscript_consistency_test_impl import TorchScriptConsistencyCPUOnlyTestImpl, TorchScriptConsistencyTestImpl 5 | 6 | 7 | class TorchScriptConsistencyCPUFloat32Test(TorchScriptConsistencyTestImpl, PytorchTestCase): 8 | dtype = torch.float32 9 | device = torch.device("cpu") 10 | 11 | 12 | class TorchScriptConsistencyCPUFloat64Test(TorchScriptConsistencyTestImpl, PytorchTestCase): 13 | dtype = torch.float64 14 | device = torch.device("cpu") 15 | 16 | 17 | class TorchScriptConsistencyCPUOnlyFloat32Test(TorchScriptConsistencyCPUOnlyTestImpl, PytorchTestCase): 18 | dtype = torch.float32 19 | device = torch.device("cpu") 20 | 21 | 22 | class TorchScriptConsistencyCPUOnlyFloat64Test(TorchScriptConsistencyCPUOnlyTestImpl, PytorchTestCase): 23 | dtype = torch.float64 24 | device = torch.device("cpu") 25 | -------------------------------------------------------------------------------- /src/libtorchaudio/rnnt/gpu/math.cuh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #ifdef USE_CUDA 4 | 5 | #include 6 | 7 | #endif // USE_CUDA 8 | 9 | #include 10 | 11 | namespace torchaudio { 12 | namespace rnnt { 13 | 14 | namespace math { 15 | 16 | template <typename DTYPE> 17 | FORCE_INLINE HOST_AND_DEVICE DTYPE max(DTYPE x, DTYPE y) { 18 | if (x > y) 19 | return x; 20 | else 21 | return y; 22 | } 23 | 24 | template <typename DTYPE> 25 | FORCE_INLINE HOST_AND_DEVICE DTYPE min(DTYPE x, DTYPE y) { 26 | if (x > y) 27 | return y; 28 | else 29 | return x; 30 | } 31 | 32 | // log_sum_exp 33 | template <typename DTYPE> 34 | FORCE_INLINE HOST_AND_DEVICE DTYPE lse(DTYPE x, DTYPE y); 35 | 36 | template <> 37 | FORCE_INLINE HOST_AND_DEVICE float lse(float x, float y) { 38 | if (y > x) { 39 | return y + log1pf(expf(x - y)); 40 | } else { 41 | return x + log1pf(expf(y - x)); 42 | } 43 | } 44 | 45 | } // namespace math
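An aside on the `lse` helpers above: both the CPU and the GPU versions compute log(exp(x) + exp(y)) in the numerically stable form max(x, y) + log1p(exp(min(x, y) - max(x, y))), so a large positive value is never exponentiated. A small Python sketch of the same identity, for illustration only:

```python
import math

def lse(x: float, y: float) -> float:
    # Same stable formulation as the C++ templates above: factor out the
    # larger argument so exp() only ever sees a non-positive exponent.
    lo, hi = min(x, y), max(x, y)
    return hi + math.log1p(math.exp(lo - hi))

# A naive math.log(math.exp(1000) + math.exp(1000)) would overflow,
# while the stable form returns 1000 + log(2) as expected.
assert abs(lse(1000.0, 1000.0) - (1000.0 + math.log(2.0))) < 1e-9
```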
46 | 47 | } // namespace rnnt 48 | } // namespace torchaudio 49 | -------------------------------------------------------------------------------- /packaging/windows/internal/driver_update.bat: -------------------------------------------------------------------------------- 1 | set "DRIVER_DOWNLOAD_LINK=https://ossci-windows.s3.amazonaws.com/461.09-data-center-tesla-desktop-winserver-2019-2016-international.exe" 2 | curl --retry 3 -kL %DRIVER_DOWNLOAD_LINK% --output 461.09-data-center-tesla-desktop-winserver-2019-2016-international.exe 3 | if errorlevel 1 exit /b 1 4 | 5 | start /wait 461.09-data-center-tesla-desktop-winserver-2019-2016-international.exe -s -noreboot 6 | if errorlevel 1 exit /b 1 7 | 8 | del 461.09-data-center-tesla-desktop-winserver-2019-2016-international.exe || ver > NUL 9 | 10 | setlocal EnableDelayedExpansion 11 | set NVIDIA_GPU_EXISTS=0 12 | for /F "delims=" %%i in ('wmic path win32_VideoController get name') do ( 13 | set GPUS=%%i 14 | if not "x!GPUS:NVIDIA=!" == "x!GPUS!" ( 15 | SET NVIDIA_GPU_EXISTS=1 16 | goto gpu_check_end 17 | ) 18 | ) 19 | :gpu_check_end 20 | endlocal & set NVIDIA_GPU_EXISTS=%NVIDIA_GPU_EXISTS% 21 | 22 | if "%NVIDIA_GPU_EXISTS%" == "0" ( 23 | echo "CUDA Driver installation Failed" 24 | exit /b 1 25 | ) 26 | -------------------------------------------------------------------------------- /docs/source/kaldi_io.rst: -------------------------------------------------------------------------------- 1 | .. role:: hidden 2 | :class: hidden-section 3 | 4 | torchaudio.kaldi_io 5 | ====================== 6 | 7 | .. py:module:: torchaudio.kaldi_io 8 | 9 | .. currentmodule:: torchaudio.kaldi_io 10 | 11 | To use this module, the dependency kaldi_io_ needs to be installed. 12 | This is a light wrapper around ``kaldi_io`` that returns :class:`torch.Tensor`. 13 | 14 | .. _kaldi_io: https://github.com/vesis84/kaldi-io-for-python 15 | 16 | Vectors 17 | ------- 18 | 19 | :hidden:`read_vec_int_ark` 20 | ~~~~~~~~~~~~~~~~~~~~~~~~~~ 21 | 22 | .. autofunction:: read_vec_int_ark 23 | 24 | :hidden:`read_vec_flt_scp` 25 | ~~~~~~~~~~~~~~~~~~~~~~~~~~ 26 | 27 | .. autofunction:: read_vec_flt_scp 28 | 29 | :hidden:`read_vec_flt_ark` 30 | ~~~~~~~~~~~~~~~~~~~~~~~~~~ 31 | 32 | .. autofunction:: read_vec_flt_ark 33 | 34 | Matrices 35 | -------- 36 | 37 | :hidden:`read_mat_scp` 38 | ~~~~~~~~~~~~~~~~~~~~~~ 39 | 40 | .. autofunction:: read_mat_scp 41 | 42 | :hidden:`read_mat_ark` 43 | ~~~~~~~~~~~~~~~~~~~~~~ 44 | 45 | .. autofunction:: read_mat_ark 46 | -------------------------------------------------------------------------------- /examples/avsr/data_prep/detectors/retinaface/detector.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2021 Imperial College London (Pingchuan Ma) 5 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 6 | 7 | import warnings 8 | 9 | import numpy as np 10 | from ibug.face_detection import RetinaFacePredictor 11 | 12 | warnings.filterwarnings("ignore") 13 | 14 | 15 | class LandmarksDetector: 16 | def __init__(self, device="cuda:0", model_name="resnet50"): 17 | self.face_detector = RetinaFacePredictor( 18 | device=device, threshold=0.8, model=RetinaFacePredictor.get_model(model_name) 19 | ) 20 | 21 | def __call__(self, video_frames): 22 | landmarks = [] 23 | for frame in video_frames: 24 | detected_faces = self.face_detector(frame, rgb=False) 25 | if len(detected_faces) >= 1: 26 | landmarks.append(np.reshape(detected_faces[0][:4], (2, 2))) 27 | else: 28 | landmarks.append(None) 29 | return landmarks 30 | -------------------------------------------------------------------------------- /docs/source/feature_classifications.rst: -------------------------------------------------------------------------------- 1 | Feature Classifications 2 | ======================= 3 | 4 | Features described in this documentation are classified by release status: 5 | 6 | *Stable:* These features will be maintained long-term and there should generally 7 | be no major performance limitations or gaps in documentation. 8 | We also expect to maintain backwards compatibility (although 9 | breaking changes can happen and notice will be given one release ahead 10 | of time). 11 | 12 | *Beta:* Features are tagged as Beta because the API may change based on 13 | user feedback, because the performance needs to improve, or because 14 | coverage across operators is not yet complete. For Beta features, we are 15 | committing to seeing the feature through to the Stable classification. 16 | We are not, however, committing to backwards compatibility. 17 | 18 | *Prototype:* These features are typically not available as part of 19 | binary distributions like PyPI or Conda, except sometimes behind run-time 20 | flags, and are at an early stage for feedback and testing. 
21 | -------------------------------------------------------------------------------- /examples/avsr/models/emformer_rnnt.py: -------------------------------------------------------------------------------- 1 | from torchaudio.models.rnnt import emformer_rnnt_model 2 | 3 | 4 | # https://pytorch.org/audio/master/_modules/torchaudio/models/rnnt.html#emformer_rnnt_base 5 | def emformer_rnnt(): 6 | return emformer_rnnt_model( 7 | input_dim=512, 8 | encoding_dim=1024, 9 | num_symbols=1024, 10 | segment_length=64, 11 | right_context_length=0, 12 | time_reduction_input_dim=128, 13 | time_reduction_stride=1, 14 | transformer_num_heads=4, 15 | transformer_ffn_dim=2048, 16 | transformer_num_layers=20, 17 | transformer_dropout=0.1, 18 | transformer_activation="gelu", 19 | transformer_left_context_length=30, 20 | transformer_max_memory_size=0, 21 | transformer_weight_init_scale_strategy="depthwise", 22 | transformer_tanh_on_mem=True, 23 | symbol_embedding_dim=512, 24 | num_lstm_layers=3, 25 | lstm_layer_norm=True, 26 | lstm_layer_norm_epsilon=1e-3, 27 | lstm_dropout=0.3, 28 | ) 29 | -------------------------------------------------------------------------------- /examples/avsr/schedulers.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import torch 4 | 5 | 6 | class WarmupCosineScheduler(torch.optim.lr_scheduler._LRScheduler): 7 | def __init__( 8 | self, 9 | optimizer: torch.optim.Optimizer, 10 | warmup_epochs: int, 11 | total_epochs: int, 12 | steps_per_epoch: int, 13 | last_epoch=-1, 14 | verbose=False, 15 | ): 16 | self.warmup_steps = warmup_epochs * steps_per_epoch 17 | self.total_steps = total_epochs * steps_per_epoch 18 | super().__init__(optimizer, last_epoch=last_epoch, verbose=verbose) 19 | 20 | def get_lr(self): 21 | if self._step_count < self.warmup_steps: 22 | return [self._step_count / self.warmup_steps * base_lr for base_lr in self.base_lrs] 23 | else: 24 | decay_steps = self.total_steps - self.warmup_steps 25 | return [ 26 | 0.5 * base_lr * (1 + math.cos(math.pi * (self._step_count - self.warmup_steps) / decay_steps)) 27 | for base_lr in self.base_lrs 28 | ] 29 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/functional/functional_cuda_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import torch 4 | from torchaudio_unittest.common_utils import PytorchTestCase, skipIfNoCuda 5 | 6 | from .functional_impl import Functional, FunctionalCUDAOnly 7 | 8 | 9 | @skipIfNoCuda 10 | class TestFunctionalFloat32(Functional, PytorchTestCase): 11 | dtype = torch.float32 12 | device = torch.device("cuda") 13 | 14 | @unittest.expectedFailure 15 | def test_lfilter_9th_order_filter_stability(self): 16 | super().test_lfilter_9th_order_filter_stability() 17 | 18 | 19 | @skipIfNoCuda 20 | class TestLFilterFloat64(Functional, PytorchTestCase): 21 | dtype = torch.float64 22 | device = torch.device("cuda") 23 | 24 | 25 | @skipIfNoCuda 26 | class TestFunctionalCUDAOnlyFloat32(FunctionalCUDAOnly, PytorchTestCase): 27 | dtype = torch.float32 28 | device = torch.device("cuda") 29 | 30 | 31 | @skipIfNoCuda 32 | class TestFunctionalCUDAOnlyFloat64(FunctionalCUDAOnly, PytorchTestCase): 33 | dtype = torch.float64 34 | device = torch.device("cuda") 35 | -------------------------------------------------------------------------------- /packaging/vc_env_helper.bat: 
-------------------------------------------------------------------------------- 1 | @echo on 2 | 3 | set VC_VERSION_LOWER=16 4 | set VC_VERSION_UPPER=17 5 | 6 | for /f "usebackq tokens=*" %%i in (`"%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe" -legacy -products * -version [%VC_VERSION_LOWER%^,%VC_VERSION_UPPER%^) -property installationPath`) do ( 7 | if exist "%%i" if exist "%%i\VC\Auxiliary\Build\vcvarsall.bat" ( 8 | set "VS15INSTALLDIR=%%i" 9 | set "VS15VCVARSALL=%%i\VC\Auxiliary\Build\vcvarsall.bat" 10 | goto vswhere 11 | ) 12 | ) 13 | 14 | :vswhere 15 | if "%VSDEVCMD_ARGS%" == "" ( 16 | call "%VS15VCVARSALL%" x64 || exit /b 1 17 | ) else ( 18 | call "%VS15VCVARSALL%" x64 %VSDEVCMD_ARGS% || exit /b 1 19 | ) 20 | 21 | @echo on 22 | 23 | set DISTUTILS_USE_SDK=1 24 | 25 | set args=%1 26 | shift 27 | :start 28 | if [%1] == [] goto done 29 | set args=%args% %1 30 | shift 31 | goto start 32 | 33 | :done 34 | if "%args%" == "" ( 35 | echo Usage: vc_env_helper.bat [command] [args] 36 | echo e.g. vc_env_helper.bat cl /c test.cpp 37 | ) 38 | 39 | %args% || exit /b 1 40 | -------------------------------------------------------------------------------- /src/libtorchaudio/cuctc/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Custom CMakeLists for building cuda ctc decoder 2 | 3 | set(CMAKE_CXX_VISIBILITY_PRESET default) 4 | 5 | # the following line is added in order to export symbols when building on Windows 6 | # this approach has some limitations as documented in https://github.com/pytorch/pytorch/pull/3650 7 | if (MSVC) 8 | set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON) 9 | endif() 10 | 11 | set( 12 | libctc_prefix_decoder_src 13 | src/ctc_prefix_decoder.cpp 14 | src/ctc_prefix_decoder_kernel_v2.cu 15 | ) 16 | 17 | set( 18 | additional_libs 19 | ) 20 | 21 | list( 22 | APPEND 23 | additional_libs 24 | cuda_deps 25 | ) 26 | 27 | torchaudio_library( 28 | libctc_prefix_decoder 29 | "${libctc_prefix_decoder_src}" 30 | "${CMAKE_CURRENT_SOURCE_DIR}" 31 | "${additional_libs}" 32 | "" 33 | ) 34 | 35 | if (BUILD_TORCHAUDIO_PYTHON_EXTENSION) 36 | torchaudio_extension( 37 | pybind11_prefixctc 38 | src/python_binding.cpp 39 | "${CMAKE_CURRENT_SOURCE_DIR}" 40 | "libctc_prefix_decoder;${additional_libs}" 41 | "" 42 | ) 43 | endif() 44 | -------------------------------------------------------------------------------- /test/integration_tests/rnnt_pipeline_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torchaudio 3 | from torchaudio.pipelines import EMFORMER_RNNT_BASE_LIBRISPEECH 4 | from torchaudio.prototype.pipelines import EMFORMER_RNNT_BASE_MUSTC, EMFORMER_RNNT_BASE_TEDLIUM3 5 | 6 | 7 | @pytest.mark.parametrize( 8 | "bundle,lang,expected", 9 | [ 10 | (EMFORMER_RNNT_BASE_LIBRISPEECH, "en", "i have that curiosity beside me at this moment"), 11 | (EMFORMER_RNNT_BASE_MUSTC, "en", "I had that curiosity beside me at this moment."), 12 | (EMFORMER_RNNT_BASE_TEDLIUM3, "en", "i had that curiosity beside me at this moment"), 13 | ], 14 | ) 15 | def test_rnnt(bundle, sample_speech, expected): 16 | feature_extractor = bundle.get_feature_extractor() 17 | decoder = bundle.get_decoder().eval() 18 | token_processor = bundle.get_token_processor() 19 | waveform, _ = torchaudio.load(sample_speech) 20 | features, length = feature_extractor(waveform.squeeze()) 21 | hypotheses = decoder(features, length, 10) 22 | text = token_processor(hypotheses[0][0]) 23 | assert text == expected 24 | 
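The CUDA prefix beam-search library configured by the cuctc CMakeLists.txt above is surfaced in Python as `torchaudio.models.decoder.cuda_ctc_decoder`. A hedged sketch of how it is typically driven follows; the token file name and the random emissions are placeholders, and the keyword arguments reflect my understanding of the public API rather than a verbatim copy from the repository.

```python
# Minimal sketch, assuming torchaudio.models.decoder.cuda_ctc_decoder;
# "tokens.txt" is a hypothetical token file and the emissions are dummies.
import torch
from torchaudio.models.decoder import cuda_ctc_decoder

decoder = cuda_ctc_decoder("tokens.txt", nbest=1, beam_size=10)
log_probs = torch.randn(1, 100, 29).log_softmax(dim=-1).cuda()
lengths = torch.full((1,), 100, dtype=torch.int32).cuda()
results = decoder(log_probs, lengths)  # one list of hypotheses per utterance
print(results[0][0].words)
```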
-------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | Jinja2<3.1.0 2 | matplotlib<=3.8 3 | pyparsing<3,>=2.0.2 4 | 5 | # C++ docs 6 | breathe==4.34.0 7 | 8 | # Note: 9 | # When changing Sphinx-related packages, make sure that the custom behaviors in the following 10 | # locations are working as expected. 11 | # - source/_templates/layout.html 12 | # - source/_static/css/custom.css 13 | -e git+https://github.com/pytorch/pytorch_sphinx_theme.git@32a6550#egg=pytorch_sphinx_theme 14 | sphinx==5.1.1 15 | sphinxcontrib.katex==0.8.6 16 | sphinxcontrib.bibtex==2.4.2 17 | sphinx_gallery==0.11.1 18 | nbsphinx==0.8.8 19 | 20 | # https://github.com/bmcfee/resampy/issues/106 21 | # Since 2022-07-07, the build_docs CI job started to fail. 22 | # Pinning resampy to 0.2.2 resolves this. 23 | # The real cause is not known at the moment, but the use 24 | # of librosa seems to cause it. 25 | # https://github.com/bmcfee/resampy/issues/106 26 | # In our case, the tutorial that timed out is online_asr_tutorial, 27 | # which itself does not use resampy. 28 | # However, audio_feature_augmentation_tutorial is executed before that, 29 | # and it uses librosa. 30 | resampy==0.2.2 31 | -------------------------------------------------------------------------------- /src/libtorchaudio/rnnt/compute.cpp: -------------------------------------------------------------------------------- 1 | #include <libtorchaudio/rnnt/compute.h> 2 | #include <torch/script.h> 3 | 4 | std::tuple<torch::Tensor, c10::optional<torch::Tensor>> rnnt_loss( 5 | torch::Tensor& logits, 6 | const torch::Tensor& targets, 7 | const torch::Tensor& logit_lengths, 8 | const torch::Tensor& target_lengths, 9 | int64_t blank, 10 | double clamp, 11 | bool fused_log_softmax = true) { 12 | static auto op = torch::Dispatcher::singleton() 13 | .findSchemaOrThrow("torchaudio::rnnt_loss", "") 14 | .typed<decltype(rnnt_loss)>(); 15 | return op.call( 16 | logits, 17 | targets, 18 | logit_lengths, 19 | target_lengths, 20 | blank, 21 | clamp, 22 | fused_log_softmax); 23 | } 24 | 25 | TORCH_LIBRARY_FRAGMENT(torchaudio, m) { 26 | m.def( 27 | "rnnt_loss(Tensor logits," 28 | "Tensor targets," 29 | "Tensor logit_lengths," 30 | "Tensor target_lengths," 31 | "int blank," 32 | "float clamp," 33 | "bool fused_log_softmax) -> (Tensor, Tensor?)"); 34 | } 35 | -------------------------------------------------------------------------------- /src/libtorio/ffmpeg/hw_context.cpp: -------------------------------------------------------------------------------- 1 | #include <libtorio/ffmpeg/hw_context.h> 2 | 3 | namespace torio::io { 4 | namespace { 5 | 6 | static std::mutex MUTEX; 7 | static std::map<int, AVBufferRefPtr> CUDA_CONTEXT_CACHE; 8 | 9 | } // namespace 10 | 11 | AVBufferRef* get_cuda_context(int index) { 12 | std::lock_guard<std::mutex> lock(MUTEX); 13 | if (index == -1) { 14 | index = 0; 15 | } 16 | if (CUDA_CONTEXT_CACHE.count(index) == 0) { 17 | AVBufferRef* p = nullptr; 18 | int ret = av_hwdevice_ctx_create( 19 | &p, AV_HWDEVICE_TYPE_CUDA, std::to_string(index).c_str(), nullptr, 0); 20 | TORCH_CHECK( 21 | ret >= 0, 22 | "Failed to create CUDA device context on device ", 23 | index, 24 | "(", 25 | av_err2string(ret), 26 | ")"); 27 | assert(p); 28 | CUDA_CONTEXT_CACHE.emplace(index, p); 29 | return p; 30 | } 31 | AVBufferRefPtr& buffer = CUDA_CONTEXT_CACHE.at(index); 32 | return buffer; 33 | } 34 | 35 | void clear_cuda_context_cache() { 36 | std::lock_guard<std::mutex> lock(MUTEX); 37 | CUDA_CONTEXT_CACHE.clear(); 38 | } 39 | 40 | } // namespace torio::io 41 |
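`get_cuda_context` above is the piece that backs hardware-accelerated decoding on the Python side: the first request for a device creates an FFmpeg CUDA device context, and the cache ensures later streams on the same GPU reuse it. A hedged sketch of the user-facing path, assuming the `torchaudio.io.StreamReader` API, an FFmpeg build with NVDEC support, and a placeholder file name:

```python
# Minimal sketch, assuming an NVDEC-enabled FFmpeg build;
# "input.mp4" is a placeholder file name.
from torchaudio.io import StreamReader

reader = StreamReader("input.mp4")
# Decoding on GPU 0 reaches get_cuda_context(0) via av_hwdevice_ctx_create;
# the context is cached, so subsequent streams on cuda:0 share it.
reader.add_video_stream(frames_per_chunk=1, decoder="h264_cuvid", hw_accel="cuda:0")
for (frames,) in reader.stream():
    print(frames.device)  # cuda:0 -- decoded frames stay on the GPU
```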
-------------------------------------------------------------------------------- /src/libtorchaudio/sox/types.h: -------------------------------------------------------------------------------- 1 | #ifndef TORCHAUDIO_SOX_TYPES_H 2 | #define TORCHAUDIO_SOX_TYPES_H 3 | 4 | #include <sox.h> 5 | #include <torch/script.h> 6 | 7 | namespace torchaudio::sox { 8 | 9 | enum class Format { 10 | WAV, 11 | MP3, 12 | FLAC, 13 | VORBIS, 14 | AMR_NB, 15 | AMR_WB, 16 | AMB, 17 | SPHERE, 18 | GSM, 19 | HTK, 20 | }; 21 | 22 | Format get_format_from_string(const std::string& format); 23 | 24 | enum class Encoding { 25 | NOT_PROVIDED, 26 | UNKNOWN, 27 | PCM_SIGNED, 28 | PCM_UNSIGNED, 29 | PCM_FLOAT, 30 | FLAC, 31 | ULAW, 32 | ALAW, 33 | MP3, 34 | VORBIS, 35 | AMR_WB, 36 | AMR_NB, 37 | OPUS, 38 | }; 39 | 40 | std::string to_string(Encoding v); 41 | Encoding get_encoding_from_option(const c10::optional<std::string>& encoding); 42 | 43 | enum class BitDepth : unsigned { 44 | NOT_PROVIDED = 0, 45 | B8 = 8, 46 | B16 = 16, 47 | B24 = 24, 48 | B32 = 32, 49 | B64 = 64, 50 | }; 51 | 52 | BitDepth get_bit_depth_from_option(const c10::optional<int64_t>& bit_depth); 53 | 54 | std::string get_encoding(sox_encoding_t encoding); 55 | 56 | } // namespace torchaudio::sox 57 | 58 | #endif 59 | -------------------------------------------------------------------------------- /src/torchaudio/__init__.py: -------------------------------------------------------------------------------- 1 | # Initialize extension and backend first 2 | from . import _extension # noqa # usort: skip 3 | from ._backend import ( # noqa # usort: skip 4 | AudioMetaData, 5 | get_audio_backend, 6 | info, 7 | list_audio_backends, 8 | load, 9 | save, 10 | set_audio_backend, 11 | ) 12 | 13 | from . import ( # noqa: F401 14 | compliance, 15 | datasets, 16 | functional, 17 | io, 18 | kaldi_io, 19 | models, 20 | pipelines, 21 | sox_effects, 22 | transforms, 23 | utils, 24 | ) 25 | 26 | # For BC 27 | from . import backend # noqa # usort: skip 28 | 29 | try: 30 | from .version import __version__, git_version # noqa: F401 31 | except ImportError: 32 | pass 33 | 34 | 35 | __all__ = [ 36 | "AudioMetaData", 37 | "load", 38 | "info", 39 | "save", 40 | "io", 41 | "compliance", 42 | "datasets", 43 | "functional", 44 | "models", 45 | "pipelines", 46 | "kaldi_io", 47 | "utils", 48 | "sox_effects", 49 | "transforms", 50 | "list_audio_backends", 51 | "get_audio_backend", 52 | "set_audio_backend", 53 | ] 54 | -------------------------------------------------------------------------------- /examples/self_supervised_learning/README.md: -------------------------------------------------------------------------------- 1 | # Modularized Self-supervised Learning Recipe 2 | 3 | This directory contains the modularized training recipe for audio/speech self-supervised learning. The principle is to let users easily inject a new component (model, data_module, loss function, etc.) into the existing recipe for different tasks (e.g. Wav2Vec 2.0, HuBERT, etc.). 4 | 5 | 6 | ## HuBERT Pre-training Example 7 | To get the K-Means labels for HuBERT pre-training, please check the [pre-processing step](../hubert/README.md#pre-processing-1st-iteration) in the hubert example.
8 | 9 | In order to run the HuBERT pre-training script for the first iteration, users need to go to the `examples` directory and run the following SLURM command: 10 | ``` 11 | cd examples 12 | 13 | srun \ 14 | --gpus-per-node=8 \ 15 | --ntasks-per-node=8 \ 16 | -N 4 \ 17 | --cpus-per-task=10 \ 18 | python -m self_supervised_learning.train_hubert \ 19 | --dataset-path hubert/exp/data/mfcc/ \ 20 | --exp-dir self_supervised_learning/exp_iter1 \ 21 | --feature-type mfcc \ 22 | --num-class 100 \ 23 | --max-updates 250000 \ 24 | --learning-rate 0.0005 \ 25 | --gpus 8 \ 26 | --num-nodes 4 27 | ``` 28 | -------------------------------------------------------------------------------- /src/libtorio/ffmpeg/stream_reader/post_process.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | 5 | namespace torio::io { 6 | 7 | struct IPostDecodeProcess { 8 | virtual ~IPostDecodeProcess() = default; 9 | 10 | virtual int process_frame(AVFrame* frame) = 0; 11 | virtual c10::optional<Chunk> pop_chunk() = 0; 12 | virtual bool is_buffer_ready() const = 0; 13 | virtual const std::string& get_filter_desc() const = 0; 14 | virtual FilterGraphOutputInfo get_filter_output_info() const = 0; 15 | virtual void flush() = 0; 16 | }; 17 | 18 | std::unique_ptr<IPostDecodeProcess> get_audio_process( 19 | AVRational input_time_base, 20 | AVCodecContext* codec_ctx, 21 | const std::string& desc, 22 | int frames_per_chunk, 23 | int num_chunks); 24 | 25 | std::unique_ptr<IPostDecodeProcess> get_video_process( 26 | AVRational input_time_base, 27 | AVRational frame_rate, 28 | AVCodecContext* codec_ctx, 29 | const std::string& desc, 30 | int frames_per_chunk, 31 | int num_chunks, 32 | const torch::Device& device); 33 | 34 | } // namespace torio::io 35 | -------------------------------------------------------------------------------- /src/torchaudio/models/wav2vec2/__init__.py: -------------------------------------------------------------------------------- 1 | from . import utils 2 | from .model import ( 3 | hubert_base, 4 | hubert_large, 5 | hubert_pretrain_base, 6 | hubert_pretrain_large, 7 | hubert_pretrain_model, 8 | hubert_pretrain_xlarge, 9 | hubert_xlarge, 10 | HuBERTPretrainModel, 11 | wav2vec2_base, 12 | wav2vec2_large, 13 | wav2vec2_large_lv60k, 14 | wav2vec2_model, 15 | wav2vec2_xlsr_1b, 16 | wav2vec2_xlsr_2b, 17 | wav2vec2_xlsr_300m, 18 | Wav2Vec2Model, 19 | wavlm_base, 20 | wavlm_large, 21 | wavlm_model, 22 | ) 23 | 24 | __all__ = [ 25 | "Wav2Vec2Model", 26 | "HuBERTPretrainModel", 27 | "wavlm_model", 28 | "wavlm_base", 29 | "wavlm_large", 30 | "wav2vec2_model", 31 | "wav2vec2_base", 32 | "wav2vec2_large", 33 | "wav2vec2_large_lv60k", 34 | "hubert_base", 35 | "hubert_large", 36 | "hubert_xlarge", 37 | "hubert_pretrain_model", 38 | "hubert_pretrain_base", 39 | "hubert_pretrain_large", 40 | "hubert_pretrain_xlarge", 41 | "utils", 42 | "wav2vec2_xlsr_300m", 43 | "wav2vec2_xlsr_1b", 44 | "wav2vec2_xlsr_2b", 45 | ] 46 | -------------------------------------------------------------------------------- /docs/source/_templates/autosummary/dataset_class.rst: -------------------------------------------------------------------------------- 1 | .. 2 | autogenerated from source/_templates/autosummary/dataset_class.rst 3 | 4 | {{ name | underline }} 5 | 6 | ..
--------------------------------------------------------------------------------
/docs/source/_templates/autosummary/dataset_class.rst:
--------------------------------------------------------------------------------
1 | ..
2 |    autogenerated from source/_templates/autosummary/dataset_class.rst
3 | 
4 | {{ name | underline }}
5 | 
6 | .. autoclass:: {{ fullname }}
7 | 
8 | {%- if "get_metadata" in methods %}
9 | {%- set meth=["__getitem__", "get_metadata"] %}
10 | {%- else %}
11 | {%- set meth=["__getitem__"] %}
12 | {%- endif %}
13 | 
14 | {%- if name == "CMUDict" %}
15 | {%- set properties=["symbols"] %}
16 | {%- elif name == "TEDLIUM" %}
17 | {%- set properties=["phoneme_dict"] %}
18 | {%- else %}
19 | {%- set properties=[] %}
20 | {%- endif %}
21 | 
22 | {%- if properties %}
23 | 
24 | Properties
25 | ==========
26 | 
27 | {% for item in properties %}
28 | 
29 | {{item | underline("-") }}
30 | 
31 | .. container:: py attribute
32 | 
33 |    .. autoproperty:: {{[fullname, item] | join('.')}}
34 | 
35 | {%- endfor %}
36 | 
37 | {%- endif %}
38 | 
39 | {%- if properties %}
40 | 
41 | Methods
42 | =======
43 | 
44 | {%- endif %}
45 | 
46 | {% for item in meth %}
47 | 
48 | {{item | underline("-") }}
49 | 
50 | .. container:: py attribute
51 | 
52 |    .. automethod:: {{[fullname, item] | join('.')}}
53 | 
54 | {%- endfor %}
55 | 
--------------------------------------------------------------------------------
/examples/self_supervised_learning/lr_schedulers/_linear_decay.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch.optim.optimizer import Optimizer
3 | 
4 | 
5 | class LinearDecayLRScheduler(torch.optim.lr_scheduler._LRScheduler):
6 |     """Linear learning rate scheduler with warm up."""
7 | 
8 |     def __init__(
9 |         self,
10 |         optimizer: Optimizer,
11 |         warmup_updates: int,
12 |         max_updates: int,
13 |         last_epoch: int = -1,
14 |         verbose: bool = False,
15 |     ):
16 |         self.warmup_updates = warmup_updates
17 |         self.max_updates = max_updates
18 |         super().__init__(optimizer, last_epoch=last_epoch, verbose=verbose)
19 | 
20 |     def get_lr(self):
21 |         if self._step_count <= self.warmup_updates:
22 |             return [self._step_count / self.warmup_updates * base_lr for base_lr in self.base_lrs]
23 |         elif self._step_count >= self.max_updates:
24 |             return [0.0 for _ in self.base_lrs]
25 |         else:
26 |             pct_remaining = (self.max_updates - self._step_count) / (self.max_updates - self.warmup_updates)
27 |             return [base_lr * pct_remaining for base_lr in self.base_lrs]
28 | 
--------------------------------------------------------------------------------
/examples/libtorchaudio/README.md:
--------------------------------------------------------------------------------
1 | # Libtorchaudio Examples
2 | 
3 | * [Augmentation](./augmentation)
4 | * [Speech Recognition with wav2vec2.0](./speech_recognition)
5 | 
6 | ## Build
7 | 
8 | The example applications in this directory depend on `libtorch` and `libtorchaudio`.
9 | If you have a working `PyTorch`, you already have `libtorch`.
10 | Please refer to [this tutorial](https://pytorch.org/tutorials/advanced/torch_script_custom_classes.html) for the use of `libtorch` and TorchScript.
11 | 
12 | `libtorchaudio` is the library of torchaudio's C++ components, without the Python components.
13 | It is currently not distributed, and it will be built alongside the applications.
14 | 
15 | The following commands build `libtorchaudio` and the applications.
16 | 
17 | ```bash
18 | git submodule update
19 | mkdir build
20 | cd build
21 | cmake -GNinja \
22 |       -DCMAKE_PREFIX_PATH="$(python -c 'import torch;print(torch.utils.cmake_prefix_path)')" \
23 |       -DBUILD_SOX=ON \
24 |       -DBUILD_KALDI=OFF \
25 |       -DBUILD_RNNT=ON \
26 |       ..
27 | cmake --build .
28 | ```
29 | 
30 | For the usage of each application, refer to the corresponding application directory.
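
The TorchScript files consumed by these applications are produced from Python. The following is a minimal sketch of the export step (the module body and file name are illustrative, not the exact pipeline used by the examples):

```python
import torch

class Pipeline(torch.nn.Module):
    def forward(self, waveform: torch.Tensor) -> torch.Tensor:
        # Placeholder processing; real pipelines chain torchaudio transforms here.
        return 0.5 * waveform

# The saved archive can be loaded from C++ via torch::jit::load.
torch.jit.script(Pipeline()).save("pipeline.zip")
```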
31 | 
--------------------------------------------------------------------------------
/.github/workflows/integration-test.yml:
--------------------------------------------------------------------------------
1 | name: Integration Test
2 | 
3 | on:
4 |   pull_request:
5 |     branches: [ main ]
6 | 
7 |   workflow_dispatch:
8 | 
9 | jobs:
10 |   build:
11 | 
12 |     runs-on: ubuntu-22.04
13 |     strategy:
14 |       fail-fast: false
15 |       matrix:
16 |         python-version: [ 3.8 ]
17 | 
18 |     steps:
19 |     - uses: actions/checkout@v2
20 |     - name: Set up Python ${{ matrix.python-version }}
21 |       uses: actions/setup-python@v2
22 |       with:
23 |         python-version: ${{ matrix.python-version }}
24 |     - name: Install dependencies
25 |       run: |
26 |         sudo apt install -y -qq libavfilter-dev libavdevice-dev
27 |     - name: Install packages
28 |       run: |
29 |         python -m pip install --quiet --upgrade pip
30 |         python -m pip install --quiet --pre torch -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html
31 |         python -m pip install --quiet pytest requests cmake ninja deep-phonemizer sentencepiece flashlight-text git+https://github.com/kpu/kenlm
32 |         python setup.py install
33 |     - name: Run integration test
34 |       run: |
35 |         cd test && pytest integration_tests -v --use-tmp-hub-dir
36 | 
--------------------------------------------------------------------------------
/examples/avsr/data_prep/tools/README.md:
--------------------------------------------------------------------------------
1 | ## Face Recognition
2 | We provide [ibug.face_detection](https://github.com/hhj1897/face_detection) in this repository. You can install it directly from the GitHub repository or by using compressed files.
3 | 
4 | ### Option 1. Install from the GitHub repository
5 | 
6 | * [Git LFS](https://git-lfs.github.com/) is needed for downloading the pretrained weights that are larger than 100 MB.
7 | 
8 | You could install *`Homebrew`* and then install *`git-lfs`* without sudo privileges.
9 | 
10 | ```Shell
11 | git clone https://github.com/hhj1897/face_detection.git
12 | cd face_detection
13 | git lfs pull
14 | pip install -e .
15 | cd ..
16 | ```
17 | 
18 | ### Option 2. Install by using compressed files
19 | 
20 | If you are experiencing over-quota issues with the repository above, you can download the package as [face_detection.zip](https://www.doc.ic.ac.uk/~pm4115/tracker/face_detection.zip), unzip the files, and then run `pip install -e .` to install it.
21 | 
22 | ```Shell
23 | wget https://www.doc.ic.ac.uk/~pm4115/tracker/face_detection.zip -O ./face_detection.zip
24 | unzip -o ./face_detection.zip -d ./
25 | cd face_detection
26 | pip install -e .
27 | cd ..
28 | ```
29 | 
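After either installation option, a quick smoke test confirms the package is importable. This snippet is illustrative only; the class and model names follow the ibug.face_detection README and may differ across versions:

```python
from ibug.face_detection import RetinaFacePredictor

detector = RetinaFacePredictor(
    device="cpu", model=RetinaFacePredictor.get_model("resnet50")
)
print(type(detector))
```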
--------------------------------------------------------------------------------
/test/torchaudio_unittest/prototype/conv_emformer_test_impl.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torchaudio.prototype.models.conv_emformer import ConvEmformer
3 | from torchaudio_unittest.common_utils import TestBaseMixin
4 | from torchaudio_unittest.models.emformer.emformer_test_impl import EmformerTestMixin
5 | 
6 | 
7 | class ConvEmformerTestImpl(EmformerTestMixin, TestBaseMixin):
8 |     def gen_model(self, input_dim, right_context_length):
9 |         emformer = ConvEmformer(
10 |             input_dim,
11 |             8,
12 |             256,
13 |             3,
14 |             4,
15 |             12,
16 |             left_context_length=30,
17 |             right_context_length=right_context_length,
18 |             max_memory_size=1,
19 |         ).to(device=self.device, dtype=self.dtype)
20 |         return emformer
21 | 
22 |     def gen_inputs(self, input_dim, batch_size, num_frames, right_context_length):
23 |         input = torch.rand(batch_size, num_frames, input_dim).to(device=self.device, dtype=self.dtype)
24 |         lengths = torch.randint(1, num_frames - right_context_length, (batch_size,)).to(
25 |             device=self.device, dtype=self.dtype
26 |         )
27 |         return input, lengths
28 | 
--------------------------------------------------------------------------------
/examples/pipeline_wavernn/processing.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | 
4 | 
5 | class NormalizeDB(nn.Module):
6 |     r"""Normalize the spectrogram with a minimum dB value"""
7 | 
8 |     def __init__(self, min_level_db, normalization):
9 |         super().__init__()
10 |         self.min_level_db = min_level_db
11 |         self.normalization = normalization
12 | 
13 |     def forward(self, specgram):
14 |         specgram = torch.log10(torch.clamp(specgram.squeeze(0), min=1e-5))
15 |         if self.normalization:
16 |             return torch.clamp((self.min_level_db - 20 * specgram) / self.min_level_db, min=0, max=1)
17 |         return specgram
18 | 
19 | 
20 | def normalized_waveform_to_bits(waveform: torch.Tensor, bits: int) -> torch.Tensor:
21 |     r"""Transform waveform [-1, 1] to label [0, 2 ** bits - 1]"""
22 | 
23 |     assert abs(waveform).max() <= 1.0
24 |     waveform = (waveform + 1.0) * (2**bits - 1) / 2
25 |     return torch.clamp(waveform, 0, 2**bits - 1).int()
26 | 
27 | 
28 | def bits_to_normalized_waveform(label: torch.Tensor, bits: int) -> torch.Tensor:
29 |     r"""Transform label [0, 2 ** bits - 1] to waveform [-1, 1]"""
30 | 
31 |     return 2 * label / (2**bits - 1.0) - 1.0
32 | 
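A quick round-trip through the two conversion functions above shows that the quantization error is bounded by one step, i.e. 2 / (2 ** bits - 1). A minimal sketch, assuming `processing.py` is on the import path:

```python
import torch
from processing import bits_to_normalized_waveform, normalized_waveform_to_bits

bits = 8
waveform = torch.rand(1, 100) * 2 - 1  # values in [-1, 1]
labels = normalized_waveform_to_bits(waveform, bits)
recovered = bits_to_normalized_waveform(labels, bits)
assert (waveform - recovered).abs().max() < 2 / (2**bits - 1)
```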
--------------------------------------------------------------------------------
/src/libtorio/ffmpeg/stream_reader/buffer/chunked_buffer.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | #include <libtorio/ffmpeg/ffmpeg.h>
3 | #include <libtorio/ffmpeg/stream_reader/typedefs.h>
4 | 
5 | namespace torio::io::detail {
6 | 
7 | class ChunkedBuffer {
8 |   // Each AVFrame is converted to a Tensor and stored here.
9 |   std::deque<torch::Tensor> chunks;
10 |   // Time stamps corresponding to the first frame of each chunk
11 |   std::deque<int64_t> pts;
12 |   AVRational time_base;
13 | 
14 |   // The number of frames to return as a chunk
15 |   // If <0, then the user wants to receive all the frames
16 |   const int64_t frames_per_chunk;
17 |   // The number of chunks to retain
18 |   const int64_t num_chunks;
19 |   // The number of currently stored frames.
20 |   // For video, one Tensor corresponds to one frame, but for audio,
21 |   // one Tensor contains multiple samples, so we track the frame count here.
22 |   int64_t num_buffered_frames = 0;
23 | 
24 |  public:
25 |   ChunkedBuffer(AVRational time_base, int frames_per_chunk, int num_chunks);
26 | 
27 |   bool is_ready() const;
28 |   void flush();
29 |   c10::optional<Chunk> pop_chunk();
30 |   void push_frame(torch::Tensor frame, int64_t pts_);
31 | };
32 | 
33 | } // namespace torio::io::detail
34 | 
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/documentation.yml:
--------------------------------------------------------------------------------
1 | name: 📚 Documentation
2 | description: Report an issue related to https://pytorch.org/audio/stable/index.html
3 | 
4 | body:
5 | - type: markdown
6 |   attributes:
7 |     value: >
8 |       PLEASE NOTE THAT THE TORCHAUDIO REPOSITORY IS NO LONGER ACTIVELY MONITORED. You will likely not get a response. For open discussions, visit https://discuss.pytorch.org/.
9 | - type: textarea
10 |   attributes:
11 |     label: 📚 The doc issue
12 |     description: >
13 |       A description of what content in https://pytorch.org/audio/stable/index.html is an issue. If this has to do with the general https://pytorch.org website, please file an issue at https://github.com/pytorch/pytorch.github.io/issues/new/choose instead. If this has to do with https://pytorch.org/tutorials, please file an issue at https://github.com/pytorch/tutorials/issues/new.
14 |   validations:
15 |     required: true
16 | - type: textarea
17 |   attributes:
18 |     label: Suggest a potential alternative/fix
19 |     description: >
20 |       Tell us how we could improve the documentation in this regard.
21 | - type: markdown
22 |   attributes:
23 |     value: >
24 |       Thanks for contributing 🎉!
25 | 
--------------------------------------------------------------------------------
/docs/source/datasets.rst:
--------------------------------------------------------------------------------
1 | .. py:module:: torchaudio.datasets
2 | 
3 | torchaudio.datasets
4 | ====================
5 | 
6 | All datasets are subclasses of :class:`torch.utils.data.Dataset`
7 | and have ``__getitem__`` and ``__len__`` methods implemented.
8 | 
9 | Hence, they can all be passed to a :class:`torch.utils.data.DataLoader`,
10 | which can load multiple samples in parallel using :mod:`torch.multiprocessing` workers.
11 | For example:
12 | 
13 | .. code::
14 | 
15 |     yesno_data = torchaudio.datasets.YESNO('.', download=True)
16 |     data_loader = torch.utils.data.DataLoader(
17 |         yesno_data,
18 |         batch_size=1,
19 |         shuffle=True,
20 |         num_workers=args.nThreads)
21 | 
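For batch sizes greater than one, variable-length waveforms need to be collated
before batching. A minimal sketch of such a collate function (using ``YESNO``
items, whose first element is the waveform) is:

.. code::

    import torch

    def collate_fn(batch):
        waveforms = [item[0].squeeze(0) for item in batch]  # (time,) each
        lengths = torch.tensor([w.numel() for w in waveforms])
        padded = torch.nn.utils.rnn.pad_sequence(waveforms, batch_first=True)
        return padded, lengths
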
22 | .. currentmodule:: torchaudio.datasets
23 | 
24 | .. autosummary::
25 |     :toctree: generated
26 |     :nosignatures:
27 |     :template: autosummary/dataset_class.rst
28 | 
29 |     CMUARCTIC
30 |     CMUDict
31 |     COMMONVOICE
32 |     DR_VCTK
33 |     FluentSpeechCommands
34 |     GTZAN
35 |     IEMOCAP
36 |     LibriMix
37 |     LIBRISPEECH
38 |     LibriLightLimited
39 |     LIBRITTS
40 |     LJSPEECH
41 |     MUSDB_HQ
42 |     QUESST14
43 |     Snips
44 |     SPEECHCOMMANDS
45 |     TEDLIUM
46 |     VCTK_092
47 |     VoxCeleb1Identification
48 |     VoxCeleb1Verification
49 |     YESNO
50 | 
--------------------------------------------------------------------------------
/src/libtorchaudio/sox/io.h:
--------------------------------------------------------------------------------
1 | #ifndef TORCHAUDIO_SOX_IO_H
2 | #define TORCHAUDIO_SOX_IO_H
3 | 
4 | #include <libtorchaudio/sox/utils.h>
5 | #include <torch/script.h>
6 | 
7 | namespace torchaudio::sox {
8 | 
9 | auto get_effects(
10 |     const c10::optional<int64_t>& frame_offset,
11 |     const c10::optional<int64_t>& num_frames)
12 |     -> std::vector<std::vector<std::string>>;
13 | 
14 | std::tuple<int64_t, int64_t, int64_t, int64_t, std::string> get_info_file(
15 |     const std::string& path,
16 |     const c10::optional<std::string>& format);
17 | 
18 | std::tuple<torch::Tensor, int64_t> load_audio_file(
19 |     const std::string& path,
20 |     const c10::optional<int64_t>& frame_offset,
21 |     const c10::optional<int64_t>& num_frames,
22 |     c10::optional<bool> normalize,
23 |     c10::optional<bool> channels_first,
24 |     const c10::optional<std::string>& format);
25 | 
26 | void save_audio_file(
27 |     const std::string& path,
28 |     torch::Tensor tensor,
29 |     int64_t sample_rate,
30 |     bool channels_first,
31 |     c10::optional<double> compression,
32 |     c10::optional<std::string> format,
33 |     c10::optional<std::string> encoding,
34 |     c10::optional<int64_t> bits_per_sample);
35 | 
36 | } // namespace torchaudio::sox
37 | 
38 | #endif
39 | 
--------------------------------------------------------------------------------
/.github/workflows/lint.yml:
--------------------------------------------------------------------------------
1 | name: Lint
2 | 
3 | on:
4 |   pull_request:
5 |   push:
6 |     branches:
7 |       - nightly
8 |       - main
9 |       - release/*
10 |   workflow_dispatch:
11 | 
12 | jobs:
13 |   python-source-and-configs:
14 |     uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
15 |     with:
16 |       repository: pytorch/audio
17 |       script: |
18 |         set -euo pipefail
19 | 
20 |         echo '::group::Setup environment'
21 |         eval "$("$(which conda)" shell.bash hook)"
22 |         # libcst does not have 3.11 pre-built binaries yet. Use python 3.10
23 |         conda create -y --name env python=3.10
24 |         conda activate env
25 |         pip3 install --progress-bar=off pre-commit
26 |         echo '::endgroup::'
27 | 
28 |         set +e
29 |         pre-commit run --all-files --show-diff-on-failure
30 |         status=$?
31 | 
32 |         echo '::group::Add Summary'
33 |         if [ $status -ne 0 ]; then
34 |           echo '### Lint failure' >> $GITHUB_STEP_SUMMARY
35 |           echo '```diff' >> $GITHUB_STEP_SUMMARY
36 |           git --no-pager diff >> $GITHUB_STEP_SUMMARY
37 |           echo '```' >> $GITHUB_STEP_SUMMARY
38 |         fi
39 |         echo '::endgroup::'
40 |         exit $status
41 | 
--------------------------------------------------------------------------------
/examples/libtorchaudio/augmentation/README.md:
--------------------------------------------------------------------------------
1 | # Augmentation
2 | 
3 | This example demonstrates how you can use torchaudio's I/O features and augmentations in a C++ application.
4 | 
5 | **NOTE**
6 | This example uses the `"sox_io"` backend, thus it does not work on Windows.
7 | 
8 | ## Steps
9 | ### 1. Create the augmentation pipeline TorchScript file.
10 | 
11 | First, we implement our data processing pipeline as regular Python, and save it as a TorchScript object.
12 | We will load and execute it in our C++ application. The C++ code is found in [`main.cpp`](./main.cpp).
13 | 
14 | ```bash
15 | python create_jittable_pipeline.py \
16 |     --rir-path "../data/rir.wav" \
17 |     --output-path "./pipeline.zip"
18 | ```
19 | 
20 | ### 2. Build the application
21 | 
22 | Please refer to [the top level README.md](../README.md).
23 | 
24 | ### 3. Run the application
25 | 
26 | Now we run the C++ application `augment`, with the TorchScript object we created in Step 1 and an input audio file.
27 | 
28 | In [the top level directory](../):
29 | 
30 | ```bash
31 | input_audio_file="./data/input.wav"
32 | ./build/augmentation/augment ./augmentation/pipeline.zip "${input_audio_file}" "output.wav"
33 | ```
34 | 
35 | When you give it a clean speech file, the output audio sounds like a phone conversation.
36 | 
--------------------------------------------------------------------------------
/test/torchaudio_unittest/assets/wav2vec2/huggingface/generate_huggingface_model_config.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 | 
4 | from transformers import Wav2Vec2Model
5 | 
6 | _THIS_DIR = os.path.dirname(os.path.abspath(__file__))
7 | 
8 | 
9 | def _main():
10 |     keys = [
11 |         # pretrained
12 |         "facebook/wav2vec2-base",
13 |         "facebook/wav2vec2-large",
14 |         "facebook/wav2vec2-large-lv60",
15 |         "facebook/wav2vec2-base-10k-voxpopuli",
16 |         "facebook/wav2vec2-large-xlsr-53",
17 |         # finetuned
18 |         "facebook/wav2vec2-base-960h",
19 |         "facebook/wav2vec2-large-960h",
20 |         "facebook/wav2vec2-large-960h-lv60",
21 |         "facebook/wav2vec2-large-960h-lv60-self",
22 |         "facebook/wav2vec2-large-xlsr-53-german",
23 |     ]
24 |     for key in keys:
25 |         path = os.path.join(_THIS_DIR, f"{key}.json")
26 |         print("Generating ", path)
27 |         cfg = Wav2Vec2Model.from_pretrained(key).config
28 |         cfg = json.loads(cfg.to_json_string())
29 |         del cfg["_name_or_path"]
30 | 
31 |         with open(path, "w") as file_:
32 |             file_.write(json.dumps(cfg, indent=4, sort_keys=True))
33 |             file_.write("\n")
34 | 
35 | 
36 | if __name__ == "__main__":
37 |     _main()
38 | 
--------------------------------------------------------------------------------
/test/torchaudio_unittest/functional/kaldi_compatibility_test_impl.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torchaudio.functional as F
3 | from torchaudio_unittest.common_utils import skipIfNoExec, TempDirMixin, TestBaseMixin
4 | from torchaudio_unittest.common_utils.kaldi_utils import convert_args, run_kaldi
5 | 
6 | 
7 | class Kaldi(TempDirMixin, TestBaseMixin):
8 |     def assert_equal(self, output, *, expected, rtol=None, atol=None):
9 |         expected = expected.to(dtype=self.dtype, device=self.device)
10 |         self.assertEqual(output, expected, rtol=rtol, atol=atol)
11 | 
12 |     @skipIfNoExec("apply-cmvn-sliding")
13 |     def test_sliding_window_cmn(self):
14 |         """sliding_window_cmn should be numerically compatible with apply-cmvn-sliding"""
15 |         kwargs = {
16 |             "cmn_window": 600,
17 |             "min_cmn_window": 100,
18 |             "center": False,
19 |             "norm_vars": False,
20 |         }
21 | 
22 |         tensor = torch.randn(40, 10, dtype=self.dtype, device=self.device)
23 |         result = F.sliding_window_cmn(tensor, **kwargs)
24 |         command = ["apply-cmvn-sliding"] + convert_args(**kwargs) + ["ark:-", "ark:-"]
25 |         kaldi_result = run_kaldi(command, "ark", tensor)
26 |         self.assert_equal(result, expected=kaldi_result)
27 | 
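The functional exercised by this test can also be used directly. A minimal sketch of applying the same normalization outside the test harness:

```python
import torch
import torchaudio.functional as F

specgram = torch.randn(40, 10)  # (frames, features), mirroring the test input
normalized = F.sliding_window_cmn(
    specgram, cmn_window=600, min_cmn_window=100, center=False, norm_vars=False
)
print(normalized.shape)  # same shape as the input
```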
--------------------------------------------------------------------------------
/.github/scripts/unittest-windows/set_cuda_envs.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | set -euxo pipefail
3 | 
4 | if [ -z "${CUDA_VERSION:-}" ] ; then
5 |     version="cpu"
6 | else
7 |     version="$CUDA_VERSION"
8 | fi
9 | 
10 | # Don't use `if [[ "$version" == "cpu" ]]; then exit 0; fi` here.
11 | # That would exit the whole shell, and one consequence is that the CPU tests would not run.
12 | # Unless there's an error, don't exit.
13 | if [[ "$version" != "cpu" ]]; then
14 |     # set cuda envs
15 |     export PATH="/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v${version}/bin:/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v${version}/libnvvp:$PATH"
16 |     export CUDA_PATH_V${version/./_}="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v${version}"
17 |     export CUDA_PATH="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v${version}"
18 | 
19 |     if [ ! -d "$CUDA_PATH" ]
20 |     then
21 |         echo "$CUDA_PATH" does not exist
22 |         exit 1
23 |     fi
24 | 
25 |     # check cuda driver version
26 |     for path in '/c/Program Files/NVIDIA Corporation/NVSMI/nvidia-smi.exe' /c/Windows/System32/nvidia-smi.exe; do
27 |         if [[ -x "$path" ]]; then
28 |             "$path" || echo "true";
29 |             break
30 |         fi
31 |     done
32 | 
33 |     which nvcc
34 |     nvcc --version
35 |     env | grep CUDA
36 | fi
37 | 
--------------------------------------------------------------------------------
/examples/avsr/models/fusion.py:
--------------------------------------------------------------------------------
1 | import torch
2 | 
3 | 
4 | class FeedForwardModule(torch.nn.Module):
5 |     r"""Positionwise feed forward layer.
6 | 
7 |     Args:
8 |         input_dim (int): input dimension.
9 |         hidden_dim (int): hidden dimension.
10 |         output_dim (int): output dimension.
11 |         dropout (float, optional): dropout probability. (Default: 0.0)
12 |     """
13 | 
14 |     def __init__(self, input_dim: int, hidden_dim: int, output_dim: int, dropout: float = 0.0) -> None:
15 |         super().__init__()
16 |         self.sequential = torch.nn.Sequential(
17 |             torch.nn.LayerNorm(input_dim),
18 |             torch.nn.Linear(input_dim, hidden_dim, bias=True),
19 |             torch.nn.SiLU(),
20 |             torch.nn.Dropout(dropout),
21 |             torch.nn.Linear(hidden_dim, output_dim, bias=True),
22 |             torch.nn.Dropout(dropout),
23 |         )
24 | 
25 |     def forward(self, input: torch.Tensor) -> torch.Tensor:
26 |         r"""
27 |         Args:
28 |             input (torch.Tensor): input, with shape `(*, input_dim)`.
29 | 
30 |         Returns:
31 |             torch.Tensor: output, with shape `(*, output_dim)`.
32 |         """
33 |         return self.sequential(input)
34 | 
35 | 
36 | def fusion_module(input_dim=1024, hidden_dim=3072, output_dim=512, dropout=0.1):
37 |     return FeedForwardModule(input_dim, hidden_dim, output_dim, dropout)
38 | 
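With the defaults, this maps 1024-dimensional audiovisual features down to 512 dimensions. A minimal usage sketch, assuming `fusion.py` is on the import path:

```python
import torch
from fusion import fusion_module

fusion = fusion_module()             # defaults: 1024 -> 3072 -> 512
features = torch.randn(8, 50, 1024)  # (batch, time, feature)
fused = fusion(features)
print(fused.shape)                   # torch.Size([8, 50, 512])
```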
--------------------------------------------------------------------------------
/docs/source/prototype.models.rst:
--------------------------------------------------------------------------------
1 | torchaudio.prototype.models
2 | ===========================
3 | 
4 | .. py:module:: torchaudio.prototype.models
5 | .. currentmodule:: torchaudio.prototype.models
6 | 
7 | 
8 | The ``torchaudio.prototype.models`` subpackage contains definitions of models for addressing common audio tasks.
9 | 
10 | .. note::
11 |    For models with pre-trained parameters, please refer to the :mod:`torchaudio.prototype.pipelines` module.
12 | 
13 | Model definitions are responsible for constructing computation graphs and executing them.
14 | 
15 | Some models have complex structure and variations.
16 | For such models, factory functions are provided.
17 | 
18 | .. autosummary::
19 |     :toctree: generated
20 |     :nosignatures:
21 |     :template: autosummary/prototype_model_class.rst
22 | 
23 |     ConformerWav2Vec2PretrainModel
24 |     ConvEmformer
25 |     HiFiGANVocoder
26 | 
27 | Prototype Factory Functions of Beta Models
28 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
29 | 
30 | .. currentmodule:: torchaudio.models
31 | 
32 | Some model definitions are in beta, but there are new factory functions that are still in prototype. Please check the "Prototype Factory Functions" section in each model.
33 | 
34 | .. autosummary::
35 |     :toctree: generated
36 |     :nosignatures:
37 |     :template: autosummary/model_class.rst
38 | 
39 |     Wav2Vec2Model
40 |     RNNT
41 | 
--------------------------------------------------------------------------------
/third_party/ffmpeg/single/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | # CMake file for searching existing FFmpeg installation and defining ffmpeg TARGET
2 | 
3 | message(STATUS "Searching existing FFmpeg installation")
4 | message(STATUS FFMPEG_ROOT=$ENV{FFMPEG_ROOT})
5 | if (NOT DEFINED ENV{FFMPEG_ROOT})
6 |   message(FATAL_ERROR "Environment variable FFMPEG_ROOT is not set.")
7 | endif()
8 | 
9 | set(_root $ENV{FFMPEG_ROOT})
10 | set(lib_dirs "${_root}/lib" "${_root}/bin")
11 | set(include_dir "${_root}/include")
12 | 
13 | add_library(ffmpeg INTERFACE)
14 | target_include_directories(ffmpeg INTERFACE "${include_dir}")
15 | 
16 | function (_find_ffmpeg_lib component)
17 |   find_path("${component}_header"
18 |     NAMES "lib${component}/${component}.h"
19 |     PATHS "${include_dir}"
20 |     DOC "The include directory for ${component}"
21 |     REQUIRED
22 |     NO_DEFAULT_PATH)
23 |   find_library("lib${component}"
24 |     NAMES "${component}"
25 |     PATHS ${lib_dirs}
26 |     DOC "${component} library"
27 |     REQUIRED
28 |     NO_DEFAULT_PATH)
29 |   message(STATUS "Found ${component}: ${lib${component}}")
30 |   target_link_libraries(
31 |     ffmpeg
32 |     INTERFACE
33 |     ${lib${component}})
34 | endfunction ()
35 | 
36 | _find_ffmpeg_lib(avutil)
37 | _find_ffmpeg_lib(avcodec)
38 | _find_ffmpeg_lib(avformat)
39 | _find_ffmpeg_lib(avdevice)
40 | _find_ffmpeg_lib(avfilter)
41 | 
--------------------------------------------------------------------------------
/docs/source/_templates/autosummary/cuda_ctc_decoder_class.rst:
--------------------------------------------------------------------------------
1 | ..
2 |    autogenerated from source/_templates/autosummary/cuda_ctc_decoder_class.rst
3 | 
4 | 
5 | {#
6 | ################################################################################
7 | # autosummary template for CUCTCDecoder
8 | # Since the class has multiple methods and support structures,
9 | # we want to have them show up in the table of contents.
10 | # The default class template does not do this, so we use a custom one here.
11 | ################################################################################
12 | #}
13 | 
14 | {{ name | underline }}
15 | 
16 | {%- if name != "CUCTCDecoder" %}
17 | 
18 | .. autofunction:: {{fullname}}
19 | 
20 | {%- else %}
21 | 
22 | .. autoclass:: {{ fullname }}()
23 | 
24 | Methods
25 | =======
26 | 
27 | {%- for item in members %}
28 | {%- if not item.startswith('_') or item == "__call__" %}
29 | 
30 | {{ item | underline("-") }}
31 | 
32 | .. container:: py attribute
33 | 
34 |    .. automethod:: {{[fullname, item] | join('.')}}
35 | 
36 | {%- endif %}
37 | {%- endfor %}
38 | 
39 | Support Structures
40 | ==================
41 | 
42 | {%- for item in ["CUCTCHypothesis"] %}
43 | 
44 | {{ item | underline("-") }}
45 | 
46 | .. autoclass:: torchaudio.models.decoder.{{item}}
47 |    :members:
48 | 
49 | {%- endfor %}
50 | 
51 | {%- endif %}
52 | 
--------------------------------------------------------------------------------
/.clang-tidy:
--------------------------------------------------------------------------------
1 | ---
2 | # NOTE there must be no spaces before the '-' and check name.
3 | # If you edit this list, please verify the list of enabled checks with
4 | # clang-tidy --list-checks
5 | InheritParentConfig: true
6 | Checks: '
7 | bugprone-*,
8 | -bugprone-forward-declaration-namespace,
9 | -bugprone-macro-parentheses,
10 | -clang-analyzer-*,
11 | cppcoreguidelines-*,
12 | -cppcoreguidelines-interfaces-global-init,
13 | -cppcoreguidelines-owning-memory,
14 | -cppcoreguidelines-pro-bounds-array-to-pointer-decay,
15 | -cppcoreguidelines-pro-bounds-constant-array-index,
16 | -cppcoreguidelines-pro-bounds-pointer-arithmetic,
17 | -cppcoreguidelines-pro-type-cstyle-cast,
18 | -cppcoreguidelines-pro-type-reinterpret-cast,
19 | -cppcoreguidelines-pro-type-static-cast-downcast,
20 | -cppcoreguidelines-pro-type-union-access,
21 | -cppcoreguidelines-pro-type-vararg,
22 | -cppcoreguidelines-special-member-functions,
23 | -facebook-hte-RelativeInclude,
24 | hicpp-exception-baseclass,
25 | hicpp-avoid-goto,
26 | modernize-*,
27 | -modernize-concat-nested-namespaces,
28 | -modernize-return-braced-init-list,
29 | -modernize-use-auto,
30 | -modernize-use-default-member-init,
31 | -modernize-use-trailing-return-type,
32 | -modernize-use-using,
33 | performance-unnecessary-value-param,
34 | '
35 | HeaderFilterRegex: 'torchaudio/.*'
36 | AnalyzeTemporaryDtors: false
37 | CheckOptions:
38 | ...
39 | 
--------------------------------------------------------------------------------
/examples/pipeline_wav2letter/languagemodels.py:
--------------------------------------------------------------------------------
1 | import collections
2 | import itertools
3 | 
4 | 
5 | class LanguageModel:
6 |     def __init__(self, labels, char_blank, char_space):
7 | 
8 |         self.char_space = char_space
9 |         self.char_blank = char_blank
10 | 
11 |         labels = list(labels)
12 |         self.length = len(labels)
13 |         enumerated = list(enumerate(labels))
14 |         flipped = [(sub[1], sub[0]) for sub in enumerated]
15 | 
16 |         d1 = collections.OrderedDict(enumerated)
17 |         d2 = collections.OrderedDict(flipped)
18 |         self.mapping = {**d1, **d2}
19 | 
20 |     def encode(self, iterable):
21 |         if isinstance(iterable, list):
22 |             return [self.encode(i) for i in iterable]
23 |         else:
24 |             return [self.mapping[i] + self.mapping[self.char_blank] for i in iterable]
25 | 
26 |     def decode(self, tensor):
27 |         if len(tensor) > 0 and isinstance(tensor[0], list):
28 |             return [self.decode(t) for t in tensor]
29 |         else:
30 |             # not idempotent, since we clean up the string
31 |             x = (self.mapping[i] for i in tensor)
32 |             x = "".join(i for i, _ in itertools.groupby(x))
33 |             x = x.replace(self.char_blank, "")
34 |             # x = x.strip()
35 |             return x
36 | 
37 |     def __len__(self):
38 |         return self.length
39 | 
--------------------------------------------------------------------------------
/packaging/cut_release.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | #
3 | # Usage (run from root of project):
4 | # TEST_INFRA_BRANCH=release/2.1 RELEASE_BRANCH=release/2.1 RELEASE_VERSION=2.1.0 packaging/cut_release.sh
5 | #
6 | # TEST_INFRA_BRANCH: The release branch of test-infra that houses all reusable
7 | # workflows
8 | #
9 | # RELEASE_BRANCH: The name of the release branch for this repo
10 | #
11 | # RELEASE_VERSION: Version of this current release
12 | 
13 | set -eou pipefail
14 | 
15 | # Create and check out the release branch
16 | git checkout -b "${RELEASE_BRANCH}"
17 | 
18 | # Change all GitHub Actions to reference the test-infra release branch
19 | # as opposed to main.
20 | for i in .github/workflows/*.yml; do
21 |   if [[ "$OSTYPE" == "darwin"* ]]; then
22 |     sed -i '' -e s#@main#@"${TEST_INFRA_BRANCH}"# $i;
23 |     sed -i '' -e s#test-infra-ref:[[:space:]]main#"test-infra-ref: ${TEST_INFRA_BRANCH}"# $i;
24 |   else
25 |     sed -i -e s#@main#@"${TEST_INFRA_BRANCH}"# $i;
26 |     sed -i -e s#test-infra-ref:[[:space:]]main#"test-infra-ref: ${TEST_INFRA_BRANCH}"# $i;
27 |   fi
28 | done
29 | 
30 | # Update the Release Version in version.txt
31 | echo "${RELEASE_VERSION}" >version.txt
32 | 
33 | # Optional
34 | # git add ./github/workflows/*.yml version.txt
35 | # git commit -m "[RELEASE-ONLY CHANGES] Branch Cut for Release {RELEASE_VERSION}"
36 | # git push origin "${RELEASE_BRANCH}"
37 | 
--------------------------------------------------------------------------------
/docs/source/_templates/autosummary/ctc_decoder_class.rst:
--------------------------------------------------------------------------------
1 | ..
2 |    autogenerated from source/_templates/autosummary/ctc_decoder_class.rst
3 | 
4 | 
5 | {#
6 | ################################################################################
7 | # autosummary template for CTCDecoder
8 | # Since the class has multiple methods and support structures,
9 | # we want to have them show up in the table of contents.
10 | # The default class template does not do this, so we use a custom one here.
11 | ################################################################################ 12 | #} 13 | 14 | {{ name | underline }} 15 | 16 | {%- if name != "CTCDecoder" %} 17 | 18 | .. autofunction:: {{fullname}} 19 | 20 | {%- else %} 21 | 22 | .. autoclass:: {{ fullname }}() 23 | 24 | Methods 25 | ======= 26 | 27 | {%- for item in members %} 28 | {%- if not item.startswith('_') or item == "__call__" %} 29 | 30 | {{ item | underline("-") }} 31 | 32 | .. container:: py attribute 33 | 34 | .. automethod:: {{[fullname, item] | join('.')}} 35 | 36 | {%- endif %} 37 | {%- endfor %} 38 | 39 | Support Structures 40 | ================== 41 | 42 | {%- for item in ["CTCHypothesis", "CTCDecoderLM", "CTCDecoderLMState"] %} 43 | 44 | {{ item | underline("-") }} 45 | 46 | .. autoclass:: torchaudio.models.decoder.{{item}} 47 | :members: 48 | 49 | {%- endfor %} 50 | 51 | {%- endif %} 52 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SPHINXPROJ = torchaudio 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | docset: html 16 | doc2dash --name $(SPHINXPROJ) --icon $(SOURCEDIR)/_static/img/pytorch-logo-flame.png --enable-js --online-redirect-url http://pytorch.org/audio/ --force $(BUILDDIR)/html/ 17 | 18 | # Manually fix because Zeal doesn't deal well with `icon.png`-only at 2x resolution. 19 | cp $(SPHINXPROJ).docset/icon.png $(SPHINXPROJ).docset/icon@2x.png 20 | convert $(SPHINXPROJ).docset/icon@2x.png -resize 16x16 $(SPHINXPROJ).docset/icon.png 21 | 22 | .PHONY: help Makefile docset 23 | 24 | # Catch-all target: route all unknown targets to Sphinx using the new 25 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
26 | %: Makefile 27 | doxygen source/Doxyfile 28 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 29 | @python post_process_dispatcher.py $(BUILDDIR) 30 | 31 | clean: 32 | rm -rf $(BUILDDIR)/* 33 | rm -rf $(SOURCEDIR)/generated/ 34 | rm -rf $(SOURCEDIR)/aen_images/ 35 | rm -rf $(SOURCEDIR)/gen_modules/ 36 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/example/emformer_rnnt/utils.py: -------------------------------------------------------------------------------- 1 | class MockSentencePieceProcessor: 2 | def __init__(self, num_symbols, *args, **kwargs): 3 | self.num_symbols = num_symbols 4 | 5 | def get_piece_size(self): 6 | return self.num_symbols 7 | 8 | def encode(self, input): 9 | return [1, 5, 2] 10 | 11 | def decode(self, input): 12 | return "hey" 13 | 14 | def unk_id(self): 15 | return 0 16 | 17 | def eos_id(self): 18 | return 1 19 | 20 | def pad_id(self): 21 | return 2 22 | 23 | 24 | class MockCustomDataset: 25 | def __init__(self, base_dataset, *args, **kwargs): 26 | self.base_dataset = base_dataset 27 | 28 | def __getitem__(self, n: int): 29 | return [self.base_dataset[n]] 30 | 31 | def __len__(self): 32 | return len(self.base_dataset) 33 | 34 | 35 | class MockDataloader: 36 | def __init__(self, base_dataset, batch_size, collate_fn, *args, **kwargs): 37 | self.base_dataset = base_dataset 38 | self.batch_size = batch_size 39 | self.collate_fn = collate_fn 40 | 41 | def __iter__(self): 42 | for sample in iter(self.base_dataset): 43 | if self.batch_size == 1: 44 | sample = [sample] 45 | yield self.collate_fn(sample) 46 | 47 | def __len__(self): 48 | return len(self.base_dataset) 49 | -------------------------------------------------------------------------------- /src/torchaudio/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from .cmuarctic import CMUARCTIC 2 | from .cmudict import CMUDict 3 | from .commonvoice import COMMONVOICE 4 | from .dr_vctk import DR_VCTK 5 | from .fluentcommands import FluentSpeechCommands 6 | from .gtzan import GTZAN 7 | from .iemocap import IEMOCAP 8 | from .librilight_limited import LibriLightLimited 9 | from .librimix import LibriMix 10 | from .librispeech import LIBRISPEECH 11 | from .librispeech_biasing import LibriSpeechBiasing 12 | from .libritts import LIBRITTS 13 | from .ljspeech import LJSPEECH 14 | from .musdb_hq import MUSDB_HQ 15 | from .quesst14 import QUESST14 16 | from .snips import Snips 17 | from .speechcommands import SPEECHCOMMANDS 18 | from .tedlium import TEDLIUM 19 | from .vctk import VCTK_092 20 | from .voxceleb1 import VoxCeleb1Identification, VoxCeleb1Verification 21 | from .yesno import YESNO 22 | 23 | 24 | __all__ = [ 25 | "COMMONVOICE", 26 | "LIBRISPEECH", 27 | "LibriSpeechBiasing", 28 | "LibriLightLimited", 29 | "SPEECHCOMMANDS", 30 | "VCTK_092", 31 | "DR_VCTK", 32 | "YESNO", 33 | "LJSPEECH", 34 | "GTZAN", 35 | "CMUARCTIC", 36 | "CMUDict", 37 | "LibriMix", 38 | "LIBRITTS", 39 | "TEDLIUM", 40 | "QUESST14", 41 | "MUSDB_HQ", 42 | "FluentSpeechCommands", 43 | "VoxCeleb1Identification", 44 | "VoxCeleb1Verification", 45 | "IEMOCAP", 46 | "Snips", 47 | ] 48 | -------------------------------------------------------------------------------- /packaging/vs2019/activate.bat: -------------------------------------------------------------------------------- 1 | :: Set env vars that tell distutils to use the compiler that we put on path 2 | SET DISTUTILS_USE_SDK=1 3 | SET MSSdk=1 4 | 5 | SET 
"VS_VERSION=16.0" 6 | SET "VS_MAJOR=16" 7 | SET "VS_YEAR=2019" 8 | 9 | set "MSYS2_ARG_CONV_EXCL=/AI;/AL;/OUT;/out" 10 | set "MSYS2_ENV_CONV_EXCL=CL" 11 | 12 | :: For Python 3.5+, ensure that we link with the dynamic runtime. See 13 | :: http://stevedower.id.au/blog/building-for-python-3-5-part-two/ for more info 14 | set "PY_VCRUNTIME_REDIST=%PREFIX%\\bin\\vcruntime140.dll" 15 | 16 | for /f "usebackq tokens=*" %%i in (`"%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe" -legacy -products * -version [16^,17^) -property installationPath`) do ( 17 | if exist "%%i" if exist "%%i\VC\Auxiliary\Build\vcvarsall.bat" ( 18 | set "VSINSTALLDIR=%%i\" 19 | goto :vswhere 20 | ) 21 | ) 22 | 23 | :vswhere 24 | 25 | :: Shorten PATH to avoid the `input line too long` error. 26 | SET MyPath=%PATH% 27 | 28 | setlocal EnableDelayedExpansion 29 | 30 | SET TempPath="%MyPath:;=";"%" 31 | SET var= 32 | FOR %%a IN (%TempPath%) DO ( 33 | IF EXIST %%~sa ( 34 | SET "var=!var!;%%~sa" 35 | ) 36 | ) 37 | 38 | set "TempPath=!var:~1!" 39 | endlocal & set "PATH=%TempPath%" 40 | 41 | :: Shorten current directory too 42 | FOR %%A IN (.) DO CD "%%~sA" 43 | 44 | :: other things added by install_activate.bat at package build time 45 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | default_language_version: 2 | node: 16.14.2 3 | 4 | repos: 5 | - repo: https://github.com/pre-commit/pre-commit-hooks 6 | rev: v4.0.1 7 | hooks: 8 | - id: check-docstring-first 9 | - id: check-toml 10 | - id: check-yaml 11 | exclude: packaging/.* 12 | - id: end-of-file-fixer 13 | 14 | - repo: https://github.com/omnilib/ufmt 15 | rev: v1.3.2 16 | hooks: 17 | - id: ufmt 18 | additional_dependencies: 19 | - black == 22.3 20 | - usort == 1.0.2 21 | - libcst == 0.4.1 22 | 23 | - repo: https://github.com/pre-commit/mirrors-clang-format 24 | rev: v11.0.1 25 | hooks: 26 | - id: clang-format 27 | 28 | - repo: https://github.com/pycqa/flake8 29 | rev: 4.0.1 30 | hooks: 31 | - id: flake8 32 | args: ['src', 'test', 'tools', 'docs/source/conf.py', 'examples'] 33 | exclude: 'build|docs/src|third_party' 34 | additional_dependencies: 35 | - flake8-breakpoint == 1.1.0 36 | - flake8-bugbear == 22.6.22 37 | - flake8-comprehensions == 3.10.0 38 | - flake8-pyi == 22.5.1 39 | - mccabe == 0.6.0 40 | - pycodestyle == 2.8.0 41 | 42 | - repo: https://github.com/pycqa/pydocstyle 43 | rev: 6.3.0 44 | hooks: 45 | - id: pydocstyle 46 | exclude: 'build|test|examples|third_party|docs|tools' 47 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature-request.yml: -------------------------------------------------------------------------------- 1 | name: 🚀 Feature request 2 | description: Submit a proposal/request for a new torchaudio feature 3 | 4 | body: 5 | - type: markdown 6 | attributes: 7 | value: > 8 | PLEASE NOTE THAT THE TORCHAUDIO REPOSITORY IS NO LONGER ACTIVELY MONITORED. You may not get a response. For open discussions, visit https://discuss.pytorch.org/. 9 | - type: textarea 10 | attributes: 11 | label: 🚀 The feature 12 | description: > 13 | A clear and concise description of the feature proposal 14 | validations: 15 | required: true 16 | - type: textarea 17 | attributes: 18 | label: Motivation, pitch 19 | description: > 20 | Please outline the motivation for the proposal. Is your feature request related to a specific problem? 
e.g., *"I'm working on X and would like Y to be possible"*. If this is related to another GitHub issue, please link here too. 21 | validations: 22 | required: true 23 | - type: textarea 24 | attributes: 25 | label: Alternatives 26 | description: > 27 | A description of any alternative solutions or features you've considered, if any. 28 | - type: textarea 29 | attributes: 30 | label: Additional context 31 | description: > 32 | Add any other context or screenshots about the feature request. 33 | - type: markdown 34 | attributes: 35 | value: > 36 | Thanks for contributing 🎉! 37 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/assets/decoder/kenlm_char.arpa: -------------------------------------------------------------------------------- 1 | \data\ 2 | ngram 1=8 3 | ngram 2=8 4 | ngram 3=8 5 | ngram 4=8 6 | ngram 5=8 7 | 8 | \1-grams: 9 | -1.146128 0 10 | 0 -0.30103 11 | -0.8731268 0 12 | -0.70679533 f -0.30103 13 | -0.70679533 o -0.30103 14 | -0.8731268 b -0.30103 15 | -0.8731268 a -0.30103 16 | -0.8731268 r -0.30103 17 | 18 | \2-grams: 19 | -0.24644431 r 0 20 | -0.22314323 f -0.30103 21 | -0.57694924 o f -0.30103 22 | -0.22314323 f o -0.30103 23 | -0.57694924 o o -0.30103 24 | -0.6314696 o b -0.30103 25 | -0.24644431 b a -0.30103 26 | -0.24644431 a r -0.30103 27 | 28 | \3-grams: 29 | -0.105970904 a r 0 30 | -0.41743615 o o f -0.30103 31 | -0.097394995 f o -0.30103 32 | -0.097394995 o f o -0.30103 33 | -0.19898036 f o o -0.30103 34 | -0.43555236 o o b -0.30103 35 | -0.105970904 o b a -0.30103 36 | -0.105970904 b a r -0.30103 37 | 38 | \4-grams: 39 | -0.049761247 b a r 0 40 | -0.4462542 f o o f -0.30103 41 | -0.045972984 o o f o -0.30103 42 | -0.08819265 f o o -0.30103 43 | -0.08819265 o f o o -0.30103 44 | -0.286727 f o o b -0.30103 45 | -0.049761247 o o b a -0.30103 46 | -0.049761247 o b a r -0.30103 47 | 48 | \5-grams: 49 | -0.02416831 o b a r 50 | -0.36759996 f o o f 51 | -0.022378458 f o o f o 52 | -0.041861475 o o f o o 53 | -0.29381964 f o o b 54 | -0.12011856 o f o o b 55 | -0.02416831 f o o b a 56 | -0.02416831 o o b a r 57 | 58 | \end\ 59 | -------------------------------------------------------------------------------- /src/libtorchaudio/cuctc/LICENSE: -------------------------------------------------------------------------------- 1 | BSD 2-Clause License 2 | 3 | Copyright (c) 2023 Nvidia 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 17 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | DISCLAIMED. 
--------------------------------------------------------------------------------
/src/libtorchaudio/cuctc/LICENSE:
--------------------------------------------------------------------------------
1 | BSD 2-Clause License
2 | 
3 | Copyright (c) 2023 Nvidia
4 | All rights reserved.
5 | 
6 | Redistribution and use in source and binary forms, with or without
7 | modification, are permitted provided that the following conditions are met:
8 | 
9 | * Redistributions of source code must retain the above copyright notice, this
10 |   list of conditions and the following disclaimer.
11 | 
12 | * Redistributions in binary form must reproduce the above copyright notice,
13 |   this list of conditions and the following disclaimer in the documentation
14 |   and/or other materials provided with the distribution.
15 | 
16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
20 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | BSD 2-Clause License
2 | 
3 | Copyright (c) 2017 Facebook Inc. (Soumith Chintala),
4 | All rights reserved.
5 | 
6 | Redistribution and use in source and binary forms, with or without
7 | modification, are permitted provided that the following conditions are met:
8 | 
9 | * Redistributions of source code must retain the above copyright notice, this
10 |   list of conditions and the following disclaimer.
11 | 
12 | * Redistributions in binary form must reproduce the above copyright notice,
13 |   this list of conditions and the following disclaimer in the documentation
14 |   and/or other materials provided with the distribution.
15 | 
16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
20 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | 
--------------------------------------------------------------------------------
/src/torchaudio/models/decoder/__init__.py:
--------------------------------------------------------------------------------
1 | _CTC_DECODERS = [
2 |     "CTCHypothesis",
3 |     "CTCDecoder",
4 |     "CTCDecoderLM",
5 |     "CTCDecoderLMState",
6 |     "ctc_decoder",
7 |     "download_pretrained_files",
8 | ]
9 | _CUDA_CTC_DECODERS = [
10 |     "CUCTCDecoder",
11 |     "CUCTCHypothesis",
12 |     "cuda_ctc_decoder",
13 | ]
14 | 
15 | 
16 | def __getattr__(name: str):
17 |     if name in _CTC_DECODERS:
18 |         try:
19 |             from . import _ctc_decoder
20 |         except Exception as err:
21 |             raise RuntimeError(
22 |                 "CTC Decoder suite requires the flashlight-text package and optionally KenLM. Please install them."
23 |             ) from err
24 | 
25 |         item = getattr(_ctc_decoder, name)
26 |         globals()[name] = item
27 |         return item
28 |     elif name in _CUDA_CTC_DECODERS:
29 |         try:
30 |             from . import _cuda_ctc_decoder
31 |         except AttributeError as err:
32 |             raise RuntimeError(
33 |                 "To use the CUCTC decoder, please set BUILD_CUDA_CTC_DECODER=1 when building from source."
34 |             ) from err
35 | 
36 |         item = getattr(_cuda_ctc_decoder, name)
37 |         globals()[name] = item
38 |         return item
39 |     raise AttributeError(f"module {__name__} has no attribute {name}")
40 | 
41 | 
42 | def __dir__():
43 |     return sorted(__all__)
44 | 
45 | 
46 | __all__ = _CTC_DECODERS + _CUDA_CTC_DECODERS
47 | 
--------------------------------------------------------------------------------
/src/torchaudio/prototype/models/__init__.py:
--------------------------------------------------------------------------------
1 | from ._conformer_wav2vec2 import (
2 |     conformer_wav2vec2_base,
3 |     conformer_wav2vec2_model,
4 |     conformer_wav2vec2_pretrain_base,
5 |     conformer_wav2vec2_pretrain_large,
6 |     conformer_wav2vec2_pretrain_model,
7 |     ConformerWav2Vec2PretrainModel,
8 | )
9 | from ._emformer_hubert import emformer_hubert_base, emformer_hubert_model
10 | from .conv_emformer import ConvEmformer
11 | from .hifi_gan import hifigan_vocoder, hifigan_vocoder_v1, hifigan_vocoder_v2, hifigan_vocoder_v3, HiFiGANVocoder
12 | from .rnnt import conformer_rnnt_base, conformer_rnnt_biasing, conformer_rnnt_biasing_base, conformer_rnnt_model
13 | from .rnnt_decoder import Hypothesis, RNNTBeamSearchBiasing
14 | 
15 | __all__ = [
16 |     "conformer_rnnt_base",
17 |     "conformer_rnnt_model",
18 |     "conformer_rnnt_biasing",
19 |     "conformer_rnnt_biasing_base",
20 |     "ConvEmformer",
21 |     "conformer_wav2vec2_model",
22 |     "conformer_wav2vec2_base",
23 |     "conformer_wav2vec2_pretrain_model",
24 |     "conformer_wav2vec2_pretrain_base",
25 |     "conformer_wav2vec2_pretrain_large",
26 |     "ConformerWav2Vec2PretrainModel",
27 |     "emformer_hubert_base",
28 |     "emformer_hubert_model",
29 |     "Hypothesis",
30 |     "RNNTBeamSearchBiasing",
31 |     "HiFiGANVocoder",
32 |     "hifigan_vocoder_v1",
33 |     "hifigan_vocoder_v2",
34 |     "hifigan_vocoder_v3",
35 |     "hifigan_vocoder",
36 | ]
37 | 
--------------------------------------------------------------------------------
/src/libtorio/ffmpeg/stream_writer/packet_writer.cpp:
--------------------------------------------------------------------------------
1 | #include <libtorio/ffmpeg/stream_writer/packet_writer.h>
2 | 
3 | namespace torio::io {
4 | namespace {
5 | AVStream* add_stream(
6 |     AVFormatContext* format_ctx,
7 |     const StreamParams& stream_params) {
8 |   AVStream* stream = avformat_new_stream(format_ctx, nullptr);
9 |   int ret =
10 |       avcodec_parameters_copy(stream->codecpar, stream_params.codec_params);
11 |   TORCH_CHECK(
12 |       ret >= 0,
13 |       "Failed to copy the stream's codec parameters. 
(", 14 | av_err2string(ret), 15 | ")"); 16 | stream->time_base = stream_params.time_base; 17 | return stream; 18 | } 19 | } // namespace 20 | PacketWriter::PacketWriter( 21 | AVFormatContext* format_ctx_, 22 | const StreamParams& stream_params_) 23 | : format_ctx(format_ctx_), 24 | stream(add_stream(format_ctx_, stream_params_)), 25 | original_time_base(stream_params_.time_base) {} 26 | 27 | void PacketWriter::write_packet(const AVPacketPtr& packet) { 28 | AVPacket dst_packet; 29 | int ret = av_packet_ref(&dst_packet, packet); 30 | TORCH_CHECK(ret >= 0, "Failed to copy packet."); 31 | av_packet_rescale_ts(&dst_packet, original_time_base, stream->time_base); 32 | dst_packet.stream_index = stream->index; 33 | ret = av_interleaved_write_frame(format_ctx, &dst_packet); 34 | TORCH_CHECK(ret >= 0, "Failed to write packet to destination."); 35 | } 36 | } // namespace torio::io 37 | -------------------------------------------------------------------------------- /.github/scripts/unittest-windows/setup_env.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # This script is for setting up environment in which unit test is ran. 4 | # To speed up the CI time, the resulting environment is cached. 5 | # 6 | # Do not install PyTorch and torchaudio here, otherwise they also get cached. 7 | 8 | set -euxo pipefail 9 | 10 | this_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" 11 | root_dir="$(git rev-parse --show-toplevel)" 12 | conda_dir="${root_dir}/conda" 13 | env_dir="${root_dir}/env" 14 | 15 | cd "${root_dir}" 16 | 17 | # 1. Install conda at ./conda 18 | if [ ! -d "${conda_dir}" ]; then 19 | printf "* Installing conda\n" 20 | export tmp_conda="$(echo $conda_dir | tr '/' '\\')" 21 | export miniconda_exe="$(echo $root_dir | tr '/' '\\')\\miniconda.exe" 22 | curl --silent --output miniconda.exe https://repo.anaconda.com/miniconda/Miniconda3-latest-Windows-x86_64.exe -O 23 | "$this_dir/install_conda.bat" 24 | unset tmp_conda 25 | unset miniconda_exe 26 | fi 27 | eval "$("${conda_dir}/Scripts/conda.exe" 'shell.bash' 'hook')" 28 | 29 | # 2. Create test environment at ./env 30 | if [ ! -d "${env_dir}" ]; then 31 | printf "* Creating a test environment with PYTHON_VERSION=%s\n" "${PYTHON_VERSION}" 32 | conda create --prefix "${env_dir}" -y python="${PYTHON_VERSION}" 33 | fi 34 | conda activate "${env_dir}" 35 | 36 | # 3. Install minimal build tools 37 | pip --quiet install cmake ninja 38 | conda install --quiet -y 'ffmpeg>=4.1' 39 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/common_utils/kaldi_utils.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | 3 | import torch 4 | 5 | 6 | def convert_args(**kwargs): 7 | args = [] 8 | for key, value in kwargs.items(): 9 | if key == "sample_rate": 10 | key = "sample_frequency" 11 | key = "--" + key.replace("_", "-") 12 | value = str(value).lower() if value in [True, False] else str(value) 13 | args.append("%s=%s" % (key, value)) 14 | return args 15 | 16 | 17 | def run_kaldi(command, input_type, input_value): 18 | """Run provided Kaldi command, pass a tensor and get the resulting tensor 19 | 20 | Args: 21 | command (list of str): The command with arguments 22 | input_type (str): 'ark' or 'scp' 23 | input_value (Tensor for 'ark', string for 'scp'): The input to pass. 24 | Must be a path to an audio file for 'scp'. 
25 | """ 26 | import kaldi_io 27 | 28 | key = "foo" 29 | process = subprocess.Popen(command, stdin=subprocess.PIPE, stdout=subprocess.PIPE) 30 | if input_type == "ark": 31 | kaldi_io.write_mat(process.stdin, input_value.cpu().numpy(), key=key) 32 | elif input_type == "scp": 33 | process.stdin.write(f"{key} {input_value}".encode("utf8")) 34 | else: 35 | raise NotImplementedError("Unexpected type") 36 | process.stdin.close() 37 | result = dict(kaldi_io.read_mat_ark(process.stdout))["foo"] 38 | return torch.from_numpy(result.copy()) # copy supresses some torch warning 39 | -------------------------------------------------------------------------------- /docs/source/_templates/autosummary/io_class.rst: -------------------------------------------------------------------------------- 1 | .. 2 | autogenerated from source/_templates/autosummary/io_class.rst 3 | 4 | {#- 5 | ################################################################################ 6 | # autosummary template for torchaudio.io module 7 | # Since StreamReader/StreamWriter have many methods/properties, 8 | # we want to list them up in the table of contents. 9 | # The default class template does not do this, so we use custom one here. 10 | ################################################################################ 11 | #} 12 | 13 | {{ name | underline }} 14 | 15 | .. autoclass:: {{ fullname }} 16 | 17 | {%- if name not in ["StreamReader", "StreamWriter"] %} 18 | 19 | {%- if attributes %} 20 | 21 | Properties 22 | ---------- 23 | 24 | {%- for item in attributes %} 25 | {%- if not item.startswith('_') and item not in inherited_members %} 26 | 27 | {{ item | underline("~") }} 28 | 29 | .. container:: py attribute 30 | 31 | .. autoproperty:: {{[fullname, item] | join('.')}} 32 | 33 | {%- endif %} 34 | {%- endfor %} 35 | {%- endif %} 36 | 37 | {%- if members %} 38 | 39 | Methods 40 | ------- 41 | 42 | {%- for item in members %} 43 | {%- if 44 | not item.startswith('_') 45 | and item not in inherited_members 46 | and item not in attributes 47 | %} 48 | 49 | {{ item | underline("~") }} 50 | 51 | .. container:: py attribute 52 | 53 | .. 
53 |    .. automethod:: {{[fullname, item] | join('.')}}
54 | 
55 | {%- endif %}
56 | {%- endfor %}
57 | {%- endif %}
58 | 
59 | {%- endif %}
60 | 
--------------------------------------------------------------------------------
/examples/libtorchaudio/speech_recognition/transcribe.cpp:
--------------------------------------------------------------------------------
1 | #include <torch/script.h>
2 | 
3 | int main(int argc, char* argv[]) {
4 |   if (argc != 3) {
5 |     std::cerr << "Usage: " << argv[0] << " <module_dir> <audio_file>"
6 |               << std::endl;
7 |     return -1;
8 |   }
9 | 
10 |   torch::jit::script::Module loader, encoder, decoder;
11 |   std::cout << "Loading module from: " << argv[1] << std::endl;
12 |   try {
13 |     loader = torch::jit::load(std::string(argv[1]) + "/loader.zip");
14 |   } catch (const c10::Error& error) {
15 |     std::cerr << "Failed to load the module:" << error.what() << std::endl;
16 |     return -1;
17 |   }
18 |   try {
19 |     encoder = torch::jit::load(std::string(argv[1]) + "/encoder.zip");
20 |   } catch (const c10::Error& error) {
21 |     std::cerr << "Failed to load the module:" << error.what() << std::endl;
22 |     return -1;
23 |   }
24 |   try {
25 |     decoder = torch::jit::load(std::string(argv[1]) + "/decoder.zip");
26 |   } catch (const c10::Error& error) {
27 |     std::cerr << "Failed to load the module:" << error.what() << std::endl;
28 |     return -1;
29 |   }
30 | 
31 |   std::cout << "Loading the audio" << std::endl;
32 |   auto waveform = loader.forward({c10::IValue(argv[2])});
33 |   std::cout << "Running inference" << std::endl;
34 |   auto emission = encoder.forward({waveform});
35 |   std::cout << "Generating the transcription" << std::endl;
36 |   auto result = decoder.forward({emission});
37 |   std::cout << result.toStringRef() << std::endl;
38 |   std::cout << "Done." << std::endl;
39 | }
40 | 
--------------------------------------------------------------------------------
/src/libtorchaudio/sox/pybind/pybind.cpp:
--------------------------------------------------------------------------------
1 | #include <libtorchaudio/sox/effects.h>
2 | #include <libtorchaudio/sox/io.h>
3 | #include <libtorchaudio/sox/utils.h>
4 | #include <torch/extension.h>
5 | 
6 | namespace torchaudio {
7 | namespace sox {
8 | namespace {
9 | 
10 | TORCH_LIBRARY(torchaudio_sox, m) {
11 |   m.def("torchaudio_sox::get_info", &get_info_file);
12 |   m.def("torchaudio_sox::load_audio_file", &load_audio_file);
13 |   m.def("torchaudio_sox::save_audio_file", &save_audio_file);
14 |   m.def("torchaudio_sox::initialize_sox_effects", &initialize_sox_effects);
15 |   m.def("torchaudio_sox::shutdown_sox_effects", &shutdown_sox_effects);
16 |   m.def("torchaudio_sox::apply_effects_tensor", &apply_effects_tensor);
17 |   m.def("torchaudio_sox::apply_effects_file", &apply_effects_file);
18 | }
19 | 
20 | PYBIND11_MODULE(_torchaudio_sox, m) {
21 |   m.def("set_seed", &set_seed, "Set random seed.");
22 |   m.def("set_verbosity", &set_verbosity, "Set verbosity.");
23 |   m.def("set_use_threads", &set_use_threads, "Set threading.");
24 |   m.def("set_buffer_size", &set_buffer_size, "Set buffer size.");
25 |   m.def("get_buffer_size", &get_buffer_size, "Get buffer size.");
26 |   m.def("list_effects", &list_effects, "List available effects.");
27 |   m.def(
28 |       "list_read_formats",
29 |       &list_read_formats,
30 |       "List supported formats for decoding.");
31 |   m.def(
32 |       "list_write_formats",
33 |       &list_write_formats,
34 |       "List supported formats for encoding.");
35 | }
36 | 
37 | } // namespace
38 | } // namespace sox
39 | } // namespace torchaudio
40 | 
--------------------------------------------------------------------------------
/tools/travis/test_script.sh:
-------------------------------------------------------------------------------- /tools/travis/test_script.sh: --------------------------------------------------------------------------------
#!/bin/bash
# This script is meant to be called by the "script" step defined in
# .travis.yml. See http://docs.travis-ci.com/ for more details.
# The behavior of the script is controlled by environment variables defined
# in the .travis.yml in the top level folder of the project.
set -e

python --version
python -c 'import torch;print("torch:", torch.__version__)'

run_tests() {
    # find all the test files that match "test*.py"
    TEST_FILES="$(find test -type f -name "test*.py" | sort)"
    echo "Test files are:"
    echo "$TEST_FILES"

    echo "Executing tests:"
    EXIT_STATUS=0
    for FILE in $TEST_FILES; do
        # Run each file in a separate process. The `|| STATUS=$?` keeps `set -e`
        # from aborting on the first failure, so we keep going and return the
        # accumulated exit status at the end.
        STATUS=0
        python -m pytest -v "$FILE" || STATUS=$?
        EXIT_STATUS="$((EXIT_STATUS + STATUS))"
    done

    echo "Done, exit status: $EXIT_STATUS"
    exit $EXIT_STATUS
}

if [[ "$RUN_FLAKE8" == "true" ]]; then
    flake8
fi

if [[ "$SKIP_TESTS" != "true" ]]; then
    echo "run_tests"
    run_tests
fi

if [[ "$RUN_EXAMPLE_TESTS" == "true" ]]; then
    echo "run_example_tests"
    pushd examples
    ASR_MODEL_PATH=$HOME/download/data/model.pt \
    ASR_INPUT_FILE=interactive_asr/data/sample.wav \
    ASR_DATA_PATH=$HOME/download/data \
    ASR_USER_DIR=$HOME/download/fairseq/examples/speech_recognition \
    python -m unittest test/test_interactive_asr.py
    popd
fi
-------------------------------------------------------------------------------- /test/torchaudio_unittest/example/souce_sepration/metrics_test.py: --------------------------------------------------------------------------------
from itertools import product

import torch
from parameterized import parameterized
from source_separation.utils import metrics
from torch.testing._internal.common_utils import TestCase

from . import sdr_reference


class TestSDR(TestCase):
    @parameterized.expand([(1,), (2,), (32,)])
    def test_sdr(self, batch_size):
        """sdr produces the same result as the reference implementation"""
        num_frames = 256

        estimation = torch.rand(batch_size, num_frames)
        origin = torch.rand(batch_size, num_frames)

        sdr_ref = sdr_reference.calc_sdr_torch(estimation, origin)
        sdr = metrics.sdr(estimation.unsqueeze(1), origin.unsqueeze(1)).squeeze(1)

        self.assertEqual(sdr, sdr_ref)

    @parameterized.expand(list(product([1, 2, 32], [2, 3, 4, 5])))
    def test_sdr_pit(self, batch_size, num_sources):
        """sdr_pit produces the same result as the reference implementation"""
        num_frames = 256

        estimation = torch.randn(batch_size, num_sources, num_frames)
        origin = torch.randn(batch_size, num_sources, num_frames)

        estimation -= estimation.mean(axis=2, keepdim=True)
        origin -= origin.mean(axis=2, keepdim=True)

        batch_sdr_ref = sdr_reference.batch_SDR_torch(estimation, origin)
        batch_sdr = metrics.sdr_pit(estimation, origin)

        self.assertEqual(batch_sdr, batch_sdr_ref)
-------------------------------------------------------------------------------- /examples/pipeline_wav2letter/utils.py: --------------------------------------------------------------------------------
import json
import logging
import os
import shutil
from collections import defaultdict

import torch


class MetricLogger(defaultdict):
    def __init__(self, name, print_freq=1, disable=False):
        super().__init__(lambda: 0.0)
        self.disable = disable
        self.print_freq = print_freq
        self._iter = 0
        self["name"] = name

    def __str__(self):
        return json.dumps(self)

    def __call__(self):
        self._iter = (self._iter + 1) % self.print_freq
        if not self.disable and not self._iter:
            print(self, flush=True)


def save_checkpoint(state, is_best, filename, disable):
    """Save the model to a temporary file first, then copy it to filename,
    in case a signal interrupts the torch.save() process.
    """
    if disable:
        return

    if filename == "":
        return

    tempfile = filename + ".temp"

    # Remove tempfile in case of an interruption during the copy from tempfile to filename
    if os.path.isfile(tempfile):
        os.remove(tempfile)

    torch.save(state, tempfile)
    if os.path.isfile(tempfile):
        os.rename(tempfile, filename)
    if is_best:
        shutil.copyfile(filename, "model_best.pth.tar")
    logging.warning("Checkpoint: saved")


def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)
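
# A brief sketch of how the two helpers above compose in a training loop;
# the metric values and checkpoint payload are stand-ins, not real training state.
#
#     logger = MetricLogger("train_iteration", print_freq=10)
#
#     for step in range(100):
#         logger["iteration"] = step
#         logger["loss"] = 0.0  # stand-in for a real loss value
#         logger()  # prints the JSON-serialized metrics every `print_freq` calls
#
#     # Written to "checkpoint.pth.tar.temp" first, then renamed, so an
#     # interrupted torch.save() cannot clobber the previous checkpoint.
#     save_checkpoint({"step": 99}, is_best=False, filename="checkpoint.pth.tar", disable=False)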
-------------------------------------------------------------------------------- /test/torchaudio_unittest/kaldi_io_test.py: --------------------------------------------------------------------------------
import torch
import torchaudio.kaldi_io as kio
from torchaudio_unittest import common_utils


class Test_KaldiIO(common_utils.TorchaudioTestCase):
    data1 = [[1, 2, 3], [11, 12, 13], [21, 22, 23]]
    data2 = [[31, 32, 33], [41, 42, 43], [51, 52, 53]]

    def _test_helper(self, file_name, expected_data, fn, expected_dtype):
        """Takes a file_name to the input data and a function fn to extract the
        data. It compares the extracted data to the expected_data. The expected_dtype
        will be used to check that the extracted data is of the right type.
        """
        test_filepath = common_utils.get_asset_path(file_name)
        expected_output = {
            "key" + str(idx + 1): torch.tensor(val, dtype=expected_dtype) for idx, val in enumerate(expected_data)
        }

        for key, vec in fn(test_filepath):
            self.assertTrue(key in expected_output)
            self.assertTrue(isinstance(vec, torch.Tensor))
            self.assertEqual(vec.dtype, expected_dtype)
            self.assertTrue(torch.all(torch.eq(vec, expected_output[key])))

    def test_read_vec_int_ark(self):
        self._test_helper("vec_int.ark", self.data1, kio.read_vec_int_ark, torch.int32)

    def test_read_vec_flt_ark(self):
        self._test_helper("vec_flt.ark", self.data1, kio.read_vec_flt_ark, torch.float32)

    def test_read_mat_ark(self):
        self._test_helper("mat.ark", [self.data1, self.data2], kio.read_mat_ark, torch.float32)
-------------------------------------------------------------------------------- /src/torchaudio/transforms/__init__.py: --------------------------------------------------------------------------------
from ._multi_channel import MVDR, PSD, RTFMVDR, SoudenMVDR
from ._transforms import (
    AddNoise,
    AmplitudeToDB,
    ComputeDeltas,
    Convolve,
    Deemphasis,
    Fade,
    FFTConvolve,
    FrequencyMasking,
    GriffinLim,
    InverseMelScale,
    InverseSpectrogram,
    LFCC,
    Loudness,
    MelScale,
    MelSpectrogram,
    MFCC,
    MuLawDecoding,
    MuLawEncoding,
    PitchShift,
    Preemphasis,
    Resample,
    RNNTLoss,
    SlidingWindowCmn,
    SpecAugment,
    SpectralCentroid,
    Spectrogram,
    Speed,
    SpeedPerturbation,
    TimeMasking,
    TimeStretch,
    Vad,
    Vol,
)


__all__ = [
    "AddNoise",
    "AmplitudeToDB",
    "ComputeDeltas",
    "Convolve",
    "Deemphasis",
    "Fade",
    "FFTConvolve",
    "FrequencyMasking",
    "GriffinLim",
    "InverseMelScale",
    "InverseSpectrogram",
    "LFCC",
    "Loudness",
    "MFCC",
    "MVDR",
    "MelScale",
    "MelSpectrogram",
    "MuLawDecoding",
    "MuLawEncoding",
    "PSD",
    "PitchShift",
    "Preemphasis",
    "RNNTLoss",
    "RTFMVDR",
    "Resample",
    "SlidingWindowCmn",
    "SoudenMVDR",
    "SpecAugment",
    "SpectralCentroid",
    "Spectrogram",
    "Speed",
    "SpeedPerturbation",
    "TimeMasking",
    "TimeStretch",
    "Vad",
    "Vol",
]
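
# For reference, the transforms re-exported above are composable torch.nn.Module
# objects. A minimal example with MelSpectrogram and FrequencyMasking; parameter
# values here are chosen arbitrarily for illustration.
#
#     import torch
#     import torchaudio.transforms as T
#
#     waveform = torch.randn(1, 16000)  # one second of synthetic audio at 16 kHz
#
#     mel_spectrogram = T.MelSpectrogram(sample_rate=16000, n_mels=80)
#     masking = T.FrequencyMasking(freq_mask_param=15)
#
#     mel = mel_spectrogram(waveform)  # shape: (channel, n_mels, time)
#     augmented = masking(mel)         # SpecAugment-style frequency masking
#     print(mel.shape, augmented.shape)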
"latent_vars": 320, 31 | "layer_norm_first": true, 32 | "logit_temp": 0.1, 33 | "mask_channel_length": 10, 34 | "mask_channel_min_space": 1, 35 | "mask_channel_other": 0.0, 36 | "mask_channel_prob": 0.0, 37 | "mask_channel_selection": "static", 38 | "mask_length": 10, 39 | "mask_min_space": 1, 40 | "mask_other": 0.0, 41 | "mask_prob": 0.65, 42 | "mask_selection": "static", 43 | "negatives_from_everywhere": false, 44 | "no_mask_channel_overlap": false, 45 | "no_mask_overlap": false, 46 | "num_negatives": 100, 47 | "quantize_input": false, 48 | "quantize_targets": true, 49 | "same_quantizer": false, 50 | "target_glu": false 51 | } 52 | --------------------------------------------------------------------------------