├── .gitmodules
├── tools
│   ├── __init__.py
│   ├── setup_helpers
│   │   └── __init__.py
│   └── travis
│       └── test_script.sh
├── version.txt
├── src
│   ├── torio
│   │   ├── lib
│   │   │   └── __init__.py
│   │   ├── utils
│   │   │   └── __init__.py
│   │   ├── __init__.py
│   │   ├── io
│   │   │   └── __init__.py
│   │   └── _extension
│   │       └── __init__.py
│   ├── torchaudio
│   │   ├── lib
│   │   │   └── __init__.py
│   │   ├── prototype
│   │   │   ├── __init__.py
│   │   │   ├── datasets
│   │   │   │   └── __init__.py
│   │   │   ├── pipelines
│   │   │   │   ├── _vggish
│   │   │   │   │   └── __init__.py
│   │   │   │   └── __init__.py
│   │   │   ├── transforms
│   │   │   │   └── __init__.py
│   │   │   ├── functional
│   │   │   │   └── __init__.py
│   │   │   └── models
│   │   │       └── __init__.py
│   │   ├── pipelines
│   │   │   ├── _wav2vec2
│   │   │   │   └── __init__.py
│   │   │   └── _tts
│   │   │       └── __init__.py
│   │   ├── compliance
│   │   │   └── __init__.py
│   │   ├── models
│   │   │   ├── wav2vec2
│   │   │   │   ├── utils
│   │   │   │   │   └── __init__.py
│   │   │   │   └── __init__.py
│   │   │   ├── squim
│   │   │   │   └── __init__.py
│   │   │   └── decoder
│   │   │       └── __init__.py
│   │   ├── utils
│   │   │   ├── __init__.py
│   │   │   └── ffmpeg_utils.py
│   │   ├── _internal
│   │   │   └── __init__.py
│   │   ├── backend
│   │   │   ├── __init__.py
│   │   │   ├── common.py
│   │   │   ├── no_backend.py
│   │   │   ├── sox_io_backend.py
│   │   │   ├── soundfile_backend.py
│   │   │   └── _no_backend.py
│   │   ├── sox_effects
│   │   │   └── __init__.py
│   │   ├── io
│   │   │   └── __init__.py
│   │   ├── __init__.py
│   │   ├── datasets
│   │   │   └── __init__.py
│   │   └── transforms
│   │       └── __init__.py
│   ├── libtorchaudio
│   │   ├── iir_cuda.h
│   │   ├── utils.h
│   │   ├── rnnt
│   │   │   ├── compute_alphas.cpp
│   │   │   ├── compute_betas.cpp
│   │   │   ├── compute.h
│   │   │   ├── macros.h
│   │   │   ├── types.h
│   │   │   ├── gpu
│   │   │   │   ├── half.cuh
│   │   │   │   └── math.cuh
│   │   │   ├── cpu
│   │   │   │   └── math.h
│   │   │   └── compute.cpp
│   │   ├── forced_align
│   │   │   ├── compute.h
│   │   │   └── compute.cpp
│   │   ├── pybind
│   │   │   └── pybind.cpp
│   │   ├── sox
│   │   │   ├── CMakeLists.txt
│   │   │   ├── effects.h
│   │   │   ├── types.h
│   │   │   ├── io.h
│   │   │   └── pybind
│   │   │       └── pybind.cpp
│   │   ├── utils.cpp
│   │   └── cuctc
│   │       ├── CMakeLists.txt
│   │       └── LICENSE
│   └── libtorio
│       └── ffmpeg
│           ├── hw_context.h
│           ├── stream_reader
│           │   ├── packet_buffer.h
│           │   ├── buffer
│           │   │   ├── unchunked_buffer.h
│           │   │   ├── unchunked_buffer.cpp
│           │   │   └── chunked_buffer.h
│           │   ├── packet_buffer.cpp
│           │   └── post_process.h
│           ├── stream_writer
│           │   ├── packet_writer.h
│           │   ├── types.h
│           │   ├── encoder.h
│           │   └── packet_writer.cpp
│           └── hw_context.cpp
├── test
│   ├── integration_tests
│   │   ├── __init__.py
│   │   ├── prototype
│   │   │   └── vggish_pipeline_test.py
│   │   ├── ctc_decoder_integration_test.py
│   │   ├── tacotron2_pipeline_test.py
│   │   └── rnnt_pipeline_test.py
│   ├── torchaudio_unittest
│   │   ├── io
│   │   │   ├── __init__.py
│   │   │   └── common.py
│   │   ├── models
│   │   │   ├── __init__.py
│   │   │   ├── rnnt
│   │   │   │   ├── __init__.py
│   │   │   │   ├── rnnt_cpu_test.py
│   │   │   │   └── rnnt_gpu_test.py
│   │   │   ├── conformer
│   │   │   │   ├── __init__.py
│   │   │   │   ├── conformer_cpu_test.py
│   │   │   │   └── conformer_gpu_test.py
│   │   │   ├── decoder
│   │   │   │   └── __init__.py
│   │   │   ├── emformer
│   │   │   │   ├── __init__.py
│   │   │   │   ├── emformer_cpu_test.py
│   │   │   │   └── emformer_gpu_test.py
│   │   │   ├── hdemucs
│   │   │   │   ├── __init__.py
│   │   │   │   ├── hdemucs_cpu_test.py
│   │   │   │   └── hdemucs_gpu_test.py
│   │   │   ├── squim
│   │   │   │   └── __init__.py
│   │   │   ├── tacotron2
│   │   │   │   ├── __init__.py
│   │   │   │   ├── model_test_cpu_test.py
│   │   │   │   └── model_test_gpu_test.py
│   │   │   ├── wav2vec2
│   │   │   │   └── __init__.py
│   │   │   └── rnnt_decoder
│   │   │       ├── __init__.py
│   │   │       ├── rnnt_decoder_cpu_test.py
│   │   │       └── rnnt_decoder_gpu_test.py
│   │   ├── utils
│   │   │   └── __init__.py
│   │   ├── backend
│   │   │   ├── __init__.py
│   │   │   ├── sox_io
│   │   │   │   ├── __init__.py
│   │   │   │   └── common.py
│   │   │   ├── dispatcher
│   │   │   │   ├── __init__.py
│   │   │   │   ├── ffmpeg
│   │   │   │   │   └── __init__.py
│   │   │   │   ├── sox
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   └── common.py
│   │   │   │   └── soundfile
│   │   │   │       └── __init__.py
│   │   │   ├── soundfile
│   │   │   │   └── __init__.py
│   │   │   └── common.py
│   │   ├── compliance
│   │   │   ├── __init__.py
│   │   │   └── kaldi
│   │   │       ├── __init__.py
│   │   │       ├── kaldi_compatibility_cpu_test.py
│   │   │       └── kaldi_compatibility_cuda_test.py
│   │   ├── datasets
│   │   │   ├── __init__.py
│   │   │   └── librispeech_test.py
│   │   ├── functional
│   │   │   ├── __init__.py
│   │   │   ├── librosa_compatibility_cpu_test.py
│   │   │   ├── kaldi_compatibility_cpu_test.py
│   │   │   ├── librosa_compatibility_cuda_test.py
│   │   │   ├── autograd_cpu_test.py
│   │   │   ├── kaldi_compatibility_cuda_test.py
│   │   │   ├── torchscript_consistency_cpu_test.py
│   │   │   ├── autograd_cuda_test.py
│   │   │   ├── torchscript_consistency_cuda_test.py
│   │   │   ├── functional_cuda_test.py
│   │   │   └── kaldi_compatibility_test_impl.py
│   │   ├── prototype
│   │   │   ├── __init__.py
│   │   │   ├── datasets
│   │   │   │   └── __init__.py
│   │   │   ├── functional
│   │   │   │   ├── __init__.py
│   │   │   │   ├── librosa_compatibility_cpu_test.py
│   │   │   │   ├── librosa_compatibility_cuda_test.py
│   │   │   │   ├── autograd_cpu_test.py
│   │   │   │   ├── autograd_cuda_test.py
│   │   │   │   ├── torchscript_consistency_cuda_test.py
│   │   │   │   ├── functional_cpu_test.py
│   │   │   │   ├── functional_cuda_test.py
│   │   │   │   └── torchscript_consistency_cpu_test.py
│   │   │   ├── hifi_gan
│   │   │   │   ├── __init__.py
│   │   │   │   ├── original
│   │   │   │   │   ├── env.py
│   │   │   │   │   └── utils.py
│   │   │   │   ├── hifi_gan_cpu_test.py
│   │   │   │   └── hifi_gan_gpu_test.py
│   │   │   ├── transforms
│   │   │   │   ├── __init__.py
│   │   │   │   ├── autograd_cpu_test.py
│   │   │   │   ├── autograd_cuda_test.py
│   │   │   │   ├── librosa_compatibility_cpu_test.py
│   │   │   │   ├── librosa_compatibility_cuda_test.py
│   │   │   │   ├── transforms_cpu_test.py
│   │   │   │   └── transforms_cuda_test.py
│   │   │   ├── rnnt_cpu_test.py
│   │   │   ├── conv_emformer_cpu_test.py
│   │   │   ├── rnnt_gpu_test.py
│   │   │   ├── conv_emformer_gpu_test.py
│   │   │   └── conv_emformer_test_impl.py
│   │   ├── sox_effect
│   │   │   ├── __init__.py
│   │   │   └── common.py
│   │   ├── transforms
│   │   │   ├── __init__.py
│   │   │   ├── librosa_compatibility_cpu_test.py
│   │   │   ├── autograd_cpu_test.py
│   │   │   ├── librosa_compatibility_cuda_test.py
│   │   │   ├── autograd_cuda_test.py
│   │   │   ├── transforms_cpu_test.py
│   │   │   ├── transforms_cuda_test.py
│   │   │   ├── torchscript_consistency_cpu_test.py
│   │   │   └── torchscript_consistency_cuda_test.py
│   │   ├── example
│   │   │   ├── tacotron2
│   │   │   │   ├── __init__.py
│   │   │   │   ├── tacotron2_loss_cpu_test.py
│   │   │   │   └── tacotron2_loss_gpu_test.py
│   │   │   ├── souce_sepration
│   │   │   │   ├── __init__.py
│   │   │   │   └── metrics_test.py
│   │   │   ├── __init__.py
│   │   │   ├── hubert
│   │   │   │   └── __init__.py
│   │   │   └── emformer_rnnt
│   │   │       ├── __init__.py
│   │   │       └── utils.py
│   │   ├── assets
│   │   │   ├── VCTK-Corpus
│   │   │   │   ├── txt
│   │   │   │   │   └── p224
│   │   │   │   │       └── p224_002.txt
│   │   │   │   └── wav48
│   │   │   │       └── p224
│   │   │   │           └── p224_002.wav
│   │   │   ├── decoder
│   │   │   │   ├── tokens.txt
│   │   │   │   ├── nnlm_lex_dict.txt
│   │   │   │   ├── lexicon.txt
│   │   │   │   ├── nnlm_lexfree_dict.txt
│   │   │   │   ├── kenlm.arpa
│   │   │   │   └── kenlm_char.arpa
│   │   │   ├── sox_effect_test_fir_coeffs.txt
│   │   │   ├── mat.ark
│   │   │   ├── vec_int.ark
│   │   │   ├── sinewave.wav
│   │   │   ├── testsrc.hevc
│   │   │   ├── vec_flt.ark
│   │   │   ├── kaldi_file.wav
│   │   │   ├── mp3_without_ext
│   │   │   ├── nasa_13013.avi
│   │   │   ├── nasa_13013.mp4
│   │   │   ├── io
│   │   │   │   ├── 96k_0_1ch.opus
│   │   │   │   ├── 96k_0_2ch.opus
│   │   │   │   ├── 96k_10_1ch.opus
│   │   │   │   ├── 96k_10_2ch.opus
│   │   │   │   ├── 96k_5_1ch.opus
│   │   │   │   └── 96k_5_2ch.opus
│   │   │   ├── kaldi_file_8000.wav
│   │   │   ├── nasa_13013_no_audio.mp4
│   │   │   ├── nasa_13013_no_video.mp4
│   │   │   ├── vad-go-mono-32000.wav
│   │   │   ├── vad-go-stereo-44100.wav
│   │   │   ├── RATRACE_wave_f_nm_np1_fr_goo_37.avi
│   │   │   ├── steam-train-whistle-daniel_simon.mp3
│   │   │   ├── steam-train-whistle-daniel_simon.wav
│   │   │   ├── kaldi_test_pitch_args.jsonl
│   │   │   ├── README.md
│   │   │   └── wav2vec2
│   │   │       ├── huggingface
│   │   │       │   └── generate_huggingface_model_config.py
│   │   │       └── fairseq
│   │   │           └── xlsr_53_56k.json
│   │   ├── __init__.py
│   │   ├── common_utils
│   │   │   ├── func_utils.py
│   │   │   ├── psd_utils.py
│   │   │   ├── autograd_utils.py
│   │   │   └── kaldi_utils.py
│   │   └── kaldi_io_test.py
│   ├── smoke_test
│   │   └── smoke_test_no_ffmpeg.py
│   └── cpp
│       └── CMakeLists.txt
├── examples
│   ├── pipeline_tacotron2
│   │   └── text
│   │       └── __init__.py
│   ├── tutorials
│   │   └── README.rst
│   ├── libtorchaudio
│   │   ├── .gitignore
│   │   ├── data
│   │   │   ├── rir.wav
│   │   │   ├── input.wav
│   │   │   └── README.md
│   │   ├── augmentation
│   │   │   ├── CMakeLists.txt
│   │   │   ├── main.cpp
│   │   │   └── README.md
│   │   ├── speech_recognition
│   │   │   ├── CMakeLists.txt
│   │   │   ├── greedy_decoder.py
│   │   │   └── transcribe.cpp
│   │   ├── build.sh
│   │   ├── CMakeLists.txt
│   │   └── README.md
│   ├── avsr
│   │   ├── data_prep
│   │   │   ├── requirements.txt
│   │   │   ├── detectors
│   │   │   │   └── retinaface
│   │   │   │       └── detector.py
│   │   │   └── tools
│   │   │       └── README.md
│   │   ├── models
│   │   │   ├── conformer_rnnt.py
│   │   │   ├── emformer_rnnt.py
│   │   │   └── fusion.py
│   │   ├── average_checkpoints.py
│   │   └── schedulers.py
│   ├── hubert
│   │   ├── loss
│   │   │   └── __init__.py
│   │   ├── utils
│   │   │   └── __init__.py
│   │   └── dataset
│   │       └── __init__.py
│   ├── source_separation
│   │   ├── conv_tasnet
│   │   │   └── __init__.py
│   │   └── utils
│   │       ├── dataset
│   │       │   └── __init__.py
│   │       └── __init__.py
│   ├── self_supervised_learning
│   │   ├── losses
│   │   │   └── __init__.py
│   │   ├── lr_schedulers
│   │   │   ├── __init__.py
│   │   │   └── _linear_decay.py
│   │   ├── data_modules
│   │   │   └── __init__.py
│   │   └── README.md
│   ├── pipeline_wav2letter
│   │   ├── transforms.py
│   │   ├── ctc_decoders.py
│   │   ├── languagemodels.py
│   │   └── utils.py
│   ├── asr
│   │   └── librispeech_conformer_rnnt_biasing
│   │       ├── score.sh
│   │       └── blists
│   │           └── README.md
│   └── pipeline_wavernn
│       └── processing.py
├── .github
│   ├── pytorch-probot.yml
│   ├── scripts
│   │   ├── unittest-windows
│   │   │   ├── install_conda.bat
│   │   │   ├── environment.yml
│   │   │   ├── run_test.sh
│   │   │   ├── set_cuda_envs.sh
│   │   │   └── setup_env.sh
│   │   ├── ffmpeg
│   │   │   └── build.bat
│   │   └── unittest-linux
│   │       └── run_test.sh
│   ├── pull_request_template.md
│   ├── ISSUE_TEMPLATE
│   │   ├── config.yml
│   │   ├── documentation.yml
│   │   └── feature-request.yml
│   └── workflows
│       ├── bandit.yml
│       ├── pr-labels.yml
│       ├── integration-test.yml
│       └── lint.yml
├── docs
│   ├── source
│   │   ├── references.rst
│   │   ├── _static
│   │   │   └── img
│   │   │       ├── logo.png
│   │   │       └── favicon.ico
│   │   ├── _templates
│   │   │   └── autosummary
│   │   │       ├── class.rst
│   │   │       ├── bundle_data.rst
│   │   │       ├── io.rst
│   │   │       ├── utils.rst
│   │   │       ├── dataset_class.rst
│   │   │       ├── cuda_ctc_decoder_class.rst
│   │   │       ├── ctc_decoder_class.rst
│   │   │       └── io_class.rst
│   │   ├── libtorio.rst
│   │   ├── prototype.datasets.rst
│   │   ├── torio.utils.rst
│   │   ├── prototype.transforms.rst
│   │   ├── utils.rst
│   │   ├── io.rst
│   │   ├── torio.io.rst
│   │   ├── compliance.kaldi.rst
│   │   ├── sox_effects.rst
│   │   ├── torio.rst
│   │   ├── prototype.rst
│   │   ├── prototype.functional.rst
│   │   ├── models.decoder.rst
│   │   ├── models.rst
│   │   ├── kaldi_io.rst
│   │   ├── feature_classifications.rst
│   │   ├── datasets.rst
│   │   └── prototype.models.rst
│   ├── requirements-tutorials.txt
│   ├── post_process_dispatcher.py
│   ├── make.bat
│   ├── requirements.txt
│   └── Makefile
├── mypy.ini
├── setup.cfg
├── .gitattributes
├── packaging
│   ├── torchaudio
│   │   ├── bld.bat
│   │   └── build.sh
│   ├── vs2019
│   │   ├── meta.yaml
│   │   ├── conda_build_config.yaml
│   │   └── activate.bat
│   ├── windows
│   │   └── internal
│   │       └── driver_update.bat
│   ├── vc_env_helper.bat
│   └── cut_release.sh
├── requirements.txt
├── pyproject.toml
├── .flake8
├── CODEOWNERS
├── third_party
│   ├── LICENSES_BUNDLED.txt
│   ├── sox
│   │   └── CMakeLists.txt
│   └── ffmpeg
│       └── single
│           └── CMakeLists.txt
├── CITATION
├── .clang-tidy
├── .pre-commit-config.yaml
└── LICENSE

/.gitmodules:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tools/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/version.txt:
--------------------------------------------------------------------------------
1 | 2.2.0a0
2 |
--------------------------------------------------------------------------------
/src/torio/lib/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/torchaudio/lib/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/torchaudio/prototype/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/integration_tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/io/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/models/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/utils/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/examples/pipeline_tacotron2/text/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/torchaudio/pipelines/_wav2vec2/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/backend/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/compliance/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/datasets/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/functional/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/models/rnnt/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/prototype/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/sox_effect/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/transforms/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/.github/pytorch-probot.yml:
--------------------------------------------------------------------------------
1 | tracking_issue: 736
2 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/backend/sox_io/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/compliance/kaldi/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/models/conformer/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/models/decoder/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/models/emformer/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/models/hdemucs/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/models/squim/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/models/tacotron2/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/models/wav2vec2/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/examples/tutorials/README.rst:
--------------------------------------------------------------------------------
1 | Tutorials
2 | =========
3 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/backend/dispatcher/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/backend/soundfile/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/example/tacotron2/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/models/rnnt_decoder/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/prototype/datasets/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/prototype/functional/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/prototype/hifi_gan/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/prototype/transforms/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/backend/dispatcher/ffmpeg/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/backend/dispatcher/sox/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/example/souce_sepration/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/backend/dispatcher/soundfile/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tools/setup_helpers/__init__.py:
--------------------------------------------------------------------------------
1 | from .extension import *  # noqa
2 |
--------------------------------------------------------------------------------
/docs/source/references.rst:
--------------------------------------------------------------------------------
1 | References
2 | ----------
3 | 
4 | .. bibliography::
5 |
--------------------------------------------------------------------------------
/mypy.ini:
--------------------------------------------------------------------------------
1 | [mypy]
2 | allow_redefinition = True
3 | ignore_missing_imports = True
4 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/assets/VCTK-Corpus/txt/p224/p224_002.txt:
--------------------------------------------------------------------------------
1 | VCTK Test.
2 |
--------------------------------------------------------------------------------
/examples/libtorchaudio/.gitignore:
--------------------------------------------------------------------------------
1 | build
2 | data/output.wav
3 | *.zip
4 | output
5 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/assets/decoder/tokens.txt:
--------------------------------------------------------------------------------
1 | -
2 | |
3 | f
4 | o
5 | b
6 | a
7 | r
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [pydocstyle]
2 | select = D417 # Missing argument descriptions in the docstring
--------------------------------------------------------------------------------
/examples/avsr/data_prep/requirements.txt:
--------------------------------------------------------------------------------
1 | tqdm
2 | scikit-image
3 | opencv-python
4 | ffmpeg-python
5 |
--------------------------------------------------------------------------------
/src/torio/utils/__init__.py:
--------------------------------------------------------------------------------
1 | from . import ffmpeg_utils
2 | 
3 | 
4 | __all__ = ["ffmpeg_utils"]
5 |
--------------------------------------------------------------------------------
/test/smoke_test/smoke_test_no_ffmpeg.py:
--------------------------------------------------------------------------------
1 | from smoke_test import main
2 | 
3 | main(["--no-ffmpeg"])
4 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/assets/decoder/nnlm_lex_dict.txt:
--------------------------------------------------------------------------------
1 | |
2 | foo
3 | bar
4 | foobar
5 |
6 |
--------------------------------------------------------------------------------
/src/torchaudio/compliance/__init__.py:
--------------------------------------------------------------------------------
1 | from . import kaldi
2 | 
3 | __all__ = [
4 |     "kaldi",
5 | ]
6 |
--------------------------------------------------------------------------------
/src/torchaudio/prototype/datasets/__init__.py:
--------------------------------------------------------------------------------
1 | from .musan import Musan
2 | 
3 | 
4 | __all__ = ["Musan"]
5 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/assets/decoder/lexicon.txt:
--------------------------------------------------------------------------------
1 | foo f o o |
2 | bar b a r |
3 | foobar f o o b a r |
--------------------------------------------------------------------------------
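
The decoder assets above use the lexicon format consumed by lexicon-based CTC beam-search decoders: each line maps a word to its token-level spelling, terminated by the word-boundary token `|`. A minimal parsing sketch (illustrative only — this is not the decoder's actual loader):

    # Build {word: [tokens]} from a lexicon file such as assets/decoder/lexicon.txt.
    lexicon = {}
    with open("test/torchaudio_unittest/assets/decoder/lexicon.txt") as f:
        for line in f:
            if line.strip():
                word, *tokens = line.split()
                lexicon[word] = tokens  # e.g. "foo" -> ["f", "o", "o", "|"]
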
/test/torchaudio_unittest/assets/sox_effect_test_fir_coeffs.txt:
--------------------------------------------------------------------------------
1 | 0.0195 -0.082 0.234 0.891 -0.145 0.043
2 |
--------------------------------------------------------------------------------
/docs/source/_static/img/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wetdog/audio/main/docs/source/_static/img/logo.png
--------------------------------------------------------------------------------
/test/torchaudio_unittest/__init__.py:
--------------------------------------------------------------------------------
1 | try:
2 |     from . import fb  # noqa
3 | except Exception:
4 |     pass
5 |
--------------------------------------------------------------------------------
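
The `test/torchaudio_unittest/__init__.py` above guards its import so that an internal-only `fb` module is picked up when present and silently skipped otherwise. The same idiom in generic form (the module name `extra_hooks` is hypothetical):

    # Optional-dependency import: degrade gracefully when the module is absent.
    try:
        import extra_hooks  # internal add-on; not present in public checkouts
    except Exception:
        extra_hooks = None  # callers check for None before using it
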
/test/torchaudio_unittest/assets/decoder/nnlm_lexfree_dict.txt:
--------------------------------------------------------------------------------
1 | -
2 | |
3 | f
4 | o
5 | b
6 | a
7 | r
8 |
9 |
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | # To exclude autogenerated files from code reviews
2 | .circleci/config.yml linguist-generated=true
3 |
--------------------------------------------------------------------------------
/docs/source/_static/img/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wetdog/audio/main/docs/source/_static/img/favicon.ico
--------------------------------------------------------------------------------
/examples/libtorchaudio/data/rir.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wetdog/audio/main/examples/libtorchaudio/data/rir.wav
--------------------------------------------------------------------------------
/examples/hubert/loss/__init__.py:
--------------------------------------------------------------------------------
1 | from .hubert_loss import hubert_loss
2 | 
3 | __all__ = [
4 |     "hubert_loss",
5 | ]
6 |
--------------------------------------------------------------------------------
/examples/libtorchaudio/data/input.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wetdog/audio/main/examples/libtorchaudio/data/input.wav
--------------------------------------------------------------------------------
/examples/source_separation/conv_tasnet/__init__.py:
--------------------------------------------------------------------------------
1 | from . import train, trainer
2 | 
3 | __all__ = ["train", "trainer"]
4 |
--------------------------------------------------------------------------------
/examples/source_separation/utils/dataset/__init__.py:
--------------------------------------------------------------------------------
1 | from . import utils, wsj0mix
2 | 
3 | __all__ = ["utils", "wsj0mix"]
4 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/assets/mat.ark:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wetdog/audio/main/test/torchaudio_unittest/assets/mat.ark
--------------------------------------------------------------------------------
/test/torchaudio_unittest/assets/vec_int.ark:
--------------------------------------------------------------------------------
1 | key1 Bkey2 B   key3 B
--------------------------------------------------------------------------------
/test/torchaudio_unittest/assets/sinewave.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wetdog/audio/main/test/torchaudio_unittest/assets/sinewave.wav
--------------------------------------------------------------------------------
/test/torchaudio_unittest/assets/testsrc.hevc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wetdog/audio/main/test/torchaudio_unittest/assets/testsrc.hevc
--------------------------------------------------------------------------------
/test/torchaudio_unittest/assets/vec_flt.ark:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wetdog/audio/main/test/torchaudio_unittest/assets/vec_flt.ark
--------------------------------------------------------------------------------
/test/torchaudio_unittest/assets/kaldi_file.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wetdog/audio/main/test/torchaudio_unittest/assets/kaldi_file.wav
--------------------------------------------------------------------------------
/test/torchaudio_unittest/assets/mp3_without_ext:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wetdog/audio/main/test/torchaudio_unittest/assets/mp3_without_ext
--------------------------------------------------------------------------------
/test/torchaudio_unittest/assets/nasa_13013.avi:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wetdog/audio/main/test/torchaudio_unittest/assets/nasa_13013.avi
--------------------------------------------------------------------------------
/test/torchaudio_unittest/assets/nasa_13013.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wetdog/audio/main/test/torchaudio_unittest/assets/nasa_13013.mp4
--------------------------------------------------------------------------------
/examples/source_separation/utils/__init__.py:
--------------------------------------------------------------------------------
1 | from . import dataset, dist_utils, metrics
2 | 
3 | __all__ = ["dataset", "dist_utils", "metrics"]
4 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/assets/io/96k_0_1ch.opus:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wetdog/audio/main/test/torchaudio_unittest/assets/io/96k_0_1ch.opus
--------------------------------------------------------------------------------
/test/torchaudio_unittest/assets/io/96k_0_2ch.opus:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wetdog/audio/main/test/torchaudio_unittest/assets/io/96k_0_2ch.opus
--------------------------------------------------------------------------------
/test/torchaudio_unittest/assets/io/96k_10_1ch.opus:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wetdog/audio/main/test/torchaudio_unittest/assets/io/96k_10_1ch.opus
--------------------------------------------------------------------------------
/test/torchaudio_unittest/assets/io/96k_10_2ch.opus:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wetdog/audio/main/test/torchaudio_unittest/assets/io/96k_10_2ch.opus
--------------------------------------------------------------------------------
/test/torchaudio_unittest/assets/io/96k_5_1ch.opus:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wetdog/audio/main/test/torchaudio_unittest/assets/io/96k_5_1ch.opus
--------------------------------------------------------------------------------
/test/torchaudio_unittest/assets/io/96k_5_2ch.opus:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wetdog/audio/main/test/torchaudio_unittest/assets/io/96k_5_2ch.opus
--------------------------------------------------------------------------------
/test/torchaudio_unittest/assets/kaldi_file_8000.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wetdog/audio/main/test/torchaudio_unittest/assets/kaldi_file_8000.wav
--------------------------------------------------------------------------------
/packaging/torchaudio/bld.bat:
--------------------------------------------------------------------------------
1 | @echo off
2 | 
3 | set IS_CONDA=1
4 | 
5 | python setup.py install --single-version-externally-managed --record=record.txt
6 |
--------------------------------------------------------------------------------
/src/torchaudio/prototype/pipelines/_vggish/__init__.py:
--------------------------------------------------------------------------------
1 | from ._vggish_pipeline import VGGISH, VGGishBundle
2 | 
3 | __all__ = ["VGGISH", "VGGishBundle"]
4 |
--------------------------------------------------------------------------------
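
The `_vggish` package re-exports the `VGGISH` bundle and its `VGGishBundle` class. A hedged usage sketch, assuming the bundle follows the accessor shape of other torchaudio pipeline bundles (a `get_model()` method); consult the pipeline documentation for the exact interface:

    from torchaudio.prototype.pipelines import VGGISH

    model = VGGISH.get_model()  # assumption: get_model() as on other torchaudio bundles
    model.eval()
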
/test/torchaudio_unittest/assets/nasa_13013_no_audio.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wetdog/audio/main/test/torchaudio_unittest/assets/nasa_13013_no_audio.mp4
--------------------------------------------------------------------------------
/test/torchaudio_unittest/assets/nasa_13013_no_video.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wetdog/audio/main/test/torchaudio_unittest/assets/nasa_13013_no_video.mp4
--------------------------------------------------------------------------------
/test/torchaudio_unittest/assets/vad-go-mono-32000.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wetdog/audio/main/test/torchaudio_unittest/assets/vad-go-mono-32000.wav
--------------------------------------------------------------------------------
/test/torchaudio_unittest/assets/vad-go-stereo-44100.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wetdog/audio/main/test/torchaudio_unittest/assets/vad-go-stereo-44100.wav
--------------------------------------------------------------------------------
/.github/scripts/unittest-windows/install_conda.bat:
--------------------------------------------------------------------------------
1 | start /wait "" "%miniconda_exe%" /S /InstallationType=JustMe /RegisterPython=0 /AddToPath=0 /D=%tmp_conda%
2 |
--------------------------------------------------------------------------------
/src/torio/__init__.py:
--------------------------------------------------------------------------------
1 | from . import _extension  # noqa  # usort: skip
2 | from . import io, utils
3 | 
4 | 
5 | __all__ = [
6 |     "io",
7 |     "utils",
8 | ]
9 |
--------------------------------------------------------------------------------
/examples/self_supervised_learning/losses/__init__.py:
--------------------------------------------------------------------------------
1 | from ._hubert_loss import hubert_loss
2 | 
3 | __all__ = [
4 |     "hubert_loss",
5 |     "wav2vec2_loss",
6 | ]
7 |
--------------------------------------------------------------------------------
/examples/self_supervised_learning/lr_schedulers/__init__.py:
--------------------------------------------------------------------------------
1 | from ._linear_decay import LinearDecayLRScheduler
2 | 
3 | __all__ = [
4 |     "LinearDecayLRScheduler",
5 | ]
6 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/example/__init__.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | 
4 | 
5 | sys.path.append(os.path.join(os.path.dirname(__file__), "..", "..", "..", "examples"))
6 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/assets/RATRACE_wave_f_nm_np1_fr_goo_37.avi:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wetdog/audio/main/test/torchaudio_unittest/assets/RATRACE_wave_f_nm_np1_fr_goo_37.avi
--------------------------------------------------------------------------------
/test/torchaudio_unittest/assets/VCTK-Corpus/wav48/p224/p224_002.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wetdog/audio/main/test/torchaudio_unittest/assets/VCTK-Corpus/wav48/p224/p224_002.wav
--------------------------------------------------------------------------------
/test/torchaudio_unittest/assets/steam-train-whistle-daniel_simon.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wetdog/audio/main/test/torchaudio_unittest/assets/steam-train-whistle-daniel_simon.mp3
--------------------------------------------------------------------------------
/test/torchaudio_unittest/assets/steam-train-whistle-daniel_simon.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wetdog/audio/main/test/torchaudio_unittest/assets/steam-train-whistle-daniel_simon.wav
--------------------------------------------------------------------------------
/docs/requirements-tutorials.txt:
--------------------------------------------------------------------------------
1 | IPython
2 | deep-phonemizer
3 | boto3
4 | cython
5 | pandas
6 | librosa==0.10.0
7 | sentencepiece
8 | pandoc
9 | mir_eval
10 | pesq
11 | pystoi
--------------------------------------------------------------------------------
/examples/self_supervised_learning/data_modules/__init__.py:
--------------------------------------------------------------------------------
1 | from ._hubert_datamodule import HuBERTDataModule
2 | 
3 | __all__ = [
4 |     "HuBERTDataModule",
5 |     "Wav2Vec2DataModule",
6 | ]
7 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/example/hubert/__init__.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | 
4 | 
5 | sys.path.append(os.path.join(os.path.dirname(__file__), "..", "..", "..", "..", "examples", "hubert"))
6 |
--------------------------------------------------------------------------------
/docs/source/_templates/autosummary/class.rst:
--------------------------------------------------------------------------------
1 | ..
2 |    autogenerated from source/_templates/autosummary/class.rst
3 | 
4 | {{ name | underline }}
5 | 
6 | .. autoclass:: {{ fullname }}
7 |    :members:
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | # Minimum runtime dependencies
2 | torch
3 | 
4 | # Optional runtime dependencies
5 | kaldi_io
6 | SoundFile
7 | 
8 | # For build and test-time dependencies please refer to CONTRIBUTING.md
--------------------------------------------------------------------------------
/.github/pull_request_template.md:
--------------------------------------------------------------------------------
1 | PLEASE NOTE THAT THE TORCHAUDIO REPOSITORY IS NO LONGER ACTIVELY MONITORED. You may not get a response. For open discussions, visit https://discuss.pytorch.org/.
2 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/example/emformer_rnnt/__init__.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | 
4 | 
5 | sys.path.append(os.path.join(os.path.dirname(__file__), "..", "..", "..", "..", "examples", "asr", "emformer_rnnt"))
6 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/prototype/hifi_gan/original/env.py:
--------------------------------------------------------------------------------
1 | class AttrDict(dict):
2 |     def __init__(self, *args, **kwargs):
3 |         super(AttrDict, self).__init__(*args, **kwargs)
4 |         self.__dict__ = self
5 |
--------------------------------------------------------------------------------
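
`env.py` makes dict keys readable as attributes by pointing the instance's `__dict__` at the dict itself; the upstream HiFi-GAN training code reads its JSON config through this class. A small usage sketch (the config keys here are invented for illustration):

    config = AttrDict({"learning_rate": 2e-4, "num_mels": 80})
    assert config.num_mels == config["num_mels"]  # attribute and key access share one store
    config.batch_size = 16  # new attributes land in the underlying dict as well
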
/examples/libtorchaudio/augmentation/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | add_executable(augment main.cpp)
2 | target_link_libraries(augment "${TORCH_LIBRARIES}" "${TORCHAUDIO_LIBRARY}")
3 | set_property(TARGET augment PROPERTY CXX_STANDARD 14)
4 |
--------------------------------------------------------------------------------
/src/libtorchaudio/iir_cuda.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include 
4 | 
5 | void cuda_lfilter_core_loop(
6 |     const torch::Tensor& in,
7 |     const torch::Tensor& a_flipped,
8 |     torch::Tensor& padded_out);
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/config.yml:
--------------------------------------------------------------------------------
1 | blank_issues_enabled: true
2 | contact_links:
3 |   - name: Usage questions
4 |     url: https://discuss.pytorch.org/
5 |     about: Ask questions and discuss with other torchaudio community members
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.usort]
2 | 
3 | first_party_detection = false
4 | 
5 | [tool.black]
6 | 
7 | line-length = 120
8 | target-version = ["py38"]
9 | 
10 | [tool.ufmt]
11 | excludes = [
12 |     "examples/tutorials/",
13 | ]
14 |
--------------------------------------------------------------------------------
/src/libtorchaudio/utils.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | #include 
3 | 
4 | namespace torchaudio {
5 | bool is_rir_available();
6 | bool is_align_available();
7 | c10::optional<int64_t> cuda_version();
8 | } // namespace torchaudio
--------------------------------------------------------------------------------
/src/torchaudio/models/wav2vec2/utils/__init__.py:
--------------------------------------------------------------------------------
1 | from .import_fairseq import import_fairseq_model
2 | from .import_huggingface import import_huggingface_model
3 | 
4 | __all__ = [
5 |     "import_huggingface_model",
6 |     "import_fairseq_model",
7 | ]
8 |
--------------------------------------------------------------------------------
/src/torchaudio/utils/__init__.py:
--------------------------------------------------------------------------------
1 | from torio.utils import ffmpeg_utils
2 | 
3 | from . import sox_utils
4 | from .download import download_asset
5 | 
6 | 
7 | __all__ = [
8 |     "download_asset",
9 |     "sox_utils",
10 |     "ffmpeg_utils",
11 | ]
12 |
--------------------------------------------------------------------------------
/src/libtorio/ffmpeg/hw_context.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include 
4 | 
5 | namespace torio::io {
6 | 
7 | AVBufferRef* get_cuda_context(int index);
8 | 
9 | void clear_cuda_context_cache();
10 | 
11 | } // namespace torio::io
--------------------------------------------------------------------------------
/test/torchaudio_unittest/prototype/transforms/autograd_cpu_test.py:
--------------------------------------------------------------------------------
1 | from torchaudio_unittest.common_utils import PytorchTestCase
2 | 
3 | from .autograd_test_impl import Autograd
4 | 
5 | 
6 | class AutogradCPUTest(Autograd, PytorchTestCase):
7 |     device = "cpu"
8 |
--------------------------------------------------------------------------------
/docs/source/_templates/autosummary/bundle_data.rst:
--------------------------------------------------------------------------------
1 | ..
2 |    autogenerated from source/_templates/autosummary/bundle_data.rst
3 | 
4 | {{ name | underline }}
5 | 
6 | .. container:: py attribute
7 | 
8 |    .. autodata:: {{ fullname }}
9 |       :no-value:
--------------------------------------------------------------------------------
/docs/source/libtorio.rst:
--------------------------------------------------------------------------------
1 | libtorio
2 | ========
3 | 
4 | .. warning::
5 |    TorchAudio's C++ API is a prototype feature.
6 |    API/ABI backward compatibility is not guaranteed.
7 | 
8 | .. toctree::
9 |    libtorio.stream_reader
10 |    libtorio.stream_writer
--------------------------------------------------------------------------------
/test/torchaudio_unittest/assets/kaldi_test_pitch_args.jsonl:
--------------------------------------------------------------------------------
1 | {"sample_rate": 8000}
2 | {"sample_rate": 8000, "frames_per_chunk": 200}
3 | {"sample_rate": 8000, "frames_per_chunk": 200, "simulate_first_pass_online": true}
4 | {"sample_rate": 16000}
5 | {"sample_rate": 44100}
--------------------------------------------------------------------------------
/src/torio/io/__init__.py:
--------------------------------------------------------------------------------
1 | from ._streaming_media_decoder import StreamingMediaDecoder
2 | from ._streaming_media_encoder import CodecConfig, StreamingMediaEncoder
3 | 
4 | 
5 | __all__ = [
6 |     "StreamingMediaDecoder",
7 |     "CodecConfig",
8 |     "StreamingMediaEncoder",
9 | ]
10 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/prototype/functional/librosa_compatibility_cpu_test.py:
--------------------------------------------------------------------------------
1 | from torchaudio_unittest.common_utils import PytorchTestCase
2 | 
3 | from .librosa_compatibility_test_impl import Functional
4 | 
5 | 
6 | class TestFunctionalCPU(Functional, PytorchTestCase):
7 |     device = "cpu"
8 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/prototype/transforms/autograd_cuda_test.py:
--------------------------------------------------------------------------------
1 | from torchaudio_unittest.common_utils import PytorchTestCase, skipIfNoCuda
2 | 
3 | from .autograd_test_impl import Autograd
4 | 
5 | 
6 | @skipIfNoCuda
7 | class AutogradCUDATest(Autograd, PytorchTestCase):
8 |     device = "cuda"
9 |
--------------------------------------------------------------------------------
/src/torchaudio/prototype/transforms/__init__.py:
--------------------------------------------------------------------------------
1 | from ._transforms import BarkScale, BarkSpectrogram, ChromaScale, ChromaSpectrogram, InverseBarkScale
2 | 
3 | __all__ = [
4 |     "BarkScale",
5 |     "BarkSpectrogram",
6 |     "ChromaScale",
7 |     "ChromaSpectrogram",
8 |     "InverseBarkScale",
9 | ]
10 |
--------------------------------------------------------------------------------
/src/torchaudio/_internal/__init__.py:
--------------------------------------------------------------------------------
1 | try:
2 |     from .fb import download_url_to_file, load_state_dict_from_url
3 | except ImportError:
4 |     from torch.hub import download_url_to_file, load_state_dict_from_url
5 | 
6 | 
7 | __all__ = [
8 |     "load_state_dict_from_url",
9 |     "download_url_to_file",
10 | ]
11 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/common_utils/func_utils.py:
--------------------------------------------------------------------------------
1 | import io
2 | 
3 | import torch
4 | 
5 | 
6 | def torch_script(obj):
7 |     """TorchScript the given function or Module"""
8 |     buffer = io.BytesIO()
9 |     torch.jit.save(torch.jit.script(obj), buffer)
10 |     buffer.seek(0)
11 |     return torch.jit.load(buffer)
12 |
--------------------------------------------------------------------------------
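
`torch_script` round-trips an object through `torch.jit.save`/`torch.jit.load` on an in-memory buffer, so tests exercise serialization in addition to scripting. A usage sketch with a throwaway module:

    import torch

    class Doubler(torch.nn.Module):
        def forward(self, x):
            return x * 2

    scripted = torch_script(Doubler())  # scripted, serialized, and reloaded in one call
    assert torch.equal(scripted(torch.ones(3)), torch.full((3,), 2.0))
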
/test/torchaudio_unittest/prototype/functional/librosa_compatibility_cuda_test.py:
--------------------------------------------------------------------------------
1 | from torchaudio_unittest.common_utils import PytorchTestCase, skipIfNoCuda
2 | 
3 | from .librosa_compatibility_test_impl import Functional
4 | 
5 | 
6 | @skipIfNoCuda
7 | class TestFunctionalCUDA(Functional, PytorchTestCase):
8 |     device = "cuda"
9 |
--------------------------------------------------------------------------------
/src/torchaudio/backend/__init__.py:
--------------------------------------------------------------------------------
1 | # NOTE:
2 | # The entire `torchaudio.backend` module is deprecated.
3 | # New things should be added to `torchaudio._backend`.
4 | # Only things related to backward compatibility should be placed here.
5 | 
6 | from . import common, no_backend, soundfile_backend, sox_io_backend  # noqa
7 | 
8 | __all__ = []
9 |
--------------------------------------------------------------------------------
/.github/scripts/unittest-windows/environment.yml:
--------------------------------------------------------------------------------
1 | channels:
2 |   - defaults
3 | dependencies:
4 |   - flake8
5 |   - pytest
6 |   - pytest-cov
7 |   - codecov
8 |   - scipy >= 1.4.1
9 |   - pip
10 |   - pip:
11 |       - kaldi-io
12 |       - PySoundFile
13 |       - future
14 |       - parameterized
15 |       - dataclasses
16 |       - expecttest
17 |
--------------------------------------------------------------------------------
/src/libtorchaudio/rnnt/compute_alphas.cpp:
--------------------------------------------------------------------------------
1 | #include 
2 | 
3 | TORCH_LIBRARY_FRAGMENT(torchaudio, m) {
4 |   m.def(
5 |       "rnnt_loss_alphas(Tensor logits,"
6 |       "Tensor targets,"
7 |       "Tensor logit_lengths,"
8 |       "Tensor target_lengths,"
9 |       "int blank,"
10 |       "float clamp) -> Tensor");
11 | }
12 |
--------------------------------------------------------------------------------
/src/libtorchaudio/rnnt/compute_betas.cpp:
--------------------------------------------------------------------------------
1 | #include 
2 | 
3 | TORCH_LIBRARY_FRAGMENT(torchaudio, m) {
4 |   m.def(
5 |       "rnnt_loss_betas(Tensor logits,"
6 |       "Tensor targets,"
7 |       "Tensor logit_lengths,"
8 |       "Tensor target_lengths,"
9 |       "int blank,"
10 |       "float clamp) -> Tensor");
11 | }
12 |
--------------------------------------------------------------------------------
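
`compute_alphas.cpp` and `compute_betas.cpp` only declare operator schemas via `TORCH_LIBRARY_FRAGMENT`; the CPU/CUDA kernels are registered elsewhere in libtorchaudio. Once the compiled extension is loaded (importing torchaudio triggers this), the declared names resolve on the `torch.ops` namespace. A sketch:

    import torch
    import torchaudio  # loads the extension that registers kernels for these schemas

    op = torch.ops.torchaudio.rnnt_loss_alphas  # bound to the schema declared above
    print(op)  # calling it dispatches to the registered CPU or CUDA kernel
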
/src/torchaudio/sox_effects/__init__.py:
--------------------------------------------------------------------------------
1 | from .sox_effects import apply_effects_file, apply_effects_tensor, effect_names, init_sox_effects, shutdown_sox_effects
2 | 
3 | 
4 | __all__ = [
5 |     "init_sox_effects",
6 |     "shutdown_sox_effects",
7 |     "effect_names",
8 |     "apply_effects_tensor",
9 |     "apply_effects_file",
10 | ]
11 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/prototype/hifi_gan/original/utils.py:
--------------------------------------------------------------------------------
1 | def init_weights(m, mean=0.0, std=0.01):
2 |     classname = m.__class__.__name__
3 |     if classname.find("Conv") != -1:
4 |         m.weight.data.normal_(mean, std)
5 | 
6 | 
7 | def get_padding(kernel_size, dilation=1):
8 |     return int((kernel_size * dilation - dilation) / 2)
9 |
--------------------------------------------------------------------------------
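
`get_padding` computes `dilation * (kernel_size - 1) / 2`, the padding that keeps a convolution's output length equal to its input length for odd kernel sizes. A quick check of that property (a sketch using torch):

    import torch

    pad = get_padding(7, dilation=2)  # int((7 * 2 - 2) / 2) = 6
    conv = torch.nn.Conv1d(1, 1, kernel_size=7, dilation=2, padding=pad)
    x = torch.randn(1, 1, 100)
    assert conv(x).shape[-1] == 100  # effective kernel 13, padded 6 on each side
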
/src/libtorchaudio/forced_align/compute.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include 
4 | 
5 | std::tuple<torch::Tensor, torch::Tensor> forced_align(
6 |     const torch::Tensor& logProbs,
7 |     const torch::Tensor& targets,
8 |     const torch::Tensor& inputLengths,
9 |     const torch::Tensor& targetLengths,
10 |     const int64_t blank);
--------------------------------------------------------------------------------
/test/torchaudio_unittest/prototype/functional/autograd_cpu_test.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torchaudio_unittest.common_utils import PytorchTestCase
3 | 
4 | from .autograd_test_impl import AutogradTestImpl
5 | 
6 | 
7 | class TestAutogradCPUFloat64(AutogradTestImpl, PytorchTestCase):
8 |     dtype = torch.float64
9 |     device = torch.device("cpu")
10 |
--------------------------------------------------------------------------------
/docs/source/prototype.datasets.rst:
--------------------------------------------------------------------------------
1 | .. py:module:: torchaudio.prototype.datasets
2 | 
3 | torchaudio.prototype.datasets
4 | =============================
5 | 
6 | .. currentmodule:: torchaudio.prototype.datasets
7 | 
8 | .. autosummary::
9 |    :toctree: generated
10 |    :nosignatures:
11 |    :template: autosummary/dataset_class.rst
12 | 
13 |    Musan
--------------------------------------------------------------------------------
/examples/pipeline_wav2letter/transforms.py:
--------------------------------------------------------------------------------
1 | import torch
2 | 
3 | 
4 | class Normalize(torch.nn.Module):
5 |     def forward(self, tensor):
6 |         return (tensor - tensor.mean(-1, keepdim=True)) / tensor.std(-1, keepdim=True)
7 | 
8 | 
9 | class UnsqueezeFirst(torch.nn.Module):
10 |     def forward(self, tensor):
11 |         return tensor.unsqueeze(0)
12 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/transforms/librosa_compatibility_cpu_test.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torchaudio_unittest.common_utils import PytorchTestCase
3 | 
4 | from .librosa_compatibility_test_impl import TransformsTestBase
5 | 
6 | 
7 | class TestTransforms(TransformsTestBase, PytorchTestCase):
8 |     dtype = torch.float64
9 |     device = torch.device("cpu")
10 |
--------------------------------------------------------------------------------
/.flake8:
--------------------------------------------------------------------------------
1 | [flake8]
2 | # Note: it's recommended to use `pre-commit run -a flake8`
3 | 
4 | max-line-length = 120
5 | ignore = E203,E402,E741,W503
6 | 
7 | # Note: exclude is not honored when flake8 is executed from pre-commit.
8 | # pre-commit has a separate config
9 | exclude = build,docs/src,third_party
10 | 
11 | per-file-ignores =
12 |     examples/tutorials/*.py: E501
13 |
--------------------------------------------------------------------------------
/examples/hubert/utils/__init__.py:
--------------------------------------------------------------------------------
1 | from .common_utils import _get_id2label, _get_label2id, create_tsv
2 | from .feature_utils import dump_features
3 | from .kmeans import get_km_label, learn_kmeans
4 | 
5 | __all__ = [
6 |     "create_tsv",
7 |     "_get_id2label",
8 |     "_get_label2id",
9 |     "dump_features",
10 |     "learn_kmeans",
11 |     "get_km_label",
12 | ]
13 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/datasets/librispeech_test.py:
--------------------------------------------------------------------------------
1 | from torchaudio.datasets import librispeech
2 | from torchaudio_unittest.common_utils import TorchaudioTestCase
3 | from torchaudio_unittest.datasets.librispeech_test_impl import LibriSpeechTestMixin
4 | 
5 | 
6 | class TestLibriSpeech(LibriSpeechTestMixin, TorchaudioTestCase):
7 |     librispeech_cls = librispeech.LIBRISPEECH
8 |
--------------------------------------------------------------------------------
/test/torchaudio_unittest/prototype/transforms/librosa_compatibility_cpu_test.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torchaudio_unittest.common_utils import PytorchTestCase
3 | 
4 | from .librosa_compatibility_test_impl import TransformsTestBase
5 | 
6 | 
7 | class TestTransforms(TransformsTestBase, PytorchTestCase):
8 |     dtype = torch.float64
9 |     device = torch.device("cpu")
10 |
--------------------------------------------------------------------------------
/src/torchaudio/io/__init__.py:
--------------------------------------------------------------------------------
1 | from torio.io import CodecConfig, StreamingMediaDecoder as StreamReader, StreamingMediaEncoder as StreamWriter
2 | 
3 | from ._effector import AudioEffector
4 | from ._playback import play_audio
5 | 
6 | 
7 | __all__ = [
8 |     "AudioEffector",
9 |     "StreamReader",
10 |     "StreamWriter",
11 |     "CodecConfig",
12 |     "play_audio",
13 | ]
14 |
--------------------------------------------------------------------------------
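
`src/torchaudio/io/__init__.py` keeps the legacy names `StreamReader` and `StreamWriter` as aliases of the torio classes, so both namespaces expose the same objects. A sketch verifying the aliasing (requires a build with the torio package laid out as in this repo):

    import torio.io
    from torchaudio.io import StreamReader, StreamWriter

    assert StreamReader is torio.io.StreamingMediaDecoder  # same class, legacy name
    assert StreamWriter is torio.io.StreamingMediaEncoder
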
6 | -------------------------------------------------------------------------------- /docs/source/_templates/autosummary/io.rst: -------------------------------------------------------------------------------- 1 | .. 2 | autogenerated from source/_templates/autosummary/io.rst 3 | 4 | {{ fullname | underline }} 5 | 6 | .. autofunction:: {{ fullname }} 7 | 8 | 9 | {%- if name == "info" %} 10 | 11 | Support Structure 12 | ----------------- 13 | 14 | AudioMetaData 15 | ~~~~~~~~~~~~~ 16 | 17 | .. autoclass:: torchaudio.AudioMetaData 18 | 19 | {%- endif %} 20 | -------------------------------------------------------------------------------- /src/libtorchaudio/rnnt/compute.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include <torch/script.h> 4 | 5 | std::tuple<torch::Tensor, c10::optional<torch::Tensor>> rnnt_loss( 6 | torch::Tensor& logits, 7 | const torch::Tensor& targets, 8 | const torch::Tensor& logit_lengths, 9 | const torch::Tensor& target_lengths, 10 | int64_t blank, 11 | double clamp, 12 | bool fused_log_softmax); 13 | -------------------------------------------------------------------------------- /src/torchaudio/utils/ffmpeg_utils.py: -------------------------------------------------------------------------------- 1 | """Module to change the configuration of FFmpeg libraries (such as libavformat). 2 | 3 | It affects functionalities in :py:mod:`torchaudio.io` (and indirectly :py:func:`torchaudio.load`). 4 | """ 5 | 6 | 7 | # This file is just for BC. 8 | def __getattr__(item): 9 | from torio.utils import ffmpeg_utils 10 | 11 | return getattr(ffmpeg_utils, item) 12 | -------------------------------------------------------------------------------- /src/torio/_extension/__init__.py: -------------------------------------------------------------------------------- 1 | from .utils import _init_ffmpeg, _LazyImporter 2 | 3 | 4 | _FFMPEG_EXT = None 5 | 6 | 7 | def lazy_import_ffmpeg_ext(): 8 | """Load the FFmpeg integration in a lazy manner, based on availability.""" 9 | 10 | global _FFMPEG_EXT 11 | if _FFMPEG_EXT is None: 12 | _FFMPEG_EXT = _LazyImporter("_torio_ffmpeg", _init_ffmpeg) 13 | return _FFMPEG_EXT 14 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/models/hdemucs/hdemucs_cpu_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchaudio_unittest.common_utils import PytorchTestCase 3 | from torchaudio_unittest.models.hdemucs.hdemucs_test_impl import CompareHDemucsOriginal, HDemucsTests 4 | 5 | 6 | class HDemucsFloat32CPUTest(HDemucsTests, CompareHDemucsOriginal, PytorchTestCase): 7 | dtype = torch.float32 8 | device = torch.device("cpu") 9 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/transforms/autograd_cpu_test.py: -------------------------------------------------------------------------------- 1 | from torchaudio_unittest.common_utils import PytorchTestCase 2 | 3 | from .autograd_test_impl import AutogradTestFloat32, AutogradTestMixin 4 | 5 | 6 | class AutogradCPUTest(AutogradTestMixin, PytorchTestCase): 7 | device = "cpu" 8 | 9 | 10 | class AutogradRNNTCPUTest(AutogradTestFloat32, PytorchTestCase): 11 | device = "cpu" 12 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/transforms/librosa_compatibility_cuda_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from 
torchaudio_unittest.common_utils import PytorchTestCase, skipIfNoCuda 3 | 4 | from .librosa_compatibility_test_impl import TransformsTestBase 5 | 6 | 7 | @skipIfNoCuda 8 | class TestTransforms(TransformsTestBase, PytorchTestCase): 9 | dtype = torch.float64 10 | device = torch.device("cuda") 11 | -------------------------------------------------------------------------------- /docs/source/torio.utils.rst: -------------------------------------------------------------------------------- 1 | .. py:module:: torio.utils 2 | 3 | torio.utils 4 | =========== 5 | 6 | The ``torio.utils`` module contains utility functions to query and configure the global state of third-party libraries. 7 | 8 | .. currentmodule:: torio.utils 9 | 10 | .. autosummary:: 11 | :toctree: generated 12 | :nosignatures: 13 | :template: autosummary/utils.rst 14 | 15 | ffmpeg_utils 16 | -------------------------------------------------------------------------------- /src/libtorchaudio/rnnt/macros.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #ifdef USE_CUDA 4 | #define WARP_SIZE 32 5 | #define MAX_THREADS_PER_BLOCK 1024 6 | #define REDUCE_THREADS 256 7 | #define HOST_AND_DEVICE __host__ __device__ 8 | #define FORCE_INLINE __forceinline__ 9 | #include 10 | #include 11 | #else 12 | #define HOST_AND_DEVICE 13 | #define FORCE_INLINE inline 14 | #endif // USE_CUDA 15 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/prototype/transforms/librosa_compatibility_cuda_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchaudio_unittest.common_utils import PytorchTestCase, skipIfNoCuda 3 | 4 | from .librosa_compatibility_test_impl import TransformsTestBase 5 | 6 | 7 | @skipIfNoCuda 8 | class TestTransforms(TransformsTestBase, PytorchTestCase): 9 | dtype = torch.float64 10 | device = torch.device("cuda") 11 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/functional/librosa_compatibility_cpu_test.py: -------------------------------------------------------------------------------- 1 | from torchaudio_unittest.common_utils import PytorchTestCase 2 | 3 | from .librosa_compatibility_test_impl import Functional, FunctionalComplex 4 | 5 | 6 | class TestFunctionalCPU(Functional, PytorchTestCase): 7 | device = "cpu" 8 | 9 | 10 | class TestFunctionalComplexCPU(FunctionalComplex, PytorchTestCase): 11 | device = "cpu" 12 | -------------------------------------------------------------------------------- /src/libtorio/ffmpeg/stream_reader/packet_buffer.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include <libtorio/ffmpeg/ffmpeg.h> 3 | 4 | namespace torio { 5 | namespace io { 6 | class PacketBuffer { 7 | public: 8 | void push_packet(AVPacket* packet); 9 | std::vector<AVPacketPtr> pop_packets(); 10 | bool has_packets(); 11 | 12 | private: 13 | std::deque<AVPacketPtr> packets; 14 | }; 15 | } // namespace io 16 | } // namespace torio 17 | -------------------------------------------------------------------------------- /docs/source/prototype.transforms.rst: -------------------------------------------------------------------------------- 1 | .. py:module:: torchaudio.prototype.transforms 2 | 3 | torchaudio.prototype.transforms 4 | =============================== 5 | 6 | .. currentmodule:: torchaudio.prototype.transforms 7 | 8 | .. 
autosummary:: 9 | :toctree: generated 10 | :nosignatures: 11 | 12 | BarkScale 13 | BarkSpectrogram 14 | ChromaScale 15 | ChromaSpectrogram 16 | InverseBarkScale 17 | -------------------------------------------------------------------------------- /src/libtorchaudio/pybind/pybind.cpp: -------------------------------------------------------------------------------- 1 | #include <libtorchaudio/utils.h> 2 | #include <torch/extension.h> 3 | 4 | namespace torchaudio { 5 | namespace { 6 | 7 | PYBIND11_MODULE(_torchaudio, m) { 8 | m.def("is_rir_available", &is_rir_available, ""); 9 | m.def("is_align_available", &is_align_available, ""); 10 | m.def("cuda_version", &cuda_version, ""); 11 | } 12 | 13 | } // namespace 14 | } // namespace torchaudio 15 | -------------------------------------------------------------------------------- /docs/source/_templates/autosummary/utils.rst: -------------------------------------------------------------------------------- 1 | .. 2 | autogenerated from source/_templates/autosummary/utils.rst 3 | 4 | .. py:module:: {{ fullname }} 5 | 6 | {{ name | underline }} 7 | 8 | .. automodule:: {{fullname}} 9 | :noindex: 10 | 11 | .. currentmodule:: {{ fullname }} 12 | 13 | {%- for func in functions %} 14 | 15 | {{ func | underline("-") }} 16 | 17 | .. autofunction:: {{ func }} 18 | 19 | {%- endfor %} 20 | -------------------------------------------------------------------------------- /docs/source/utils.rst: -------------------------------------------------------------------------------- 1 | .. py:module:: torchaudio.utils 2 | 3 | torchaudio.utils 4 | ================ 5 | 6 | The ``torchaudio.utils`` module contains utility functions to configure the global state of third-party libraries. 7 | 8 | .. currentmodule:: torchaudio.utils 9 | 10 | .. autosummary:: 11 | :toctree: generated 12 | :nosignatures: 13 | :template: autosummary/utils.rst 14 | 15 | sox_utils 16 | ffmpeg_utils 17 | -------------------------------------------------------------------------------- /examples/asr/librispeech_conformer_rnnt_biasing/score.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | if [ $# -ne 1 ]; then 4 | echo "Usage: $0 <decode-dir>" 5 | exit 1 6 | fi 7 | 8 | dir=$1 # the path to the decoding dir, e.g. 
experiments/librispeech_clean100_suffix600_tcpgen500_sche30_nodrop/decode_test_clean_b10_KB1000/ 9 | sclite -r "${dir}/ref.trn.txt" trn -h "${dir}/hyp.trn.txt" trn -i rm -o all stdout > "${dir}/result.wrd.txt" 10 | -------------------------------------------------------------------------------- /src/torchaudio/models/squim/__init__.py: -------------------------------------------------------------------------------- 1 | from .objective import squim_objective_base, squim_objective_model, SquimObjective 2 | from .subjective import squim_subjective_base, squim_subjective_model, SquimSubjective 3 | 4 | __all__ = [ 5 | "squim_objective_base", 6 | "squim_objective_model", 7 | "squim_subjective_base", 8 | "squim_subjective_model", 9 | "SquimObjective", 10 | "SquimSubjective", 11 | ] 12 | -------------------------------------------------------------------------------- /CODEOWNERS: -------------------------------------------------------------------------------- 1 | # Fallback 2 | * @pytorch/team-audio-core 3 | 4 | /examples/avsr @mpc001 5 | /examples/asr @hwangjeff 6 | /examples/self_supervised_learning @nateanl 7 | /examples/dnn_beamformer @nateanl 8 | /examples/hubert @nateanl 9 | /examples/tutorials @mthrok 10 | /torchaudio @mthrok 11 | -------------------------------------------------------------------------------- /docs/source/io.rst: -------------------------------------------------------------------------------- 1 | .. py:module:: torchaudio.io 2 | 3 | torchaudio.io 4 | ============= 5 | 6 | .. currentmodule:: torchaudio.io 7 | 8 | .. autosummary:: 9 | :toctree: generated 10 | :nosignatures: 11 | :template: autosummary/io_class.rst 12 | 13 | StreamReader 14 | StreamWriter 15 | AudioEffector 16 | play_audio 17 | 18 | .. rubric:: Tutorials using ``torchaudio.io`` 19 | 20 | .. minigallery:: torchaudio.io 21 | -------------------------------------------------------------------------------- /docs/source/torio.io.rst: -------------------------------------------------------------------------------- 1 | .. py:module:: torio.io 2 | 3 | torio.io 4 | ======== 5 | 6 | .. currentmodule:: torio.io 7 | 8 | .. autosummary:: 9 | :toctree: generated 10 | :nosignatures: 11 | :template: autosummary/torio_io_class.rst 12 | 13 | StreamingMediaDecoder 14 | StreamingMediaEncoder 15 | 16 | .. rubric:: Tutorials using ``torio.io`` 17 | 18 | .. minigallery:: torio.io 19 | 20 | .. 
minigallery:: torchaudio.io 21 | -------------------------------------------------------------------------------- /examples/libtorchaudio/speech_recognition/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(transcribe transcribe.cpp) 2 | add_executable(transcribe_list transcribe_list.cpp) 3 | target_link_libraries(transcribe "${TORCH_LIBRARIES}" "${TORCHAUDIO_LIBRARY}") 4 | target_link_libraries(transcribe_list "${TORCH_LIBRARIES}" "${TORCHAUDIO_LIBRARY}") 5 | set_property(TARGET transcribe PROPERTY CXX_STANDARD 14) 6 | set_property(TARGET transcribe_list PROPERTY CXX_STANDARD 14) 7 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/models/hdemucs/hdemucs_gpu_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchaudio_unittest.common_utils import PytorchTestCase, skipIfNoCuda 3 | from torchaudio_unittest.models.hdemucs.hdemucs_test_impl import CompareHDemucsOriginal, HDemucsTests 4 | 5 | 6 | @skipIfNoCuda 7 | class HDemucsFloat32GPUTest(HDemucsTests, CompareHDemucsOriginal, PytorchTestCase): 8 | dtype = torch.float32 9 | device = torch.device("cuda") 10 | -------------------------------------------------------------------------------- /examples/libtorchaudio/data/README.md: -------------------------------------------------------------------------------- 1 | The files in this directory originate from the [VOiCES](https://iqtlabs.github.io/voices/) dataset, which is licensed under Creative Commons BY 4.0. They have been modified to fit the tutorial. 2 | 3 | * `input.wav`: `VOiCES_devkit/source-16k/train/sp0307/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav` 4 | 5 | * `rir.wav`: `VOiCES_devkit/distant-16k/room-response/rm1/impulse/Lab41-SRI-VOiCES-rm1-impulse-mc01-stu-clo.wav` 6 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/transforms/autograd_cuda_test.py: -------------------------------------------------------------------------------- 1 | from torchaudio_unittest.common_utils import PytorchTestCase, skipIfNoCuda 2 | 3 | from .autograd_test_impl import AutogradTestFloat32, AutogradTestMixin 4 | 5 | 6 | @skipIfNoCuda 7 | class AutogradCUDATest(AutogradTestMixin, PytorchTestCase): 8 | device = "cuda" 9 | 10 | 11 | @skipIfNoCuda 12 | class AutogradRNNTCUDATest(AutogradTestFloat32, PytorchTestCase): 13 | device = "cuda" 14 | -------------------------------------------------------------------------------- /.github/scripts/ffmpeg/build.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | 3 | set PROJ_FOLDER=%cd% 4 | 5 | choco install -y --no-progress msys2 --package-parameters "/NoUpdate" 6 | C:\tools\msys64\usr\bin\env MSYSTEM=MINGW64 /bin/bash -l -c "pacman -S --noconfirm --needed base-devel mingw-w64-x86_64-toolchain diffutils" 7 | C:\tools\msys64\usr\bin\env MSYSTEM=MINGW64 /bin/bash -l -c "cd ${PROJ_FOLDER} && packaging/vc_env_helper.bat bash .github/scripts/ffmpeg/build.sh" 8 | 9 | :end 10 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/functional/kaldi_compatibility_cpu_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchaudio_unittest.common_utils import PytorchTestCase 3 | 4 | from .kaldi_compatibility_test_impl import Kaldi 5 | 6 | 7 | class 
TestKaldiFloat32(Kaldi, PytorchTestCase): 8 | dtype = torch.float32 9 | device = torch.device("cpu") 10 | 11 | 12 | class TestKaldiFloat64(Kaldi, PytorchTestCase): 13 | dtype = torch.float64 14 | device = torch.device("cpu") 15 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/functional/librosa_compatibility_cuda_test.py: -------------------------------------------------------------------------------- 1 | from torchaudio_unittest.common_utils import PytorchTestCase, skipIfNoCuda 2 | 3 | from .librosa_compatibility_test_impl import Functional, FunctionalComplex 4 | 5 | 6 | @skipIfNoCuda 7 | class TestFunctionalCUDA(Functional, PytorchTestCase): 8 | device = "cuda" 9 | 10 | 11 | @skipIfNoCuda 12 | class TestFunctionalComplexCUDA(FunctionalComplex, PytorchTestCase): 13 | device = "cuda" 14 | -------------------------------------------------------------------------------- /src/libtorio/ffmpeg/stream_writer/packet_writer.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include <libtorio/ffmpeg/ffmpeg.h> 3 | 4 | namespace torio::io { 5 | class PacketWriter { 6 | AVFormatContext* format_ctx; 7 | AVStream* stream; 8 | AVRational original_time_base; 9 | 10 | public: 11 | PacketWriter( 12 | AVFormatContext* format_ctx_, 13 | const StreamParams& stream_params_); 14 | void write_packet(const AVPacketPtr& packet); 15 | }; 16 | } // namespace torio::io 17 | -------------------------------------------------------------------------------- /src/libtorchaudio/sox/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set( 2 | sources 3 | io.cpp 4 | utils.cpp 5 | effects.cpp 6 | effects_chain.cpp 7 | types.cpp 8 | ) 9 | torchaudio_library( 10 | libtorchaudio_sox 11 | "${sources}" 12 | "" 13 | "torch;sox" 14 | "" 15 | ) 16 | 17 | if (BUILD_TORCHAUDIO_PYTHON_EXTENSION) 18 | torchaudio_extension( 19 | _torchaudio_sox 20 | "pybind/pybind.cpp;" 21 | "" 22 | "libtorchaudio_sox" 23 | "" 24 | ) 25 | endif() 26 | -------------------------------------------------------------------------------- /src/torchaudio/prototype/pipelines/__init__.py: -------------------------------------------------------------------------------- 1 | from ._vggish import VGGISH, VGGishBundle 2 | from .hifigan_pipeline import HIFIGAN_VOCODER_V3_LJSPEECH, HiFiGANVocoderBundle 3 | from .rnnt_pipeline import EMFORMER_RNNT_BASE_MUSTC, EMFORMER_RNNT_BASE_TEDLIUM3 4 | 5 | __all__ = [ 6 | "EMFORMER_RNNT_BASE_MUSTC", 7 | "EMFORMER_RNNT_BASE_TEDLIUM3", 8 | "HIFIGAN_VOCODER_V3_LJSPEECH", 9 | "HiFiGANVocoderBundle", 10 | "VGGISH", 11 | "VGGishBundle", 12 | ] 13 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/compliance/kaldi/kaldi_compatibility_cpu_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchaudio_unittest import common_utils 3 | 4 | from .kaldi_compatibility_impl import Kaldi 5 | 6 | 7 | class TestKaldiFloat32(Kaldi, common_utils.PytorchTestCase): 8 | dtype = torch.float32 9 | device = torch.device("cpu") 10 | 11 | 12 | class TestKaldiFloat64(Kaldi, common_utils.PytorchTestCase): 13 | dtype = torch.float64 14 | device = torch.device("cpu") 15 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/transforms/transforms_cpu_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 
from torchaudio_unittest.common_utils import PytorchTestCase 3 | 4 | from .transforms_test_impl import TransformsTestBase 5 | 6 | 7 | class TransformsCPUFloat32Test(TransformsTestBase, PytorchTestCase): 8 | device = "cpu" 9 | dtype = torch.float32 10 | 11 | 12 | class TransformsCPUFloat64Test(TransformsTestBase, PytorchTestCase): 13 | device = "cpu" 14 | dtype = torch.float64 15 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/models/rnnt/rnnt_cpu_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchaudio_unittest.common_utils import PytorchTestCase 3 | from torchaudio_unittest.models.rnnt.rnnt_test_impl import RNNTTestImpl 4 | 5 | 6 | class RNNTFloat32CPUTest(RNNTTestImpl, PytorchTestCase): 7 | dtype = torch.float32 8 | device = torch.device("cpu") 9 | 10 | 11 | class RNNTFloat64CPUTest(RNNTTestImpl, PytorchTestCase): 12 | dtype = torch.float64 13 | device = torch.device("cpu") 14 | -------------------------------------------------------------------------------- /third_party/LICENSES_BUNDLED.txt: -------------------------------------------------------------------------------- 1 | The Torchaudio repository and source distributions bundle several libraries that are 2 | compatibly licensed. We list some here. 3 | 4 | Name: cuctc 5 | License: BSD-2-Clause (Files without specific notes) 6 | BSD-3-Clause File: 7 | torchaudio/csrc/cuctc/src/ctc_fast_divmod.cuh, 8 | Apache 2.0 Files: 9 | torchaudio/csrc/cuctc/src/bitonic_topk 10 | For details, see: cuctc/LICENSE, 11 | torchaudio/csrc/cuctc/src/bitonic_topk/LICENSE 12 | -------------------------------------------------------------------------------- /examples/libtorchaudio/build.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -eux 4 | 5 | this_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" 6 | build_dir="${this_dir}/build" 7 | 8 | mkdir -p "${build_dir}" 9 | cd "${build_dir}" 10 | 11 | git submodule update 12 | cmake -GNinja \ 13 | -DCMAKE_PREFIX_PATH="$(python -c 'import torch;print(torch.utils.cmake_prefix_path)')" \ 14 | -DBUILD_SOX=ON \ 15 | -DBUILD_KALDI=OFF \ 16 | .. 17 | cmake --build . 
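For context on the `-DCMAKE_PREFIX_PATH` flag used in the build script above: the inline `python -c` invocation resolves to PyTorch's bundled CMake package directory. A minimal sketch of what it prints (the exact path varies per installation):

```python
# Sketch: the value the build script passes to -DCMAKE_PREFIX_PATH.
import torch

# Typically resolves to <site-packages>/torch/share/cmake, which contains
# TorchConfig.cmake so that CMake's find_package(Torch) succeeds.
print(torch.utils.cmake_prefix_path)
```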
18 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/functional/autograd_cpu_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchaudio_unittest import common_utils 3 | 4 | from .autograd_impl import Autograd, AutogradFloat32 5 | 6 | 7 | class TestAutogradLfilterCPU(Autograd, common_utils.PytorchTestCase): 8 | dtype = torch.float64 9 | device = torch.device("cpu") 10 | 11 | 12 | class TestAutogradRNNTCPU(AutogradFloat32, common_utils.PytorchTestCase): 13 | dtype = torch.float32 14 | device = torch.device("cpu") 15 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/prototype/hifi_gan/hifi_gan_cpu_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchaudio_unittest.common_utils import PytorchTestCase 3 | 4 | from .hifi_gan_test_impl import HiFiGANTestImpl 5 | 6 | 7 | class HiFiGANFloat32CPUTest(HiFiGANTestImpl, PytorchTestCase): 8 | dtype = torch.float32 9 | device = torch.device("cpu") 10 | 11 | 12 | class HiFiGANFloat64CPUTest(HiFiGANTestImpl, PytorchTestCase): 13 | dtype = torch.float64 14 | device = torch.device("cpu") 15 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/backend/sox_io/common.py: -------------------------------------------------------------------------------- 1 | def name_func(func, _, params): 2 | return f'{func.__name__}_{"_".join(str(arg) for arg in params.args)}' 3 | 4 | 5 | def get_enc_params(dtype): 6 | if dtype == "float32": 7 | return "PCM_F", 32 8 | if dtype == "int32": 9 | return "PCM_S", 32 10 | if dtype == "int16": 11 | return "PCM_S", 16 12 | if dtype == "uint8": 13 | return "PCM_U", 8 14 | raise ValueError(f"Unexpected dtype: {dtype}") 15 | -------------------------------------------------------------------------------- /src/libtorchaudio/rnnt/types.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | namespace torchaudio { 4 | namespace rnnt { 5 | 6 | enum status_t { 7 | SUCCESS = 0, 8 | FAILURE = 1, 9 | COMPUTE_DENOMINATOR_REDUCE_MAX_FAILED = 2, 10 | COMPUTE_DENOMINATOR_REDUCE_SUM_FAILED = 3, 11 | COMPUTE_LOG_PROBS_FAILED = 4, 12 | COMPUTE_ALPHAS_BETAS_COSTS_FAILED = 5, 13 | COMPUTE_GRADIENTS_FAILED = 6 14 | }; 15 | 16 | enum device_t { UNDEFINED = 0, CPU = 1, GPU = 2 }; 17 | 18 | } // namespace rnnt 19 | } // namespace torchaudio 20 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/backend/dispatcher/sox/common.py: -------------------------------------------------------------------------------- 1 | def name_func(func, _, params): 2 | return f'{func.__name__}_{"_".join(str(arg) for arg in params.args)}' 3 | 4 | 5 | def get_enc_params(dtype): 6 | if dtype == "float32": 7 | return "PCM_F", 32 8 | if dtype == "int32": 9 | return "PCM_S", 32 10 | if dtype == "int16": 11 | return "PCM_S", 16 12 | if dtype == "uint8": 13 | return "PCM_U", 8 14 | raise ValueError(f"Unexpected dtype: {dtype}") 15 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/prototype/transforms/transforms_cpu_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchaudio_unittest.common_utils import PytorchTestCase 3 | 4 | from .transforms_test_impl 
import TransformsTestImpl 5 | 6 | 7 | class TransformsFloat32CPUTest(TransformsTestImpl, PytorchTestCase): 8 | dtype = torch.float32 9 | device = torch.device("cpu") 10 | 11 | 12 | class TransformsFloat64CPUTest(TransformsTestImpl, PytorchTestCase): 13 | dtype = torch.float64 14 | device = torch.device("cpu") 15 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/functional/kaldi_compatibility_cuda_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchaudio_unittest.common_utils import PytorchTestCase, skipIfNoCuda 3 | 4 | from .kaldi_compatibility_test_impl import Kaldi 5 | 6 | 7 | @skipIfNoCuda 8 | class TestKaldiFloat32(Kaldi, PytorchTestCase): 9 | dtype = torch.float32 10 | device = torch.device("cuda") 11 | 12 | 13 | @skipIfNoCuda 14 | class TestKaldiFloat64(Kaldi, PytorchTestCase): 15 | dtype = torch.float64 16 | device = torch.device("cuda") 17 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/models/emformer/emformer_cpu_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchaudio_unittest.common_utils import PytorchTestCase 3 | from torchaudio_unittest.models.emformer.emformer_test_impl import EmformerTestImpl 4 | 5 | 6 | class EmformerFloat32CPUTest(EmformerTestImpl, PytorchTestCase): 7 | dtype = torch.float32 8 | device = torch.device("cpu") 9 | 10 | 11 | class EmformerFloat64CPUTest(EmformerTestImpl, PytorchTestCase): 12 | dtype = torch.float64 13 | device = torch.device("cpu") 14 | -------------------------------------------------------------------------------- /src/torchaudio/backend/common.py: -------------------------------------------------------------------------------- 1 | def __getattr__(name: str): 2 | if name == "AudioMetaData": 3 | import warnings 4 | 5 | warnings.warn( 6 | "`torchaudio.backend.common.AudioMetaData` has been moved to " 7 | "`torchaudio.AudioMetaData`. 
Please update the import path.", 8 | stacklevel=2, 9 | ) 10 | from torchaudio import AudioMetaData 11 | 12 | return AudioMetaData 13 | raise AttributeError(f"module {__name__!r} has no attribute {name!r}") 14 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/prototype/rnnt_cpu_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchaudio_unittest.common_utils import PytorchTestCase 3 | from torchaudio_unittest.prototype.rnnt_test_impl import ConformerRNNTTestImpl 4 | 5 | 6 | class ConformerRNNTFloat32CPUTest(ConformerRNNTTestImpl, PytorchTestCase): 7 | dtype = torch.float32 8 | device = torch.device("cpu") 9 | 10 | 11 | class ConformerRNNTFloat64CPUTest(ConformerRNNTTestImpl, PytorchTestCase): 12 | dtype = torch.float64 13 | device = torch.device("cpu") 14 | -------------------------------------------------------------------------------- /src/torchaudio/pipelines/_tts/__init__.py: -------------------------------------------------------------------------------- 1 | from .impl import ( 2 | TACOTRON2_GRIFFINLIM_CHAR_LJSPEECH, 3 | TACOTRON2_GRIFFINLIM_PHONE_LJSPEECH, 4 | TACOTRON2_WAVERNN_CHAR_LJSPEECH, 5 | TACOTRON2_WAVERNN_PHONE_LJSPEECH, 6 | ) 7 | from .interface import Tacotron2TTSBundle 8 | 9 | 10 | __all__ = [ 11 | "Tacotron2TTSBundle", 12 | "TACOTRON2_GRIFFINLIM_CHAR_LJSPEECH", 13 | "TACOTRON2_GRIFFINLIM_PHONE_LJSPEECH", 14 | "TACOTRON2_WAVERNN_CHAR_LJSPEECH", 15 | "TACOTRON2_WAVERNN_PHONE_LJSPEECH", 16 | ] 17 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/models/conformer/conformer_cpu_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchaudio_unittest.common_utils import PytorchTestCase 3 | from torchaudio_unittest.models.conformer.conformer_test_impl import ConformerTestImpl 4 | 5 | 6 | class ConformerFloat32CPUTest(ConformerTestImpl, PytorchTestCase): 7 | dtype = torch.float32 8 | device = torch.device("cpu") 9 | 10 | 11 | class ConformerFloat64CPUTest(ConformerTestImpl, PytorchTestCase): 12 | dtype = torch.float64 13 | device = torch.device("cpu") 14 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/transforms/transforms_cuda_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchaudio_unittest.common_utils import PytorchTestCase, skipIfNoCuda 3 | 4 | from .transforms_test_impl import TransformsTestBase 5 | 6 | 7 | @skipIfNoCuda 8 | class TransformsCUDAFloat32Test(TransformsTestBase, PytorchTestCase): 9 | device = "cuda" 10 | dtype = torch.float32 11 | 12 | 13 | @skipIfNoCuda 14 | class TransformsCUDAFloat64Test(TransformsTestBase, PytorchTestCase): 15 | device = "cuda" 16 | dtype = torch.float64 17 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/compliance/kaldi/kaldi_compatibility_cuda_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchaudio_unittest import common_utils 3 | 4 | from .kaldi_compatibility_impl import Kaldi 5 | 6 | 7 | @common_utils.skipIfNoCuda 8 | class TestKaldiFloat32(Kaldi, common_utils.PytorchTestCase): 9 | dtype = torch.float32 10 | device = torch.device("cuda") 11 | 12 | 13 | @common_utils.skipIfNoCuda 14 | class TestKaldiFloat64(Kaldi, 
common_utils.PytorchTestCase): 15 | dtype = torch.float64 16 | device = torch.device("cuda") 17 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/functional/torchscript_consistency_cpu_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchaudio_unittest.common_utils import PytorchTestCase 3 | 4 | from .torchscript_consistency_impl import Functional, FunctionalFloat32Only 5 | 6 | 7 | class TestFunctionalFloat32(Functional, FunctionalFloat32Only, PytorchTestCase): 8 | dtype = torch.float32 9 | device = torch.device("cpu") 10 | 11 | 12 | class TestFunctionalFloat64(Functional, PytorchTestCase): 13 | dtype = torch.float64 14 | device = torch.device("cpu") 15 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/models/rnnt/rnnt_gpu_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchaudio_unittest.common_utils import PytorchTestCase, skipIfNoCuda 3 | from torchaudio_unittest.models.rnnt.rnnt_test_impl import RNNTTestImpl 4 | 5 | 6 | @skipIfNoCuda 7 | class RNNTFloat32GPUTest(RNNTTestImpl, PytorchTestCase): 8 | dtype = torch.float32 9 | device = torch.device("cuda") 10 | 11 | 12 | @skipIfNoCuda 13 | class RNNTFloat64GPUTest(RNNTTestImpl, PytorchTestCase): 14 | dtype = torch.float64 15 | device = torch.device("cuda") 16 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/prototype/conv_emformer_cpu_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchaudio_unittest.common_utils import PytorchTestCase 3 | from torchaudio_unittest.prototype.conv_emformer_test_impl import ConvEmformerTestImpl 4 | 5 | 6 | class ConvEmformerFloat32CPUTest(ConvEmformerTestImpl, PytorchTestCase): 7 | dtype = torch.float32 8 | device = torch.device("cpu") 9 | 10 | 11 | class ConvEmformerFloat64CPUTest(ConvEmformerTestImpl, PytorchTestCase): 12 | dtype = torch.float64 13 | device = torch.device("cpu") 14 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/transforms/torchscript_consistency_cpu_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchaudio_unittest.common_utils import PytorchTestCase 3 | 4 | from .torchscript_consistency_impl import Transforms, TransformsFloat32Only 5 | 6 | 7 | class TestTransformsFloat32(Transforms, TransformsFloat32Only, PytorchTestCase): 8 | dtype = torch.float32 9 | device = torch.device("cpu") 10 | 11 | 12 | class TestTransformsFloat64(Transforms, PytorchTestCase): 13 | dtype = torch.float64 14 | device = torch.device("cpu") 15 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/prototype/hifi_gan/hifi_gan_gpu_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchaudio_unittest.common_utils import PytorchTestCase, skipIfNoCuda 3 | 4 | from .hifi_gan_test_impl import HiFiGANTestImpl 5 | 6 | 7 | @skipIfNoCuda 8 | class HiFiGANFloat32GPUTest(HiFiGANTestImpl, PytorchTestCase): 9 | dtype = torch.float32 10 | device = torch.device("cuda") 11 | 12 | 13 | @skipIfNoCuda 14 | class HiFiGANFloat64GPUTest(HiFiGANTestImpl, PytorchTestCase): 15 | dtype = torch.float64 16 | device = 
torch.device("cuda") 17 | -------------------------------------------------------------------------------- /examples/asr/librispeech_conformer_rnnt_biasing/blists/README.md: -------------------------------------------------------------------------------- 1 | This is the default directory where rare word list files should be found. 2 | 3 | To train or evaluate a model, please download the following files, and save them here. 4 | 5 | - [`rareword_f15.txt`](https://download.pytorch.org/torchaudio/pipeline-assets/tcpgen/rareword_f15.txt) 6 | - [`rareword_f30.txt`](https://download.pytorch.org/torchaudio/pipeline-assets/tcpgen/rareword_f30.txt) 7 | - [`all_rare_words.txt`](https://download.pytorch.org/torchaudio/pipeline-assets/tcpgen/all_rare_words.txt) 8 | -------------------------------------------------------------------------------- /examples/pipeline_wav2letter/ctc_decoders.py: -------------------------------------------------------------------------------- 1 | from torch import topk 2 | 3 | 4 | class GreedyDecoder: 5 | def __call__(self, outputs): 6 | """Greedy Decoder. Returns highest probability of class labels for each timestep 7 | 8 | Args: 9 | outputs (torch.Tensor): shape (input length, batch size, number of classes (including blank)) 10 | 11 | Returns: 12 | torch.Tensor: class labels per time step. 13 | """ 14 | _, indices = topk(outputs, k=1, dim=-1) 15 | return indices[..., 0] 16 | -------------------------------------------------------------------------------- /src/torchaudio/backend/no_backend.py: -------------------------------------------------------------------------------- 1 | def __getattr__(name: str): 2 | import warnings 3 | 4 | warnings.warn( 5 | "Torchaudio's I/O functions now support par-call bakcend dispatch. " 6 | "Importing backend implementation directly is no longer guaranteed to work. " 7 | "Please use `backend` keyword with load/save/info function, instead of " 8 | "calling the udnerlying implementation directly.", 9 | stacklevel=2, 10 | ) 11 | 12 | from . 
import _no_backend 13 | 14 | return getattr(_no_backend, name) 15 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/functional/autograd_cuda_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchaudio_unittest import common_utils 3 | 4 | from .autograd_impl import Autograd, AutogradFloat32 5 | 6 | 7 | @common_utils.skipIfNoCuda 8 | class TestAutogradLfilterCUDA(Autograd, common_utils.PytorchTestCase): 9 | dtype = torch.float64 10 | device = torch.device("cuda") 11 | 12 | 13 | @common_utils.skipIfNoCuda 14 | class TestAutogradRNNTCUDA(AutogradFloat32, common_utils.PytorchTestCase): 15 | dtype = torch.float32 16 | device = torch.device("cuda") 17 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/models/rnnt_decoder/rnnt_decoder_cpu_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchaudio_unittest.common_utils import PytorchTestCase 3 | from torchaudio_unittest.models.rnnt_decoder.rnnt_decoder_test_impl import RNNTBeamSearchTestImpl 4 | 5 | 6 | class RNNTBeamSearchFloat32CPUTest(RNNTBeamSearchTestImpl, PytorchTestCase): 7 | dtype = torch.float32 8 | device = torch.device("cpu") 9 | 10 | 11 | class RNNTBeamSearchFloat64CPUTest(RNNTBeamSearchTestImpl, PytorchTestCase): 12 | dtype = torch.float64 13 | device = torch.device("cpu") 14 | -------------------------------------------------------------------------------- /.github/scripts/unittest-windows/run_test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -euxo pipefail 4 | 5 | eval "$(./conda/Scripts/conda.exe 'shell.bash' 'hook')" 6 | conda activate ./env 7 | 8 | this_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" 9 | source "$this_dir/set_cuda_envs.sh" 10 | 11 | python -m torch.utils.collect_env 12 | env | grep TORCHAUDIO || true 13 | 14 | cd test 15 | pytest --continue-on-collection-errors --cov=torchaudio --junitxml=${RUNNER_TEST_RESULTS_DIR}/junit.xml -v --durations 20 torchaudio_unittest 16 | coverage html 17 | -------------------------------------------------------------------------------- /examples/hubert/dataset/__init__.py: -------------------------------------------------------------------------------- 1 | from .hubert_dataset import ( 2 | _get_lengths_librilightlimited, 3 | _get_lengths_librispeech, 4 | BucketizeBatchSampler, 5 | CollateFnHubert, 6 | CollateFnLibriLightLimited, 7 | DistributedBatchSampler, 8 | HuBERTDataSet, 9 | ) 10 | 11 | 12 | __all__ = [ 13 | "_get_lengths_librilightlimited", 14 | "_get_lengths_librispeech", 15 | "BucketizeBatchSampler", 16 | "CollateFnHubert", 17 | "CollateFnLibriLightLimited", 18 | "DistributedBatchSampler", 19 | "HuBERTDataSet", 20 | ] 21 | -------------------------------------------------------------------------------- /src/torchaudio/backend/sox_io_backend.py: -------------------------------------------------------------------------------- 1 | def __getattr__(name: str): 2 | import warnings 3 | 4 | warnings.warn( 5 | "Torchaudio's I/O functions now support per-call backend dispatch. " 6 | "Importing backend implementation directly is no longer guaranteed to work. " 7 | "Please use the `backend` keyword with the load/save/info functions, instead of " 8 | "calling the underlying implementation directly.", 9 | stacklevel=2, 10 | ) 11 | 12 | from . 
import _sox_io_backend 13 | 14 | return getattr(_sox_io_backend, name) 15 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/prototype/transforms/transforms_cuda_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchaudio_unittest.common_utils import PytorchTestCase, skipIfNoCuda 3 | 4 | from .transforms_test_impl import TransformsTestImpl 5 | 6 | 7 | @skipIfNoCuda 8 | class TransformsFloat32CUDATest(TransformsTestImpl, PytorchTestCase): 9 | dtype = torch.float32 10 | device = torch.device("cuda") 11 | 12 | 13 | @skipIfNoCuda 14 | class TransformsFloat64CUDATest(TransformsTestImpl, PytorchTestCase): 15 | dtype = torch.float64 16 | device = torch.device("cuda") 17 | -------------------------------------------------------------------------------- /src/libtorio/ffmpeg/stream_writer/types.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | namespace torio::io { 3 | 4 | struct CodecConfig { 5 | int bit_rate = -1; 6 | int compression_level = -1; 7 | 8 | // qscale corresponds to ffmpeg CLI's qscale. 9 | // Example: MP3 10 | // https://trac.ffmpeg.org/wiki/Encode/MP3 11 | // This should be set like 12 | // https://github.com/FFmpeg/FFmpeg/blob/n4.3.2/fftools/ffmpeg_opt.c#L1550 13 | const c10::optional<int> qscale = -1; 14 | 15 | // video 16 | int gop_size = -1; 17 | int max_b_frames = -1; 18 | }; 19 | } // namespace torio::io 20 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/models/emformer/emformer_gpu_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchaudio_unittest.common_utils import PytorchTestCase, skipIfNoCuda 3 | from torchaudio_unittest.models.emformer.emformer_test_impl import EmformerTestImpl 4 | 5 | 6 | @skipIfNoCuda 7 | class EmformerFloat32GPUTest(EmformerTestImpl, PytorchTestCase): 8 | dtype = torch.float32 9 | device = torch.device("cuda") 10 | 11 | 12 | @skipIfNoCuda 13 | class EmformerFloat64GPUTest(EmformerTestImpl, PytorchTestCase): 14 | dtype = torch.float64 15 | device = torch.device("cuda") 16 | -------------------------------------------------------------------------------- /.github/workflows/bandit.yml: -------------------------------------------------------------------------------- 1 | # GitHub Actions Bandit Workflow 2 | 3 | name: Bandit 4 | 5 | on: 6 | pull_request: 7 | branches: [ main ] 8 | 9 | workflow_dispatch: 10 | 11 | jobs: 12 | build: 13 | runs-on: ubuntu-latest 14 | 15 | steps: 16 | - uses: actions/checkout@v2 17 | 18 | # Task will fail if any high-severity issues are found 19 | # Ignoring submodules 20 | - name: Run Bandit Security Analysis 21 | run: | 22 | python -m pip install bandit 23 | python -m bandit -r . 
-x ./third_party -lll 24 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/prototype/rnnt_gpu_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchaudio_unittest.common_utils import PytorchTestCase, skipIfNoCuda 3 | from torchaudio_unittest.prototype.rnnt_test_impl import ConformerRNNTTestImpl 4 | 5 | 6 | @skipIfNoCuda 7 | class ConformerRNNTFloat32GPUTest(ConformerRNNTTestImpl, PytorchTestCase): 8 | dtype = torch.float32 9 | device = torch.device("cuda") 10 | 11 | 12 | @skipIfNoCuda 13 | class ConformerRNNTFloat64GPUTest(ConformerRNNTTestImpl, PytorchTestCase): 14 | dtype = torch.float64 15 | device = torch.device("cuda") 16 | -------------------------------------------------------------------------------- /docs/source/compliance.kaldi.rst: -------------------------------------------------------------------------------- 1 | .. py:module:: torchaudio.compliance.kaldi 2 | 3 | torchaudio.compliance.kaldi 4 | =========================== 5 | 6 | .. currentmodule:: torchaudio.compliance.kaldi 7 | 8 | The useful processing operations of kaldi_ can be performed with torchaudio. 9 | Various functions with identical parameters are given so that torchaudio can 10 | produce similar outputs. 11 | 12 | .. _kaldi: https://github.com/kaldi-asr/kaldi 13 | 14 | .. autosummary:: 15 | :toctree: generated 16 | :nosignatures: 17 | 18 | spectrogram 19 | fbank 20 | mfcc 21 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/models/conformer/conformer_gpu_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchaudio_unittest.common_utils import PytorchTestCase, skipIfNoCuda 3 | from torchaudio_unittest.models.conformer.conformer_test_impl import ConformerTestImpl 4 | 5 | 6 | @skipIfNoCuda 7 | class ConformerFloat32GPUTest(ConformerTestImpl, PytorchTestCase): 8 | dtype = torch.float32 9 | device = torch.device("cuda") 10 | 11 | 12 | @skipIfNoCuda 13 | class ConformerFloat64GPUTest(ConformerTestImpl, PytorchTestCase): 14 | dtype = torch.float64 15 | device = torch.device("cuda") 16 | -------------------------------------------------------------------------------- /src/torchaudio/backend/soundfile_backend.py: -------------------------------------------------------------------------------- 1 | def __getattr__(name: str): 2 | import warnings 3 | 4 | warnings.warn( 5 | "Torchaudio's I/O functions now support per-call backend dispatch. " 6 | "Importing backend implementation directly is no longer guaranteed to work. 
" 7 | "Please use `backend` keyword with load/save/info function, instead of " 8 | "calling the udnerlying implementation directly.", 9 | stacklevel=2, 10 | ) 11 | 12 | from torchaudio._backend import soundfile_backend 13 | 14 | return getattr(soundfile_backend, name) 15 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/functional/torchscript_consistency_cuda_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchaudio_unittest.common_utils import PytorchTestCase, skipIfNoCuda 3 | 4 | from .torchscript_consistency_impl import Functional, FunctionalFloat32Only 5 | 6 | 7 | @skipIfNoCuda 8 | class TestFunctionalFloat32(Functional, FunctionalFloat32Only, PytorchTestCase): 9 | dtype = torch.float32 10 | device = torch.device("cuda") 11 | 12 | 13 | @skipIfNoCuda 14 | class TestFunctionalFloat64(Functional, PytorchTestCase): 15 | dtype = torch.float64 16 | device = torch.device("cuda") 17 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/prototype/conv_emformer_gpu_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchaudio_unittest.common_utils import PytorchTestCase, skipIfNoCuda 3 | from torchaudio_unittest.prototype.conv_emformer_test_impl import ConvEmformerTestImpl 4 | 5 | 6 | @skipIfNoCuda 7 | class ConvEmformerFloat32GPUTest(ConvEmformerTestImpl, PytorchTestCase): 8 | dtype = torch.float32 9 | device = torch.device("cuda") 10 | 11 | 12 | @skipIfNoCuda 13 | class ConvEmformerFloat64GPUTest(ConvEmformerTestImpl, PytorchTestCase): 14 | dtype = torch.float64 15 | device = torch.device("cuda") 16 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/transforms/torchscript_consistency_cuda_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchaudio_unittest.common_utils import PytorchTestCase, skipIfNoCuda 3 | 4 | from .torchscript_consistency_impl import Transforms, TransformsFloat32Only 5 | 6 | 7 | @skipIfNoCuda 8 | class TestTransformsFloat32(Transforms, TransformsFloat32Only, PytorchTestCase): 9 | dtype = torch.float32 10 | device = torch.device("cuda") 11 | 12 | 13 | @skipIfNoCuda 14 | class TestTransformsFloat64(Transforms, PytorchTestCase): 15 | dtype = torch.float64 16 | device = torch.device("cuda") 17 | -------------------------------------------------------------------------------- /packaging/torchaudio/build.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -ex 3 | 4 | torch_cuda_version=$(python -c "import torch; print(torch.version.cuda)") 5 | echo "torch.cuda.version is $torch_cuda_version" 6 | 7 | echo USE_CUDA is "$USE_CUDA" 8 | 9 | shopt -s nocasematch 10 | if [ "${USE_CUDA}" == "1" ] ; then 11 | if [ "$torch_cuda_version" == "None" ]; then 12 | echo "We want to build torch auido with cuda but the installed pytorch isn't with cuda" 13 | exit 1 14 | fi 15 | fi 16 | shopt -u nocasematch 17 | python setup.py install --single-version-externally-managed --record=record.txt 18 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/models/rnnt_decoder/rnnt_decoder_gpu_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from 
torchaudio_unittest.common_utils import PytorchTestCase, skipIfNoCuda 3 | from torchaudio_unittest.models.rnnt_decoder.rnnt_decoder_test_impl import RNNTBeamSearchTestImpl 4 | 5 | 6 | @skipIfNoCuda 7 | class RNNTBeamSearchFloat32GPUTest(RNNTBeamSearchTestImpl, PytorchTestCase): 8 | dtype = torch.float32 9 | device = torch.device("cuda") 10 | 11 | 12 | @skipIfNoCuda 13 | class RNNTBeamSearchFloat64GPUTest(RNNTBeamSearchTestImpl, PytorchTestCase): 14 | dtype = torch.float64 15 | device = torch.device("cuda") 16 | -------------------------------------------------------------------------------- /src/libtorchaudio/utils.cpp: -------------------------------------------------------------------------------- 1 | #include <libtorchaudio/utils.h> 2 | #include <torch/script.h> 3 | 4 | #ifdef USE_CUDA 5 | #include <cuda.h> 6 | #endif 7 | 8 | namespace torchaudio { 9 | 10 | bool is_rir_available() { 11 | #ifdef INCLUDE_RIR 12 | return true; 13 | #else 14 | return false; 15 | #endif 16 | } 17 | 18 | bool is_align_available() { 19 | #ifdef INCLUDE_ALIGN 20 | return true; 21 | #else 22 | return false; 23 | #endif 24 | } 25 | 26 | c10::optional<int64_t> cuda_version() { 27 | #ifdef USE_CUDA 28 | return CUDA_VERSION; 29 | #else 30 | return {}; 31 | #endif 32 | } 33 | 34 | } // namespace torchaudio 35 | -------------------------------------------------------------------------------- /docs/post_process_dispatcher.py: -------------------------------------------------------------------------------- 1 | """Replaces every instance of 'torchaudio._backend' with 'torchaudio' in torchaudio.html. 2 | Temporary hack while we maintain both the existing set of info/load/save functions and the 3 | new ones backed by the backend dispatcher in torchaudio._backend. 4 | """ 5 | import sys 6 | 7 | if __name__ == "__main__": 8 | build_dir = sys.argv[1] 9 | filepath = f"{build_dir}/html/torchaudio.html" 10 | 11 | with open(filepath, "r") as f: 12 | text = f.read() 13 | text = text.replace("torchaudio._backend", "torchaudio") 14 | 15 | with open(filepath, "w") as f: 16 | f.write(text) 17 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/io/common.py: -------------------------------------------------------------------------------- 1 | import torchaudio 2 | 3 | 4 | # If FFmpeg is 4.1 or older, 5 | # tests that check the number of output samples from OPUS fail. 6 | # They work on 4.2+. 7 | # Probably this commit fixed it: 8 | # https://github.com/FFmpeg/FFmpeg/commit/18aea7bdd96b320a40573bccabea56afeccdd91c 9 | def lt42(): 10 | ver = torchaudio.utils.ffmpeg_utils.get_versions()["libavcodec"] 11 | # 5.1 libavcodec 59. 18.100 12 | # 4.4 libavcodec 58.134.100 13 | # 4.3 libavcodec 58. 91.100 14 | # 4.2 libavcodec 58. 54.100 15 | # 4.1 libavcodec 58. 
35.100 16 | return ver[0] < 59 and ver[1] < 54 17 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/prototype/functional/torchscript_consistency_cuda_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchaudio_unittest.common_utils import PytorchTestCase, skipIfNoCuda 3 | 4 | from .torchscript_consistency_test_impl import TorchScriptConsistencyTestImpl 5 | 6 | 7 | @skipIfNoCuda 8 | class TorchScriptConsistencyCUDAFloat32Test(TorchScriptConsistencyTestImpl, PytorchTestCase): 9 | dtype = torch.float32 10 | device = torch.device("cuda") 11 | 12 | 13 | @skipIfNoCuda 14 | class TorchScriptConsistencyCUDAFloat64Test(TorchScriptConsistencyTestImpl, PytorchTestCase): 15 | dtype = torch.float64 16 | device = torch.device("cuda") 17 | -------------------------------------------------------------------------------- /docs/source/sox_effects.rst: -------------------------------------------------------------------------------- 1 | .. py:module:: torchaudio.sox_effects 2 | 3 | torchaudio.sox_effects 4 | ====================== 5 | 6 | .. currentmodule:: torchaudio.sox_effects 7 | 8 | Applying effects 9 | ---------------- 10 | 11 | Apply a SoX effects chain to a torch.Tensor, or to a file, loading the result as a torch.Tensor. 12 | 13 | .. autosummary:: 14 | :toctree: generated 15 | :nosignatures: 16 | 17 | apply_effects_tensor 18 | apply_effects_file 19 | 20 | .. minigallery:: torchaudio.sox_effects.apply_effects_tensor 21 | 22 | Utilities 23 | --------- 24 | 25 | .. autosummary:: 26 | :toctree: generated 27 | :nosignatures: 28 | 29 | effect_names 30 | -------------------------------------------------------------------------------- /docs/source/torio.rst: -------------------------------------------------------------------------------- 1 | .. py:module:: torio 2 | 3 | torio 4 | ===== 5 | 6 | .. currentmodule:: torio.io 7 | 8 | ``torio`` is an alternative top-level module for I/O features. It is an extraction of the core I/O implementation of ``torchaudio``. 9 | 10 | If you want to use the multimedia processing features, but do not want to depend on the entire ``torchaudio`` package, you can use ``torio``. 11 | 12 | .. note:: 13 | 14 | Currently, ``torio`` is distributed alongside ``torchaudio``, and there is no stand-alone 15 | procedure to install ``torio`` only. Please refer to https://pytorch.org/get-started/locally/ 16 | for the installation of ``torchaudio``. 
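As a hedged sketch of what depending on ``torio`` alone might look like — the input path is hypothetical, and ``StreamingMediaDecoder`` is the class that ``torchaudio.io`` re-exports as ``StreamReader``:

```python
# Sketch: decoding audio with torio directly, without importing torchaudio.
from torio.io import StreamingMediaDecoder

decoder = StreamingMediaDecoder("example.wav")  # hypothetical input file
decoder.add_basic_audio_stream(frames_per_chunk=4096)
for (chunk,) in decoder.stream():
    # Each chunk is a tensor of shape (frames, channels).
    print(chunk.shape)
```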
17 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/backend/common.py: -------------------------------------------------------------------------------- 1 | from torchaudio_unittest.common_utils import sox_utils 2 | 3 | 4 | def get_encoding(ext, dtype): 5 | exts = { 6 | "mp3", 7 | "flac", 8 | "vorbis", 9 | } 10 | encodings = { 11 | "float32": "PCM_F", 12 | "int32": "PCM_S", 13 | "int16": "PCM_S", 14 | "uint8": "PCM_U", 15 | } 16 | return ext.upper() if ext in exts else encodings[dtype] 17 | 18 | 19 | def get_bits_per_sample(ext, dtype): 20 | bits_per_samples = { 21 | "flac": 24, 22 | "mp3": 0, 23 | "vorbis": 0, 24 | } 25 | return bits_per_samples.get(ext, sox_utils.get_bit_depth(dtype)) 26 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/prototype/functional/functional_cpu_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchaudio_unittest.common_utils import PytorchTestCase 3 | 4 | from .functional_test_impl import Functional64OnlyTestImpl, FunctionalTestImpl 5 | 6 | 7 | class FunctionalFloat32CPUTest(FunctionalTestImpl, PytorchTestCase): 8 | dtype = torch.float32 9 | device = torch.device("cpu") 10 | 11 | 12 | class FunctionalFloat64CPUTest(FunctionalTestImpl, PytorchTestCase): 13 | dtype = torch.float64 14 | device = torch.device("cpu") 15 | 16 | 17 | class FunctionalFloat64OnlyCPUTest(Functional64OnlyTestImpl, PytorchTestCase): 18 | dtype = torch.float64 19 | device = torch.device("cpu") 20 | -------------------------------------------------------------------------------- /CITATION: -------------------------------------------------------------------------------- 1 | @misc{hwang2023torchaudio, 2 | title={TorchAudio 2.1: Advancing speech recognition, self-supervised learning, and audio processing components for PyTorch}, 3 | author={Jeff Hwang and Moto Hira and Caroline Chen and Xiaohui Zhang and Zhaoheng Ni and Guangzhi Sun and Pingchuan Ma and Ruizhe Huang and Vineel Pratap and Yuekai Zhang and Anurag Kumar and Chin-Yun Yu and Chuang Zhu and Chunxi Liu and Jacob Kahn and Mirco Ravanelli and Peng Sun and Shinji Watanabe and Yangyang Shi and Yumeng Tao and Robin Scheibler and Samuele Cornell and Sean Kim and Stavros Petridis}, 4 | year={2023}, 5 | eprint={2310.17864}, 6 | archivePrefix={arXiv}, 7 | primaryClass={eess.AS} 8 | } 9 | -------------------------------------------------------------------------------- /examples/libtorchaudio/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.5) 2 | 3 | project(libtorchaudio-cpp-example) 4 | 5 | SET(BUILD_SOX ON CACHE BOOL "Build libsox into libtorchaudio") 6 | 7 | SET(BUILD_KALDI OFF CACHE BOOL "Build Kaldi into libtorchaudio") 8 | SET(BUILD_RNNT ON CACHE BOOL "Build RNN transducer into libtorchaudio") 9 | SET(BUILD_TORCHAUDIO_PYTHON_EXTENSION OFF CACHE BOOL "Build Python binding") 10 | 11 | find_package(Torch REQUIRED) 12 | message("libtorchaudio CMakeLists: ${TORCH_CXX_FLAGS}") 13 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS}") 14 | 15 | add_subdirectory(../.. 
libtorchaudio) 16 | add_subdirectory(augmentation) 17 | add_subdirectory(speech_recognition) 18 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/models/tacotron2/model_test_cpu_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchaudio_unittest.common_utils import PytorchTestCase 3 | 4 | from .model_test_impl import Tacotron2DecoderTests, Tacotron2EncoderTests, Tacotron2Tests 5 | 6 | 7 | class TestTacotron2EncoderFloat32CPU(Tacotron2EncoderTests, PytorchTestCase): 8 | dtype = torch.float32 9 | device = torch.device("cpu") 10 | 11 | 12 | class TestTacotron2DecoderFloat32CPU(Tacotron2DecoderTests, PytorchTestCase): 13 | dtype = torch.float32 14 | device = torch.device("cpu") 15 | 16 | 17 | class TestTacotron2Float32CPU(Tacotron2Tests, PytorchTestCase): 18 | dtype = torch.float32 19 | device = torch.device("cpu") 20 | -------------------------------------------------------------------------------- /src/libtorio/ffmpeg/stream_reader/buffer/unchunked_buffer.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | namespace torio::io::detail { 8 | 9 | class UnchunkedBuffer { 10 | // Each AVFrame is converted to a Tensor and stored here. 11 | std::deque<torch::Tensor> chunks; 12 | double pts = -1.; 13 | AVRational time_base; 14 | 15 | public: 16 | explicit UnchunkedBuffer(AVRational time_base); 17 | bool is_ready() const; 18 | void push_frame(torch::Tensor frame, int64_t pts_); 19 | c10::optional<Chunk> pop_chunk(); 20 | void flush(); 21 | }; 22 | 23 | } // namespace torio::io::detail 24 | -------------------------------------------------------------------------------- /src/torchaudio/prototype/functional/__init__.py: -------------------------------------------------------------------------------- 1 | from ._dsp import ( 2 | adsr_envelope, 3 | exp_sigmoid, 4 | extend_pitch, 5 | filter_waveform, 6 | frequency_impulse_response, 7 | oscillator_bank, 8 | sinc_impulse_response, 9 | ) 10 | from ._rir import ray_tracing, simulate_rir_ism 11 | from .functional import barkscale_fbanks, chroma_filterbank 12 | 13 | 14 | __all__ = [ 15 | "adsr_envelope", 16 | "exp_sigmoid", 17 | "barkscale_fbanks", 18 | "chroma_filterbank", 19 | "extend_pitch", 20 | "filter_waveform", 21 | "frequency_impulse_response", 22 | "oscillator_bank", 23 | "ray_tracing", 24 | "sinc_impulse_response", 25 | "simulate_rir_ism", 26 | ] 27 | -------------------------------------------------------------------------------- /test/integration_tests/prototype/vggish_pipeline_test.py: -------------------------------------------------------------------------------- 1 | import torchaudio 2 | from torchaudio.prototype.pipelines import VGGISH 3 | 4 | 5 | def test_vggish(): 6 | input_sr = VGGISH.sample_rate 7 | input_proc = VGGISH.get_input_processor() 8 | model = VGGISH.get_model() 9 | path = torchaudio.utils.download_asset("test-assets/Chopin_Ballade_-1_In_G_Minor,_Op._23_excerpt.mp3") 10 | waveform, sr = torchaudio.load(path, backend="ffmpeg") 11 | waveform = waveform.mean(axis=0) 12 | waveform = torchaudio.functional.resample(waveform, sr, input_sr) 13 | batch = input_proc(waveform) 14 | assert batch.shape == (62, 1, 96, 64) 15 | output = model(batch) 16 | assert output.shape == (62, 128) 17 | -------------------------------------------------------------------------------- /test/cpp/CMakeLists.txt:
-------------------------------------------------------------------------------- 1 | include(FetchContent) 2 | FetchContent_Declare( 3 | googletest 4 | URL https://github.com/google/googletest/archive/refs/tags/v1.14.0.zip 5 | ) 6 | 7 | # For Windows: Prevent overriding the parent project's compiler/linker settings 8 | set(gtest_force_shared_crt ON CACHE BOOL "" FORCE) 9 | FetchContent_MakeAvailable(googletest) 10 | 11 | enable_testing() 12 | 13 | add_executable( 14 | wall_collision 15 | rir/wall_collision.cpp 16 | ) 17 | target_link_libraries( 18 | wall_collision 19 | torch 20 | GTest::gtest_main 21 | ) 22 | target_include_directories( 23 | wall_collision 24 | PRIVATE 25 | "${PROJECT_SOURCE_DIR}/src" 26 | ) 27 | add_test(NAME wall_collision_test COMMAND wall_collision) 28 | -------------------------------------------------------------------------------- /docs/source/prototype.rst: -------------------------------------------------------------------------------- 1 | torchaudio.prototype 2 | ==================== 3 | 4 | ``torchaudio.prototype`` provides prototype features; 5 | they are at an early stage for feedback and testing. 6 | Their interfaces might be changed without prior notice. 7 | 8 | Most prototype modules are excluded from releases. 9 | Please refer to `here `_ for 10 | more information on prototype features. 11 | 12 | The modules under ``torchaudio.prototype`` must be 13 | imported explicitly, e.g. 14 | 15 | .. code-block:: python 16 | 17 | import torchaudio.prototype.models 18 | 19 | .. toctree:: 20 | prototype.datasets 21 | prototype.functional 22 | prototype.models 23 | prototype.pipelines 24 | prototype.transforms 25 | -------------------------------------------------------------------------------- /packaging/vs2019/meta.yaml: -------------------------------------------------------------------------------- 1 | {% set vcver="14.2" %} 2 | {% set vcfeature="14" %} 3 | {% set vsyear="2019" %} 4 | {% set fullver="15.4.27004.2010" %} 5 | 6 | package: 7 | name: vs{{ vsyear }} 8 | version: {{ fullver }} 9 | 10 | build: 11 | skip: True  # [not win] 12 | script_env: 13 | - VSDEVCMD_ARGS # [win] 14 | 15 | outputs: 16 | - name: vs{{ vsyear }}_{{ cross_compiler_target_platform }} 17 | script: install_activate.bat 18 | track_features: 19 | # VS 2019 is binary-compatible with VS 2017/vc 14.1 and 2015/vc14. Tools are "v142".
20 | strong: 21 | - vc{{ vcfeature }} 22 | about: 23 | summary: Activation and version verification of MSVC {{ vcver }} (VS {{ vsyear }}) compiler 24 | license: BSD 3-clause 25 | -------------------------------------------------------------------------------- /src/libtorio/ffmpeg/stream_reader/packet_buffer.cpp: -------------------------------------------------------------------------------- 1 | #include <libtorio/ffmpeg/stream_reader/packet_buffer.h> 2 | 3 | namespace torio::io { 4 | void PacketBuffer::push_packet(AVPacket* packet) { 5 | TORCH_INTERNAL_ASSERT_DEBUG_ONLY(packet, "Packet is null."); 6 | AVPacket* p = av_packet_clone(packet); 7 | TORCH_INTERNAL_ASSERT(p, "Failed to clone packet."); 8 | packets.emplace_back(p); 9 | } 10 | std::vector<AVPacketPtr> PacketBuffer::pop_packets() { 11 | std::vector<AVPacketPtr> ret{ 12 | std::make_move_iterator(packets.begin()), 13 | std::make_move_iterator(packets.end())}; 14 | packets.clear(); 15 | return ret; 16 | } 17 | bool PacketBuffer::has_packets() { 18 | return packets.size() > 0; 19 | } 20 | } // namespace torio::io 21 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/common_utils/psd_utils.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | import numpy as np 4 | import torch 5 | 6 | 7 | def psd_numpy( 8 | X: np.array, mask: Optional[np.array], multi_mask: bool = False, normalize: bool = True, eps: float = 1e-15 9 | ) -> np.array: 10 | X_conj = np.conj(X) 11 | psd_X = np.einsum("...cft,...eft->...ftce", X, X_conj) 12 | if mask is not None: 13 | if multi_mask: 14 | mask = mask.mean(axis=-3) 15 | if normalize: 16 | mask = mask / (mask.sum(axis=-1, keepdims=True) + eps) 17 | psd = psd_X * mask[..., None, None] 18 | else: 19 | psd = psd_X 20 | 21 | psd = psd.sum(axis=-3) 22 | 23 | return torch.tensor(psd, dtype=torch.cdouble) 24 | -------------------------------------------------------------------------------- /.github/scripts/unittest-linux/run_test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | eval "$($(which conda) shell.bash hook)" 6 | 7 | conda activate ci 8 | 9 | python -m torch.utils.collect_env 10 | env | grep TORCHAUDIO || true 11 | 12 | export PATH="${PWD}/third_party/install/bin/:${PATH}" 13 | 14 | declare -a args=( 15 | '--continue-on-collection-errors' 16 | '-v' 17 | '--cov=torchaudio' 18 | "--junitxml=${RUNNER_TEST_RESULTS_DIR}/junit.xml" 19 | '--durations' '20' 20 | ) 21 | 22 | if [[ "${CUDA_TESTS_ONLY}" = "1" ]]; then 23 | args+=('-k' 'cuda or gpu') 24 | fi 25 | 26 | ( 27 | cd build/temp*/test/cpp 28 | ctest 29 | ) 30 | 31 | ( 32 | cd test 33 | pytest "${args[@]}" torchaudio_unittest 34 | coverage html 35 | ) 36 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/prototype/functional/functional_cuda_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchaudio_unittest.common_utils import PytorchTestCase, skipIfNoCuda 3 | 4 | from .functional_test_impl import Functional64OnlyTestImpl, FunctionalTestImpl 5 | 6 | 7 | @skipIfNoCuda 8 | class FunctionalFloat32CUDATest(FunctionalTestImpl, PytorchTestCase): 9 | dtype = torch.float32 10 | device = torch.device("cuda", 0) 11 | 12 | 13 | @skipIfNoCuda 14 | class FunctionalFloat64CUDATest(FunctionalTestImpl, PytorchTestCase): 15 | dtype = torch.float64 16 | device = torch.device("cuda", 0) 17 | 18 | 19 | @skipIfNoCuda 20 |
class FunctionalFloat64OnlyCUDATest(Functional64OnlyTestImpl, PytorchTestCase): 21 | dtype = torch.float64 22 | device = torch.device("cuda") 23 | -------------------------------------------------------------------------------- /test/integration_tests/ctc_decoder_integration_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.mark.parametrize( 5 | "model,expected", 6 | [ 7 | ("librispeech", ["the", "captain", "shook", "his", "head"]), 8 | ("librispeech-3-gram", ["the", "captain", "shook", "his", "head"]), 9 | ], 10 | ) 11 | def test_decoder_from_pretrained(model, expected, emissions): 12 | from torchaudio.models.decoder import ctc_decoder, download_pretrained_files 13 | 14 | pretrained_files = download_pretrained_files(model) 15 | decoder = ctc_decoder( 16 | lexicon=pretrained_files.lexicon, 17 | tokens=pretrained_files.tokens, 18 | lm=pretrained_files.lm, 19 | ) 20 | result = decoder(emissions) 21 | assert result[0][0].words == expected 22 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/models/tacotron2/model_test_gpu_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchaudio_unittest.common_utils import PytorchTestCase, skipIfNoCuda 3 | 4 | from .model_test_impl import Tacotron2DecoderTests, Tacotron2EncoderTests, Tacotron2Tests 5 | 6 | 7 | @skipIfNoCuda 8 | class TestTacotron2EncoderFloat32CUDA(Tacotron2EncoderTests, PytorchTestCase): 9 | dtype = torch.float32 10 | device = torch.device("cuda") 11 | 12 | 13 | @skipIfNoCuda 14 | class TestTacotron2DecoderFloat32CUDA(Tacotron2DecoderTests, PytorchTestCase): 15 | dtype = torch.float32 16 | device = torch.device("cuda") 17 | 18 | 19 | @skipIfNoCuda 20 | class TestTacotron2Float32CUDA(Tacotron2Tests, PytorchTestCase): 21 | dtype = torch.float32 22 | device = torch.device("cuda") 23 | -------------------------------------------------------------------------------- /examples/libtorchaudio/augmentation/main.cpp: -------------------------------------------------------------------------------- 1 | #include <torch/script.h> 2 | 3 | int main(int argc, char* argv[]) { 4 | if (argc != 4) { 5 | std::cerr << "Usage: " << argv[0] 6 | << " <path to module> <input file> <output file>" << std::endl; 7 | return -1; 8 | } 9 | 10 | torch::jit::script::Module module; 11 | std::cout << "Loading module from: " << argv[1] << std::endl; 12 | try { 13 | module = torch::jit::load(argv[1]); 14 | } catch (const c10::Error& error) { 15 | std::cerr << "Failed to load the module:" << error.what() << std::endl; 16 | return -1; 17 | } 18 | 19 | std::cout << "Performing the process ..." << std::endl; 20 | module.forward({c10::IValue(argv[2]), c10::IValue(argv[3])}); 21 | std::cout << "Done." << std::endl; 22 | } 23 | -------------------------------------------------------------------------------- /third_party/sox/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(FetchContent) 2 | 3 | FetchContent_Declare( 4 | sox_src 5 | URL https://downloads.sourceforge.net/project/sox/sox/14.4.2/sox-14.4.2.tar.bz2 6 | URL_HASH SHA256=81a6956d4330e75b5827316e44ae381e6f1e8928003c6aa45896da9041ea149c 7 | PATCH_COMMAND "" 8 | CONFIGURE_COMMAND "" 9 | BUILD_COMMAND "" 10 | ) 11 | # FetchContent_MakeAvailable will parse the downloaded content and setup the targets. 12 | # We want to only download and not build, so we run Populate manually.
13 | if(NOT sox_src_POPULATED) 14 | FetchContent_Populate(sox_src) 15 | endif() 16 | 17 | add_library(sox SHARED stub.c) 18 | if(APPLE) 19 | set_target_properties(sox PROPERTIES SUFFIX .dylib) 20 | endif(APPLE) 21 | target_include_directories(sox PUBLIC ${sox_src_SOURCE_DIR}/src) 22 | -------------------------------------------------------------------------------- /packaging/vs2019/conda_build_config.yaml: -------------------------------------------------------------------------------- 1 | c_compiler: 2 | - vs2019 # [win] 3 | cxx_compiler: 4 | - vs2019 # [win] 5 | python: 6 | - 3.8 7 | # This differs from target_platform in that it determines what subdir the compiler 8 | # will target, not what subdir the compiler package will be itself. 9 | # For example, we need a win-64 vs2008_win-32 package, so that we compile win-32 10 | # code on win-64 miniconda. 11 | cross_compiler_target_platform: 12 | - win-64 # [win] 13 | target_platform: 14 | - win-64 # [win] 15 | vc: 16 | - 14 17 | zip_keys: 18 | - # [win] 19 | - vc # [win] 20 | - c_compiler # [win] 21 | - cxx_compiler # [win] 22 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/example/tacotron2/tacotron2_loss_cpu_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchaudio_unittest.common_utils import PytorchTestCase 3 | 4 | from .tacotron2_loss_impl import Tacotron2LossGradcheckTests, Tacotron2LossShapeTests, Tacotron2LossTorchscriptTests 5 | 6 | 7 | class TestTacotron2LossShapeFloat32CPU(Tacotron2LossShapeTests, PytorchTestCase): 8 | dtype = torch.float32 9 | device = torch.device("cpu") 10 | 11 | 12 | class TestTacotron2TorchsciptFloat32CPU(Tacotron2LossTorchscriptTests, PytorchTestCase): 13 | dtype = torch.float32 14 | device = torch.device("cpu") 15 | 16 | 17 | class TestTacotron2GradcheckFloat64CPU(Tacotron2LossGradcheckTests, PytorchTestCase): 18 | dtype = torch.float64 # gradcheck needs a higher numerical accuracy 19 | device = torch.device("cpu") 20 | -------------------------------------------------------------------------------- /docs/source/prototype.functional.rst: -------------------------------------------------------------------------------- 1 | torchaudio.prototype.functional 2 | =============================== 3 | 4 | .. py:module:: torchaudio.prototype.functional 5 | .. currentmodule:: torchaudio.prototype.functional 6 | 7 | Utility 8 | ~~~~~~~ 9 | 10 | .. autosummary:: 11 | :toctree: generated 12 | :nosignatures: 13 | 14 | barkscale_fbanks 15 | chroma_filterbank 16 | 17 | DSP 18 | ~~~ 19 | 20 | .. autosummary:: 21 | :toctree: generated 22 | :nosignatures: 23 | 24 | adsr_envelope 25 | filter_waveform 26 | extend_pitch 27 | oscillator_bank 28 | sinc_impulse_response 29 | frequency_impulse_response 30 | 31 | Room Impulse Response Simulation 32 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 33 | 34 | .. 
autosummary:: 35 | :toctree: generated 36 | :nosignatures: 37 | 38 | ray_tracing 39 | simulate_rir_ism 40 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/assets/decoder/kenlm.arpa: -------------------------------------------------------------------------------- 1 | \data\ 2 | ngram 1=6 3 | ngram 2=9 4 | ngram 3=8 5 | 6 | \1-grams: 7 | -0.8515802 0 8 | 0 -0.30103 9 | -0.8515802 0 10 | -0.8515802 foo -0.30103 11 | -0.44013768 bar -0.30103 12 | -0.6679358 foobar -0.30103 13 | 14 | \2-grams: 15 | -0.7091413 foo 0 16 | -0.6251838 bar 0 17 | -0.24384303 foobar 0 18 | -0.6251838 foo -0.30103 19 | -0.49434766 foo foo -0.30103 20 | -0.39393726 bar foo -0.30103 21 | -0.4582359 bar -0.30103 22 | -0.51359576 foo bar -0.30103 23 | -0.56213206 foobar -0.30103 24 | 25 | \3-grams: 26 | -0.45881382 bar foo 27 | -0.43354067 foo bar 28 | -0.105027884 foobar 29 | -0.18033421 foo foo 30 | -0.38702002 bar foo foo 31 | -0.15375455 bar foo 32 | -0.34500393 foo bar foo 33 | -0.18492673 foo foo bar 34 | 35 | \end\ 36 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/common_utils/autograd_utils.py: -------------------------------------------------------------------------------- 1 | import contextlib 2 | 3 | import torch 4 | 5 | 6 | @contextlib.contextmanager 7 | def use_deterministic_algorithms(mode: bool, warn_only: bool): 8 | r""" 9 | This context manager can be used to temporarily enable or disable deterministic algorithms. 10 | Upon exiting the context manager, the previous state of the flag will be restored. 11 | """ 12 | previous_mode: bool = torch.are_deterministic_algorithms_enabled() 13 | previous_warn_only: bool = torch.is_deterministic_algorithms_warn_only_enabled() 14 | try: 15 | torch.use_deterministic_algorithms(mode, warn_only=warn_only) 16 | yield {} 17 | except RuntimeError as err: 18 | raise err 19 | finally: 20 | torch.use_deterministic_algorithms(previous_mode, warn_only=previous_warn_only) 21 | -------------------------------------------------------------------------------- /src/libtorchaudio/forced_align/compute.cpp: -------------------------------------------------------------------------------- 1 | #include <libtorchaudio/forced_align/compute.h> 2 | #include <torch/script.h> 3 | 4 | std::tuple<torch::Tensor, torch::Tensor> forced_align( 5 | const torch::Tensor& logProbs, 6 | const torch::Tensor& targets, 7 | const torch::Tensor& inputLengths, 8 | const torch::Tensor& targetLengths, 9 | const int64_t blank) { 10 | static auto op = torch::Dispatcher::singleton() 11 | .findSchemaOrThrow("torchaudio::forced_align", "") 12 | .typed<decltype(forced_align)>(); 13 | return op.call(logProbs, targets, inputLengths, targetLengths, blank); 14 | } 15 | 16 | TORCH_LIBRARY_FRAGMENT(torchaudio, m) { 17 | m.def( 18 | "forced_align(Tensor log_probs, Tensor targets, Tensor input_lengths, Tensor target_lengths, int blank) -> (Tensor, Tensor)"); 19 | } 20 | -------------------------------------------------------------------------------- /src/libtorio/ffmpeg/stream_writer/encoder.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | namespace torio::io { 8 | 9 | // Encoder + Muxer 10 | class Encoder { 11 | // Reference to the AVFormatContext (muxer) 12 | AVFormatContext* format_ctx; 13 | // Reference to codec context (encoder) 14 | AVCodecContext* codec_ctx; 15 | // Stream object as reference. Owned by AVFormatContext.
16 | AVStream* stream; 17 | // Temporary object used during the encoding 18 | // Encoder owns it. 19 | AVPacketPtr packet{alloc_avpacket()}; 20 | 21 | public: 22 | Encoder( 23 | AVFormatContext* format_ctx, 24 | AVCodecContext* codec_ctx, 25 | AVStream* stream) noexcept; 26 | 27 | void encode(AVFrame* frame); 28 | }; 29 | 30 | } // namespace torio::io 31 | -------------------------------------------------------------------------------- /src/libtorchaudio/sox/effects.h: -------------------------------------------------------------------------------- 1 | #ifndef TORCHAUDIO_SOX_EFFECTS_H 2 | #define TORCHAUDIO_SOX_EFFECTS_H 3 | 4 | #include 5 | #include 6 | 7 | namespace torchaudio::sox { 8 | 9 | void initialize_sox_effects(); 10 | 11 | void shutdown_sox_effects(); 12 | 13 | auto apply_effects_tensor( 14 | torch::Tensor waveform, 15 | int64_t sample_rate, 16 | const std::vector<std::vector<std::string>>& effects, 17 | bool channels_first) -> std::tuple<torch::Tensor, int64_t>; 18 | 19 | auto apply_effects_file( 20 | const std::string& path, 21 | const std::vector<std::vector<std::string>>& effects, 22 | c10::optional<bool> normalize, 23 | c10::optional<bool> channels_first, 24 | const c10::optional<std::string>& format) 25 | -> std::tuple<torch::Tensor, int64_t>; 26 | 27 | } // namespace torchaudio::sox 28 | 29 | #endif 30 | -------------------------------------------------------------------------------- /.github/workflows/pr-labels.yml: -------------------------------------------------------------------------------- 1 | name: pr-labels 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | 8 | jobs: 9 | is-properly-labeled: 10 | runs-on: ubuntu-latest 11 | 12 | steps: 13 | - name: Set up python 14 | uses: actions/setup-python@v2 15 | 16 | - name: Install requests 17 | run: pip install requests 18 | 19 | - name: Checkout repository 20 | uses: actions/checkout@v2 21 | 22 | - name: Process commit and find merger responsible for labeling 23 | id: commit 24 | env: 25 | SHA1: ${{ github.sha }} 26 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 27 | run: python .github/process_commit.py "${SHA1}" 28 | 29 | concurrency: 30 | group: pr-labels-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} 31 | cancel-in-progress: true 32 | -------------------------------------------------------------------------------- /src/torchaudio/backend/_no_backend.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import Callable, Optional, Tuple, Union 3 | 4 | from torch import Tensor 5 | from torchaudio import AudioMetaData 6 | 7 | 8 | def load( 9 | filepath: Union[str, Path], 10 | out: Optional[Tensor] = None, 11 | normalization: Union[bool, float, Callable] = True, 12 | channels_first: bool = True, 13 | num_frames: int = 0, 14 | offset: int = 0, 15 | filetype: Optional[str] = None, 16 | ) -> Tuple[Tensor, int]: 17 | raise RuntimeError("No audio I/O backend is available.") 18 | 19 | 20 | def save(filepath: str, src: Tensor, sample_rate: int, precision: int = 16, channels_first: bool = True) -> None: 21 | raise RuntimeError("No audio I/O backend is available.") 22 | 23 | 24 | def info(filepath: str) -> AudioMetaData: 25 | raise RuntimeError("No audio I/O backend is available.") 26 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/example/tacotron2/tacotron2_loss_gpu_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchaudio_unittest.common_utils import PytorchTestCase, skipIfNoCuda 3 | 4 | from
.tacotron2_loss_impl import Tacotron2LossGradcheckTests, Tacotron2LossShapeTests, Tacotron2LossTorchscriptTests 5 | 6 | 7 | @skipIfNoCuda 8 | class TestTacotron2LossShapeFloat32CUDA(PytorchTestCase, Tacotron2LossShapeTests): 9 | dtype = torch.float32 10 | device = torch.device("cuda") 11 | 12 | 13 | @skipIfNoCuda 14 | class TestTacotron2TorchsciptFloat32CUDA(PytorchTestCase, Tacotron2LossTorchscriptTests): 15 | dtype = torch.float32 16 | device = torch.device("cuda") 17 | 18 | 19 | @skipIfNoCuda 20 | class TestTacotron2GradcheckFloat64CUDA(PytorchTestCase, Tacotron2LossGradcheckTests): 21 | dtype = torch.float64 # gradcheck needs a higher numerical accuracy 22 | device = torch.device("cuda") 23 | -------------------------------------------------------------------------------- /examples/avsr/models/conformer_rnnt.py: -------------------------------------------------------------------------------- 1 | from torchaudio.prototype.models import conformer_rnnt_model 2 | 3 | # https://pytorch.org/audio/master/_modules/torchaudio/prototype/models/rnnt.html#conformer_rnnt_model 4 | 5 | 6 | def conformer_rnnt(): 7 | return conformer_rnnt_model( 8 | input_dim=512, 9 | encoding_dim=1024, 10 | time_reduction_stride=1, 11 | conformer_input_dim=256, 12 | conformer_ffn_dim=1024, 13 | conformer_num_layers=16, 14 | conformer_num_heads=4, 15 | conformer_depthwise_conv_kernel_size=31, 16 | conformer_dropout=0.1, 17 | num_symbols=1024, 18 | symbol_embedding_dim=256, 19 | num_lstm_layers=2, 20 | lstm_hidden_dim=512, 21 | lstm_layer_norm=True, 22 | lstm_layer_norm_epsilon=1e-5, 23 | lstm_dropout=0.3, 24 | joiner_activation="tanh", 25 | ) 26 | -------------------------------------------------------------------------------- /src/libtorchaudio/rnnt/gpu/half.cuh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #ifdef USE_C10_HALF 4 | #include "c10/util/Half.h" 5 | #endif // USE_C10_HALF 6 | 7 | #include <cuda_fp16.h> 8 | 9 | namespace torchaudio { 10 | namespace rnnt { 11 | 12 | struct alignas(sizeof(__half)) Half { 13 | __half x; 14 | 15 | HOST_AND_DEVICE Half() = default; 16 | 17 | FORCE_INLINE HOST_AND_DEVICE Half(float f) { 18 | x = __float2half_rn(f); 19 | if (isinf(__half2float(x))) { 20 | x = __float2half_rz(f); // round toward 0. 21 | } 22 | } 23 | 24 | FORCE_INLINE HOST_AND_DEVICE operator float() const { 25 | return __half2float(x); 26 | } 27 | 28 | FORCE_INLINE HOST_AND_DEVICE Half(__half f) { 29 | x = f; 30 | } 31 | 32 | FORCE_INLINE HOST_AND_DEVICE operator __half() const { 33 | return x; 34 | } 35 | }; 36 | 37 | } // namespace rnnt 38 | } // namespace torchaudio 39 | -------------------------------------------------------------------------------- /docs/source/models.decoder.rst: -------------------------------------------------------------------------------- 1 | .. py:module:: torchaudio.models.decoder 2 | 3 | torchaudio.models.decoder 4 | ========================= 5 | 6 | .. currentmodule:: torchaudio.models.decoder 7 | 8 | CTC Decoder 9 | ----------- 10 | 11 | .. autosummary:: 12 | :toctree: generated 13 | :nosignatures: 14 | :template: autosummary/ctc_decoder_class.rst 15 | 16 | CTCDecoder 17 | ctc_decoder 18 | download_pretrained_files 19 | 20 | .. rubric:: Tutorials using CTC Decoder 21 | 22 | .. minigallery:: torchaudio.models.decoder.CTCDecoder 23 | 24 | CUDA CTC Decoder 25 | ---------------- 26 | 27 | ..
autosummary:: 28 | :toctree: generated 29 | :nosignatures: 30 | :template: autosummary/cuda_ctc_decoder_class.rst 31 | 32 | CUCTCDecoder 33 | cuda_ctc_decoder 34 | 35 | 36 | .. rubric:: Tutorials using CUDA CTC Decoder 37 | 38 | .. minigallery:: torchaudio.models.decoder.CUCTCDecoder 39 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/sox_effect/common.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from parameterized import param 4 | from torchaudio_unittest.common_utils import get_asset_path 5 | 6 | 7 | def name_func(func, _, params): 8 | if isinstance(params.args[0], str): 9 | args = "_".join([str(arg) for arg in params.args]) 10 | else: 11 | args = "_".join([str(arg) for arg in params.args[0]]) 12 | return f"{func.__name__}_{args}" 13 | 14 | 15 | def load_params(*paths): 16 | params = [] 17 | with open(get_asset_path(*paths), "r") as file: 18 | for line in file: 19 | data = json.loads(line) 20 | for effect in data["effects"]: 21 | for i, arg in enumerate(effect): 22 | if arg.startswith("<ASSET_DIR>"): 23 | effect[i] = arg.replace("<ASSET_DIR>", get_asset_path()) 24 | params.append(param(data)) 25 | return params 26 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | set SPHINXPROJ=torchaudio 13 | 14 | if "%1" == "" goto help 15 | 16 | %SPHINXBUILD% >NUL 2>NUL 17 | if errorlevel 9009 ( 18 | echo. 19 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 20 | echo.installed, then set the SPHINXBUILD environment variable to point 21 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 22 | echo.may add the Sphinx directory to PATH. 23 | echo.
24 | echo.If you don't have Sphinx installed, grab it from 25 | echo.http://sphinx-doc.org/ 26 | exit /b 1 27 | ) 28 | 29 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 30 | goto end 31 | 32 | :help 33 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 34 | 35 | :end 36 | popd 37 | -------------------------------------------------------------------------------- /src/libtorio/ffmpeg/stream_reader/buffer/unchunked_buffer.cpp: -------------------------------------------------------------------------------- 1 | #include <libtorio/ffmpeg/stream_reader/buffer/unchunked_buffer.h> 2 | 3 | namespace torio::io::detail { 4 | 5 | UnchunkedBuffer::UnchunkedBuffer(AVRational time_base) : time_base(time_base){}; 6 | 7 | bool UnchunkedBuffer::is_ready() const { 8 | return chunks.size() > 0; 9 | } 10 | 11 | void UnchunkedBuffer::push_frame(torch::Tensor frame, int64_t pts_) { 12 | if (chunks.size() == 0) { 13 | pts = double(pts_) * time_base.num / time_base.den; 14 | } 15 | chunks.push_back(frame); 16 | } 17 | 18 | c10::optional<Chunk> UnchunkedBuffer::pop_chunk() { 19 | if (chunks.size() == 0) { 20 | return {}; 21 | } 22 | 23 | auto frames = 24 | torch::cat(std::vector<torch::Tensor>{chunks.begin(), chunks.end()}, 0); 25 | chunks.clear(); 26 | return {Chunk{frames, pts}}; 27 | } 28 | 29 | void UnchunkedBuffer::flush() { 30 | chunks.clear(); 31 | } 32 | 33 | } // namespace torio::io::detail 34 | -------------------------------------------------------------------------------- /examples/avsr/average_checkpoints.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import torch 4 | 5 | 6 | def average_checkpoints(last): 7 | avg = None 8 | for path in last: 9 | states = torch.load(path, map_location=lambda storage, loc: storage)["state_dict"] 10 | if avg is None: 11 | avg = states 12 | else: 13 | for k in avg.keys(): 14 | avg[k] += states[k] 15 | # average 16 | for k in avg.keys(): 17 | if avg[k] is not None: 18 | if avg[k].is_floating_point(): 19 | avg[k] /= len(last) 20 | else: 21 | avg[k] //= len(last) 22 | return avg 23 | 24 | 25 | def ensemble(args): 26 | last = [os.path.join(args.exp_dir, args.exp_name, f"epoch={n}.ckpt") for n in range(args.epochs - 10, args.epochs)] 27 | model_path = os.path.join(args.exp_dir, args.exp_name, "model_avg_10.pth") 28 | torch.save({"state_dict": average_checkpoints(last)}, model_path) 29 | -------------------------------------------------------------------------------- /examples/libtorchaudio/speech_recognition/greedy_decoder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | class Decoder(torch.nn.Module): 5 | def __init__(self, labels): 6 | super().__init__() 7 | self.labels = labels 8 | 9 | def forward(self, logits: torch.Tensor) -> str: 10 | """Given a sequence of logits over labels, get the best path string 11 | 12 | Args: 13 | logits (Tensor): Logit tensors. Shape `[num_seq, num_label]`.
14 | 15 | Returns: 16 | str: The resulting transcript 17 | """ 18 | best_path = torch.argmax(logits, dim=-1) # [num_seq,] 19 | best_path = torch.unique_consecutive(best_path, dim=-1) 20 | hypothesis = "" 21 | for i in best_path: 22 | char = self.labels[i] 23 | if char in ["<s>", "<pad>"]: # skip non-content tokens 24 | continue 25 | if char == "|": 26 | char = " " 27 | hypothesis += char 28 | return hypothesis 29 | -------------------------------------------------------------------------------- /test/integration_tests/tacotron2_pipeline_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from torchaudio.pipelines import ( 3 | TACOTRON2_GRIFFINLIM_CHAR_LJSPEECH, 4 | TACOTRON2_GRIFFINLIM_PHONE_LJSPEECH, 5 | TACOTRON2_WAVERNN_CHAR_LJSPEECH, 6 | TACOTRON2_WAVERNN_PHONE_LJSPEECH, 7 | ) 8 | 9 | 10 | @pytest.mark.parametrize( 11 | "bundle", 12 | [ 13 | TACOTRON2_GRIFFINLIM_CHAR_LJSPEECH, 14 | TACOTRON2_GRIFFINLIM_PHONE_LJSPEECH, 15 | TACOTRON2_WAVERNN_CHAR_LJSPEECH, 16 | TACOTRON2_WAVERNN_PHONE_LJSPEECH, 17 | ], 18 | ) 19 | def test_tts_models(bundle): 20 | """Smoke test of TTS pipeline""" 21 | text = "Hello world! Text to Speech!" 22 | 23 | processor = bundle.get_text_processor() 24 | tacotron2 = bundle.get_tacotron2() 25 | vocoder = bundle.get_vocoder() 26 | processed, lengths = processor(text) 27 | mel_spec, lengths, _ = tacotron2.infer(processed, lengths) 28 | waveforms, lengths = vocoder(mel_spec, lengths) 29 | -------------------------------------------------------------------------------- /docs/source/models.rst: -------------------------------------------------------------------------------- 1 | .. py:module:: torchaudio.models 2 | 3 | torchaudio.models 4 | ================= 5 | 6 | .. currentmodule:: torchaudio.models 7 | 8 | The ``torchaudio.models`` subpackage contains definitions of models for addressing common audio tasks. 9 | 10 | .. note:: 11 | For models with pre-trained parameters, please refer to the :mod:`torchaudio.pipelines` module. 12 | 13 | Model definitions are responsible for constructing computation graphs and executing them. 14 | 15 | Some models have complex structures and variations. 16 | For such models, factory functions are provided. 17 | 18 | ..
autosummary:: 19 | :toctree: generated 20 | :nosignatures: 21 | :template: autosummary/model_class.rst 22 | 23 | Conformer 24 | ConvTasNet 25 | DeepSpeech 26 | Emformer 27 | HDemucs 28 | HuBERTPretrainModel 29 | RNNT 30 | RNNTBeamSearch 31 | SquimObjective 32 | SquimSubjective 33 | Tacotron2 34 | Wav2Letter 35 | Wav2Vec2Model 36 | WaveRNN 37 | -------------------------------------------------------------------------------- /src/libtorchaudio/rnnt/cpu/math.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | namespace torchaudio { 7 | namespace rnnt { 8 | 9 | namespace math { 10 | 11 | template <typename DTYPE> 12 | FORCE_INLINE HOST_AND_DEVICE DTYPE max(DTYPE x, DTYPE y) { 13 | if (x > y) { 14 | return x; 15 | } else { 16 | return y; 17 | } 18 | } 19 | 20 | template <typename DTYPE> 21 | FORCE_INLINE HOST_AND_DEVICE DTYPE min(DTYPE x, DTYPE y) { 22 | if (x > y) { 23 | return y; 24 | } else { 25 | return x; 26 | } 27 | } 28 | 29 | // log_sum_exp 30 | template <typename DTYPE> 31 | FORCE_INLINE HOST_AND_DEVICE DTYPE lse(DTYPE x, DTYPE y); 32 | 33 | template <> 34 | FORCE_INLINE HOST_AND_DEVICE float lse(float x, float y) { 35 | if (y > x) { 36 | return y + log1pf(expf(x - y)); 37 | } else { 38 | return x + log1pf(expf(y - x)); 39 | } 40 | } 41 | 42 | } // namespace math 43 | 44 | } // namespace rnnt 45 | } // namespace torchaudio 46 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/prototype/functional/torchscript_consistency_cpu_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchaudio_unittest.common_utils import PytorchTestCase 3 | 4 | from .torchscript_consistency_test_impl import TorchScriptConsistencyCPUOnlyTestImpl, TorchScriptConsistencyTestImpl 5 | 6 | 7 | class TorchScriptConsistencyCPUFloat32Test(TorchScriptConsistencyTestImpl, PytorchTestCase): 8 | dtype = torch.float32 9 | device = torch.device("cpu") 10 | 11 | 12 | class TorchScriptConsistencyCPUFloat64Test(TorchScriptConsistencyTestImpl, PytorchTestCase): 13 | dtype = torch.float64 14 | device = torch.device("cpu") 15 | 16 | 17 | class TorchScriptConsistencyCPUOnlyFloat32Test(TorchScriptConsistencyCPUOnlyTestImpl, PytorchTestCase): 18 | dtype = torch.float32 19 | device = torch.device("cpu") 20 | 21 | 22 | class TorchScriptConsistencyCPUOnlyFloat64Test(TorchScriptConsistencyCPUOnlyTestImpl, PytorchTestCase): 23 | dtype = torch.float64 24 | device = torch.device("cpu") 25 | -------------------------------------------------------------------------------- /src/libtorchaudio/rnnt/gpu/math.cuh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #ifdef USE_CUDA 4 | 5 | #include 6 | 7 | #endif // USE_CUDA 8 | 9 | #include 10 | 11 | namespace torchaudio { 12 | namespace rnnt { 13 | 14 | namespace math { 15 | 16 | template <typename DTYPE> 17 | FORCE_INLINE HOST_AND_DEVICE DTYPE max(DTYPE x, DTYPE y) { 18 | if (x > y) 19 | return x; 20 | else 21 | return y; 22 | } 23 | 24 | template <typename DTYPE> 25 | FORCE_INLINE HOST_AND_DEVICE DTYPE min(DTYPE x, DTYPE y) { 26 | if (x > y) 27 | return y; 28 | else 29 | return x; 30 | } 31 | 32 | // log_sum_exp 33 | template <typename DTYPE> 34 | FORCE_INLINE HOST_AND_DEVICE DTYPE lse(DTYPE x, DTYPE y); 35 | 36 | template <> 37 | FORCE_INLINE HOST_AND_DEVICE float lse(float x, float y) { 38 | if (y > x) { 39 | return y + log1pf(expf(x - y)); 40 | } else { 41 | return x + log1pf(expf(y - x)); 42 | } 43 | } 44 | 45 | } // namespace math
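An aside on the `lse` helpers above: both the CPU and the GPU versions compute log(exp(x) + exp(y)) in the numerically stable form max(x, y) + log1p(exp(min(x, y) - max(x, y))), so a large positive value is never exponentiated. A small Python sketch of the same identity, for illustration only:

```python
import math

def lse(x: float, y: float) -> float:
    # Same stable formulation as the C++ templates above: factor out the
    # larger argument so exp() only ever sees a non-positive exponent.
    lo, hi = min(x, y), max(x, y)
    return hi + math.log1p(math.exp(lo - hi))

# A naive math.log(math.exp(1000) + math.exp(1000)) would overflow,
# while the stable form returns 1000 + log(2) as expected.
assert abs(lse(1000.0, 1000.0) - (1000.0 + math.log(2.0))) < 1e-9
```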
46 | 47 | } // namespace rnnt 48 | } // namespace torchaudio 49 | -------------------------------------------------------------------------------- /packaging/windows/internal/driver_update.bat: -------------------------------------------------------------------------------- 1 | set "DRIVER_DOWNLOAD_LINK=https://ossci-windows.s3.amazonaws.com/461.09-data-center-tesla-desktop-winserver-2019-2016-international.exe" 2 | curl --retry 3 -kL %DRIVER_DOWNLOAD_LINK% --output 461.09-data-center-tesla-desktop-winserver-2019-2016-international.exe 3 | if errorlevel 1 exit /b 1 4 | 5 | start /wait 461.09-data-center-tesla-desktop-winserver-2019-2016-international.exe -s -noreboot 6 | if errorlevel 1 exit /b 1 7 | 8 | del 461.09-data-center-tesla-desktop-winserver-2019-2016-international.exe || ver > NUL 9 | 10 | setlocal EnableDelayedExpansion 11 | set NVIDIA_GPU_EXISTS=0 12 | for /F "delims=" %%i in ('wmic path win32_VideoController get name') do ( 13 | set GPUS=%%i 14 | if not "x!GPUS:NVIDIA=!" == "x!GPUS!" ( 15 | SET NVIDIA_GPU_EXISTS=1 16 | goto gpu_check_end 17 | ) 18 | ) 19 | :gpu_check_end 20 | endlocal & set NVIDIA_GPU_EXISTS=%NVIDIA_GPU_EXISTS% 21 | 22 | if "%NVIDIA_GPU_EXISTS%" == "0" ( 23 | echo "CUDA Driver installation Failed" 24 | exit /b 1 25 | ) 26 | -------------------------------------------------------------------------------- /docs/source/kaldi_io.rst: -------------------------------------------------------------------------------- 1 | .. role:: hidden 2 | :class: hidden-section 3 | 4 | torchaudio.kaldi_io 5 | ====================== 6 | 7 | .. py:module:: torchaudio.kaldi_io 8 | 9 | .. currentmodule:: torchaudio.kaldi_io 10 | 11 | To use this module, the dependency kaldi_io_ needs to be installed. 12 | This is a light wrapper around ``kaldi_io`` that returns :class:`torch.Tensor`. 13 | 14 | .. _kaldi_io: https://github.com/vesis84/kaldi-io-for-python 15 | 16 | Vectors 17 | ------- 18 | 19 | :hidden:`read_vec_int_ark` 20 | ~~~~~~~~~~~~~~~~~~~~~~~~~~ 21 | 22 | .. autofunction:: read_vec_int_ark 23 | 24 | :hidden:`read_vec_flt_scp` 25 | ~~~~~~~~~~~~~~~~~~~~~~~~~~ 26 | 27 | .. autofunction:: read_vec_flt_scp 28 | 29 | :hidden:`read_vec_flt_ark` 30 | ~~~~~~~~~~~~~~~~~~~~~~~~~~ 31 | 32 | .. autofunction:: read_vec_flt_ark 33 | 34 | Matrices 35 | -------- 36 | 37 | :hidden:`read_mat_scp` 38 | ~~~~~~~~~~~~~~~~~~~~~~ 39 | 40 | .. autofunction:: read_mat_scp 41 | 42 | :hidden:`read_mat_ark` 43 | ~~~~~~~~~~~~~~~~~~~~~~ 44 | 45 | .. autofunction:: read_mat_ark 46 | -------------------------------------------------------------------------------- /examples/avsr/data_prep/detectors/retinaface/detector.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2021 Imperial College London (Pingchuan Ma) 5 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 6 | 7 | import warnings 8 | 9 | import numpy as np 10 | from ibug.face_detection import RetinaFacePredictor 11 | 12 | warnings.filterwarnings("ignore") 13 | 14 | 15 | class LandmarksDetector: 16 | def __init__(self, device="cuda:0", model_name="resnet50"): 17 | self.face_detector = RetinaFacePredictor( 18 | device=device, threshold=0.8, model=RetinaFacePredictor.get_model(model_name) 19 | ) 20 | 21 | def __call__(self, video_frames): 22 | landmarks = [] 23 | for frame in video_frames: 24 | detected_faces = self.face_detector(frame, rgb=False) 25 | if len(detected_faces) >= 1: 26 | landmarks.append(np.reshape(detected_faces[0][:4], (2, 2))) 27 | else: 28 | landmarks.append(None) 29 | return landmarks 30 | -------------------------------------------------------------------------------- /docs/source/feature_classifications.rst: -------------------------------------------------------------------------------- 1 | Feature Classifications 2 | ======================= 3 | 4 | Features described in this documentation are classified by release status: 5 | 6 | *Stable:* These features will be maintained long-term and there should generally 7 | be no major performance limitations or gaps in documentation. 8 | We also expect to maintain backwards compatibility (although 9 | breaking changes can happen and notice will be given one release ahead 10 | of time). 11 | 12 | *Beta:* Features are tagged as Beta because the API may change based on 13 | user feedback, because the performance needs to improve, or because 14 | coverage across operators is not yet complete. For Beta features, we are 15 | committing to seeing the feature through to the Stable classification. 16 | We are not, however, committing to backwards compatibility. 17 | 18 | *Prototype:* These features are typically not available as part of 19 | binary distributions like PyPI or Conda, except sometimes behind run-time 20 | flags, and are at an early stage for feedback and testing. 
21 | -------------------------------------------------------------------------------- /examples/avsr/models/emformer_rnnt.py: -------------------------------------------------------------------------------- 1 | from torchaudio.models.rnnt import emformer_rnnt_model 2 | 3 | 4 | # https://pytorch.org/audio/master/_modules/torchaudio/models/rnnt.html#emformer_rnnt_base 5 | def emformer_rnnt(): 6 | return emformer_rnnt_model( 7 | input_dim=512, 8 | encoding_dim=1024, 9 | num_symbols=1024, 10 | segment_length=64, 11 | right_context_length=0, 12 | time_reduction_input_dim=128, 13 | time_reduction_stride=1, 14 | transformer_num_heads=4, 15 | transformer_ffn_dim=2048, 16 | transformer_num_layers=20, 17 | transformer_dropout=0.1, 18 | transformer_activation="gelu", 19 | transformer_left_context_length=30, 20 | transformer_max_memory_size=0, 21 | transformer_weight_init_scale_strategy="depthwise", 22 | transformer_tanh_on_mem=True, 23 | symbol_embedding_dim=512, 24 | num_lstm_layers=3, 25 | lstm_layer_norm=True, 26 | lstm_layer_norm_epsilon=1e-3, 27 | lstm_dropout=0.3, 28 | ) 29 | -------------------------------------------------------------------------------- /examples/avsr/schedulers.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import torch 4 | 5 | 6 | class WarmupCosineScheduler(torch.optim.lr_scheduler._LRScheduler): 7 | def __init__( 8 | self, 9 | optimizer: torch.optim.Optimizer, 10 | warmup_epochs: int, 11 | total_epochs: int, 12 | steps_per_epoch: int, 13 | last_epoch=-1, 14 | verbose=False, 15 | ): 16 | self.warmup_steps = warmup_epochs * steps_per_epoch 17 | self.total_steps = total_epochs * steps_per_epoch 18 | super().__init__(optimizer, last_epoch=last_epoch, verbose=verbose) 19 | 20 | def get_lr(self): 21 | if self._step_count < self.warmup_steps: 22 | return [self._step_count / self.warmup_steps * base_lr for base_lr in self.base_lrs] 23 | else: 24 | decay_steps = self.total_steps - self.warmup_steps 25 | return [ 26 | 0.5 * base_lr * (1 + math.cos(math.pi * (self._step_count - self.warmup_steps) / decay_steps)) 27 | for base_lr in self.base_lrs 28 | ] 29 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/functional/functional_cuda_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import torch 4 | from torchaudio_unittest.common_utils import PytorchTestCase, skipIfNoCuda 5 | 6 | from .functional_impl import Functional, FunctionalCUDAOnly 7 | 8 | 9 | @skipIfNoCuda 10 | class TestFunctionalFloat32(Functional, PytorchTestCase): 11 | dtype = torch.float32 12 | device = torch.device("cuda") 13 | 14 | @unittest.expectedFailure 15 | def test_lfilter_9th_order_filter_stability(self): 16 | super().test_lfilter_9th_order_filter_stability() 17 | 18 | 19 | @skipIfNoCuda 20 | class TestLFilterFloat64(Functional, PytorchTestCase): 21 | dtype = torch.float64 22 | device = torch.device("cuda") 23 | 24 | 25 | @skipIfNoCuda 26 | class TestFunctionalCUDAOnlyFloat32(FunctionalCUDAOnly, PytorchTestCase): 27 | dtype = torch.float32 28 | device = torch.device("cuda") 29 | 30 | 31 | @skipIfNoCuda 32 | class TestFunctionalCUDAOnlyFloat64(FunctionalCUDAOnly, PytorchTestCase): 33 | dtype = torch.float64 34 | device = torch.device("cuda") 35 | -------------------------------------------------------------------------------- /packaging/vc_env_helper.bat: 
-------------------------------------------------------------------------------- 1 | @echo on 2 | 3 | set VC_VERSION_LOWER=16 4 | set VC_VERSION_UPPER=17 5 | 6 | for /f "usebackq tokens=*" %%i in (`"%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe" -legacy -products * -version [%VC_VERSION_LOWER%^,%VC_VERSION_UPPER%^) -property installationPath`) do ( 7 | if exist "%%i" if exist "%%i\VC\Auxiliary\Build\vcvarsall.bat" ( 8 | set "VS15INSTALLDIR=%%i" 9 | set "VS15VCVARSALL=%%i\VC\Auxiliary\Build\vcvarsall.bat" 10 | goto vswhere 11 | ) 12 | ) 13 | 14 | :vswhere 15 | if "%VSDEVCMD_ARGS%" == "" ( 16 | call "%VS15VCVARSALL%" x64 || exit /b 1 17 | ) else ( 18 | call "%VS15VCVARSALL%" x64 %VSDEVCMD_ARGS% || exit /b 1 19 | ) 20 | 21 | @echo on 22 | 23 | set DISTUTILS_USE_SDK=1 24 | 25 | set args=%1 26 | shift 27 | :start 28 | if [%1] == [] goto done 29 | set args=%args% %1 30 | shift 31 | goto start 32 | 33 | :done 34 | if "%args%" == "" ( 35 | echo Usage: vc_env_helper.bat [command] [args] 36 | echo e.g. vc_env_helper.bat cl /c test.cpp 37 | ) 38 | 39 | %args% || exit /b 1 40 | -------------------------------------------------------------------------------- /src/libtorchaudio/cuctc/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Custom CMakeLists for building cuda ctc decoder 2 | 3 | set(CMAKE_CXX_VISIBILITY_PRESET default) 4 | 5 | # the following line is added in order to export symbols when building on Windows 6 | # this approach has some limitations as documented in https://github.com/pytorch/pytorch/pull/3650 7 | if (MSVC) 8 | set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON) 9 | endif() 10 | 11 | set( 12 | libctc_prefix_decoder_src 13 | src/ctc_prefix_decoder.cpp 14 | src/ctc_prefix_decoder_kernel_v2.cu 15 | ) 16 | 17 | set( 18 | additional_libs 19 | ) 20 | 21 | list( 22 | APPEND 23 | additional_libs 24 | cuda_deps 25 | ) 26 | 27 | torchaudio_library( 28 | libctc_prefix_decoder 29 | "${libctc_prefix_decoder_src}" 30 | "${CMAKE_CURRENT_SOURCE_DIR}" 31 | "${additional_libs}" 32 | "" 33 | ) 34 | 35 | if (BUILD_TORCHAUDIO_PYTHON_EXTENSION) 36 | torchaudio_extension( 37 | pybind11_prefixctc 38 | src/python_binding.cpp 39 | "${CMAKE_CURRENT_SOURCE_DIR}" 40 | "libctc_prefix_decoder;${additional_libs}" 41 | "" 42 | ) 43 | endif() 44 | -------------------------------------------------------------------------------- /test/integration_tests/rnnt_pipeline_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torchaudio 3 | from torchaudio.pipelines import EMFORMER_RNNT_BASE_LIBRISPEECH 4 | from torchaudio.prototype.pipelines import EMFORMER_RNNT_BASE_MUSTC, EMFORMER_RNNT_BASE_TEDLIUM3 5 | 6 | 7 | @pytest.mark.parametrize( 8 | "bundle,lang,expected", 9 | [ 10 | (EMFORMER_RNNT_BASE_LIBRISPEECH, "en", "i have that curiosity beside me at this moment"), 11 | (EMFORMER_RNNT_BASE_MUSTC, "en", "I had that curiosity beside me at this moment."), 12 | (EMFORMER_RNNT_BASE_TEDLIUM3, "en", "i had that curiosity beside me at this moment"), 13 | ], 14 | ) 15 | def test_rnnt(bundle, sample_speech, expected): 16 | feature_extractor = bundle.get_feature_extractor() 17 | decoder = bundle.get_decoder().eval() 18 | token_processor = bundle.get_token_processor() 19 | waveform, _ = torchaudio.load(sample_speech) 20 | features, length = feature_extractor(waveform.squeeze()) 21 | hypotheses = decoder(features, length, 10) 22 | text = token_processor(hypotheses[0][0]) 23 | assert text == expected 24 | 
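The CUDA prefix beam-search library configured by the cuctc CMakeLists.txt above is surfaced in Python as `torchaudio.models.decoder.cuda_ctc_decoder`. A hedged sketch of how it is typically driven follows; the token file name and the random emissions are placeholders, and the keyword arguments reflect my understanding of the public API rather than a verbatim copy from the repository.

```python
# Minimal sketch, assuming torchaudio.models.decoder.cuda_ctc_decoder;
# "tokens.txt" is a hypothetical token file and the emissions are dummies.
import torch
from torchaudio.models.decoder import cuda_ctc_decoder

decoder = cuda_ctc_decoder("tokens.txt", nbest=1, beam_size=10)
log_probs = torch.randn(1, 100, 29).log_softmax(dim=-1).cuda()
lengths = torch.full((1,), 100, dtype=torch.int32).cuda()
results = decoder(log_probs, lengths)  # one list of hypotheses per utterance
print(results[0][0].words)
```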
-------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | Jinja2<3.1.0 2 | matplotlib<=3.8 3 | pyparsing<3,>=2.0.2 4 | 5 | # C++ docs 6 | breathe==4.34.0 7 | 8 | # Note: 9 | # When changing Sphinx-related packages, make sure that the custom behaviors in the following 10 | # locations are working as expected. 11 | # - source/_templates/layout.html 12 | # - source/_static/css/custom.css 13 | -e git+https://github.com/pytorch/pytorch_sphinx_theme.git@32a6550#egg=pytorch_sphinx_theme 14 | sphinx==5.1.1 15 | sphinxcontrib.katex==0.8.6 16 | sphinxcontrib.bibtex==2.4.2 17 | sphinx_gallery==0.11.1 18 | nbsphinx==0.8.8 19 | 20 | # https://github.com/bmcfee/resampy/issues/106 21 | # Since 2022-07-07, the build_docs CI job started to fail. 22 | # Pinning resampy to 0.2.2 resolves this. 23 | # The real cause is not known at the moment, but the use 24 | # of librosa seems to cause it. 25 | # https://github.com/bmcfee/resampy/issues/106 26 | # In our case, the tutorial that timed out is online_asr_tutorial, 27 | # which itself does not use resampy. 28 | # However, audio_feature_augmentation_tutorial is executed before that, 29 | # and it uses librosa. 30 | resampy==0.2.2 31 | -------------------------------------------------------------------------------- /src/libtorchaudio/rnnt/compute.cpp: -------------------------------------------------------------------------------- 1 | #include <libtorchaudio/rnnt/compute.h> 2 | #include <torch/script.h> 3 | 4 | std::tuple<torch::Tensor, c10::optional<torch::Tensor>> rnnt_loss( 5 | torch::Tensor& logits, 6 | const torch::Tensor& targets, 7 | const torch::Tensor& logit_lengths, 8 | const torch::Tensor& target_lengths, 9 | int64_t blank, 10 | double clamp, 11 | bool fused_log_softmax = true) { 12 | static auto op = torch::Dispatcher::singleton() 13 | .findSchemaOrThrow("torchaudio::rnnt_loss", "") 14 | .typed<decltype(rnnt_loss)>(); 15 | return op.call( 16 | logits, 17 | targets, 18 | logit_lengths, 19 | target_lengths, 20 | blank, 21 | clamp, 22 | fused_log_softmax); 23 | } 24 | 25 | TORCH_LIBRARY_FRAGMENT(torchaudio, m) { 26 | m.def( 27 | "rnnt_loss(Tensor logits," 28 | "Tensor targets," 29 | "Tensor logit_lengths," 30 | "Tensor target_lengths," 31 | "int blank," 32 | "float clamp," 33 | "bool fused_log_softmax) -> (Tensor, Tensor?)"); 34 | } 35 | -------------------------------------------------------------------------------- /src/libtorio/ffmpeg/hw_context.cpp: -------------------------------------------------------------------------------- 1 | #include <libtorio/ffmpeg/hw_context.h> 2 | 3 | namespace torio::io { 4 | namespace { 5 | 6 | static std::mutex MUTEX; 7 | static std::map<int, AVBufferRefPtr> CUDA_CONTEXT_CACHE; 8 | 9 | } // namespace 10 | 11 | AVBufferRef* get_cuda_context(int index) { 12 | std::lock_guard<std::mutex> lock(MUTEX); 13 | if (index == -1) { 14 | index = 0; 15 | } 16 | if (CUDA_CONTEXT_CACHE.count(index) == 0) { 17 | AVBufferRef* p = nullptr; 18 | int ret = av_hwdevice_ctx_create( 19 | &p, AV_HWDEVICE_TYPE_CUDA, std::to_string(index).c_str(), nullptr, 0); 20 | TORCH_CHECK( 21 | ret >= 0, 22 | "Failed to create CUDA device context on device ", 23 | index, 24 | "(", 25 | av_err2string(ret), 26 | ")"); 27 | assert(p); 28 | CUDA_CONTEXT_CACHE.emplace(index, p); 29 | return p; 30 | } 31 | AVBufferRefPtr& buffer = CUDA_CONTEXT_CACHE.at(index); 32 | return buffer; 33 | } 34 | 35 | void clear_cuda_context_cache() { 36 | std::lock_guard<std::mutex> lock(MUTEX); 37 | CUDA_CONTEXT_CACHE.clear(); 38 | } 39 | 40 | } // namespace torio::io 41 |
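`get_cuda_context` above is the piece that backs hardware-accelerated decoding on the Python side: the first request for a device creates an FFmpeg CUDA device context, and the cache ensures later streams on the same GPU reuse it. A hedged sketch of the user-facing path, assuming the `torchaudio.io.StreamReader` API, an FFmpeg build with NVDEC support, and a placeholder file name:

```python
# Minimal sketch, assuming an NVDEC-enabled FFmpeg build;
# "input.mp4" is a placeholder file name.
from torchaudio.io import StreamReader

reader = StreamReader("input.mp4")
# Decoding on GPU 0 reaches get_cuda_context(0) via av_hwdevice_ctx_create;
# the context is cached, so subsequent streams on cuda:0 share it.
reader.add_video_stream(frames_per_chunk=1, decoder="h264_cuvid", hw_accel="cuda:0")
for (frames,) in reader.stream():
    print(frames.device)  # cuda:0 -- decoded frames stay on the GPU
```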
-------------------------------------------------------------------------------- /src/libtorchaudio/sox/types.h: -------------------------------------------------------------------------------- 1 | #ifndef TORCHAUDIO_SOX_TYPES_H 2 | #define TORCHAUDIO_SOX_TYPES_H 3 | 4 | #include <sox.h> 5 | #include <torch/script.h> 6 | 7 | namespace torchaudio::sox { 8 | 9 | enum class Format { 10 | WAV, 11 | MP3, 12 | FLAC, 13 | VORBIS, 14 | AMR_NB, 15 | AMR_WB, 16 | AMB, 17 | SPHERE, 18 | GSM, 19 | HTK, 20 | }; 21 | 22 | Format get_format_from_string(const std::string& format); 23 | 24 | enum class Encoding { 25 | NOT_PROVIDED, 26 | UNKNOWN, 27 | PCM_SIGNED, 28 | PCM_UNSIGNED, 29 | PCM_FLOAT, 30 | FLAC, 31 | ULAW, 32 | ALAW, 33 | MP3, 34 | VORBIS, 35 | AMR_WB, 36 | AMR_NB, 37 | OPUS, 38 | }; 39 | 40 | std::string to_string(Encoding v); 41 | Encoding get_encoding_from_option(const c10::optional<std::string>& encoding); 42 | 43 | enum class BitDepth : unsigned { 44 | NOT_PROVIDED = 0, 45 | B8 = 8, 46 | B16 = 16, 47 | B24 = 24, 48 | B32 = 32, 49 | B64 = 64, 50 | }; 51 | 52 | BitDepth get_bit_depth_from_option(const c10::optional<int64_t>& bit_depth); 53 | 54 | std::string get_encoding(sox_encoding_t encoding); 55 | 56 | } // namespace torchaudio::sox 57 | 58 | #endif 59 | -------------------------------------------------------------------------------- /src/torchaudio/__init__.py: -------------------------------------------------------------------------------- 1 | # Initialize extension and backend first 2 | from . import _extension # noqa # usort: skip 3 | from ._backend import ( # noqa # usort: skip 4 | AudioMetaData, 5 | get_audio_backend, 6 | info, 7 | list_audio_backends, 8 | load, 9 | save, 10 | set_audio_backend, 11 | ) 12 | 13 | from . import ( # noqa: F401 14 | compliance, 15 | datasets, 16 | functional, 17 | io, 18 | kaldi_io, 19 | models, 20 | pipelines, 21 | sox_effects, 22 | transforms, 23 | utils, 24 | ) 25 | 26 | # For BC 27 | from . import backend # noqa # usort: skip 28 | 29 | try: 30 | from .version import __version__, git_version # noqa: F401 31 | except ImportError: 32 | pass 33 | 34 | 35 | __all__ = [ 36 | "AudioMetaData", 37 | "load", 38 | "info", 39 | "save", 40 | "io", 41 | "compliance", 42 | "datasets", 43 | "functional", 44 | "models", 45 | "pipelines", 46 | "kaldi_io", 47 | "utils", 48 | "sox_effects", 49 | "transforms", 50 | "list_audio_backends", 51 | "get_audio_backend", 52 | "set_audio_backend", 53 | ] 54 | -------------------------------------------------------------------------------- /examples/self_supervised_learning/README.md: -------------------------------------------------------------------------------- 1 | # Modularized Self-supervised Learning Recipe 2 | 3 | This directory contains the modularized training recipe for audio/speech self-supervised learning. The principle is to let users easily inject a new component (model, data_module, loss function, etc.) into the existing recipe for different tasks (e.g. Wav2Vec 2.0, HuBERT, etc.). 4 | 5 | 6 | ## HuBERT Pre-training Example 7 | To get the K-Means labels for HuBERT pre-training, please check the [pre-processing step](../hubert/README.md#pre-processing-1st-iteration) in the hubert example.
8 | 9 | In order to run the HuBERT pre-training script for the first iteration, users need to go to the `examples` directory and run the following SLURM command: 10 | ``` 11 | cd examples 12 | 13 | srun \ 14 | --gpus-per-node=8 \ 15 | --ntasks-per-node=8 \ 16 | -N 4 \ 17 | --cpus-per-task=10 \ 18 | python -m self_supervised_learning.train_hubert \ 19 | --dataset-path hubert/exp/data/mfcc/ \ 20 | --exp-dir self_supervised_learning/exp_iter1 \ 21 | --feature-type mfcc \ 22 | --num-class 100 \ 23 | --max-updates 250000 \ 24 | --learning-rate 0.0005 \ 25 | --gpus 8 \ 26 | --num-nodes 4 27 | ``` 28 | -------------------------------------------------------------------------------- /src/libtorio/ffmpeg/stream_reader/post_process.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | 5 | namespace torio::io { 6 | 7 | struct IPostDecodeProcess { 8 | virtual ~IPostDecodeProcess() = default; 9 | 10 | virtual int process_frame(AVFrame* frame) = 0; 11 | virtual c10::optional<Chunk> pop_chunk() = 0; 12 | virtual bool is_buffer_ready() const = 0; 13 | virtual const std::string& get_filter_desc() const = 0; 14 | virtual FilterGraphOutputInfo get_filter_output_info() const = 0; 15 | virtual void flush() = 0; 16 | }; 17 | 18 | std::unique_ptr<IPostDecodeProcess> get_audio_process( 19 | AVRational input_time_base, 20 | AVCodecContext* codec_ctx, 21 | const std::string& desc, 22 | int frames_per_chunk, 23 | int num_chunks); 24 | 25 | std::unique_ptr<IPostDecodeProcess> get_video_process( 26 | AVRational input_time_base, 27 | AVRational frame_rate, 28 | AVCodecContext* codec_ctx, 29 | const std::string& desc, 30 | int frames_per_chunk, 31 | int num_chunks, 32 | const torch::Device& device); 33 | 34 | } // namespace torio::io 35 | -------------------------------------------------------------------------------- /src/torchaudio/models/wav2vec2/__init__.py: -------------------------------------------------------------------------------- 1 | from . import utils 2 | from .model import ( 3 | hubert_base, 4 | hubert_large, 5 | hubert_pretrain_base, 6 | hubert_pretrain_large, 7 | hubert_pretrain_model, 8 | hubert_pretrain_xlarge, 9 | hubert_xlarge, 10 | HuBERTPretrainModel, 11 | wav2vec2_base, 12 | wav2vec2_large, 13 | wav2vec2_large_lv60k, 14 | wav2vec2_model, 15 | wav2vec2_xlsr_1b, 16 | wav2vec2_xlsr_2b, 17 | wav2vec2_xlsr_300m, 18 | Wav2Vec2Model, 19 | wavlm_base, 20 | wavlm_large, 21 | wavlm_model, 22 | ) 23 | 24 | __all__ = [ 25 | "Wav2Vec2Model", 26 | "HuBERTPretrainModel", 27 | "wavlm_model", 28 | "wavlm_base", 29 | "wavlm_large", 30 | "wav2vec2_model", 31 | "wav2vec2_base", 32 | "wav2vec2_large", 33 | "wav2vec2_large_lv60k", 34 | "hubert_base", 35 | "hubert_large", 36 | "hubert_xlarge", 37 | "hubert_pretrain_model", 38 | "hubert_pretrain_base", 39 | "hubert_pretrain_large", 40 | "hubert_pretrain_xlarge", 41 | "utils", 42 | "wav2vec2_xlsr_300m", 43 | "wav2vec2_xlsr_1b", 44 | "wav2vec2_xlsr_2b", 45 | ] 46 | -------------------------------------------------------------------------------- /docs/source/_templates/autosummary/dataset_class.rst: -------------------------------------------------------------------------------- 1 | .. 2 | autogenerated from source/_templates/autosummary/dataset_class.rst 3 | 4 | {{ name | underline }} 5 | 6 | ..
--------------------------------------------------------------------------------
/docs/source/_templates/autosummary/dataset_class.rst:
--------------------------------------------------------------------------------
1 | ..
2 |    autogenerated from source/_templates/autosummary/dataset_class.rst
3 | 
4 | {{ name | underline }}
5 | 
6 | .. autoclass:: {{ fullname }}
7 | 
8 | {%- if "get_metadata" in methods %}
9 | {%- set meth=["__getitem__", "get_metadata"] %}
10 | {%- else %}
11 | {%- set meth=["__getitem__"] %}
12 | {%- endif %}
13 | 
14 | {%- if name == "CMUDict" %}
15 | {%- set properties=["symbols"] %}
16 | {%- elif name == "TEDLIUM" %}
17 | {%- set properties=["phoneme_dict"] %}
18 | {%- else %}
19 | {%- set properties=[] %}
20 | {%- endif %}
21 | 
22 | {%- if properties %}
23 | 
24 | Properties
25 | ==========
26 | 
27 | {% for item in properties %}
28 | 
29 | {{item | underline("-") }}
30 | 
31 | .. container:: py attribute
32 | 
33 |    .. autoproperty:: {{[fullname, item] | join('.')}}
34 | 
35 | {%- endfor %}
36 | 
37 | {%- endif %}
38 | 
39 | {%- if properties %}
40 | 
41 | Methods
42 | =======
43 | 
44 | {%- endif %}
45 | 
46 | {% for item in meth %}
47 | 
48 | {{item | underline("-") }}
49 | 
50 | .. container:: py attribute
51 | 
52 |    .. automethod:: {{[fullname, item] | join('.')}}
53 | 
54 | {%- endfor %}
55 | 
--------------------------------------------------------------------------------
/examples/self_supervised_learning/lr_schedulers/_linear_decay.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch.optim.optimizer import Optimizer
3 | 
4 | 
5 | class LinearDecayLRScheduler(torch.optim.lr_scheduler._LRScheduler):
6 |     """Linear learning rate scheduler with warm up."""
7 | 
8 |     def __init__(
9 |         self,
10 |         optimizer: Optimizer,
11 |         warmup_updates: int,
12 |         max_updates: int,
13 |         last_epoch: int = -1,
14 |         verbose: bool = False,
15 |     ):
16 |         self.warmup_updates = warmup_updates
17 |         self.max_updates = max_updates
18 |         super().__init__(optimizer, last_epoch=last_epoch, verbose=verbose)
19 | 
20 |     def get_lr(self):
21 |         if self._step_count <= self.warmup_updates:
22 |             return [self._step_count / self.warmup_updates * base_lr for base_lr in self.base_lrs]
23 |         elif self._step_count >= self.max_updates:
24 |             return [0.0 for _ in self.base_lrs]
25 |         else:
26 |             pct_remaining = (self.max_updates - self._step_count) / (self.max_updates - self.warmup_updates)
27 |             return [base_lr * pct_remaining for base_lr in self.base_lrs]
28 | 
--------------------------------------------------------------------------------
/examples/libtorchaudio/README.md:
--------------------------------------------------------------------------------
1 | # Libtorchaudio Examples
2 | 
3 | * [Augmentation](./augmentation)
4 | * [Speech Recognition with wav2vec2.0](./speech_recognition)
5 | 
6 | ## Build
7 | 
8 | The example applications in this directory depend on `libtorch` and `libtorchaudio`.
9 | If you have a working `PyTorch`, you already have `libtorch`.
10 | Please refer to [this tutorial](https://pytorch.org/tutorials/advanced/torch_script_custom_classes.html) for the use of `libtorch` and TorchScript.
11 | 
12 | `libtorchaudio` is the library of torchaudio's C++ components, without the Python components.
13 | It is currently not distributed, and it will be built alongside the applications.
14 | 
15 | The following commands build `libtorchaudio` and the applications.
16 | 
17 | ```bash
18 | git submodule update
19 | mkdir build
20 | cd build
21 | cmake -GNinja \
22 |       -DCMAKE_PREFIX_PATH="$(python -c 'import torch;print(torch.utils.cmake_prefix_path)')" \
23 |       -DBUILD_SOX=ON \
24 |       -DBUILD_KALDI=OFF \
25 |       -DBUILD_RNNT=ON \
26 |       ..
27 | cmake --build .
28 | ```
29 | 
30 | For the usage of each application, refer to the corresponding application directory.
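
The TorchScript files consumed by these applications are produced from Python. The following is a minimal sketch of the export step (the module body and file name are illustrative, not the exact pipeline used by the examples):

```python
import torch

class Pipeline(torch.nn.Module):
    def forward(self, waveform: torch.Tensor) -> torch.Tensor:
        # Placeholder processing; real pipelines chain torchaudio transforms here.
        return 0.5 * waveform

# The saved archive can be loaded from C++ via torch::jit::load.
torch.jit.script(Pipeline()).save("pipeline.zip")
```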
31 | 
--------------------------------------------------------------------------------
/.github/workflows/integration-test.yml:
--------------------------------------------------------------------------------
1 | name: Integration Test
2 | 
3 | on:
4 |   pull_request:
5 |     branches: [ main ]
6 | 
7 |   workflow_dispatch:
8 | 
9 | jobs:
10 |   build:
11 | 
12 |     runs-on: ubuntu-22.04
13 |     strategy:
14 |       fail-fast: false
15 |       matrix:
16 |         python-version: [ 3.8 ]
17 | 
18 |     steps:
19 |     - uses: actions/checkout@v2
20 |     - name: Set up Python ${{ matrix.python-version }}
21 |       uses: actions/setup-python@v2
22 |       with:
23 |         python-version: ${{ matrix.python-version }}
24 |     - name: Install dependencies
25 |       run: |
26 |         sudo apt install -y -qq libavfilter-dev libavdevice-dev
27 |     - name: Install packages
28 |       run: |
29 |         python -m pip install --quiet --upgrade pip
30 |         python -m pip install --quiet --pre torch -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html
31 |         python -m pip install --quiet pytest requests cmake ninja deep-phonemizer sentencepiece flashlight-text git+https://github.com/kpu/kenlm
32 |         python setup.py install
33 |     - name: Run integration test
34 |       run: |
35 |         cd test && pytest integration_tests -v --use-tmp-hub-dir
36 | 
--------------------------------------------------------------------------------
/examples/avsr/data_prep/tools/README.md:
--------------------------------------------------------------------------------
1 | ## Face Recognition
2 | We provide [ibug.face_detection](https://github.com/hhj1897/face_detection) in this repository. You can install it directly from the GitHub repository or by using compressed files.
3 | 
4 | ### Option 1. Install from the GitHub repository
5 | 
6 | * [Git LFS](https://git-lfs.github.com/) is needed for downloading the pretrained weights that are larger than 100 MB.
7 | 
8 | You could install *`Homebrew`* and then install *`git-lfs`* without sudo privileges.
9 | 
10 | ```Shell
11 | git clone https://github.com/hhj1897/face_detection.git
12 | cd face_detection
13 | git lfs pull
14 | pip install -e .
15 | cd ..
16 | ```
17 | 
18 | ### Option 2. Install by using compressed files
19 | 
20 | If you are experiencing over-quota issues with the repository above, you can download the package as [face_detection.zip](https://www.doc.ic.ac.uk/~pm4115/tracker/face_detection.zip), unzip the files, and then run `pip install -e .` to install it.
21 | 
22 | ```Shell
23 | wget https://www.doc.ic.ac.uk/~pm4115/tracker/face_detection.zip -O ./face_detection.zip
24 | unzip -o ./face_detection.zip -d ./
25 | cd face_detection
26 | pip install -e .
27 | cd ..
28 | ```
29 | 
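After either installation option, a quick smoke test confirms the package is importable. This snippet is illustrative only; the class and model names follow the ibug.face_detection README and may differ across versions:

```python
from ibug.face_detection import RetinaFacePredictor

detector = RetinaFacePredictor(
    device="cpu", model=RetinaFacePredictor.get_model("resnet50")
)
print(type(detector))
```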
--------------------------------------------------------------------------------
/test/torchaudio_unittest/prototype/conv_emformer_test_impl.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torchaudio.prototype.models.conv_emformer import ConvEmformer
3 | from torchaudio_unittest.common_utils import TestBaseMixin
4 | from torchaudio_unittest.models.emformer.emformer_test_impl import EmformerTestMixin
5 | 
6 | 
7 | class ConvEmformerTestImpl(EmformerTestMixin, TestBaseMixin):
8 |     def gen_model(self, input_dim, right_context_length):
9 |         emformer = ConvEmformer(
10 |             input_dim,
11 |             8,
12 |             256,
13 |             3,
14 |             4,
15 |             12,
16 |             left_context_length=30,
17 |             right_context_length=right_context_length,
18 |             max_memory_size=1,
19 |         ).to(device=self.device, dtype=self.dtype)
20 |         return emformer
21 | 
22 |     def gen_inputs(self, input_dim, batch_size, num_frames, right_context_length):
23 |         input = torch.rand(batch_size, num_frames, input_dim).to(device=self.device, dtype=self.dtype)
24 |         lengths = torch.randint(1, num_frames - right_context_length, (batch_size,)).to(
25 |             device=self.device, dtype=self.dtype
26 |         )
27 |         return input, lengths
28 | 
--------------------------------------------------------------------------------
/examples/pipeline_wavernn/processing.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | 
4 | 
5 | class NormalizeDB(nn.Module):
6 |     r"""Normalize the spectrogram with a minimum dB value"""
7 | 
8 |     def __init__(self, min_level_db, normalization):
9 |         super().__init__()
10 |         self.min_level_db = min_level_db
11 |         self.normalization = normalization
12 | 
13 |     def forward(self, specgram):
14 |         specgram = torch.log10(torch.clamp(specgram.squeeze(0), min=1e-5))
15 |         if self.normalization:
16 |             return torch.clamp((self.min_level_db - 20 * specgram) / self.min_level_db, min=0, max=1)
17 |         return specgram
18 | 
19 | 
20 | def normalized_waveform_to_bits(waveform: torch.Tensor, bits: int) -> torch.Tensor:
21 |     r"""Transform waveform [-1, 1] to label [0, 2 ** bits - 1]"""
22 | 
23 |     assert abs(waveform).max() <= 1.0
24 |     waveform = (waveform + 1.0) * (2**bits - 1) / 2
25 |     return torch.clamp(waveform, 0, 2**bits - 1).int()
26 | 
27 | 
28 | def bits_to_normalized_waveform(label: torch.Tensor, bits: int) -> torch.Tensor:
29 |     r"""Transform label [0, 2 ** bits - 1] to waveform [-1, 1]"""
30 | 
31 |     return 2 * label / (2**bits - 1.0) - 1.0
32 | 
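A quick round-trip through the two conversion functions above shows that the quantization error is bounded by one step, i.e. 2 / (2 ** bits - 1). A minimal sketch, assuming `processing.py` is on the import path:

```python
import torch
from processing import bits_to_normalized_waveform, normalized_waveform_to_bits

bits = 8
waveform = torch.rand(1, 100) * 2 - 1  # values in [-1, 1]
labels = normalized_waveform_to_bits(waveform, bits)
recovered = bits_to_normalized_waveform(labels, bits)
assert (waveform - recovered).abs().max() < 2 / (2**bits - 1)
```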
--------------------------------------------------------------------------------
/src/libtorio/ffmpeg/stream_reader/buffer/chunked_buffer.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | #include <libtorio/ffmpeg/ffmpeg.h>
3 | #include <libtorio/ffmpeg/stream_reader/typedefs.h>
4 | 
5 | namespace torio::io::detail {
6 | 
7 | class ChunkedBuffer {
8 |   // Each AVFrame is converted to a Tensor and stored here.
9 |   std::deque<torch::Tensor> chunks;
10 |   // Time stamps corresponding to the first frame of each chunk
11 |   std::deque<int64_t> pts;
12 |   AVRational time_base;
13 | 
14 |   // The number of frames to return as a chunk
15 |   // If <0, then the user wants to receive all the frames
16 |   const int64_t frames_per_chunk;
17 |   // The number of chunks to retain
18 |   const int64_t num_chunks;
19 |   // The number of currently stored frames.
20 |   // For video, one Tensor corresponds to one frame, but for audio,
21 |   // one Tensor contains multiple samples, so we track the frame count here.
22 |   int64_t num_buffered_frames = 0;
23 | 
24 |  public:
25 |   ChunkedBuffer(AVRational time_base, int frames_per_chunk, int num_chunks);
26 | 
27 |   bool is_ready() const;
28 |   void flush();
29 |   c10::optional<Chunk> pop_chunk();
30 |   void push_frame(torch::Tensor frame, int64_t pts_);
31 | };
32 | 
33 | } // namespace torio::io::detail
34 | 
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/documentation.yml:
--------------------------------------------------------------------------------
1 | name: 📚 Documentation
2 | description: Report an issue related to https://pytorch.org/audio/stable/index.html
3 | 
4 | body:
5 | - type: markdown
6 |   attributes:
7 |     value: >
8 |       PLEASE NOTE THAT THE TORCHAUDIO REPOSITORY IS NO LONGER ACTIVELY MONITORED. You will likely not get a response. For open discussions, visit https://discuss.pytorch.org/.
9 | - type: textarea
10 |   attributes:
11 |     label: 📚 The doc issue
12 |     description: >
13 |       A description of what content in https://pytorch.org/audio/stable/index.html is an issue. If this has to do with the general https://pytorch.org website, please file an issue at https://github.com/pytorch/pytorch.github.io/issues/new/choose instead. If this has to do with https://pytorch.org/tutorials, please file an issue at https://github.com/pytorch/tutorials/issues/new.
14 |   validations:
15 |     required: true
16 | - type: textarea
17 |   attributes:
18 |     label: Suggest a potential alternative/fix
19 |     description: >
20 |       Tell us how we could improve the documentation in this regard.
21 | - type: markdown
22 |   attributes:
23 |     value: >
24 |       Thanks for contributing 🎉!
25 | 
--------------------------------------------------------------------------------
/docs/source/datasets.rst:
--------------------------------------------------------------------------------
1 | .. py:module:: torchaudio.datasets
2 | 
3 | torchaudio.datasets
4 | ====================
5 | 
6 | All datasets are subclasses of :class:`torch.utils.data.Dataset`
7 | and have ``__getitem__`` and ``__len__`` methods implemented.
8 | 
9 | Hence, they can all be passed to a :class:`torch.utils.data.DataLoader`,
10 | which can load multiple samples in parallel using :mod:`torch.multiprocessing` workers.
11 | For example:
12 | 
13 | .. code::
14 | 
15 |     yesno_data = torchaudio.datasets.YESNO('.', download=True)
16 |     data_loader = torch.utils.data.DataLoader(
17 |         yesno_data,
18 |         batch_size=1,
19 |         shuffle=True,
20 |         num_workers=args.nThreads)
21 | 
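For batch sizes greater than one, variable-length waveforms need to be collated
before batching. A minimal sketch of such a collate function (using ``YESNO``
items, whose first element is the waveform) is:

.. code::

    import torch

    def collate_fn(batch):
        waveforms = [item[0].squeeze(0) for item in batch]  # (time,) each
        lengths = torch.tensor([w.numel() for w in waveforms])
        padded = torch.nn.utils.rnn.pad_sequence(waveforms, batch_first=True)
        return padded, lengths
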
22 | .. currentmodule:: torchaudio.datasets
23 | 
24 | .. autosummary::
25 |     :toctree: generated
26 |     :nosignatures:
27 |     :template: autosummary/dataset_class.rst
28 | 
29 |     CMUARCTIC
30 |     CMUDict
31 |     COMMONVOICE
32 |     DR_VCTK
33 |     FluentSpeechCommands
34 |     GTZAN
35 |     IEMOCAP
36 |     LibriMix
37 |     LIBRISPEECH
38 |     LibriLightLimited
39 |     LIBRITTS
40 |     LJSPEECH
41 |     MUSDB_HQ
42 |     QUESST14
43 |     Snips
44 |     SPEECHCOMMANDS
45 |     TEDLIUM
46 |     VCTK_092
47 |     VoxCeleb1Identification
48 |     VoxCeleb1Verification
49 |     YESNO
50 | 
--------------------------------------------------------------------------------
/src/libtorchaudio/sox/io.h:
--------------------------------------------------------------------------------
1 | #ifndef TORCHAUDIO_SOX_IO_H
2 | #define TORCHAUDIO_SOX_IO_H
3 | 
4 | #include <libtorchaudio/sox/utils.h>
5 | #include <torch/script.h>
6 | 
7 | namespace torchaudio::sox {
8 | 
9 | auto get_effects(
10 |     const c10::optional<int64_t>& frame_offset,
11 |     const c10::optional<int64_t>& num_frames)
12 |     -> std::vector<std::vector<std::string>>;
13 | 
14 | std::tuple<int64_t, int64_t, int64_t, int64_t, std::string> get_info_file(
15 |     const std::string& path,
16 |     const c10::optional<std::string>& format);
17 | 
18 | std::tuple<torch::Tensor, int64_t> load_audio_file(
19 |     const std::string& path,
20 |     const c10::optional<int64_t>& frame_offset,
21 |     const c10::optional<int64_t>& num_frames,
22 |     c10::optional<bool> normalize,
23 |     c10::optional<bool> channels_first,
24 |     const c10::optional<std::string>& format);
25 | 
26 | void save_audio_file(
27 |     const std::string& path,
28 |     torch::Tensor tensor,
29 |     int64_t sample_rate,
30 |     bool channels_first,
31 |     c10::optional<double> compression,
32 |     c10::optional<std::string> format,
33 |     c10::optional<std::string> encoding,
34 |     c10::optional<int64_t> bits_per_sample);
35 | 
36 | } // namespace torchaudio::sox
37 | 
38 | #endif
39 | 
--------------------------------------------------------------------------------
/.github/workflows/lint.yml:
--------------------------------------------------------------------------------
1 | name: Lint
2 | 
3 | on:
4 |   pull_request:
5 |   push:
6 |     branches:
7 |       - nightly
8 |       - main
9 |       - release/*
10 |   workflow_dispatch:
11 | 
12 | jobs:
13 |   python-source-and-configs:
14 |     uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
15 |     with:
16 |       repository: pytorch/audio
17 |       script: |
18 |         set -euo pipefail
19 | 
20 |         echo '::group::Setup environment'
21 |         eval "$("$(which conda)" shell.bash hook)"
22 |         # libcst does not have 3.11 pre-built binaries yet. Use python 3.10
23 |         conda create -y --name env python=3.10
24 |         conda activate env
25 |         pip3 install --progress-bar=off pre-commit
26 |         echo '::endgroup::'
27 | 
28 |         set +e
29 |         pre-commit run --all-files --show-diff-on-failure
30 |         status=$?
31 | 
32 |         echo '::group::Add Summary'
33 |         if [ $status -ne 0 ]; then
34 |           echo '### Lint failure' >> $GITHUB_STEP_SUMMARY
35 |           echo '```diff' >> $GITHUB_STEP_SUMMARY
36 |           git --no-pager diff >> $GITHUB_STEP_SUMMARY
37 |           echo '```' >> $GITHUB_STEP_SUMMARY
38 |         fi
39 |         echo '::endgroup::'
40 |         exit $status
41 | 
--------------------------------------------------------------------------------
/examples/libtorchaudio/augmentation/README.md:
--------------------------------------------------------------------------------
1 | # Augmentation
2 | 
3 | This example demonstrates how you can use torchaudio's I/O features and augmentations in a C++ application.
4 | 
5 | **NOTE**
6 | This example uses the `"sox_io"` backend, thus it does not work on Windows.
7 | 
8 | ## Steps
9 | ### 1. Create the augmentation pipeline TorchScript file.
10 | 
11 | First, we implement our data processing pipeline as regular Python, and save it as a TorchScript object.
12 | We will load and execute it in our C++ application. The C++ code is found in [`main.cpp`](./main.cpp).
13 | 
14 | ```bash
15 | python create_jittable_pipeline.py \
16 |     --rir-path "../data/rir.wav" \
17 |     --output-path "./pipeline.zip"
18 | ```
19 | 
20 | ### 2. Build the application
21 | 
22 | Please refer to [the top level README.md](../README.md).
23 | 
24 | ### 3. Run the application
25 | 
26 | Now we run the C++ application `augment`, with the TorchScript object we created in Step 1 and an input audio file.
27 | 
28 | In [the top level directory](../):
29 | 
30 | ```bash
31 | input_audio_file="./data/input.wav"
32 | ./build/augmentation/augment ./augmentation/pipeline.zip "${input_audio_file}" "output.wav"
33 | ```
34 | 
35 | When you give it a clean speech file, the output audio sounds like a phone conversation.
36 | 
--------------------------------------------------------------------------------
/test/torchaudio_unittest/assets/wav2vec2/huggingface/generate_huggingface_model_config.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 | 
4 | from transformers import Wav2Vec2Model
5 | 
6 | _THIS_DIR = os.path.dirname(os.path.abspath(__file__))
7 | 
8 | 
9 | def _main():
10 |     keys = [
11 |         # pretrained
12 |         "facebook/wav2vec2-base",
13 |         "facebook/wav2vec2-large",
14 |         "facebook/wav2vec2-large-lv60",
15 |         "facebook/wav2vec2-base-10k-voxpopuli",
16 |         "facebook/wav2vec2-large-xlsr-53",
17 |         # finetuned
18 |         "facebook/wav2vec2-base-960h",
19 |         "facebook/wav2vec2-large-960h",
20 |         "facebook/wav2vec2-large-960h-lv60",
21 |         "facebook/wav2vec2-large-960h-lv60-self",
22 |         "facebook/wav2vec2-large-xlsr-53-german",
23 |     ]
24 |     for key in keys:
25 |         path = os.path.join(_THIS_DIR, f"{key}.json")
26 |         print("Generating ", path)
27 |         cfg = Wav2Vec2Model.from_pretrained(key).config
28 |         cfg = json.loads(cfg.to_json_string())
29 |         del cfg["_name_or_path"]
30 | 
31 |         with open(path, "w") as file_:
32 |             file_.write(json.dumps(cfg, indent=4, sort_keys=True))
33 |             file_.write("\n")
34 | 
35 | 
36 | if __name__ == "__main__":
37 |     _main()
38 | 
--------------------------------------------------------------------------------
/test/torchaudio_unittest/functional/kaldi_compatibility_test_impl.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torchaudio.functional as F
3 | from torchaudio_unittest.common_utils import skipIfNoExec, TempDirMixin, TestBaseMixin
4 | from torchaudio_unittest.common_utils.kaldi_utils import convert_args, run_kaldi
5 | 
6 | 
7 | class Kaldi(TempDirMixin, TestBaseMixin):
8 |     def assert_equal(self, output, *, expected, rtol=None, atol=None):
9 |         expected = expected.to(dtype=self.dtype, device=self.device)
10 |         self.assertEqual(output, expected, rtol=rtol, atol=atol)
11 | 
12 |     @skipIfNoExec("apply-cmvn-sliding")
13 |     def test_sliding_window_cmn(self):
14 |         """sliding_window_cmn should be numerically compatible with apply-cmvn-sliding"""
15 |         kwargs = {
16 |             "cmn_window": 600,
17 |             "min_cmn_window": 100,
18 |             "center": False,
19 |             "norm_vars": False,
20 |         }
21 | 
22 |         tensor = torch.randn(40, 10, dtype=self.dtype, device=self.device)
23 |         result = F.sliding_window_cmn(tensor, **kwargs)
24 |         command = ["apply-cmvn-sliding"] + convert_args(**kwargs) + ["ark:-", "ark:-"]
25 |         kaldi_result = run_kaldi(command, "ark", tensor)
26 |         self.assert_equal(result, expected=kaldi_result)
27 | 
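The functional exercised by this test can also be used directly. A minimal sketch of applying the same normalization outside the test harness:

```python
import torch
import torchaudio.functional as F

specgram = torch.randn(40, 10)  # (frames, features), mirroring the test input
normalized = F.sliding_window_cmn(
    specgram, cmn_window=600, min_cmn_window=100, center=False, norm_vars=False
)
print(normalized.shape)  # same shape as the input
```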
--------------------------------------------------------------------------------
/.github/scripts/unittest-windows/set_cuda_envs.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | set -euxo pipefail
3 | 
4 | if [ -z "${CUDA_VERSION:-}" ] ; then
5 |     version="cpu"
6 | else
7 |     version="$CUDA_VERSION"
8 | fi
9 | 
10 | # Don't use `if [[ "$version" == "cpu" ]]; then exit 0; fi` here.
11 | # That would exit the whole shell, and one consequence is that the CPU tests would not run.
12 | # Unless there's an error, don't exit.
13 | if [[ "$version" != "cpu" ]]; then
14 |     # set cuda envs
15 |     export PATH="/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v${version}/bin:/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v${version}/libnvvp:$PATH"
16 |     export CUDA_PATH_V${version/./_}="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v${version}"
17 |     export CUDA_PATH="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v${version}"
18 | 
19 |     if [ ! -d "$CUDA_PATH" ]
20 |     then
21 |         echo "$CUDA_PATH" does not exist
22 |         exit 1
23 |     fi
24 | 
25 |     # check cuda driver version
26 |     for path in '/c/Program Files/NVIDIA Corporation/NVSMI/nvidia-smi.exe' /c/Windows/System32/nvidia-smi.exe; do
27 |         if [[ -x "$path" ]]; then
28 |             "$path" || echo "true";
29 |             break
30 |         fi
31 |     done
32 | 
33 |     which nvcc
34 |     nvcc --version
35 |     env | grep CUDA
36 | fi
37 | 
--------------------------------------------------------------------------------
/examples/avsr/models/fusion.py:
--------------------------------------------------------------------------------
1 | import torch
2 | 
3 | 
4 | class FeedForwardModule(torch.nn.Module):
5 |     r"""Positionwise feed forward layer.
6 | 
7 |     Args:
8 |         input_dim (int): input dimension.
9 |         hidden_dim (int): hidden dimension.
10 |         output_dim (int): output dimension.
11 |         dropout (float, optional): dropout probability. (Default: 0.0)
12 |     """
13 | 
14 |     def __init__(self, input_dim: int, hidden_dim: int, output_dim: int, dropout: float = 0.0) -> None:
15 |         super().__init__()
16 |         self.sequential = torch.nn.Sequential(
17 |             torch.nn.LayerNorm(input_dim),
18 |             torch.nn.Linear(input_dim, hidden_dim, bias=True),
19 |             torch.nn.SiLU(),
20 |             torch.nn.Dropout(dropout),
21 |             torch.nn.Linear(hidden_dim, output_dim, bias=True),
22 |             torch.nn.Dropout(dropout),
23 |         )
24 | 
25 |     def forward(self, input: torch.Tensor) -> torch.Tensor:
26 |         r"""
27 |         Args:
28 |             input (torch.Tensor): input, with shape `(*, input_dim)`.
29 | 
30 |         Returns:
31 |             torch.Tensor: output, with shape `(*, output_dim)`.
32 |         """
33 |         return self.sequential(input)
34 | 
35 | 
36 | def fusion_module(input_dim=1024, hidden_dim=3072, output_dim=512, dropout=0.1):
37 |     return FeedForwardModule(input_dim, hidden_dim, output_dim, dropout)
38 | 
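With the defaults, this maps 1024-dimensional audiovisual features down to 512 dimensions. A minimal usage sketch, assuming `fusion.py` is on the import path:

```python
import torch
from fusion import fusion_module

fusion = fusion_module()             # defaults: 1024 -> 3072 -> 512
features = torch.randn(8, 50, 1024)  # (batch, time, feature)
fused = fusion(features)
print(fused.shape)                   # torch.Size([8, 50, 512])
```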
--------------------------------------------------------------------------------
/docs/source/prototype.models.rst:
--------------------------------------------------------------------------------
1 | torchaudio.prototype.models
2 | ===========================
3 | 
4 | .. py:module:: torchaudio.prototype.models
5 | .. currentmodule:: torchaudio.prototype.models
6 | 
7 | 
8 | The ``torchaudio.prototype.models`` subpackage contains definitions of models for addressing common audio tasks.
9 | 
10 | .. note::
11 |    For models with pre-trained parameters, please refer to the :mod:`torchaudio.prototype.pipelines` module.
12 | 
13 | Model definitions are responsible for constructing computation graphs and executing them.
14 | 
15 | Some models have complex structure and variations.
16 | For such models, factory functions are provided.
17 | 
18 | .. autosummary::
19 |     :toctree: generated
20 |     :nosignatures:
21 |     :template: autosummary/prototype_model_class.rst
22 | 
23 |     ConformerWav2Vec2PretrainModel
24 |     ConvEmformer
25 |     HiFiGANVocoder
26 | 
27 | Prototype Factory Functions of Beta Models
28 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
29 | 
30 | .. currentmodule:: torchaudio.models
31 | 
32 | Some model definitions are in beta, but there are new factory functions that are still in prototype. Please check the "Prototype Factory Functions" section in each model.
33 | 
34 | .. autosummary::
35 |     :toctree: generated
36 |     :nosignatures:
37 |     :template: autosummary/model_class.rst
38 | 
39 |     Wav2Vec2Model
40 |     RNNT
41 | 
--------------------------------------------------------------------------------
/third_party/ffmpeg/single/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | # CMake file for searching existing FFmpeg installation and defining ffmpeg TARGET
2 | 
3 | message(STATUS "Searching existing FFmpeg installation")
4 | message(STATUS FFMPEG_ROOT=$ENV{FFMPEG_ROOT})
5 | if (NOT DEFINED ENV{FFMPEG_ROOT})
6 |   message(FATAL_ERROR "Environment variable FFMPEG_ROOT is not set.")
7 | endif()
8 | 
9 | set(_root $ENV{FFMPEG_ROOT})
10 | set(lib_dirs "${_root}/lib" "${_root}/bin")
11 | set(include_dir "${_root}/include")
12 | 
13 | add_library(ffmpeg INTERFACE)
14 | target_include_directories(ffmpeg INTERFACE "${include_dir}")
15 | 
16 | function (_find_ffmpeg_lib component)
17 |   find_path("${component}_header"
18 |     NAMES "lib${component}/${component}.h"
19 |     PATHS "${include_dir}"
20 |     DOC "The include directory for ${component}"
21 |     REQUIRED
22 |     NO_DEFAULT_PATH)
23 |   find_library("lib${component}"
24 |     NAMES "${component}"
25 |     PATHS ${lib_dirs}
26 |     DOC "${component} library"
27 |     REQUIRED
28 |     NO_DEFAULT_PATH)
29 |   message(STATUS "Found ${component}: ${lib${component}}")
30 |   target_link_libraries(
31 |     ffmpeg
32 |     INTERFACE
33 |     ${lib${component}})
34 | endfunction ()
35 | 
36 | _find_ffmpeg_lib(avutil)
37 | _find_ffmpeg_lib(avcodec)
38 | _find_ffmpeg_lib(avformat)
39 | _find_ffmpeg_lib(avdevice)
40 | _find_ffmpeg_lib(avfilter)
41 | 
--------------------------------------------------------------------------------
/docs/source/_templates/autosummary/cuda_ctc_decoder_class.rst:
--------------------------------------------------------------------------------
1 | ..
2 |    autogenerated from source/_templates/autosummary/cuda_ctc_decoder_class.rst
3 | 
4 | 
5 | {#
6 | ################################################################################
7 | # autosummary template for CUCTCDecoder
8 | # Since the class has multiple methods and support structures,
9 | # we want to have them show up in the table of contents.
10 | # The default class template does not do this, so we use a custom one here.
11 | ################################################################################
12 | #}
13 | 
14 | {{ name | underline }}
15 | 
16 | {%- if name != "CUCTCDecoder" %}
17 | 
18 | .. autofunction:: {{fullname}}
19 | 
20 | {%- else %}
21 | 
22 | .. autoclass:: {{ fullname }}()
23 | 
24 | Methods
25 | =======
26 | 
27 | {%- for item in members %}
28 | {%- if not item.startswith('_') or item == "__call__" %}
29 | 
30 | {{ item | underline("-") }}
31 | 
32 | .. container:: py attribute
33 | 
34 |    .. automethod:: {{[fullname, item] | join('.')}}
35 | 
36 | {%- endif %}
37 | {%- endfor %}
38 | 
39 | Support Structures
40 | ==================
41 | 
42 | {%- for item in ["CUCTCHypothesis"] %}
43 | 
44 | {{ item | underline("-") }}
45 | 
46 | .. autoclass:: torchaudio.models.decoder.{{item}}
47 |    :members:
48 | 
49 | {%- endfor %}
50 | 
51 | {%- endif %}
52 | 
--------------------------------------------------------------------------------
/.clang-tidy:
--------------------------------------------------------------------------------
1 | ---
2 | # NOTE there must be no spaces before the '-' and check name.
3 | # If you edit this list, please verify the list of enabled checks with
4 | # clang-tidy --list-checks
5 | InheritParentConfig: true
6 | Checks: '
7 | bugprone-*,
8 | -bugprone-forward-declaration-namespace,
9 | -bugprone-macro-parentheses,
10 | -clang-analyzer-*,
11 | cppcoreguidelines-*,
12 | -cppcoreguidelines-interfaces-global-init,
13 | -cppcoreguidelines-owning-memory,
14 | -cppcoreguidelines-pro-bounds-array-to-pointer-decay,
15 | -cppcoreguidelines-pro-bounds-constant-array-index,
16 | -cppcoreguidelines-pro-bounds-pointer-arithmetic,
17 | -cppcoreguidelines-pro-type-cstyle-cast,
18 | -cppcoreguidelines-pro-type-reinterpret-cast,
19 | -cppcoreguidelines-pro-type-static-cast-downcast,
20 | -cppcoreguidelines-pro-type-union-access,
21 | -cppcoreguidelines-pro-type-vararg,
22 | -cppcoreguidelines-special-member-functions,
23 | -facebook-hte-RelativeInclude,
24 | hicpp-exception-baseclass,
25 | hicpp-avoid-goto,
26 | modernize-*,
27 | -modernize-concat-nested-namespaces,
28 | -modernize-return-braced-init-list,
29 | -modernize-use-auto,
30 | -modernize-use-default-member-init,
31 | -modernize-use-trailing-return-type,
32 | -modernize-use-using,
33 | performance-unnecessary-value-param,
34 | '
35 | HeaderFilterRegex: 'torchaudio/.*'
36 | AnalyzeTemporaryDtors: false
37 | CheckOptions:
38 | ...
39 | 
--------------------------------------------------------------------------------
/examples/pipeline_wav2letter/languagemodels.py:
--------------------------------------------------------------------------------
1 | import collections
2 | import itertools
3 | 
4 | 
5 | class LanguageModel:
6 |     def __init__(self, labels, char_blank, char_space):
7 | 
8 |         self.char_space = char_space
9 |         self.char_blank = char_blank
10 | 
11 |         labels = list(labels)
12 |         self.length = len(labels)
13 |         enumerated = list(enumerate(labels))
14 |         flipped = [(sub[1], sub[0]) for sub in enumerated]
15 | 
16 |         d1 = collections.OrderedDict(enumerated)
17 |         d2 = collections.OrderedDict(flipped)
18 |         self.mapping = {**d1, **d2}
19 | 
20 |     def encode(self, iterable):
21 |         if isinstance(iterable, list):
22 |             return [self.encode(i) for i in iterable]
23 |         else:
24 |             return [self.mapping[i] + self.mapping[self.char_blank] for i in iterable]
25 | 
26 |     def decode(self, tensor):
27 |         if len(tensor) > 0 and isinstance(tensor[0], list):
28 |             return [self.decode(t) for t in tensor]
29 |         else:
30 |             # not idempotent, since we clean up the string
31 |             x = (self.mapping[i] for i in tensor)
32 |             x = "".join(i for i, _ in itertools.groupby(x))
33 |             x = x.replace(self.char_blank, "")
34 |             # x = x.strip()
35 |             return x
36 | 
37 |     def __len__(self):
38 |         return self.length
39 | 
--------------------------------------------------------------------------------
/packaging/cut_release.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | #
3 | # Usage (run from root of project):
4 | # TEST_INFRA_BRANCH=release/2.1 RELEASE_BRANCH=release/2.1 RELEASE_VERSION=2.1.0 packaging/cut_release.sh
5 | #
6 | # TEST_INFRA_BRANCH: The release branch of test-infra that houses all reusable
7 | # workflows
8 | #
9 | # RELEASE_BRANCH: The name of the release branch for this repo
10 | #
11 | # RELEASE_VERSION: Version of this current release
12 | 
13 | set -eou pipefail
14 | 
15 | # Create and check out the release branch
16 | git checkout -b "${RELEASE_BRANCH}"
17 | 
18 | # Change all GitHub Actions to reference the test-infra release branch
19 | # as opposed to main.
20 | for i in .github/workflows/*.yml; do
21 |   if [[ "$OSTYPE" == "darwin"* ]]; then
22 |     sed -i '' -e s#@main#@"${TEST_INFRA_BRANCH}"# $i;
23 |     sed -i '' -e s#test-infra-ref:[[:space:]]main#"test-infra-ref: ${TEST_INFRA_BRANCH}"# $i;
24 |   else
25 |     sed -i -e s#@main#@"${TEST_INFRA_BRANCH}"# $i;
26 |     sed -i -e s#test-infra-ref:[[:space:]]main#"test-infra-ref: ${TEST_INFRA_BRANCH}"# $i;
27 |   fi
28 | done
29 | 
30 | # Update the Release Version in version.txt
31 | echo "${RELEASE_VERSION}" >version.txt
32 | 
33 | # Optional
34 | # git add ./github/workflows/*.yml version.txt
35 | # git commit -m "[RELEASE-ONLY CHANGES] Branch Cut for Release {RELEASE_VERSION}"
36 | # git push origin "${RELEASE_BRANCH}"
37 | 
--------------------------------------------------------------------------------
/docs/source/_templates/autosummary/ctc_decoder_class.rst:
--------------------------------------------------------------------------------
1 | ..
2 |    autogenerated from source/_templates/autosummary/ctc_decoder_class.rst
3 | 
4 | 
5 | {#
6 | ################################################################################
7 | # autosummary template for CTCDecoder
8 | # Since the class has multiple methods and support structures,
9 | # we want to have them show up in the table of contents.
10 | # The default class template does not do this, so we use a custom one here.
11 | ################################################################################ 12 | #} 13 | 14 | {{ name | underline }} 15 | 16 | {%- if name != "CTCDecoder" %} 17 | 18 | .. autofunction:: {{fullname}} 19 | 20 | {%- else %} 21 | 22 | .. autoclass:: {{ fullname }}() 23 | 24 | Methods 25 | ======= 26 | 27 | {%- for item in members %} 28 | {%- if not item.startswith('_') or item == "__call__" %} 29 | 30 | {{ item | underline("-") }} 31 | 32 | .. container:: py attribute 33 | 34 | .. automethod:: {{[fullname, item] | join('.')}} 35 | 36 | {%- endif %} 37 | {%- endfor %} 38 | 39 | Support Structures 40 | ================== 41 | 42 | {%- for item in ["CTCHypothesis", "CTCDecoderLM", "CTCDecoderLMState"] %} 43 | 44 | {{ item | underline("-") }} 45 | 46 | .. autoclass:: torchaudio.models.decoder.{{item}} 47 | :members: 48 | 49 | {%- endfor %} 50 | 51 | {%- endif %} 52 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SPHINXPROJ = torchaudio 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | docset: html 16 | doc2dash --name $(SPHINXPROJ) --icon $(SOURCEDIR)/_static/img/pytorch-logo-flame.png --enable-js --online-redirect-url http://pytorch.org/audio/ --force $(BUILDDIR)/html/ 17 | 18 | # Manually fix because Zeal doesn't deal well with `icon.png`-only at 2x resolution. 19 | cp $(SPHINXPROJ).docset/icon.png $(SPHINXPROJ).docset/icon@2x.png 20 | convert $(SPHINXPROJ).docset/icon@2x.png -resize 16x16 $(SPHINXPROJ).docset/icon.png 21 | 22 | .PHONY: help Makefile docset 23 | 24 | # Catch-all target: route all unknown targets to Sphinx using the new 25 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
26 | %: Makefile 27 | doxygen source/Doxyfile 28 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 29 | @python post_process_dispatcher.py $(BUILDDIR) 30 | 31 | clean: 32 | rm -rf $(BUILDDIR)/* 33 | rm -rf $(SOURCEDIR)/generated/ 34 | rm -rf $(SOURCEDIR)/aen_images/ 35 | rm -rf $(SOURCEDIR)/gen_modules/ 36 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/example/emformer_rnnt/utils.py: -------------------------------------------------------------------------------- 1 | class MockSentencePieceProcessor: 2 | def __init__(self, num_symbols, *args, **kwargs): 3 | self.num_symbols = num_symbols 4 | 5 | def get_piece_size(self): 6 | return self.num_symbols 7 | 8 | def encode(self, input): 9 | return [1, 5, 2] 10 | 11 | def decode(self, input): 12 | return "hey" 13 | 14 | def unk_id(self): 15 | return 0 16 | 17 | def eos_id(self): 18 | return 1 19 | 20 | def pad_id(self): 21 | return 2 22 | 23 | 24 | class MockCustomDataset: 25 | def __init__(self, base_dataset, *args, **kwargs): 26 | self.base_dataset = base_dataset 27 | 28 | def __getitem__(self, n: int): 29 | return [self.base_dataset[n]] 30 | 31 | def __len__(self): 32 | return len(self.base_dataset) 33 | 34 | 35 | class MockDataloader: 36 | def __init__(self, base_dataset, batch_size, collate_fn, *args, **kwargs): 37 | self.base_dataset = base_dataset 38 | self.batch_size = batch_size 39 | self.collate_fn = collate_fn 40 | 41 | def __iter__(self): 42 | for sample in iter(self.base_dataset): 43 | if self.batch_size == 1: 44 | sample = [sample] 45 | yield self.collate_fn(sample) 46 | 47 | def __len__(self): 48 | return len(self.base_dataset) 49 | -------------------------------------------------------------------------------- /src/torchaudio/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from .cmuarctic import CMUARCTIC 2 | from .cmudict import CMUDict 3 | from .commonvoice import COMMONVOICE 4 | from .dr_vctk import DR_VCTK 5 | from .fluentcommands import FluentSpeechCommands 6 | from .gtzan import GTZAN 7 | from .iemocap import IEMOCAP 8 | from .librilight_limited import LibriLightLimited 9 | from .librimix import LibriMix 10 | from .librispeech import LIBRISPEECH 11 | from .librispeech_biasing import LibriSpeechBiasing 12 | from .libritts import LIBRITTS 13 | from .ljspeech import LJSPEECH 14 | from .musdb_hq import MUSDB_HQ 15 | from .quesst14 import QUESST14 16 | from .snips import Snips 17 | from .speechcommands import SPEECHCOMMANDS 18 | from .tedlium import TEDLIUM 19 | from .vctk import VCTK_092 20 | from .voxceleb1 import VoxCeleb1Identification, VoxCeleb1Verification 21 | from .yesno import YESNO 22 | 23 | 24 | __all__ = [ 25 | "COMMONVOICE", 26 | "LIBRISPEECH", 27 | "LibriSpeechBiasing", 28 | "LibriLightLimited", 29 | "SPEECHCOMMANDS", 30 | "VCTK_092", 31 | "DR_VCTK", 32 | "YESNO", 33 | "LJSPEECH", 34 | "GTZAN", 35 | "CMUARCTIC", 36 | "CMUDict", 37 | "LibriMix", 38 | "LIBRITTS", 39 | "TEDLIUM", 40 | "QUESST14", 41 | "MUSDB_HQ", 42 | "FluentSpeechCommands", 43 | "VoxCeleb1Identification", 44 | "VoxCeleb1Verification", 45 | "IEMOCAP", 46 | "Snips", 47 | ] 48 | -------------------------------------------------------------------------------- /packaging/vs2019/activate.bat: -------------------------------------------------------------------------------- 1 | :: Set env vars that tell distutils to use the compiler that we put on path 2 | SET DISTUTILS_USE_SDK=1 3 | SET MSSdk=1 4 | 5 | SET 
"VS_VERSION=16.0" 6 | SET "VS_MAJOR=16" 7 | SET "VS_YEAR=2019" 8 | 9 | set "MSYS2_ARG_CONV_EXCL=/AI;/AL;/OUT;/out" 10 | set "MSYS2_ENV_CONV_EXCL=CL" 11 | 12 | :: For Python 3.5+, ensure that we link with the dynamic runtime. See 13 | :: http://stevedower.id.au/blog/building-for-python-3-5-part-two/ for more info 14 | set "PY_VCRUNTIME_REDIST=%PREFIX%\\bin\\vcruntime140.dll" 15 | 16 | for /f "usebackq tokens=*" %%i in (`"%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe" -legacy -products * -version [16^,17^) -property installationPath`) do ( 17 | if exist "%%i" if exist "%%i\VC\Auxiliary\Build\vcvarsall.bat" ( 18 | set "VSINSTALLDIR=%%i\" 19 | goto :vswhere 20 | ) 21 | ) 22 | 23 | :vswhere 24 | 25 | :: Shorten PATH to avoid the `input line too long` error. 26 | SET MyPath=%PATH% 27 | 28 | setlocal EnableDelayedExpansion 29 | 30 | SET TempPath="%MyPath:;=";"%" 31 | SET var= 32 | FOR %%a IN (%TempPath%) DO ( 33 | IF EXIST %%~sa ( 34 | SET "var=!var!;%%~sa" 35 | ) 36 | ) 37 | 38 | set "TempPath=!var:~1!" 39 | endlocal & set "PATH=%TempPath%" 40 | 41 | :: Shorten current directory too 42 | FOR %%A IN (.) DO CD "%%~sA" 43 | 44 | :: other things added by install_activate.bat at package build time 45 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | default_language_version: 2 | node: 16.14.2 3 | 4 | repos: 5 | - repo: https://github.com/pre-commit/pre-commit-hooks 6 | rev: v4.0.1 7 | hooks: 8 | - id: check-docstring-first 9 | - id: check-toml 10 | - id: check-yaml 11 | exclude: packaging/.* 12 | - id: end-of-file-fixer 13 | 14 | - repo: https://github.com/omnilib/ufmt 15 | rev: v1.3.2 16 | hooks: 17 | - id: ufmt 18 | additional_dependencies: 19 | - black == 22.3 20 | - usort == 1.0.2 21 | - libcst == 0.4.1 22 | 23 | - repo: https://github.com/pre-commit/mirrors-clang-format 24 | rev: v11.0.1 25 | hooks: 26 | - id: clang-format 27 | 28 | - repo: https://github.com/pycqa/flake8 29 | rev: 4.0.1 30 | hooks: 31 | - id: flake8 32 | args: ['src', 'test', 'tools', 'docs/source/conf.py', 'examples'] 33 | exclude: 'build|docs/src|third_party' 34 | additional_dependencies: 35 | - flake8-breakpoint == 1.1.0 36 | - flake8-bugbear == 22.6.22 37 | - flake8-comprehensions == 3.10.0 38 | - flake8-pyi == 22.5.1 39 | - mccabe == 0.6.0 40 | - pycodestyle == 2.8.0 41 | 42 | - repo: https://github.com/pycqa/pydocstyle 43 | rev: 6.3.0 44 | hooks: 45 | - id: pydocstyle 46 | exclude: 'build|test|examples|third_party|docs|tools' 47 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature-request.yml: -------------------------------------------------------------------------------- 1 | name: 🚀 Feature request 2 | description: Submit a proposal/request for a new torchaudio feature 3 | 4 | body: 5 | - type: markdown 6 | attributes: 7 | value: > 8 | PLEASE NOTE THAT THE TORCHAUDIO REPOSITORY IS NO LONGER ACTIVELY MONITORED. You may not get a response. For open discussions, visit https://discuss.pytorch.org/. 9 | - type: textarea 10 | attributes: 11 | label: 🚀 The feature 12 | description: > 13 | A clear and concise description of the feature proposal 14 | validations: 15 | required: true 16 | - type: textarea 17 | attributes: 18 | label: Motivation, pitch 19 | description: > 20 | Please outline the motivation for the proposal. Is your feature request related to a specific problem? 
e.g., *"I'm working on X and would like Y to be possible"*. If this is related to another GitHub issue, please link here too. 21 | validations: 22 | required: true 23 | - type: textarea 24 | attributes: 25 | label: Alternatives 26 | description: > 27 | A description of any alternative solutions or features you've considered, if any. 28 | - type: textarea 29 | attributes: 30 | label: Additional context 31 | description: > 32 | Add any other context or screenshots about the feature request. 33 | - type: markdown 34 | attributes: 35 | value: > 36 | Thanks for contributing 🎉! 37 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/assets/decoder/kenlm_char.arpa: -------------------------------------------------------------------------------- 1 | \data\ 2 | ngram 1=8 3 | ngram 2=8 4 | ngram 3=8 5 | ngram 4=8 6 | ngram 5=8 7 | 8 | \1-grams: 9 | -1.146128 0 10 | 0 -0.30103 11 | -0.8731268 0 12 | -0.70679533 f -0.30103 13 | -0.70679533 o -0.30103 14 | -0.8731268 b -0.30103 15 | -0.8731268 a -0.30103 16 | -0.8731268 r -0.30103 17 | 18 | \2-grams: 19 | -0.24644431 r 0 20 | -0.22314323 f -0.30103 21 | -0.57694924 o f -0.30103 22 | -0.22314323 f o -0.30103 23 | -0.57694924 o o -0.30103 24 | -0.6314696 o b -0.30103 25 | -0.24644431 b a -0.30103 26 | -0.24644431 a r -0.30103 27 | 28 | \3-grams: 29 | -0.105970904 a r 0 30 | -0.41743615 o o f -0.30103 31 | -0.097394995 f o -0.30103 32 | -0.097394995 o f o -0.30103 33 | -0.19898036 f o o -0.30103 34 | -0.43555236 o o b -0.30103 35 | -0.105970904 o b a -0.30103 36 | -0.105970904 b a r -0.30103 37 | 38 | \4-grams: 39 | -0.049761247 b a r 0 40 | -0.4462542 f o o f -0.30103 41 | -0.045972984 o o f o -0.30103 42 | -0.08819265 f o o -0.30103 43 | -0.08819265 o f o o -0.30103 44 | -0.286727 f o o b -0.30103 45 | -0.049761247 o o b a -0.30103 46 | -0.049761247 o b a r -0.30103 47 | 48 | \5-grams: 49 | -0.02416831 o b a r 50 | -0.36759996 f o o f 51 | -0.022378458 f o o f o 52 | -0.041861475 o o f o o 53 | -0.29381964 f o o b 54 | -0.12011856 o f o o b 55 | -0.02416831 f o o b a 56 | -0.02416831 o o b a r 57 | 58 | \end\ 59 | -------------------------------------------------------------------------------- /src/libtorchaudio/cuctc/LICENSE: -------------------------------------------------------------------------------- 1 | BSD 2-Clause License 2 | 3 | Copyright (c) 2023 Nvidia 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 17 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | DISCLAIMED. 
--------------------------------------------------------------------------------
/src/libtorchaudio/cuctc/LICENSE:
--------------------------------------------------------------------------------
1 | BSD 2-Clause License
2 | 
3 | Copyright (c) 2023 Nvidia
4 | All rights reserved.
5 | 
6 | Redistribution and use in source and binary forms, with or without
7 | modification, are permitted provided that the following conditions are met:
8 | 
9 | * Redistributions of source code must retain the above copyright notice, this
10 |   list of conditions and the following disclaimer.
11 | 
12 | * Redistributions in binary form must reproduce the above copyright notice,
13 |   this list of conditions and the following disclaimer in the documentation
14 |   and/or other materials provided with the distribution.
15 | 
16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
20 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | BSD 2-Clause License
2 | 
3 | Copyright (c) 2017 Facebook Inc. (Soumith Chintala),
4 | All rights reserved.
5 | 
6 | Redistribution and use in source and binary forms, with or without
7 | modification, are permitted provided that the following conditions are met:
8 | 
9 | * Redistributions of source code must retain the above copyright notice, this
10 |   list of conditions and the following disclaimer.
11 | 
12 | * Redistributions in binary form must reproduce the above copyright notice,
13 |   this list of conditions and the following disclaimer in the documentation
14 |   and/or other materials provided with the distribution.
15 | 
16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
20 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | 
--------------------------------------------------------------------------------
/src/torchaudio/models/decoder/__init__.py:
--------------------------------------------------------------------------------
1 | _CTC_DECODERS = [
2 |     "CTCHypothesis",
3 |     "CTCDecoder",
4 |     "CTCDecoderLM",
5 |     "CTCDecoderLMState",
6 |     "ctc_decoder",
7 |     "download_pretrained_files",
8 | ]
9 | _CUDA_CTC_DECODERS = [
10 |     "CUCTCDecoder",
11 |     "CUCTCHypothesis",
12 |     "cuda_ctc_decoder",
13 | ]
14 | 
15 | 
16 | def __getattr__(name: str):
17 |     if name in _CTC_DECODERS:
18 |         try:
19 |             from . import _ctc_decoder
20 |         except Exception as err:
21 |             raise RuntimeError(
22 |                 "CTC Decoder suite requires the flashlight-text package and optionally KenLM. Please install them."
23 |             ) from err
24 | 
25 |         item = getattr(_ctc_decoder, name)
26 |         globals()[name] = item
27 |         return item
28 |     elif name in _CUDA_CTC_DECODERS:
29 |         try:
30 |             from . import _cuda_ctc_decoder
31 |         except AttributeError as err:
32 |             raise RuntimeError(
33 |                 "To use the CUCTC decoder, please set BUILD_CUDA_CTC_DECODER=1 when building from source."
34 |             ) from err
35 | 
36 |         item = getattr(_cuda_ctc_decoder, name)
37 |         globals()[name] = item
38 |         return item
39 |     raise AttributeError(f"module {__name__} has no attribute {name}")
40 | 
41 | 
42 | def __dir__():
43 |     return sorted(__all__)
44 | 
45 | 
46 | __all__ = _CTC_DECODERS + _CUDA_CTC_DECODERS
47 | 
--------------------------------------------------------------------------------
/src/torchaudio/prototype/models/__init__.py:
--------------------------------------------------------------------------------
1 | from ._conformer_wav2vec2 import (
2 |     conformer_wav2vec2_base,
3 |     conformer_wav2vec2_model,
4 |     conformer_wav2vec2_pretrain_base,
5 |     conformer_wav2vec2_pretrain_large,
6 |     conformer_wav2vec2_pretrain_model,
7 |     ConformerWav2Vec2PretrainModel,
8 | )
9 | from ._emformer_hubert import emformer_hubert_base, emformer_hubert_model
10 | from .conv_emformer import ConvEmformer
11 | from .hifi_gan import hifigan_vocoder, hifigan_vocoder_v1, hifigan_vocoder_v2, hifigan_vocoder_v3, HiFiGANVocoder
12 | from .rnnt import conformer_rnnt_base, conformer_rnnt_biasing, conformer_rnnt_biasing_base, conformer_rnnt_model
13 | from .rnnt_decoder import Hypothesis, RNNTBeamSearchBiasing
14 | 
15 | __all__ = [
16 |     "conformer_rnnt_base",
17 |     "conformer_rnnt_model",
18 |     "conformer_rnnt_biasing",
19 |     "conformer_rnnt_biasing_base",
20 |     "ConvEmformer",
21 |     "conformer_wav2vec2_model",
22 |     "conformer_wav2vec2_base",
23 |     "conformer_wav2vec2_pretrain_model",
24 |     "conformer_wav2vec2_pretrain_base",
25 |     "conformer_wav2vec2_pretrain_large",
26 |     "ConformerWav2Vec2PretrainModel",
27 |     "emformer_hubert_base",
28 |     "emformer_hubert_model",
29 |     "Hypothesis",
30 |     "RNNTBeamSearchBiasing",
31 |     "HiFiGANVocoder",
32 |     "hifigan_vocoder_v1",
33 |     "hifigan_vocoder_v2",
34 |     "hifigan_vocoder_v3",
35 |     "hifigan_vocoder",
36 | ]
37 | 
--------------------------------------------------------------------------------
/src/libtorio/ffmpeg/stream_writer/packet_writer.cpp:
--------------------------------------------------------------------------------
1 | #include <libtorio/ffmpeg/stream_writer/packet_writer.h>
2 | 
3 | namespace torio::io {
4 | namespace {
5 | AVStream* add_stream(
6 |     AVFormatContext* format_ctx,
7 |     const StreamParams& stream_params) {
8 |   AVStream* stream = avformat_new_stream(format_ctx, nullptr);
9 |   int ret =
10 |       avcodec_parameters_copy(stream->codecpar, stream_params.codec_params);
11 |   TORCH_CHECK(
12 |       ret >= 0,
13 |       "Failed to copy the stream's codec parameters. 
(", 14 | av_err2string(ret), 15 | ")"); 16 | stream->time_base = stream_params.time_base; 17 | return stream; 18 | } 19 | } // namespace 20 | PacketWriter::PacketWriter( 21 | AVFormatContext* format_ctx_, 22 | const StreamParams& stream_params_) 23 | : format_ctx(format_ctx_), 24 | stream(add_stream(format_ctx_, stream_params_)), 25 | original_time_base(stream_params_.time_base) {} 26 | 27 | void PacketWriter::write_packet(const AVPacketPtr& packet) { 28 | AVPacket dst_packet; 29 | int ret = av_packet_ref(&dst_packet, packet); 30 | TORCH_CHECK(ret >= 0, "Failed to copy packet."); 31 | av_packet_rescale_ts(&dst_packet, original_time_base, stream->time_base); 32 | dst_packet.stream_index = stream->index; 33 | ret = av_interleaved_write_frame(format_ctx, &dst_packet); 34 | TORCH_CHECK(ret >= 0, "Failed to write packet to destination."); 35 | } 36 | } // namespace torio::io 37 | -------------------------------------------------------------------------------- /.github/scripts/unittest-windows/setup_env.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # This script is for setting up environment in which unit test is ran. 4 | # To speed up the CI time, the resulting environment is cached. 5 | # 6 | # Do not install PyTorch and torchaudio here, otherwise they also get cached. 7 | 8 | set -euxo pipefail 9 | 10 | this_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" 11 | root_dir="$(git rev-parse --show-toplevel)" 12 | conda_dir="${root_dir}/conda" 13 | env_dir="${root_dir}/env" 14 | 15 | cd "${root_dir}" 16 | 17 | # 1. Install conda at ./conda 18 | if [ ! -d "${conda_dir}" ]; then 19 | printf "* Installing conda\n" 20 | export tmp_conda="$(echo $conda_dir | tr '/' '\\')" 21 | export miniconda_exe="$(echo $root_dir | tr '/' '\\')\\miniconda.exe" 22 | curl --silent --output miniconda.exe https://repo.anaconda.com/miniconda/Miniconda3-latest-Windows-x86_64.exe -O 23 | "$this_dir/install_conda.bat" 24 | unset tmp_conda 25 | unset miniconda_exe 26 | fi 27 | eval "$("${conda_dir}/Scripts/conda.exe" 'shell.bash' 'hook')" 28 | 29 | # 2. Create test environment at ./env 30 | if [ ! -d "${env_dir}" ]; then 31 | printf "* Creating a test environment with PYTHON_VERSION=%s\n" "${PYTHON_VERSION}" 32 | conda create --prefix "${env_dir}" -y python="${PYTHON_VERSION}" 33 | fi 34 | conda activate "${env_dir}" 35 | 36 | # 3. Install minimal build tools 37 | pip --quiet install cmake ninja 38 | conda install --quiet -y 'ffmpeg>=4.1' 39 | -------------------------------------------------------------------------------- /test/torchaudio_unittest/common_utils/kaldi_utils.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | 3 | import torch 4 | 5 | 6 | def convert_args(**kwargs): 7 | args = [] 8 | for key, value in kwargs.items(): 9 | if key == "sample_rate": 10 | key = "sample_frequency" 11 | key = "--" + key.replace("_", "-") 12 | value = str(value).lower() if value in [True, False] else str(value) 13 | args.append("%s=%s" % (key, value)) 14 | return args 15 | 16 | 17 | def run_kaldi(command, input_type, input_value): 18 | """Run provided Kaldi command, pass a tensor and get the resulting tensor 19 | 20 | Args: 21 | command (list of str): The command with arguments 22 | input_type (str): 'ark' or 'scp' 23 | input_value (Tensor for 'ark', string for 'scp'): The input to pass. 24 | Must be a path to an audio file for 'scp'. 
25 | """ 26 | import kaldi_io 27 | 28 | key = "foo" 29 | process = subprocess.Popen(command, stdin=subprocess.PIPE, stdout=subprocess.PIPE) 30 | if input_type == "ark": 31 | kaldi_io.write_mat(process.stdin, input_value.cpu().numpy(), key=key) 32 | elif input_type == "scp": 33 | process.stdin.write(f"{key} {input_value}".encode("utf8")) 34 | else: 35 | raise NotImplementedError("Unexpected type") 36 | process.stdin.close() 37 | result = dict(kaldi_io.read_mat_ark(process.stdout))["foo"] 38 | return torch.from_numpy(result.copy()) # copy supresses some torch warning 39 | -------------------------------------------------------------------------------- /docs/source/_templates/autosummary/io_class.rst: -------------------------------------------------------------------------------- 1 | .. 2 | autogenerated from source/_templates/autosummary/io_class.rst 3 | 4 | {#- 5 | ################################################################################ 6 | # autosummary template for torchaudio.io module 7 | # Since StreamReader/StreamWriter have many methods/properties, 8 | # we want to list them up in the table of contents. 9 | # The default class template does not do this, so we use custom one here. 10 | ################################################################################ 11 | #} 12 | 13 | {{ name | underline }} 14 | 15 | .. autoclass:: {{ fullname }} 16 | 17 | {%- if name not in ["StreamReader", "StreamWriter"] %} 18 | 19 | {%- if attributes %} 20 | 21 | Properties 22 | ---------- 23 | 24 | {%- for item in attributes %} 25 | {%- if not item.startswith('_') and item not in inherited_members %} 26 | 27 | {{ item | underline("~") }} 28 | 29 | .. container:: py attribute 30 | 31 | .. autoproperty:: {{[fullname, item] | join('.')}} 32 | 33 | {%- endif %} 34 | {%- endfor %} 35 | {%- endif %} 36 | 37 | {%- if members %} 38 | 39 | Methods 40 | ------- 41 | 42 | {%- for item in members %} 43 | {%- if 44 | not item.startswith('_') 45 | and item not in inherited_members 46 | and item not in attributes 47 | %} 48 | 49 | {{ item | underline("~") }} 50 | 51 | .. container:: py attribute 52 | 53 | .. 
53 |    .. automethod:: {{[fullname, item] | join('.')}}
54 | 
55 | {%- endif %}
56 | {%- endfor %}
57 | {%- endif %}
58 | 
59 | {%- endif %}
60 | 
--------------------------------------------------------------------------------
/examples/libtorchaudio/speech_recognition/transcribe.cpp:
--------------------------------------------------------------------------------
1 | #include <torch/script.h>
2 | 
3 | int main(int argc, char* argv[]) {
4 |   if (argc != 3) {
5 |     std::cerr << "Usage: " << argv[0] << " <module_dir> <audio_file>"
6 |               << std::endl;
7 |     return -1;
8 |   }
9 | 
10 |   torch::jit::script::Module loader, encoder, decoder;
11 |   std::cout << "Loading module from: " << argv[1] << std::endl;
12 |   try {
13 |     loader = torch::jit::load(std::string(argv[1]) + "/loader.zip");
14 |   } catch (const c10::Error& error) {
15 |     std::cerr << "Failed to load the module:" << error.what() << std::endl;
16 |     return -1;
17 |   }
18 |   try {
19 |     encoder = torch::jit::load(std::string(argv[1]) + "/encoder.zip");
20 |   } catch (const c10::Error& error) {
21 |     std::cerr << "Failed to load the module:" << error.what() << std::endl;
22 |     return -1;
23 |   }
24 |   try {
25 |     decoder = torch::jit::load(std::string(argv[1]) + "/decoder.zip");
26 |   } catch (const c10::Error& error) {
27 |     std::cerr << "Failed to load the module:" << error.what() << std::endl;
28 |     return -1;
29 |   }
30 | 
31 |   std::cout << "Loading the audio" << std::endl;
32 |   auto waveform = loader.forward({c10::IValue(argv[2])});
33 |   std::cout << "Running inference" << std::endl;
34 |   auto emission = encoder.forward({waveform});
35 |   std::cout << "Generating the transcription" << std::endl;
36 |   auto result = decoder.forward({emission});
37 |   std::cout << result.toStringRef() << std::endl;
38 |   std::cout << "Done." << std::endl;
39 | }
40 | 
--------------------------------------------------------------------------------
/src/libtorchaudio/sox/pybind/pybind.cpp:
--------------------------------------------------------------------------------
1 | #include <libtorchaudio/sox/effects.h>
2 | #include <libtorchaudio/sox/io.h>
3 | #include <libtorchaudio/sox/utils.h>
4 | #include <torch/extension.h>
5 | 
6 | namespace torchaudio {
7 | namespace sox {
8 | namespace {
9 | 
10 | TORCH_LIBRARY(torchaudio_sox, m) {
11 |   m.def("torchaudio_sox::get_info", &get_info_file);
12 |   m.def("torchaudio_sox::load_audio_file", &load_audio_file);
13 |   m.def("torchaudio_sox::save_audio_file", &save_audio_file);
14 |   m.def("torchaudio_sox::initialize_sox_effects", &initialize_sox_effects);
15 |   m.def("torchaudio_sox::shutdown_sox_effects", &shutdown_sox_effects);
16 |   m.def("torchaudio_sox::apply_effects_tensor", &apply_effects_tensor);
17 |   m.def("torchaudio_sox::apply_effects_file", &apply_effects_file);
18 | }
19 | 
20 | PYBIND11_MODULE(_torchaudio_sox, m) {
21 |   m.def("set_seed", &set_seed, "Set random seed.");
22 |   m.def("set_verbosity", &set_verbosity, "Set verbosity.");
23 |   m.def("set_use_threads", &set_use_threads, "Set threading.");
24 |   m.def("set_buffer_size", &set_buffer_size, "Set buffer size.");
25 |   m.def("get_buffer_size", &get_buffer_size, "Get buffer size.");
26 |   m.def("list_effects", &list_effects, "List available effects.");
27 |   m.def(
28 |       "list_read_formats",
29 |       &list_read_formats,
30 |       "List supported formats for decoding.");
31 |   m.def(
32 |       "list_write_formats",
33 |       &list_write_formats,
34 |       "List supported formats for encoding.");
35 | }
36 | 
37 | } // namespace
38 | } // namespace sox
39 | } // namespace torchaudio
40 | 
--------------------------------------------------------------------------------
/tools/travis/test_script.sh:
-------------------------------------------------------------------------------- /tools/travis/test_script.sh: --------------------------------------------------------------------------------
#!/bin/bash
# This script is meant to be called by the "script" step defined in
# .travis.yml. See http://docs.travis-ci.com/ for more details.
# The behavior of the script is controlled by environment variables defined
# in the .travis.yml in the top level folder of the project.
set -e

python --version
python -c 'import torch;print("torch:", torch.__version__)'

run_tests() {
    # find all the test files that match "test*.py"
    TEST_FILES="$(find test -type f -name "test*.py" | sort)"
    echo "Test files are:"
    echo "$TEST_FILES"

    echo "Executing tests:"
    EXIT_STATUS=0
    for FILE in $TEST_FILES; do
        # Run each file in a separate process. The `|| STATUS=$?` keeps `set -e`
        # from aborting on the first failure, so we keep going and return the
        # accumulated exit status at the end.
        STATUS=0
        python -m pytest -v "$FILE" || STATUS=$?
        EXIT_STATUS="$((EXIT_STATUS + STATUS))"
    done

    echo "Done, exit status: $EXIT_STATUS"
    exit $EXIT_STATUS
}

if [[ "$RUN_FLAKE8" == "true" ]]; then
    flake8
fi

if [[ "$SKIP_TESTS" != "true" ]]; then
    echo "run_tests"
    run_tests
fi

if [[ "$RUN_EXAMPLE_TESTS" == "true" ]]; then
    echo "run_example_tests"
    pushd examples
    ASR_MODEL_PATH=$HOME/download/data/model.pt \
    ASR_INPUT_FILE=interactive_asr/data/sample.wav \
    ASR_DATA_PATH=$HOME/download/data \
    ASR_USER_DIR=$HOME/download/fairseq/examples/speech_recognition \
    python -m unittest test/test_interactive_asr.py
    popd
fi
-------------------------------------------------------------------------------- /test/torchaudio_unittest/example/souce_sepration/metrics_test.py: --------------------------------------------------------------------------------
from itertools import product

import torch
from parameterized import parameterized
from source_separation.utils import metrics
from torch.testing._internal.common_utils import TestCase

from . import sdr_reference


class TestSDR(TestCase):
    @parameterized.expand([(1,), (2,), (32,)])
    def test_sdr(self, batch_size):
        """sdr produces the same result as the reference implementation"""
        num_frames = 256

        estimation = torch.rand(batch_size, num_frames)
        origin = torch.rand(batch_size, num_frames)

        sdr_ref = sdr_reference.calc_sdr_torch(estimation, origin)
        sdr = metrics.sdr(estimation.unsqueeze(1), origin.unsqueeze(1)).squeeze(1)

        self.assertEqual(sdr, sdr_ref)

    @parameterized.expand(list(product([1, 2, 32], [2, 3, 4, 5])))
    def test_sdr_pit(self, batch_size, num_sources):
        """sdr_pit produces the same result as the reference implementation"""
        num_frames = 256

        estimation = torch.randn(batch_size, num_sources, num_frames)
        origin = torch.randn(batch_size, num_sources, num_frames)

        estimation -= estimation.mean(axis=2, keepdim=True)
        origin -= origin.mean(axis=2, keepdim=True)

        batch_sdr_ref = sdr_reference.batch_SDR_torch(estimation, origin)
        batch_sdr = metrics.sdr_pit(estimation, origin)

        self.assertEqual(batch_sdr, batch_sdr_ref)
-------------------------------------------------------------------------------- /examples/pipeline_wav2letter/utils.py: --------------------------------------------------------------------------------
import json
import logging
import os
import shutil
from collections import defaultdict

import torch


class MetricLogger(defaultdict):
    def __init__(self, name, print_freq=1, disable=False):
        super().__init__(lambda: 0.0)
        self.disable = disable
        self.print_freq = print_freq
        self._iter = 0
        self["name"] = name

    def __str__(self):
        return json.dumps(self)

    def __call__(self):
        self._iter = (self._iter + 1) % self.print_freq
        if not self.disable and not self._iter:
            print(self, flush=True)


def save_checkpoint(state, is_best, filename, disable):
    """Save the model to a temporary file first, then copy it to filename,
    in case a signal interrupts the torch.save() process.
    """
    if disable:
        return

    if filename == "":
        return

    tempfile = filename + ".temp"

    # Remove tempfile in case of an interruption during the copy from tempfile to filename
    if os.path.isfile(tempfile):
        os.remove(tempfile)

    torch.save(state, tempfile)
    if os.path.isfile(tempfile):
        os.rename(tempfile, filename)
    if is_best:
        shutil.copyfile(filename, "model_best.pth.tar")
    logging.warning("Checkpoint: saved")


def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)
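
# A brief sketch of how the two helpers above compose in a training loop;
# the metric values and checkpoint payload are stand-ins, not real training state.
#
#     logger = MetricLogger("train_iteration", print_freq=10)
#
#     for step in range(100):
#         logger["iteration"] = step
#         logger["loss"] = 0.0  # stand-in for a real loss value
#         logger()  # prints the JSON-serialized metrics every `print_freq` calls
#
#     # Written to "checkpoint.pth.tar.temp" first, then renamed, so an
#     # interrupted torch.save() cannot clobber the previous checkpoint.
#     save_checkpoint({"step": 99}, is_best=False, filename="checkpoint.pth.tar", disable=False)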
-------------------------------------------------------------------------------- /test/torchaudio_unittest/kaldi_io_test.py: --------------------------------------------------------------------------------
import torch
import torchaudio.kaldi_io as kio
from torchaudio_unittest import common_utils


class Test_KaldiIO(common_utils.TorchaudioTestCase):
    data1 = [[1, 2, 3], [11, 12, 13], [21, 22, 23]]
    data2 = [[31, 32, 33], [41, 42, 43], [51, 52, 53]]

    def _test_helper(self, file_name, expected_data, fn, expected_dtype):
        """Takes a file_name to the input data and a function fn to extract the
        data. It compares the extracted data to the expected_data. The expected_dtype
        will be used to check that the extracted data is of the right type.
        """
        test_filepath = common_utils.get_asset_path(file_name)
        expected_output = {
            "key" + str(idx + 1): torch.tensor(val, dtype=expected_dtype) for idx, val in enumerate(expected_data)
        }

        for key, vec in fn(test_filepath):
            self.assertTrue(key in expected_output)
            self.assertTrue(isinstance(vec, torch.Tensor))
            self.assertEqual(vec.dtype, expected_dtype)
            self.assertTrue(torch.all(torch.eq(vec, expected_output[key])))

    def test_read_vec_int_ark(self):
        self._test_helper("vec_int.ark", self.data1, kio.read_vec_int_ark, torch.int32)

    def test_read_vec_flt_ark(self):
        self._test_helper("vec_flt.ark", self.data1, kio.read_vec_flt_ark, torch.float32)

    def test_read_mat_ark(self):
        self._test_helper("mat.ark", [self.data1, self.data2], kio.read_mat_ark, torch.float32)
-------------------------------------------------------------------------------- /src/torchaudio/transforms/__init__.py: --------------------------------------------------------------------------------
from ._multi_channel import MVDR, PSD, RTFMVDR, SoudenMVDR
from ._transforms import (
    AddNoise,
    AmplitudeToDB,
    ComputeDeltas,
    Convolve,
    Deemphasis,
    Fade,
    FFTConvolve,
    FrequencyMasking,
    GriffinLim,
    InverseMelScale,
    InverseSpectrogram,
    LFCC,
    Loudness,
    MelScale,
    MelSpectrogram,
    MFCC,
    MuLawDecoding,
    MuLawEncoding,
    PitchShift,
    Preemphasis,
    Resample,
    RNNTLoss,
    SlidingWindowCmn,
    SpecAugment,
    SpectralCentroid,
    Spectrogram,
    Speed,
    SpeedPerturbation,
    TimeMasking,
    TimeStretch,
    Vad,
    Vol,
)


__all__ = [
    "AddNoise",
    "AmplitudeToDB",
    "ComputeDeltas",
    "Convolve",
    "Deemphasis",
    "Fade",
    "FFTConvolve",
    "FrequencyMasking",
    "GriffinLim",
    "InverseMelScale",
    "InverseSpectrogram",
    "LFCC",
    "Loudness",
    "MFCC",
    "MVDR",
    "MelScale",
    "MelSpectrogram",
    "MuLawDecoding",
    "MuLawEncoding",
    "PSD",
    "PitchShift",
    "Preemphasis",
    "RNNTLoss",
    "RTFMVDR",
    "Resample",
    "SlidingWindowCmn",
    "SoudenMVDR",
    "SpecAugment",
    "SpectralCentroid",
    "Spectrogram",
    "Speed",
    "SpeedPerturbation",
    "TimeMasking",
    "TimeStretch",
    "Vad",
    "Vol",
]
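
# For reference, the transforms re-exported above are composable torch.nn.Module
# objects. A minimal example with MelSpectrogram and FrequencyMasking; parameter
# values here are chosen arbitrarily for illustration.
#
#     import torch
#     import torchaudio.transforms as T
#
#     waveform = torch.randn(1, 16000)  # one second of synthetic audio at 16 kHz
#
#     mel_spectrogram = T.MelSpectrogram(sample_rate=16000, n_mels=80)
#     masking = T.FrequencyMasking(freq_mask_param=15)
#
#     mel = mel_spectrogram(waveform)  # shape: (channel, n_mels, time)
#     augmented = masking(mel)         # SpecAugment-style frequency masking
#     print(mel.shape, augmented.shape)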
"latent_vars": 320, 31 | "layer_norm_first": true, 32 | "logit_temp": 0.1, 33 | "mask_channel_length": 10, 34 | "mask_channel_min_space": 1, 35 | "mask_channel_other": 0.0, 36 | "mask_channel_prob": 0.0, 37 | "mask_channel_selection": "static", 38 | "mask_length": 10, 39 | "mask_min_space": 1, 40 | "mask_other": 0.0, 41 | "mask_prob": 0.65, 42 | "mask_selection": "static", 43 | "negatives_from_everywhere": false, 44 | "no_mask_channel_overlap": false, 45 | "no_mask_overlap": false, 46 | "num_negatives": 100, 47 | "quantize_input": false, 48 | "quantize_targets": true, 49 | "same_quantizer": false, 50 | "target_glu": false 51 | } 52 | --------------------------------------------------------------------------------