├── demo ├── __init__.py ├── bus.opus ├── ir │ ├── misc_file.txt │ ├── rir48000.wav │ ├── impulse_response_0.wav │ └── stereo_impulse_response.wav ├── ms_adpcm.wav ├── p286_011.wav ├── testing.m4a ├── mono_int24.wav ├── mono_int32.wav ├── mono_float64.wav ├── signed_24bit.wav ├── stereo_16bit.wav ├── stereo_24bit.WAV ├── perfect-alley1.ogg ├── silence │ └── silence.wav ├── acoustic_guitar_0.wav ├── background_noises │ ├── hens.ogg │ └── sheep.ogg ├── short_noises │ ├── friction0.wav │ ├── friction1.wav │ ├── 130921_iPhone_rub_channel0_chunk83_aug2.wav │ ├── 130921_laptopmic-dell_tap_channel0_chunk200_aug0.wav │ └── 130921_laptopmic-dell_tap_channel0_chunk204_aug3.wav ├── almost_silent │ └── almost_silent.wav ├── digital_silence │ └── digital_silence.wav └── p286_011_license ├── tests ├── __init__.py ├── test_transforms_interface.py ├── test_trim.py ├── test_equalizer.py ├── test_gaussian_noise.py ├── test_resample.py ├── test_clip.py ├── test_reverse.py ├── utils.py ├── test_lambda.py ├── test_polarity_inversion.py ├── test_aliasing.py ├── test_tanh_distortion.py ├── test_clipping_distortion.py ├── test_time_stretch.py ├── test_bit_crush.py ├── test_air_absorption.py ├── test_gain.py ├── test_gaussian_snr.py ├── test_spec_compose.py ├── test_adjust_duration.py ├── test_pitch_shift.py ├── test_loudness_normalization.py ├── test_limiter.py ├── test_gain_transition.py ├── test_compose.py ├── test_normalize.py ├── test_time_mask.py ├── test_apply_impulse_response.py └── test_post_gain.py ├── audiomentations ├── core │ ├── __init__.py │ └── audio_loading_utils.py ├── augmentations │ ├── __init__.py │ ├── trim.py │ ├── lambda_transform.py │ ├── reverse.py │ ├── polarity_inversion.py │ ├── add_gaussian_noise.py │ ├── normalize.py │ ├── resample.py │ ├── clip.py │ ├── bit_crush.py │ ├── low_pass_filter.py │ ├── high_pass_filter.py │ ├── clipping_distortion.py │ ├── band_pass_filter.py │ ├── aliasing.py │ ├── band_stop_filter.py │ ├── tanh_distortion.py │ ├── pitch_shift.py │ ├── time_mask.py │ ├── time_stretch.py │ ├── gain.py │ ├── padding.py │ ├── add_gaussian_snr.py │ └── adjust_duration.py ├── spec_augmentations │ ├── __init__.py │ ├── spec_channel_shuffle.py │ └── spec_frequency_mask.py └── __init__.py ├── .codecov.yml ├── docs ├── requirements.txt ├── google874768f12a0e923e.html ├── waveform_transforms │ ├── Trim.webp │ ├── Limiter.webp │ ├── Reverse.webp │ ├── Aliasing.webp │ ├── BitCrush.webp │ ├── PitchShift.webp │ ├── RepeatPart.webp │ ├── TimeMask.webp │ ├── Trim_input.flac │ ├── TimeStretch.webp │ ├── AddGaussianSNR.webp │ ├── AddShortNoises.webp │ ├── AdjustDuration.webp │ ├── AirAbsorption.webp │ ├── Aliasing_input.flac │ ├── BandPassFilter.webp │ ├── BandStopFilter.webp │ ├── BitCrush_input.flac │ ├── Limiter_input.flac │ ├── Reverse_input.flac │ ├── TanhDistortion.webp │ ├── TimeMask_input.flac │ ├── AddGaussianNoise.webp │ ├── PitchShift_input.flac │ ├── RepeatPart_input.flac │ ├── TimeStretch_input.flac │ ├── Trim_transformed.flac │ ├── AddBackgroundNoise.webp │ ├── AddGaussianSNR_input.flac │ ├── AddShortNoises_input.flac │ ├── AdjustDuration_input.flac │ ├── AirAbsorption_input.flac │ ├── Aliasing_transformed.flac │ ├── ApplyImpulseResponse.webp │ ├── BandPassFilter_input.flac │ ├── BandStopFilter_input.flac │ ├── BitCrush_transformed.flac │ ├── Limiter_transformed.flac │ ├── Reverse_transformed.flac │ ├── TanhDistortion_input.flac │ ├── TimeMask_transformed.flac │ ├── AddGaussianNoise_input.flac │ ├── PitchShift_transformed.flac │ ├── RepeatPart_transformed.flac │ ├── 
AddBackgroundNoise_input.flac │ ├── AirAbsorption_transformed.flac │ ├── TimeStretch_transformed.flac │ ├── AddGaussianSNR_transformed.flac │ ├── AddShortNoises_transformed.flac │ ├── AdjustDuration_transformed.flac │ ├── ApplyImpulseResponse_input.flac │ ├── BandPassFilter_transformed.flac │ ├── BandStopFilter_transformed.flac │ ├── TanhDistortion_transformed.flac │ ├── AddBackgroundNoise_transformed.flac │ ├── AddGaussianNoise_transformed.flac │ ├── ApplyImpulseResponse_transformed.flac │ ├── normalize.md │ ├── resample.md │ ├── polarity_inversion.md │ ├── clipping_distortion.md │ ├── padding.md │ ├── clip.md │ ├── gain.md │ ├── lambda.md │ ├── peaking_filter.md │ ├── post_gain.md │ ├── trim.md │ ├── reverse.md │ ├── seven_band_parametric_eq.md │ ├── add_gaussian_noise.md │ ├── mp3_compression.md │ ├── low_pass_filter.md │ ├── high_pass_filter.md │ ├── low_shelf_filter.md │ ├── high_shelf_filter.md │ ├── loudness_normalization.md │ ├── bit_crush.md │ ├── time_mask.md │ ├── gain_transition.md │ ├── add_color_noise.md │ ├── tanh_distortion.md │ ├── pitch_shift.md │ ├── aliasing.md │ ├── time_stretch.md │ ├── adjust_duration.md │ └── add_gaussian_snr.md ├── spectrogram_transforms.md └── guides │ ├── transform_parameters.md │ ├── multichannel_audio_array_shapes.md │ └── cpu_vs_gpu.md ├── .coveragerc ├── .editorconfig ├── pytest.ini ├── requirements.txt ├── .github └── workflows │ └── main.yml ├── packaging.md ├── LICENSE ├── .circleci └── config.yml ├── .gitignore └── setup.py /demo/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /audiomentations/core/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.codecov.yml: -------------------------------------------------------------------------------- 1 | # Use codecov defaults 2 | -------------------------------------------------------------------------------- /audiomentations/augmentations/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /audiomentations/spec_augmentations/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | mkdocs==1.5.2 2 | mkdocs-material==9.1.21 3 | -------------------------------------------------------------------------------- /demo/bus.opus: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wetdog/audiomentations/main/demo/bus.opus -------------------------------------------------------------------------------- /docs/google874768f12a0e923e.html: -------------------------------------------------------------------------------- 1 | google-site-verification: google874768f12a0e923e.html -------------------------------------------------------------------------------- /demo/ir/misc_file.txt: -------------------------------------------------------------------------------- 
1 | This file serves as an example of a file that is not an audio file 2 | -------------------------------------------------------------------------------- /demo/ms_adpcm.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wetdog/audiomentations/main/demo/ms_adpcm.wav -------------------------------------------------------------------------------- /demo/p286_011.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wetdog/audiomentations/main/demo/p286_011.wav -------------------------------------------------------------------------------- /demo/testing.m4a: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wetdog/audiomentations/main/demo/testing.m4a -------------------------------------------------------------------------------- /demo/ir/rir48000.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wetdog/audiomentations/main/demo/ir/rir48000.wav -------------------------------------------------------------------------------- /demo/mono_int24.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wetdog/audiomentations/main/demo/mono_int24.wav -------------------------------------------------------------------------------- /demo/mono_int32.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wetdog/audiomentations/main/demo/mono_int32.wav -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | [report] 2 | exclude_lines = 3 | pragma: no cover 4 | raise NotImplementedError 5 | -------------------------------------------------------------------------------- /demo/mono_float64.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wetdog/audiomentations/main/demo/mono_float64.wav -------------------------------------------------------------------------------- /demo/signed_24bit.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wetdog/audiomentations/main/demo/signed_24bit.wav -------------------------------------------------------------------------------- /demo/stereo_16bit.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wetdog/audiomentations/main/demo/stereo_16bit.wav -------------------------------------------------------------------------------- /demo/stereo_24bit.WAV: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wetdog/audiomentations/main/demo/stereo_24bit.WAV -------------------------------------------------------------------------------- /demo/perfect-alley1.ogg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wetdog/audiomentations/main/demo/perfect-alley1.ogg -------------------------------------------------------------------------------- /demo/silence/silence.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wetdog/audiomentations/main/demo/silence/silence.wav 
-------------------------------------------------------------------------------- /demo/acoustic_guitar_0.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wetdog/audiomentations/main/demo/acoustic_guitar_0.wav -------------------------------------------------------------------------------- /demo/ir/impulse_response_0.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wetdog/audiomentations/main/demo/ir/impulse_response_0.wav -------------------------------------------------------------------------------- /demo/background_noises/hens.ogg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wetdog/audiomentations/main/demo/background_noises/hens.ogg -------------------------------------------------------------------------------- /demo/background_noises/sheep.ogg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wetdog/audiomentations/main/demo/background_noises/sheep.ogg -------------------------------------------------------------------------------- /demo/short_noises/friction0.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wetdog/audiomentations/main/demo/short_noises/friction0.wav -------------------------------------------------------------------------------- /demo/short_noises/friction1.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wetdog/audiomentations/main/demo/short_noises/friction1.wav -------------------------------------------------------------------------------- /demo/ir/stereo_impulse_response.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wetdog/audiomentations/main/demo/ir/stereo_impulse_response.wav -------------------------------------------------------------------------------- /docs/waveform_transforms/Trim.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wetdog/audiomentations/main/docs/waveform_transforms/Trim.webp -------------------------------------------------------------------------------- /demo/almost_silent/almost_silent.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wetdog/audiomentations/main/demo/almost_silent/almost_silent.wav -------------------------------------------------------------------------------- /docs/waveform_transforms/Limiter.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wetdog/audiomentations/main/docs/waveform_transforms/Limiter.webp -------------------------------------------------------------------------------- /docs/waveform_transforms/Reverse.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wetdog/audiomentations/main/docs/waveform_transforms/Reverse.webp -------------------------------------------------------------------------------- /demo/digital_silence/digital_silence.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wetdog/audiomentations/main/demo/digital_silence/digital_silence.wav 
-------------------------------------------------------------------------------- /docs/waveform_transforms/Aliasing.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wetdog/audiomentations/main/docs/waveform_transforms/Aliasing.webp -------------------------------------------------------------------------------- /docs/waveform_transforms/BitCrush.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wetdog/audiomentations/main/docs/waveform_transforms/BitCrush.webp -------------------------------------------------------------------------------- /docs/waveform_transforms/PitchShift.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wetdog/audiomentations/main/docs/waveform_transforms/PitchShift.webp -------------------------------------------------------------------------------- /docs/waveform_transforms/RepeatPart.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wetdog/audiomentations/main/docs/waveform_transforms/RepeatPart.webp -------------------------------------------------------------------------------- /docs/waveform_transforms/TimeMask.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wetdog/audiomentations/main/docs/waveform_transforms/TimeMask.webp -------------------------------------------------------------------------------- /docs/waveform_transforms/Trim_input.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wetdog/audiomentations/main/docs/waveform_transforms/Trim_input.flac -------------------------------------------------------------------------------- /docs/waveform_transforms/TimeStretch.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wetdog/audiomentations/main/docs/waveform_transforms/TimeStretch.webp -------------------------------------------------------------------------------- /docs/waveform_transforms/AddGaussianSNR.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wetdog/audiomentations/main/docs/waveform_transforms/AddGaussianSNR.webp -------------------------------------------------------------------------------- /docs/waveform_transforms/AddShortNoises.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wetdog/audiomentations/main/docs/waveform_transforms/AddShortNoises.webp -------------------------------------------------------------------------------- /docs/waveform_transforms/AdjustDuration.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wetdog/audiomentations/main/docs/waveform_transforms/AdjustDuration.webp -------------------------------------------------------------------------------- /docs/waveform_transforms/AirAbsorption.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wetdog/audiomentations/main/docs/waveform_transforms/AirAbsorption.webp -------------------------------------------------------------------------------- /docs/waveform_transforms/Aliasing_input.flac: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/wetdog/audiomentations/main/docs/waveform_transforms/Aliasing_input.flac -------------------------------------------------------------------------------- /docs/waveform_transforms/BandPassFilter.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wetdog/audiomentations/main/docs/waveform_transforms/BandPassFilter.webp -------------------------------------------------------------------------------- /docs/waveform_transforms/BandStopFilter.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wetdog/audiomentations/main/docs/waveform_transforms/BandStopFilter.webp -------------------------------------------------------------------------------- /docs/waveform_transforms/BitCrush_input.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wetdog/audiomentations/main/docs/waveform_transforms/BitCrush_input.flac -------------------------------------------------------------------------------- /docs/waveform_transforms/Limiter_input.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wetdog/audiomentations/main/docs/waveform_transforms/Limiter_input.flac -------------------------------------------------------------------------------- /docs/waveform_transforms/Reverse_input.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wetdog/audiomentations/main/docs/waveform_transforms/Reverse_input.flac -------------------------------------------------------------------------------- /docs/waveform_transforms/TanhDistortion.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wetdog/audiomentations/main/docs/waveform_transforms/TanhDistortion.webp -------------------------------------------------------------------------------- /docs/waveform_transforms/TimeMask_input.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wetdog/audiomentations/main/docs/waveform_transforms/TimeMask_input.flac -------------------------------------------------------------------------------- /docs/waveform_transforms/AddGaussianNoise.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wetdog/audiomentations/main/docs/waveform_transforms/AddGaussianNoise.webp -------------------------------------------------------------------------------- /docs/waveform_transforms/PitchShift_input.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wetdog/audiomentations/main/docs/waveform_transforms/PitchShift_input.flac -------------------------------------------------------------------------------- /docs/waveform_transforms/RepeatPart_input.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wetdog/audiomentations/main/docs/waveform_transforms/RepeatPart_input.flac -------------------------------------------------------------------------------- /docs/waveform_transforms/TimeStretch_input.flac: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/wetdog/audiomentations/main/docs/waveform_transforms/TimeStretch_input.flac -------------------------------------------------------------------------------- /docs/waveform_transforms/Trim_transformed.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wetdog/audiomentations/main/docs/waveform_transforms/Trim_transformed.flac -------------------------------------------------------------------------------- /docs/waveform_transforms/AddBackgroundNoise.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wetdog/audiomentations/main/docs/waveform_transforms/AddBackgroundNoise.webp -------------------------------------------------------------------------------- /docs/waveform_transforms/AddGaussianSNR_input.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wetdog/audiomentations/main/docs/waveform_transforms/AddGaussianSNR_input.flac -------------------------------------------------------------------------------- /docs/waveform_transforms/AddShortNoises_input.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wetdog/audiomentations/main/docs/waveform_transforms/AddShortNoises_input.flac -------------------------------------------------------------------------------- /docs/waveform_transforms/AdjustDuration_input.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wetdog/audiomentations/main/docs/waveform_transforms/AdjustDuration_input.flac -------------------------------------------------------------------------------- /docs/waveform_transforms/AirAbsorption_input.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wetdog/audiomentations/main/docs/waveform_transforms/AirAbsorption_input.flac -------------------------------------------------------------------------------- /docs/waveform_transforms/Aliasing_transformed.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wetdog/audiomentations/main/docs/waveform_transforms/Aliasing_transformed.flac -------------------------------------------------------------------------------- /docs/waveform_transforms/ApplyImpulseResponse.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wetdog/audiomentations/main/docs/waveform_transforms/ApplyImpulseResponse.webp -------------------------------------------------------------------------------- /docs/waveform_transforms/BandPassFilter_input.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wetdog/audiomentations/main/docs/waveform_transforms/BandPassFilter_input.flac -------------------------------------------------------------------------------- /docs/waveform_transforms/BandStopFilter_input.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wetdog/audiomentations/main/docs/waveform_transforms/BandStopFilter_input.flac -------------------------------------------------------------------------------- 
/docs/waveform_transforms/BitCrush_transformed.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wetdog/audiomentations/main/docs/waveform_transforms/BitCrush_transformed.flac -------------------------------------------------------------------------------- /docs/waveform_transforms/Limiter_transformed.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wetdog/audiomentations/main/docs/waveform_transforms/Limiter_transformed.flac -------------------------------------------------------------------------------- /docs/waveform_transforms/Reverse_transformed.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wetdog/audiomentations/main/docs/waveform_transforms/Reverse_transformed.flac -------------------------------------------------------------------------------- /docs/waveform_transforms/TanhDistortion_input.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wetdog/audiomentations/main/docs/waveform_transforms/TanhDistortion_input.flac -------------------------------------------------------------------------------- /docs/waveform_transforms/TimeMask_transformed.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wetdog/audiomentations/main/docs/waveform_transforms/TimeMask_transformed.flac -------------------------------------------------------------------------------- /docs/waveform_transforms/AddGaussianNoise_input.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wetdog/audiomentations/main/docs/waveform_transforms/AddGaussianNoise_input.flac -------------------------------------------------------------------------------- /docs/waveform_transforms/PitchShift_transformed.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wetdog/audiomentations/main/docs/waveform_transforms/PitchShift_transformed.flac -------------------------------------------------------------------------------- /docs/waveform_transforms/RepeatPart_transformed.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wetdog/audiomentations/main/docs/waveform_transforms/RepeatPart_transformed.flac -------------------------------------------------------------------------------- /docs/waveform_transforms/AddBackgroundNoise_input.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wetdog/audiomentations/main/docs/waveform_transforms/AddBackgroundNoise_input.flac -------------------------------------------------------------------------------- /docs/waveform_transforms/AirAbsorption_transformed.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wetdog/audiomentations/main/docs/waveform_transforms/AirAbsorption_transformed.flac -------------------------------------------------------------------------------- /docs/waveform_transforms/TimeStretch_transformed.flac: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/wetdog/audiomentations/main/docs/waveform_transforms/TimeStretch_transformed.flac -------------------------------------------------------------------------------- /docs/waveform_transforms/AddGaussianSNR_transformed.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wetdog/audiomentations/main/docs/waveform_transforms/AddGaussianSNR_transformed.flac -------------------------------------------------------------------------------- /docs/waveform_transforms/AddShortNoises_transformed.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wetdog/audiomentations/main/docs/waveform_transforms/AddShortNoises_transformed.flac -------------------------------------------------------------------------------- /docs/waveform_transforms/AdjustDuration_transformed.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wetdog/audiomentations/main/docs/waveform_transforms/AdjustDuration_transformed.flac -------------------------------------------------------------------------------- /docs/waveform_transforms/ApplyImpulseResponse_input.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wetdog/audiomentations/main/docs/waveform_transforms/ApplyImpulseResponse_input.flac -------------------------------------------------------------------------------- /docs/waveform_transforms/BandPassFilter_transformed.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wetdog/audiomentations/main/docs/waveform_transforms/BandPassFilter_transformed.flac -------------------------------------------------------------------------------- /docs/waveform_transforms/BandStopFilter_transformed.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wetdog/audiomentations/main/docs/waveform_transforms/BandStopFilter_transformed.flac -------------------------------------------------------------------------------- /docs/waveform_transforms/TanhDistortion_transformed.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wetdog/audiomentations/main/docs/waveform_transforms/TanhDistortion_transformed.flac -------------------------------------------------------------------------------- /docs/waveform_transforms/AddBackgroundNoise_transformed.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wetdog/audiomentations/main/docs/waveform_transforms/AddBackgroundNoise_transformed.flac -------------------------------------------------------------------------------- /docs/waveform_transforms/AddGaussianNoise_transformed.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wetdog/audiomentations/main/docs/waveform_transforms/AddGaussianNoise_transformed.flac -------------------------------------------------------------------------------- /demo/short_noises/130921_iPhone_rub_channel0_chunk83_aug2.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wetdog/audiomentations/main/demo/short_noises/130921_iPhone_rub_channel0_chunk83_aug2.wav 
-------------------------------------------------------------------------------- /docs/waveform_transforms/ApplyImpulseResponse_transformed.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wetdog/audiomentations/main/docs/waveform_transforms/ApplyImpulseResponse_transformed.flac -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | root = true 2 | 3 | [*] 4 | end_of_line = lf 5 | insert_final_newline = true 6 | 7 | [*.py] 8 | charset = utf-8 9 | indent_style = space 10 | indent_size = 4 11 | -------------------------------------------------------------------------------- /demo/short_noises/130921_laptopmic-dell_tap_channel0_chunk200_aug0.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wetdog/audiomentations/main/demo/short_noises/130921_laptopmic-dell_tap_channel0_chunk200_aug0.wav -------------------------------------------------------------------------------- /demo/short_noises/130921_laptopmic-dell_tap_channel0_chunk204_aug3.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wetdog/audiomentations/main/demo/short_noises/130921_laptopmic-dell_tap_channel0_chunk204_aug3.wav -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | python_files=test*.py 3 | norecursedirs = .circleci .git .github .idea audiomentations.egg-info build dist docs site 4 | addopts=--cov audiomentations --cov-report=xml 5 | -------------------------------------------------------------------------------- /demo/p286_011_license: -------------------------------------------------------------------------------- 1 | p286_011.wav comes from "Noisy speech database for training speech enhancement algorithms and TTS models", published by University of Edinburgh. School of Informatics. Centre for Speech Technology Research (CSTR). The license is Creative Commons License: Attribution 4.0 International. 
For more info, including a link to the original license file, see https://datashare.ed.ac.uk/handle/10283/2791 2 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | audioread==2.1.9 2 | black 3 | coverage==7.4.4 4 | cylimiter==0.3.0 5 | fast-align-audio==0.3.0 6 | lameenc==1.4.2 7 | librosa==0.10.0.post2 8 | matplotlib>=3.0.0,<4 9 | numba==0.57.0 10 | numpy==1.23.0 11 | numpy-minmax>=0.3.0,<1 12 | numpy-rms>=0.4.2,<1 13 | pydub==0.23.1 14 | pyloudnorm==0.1.0 15 | pyroomacoustics==0.7.3 16 | pytest==7.4.4 17 | pytest-cov==5.0.0 18 | scipy>=1.4,<1.13 19 | soxr==0.3.5 20 | tqdm==4.66.3 21 | twine 22 | -------------------------------------------------------------------------------- /tests/test_transforms_interface.py: -------------------------------------------------------------------------------- 1 | from audiomentations import Normalize 2 | 3 | 4 | class TestTransformsInterface: 5 | def test_freeze_and_unfreeze_parameters(self): 6 | normalizer = Normalize(p=1.0) 7 | 8 | assert normalizer.are_parameters_frozen == False 9 | 10 | normalizer.freeze_parameters() 11 | assert normalizer.are_parameters_frozen == True 12 | 13 | normalizer.unfreeze_parameters() 14 | assert normalizer.are_parameters_frozen == False 15 | -------------------------------------------------------------------------------- /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | name: Publish docs via GitHub Pages 2 | on: 3 | push: 4 | branches: 5 | - ij/docs 6 | tags: 7 | - "**" 8 | jobs: 9 | build: 10 | name: Deploy docs 11 | runs-on: ubuntu-latest 12 | steps: 13 | - name: Checkout master 14 | uses: actions/checkout@v1 15 | 16 | - name: Deploy docs 17 | uses: mhausenblas/mkdocs-deploy-gh-pages@master 18 | env: 19 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 20 | REQUIREMENTS: docs/requirements.txt 21 | -------------------------------------------------------------------------------- /docs/waveform_transforms/normalize.md: -------------------------------------------------------------------------------- 1 | # `Normalize` 2 | 3 | _Added in v0.6.0_ 4 | 5 | Apply a constant amount of gain, so that the highest signal level present in the sound 6 | becomes 0 dBFS, i.e. the loudest level allowed if all samples must be between -1 and 1. 7 | Also known as peak normalization. 8 | 9 | # Normalize API 10 | 11 | [`p`](#p){ #p }: `float` • range: [0.0, 1.0] 12 | : :octicons-milestone-24: Default: `0.5`. The probability of applying this transform.
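A minimal usage sketch (not part of the original page; the input values and the sample rate are made up for illustration):

```python
import numpy as np

from audiomentations import Normalize

# A quiet signal whose peak is well below full scale
samples = np.random.uniform(low=-0.3, high=0.3, size=(32000,)).astype(np.float32)

augment = Normalize(p=1.0)
normalized_samples = augment(samples=samples, sample_rate=16000)

# After peak normalization, the largest absolute sample value is 1.0 (0 dBFS)
assert np.isclose(np.max(np.abs(normalized_samples)), 1.0)
```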
13 | 14 | ## Source code :octicons-mark-github-16: 15 | 16 | [audiomentations/augmentations/normalize.py :octicons-link-external-16:](https://github.com/iver56/audiomentations/blob/main/audiomentations/augmentations/normalize.py){target=_blank} 17 | -------------------------------------------------------------------------------- /tests/test_trim.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from audiomentations import Trim, Compose 4 | 5 | 6 | class TestTrim: 7 | def test_trim(self): 8 | sample_len = 1024 9 | samples1 = np.zeros((sample_len,), dtype=np.float32) 10 | samples2 = np.random.normal(0, 1, size=sample_len).astype(np.float32) 11 | sample_rate = 16000 12 | augmenter = Compose([Trim(top_db=20, p=1.0)]) 13 | samples_in = np.hstack((samples1, samples2)) 14 | assert len(samples_in) == sample_len * 2 15 | samples_out = augmenter(samples=samples_in, sample_rate=sample_rate) 16 | 17 | assert samples_out.dtype == np.float32 18 | assert len(samples_out) < sample_len * 2 19 | -------------------------------------------------------------------------------- /docs/spectrogram_transforms.md: -------------------------------------------------------------------------------- 1 | audiomentations is in a very early (read: not very useful yet) stage when it comes to spectrogram transforms. Consider applying waveform transforms before converting your waveforms to spectrograms, or check out [alternative libraries](alternatives.md) 2 | 3 | # `SpecChannelShuffle` 4 | 5 | _Added in v0.13.0_ 6 | 7 | Shuffle the channels of a multichannel spectrogram. This can help combat positional bias. 8 | 9 | # `SpecFrequencyMask` 10 | 11 | _Added in v0.13.0_ 12 | 13 | Mask a set of frequencies in a spectrogram, à la Google AI SpecAugment. This type of data 14 | augmentation has proved to make speech recognition models more robust. 15 | 16 | The masked frequencies can be replaced with either the mean of the original values or a 17 | given constant (e.g. zero). 
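A hypothetical usage sketch (the spectrogram shape and the parameter value here are made up; it assumes `SpecCompose` and `SpecFrequencyMask` are importable from the top-level package, as the test suite suggests). Note that spectrogram transforms take only the spectrogram, not a sample rate:

```python
import numpy as np

from audiomentations import SpecCompose, SpecFrequencyMask

# A toy magnitude spectrogram with shape (frequency_bins, time_steps)
magnitude_spectrogram = np.random.random((257, 100)).astype(np.float32)

augment = SpecCompose([SpecFrequencyMask(p=1.0)])
masked_spectrogram = augment(magnitude_spectrogram)
```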
18 | -------------------------------------------------------------------------------- /tests/test_equalizer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | from numpy.testing import assert_array_almost_equal 4 | 5 | from audiomentations.augmentations.seven_band_parametric_eq import SevenBandParametricEQ 6 | 7 | 8 | class TestSevenBandParametricEQ: 9 | @pytest.mark.parametrize( 10 | "shape", 11 | [(44100,), (1, 22049), (2, 10000)], 12 | ) 13 | def test_apply_eq(self, shape: tuple): 14 | samples_in = np.random.normal(0.0, 0.5, size=shape).astype(np.float32) 15 | sample_rate = 44100 16 | augmenter = SevenBandParametricEQ(p=1.0) 17 | 18 | samples_out = augmenter(samples=samples_in, sample_rate=sample_rate) 19 | assert samples_out.dtype == np.float32 20 | assert samples_out.shape == shape 21 | 22 | with np.testing.assert_raises(AssertionError): 23 | assert_array_almost_equal(samples_out, samples_in) 24 | -------------------------------------------------------------------------------- /audiomentations/augmentations/trim.py: -------------------------------------------------------------------------------- 1 | import librosa 2 | import numpy as np 3 | from numpy.typing import NDArray 4 | 5 | from audiomentations.core.transforms_interface import BaseWaveformTransform 6 | 7 | 8 | class Trim(BaseWaveformTransform): 9 | """ 10 | Trim leading and trailing silence from an audio signal using librosa.effects.trim 11 | """ 12 | 13 | supports_multichannel = True 14 | 15 | def __init__(self, top_db: float = 30.0, p: float = 0.5): 16 | """ 17 | :param top_db: The threshold (in decibels) below reference to consider as silence 18 | :param p: The probability of applying this transform 19 | """ 20 | super().__init__(p) 21 | self.top_db = top_db 22 | 23 | def apply(self, samples: NDArray[np.float32], sample_rate: int): 24 | samples, _ = librosa.effects.trim(samples, top_db=self.top_db)  # the returned interval indices are not needed here 25 | return samples 26 | -------------------------------------------------------------------------------- /audiomentations/spec_augmentations/spec_channel_shuffle.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | from audiomentations.core.transforms_interface import BaseSpectrogramTransform 4 | 5 | 6 | class SpecChannelShuffle(BaseSpectrogramTransform): 7 | """ 8 | Shuffle the channels of a multichannel spectrogram (channels last). 9 | This can help combat positional bias.
10 | """ 11 | supports_multichannel = True 12 | supports_mono = False 13 | 14 | def randomize_parameters(self, magnitude_spectrogram): 15 | super().randomize_parameters(magnitude_spectrogram) 16 | if self.parameters["should_apply"]: 17 | self.parameters["shuffled_channel_indexes"] = list(range(magnitude_spectrogram.shape[-1])) 18 | random.shuffle(self.parameters["shuffled_channel_indexes"]) 19 | 20 | def apply(self, magnitude_spectrogram): 21 | return magnitude_spectrogram[..., self.parameters["shuffled_channel_indexes"]] 22 | -------------------------------------------------------------------------------- /tests/test_gaussian_noise.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | 4 | from audiomentations import AddGaussianNoise, Compose 5 | 6 | 7 | class TestGaussianNoise: 8 | def test_gaussian_noise(self): 9 | samples = np.zeros((20,), dtype=np.float32) 10 | sample_rate = 16000 11 | augmenter = Compose([AddGaussianNoise(p=1.0)]) 12 | samples = augmenter(samples=samples, sample_rate=sample_rate) 13 | 14 | assert samples.dtype == np.float32 15 | assert not (float(np.sum(np.abs(samples))) == pytest.approx(0.0)) 16 | 17 | def test_gaussian_noise_stereo(self): 18 | samples = np.zeros((2, 2000), dtype=np.float32) 19 | sample_rate = 16000 20 | augmenter = Compose([AddGaussianNoise(p=1.0)]) 21 | samples = augmenter(samples=samples, sample_rate=sample_rate) 22 | 23 | assert samples.dtype == np.float32 24 | assert not (float(np.sum(np.abs(samples))) == pytest.approx(0.0)) 25 | -------------------------------------------------------------------------------- /packaging.md: -------------------------------------------------------------------------------- 1 | * Check that all unit tests are OK 2 | * Run the demo and listen to the sounds to empirically check the results of any new or changed transforms 3 | * Bump the version number in `audiomentations/__init__.py` in accordance with the [semantic versioning specification](https://semver.org/) 4 | * Write a summary of the changes in the version history section in changelog.md. Remember to add a link to the new version near the bottom of the file. 5 | * Include the changelog for only the newest version in README.md 6 | * Commit and push the change with a commit message like this: "Release vx.y.z" (replace x.y.z with the package version) 7 | * Add and push a git tag to the release commit 8 | * Add a release here: https://github.com/iver56/audiomentations/releases/new 9 | * Update the Zenodo badge in README.md and docs/index.md. Commit and push. 10 | * Remove any old files inside the dist folder 11 | * `python setup.py sdist bdist_wheel` 12 | * `python -m twine upload dist/*` 13 | -------------------------------------------------------------------------------- /docs/waveform_transforms/resample.md: -------------------------------------------------------------------------------- 1 | # `Resample` 2 | 3 | _Added in v0.8.0_ 4 | 5 | Resample the signal using librosa.core.resample 6 | 7 | To do downsampling only, set both the minimum and the maximum sample rate lower than the 8 | original sample rate. Conversely, to do upsampling only, set both higher than the original. 9 | 10 | # Resample API 11 | 12 | [`min_sample_rate`](#min_sample_rate){ #min_sample_rate }: `int` • unit: Hz 13 | : :octicons-milestone-24: Default: `8000`. Minimum sample rate 14 | 15 | [`max_sample_rate`](#max_sample_rate){ #max_sample_rate }: `int` • unit: Hz 16 | : :octicons-milestone-24: Default: `44100`.
Maximum sample rate 17 | 18 | [`p`](#p){ #p }: `float` • range: [0.0, 1.0] 19 | : :octicons-milestone-24: Default: `0.5`. The probability of applying this transform. 20 | 21 | ## Source code :octicons-mark-github-16: 22 | 23 | [audiomentations/augmentations/resample.py :octicons-link-external-16:](https://github.com/iver56/audiomentations/blob/main/audiomentations/augmentations/resample.py){target=_blank} 24 | -------------------------------------------------------------------------------- /tests/test_resample.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | import pytest 4 | 5 | from audiomentations import Resample, Compose 6 | 7 | 8 | class TestResample: 9 | @pytest.mark.parametrize( 10 | "samples", 11 | [ 12 | np.zeros((512,), dtype=np.float32), 13 | np.zeros( 14 | ( 15 | 2, 16 | 2512, 17 | ), 18 | dtype=np.float32, 19 | ), 20 | ], 21 | ) 22 | def test_resample(self, samples): 23 | sample_rate = 16000 24 | augmenter = Compose( 25 | [Resample(min_sample_rate=8000, max_sample_rate=44100, p=1.0)] 26 | ) 27 | input_length = samples.shape[-1]  # capture before samples gets reassigned below 28 | samples = augmenter(samples=samples, sample_rate=sample_rate) 29 | assert samples.dtype == np.float32 30 | assert samples.shape[-1] <= math.ceil(input_length * 44100 / sample_rate) 31 | assert samples.shape[-1] >= math.ceil(input_length * 8000 / sample_rate) 32 | -------------------------------------------------------------------------------- /audiomentations/augmentations/lambda_transform.py: -------------------------------------------------------------------------------- 1 | from typing import Callable 2 | 3 | import numpy as np 4 | from numpy.typing import NDArray 5 | 6 | from audiomentations.core.transforms_interface import BaseWaveformTransform 7 | 8 | 9 | class Lambda(BaseWaveformTransform): 10 | """ 11 | Apply a user-defined transform (callable) to the signal. 12 | """ 13 | 14 | supports_multichannel = True 15 | 16 | def __init__(self, transform: Callable, p: float = 0.5, **kwargs): 17 | """ 18 | :param transform: A callable to be applied over samples. It should accept 19 | samples (ndarray) and sample_rate (int) as arguments, and optionally some 20 | user-defined keyword arguments. 21 | :param p: The probability of applying this transform 22 | :param **kwargs: Any extra keyword arguments to be passed to the transform. 23 | """ 24 | super().__init__(p=p) 25 | self.transform = transform 26 | self.kwargs = kwargs 27 | 28 | def apply(self, samples: NDArray[np.float32], sample_rate: int): 29 | return self.transform(samples, sample_rate, **self.kwargs) 30 | -------------------------------------------------------------------------------- /docs/waveform_transforms/polarity_inversion.md: -------------------------------------------------------------------------------- 1 | # `PolarityInversion` 2 | 3 | _Added in v0.11.0_ 4 | 5 | Flip the audio samples upside-down, reversing their polarity. In other words, multiply the 6 | waveform by -1, so negative values become positive, and vice versa. The result will sound 7 | the same as the original when played back in isolation. However, when mixed with 8 | other audio sources, the result may be different. This waveform inversion technique 9 | is sometimes used for audio cancellation or obtaining the difference between two waveforms. 10 | However, in the context of audio data augmentation, this transform can be useful when 11 | training phase-aware machine learning models.
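A minimal usage sketch (mirroring the input values used in `tests/test_polarity_inversion.py`; the sample rate is arbitrary):

```python
import numpy as np

from audiomentations import PolarityInversion

samples = np.array([1.0, 0.5, -0.25, -0.125, 0.0], dtype=np.float32)

augment = PolarityInversion(p=1.0)
inverted_samples = augment(samples=samples, sample_rate=16000)

# Every sample is multiplied by -1, giving [-1.0, -0.5, 0.25, 0.125, 0.0]
```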
12 | 13 | # PolarityInversion API 14 | 15 | [`p`](#p){ #p }: `float` • range: [0.0, 1.0] 16 | : :octicons-milestone-24: Default: `0.5`. The probability of applying this transform. 17 | 18 | ## Source code :octicons-mark-github-16: 19 | 20 | [audiomentations/augmentations/polarity_inversion.py :octicons-link-external-16:](https://github.com/iver56/audiomentations/blob/main/audiomentations/augmentations/polarity_inversion.py){target=_blank} 21 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Iver Jordal 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /audiomentations/augmentations/reverse.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from numpy.typing import NDArray 3 | 4 | from audiomentations.core.transforms_interface import BaseWaveformTransform 5 | 6 | 7 | class Reverse(BaseWaveformTransform): 8 | """ 9 | Reverse the audio. Also known as time inversion. Inversion of an audio track along its time 10 | axis relates to the random flip of an image, which is an augmentation technique that is 11 | widely used in the visual domain. This can be relevant in the context of audio 12 | classification. 
It was successfully applied in the paper 13 | AudioCLIP: Extending CLIP to Image, Text and Audio 14 | https://arxiv.org/pdf/2106.13043.pdf 15 | """ 16 | 17 | supports_multichannel = True 18 | 19 | def __init__(self, p: float = 0.5): 20 | """ 21 | :param p: The probability of applying this transform 22 | """ 23 | super().__init__(p) 24 | 25 | def apply(self, samples: NDArray[np.float32], sample_rate: int): 26 | if len(samples.shape) > 1: 27 | return np.fliplr(samples) 28 | else: 29 | return np.flipud(samples) 30 | -------------------------------------------------------------------------------- /tests/test_clip.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | 4 | from audiomentations import Clip 5 | 6 | 7 | class TestClip: 8 | def test_single_channel(self): 9 | samples = np.array([0.5, 0.6, -0.2, 0.0], dtype=np.float32) 10 | sample_rate = 16000 11 | augmenter = Clip(a_min=-0.1, a_max=0.1, p=1.0) 12 | samples = augmenter(samples=samples, sample_rate=sample_rate) 13 | 14 | assert np.amin(samples) == pytest.approx(-0.1) 15 | assert np.amax(samples) == pytest.approx(0.1) 16 | assert samples.dtype == np.float32 17 | assert samples.shape[-1] == 4 18 | 19 | def test_multichannel(self): 20 | samples = np.array( 21 | [[0.9, 0.5, -0.25, -0.125, 0.0], [0.95, 0.5, -0.25, -0.125, 0.0]], 22 | dtype=np.float32, 23 | ) 24 | sample_rate = 16000 25 | augmenter = Clip(a_min=-0.1, a_max=0.1, p=1.0) 26 | samples = augmenter(samples=samples, sample_rate=sample_rate) 27 | 28 | assert np.amin(samples) == pytest.approx(-0.1) 29 | assert np.amax(samples) == pytest.approx(0.1) 30 | assert samples.dtype == np.float32 31 | -------------------------------------------------------------------------------- /tests/test_reverse.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from numpy.testing import assert_array_almost_equal 3 | 4 | from audiomentations import Reverse 5 | 6 | 7 | class TestReverse: 8 | def test_single_channel(self): 9 | samples = np.array([0.5, 0.6, -0.2, 0.0], dtype=np.float32) 10 | sample_rate = 16000 11 | augmenter = Reverse(p=1.0) 12 | samples = augmenter(samples=samples, sample_rate=sample_rate) 13 | 14 | assert samples.dtype == np.float32 15 | assert samples.shape[-1] == 4 16 | 17 | def test_multichannel(self): 18 | samples = np.array( 19 | [[0.9, 0.5, -0.25, -0.125, 0.0], [0.95, 0.5, -0.25, -0.125, 0.0]], 20 | dtype=np.float32, 21 | ) 22 | sample_rate = 16000 23 | augmenter = Reverse(p=1.0) 24 | reversed_samples = augmenter(samples=samples, sample_rate=sample_rate) 25 | 26 | assert samples.dtype == np.float32 27 | assert_array_almost_equal( 28 | reversed_samples, 29 | np.array( 30 | [[0.0, -0.125, -0.25, 0.5, 0.9], [0.0, -0.125, -0.25, 0.5, 0.95]], 31 | dtype=np.float32, 32 | ), 33 | ) 34 | -------------------------------------------------------------------------------- /tests/utils.py: -------------------------------------------------------------------------------- 1 | def plot_matrix(matrix, output_image_path=None, vmin=None, vmax=None, title=None): 2 | """ 3 | Plot a 2D matrix with viridis color map 4 | 5 | :param matrix: 2D numpy array 6 | :return: 7 | """ 8 | import matplotlib.pyplot as plt 9 | 10 | fig = plt.figure() 11 | ax = fig.add_subplot(111) 12 | if title is not None: 13 | ax.set_title(title) 14 | plt.imshow(matrix, vmin=vmin, vmax=vmax) 15 | if matrix.shape[-1] != 3: 16 | plt.colorbar() 17 | if output_image_path: 18 | 
plt.savefig(str(output_image_path), dpi=200) 19 | else: 20 | plt.show() 21 | plt.close(fig) 22 | 23 | 24 | def plot_waveforms(wf1, wf2=None, wf3=None, title="Untitled"): 25 | """Plot one, two or three short 1D waveforms. Useful for debugging.""" 26 | import matplotlib.pyplot as plt 27 | 28 | fig = plt.figure() 29 | ax = fig.add_subplot(111) 30 | ax.set_title(title) 31 | plt.plot(wf1, label="wf1", alpha=0.66) 32 | if wf2 is not None: 33 | plt.plot(wf2, label="wf2", alpha=0.66) 34 | if wf3 is not None: 35 | plt.plot(wf3, label="wf3", alpha=0.66) 36 | plt.legend() 37 | plt.show() 38 | plt.close(fig) 39 | -------------------------------------------------------------------------------- /docs/waveform_transforms/clipping_distortion.md: -------------------------------------------------------------------------------- 1 | # `ClippingDistortion` 2 | 3 | _Added in v0.8.0_ 4 | 5 | Distort signal by clipping a random percentage of points 6 | 7 | The percentage of points that will be clipped is drawn from a uniform distribution between 8 | the two input parameters `min_percentile_threshold` and `max_percentile_threshold`. If for instance 9 | 30% is drawn, the samples are clipped if they're below the 15th or above the 85th percentile. 10 | 11 | # ClippingDistortion API 12 | 13 | [`min_percentile_threshold`](#min_percentile_threshold){ #min_percentile_threshold }: `int` 14 | : :octicons-milestone-24: Default: `0`. A lower bound on the total percent of samples 15 | that will be clipped 16 | 17 | [`max_percentile_threshold`](#max_percentile_threshold){ #max_percentile_threshold }: `int` 18 | : :octicons-milestone-24: Default: `40`. An upper bound on the total percent of 19 | samples that will be clipped 20 | 21 | [`p`](#p){ #p }: `float` • range: [0.0, 1.0] 22 | : :octicons-milestone-24: Default: `0.5`. The probability of applying this transform. 
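A minimal usage sketch (the input signal and the chosen thresholds are illustrative; the parameter names are as documented above):

```python
import numpy as np

from audiomentations import ClippingDistortion

samples = np.random.normal(0.0, 0.1, size=16000).astype(np.float32)

augment = ClippingDistortion(
    min_percentile_threshold=20, max_percentile_threshold=40, p=1.0
)
distorted_samples = augment(samples=samples, sample_rate=16000)
```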
23 | 24 | ## Source code :octicons-mark-github-16: 25 | 26 | [audiomentations/augmentations/clipping_distortion.py :octicons-link-external-16:](https://github.com/iver56/audiomentations/blob/main/audiomentations/augmentations/clipping_distortion.py){target=_blank} 27 | -------------------------------------------------------------------------------- /tests/test_lambda.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | 4 | from audiomentations import Lambda, Gain 5 | 6 | 7 | class TestLambda: 8 | def test_gain_lambda(self): 9 | samples_in = np.random.normal(0, 1, size=1024).astype(np.float32) 10 | augmenter = Lambda( 11 | transform=Gain(min_gain_db=50, max_gain_db=50, p=1.0), p=1.0 12 | ) 13 | std_in = np.mean(np.abs(samples_in)) 14 | samples_out = augmenter(samples=samples_in, sample_rate=16000) 15 | std_out = np.mean(np.abs(samples_out)) 16 | assert samples_out.dtype == np.float32 17 | assert samples_out.shape == samples_in.shape 18 | assert std_out > 100 * std_in 19 | 20 | def test_lambda_with_kwargs(self): 21 | samples_in = np.random.normal(0, 1, size=1024).astype(np.float32) 22 | augmenter = Lambda( 23 | transform=lambda samples, sample_rate, offset: samples + offset, 24 | p=1.0, 25 | offset=-0.2, 26 | ) 27 | input_mean = np.mean(samples_in) 28 | samples_out = augmenter(samples=samples_in, sample_rate=16000) 29 | output_mean = np.mean(samples_out) 30 | assert samples_out.dtype == np.float32 31 | assert samples_out.shape == samples_in.shape 32 | assert output_mean == pytest.approx(input_mean - 0.2) 33 | -------------------------------------------------------------------------------- /audiomentations/augmentations/polarity_inversion.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from numpy.typing import NDArray 3 | 4 | from audiomentations.core.transforms_interface import BaseWaveformTransform 5 | 6 | 7 | class PolarityInversion(BaseWaveformTransform): 8 | """ 9 | Flip the audio samples upside-down, reversing their polarity. In other words, multiply the 10 | waveform by -1, so negative values become positive, and vice versa. The result will sound 11 | the same compared to the original when played back in isolation. However, when mixed with 12 | other audio sources, the result may be different. This waveform inversion technique 13 | is sometimes used for audio cancellation or obtaining the difference between two waveforms. 14 | However, in the context of audio data augmentation, this transform can be useful when 15 | training phase-aware machine learning models. 
16 | """ 17 | 18 | supports_multichannel = True 19 | 20 | def __init__(self, p: float = 0.5): 21 | """ 22 | :param p: The probability of applying this transform 23 | """ 24 | super().__init__(p) 25 | 26 | def randomize_parameters(self, samples: NDArray[np.float32], sample_rate: int): 27 | super().randomize_parameters(samples, sample_rate) 28 | 29 | def apply(self, samples: NDArray[np.float32], sample_rate: int): 30 | return -samples 31 | -------------------------------------------------------------------------------- /tests/test_polarity_inversion.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from numpy.testing import assert_almost_equal 3 | 4 | from audiomentations import PolarityInversion, Compose 5 | 6 | 7 | class TestPolarityInversion: 8 | def test_polarity_inversion(self): 9 | samples = np.array([1.0, 0.5, -0.25, -0.125, 0.0], dtype=np.float32) 10 | sample_rate = 16000 11 | 12 | augment = Compose([PolarityInversion(p=1.0)]) 13 | inverted_samples = augment(samples=samples, sample_rate=sample_rate) 14 | assert_almost_equal( 15 | inverted_samples, np.array([-1.0, -0.5, 0.25, 0.125, 0.0], dtype=np.float32) 16 | ) 17 | assert inverted_samples.dtype == np.float32 18 | 19 | def test_polarity_inversion_multichannel(self): 20 | samples = np.array( 21 | [[1.0, 0.5, -0.25, -0.125, 0.0], [1.0, 0.5, -0.25, -0.125, 0.0]], 22 | dtype=np.float32, 23 | ) 24 | sample_rate = 16000 25 | 26 | augment = Compose([PolarityInversion(p=1.0)]) 27 | inverted_samples = augment(samples=samples, sample_rate=sample_rate) 28 | assert_almost_equal( 29 | inverted_samples, 30 | np.array( 31 | [[-1.0, -0.5, 0.25, 0.125, 0.0], [-1.0, -0.5, 0.25, 0.125, 0.0]], 32 | dtype=np.float32, 33 | ), 34 | ) 35 | assert inverted_samples.dtype == np.float32 36 | -------------------------------------------------------------------------------- /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | # Python CircleCI 2.1 configuration file 2 | # 3 | # Check https://circleci.com/docs/2.0/language-python/ for more details 4 | # 5 | version: 2.1 6 | orbs: 7 | codecov: codecov/codecov@3.2.3 8 | jobs: 9 | build: 10 | docker: 11 | - image: cimg/python:3.9.13 12 | working_directory: ~/repo 13 | steps: 14 | - checkout 15 | 16 | # Download and cache dependencies 17 | - restore_cache: 18 | keys: 19 | - v3-dependencies-{{ checksum "requirements.txt" }} 20 | # Fall back to using the latest cache if no exact match is found 21 | - v3-dependencies- 22 | 23 | - run: 24 | name: install dependencies 25 | command: | 26 | python3 -m venv venv 27 | . venv/bin/activate 28 | pip install wheel 29 | pip install -r requirements.txt 30 | sudo apt update 31 | sudo apt install software-properties-common 32 | sudo apt install ffmpeg 33 | 34 | - save_cache: 35 | paths: 36 | - ./venv 37 | key: v3-dependencies-{{ checksum "requirements.txt" }} 38 | 39 | # Run tests and measure code coverage 40 | - run: 41 | name: run tests 42 | command: | 43 | . venv/bin/activate 44 | pytest 45 | 46 | - codecov/upload: 47 | file: coverage.xml 48 | -------------------------------------------------------------------------------- /docs/waveform_transforms/padding.md: -------------------------------------------------------------------------------- 1 | # `Padding` 2 | 3 | _Added in v0.23.0_ 4 | 5 | Apply padding to the audio signal - take a fraction of the end or the start of the 6 | audio and replace that part with padding. 
This can be useful when preparing data for ML models 7 | that require inputs of constant length. 8 | 9 | # Padding API 10 | 11 | [`mode`](#mode){ #mode }: `str` • choices: `"silence"`, `"wrap"`, `"reflect"` 12 | : :octicons-milestone-24: Default: `"silence"`. Padding mode. 13 | 14 | [`min_fraction`](#min_fraction){ #min_fraction }: `float` • range: [0.0, 1.0] 15 | : :octicons-milestone-24: Default: `0.01`. Minimum fraction of the signal duration to be padded 16 | 17 | [`max_fraction`](#max_fraction){ #max_fraction }: `float` • range: [0.0, 1.0] 18 | : :octicons-milestone-24: Default: `0.7`. Maximum fraction of the signal duration to be padded 19 | 20 | [`pad_section`](#pad_section){ #pad_section }: `str` • choices: `"start"`, `"end"` 21 | : :octicons-milestone-24: Default: `"end"`. Which part of the signal should be replaced with padding 22 | 23 | [`p`](#p){ #p }: `float` • range: [0.0, 1.0] 24 | : :octicons-milestone-24: Default: `0.5`. The probability of applying this transform. 25 | 26 | ## Source code :octicons-mark-github-16: 27 | 28 | [audiomentations/augmentations/padding.py :octicons-link-external-16:](https://github.com/iver56/audiomentations/blob/main/audiomentations/augmentations/padding.py){target=_blank} 29 | -------------------------------------------------------------------------------- /audiomentations/augmentations/add_gaussian_noise.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | import numpy as np 4 | from numpy.typing import NDArray 5 | 6 | from audiomentations.core.transforms_interface import BaseWaveformTransform 7 | 8 | 9 | class AddGaussianNoise(BaseWaveformTransform): 10 | """Add gaussian noise to the samples""" 11 | 12 | supports_multichannel = True 13 | 14 | def __init__(self, min_amplitude=0.001, max_amplitude=0.015, p=0.5): 15 | """ 16 | 17 | :param min_amplitude: Minimum noise amplification factor 18 | :param max_amplitude: Maximum noise amplification factor 19 | :param p: The probability of applying this transform 20 | """ 21 | super().__init__(p) 22 | assert min_amplitude > 0.0 23 | assert max_amplitude > 0.0 24 | assert max_amplitude >= min_amplitude 25 | self.min_amplitude = min_amplitude 26 | self.max_amplitude = max_amplitude 27 | 28 | def randomize_parameters(self, samples: NDArray[np.float32], sample_rate: int): 29 | super().randomize_parameters(samples, sample_rate) 30 | if self.parameters["should_apply"]: 31 | self.parameters["amplitude"] = random.uniform( 32 | self.min_amplitude, self.max_amplitude 33 | ) 34 | 35 | def apply(self, samples: NDArray[np.float32], sample_rate: int): 36 | noise = np.random.randn(*samples.shape).astype(np.float32) 37 | samples = samples + self.parameters["amplitude"] * noise 38 | return samples 39 | -------------------------------------------------------------------------------- /tests/test_aliasing.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | 4 | from audiomentations import Aliasing 5 | 6 | 7 | class TestAliasing: 8 | def test_single_channel(self): 9 | samples = np.random.normal(0, 0.1, size=(2048,)).astype(np.float32) 10 | sample_rate = 16000 11 | augmenter = Aliasing(min_sample_rate=8000, max_sample_rate=32000, p=1.0) 12 | 13 | distorted_samples = augmenter(samples=samples, sample_rate=sample_rate) 14 | 15 | assert samples.dtype == distorted_samples.dtype 16 | assert samples.shape == distorted_samples.shape 17 | assert not np.array_equal(samples, distorted_samples) 18 | 19 | def test_multichannel(self): 20 |
num_channels = 3 21 | samples = np.random.normal(0, 0.1, size=(num_channels, 2048)).astype(np.float32) 22 | sample_rate = 16000 23 | augmenter = Aliasing(min_sample_rate=8000, max_sample_rate=32000, p=1.0) 24 | 25 | distorted_samples = augmenter(samples=samples, sample_rate=sample_rate) 26 | 27 | assert samples.dtype == distorted_samples.dtype 28 | assert samples.shape == distorted_samples.shape 29 | assert not np.array_equal(samples, distorted_samples) 30 | 31 | def test_param_range(self): 32 | with pytest.raises(ValueError): 33 | Aliasing(min_sample_rate=0, max_sample_rate=6000, p=1.0) 34 | with pytest.raises(ValueError): 35 | Aliasing(min_sample_rate=8000, max_sample_rate=6000, p=1.0) 36 | -------------------------------------------------------------------------------- /tests/test_tanh_distortion.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | 4 | from audiomentations import TanhDistortion 5 | from audiomentations.core.utils import calculate_rms 6 | 7 | 8 | def test_single_channel(): 9 | samples = np.random.normal(0, 0.1, size=(20480,)).astype(np.float32) 10 | sample_rate = 16000 11 | augmenter = TanhDistortion(min_distortion=0.2, max_distortion=0.6, p=1.0) 12 | 13 | distorted_samples = augmenter(samples=samples, sample_rate=sample_rate) 14 | 15 | assert samples.dtype == distorted_samples.dtype 16 | assert samples.shape == distorted_samples.shape 17 | assert np.amax(distorted_samples) < np.amax(samples) 18 | assert calculate_rms(distorted_samples) == pytest.approx( 19 | calculate_rms(samples), abs=1e-3 20 | ) 21 | 22 | 23 | def test_multichannel(): 24 | num_channels = 3 25 | samples = np.random.normal(0, 0.1, size=(num_channels, 5555)).astype(np.float32) 26 | sample_rate = 16000 27 | augmenter = TanhDistortion(min_distortion=0.05, max_distortion=0.6, p=1.0) 28 | 29 | distorted_samples = augmenter(samples=samples, sample_rate=sample_rate) 30 | 31 | assert samples.dtype == distorted_samples.dtype 32 | assert samples.shape == distorted_samples.shape 33 | for i in range(num_channels): 34 | assert not np.allclose(samples[i], distorted_samples[i]) 35 | assert calculate_rms(distorted_samples[i]) == pytest.approx( 36 | calculate_rms(samples[i]), abs=1e-3 37 | ) 38 | -------------------------------------------------------------------------------- /audiomentations/augmentations/normalize.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from numpy.typing import NDArray 3 | 4 | from audiomentations.core.transforms_interface import BaseWaveformTransform 5 | from audiomentations.core.utils import get_max_abs_amplitude 6 | 7 | 8 | class Normalize(BaseWaveformTransform): 9 | """ 10 | Apply a constant amount of gain, so that the highest signal level present in the sound becomes 11 | 0 dBFS, i.e. the loudest level allowed if all samples must be between -1 and 1. Also known 12 | as peak normalization.
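    A minimal usage sketch (the call signature matches how transforms are invoked
    in this library's tests; the example array values are made up for illustration):

        >>> import numpy as np
        >>> from audiomentations import Normalize
        >>> transform = Normalize(p=1.0)
        >>> transform(samples=np.array([0.5, -0.25], dtype=np.float32), sample_rate=16000)
        array([ 1. , -0.5], dtype=float32)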
13 | """ 14 | 15 | supports_multichannel = True 16 | 17 | def __init__(self, apply_to: str = "all", p: float = 0.5): 18 | super().__init__(p) 19 | assert apply_to in ("all", "only_too_loud_sounds") 20 | self.apply_to = apply_to 21 | 22 | def randomize_parameters(self, samples: NDArray[np.float32], sample_rate: int): 23 | super().randomize_parameters(samples, sample_rate) 24 | if self.parameters["should_apply"]: 25 | self.parameters["max_amplitude"] = get_max_abs_amplitude(samples) 26 | 27 | def apply(self, samples: NDArray[np.float32], sample_rate: int): 28 | if ( 29 | self.apply_to == "only_too_loud_sounds" 30 | and self.parameters["max_amplitude"] < 1.0 31 | ): 32 | return samples 33 | 34 | if self.parameters["max_amplitude"] > 0: 35 | return samples / self.parameters["max_amplitude"] 36 | else: 37 | return samples 38 | -------------------------------------------------------------------------------- /tests/test_clipping_distortion.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | 4 | from audiomentations import ClippingDistortion, Compose 5 | 6 | 7 | class TestClippingDistortion: 8 | def test_distort(self): 9 | sample_len = 1024 10 | samples_in = np.random.normal(0, 1, size=sample_len).astype(np.float32) 11 | sample_rate = 16000 12 | augmenter = Compose( 13 | [ 14 | ClippingDistortion( 15 | min_percentile_threshold=20, max_percentile_threshold=40, p=1.0 16 | ) 17 | ] 18 | ) 19 | 20 | samples_out = augmenter(samples=samples_in, sample_rate=sample_rate) 21 | assert samples_out.dtype == np.float32 22 | assert len(samples_out) == sample_len 23 | assert sum(abs(samples_out)) < sum(abs(samples_in)) 24 | 25 | def test_distort_multichannel(self): 26 | sample_len = 32000 27 | samples_in = np.random.normal(0, 1, size=(2, sample_len)).astype(np.float32) 28 | sample_rate = 16000 29 | augmenter = ClippingDistortion( 30 | min_percentile_threshold=20, max_percentile_threshold=40, p=1.0 31 | ) 32 | 33 | samples_out = augmenter(samples=samples_in, sample_rate=sample_rate) 34 | assert samples_out.dtype == np.float32 35 | assert samples_out.shape == samples_in.shape 36 | assert np.sum(np.abs(samples_out)) < np.sum(np.abs(samples_in)) 37 | assert np.amax(samples_out[0, :]) == pytest.approx(np.amax(samples_out[1, :])) 38 | -------------------------------------------------------------------------------- /docs/waveform_transforms/clip.md: -------------------------------------------------------------------------------- 1 | # `Clip` 2 | 3 | _Added in v0.17.0_ 4 | 5 | Clip audio by specified values. e.g. set `a_min=-1.0` and `a_max=1.0` to ensure that no 6 | samples in the audio exceed that extent. This can be relevant for avoiding integer 7 | overflow or underflow (which results in unintended wrap distortion that can sound 8 | horrible) when exporting to e.g. 16-bit PCM wav. 9 | 10 | Another way of ensuring that all values stay between -1.0 and 1.0 is to apply 11 | `PeakNormalization`. 12 | 13 | This transform is different from `ClippingDistortion` in that it takes fixed values 14 | for clipping instead of clipping a random percentile of the samples. Arguably, this 15 | transform is not very useful for data augmentation. Instead, think of it as a very 16 | cheap and harsh limiter (for samples that exceed the allotted extent) that can 17 | sometimes be useful at the end of a data augmentation pipeline. 18 | 19 | # Clip API 20 | 21 | [`a_min`](#a_min){ #a_min }: `float` 22 | : :octicons-milestone-24: Default: `-1.0`. 
Minimum value for clipping. 23 | 24 | [`a_max`](#a_max){ #a_max }: `float` 25 | : :octicons-milestone-24: Default: `1.0`. Maximum value for clipping. 26 | 27 | [`p`](#p){ #p }: `float` • range: [0.0, 1.0] 28 | : :octicons-milestone-24: Default: `0.5`. The probability of applying this transform. 29 | 30 | ## Source code :octicons-mark-github-16: 31 | 32 | [audiomentations/augmentations/clip.py :octicons-link-external-16:](https://github.com/iver56/audiomentations/blob/main/audiomentations/augmentations/clip.py){target=_blank} 33 | -------------------------------------------------------------------------------- /docs/waveform_transforms/gain.md: -------------------------------------------------------------------------------- 1 | # `Gain` 2 | 3 | _Added in v0.11.0_ 4 | 5 | Multiply the audio by a random amplitude factor to reduce or increase the volume. This 6 | technique can help a model become somewhat invariant to the overall gain of the input audio. 7 | 8 | Warning: This transform can return samples outside the [-1, 1] range, which may lead to 9 | clipping or wrap distortion, depending on what you do with the audio in a later stage. 10 | See also [https://en.wikipedia.org/wiki/Clipping_(audio)#Digital_clipping](https://en.wikipedia.org/wiki/Clipping_(audio)#Digital_clipping) 11 | 12 | # Gain API 13 | 14 | [`min_gain_in_db`](#min_gain_in_db){ #min_gain_in_db }: `float` • unit: Decibel 15 | : :warning: Deprecated as of v0.31.0. Use [`min_gain_db`](#min_gain_db) instead 16 | 17 | [`max_gain_in_db`](#max_gain_in_db){ #max_gain_in_db }: `float` • unit: Decibel 18 | : :warning: Deprecated as of v0.31.0. Use [`max_gain_db`](#max_gain_db) instead 19 | 20 | [`min_gain_db`](#min_gain_db){ #min_gain_db }: `float` • unit: Decibel 21 | : :octicons-milestone-24: Default: `-12.0`. Minimum gain. 22 | 23 | [`max_gain_db`](#max_gain_db){ #max_gain_db }: `float` • unit: Decibel 24 | : :octicons-milestone-24: Default: `12.0`. Maximum gain. 25 | 26 | [`p`](#p){ #p }: `float` • range: [0.0, 1.0] 27 | : :octicons-milestone-24: Default: `0.5`. The probability of applying this transform. 
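## Usage example

A minimal sketch using the parameters documented above (`my_waveform_ndarray` is a placeholder for a float32 NumPy array):

```python
from audiomentations import Gain

transform = Gain(
    min_gain_db=-12.0,
    max_gain_db=12.0,
    p=1.0
)

augmented_sound = transform(my_waveform_ndarray, sample_rate=16000)
```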
28 | 29 | ## Source code :octicons-mark-github-16: 30 | 31 | [audiomentations/augmentations/gain.py :octicons-link-external-16:](https://github.com/iver56/audiomentations/blob/main/audiomentations/augmentations/gain.py){target=_blank} 32 | -------------------------------------------------------------------------------- /tests/test_time_stretch.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from audiomentations import TimeStretch 4 | 5 | 6 | class TestTimeStretch: 7 | def test_dynamic_length(self): 8 | samples = np.zeros((2048,), dtype=np.float32) 9 | sample_rate = 16000 10 | augmenter = TimeStretch( 11 | min_rate=0.8, max_rate=0.9, leave_length_unchanged=False, p=1.0 12 | ) 13 | 14 | samples = augmenter(samples=samples, sample_rate=sample_rate) 15 | 16 | assert samples.dtype == np.float32 17 | assert len(samples) > 2048 18 | 19 | def test_fixed_length(self): 20 | samples = np.zeros((2048,), dtype=np.float32) 21 | sample_rate = 16000 22 | augmenter = TimeStretch( 23 | min_rate=0.8, max_rate=0.9, leave_length_unchanged=True, p=1.0 24 | ) 25 | 26 | samples = augmenter(samples=samples, sample_rate=sample_rate) 27 | 28 | assert samples.dtype == np.float32 29 | assert len(samples) == 2048 30 | 31 | def test_multichannel(self): 32 | num_channels = 3 33 | samples = np.random.normal(0, 0.1, size=(num_channels, 5555)).astype(np.float32) 34 | sample_rate = 16000 35 | augmenter = TimeStretch( 36 | min_rate=0.8, max_rate=0.9, leave_length_unchanged=True, p=1.0 37 | ) 38 | 39 | samples_out = augmenter(samples=samples, sample_rate=sample_rate) 40 | 41 | assert samples.dtype == samples_out.dtype 42 | assert samples.shape == samples_out.shape 43 | for i in range(num_channels): 44 | assert not np.allclose(samples[i], samples_out[i]) 45 | -------------------------------------------------------------------------------- /docs/waveform_transforms/lambda.md: -------------------------------------------------------------------------------- 1 | # `Lambda` 2 | 3 | _Added in v0.26.0_ 4 | 5 | Apply a user-defined transform (callable) to the signal. The inspiration for this 6 | transform comes from albumentations' lambda transform. This allows one to have a little 7 | more fine-grained control over the operations in the context of a `Compose`, `OneOf` or `SomeOf`. 8 | 9 | ## Usage example 10 | 11 | ```python 12 | import random 13 | 14 | from audiomentations import Lambda, OneOf, Gain 15 | 16 | 17 | def gain_only_left_channel(samples, sample_rate): 18 | samples[0, :] *= random.uniform(0.8, 1.25) 19 | return samples 20 | 21 | 22 | transform = OneOf( 23 | transforms=[Lambda(transform=gain_only_left_channel, p=1.0), Gain(p=1.0)] 24 | ) 25 | 26 | augmented_sound = transform(my_stereo_waveform_ndarray, sample_rate=16000) 27 | ``` 28 | 29 | # Lambda API 30 | 31 | [`transform`](#transform){ #transform }: `Callable` 32 | : :octicons-milestone-24: A callable to be applied. It should take 33 | samples (ndarray) and sample_rate (int) as input, and it may optionally take 34 | some user-defined keyword arguments. 35 | 36 | [`p`](#p){ #p }: `float` • range: [0.0, 1.0] 37 | : :octicons-milestone-24: Default: `0.5`. The probability of applying this transform.
38 | 39 | [`**kwargs`](#**kwargs){ #**kwargs } 40 | : :octicons-milestone-24: Optional extra parameters passed to the callable transform 41 | 42 | ## Source code :octicons-mark-github-16: 43 | 44 | [audiomentations/augmentations/lambda_transform.py :octicons-link-external-16:](https://github.com/iver56/audiomentations/blob/main/audiomentations/augmentations/lambda_transform.py){target=_blank} 45 | -------------------------------------------------------------------------------- /tests/test_bit_crush.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | 4 | from audiomentations import BitCrush 5 | 6 | 7 | class TestBitCrush: 8 | def test_single_channel(self): 9 | samples = np.random.normal(0, 0.1, size=(2048,)).astype(np.float32) 10 | sample_rate = 16000 11 | augmenter = BitCrush(min_bit_depth=3, max_bit_depth=6, p=1.0) 12 | 13 | distorted_samples = augmenter(samples=samples, sample_rate=sample_rate) 14 | 15 | assert samples.dtype == distorted_samples.dtype 16 | assert samples.shape == distorted_samples.shape 17 | assert 2 ** augmenter.parameters["bit_depth"] + 1 > len( 18 | np.unique(np.round(distorted_samples, 5)) 19 | ) 20 | 21 | def test_multichannel(self): 22 | num_channels = 3 23 | samples = np.random.normal(0, 0.1, size=(num_channels, 2048)).astype(np.float32) 24 | sample_rate = 16000 25 | augmenter = BitCrush(min_bit_depth=3, max_bit_depth=6, p=1.0) 26 | 27 | distorted_samples = augmenter(samples=samples, sample_rate=sample_rate) 28 | 29 | assert samples.dtype == distorted_samples.dtype 30 | assert samples.shape == distorted_samples.shape 31 | assert 2 ** augmenter.parameters["bit_depth"] + 1 > len( 32 | np.unique(np.round(distorted_samples, 5)) 33 | ) 34 | 35 | def test_param_range(self): 36 | with pytest.raises(ValueError): 37 | BitCrush(min_bit_depth=0, max_bit_depth=6, p=1.0) 38 | with pytest.raises(ValueError): 39 | BitCrush(min_bit_depth=7, max_bit_depth=64, p=1.0) 40 | with pytest.raises(ValueError): 41 | BitCrush(min_bit_depth=8, max_bit_depth=6, p=1.0) 42 | -------------------------------------------------------------------------------- /docs/waveform_transforms/peaking_filter.md: -------------------------------------------------------------------------------- 1 | # `PeakingFilter` 2 | 3 | _Added in v0.21.0_ 4 | 5 | Add a biquad peaking filter transform 6 | 7 | # PeakingFilter API 8 | 9 | [`min_center_freq`](#min_center_freq){ #min_center_freq }: `float` • unit: hertz • range: [0.0, ∞) 10 | : :octicons-milestone-24: Default: `50.0`. The minimum center frequency of the peaking filter 11 | 12 | [`max_center_freq`](#max_center_freq){ #max_center_freq }: `float` • unit: hertz • range: [0.0, ∞) 13 | : :octicons-milestone-24: Default: `7500.0`. The maximum center frequency of the peaking filter 14 | 15 | [`min_gain_db`](#min_gain_db){ #min_gain_db }: `float` • unit: Decibel 16 | : :octicons-milestone-24: Default: `-24.0`. The minimum gain at center frequency 17 | 18 | [`max_gain_db`](#max_gain_db){ #max_gain_db }: `float` • unit: Decibel 19 | : :octicons-milestone-24: Default: `24.0`. The maximum gain at center frequency 20 | 21 | [`min_q`](#min_q){ #min_q }: `float` • range: [0.0, ∞) 22 | : :octicons-milestone-24: Default: `0.5`. The minimum quality factor Q. The higher the 23 | Q, the steeper the transition band will be. 24 | 25 | [`max_q`](#max_q){ #max_q }: `float` • range: [0.0, ∞) 26 | : :octicons-milestone-24: Default: `5.0`. The maximum quality factor Q. 
The higher the 27 | Q, the steeper the transition band will be. 28 | 29 | [`p`](#p){ #p }: `float` • range: [0.0, 1.0] 30 | : :octicons-milestone-24: Default: `0.5`. The probability of applying this transform. 31 | 32 | ## Source code :octicons-mark-github-16: 33 | 34 | [audiomentations/augmentations/peaking_filter.py :octicons-link-external-16:](https://github.com/iver56/audiomentations/blob/main/audiomentations/augmentations/peaking_filter.py){target=_blank} 35 | -------------------------------------------------------------------------------- /audiomentations/augmentations/resample.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | import librosa 4 | import numpy as np 5 | from numpy.typing import NDArray 6 | 7 | from audiomentations.core.transforms_interface import BaseWaveformTransform 8 | 9 | 10 | class Resample(BaseWaveformTransform): 11 | """ 12 | Resample signal using librosa.core.resample 13 | 14 | To do downsampling only, set both the minimum and maximum sample rate lower than the 15 | original sample rate. Conversely, to do upsampling only, set both higher than the original. 16 | """ 17 | 18 | supports_multichannel = True 19 | 20 | def __init__( 21 | self, min_sample_rate: int = 8000, max_sample_rate: int = 44100, p: float = 0.5 22 | ): 23 | """ 24 | :param min_sample_rate: Minimum sample rate 25 | :param max_sample_rate: Maximum sample rate 26 | :param p: The probability of applying this transform 27 | """ 28 | super().__init__(p) 29 | assert min_sample_rate <= max_sample_rate 30 | self.min_sample_rate = min_sample_rate 31 | self.max_sample_rate = max_sample_rate 32 | 33 | def randomize_parameters(self, samples: NDArray[np.float32], sample_rate: int): 34 | super().randomize_parameters(samples, sample_rate) 35 | if self.parameters["should_apply"]: 36 | self.parameters["target_sample_rate"] = random.randint( 37 | self.min_sample_rate, self.max_sample_rate 38 | ) 39 | 40 | def apply(self, samples: NDArray[np.float32], sample_rate: int): 41 | samples = librosa.core.resample( 42 | samples, 43 | orig_sr=sample_rate, 44 | target_sr=self.parameters["target_sample_rate"], 45 | ) 46 | return samples 47 | -------------------------------------------------------------------------------- /audiomentations/augmentations/clip.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from numpy.typing import NDArray 3 | 4 | from audiomentations.core.transforms_interface import BaseWaveformTransform 5 | 6 | 7 | class Clip(BaseWaveformTransform): 8 | """ 9 | Clip audio by specified values. e.g. set a_min=-1.0 and a_max=1.0 to ensure that no 10 | samples in the audio exceed that extent. This can be relevant for avoiding integer 11 | overflow or underflow (which results in unintended wrap distortion that can sound 12 | horrible) when exporting to e.g. 16-bit PCM wav. 13 | 14 | Another way of ensuring that all values stay between -1.0 and 1.0 is to apply 15 | PeakNormalization. 16 | 17 | This transform is different from ClippingDistortion in that it takes fixed values 18 | for clipping instead of clipping a random percentile of the samples. Arguably, this 19 | transform is not very useful for data augmentation. Instead, think of it as a very 20 | cheap and harsh limiter (for samples that exceed the allotted extent) that can 21 | sometimes be useful at the end of a data augmentation pipeline.
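    A minimal usage sketch (the parameters match the __init__ below; the example
    array values are made up for illustration):

        >>> import numpy as np
        >>> from audiomentations import Clip
        >>> transform = Clip(a_min=-1.0, a_max=1.0, p=1.0)
        >>> transform(samples=np.array([1.5, -2.0, 0.5], dtype=np.float32), sample_rate=16000)
        array([ 1. , -1. ,  0.5], dtype=float32)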
22 | """ 23 | 24 | supports_multichannel = True 25 | 26 | def __init__(self, a_min: float = -1.0, a_max: float = 1.0, p: float = 0.5): 27 | """ 28 | :param a_min: float, minimum value for clipping 29 | :param a_max: float, maximum value for clipping 30 | :param p: The probability of applying this transform 31 | """ 32 | super().__init__(p) 33 | assert a_min < a_max 34 | self.a_min = a_min 35 | self.a_max = a_max 36 | 37 | def apply(self, samples: NDArray[np.float32], sample_rate: int): 38 | return np.clip(samples, self.a_min, self.a_max) 39 | -------------------------------------------------------------------------------- /tests/test_air_absorption.py: -------------------------------------------------------------------------------- 1 | DEBUG = False 2 | 3 | from audiomentations import AirAbsorption 4 | import numpy as np 5 | import pytest 6 | import scipy 7 | import scipy.signal 8 | 9 | 10 | def get_chirp_test(sample_rate, duration): 11 | """Create a chirp of `duration` seconds that sweeps from 0 Hz to the Nyquist frequency""" 12 | n = np.arange(0, duration, 1 / sample_rate) 13 | samples = scipy.signal.chirp(n, 0, duration, sample_rate // 2, method="linear") 14 | return samples.astype(np.float32) 15 | 16 | 17 | class TestAirAbsorptionTransform: 18 | @pytest.mark.parametrize("temperature", [10, 20]) 19 | @pytest.mark.parametrize("humidity", [30, 50, 70, 90]) 20 | @pytest.mark.parametrize("distance", [5, 10, 20, 100]) 21 | @pytest.mark.parametrize("sample_rate", [8000, 16000, 48000]) 22 | def test_input_shapes(self, temperature, humidity, distance, sample_rate): 23 | np.random.seed(1) 24 | 25 | samples = get_chirp_test(sample_rate, 10) 26 | 27 | augment = AirAbsorption( 28 | min_temperature=temperature, 29 | max_temperature=temperature, 30 | min_humidity=humidity, 31 | max_humidity=humidity, 32 | min_distance=distance, 33 | max_distance=distance, 34 | ) 35 | 36 | # Test 1D case 37 | processed_samples = augment(samples, sample_rate=sample_rate) 38 | assert processed_samples.shape == samples.shape 39 | assert processed_samples.dtype == np.float32 40 | 41 | # Test 2D case 42 | samples = np.tile(samples, (2, 1)) 43 | processed_samples = augment(samples, sample_rate=sample_rate) 44 | assert processed_samples.shape == samples.shape 45 | assert processed_samples.dtype == np.float32 46 | -------------------------------------------------------------------------------- /docs/waveform_transforms/post_gain.md: -------------------------------------------------------------------------------- 1 | # `PostGain` 2 | 3 | _Added in v0.31.0_ 4 | 5 | Gain up or down the audio after the given transform (or set of transforms) has 6 | processed the audio. There are several methods that determine how the audio should 7 | be gained. `PostGain` can be useful for compensating for any gain differences introduced 8 | by a (set of) transform(s), or for preventing clipping in the output. 9 | 10 | # PostGain API 11 | 12 | [`transform`](#transform){ #transform }: `Callable[[NDArray[np.float32], int], NDArray[np.float32]]` 13 | : :octicons-milestone-24: A callable to be applied. It should take 14 | samples (ndarray) and sample_rate (int) as input, and it may optionally take 15 | some user-defined keyword arguments. 16 | 17 | [`method`](#method){ #method }: `str` • choices: `"same_rms"`, `"same_lufs"`, `"peak_normalize_always"` or `"peak_normalize_if_too_loud"` 18 | : :octicons-milestone-24: This parameter defines the method for choosing the post gain amount. 19 | 20 | * `"same_rms"`: The sound gets post-gained so that the RMS (Root Mean Square) of 21 | the output matches the RMS of the input.
22 | * `"same_lufs"`: The sound gets post-gained so that the LUFS (Loudness Units Full Scale) of 23 | the output matches the LUFS of the input. 24 | * `"peak_normalize_always"`: The sound gets peak normalized (gained up or down so 25 | that the absolute value of the most extreme sample in the output is 1.0) 26 | * `"peak_normalize_if_too_loud"`: The sound gets peak normalized if it is too 27 | loud (max absolute value greater than 1.0). This option can be useful for 28 | avoiding clipping. 29 | 30 | ## Source code :octicons-mark-github-16: 31 | 32 | [audiomentations/core/post_gain.py :octicons-link-external-16:](https://github.com/iver56/audiomentations/blob/main/audiomentations/core/post_gain.py){target=_blank} 33 | -------------------------------------------------------------------------------- /docs/waveform_transforms/trim.md: -------------------------------------------------------------------------------- 1 | # `Trim` 2 | 3 | _Added in v0.7.0_ 4 | 5 | Trim leading and trailing silence from an audio signal using `librosa.effects.trim`. Audio that is quieter than 6 | the threshold defined by the `top_db` parameter (in decibels below the reference level) is considered silence. 7 | 8 | ## Input-output example 9 | 10 | In this example, we remove silence from the start and end, using the default `top_db` parameter value 11 | 12 | ![Input-output waveforms and spectrograms](Trim.webp) 13 | 14 | | Input sound | Transformed sound | 15 | |-----------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------| 16 | | | | 17 | 18 | ## Usage example 19 | 20 | ```python 21 | from audiomentations import Trim 22 | 23 | transform = Trim( 24 | top_db=30.0, 25 | p=1.0 26 | ) 27 | 28 | augmented_sound = transform(my_waveform_ndarray, sample_rate=16000) 29 | ``` 30 | 31 | ## Trim API 32 | 33 | [`top_db`](#top_db){ #top_db }: `float` • unit: Decibel 34 | : :octicons-milestone-24: Default: `30.0`. The threshold value (in decibels) below which to consider silence and trim. 35 | 36 | [`p`](#p){ #p }: `float` • range: [0.0, 1.0] 37 | : :octicons-milestone-24: Default: `0.5`. The probability of applying this transform. 38 | 39 | ## Source code :octicons-mark-github-16: 40 | 41 | [audiomentations/augmentations/trim.py :octicons-link-external-16:](https://github.com/iver56/audiomentations/blob/main/audiomentations/augmentations/trim.py){target=_blank} 42 | -------------------------------------------------------------------------------- /docs/waveform_transforms/reverse.md: -------------------------------------------------------------------------------- 1 | # `Reverse` 2 | 3 | _Added in v0.18.0_ 4 | 5 | Reverse the audio. Also known as time inversion. Inversion of an audio track along its time 6 | axis relates to the random flip of an image, which is an augmentation technique that is 7 | widely used in the visual domain. This can be relevant in the context of audio 8 | classification. It was successfully applied in the paper 9 | [AudioCLIP: Extending CLIP to Image, Text and Audio :octicons-link-external-16:](https://arxiv.org/pdf/2106.13043.pdf){target=_blank}.
10 | 11 | ## Input-output example 12 | 13 | In this example, we reverse a speech recording 14 | 15 | ![Input-output waveforms and spectrograms](Reverse.webp) 16 | 17 | | Input sound | Transformed sound | 18 | |---------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------| 19 | | | | 20 | 21 | ## Usage example 22 | 23 | ```python 24 | from audiomentations import Reverse 25 | 26 | transform = Reverse(p=1.0) 27 | 28 | augmented_sound = transform(my_waveform_ndarray, sample_rate=44100) 29 | ``` 30 | 31 | # Reverse API 32 | 33 | [`p`](#p){ #p }: `float` • range: [0.0, 1.0] 34 | : :octicons-milestone-24: Default: `0.5`. The probability of applying this transform. 35 | 36 | 37 | ## Source code :octicons-mark-github-16: 38 | 39 | [audiomentations/augmentations/reverse.py :octicons-link-external-16:](https://github.com/iver56/audiomentations/blob/main/audiomentations/augmentations/reverse.py){target=_blank} 40 | -------------------------------------------------------------------------------- /audiomentations/core/audio_loading_utils.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | 3 | import librosa 4 | import numpy as np 5 | 6 | 7 | def load_sound_file(file_path, sample_rate, mono=True, resample_type="auto"): 8 | """ 9 | Load an audio file as a floating point time series. Audio will be automatically 10 | resampled to the given sample rate. 11 | 12 | :param file_path: str or Path instance that points to a sound file 13 | :param sample_rate: If not None, resample to this sample rate 14 | :param mono: If True, mix any multichannel data down to mono, and return a 1D array 15 | :param resample_type: "auto" means use "kaiser_fast" when upsampling and "kaiser_best" when 16 | downsampling 17 | """ 18 | file_path = str(file_path) 19 | samples, actual_sample_rate = librosa.load( 20 | str(file_path), sr=None, mono=mono, dtype=np.float32 21 | ) 22 | 23 | if sample_rate is not None and actual_sample_rate != sample_rate: 24 | if resample_type == "auto": 25 | if librosa.__version__.startswith("0.8."): 26 | resample_type = ( 27 | "kaiser_fast" if actual_sample_rate < sample_rate else "kaiser_best" 28 | ) 29 | else: 30 | resample_type = "soxr_hq" 31 | samples = librosa.resample( 32 | samples, 33 | orig_sr=actual_sample_rate, 34 | target_sr=sample_rate, 35 | res_type=resample_type, 36 | ) 37 | warnings.warn( 38 | "{} had to be resampled from {} Hz to {} Hz. This hurt execution time.".format( 39 | str(file_path), actual_sample_rate, sample_rate 40 | ) 41 | ) 42 | 43 | actual_sample_rate = actual_sample_rate if sample_rate is None else sample_rate 44 | 45 | if mono: 46 | assert len(samples.shape) == 1 47 | return samples, actual_sample_rate 48 | -------------------------------------------------------------------------------- /docs/waveform_transforms/seven_band_parametric_eq.md: -------------------------------------------------------------------------------- 1 | # `SevenBandParametricEQ` 2 | 3 | _Added in v0.24.0_ 4 | 5 | Adjust the volume of different frequency bands. This transform is a 7-band 6 | parametric equalizer - a combination of one low shelf filter, five peaking filters 7 | and one high shelf filter, all with randomized gains, Q values and center frequencies. 
8 | 9 | Because this transform changes the timbre, but keeps the overall "class" of the 10 | sound the same (depending on application), it can be used for data augmentation to 11 | make ML models more robust to various frequency spectrums. Many things can affect 12 | the spectrum, for example: 13 | 14 | * the nature and quality of the sound source 15 | * room acoustics 16 | * any objects between the microphone and the sound source 17 | * microphone type/model 18 | * the distance between the sound source and the microphone 19 | 20 | The seven bands have center frequencies picked in the following ranges (min-max): 21 | 22 | * 42-95 Hz 23 | * 91-204 Hz 24 | * 196-441 Hz 25 | * 421-948 Hz 26 | * 909-2045 Hz 27 | * 1957-4404 Hz 28 | * 4216-9486 Hz 29 | 30 | 31 | ## SevenBandParametricEQ API 32 | 33 | [`min_gain_db`](#min_gain_db){ #min_gain_db }: `float` • unit: Decibel 34 | : :octicons-milestone-24: Default: `-12.0`. Minimum number of dB to cut or boost a band 35 | 36 | [`max_gain_db`](#max_gain_db){ #max_gain_db }: `float` • unit: decibel 37 | : :octicons-milestone-24: Default: `12.0`. Maximum number of dB to cut or boost a band 38 | 39 | [`p`](#p){ #p }: `float` • range: [0.0, 1.0] 40 | : :octicons-milestone-24: Default: `0.5`. The probability of applying this transform. 41 | 42 | ## Source code :octicons-mark-github-16: 43 | 44 | [audiomentations/augmentations/seven_band_parametric_eq.py :octicons-link-external-16:](https://github.com/iver56/audiomentations/blob/main/audiomentations/augmentations/seven_band_parametric_eq.py){target=_blank} 45 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by .ignore support plugin (hsz.mobi) 2 | ### Python template 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | .hypothesis/ 50 | .pytest_cache/ 51 | 52 | # Translations 53 | *.mo 54 | *.pot 55 | 56 | # Django stuff: 57 | *.log 58 | local_settings.py 59 | db.sqlite3 60 | 61 | # Flask stuff: 62 | instance/ 63 | .webassets-cache 64 | 65 | # Scrapy stuff: 66 | .scrapy 67 | 68 | # Sphinx documentation 69 | docs/_build/ 70 | 71 | # PyBuilder 72 | target/ 73 | 74 | # Jupyter Notebook 75 | .ipynb_checkpoints 76 | 77 | # pyenv 78 | .python-version 79 | 80 | # celery beat schedule file 81 | celerybeat-schedule 82 | 83 | # SageMath parsed files 84 | *.sage.py 85 | 86 | # Environments 87 | .env 88 | .venv 89 | env/ 90 | venv/ 91 | ENV/ 92 | env.bak/ 93 | venv.bak/ 94 | 95 | # Spyder project settings 96 | .spyderproject 97 | .spyproject 98 | 99 | # Rope project settings 100 | .ropeproject 101 | 102 | # mkdocs documentation 103 | /site 104 | 105 | # mypy 106 | .mypy_cache/ 107 | 108 | # Jetbrains IDE 109 | .idea 110 | 111 | # Demo output 112 | demo/output/**/*.wav 113 | -------------------------------------------------------------------------------- /audiomentations/augmentations/bit_crush.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | import numpy as np 4 | from numpy.typing import NDArray 5 | 6 | from audiomentations.core.transforms_interface import BaseWaveformTransform 7 | 8 | 9 | class BitCrush(BaseWaveformTransform): 10 | """ 11 | Apply a bit crush effect to the audio by reducing the bit depth. In other words, it 12 | reduces the number of bits that can be used for representing each audio sample. 13 | This adds quantization noise, and affects dynamic range. This transform does not 14 | apply dithering. 
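    A minimal usage sketch (the parameters match the __init__ below; the input
    array here is random noise, made up for illustration):

        >>> import numpy as np
        >>> from audiomentations import BitCrush
        >>> samples = np.random.uniform(low=-1.0, high=1.0, size=1000).astype(np.float32)
        >>> transform = BitCrush(min_bit_depth=5, max_bit_depth=10, p=1.0)
        >>> crushed_samples = transform(samples=samples, sample_rate=44100)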
15 | """ 16 | 17 | supports_multichannel = True 18 | 19 | def __init__(self, min_bit_depth: int = 5, max_bit_depth: int = 10, p: float = 0.5): 20 | """ 21 | :param min_bit_depth: The minimum bit depth the audio will be "converted" to 22 | :param max_bit_depth: The maximum bit depth the audio will be "converted" to 23 | :param p: The probability of applying this transform 24 | """ 25 | super().__init__(p) 26 | self.min_bit_depth = min_bit_depth 27 | self.max_bit_depth = max_bit_depth 28 | 29 | if min_bit_depth < 1: 30 | raise ValueError("min_bit_depth must be at least 1") 31 | 32 | if max_bit_depth > 32: 33 | raise ValueError("max_bit_depth must not be greater than 32") 34 | 35 | if min_bit_depth > max_bit_depth: 36 | raise ValueError("min_bit_depth must not be larger than max_bit_depth") 37 | 38 | def randomize_parameters(self, samples: NDArray[np.float32], sample_rate: int): 39 | super().randomize_parameters(samples, sample_rate) 40 | if self.parameters["should_apply"]: 41 | self.parameters["bit_depth"] = random.randint( 42 | self.min_bit_depth, self.max_bit_depth 43 | ) 44 | 45 | def apply(self, samples: NDArray[np.float32], sample_rate: int): 46 | q = (2 ** self.parameters["bit_depth"] / 2) + 1 47 | return np.round(samples * q) / q 48 | -------------------------------------------------------------------------------- /docs/waveform_transforms/add_gaussian_noise.md: -------------------------------------------------------------------------------- 1 | # `AddGaussianNoise` 2 | 3 | _Added in v0.1.0_ 4 | 5 | Add gaussian noise to the samples 6 | 7 | ## Input-output example 8 | 9 | Here we add some gaussian noise (with amplitude 0.01) to a speech recording. 10 | 11 | ![Input-output waveforms and spectrograms](AddGaussianNoise.webp) 12 | 13 | | Input sound | Transformed sound | 14 | |-----------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------| 15 | | | | 16 | 17 | 18 | ## Usage example 19 | 20 | ```python 21 | from audiomentations import AddGaussianNoise 22 | 23 | transform = AddGaussianNoise( 24 | min_amplitude=0.001, 25 | max_amplitude=0.015, 26 | p=1.0 27 | ) 28 | 29 | augmented_sound = transform(my_waveform_ndarray, sample_rate=16000) 30 | ``` 31 | 32 | ## AddGaussianNoise API 33 | 34 | [`min_amplitude`](#min_amplitude){ #min_amplitude }: `float` • unit: linear amplitude 35 | : :octicons-milestone-24: Default: `0.001`. Minimum noise amplification factor. 36 | 37 | [`max_amplitude`](#max_amplitude){ #max_amplitude }: `float` • unit: linear amplitude 38 | : :octicons-milestone-24: Default: `0.015`. Maximum noise amplification factor. 39 | 40 | [`p`](#p){ #p }: `float` • range: [0.0, 1.0] 41 | : :octicons-milestone-24: Default: `0.5`. The probability of applying this transform. 42 | 43 | ## Source code :octicons-mark-github-16: 44 | 45 | [audiomentations/augmentations/add_gaussian_noise.py :octicons-link-external-16:](https://github.com/iver56/audiomentations/blob/main/audiomentations/augmentations/add_gaussian_noise.py){target=_blank} 46 | -------------------------------------------------------------------------------- /docs/waveform_transforms/mp3_compression.md: -------------------------------------------------------------------------------- 1 | # `Mp3Compression` 2 | 3 | _Added in v0.12.0_ 4 | 5 | Compress the audio using an MP3 encoder to lower the audio quality. This may help machine 6 | learning models deal with compressed, low-quality audio. 
7 | 8 | This transform depends on either lameenc or pydub/ffmpeg. 9 | 10 | Note that bitrates below 32 kbps are only supported for low sample rates (up to 24000 Hz). 11 | 12 | Note: When using the `"lameenc"` backend, the output may be slightly longer than the input due 13 | to the fact that the LAME encoder inserts some silence at the beginning of the audio. 14 | 15 | Warning: This transform writes to disk, so it may be slow. 16 | 17 | # Mp3Compression API 18 | 19 | [`min_bitrate`](#min_bitrate){ #min_bitrate }: `int` • unit: kbps • range: [8, `max_bitrate`] 20 | : :octicons-milestone-24: Default: `8`. Minimum bitrate in kbps 21 | 22 | [`max_bitrate`](#max_bitrate){ #max_bitrate }: `int` • unit: kbps • range: [`min_bitrate`, 320] 23 | : :octicons-milestone-24: Default: `64`. Maximum bitrate in kbps 24 | 25 | [`backend`](#backend){ #backend }: `str` • choices: `"pydub"`, `"lameenc"` 26 | : :octicons-milestone-24: Default: `"pydub"`. 27 | 28 | * `"pydub"`: May use ffmpeg under the hood. Pro: Seems to avoid introducing latency in 29 | the output. Con: Slightly slower than `"lameenc"`. 30 | * `"lameenc"`: Pro: With this backend you can set the quality parameter in addition 31 | to the bitrate (although this parameter is not exposed in the audiomentations API 32 | yet). Con: Seems to introduce some silence at the start of the audio. 33 | 34 | [`p`](#p){ #p }: `float` • range: [0.0, 1.0] 35 | : :octicons-milestone-24: Default: `0.5`. The probability of applying this transform. 36 | 37 | ## Source code :octicons-mark-github-16: 38 | 39 | [audiomentations/augmentations/mp3_compression.py :octicons-link-external-16:](https://github.com/iver56/audiomentations/blob/main/audiomentations/augmentations/mp3_compression.py){target=_blank} 40 | -------------------------------------------------------------------------------- /audiomentations/augmentations/low_pass_filter.py: -------------------------------------------------------------------------------- 1 | from audiomentations.augmentations.base_butterword_filter import BaseButterworthFilter 2 | 3 | 4 | class LowPassFilter(BaseButterworthFilter): 5 | """ 6 | Apply low-pass filtering to the input audio of parametrized filter steepness (6/12/18... dB / octave). 7 | Can also be set for zero-phase filtering (will result in a 6 dB drop at cutoff). 8 | """ 9 | 10 | supports_multichannel = True 11 | 12 | def __init__( 13 | self, 14 | min_cutoff_freq: float = 150.0, 15 | max_cutoff_freq: float = 7500.0, 16 | min_rolloff: int = 12, 17 | max_rolloff: int = 24, 18 | zero_phase: bool = False, 19 | p: float = 0.5, 20 | ): 21 | """ 22 | :param min_cutoff_freq: Minimum cutoff frequency in hertz 23 | :param max_cutoff_freq: Maximum cutoff frequency in hertz 24 | :param min_rolloff: Minimum filter roll-off (in dB/octave). 25 | Must be a multiple of 6 26 | :param max_rolloff: Maximum filter roll-off (in dB/octave) 27 | Must be a multiple of 6 28 | :param zero_phase: Whether filtering should be zero phase. 29 | When this is set to `true` it will not affect the phase of the 30 | input signal but will sound 3 dB lower at the cutoff frequency 31 | compared to the non-zero phase case (6 dB vs. 3 dB). Additionally, 32 | it is 2 times slower than in the non-zero phase case. If you 33 | absolutely want no phase distortions (e.g. want to augment a 34 | drum track), set this to `true`. 
35 | :param p: The probability of applying this transform 36 | """ 37 | super().__init__( 38 | min_cutoff_freq=min_cutoff_freq, 39 | max_cutoff_freq=max_cutoff_freq, 40 | min_rolloff=min_rolloff, 41 | max_rolloff=max_rolloff, 42 | zero_phase=zero_phase, 43 | p=p, 44 | filter_type="lowpass", 45 | ) 46 | -------------------------------------------------------------------------------- /audiomentations/augmentations/high_pass_filter.py: -------------------------------------------------------------------------------- 1 | from audiomentations.augmentations.base_butterword_filter import BaseButterworthFilter 2 | 3 | 4 | class HighPassFilter(BaseButterworthFilter): 5 | """ 6 | Apply high-pass filtering to the input audio of parametrized filter steepness (6/12/18... dB / octave). 7 | Can also be set for zero-phase filtering (will result in a 6 dB drop at cutoff). 8 | """ 9 | 10 | supports_multichannel = True 11 | 12 | def __init__( 13 | self, 14 | min_cutoff_freq: float = 20.0, 15 | max_cutoff_freq: float = 2400.0, 16 | min_rolloff: int = 12, 17 | max_rolloff: int = 24, 18 | zero_phase: bool = False, 19 | p: float = 0.5, 20 | ): 21 | """ 22 | :param min_cutoff_freq: Minimum cutoff frequency in hertz 23 | :param max_cutoff_freq: Maximum cutoff frequency in hertz 24 | :param min_rolloff: Minimum filter roll-off (in dB/octave). 25 | Must be a multiple of 6 26 | :param max_rolloff: Maximum filter roll-off (in dB/octave) 27 | Must be a multiple of 6 28 | :param zero_phase: Whether filtering should be zero phase. 29 | When this is set to `true` it will not affect the phase of the 30 | input signal but will sound 3 dB lower at the cutoff frequency 31 | compared to the non-zero phase case (6 dB vs. 3 dB). Additionally, 32 | it is 2 times slower than in the non-zero phase case. If you 33 | absolutely want no phase distortions (e.g. want to augment a 34 | drum track), set this to `true`. 35 | :param p: The probability of applying this transform 36 | """ 37 | super().__init__( 38 | min_cutoff_freq=min_cutoff_freq, 39 | max_cutoff_freq=max_cutoff_freq, 40 | min_rolloff=min_rolloff, 41 | max_rolloff=max_rolloff, 42 | zero_phase=zero_phase, 43 | p=p, 44 | filter_type="highpass", 45 | ) 46 | -------------------------------------------------------------------------------- /docs/waveform_transforms/low_pass_filter.md: -------------------------------------------------------------------------------- 1 | # `LowPassFilter` 2 | 3 | _Added in v0.18.0, updated in v0.21.0_ 4 | 5 | Apply low-pass filtering to the input audio of parametrized filter steepness (6/12/18... dB / octave). 6 | Can also be set for zero-phase filtering (will result in a 6 dB drop at cutoff). 7 | 8 | # LowPassFilter API 9 | 10 | [`min_cutoff_freq`](#min_cutoff_freq){ #min_cutoff_freq }: `float` • unit: hertz 11 | : :octicons-milestone-24: Default: `150.0`. Minimum cutoff frequency 12 | 13 | [`max_cutoff_freq`](#max_cutoff_freq){ #max_cutoff_freq }: `float` • unit: hertz 14 | : :octicons-milestone-24: Default: `7500.0`. Maximum cutoff frequency 15 | 16 | [`min_rolloff`](#min_rolloff){ #min_rolloff }: `float` • unit: Decibels/octave 17 | : :octicons-milestone-24: Default: `12`. Minimum filter roll-off (in dB/octave). 18 | Must be a multiple of 6 19 | 20 | [`max_rolloff`](#max_rolloff){ #max_rolloff }: `float` • unit: Decibels/octave 21 | : :octicons-milestone-24: Default: `24`. Maximum filter roll-off (in dB/octave) 22 | Must be a multiple of 6 23 | 24 | [`zero_phase`](#zero_phase){ #zero_phase }: `bool` 25 | : :octicons-milestone-24: Default: `False`. 
Whether filtering should be zero phase. 26 | When this is set to `True` it will not affect the phase of the input signal but will 27 | sound 3 dB lower at the cutoff frequency compared to the non-zero phase case (6 dB 28 | vs. 3 dB). Additionally, it is 2 times slower than in the non-zero phase case. If 29 | you absolutely want no phase distortions (e.g. want to augment an audio file with 30 | lots of transients, like a drum track), set this to `True`. 31 | 32 | [`p`](#p){ #p }: `float` • range: [0.0, 1.0] 33 | : :octicons-milestone-24: Default: `0.5`. The probability of applying this transform. 34 | 35 | ## Source code :octicons-mark-github-16: 36 | 37 | [audiomentations/augmentations/low_pass_filter.py :octicons-link-external-16:](https://github.com/iver56/audiomentations/blob/main/audiomentations/augmentations/low_pass_filter.py){target=_blank} 38 | -------------------------------------------------------------------------------- /docs/waveform_transforms/high_pass_filter.md: -------------------------------------------------------------------------------- 1 | # `HighPassFilter` 2 | 3 | _Added in v0.18.0, updated in v0.21.0_ 4 | 5 | Apply high-pass filtering to the input audio of parametrized filter steepness (6/12/18... dB / octave). 6 | Can also be set for zero-phase filtering (will result in a 6 dB drop at cutoff). 7 | 8 | # HighPassFilter API 9 | 10 | [`min_cutoff_freq`](#min_cutoff_freq){ #min_cutoff_freq }: `float` • unit: hertz 11 | : :octicons-milestone-24: Default: `20.0`. Minimum cutoff frequency 12 | 13 | [`max_cutoff_freq`](#max_cutoff_freq){ #max_cutoff_freq }: `float` • unit: hertz 14 | : :octicons-milestone-24: Default: `2400.0`. Maximum cutoff frequency 15 | 16 | [`min_rolloff`](#min_rolloff){ #min_rolloff }: `float` • unit: Decibels/octave 17 | : :octicons-milestone-24: Default: `12`. Minimum filter roll-off (in dB/octave). 18 | Must be a multiple of 6 19 | 20 | [`max_rolloff`](#max_rolloff){ #max_rolloff }: `float` • unit: Decibels/octave 21 | : :octicons-milestone-24: Default: `24`. Maximum filter roll-off (in dB/octave). 22 | Must be a multiple of 6 23 | 24 | [`zero_phase`](#zero_phase){ #zero_phase }: `bool` 25 | : :octicons-milestone-24: Default: `False`. Whether filtering should be zero phase. 26 | When this is set to `True` it will not affect the phase of the input signal but will 27 | sound 3 dB lower at the cutoff frequency compared to the non-zero phase case (6 dB 28 | vs. 3 dB). Additionally, it is 2 times slower than in the non-zero phase case. If 29 | you absolutely want no phase distortions (e.g. want to augment an audio file with 30 | lots of transients, like a drum track), set this to `True`. 31 | 32 | [`p`](#p){ #p }: `float` • range: [0.0, 1.0] 33 | : :octicons-milestone-24: Default: `0.5`. The probability of applying this transform. 
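## Usage example

A minimal sketch using the parameters documented above (`my_waveform_ndarray` is a placeholder for a float32 NumPy array):

```python
from audiomentations import HighPassFilter

transform = HighPassFilter(
    min_cutoff_freq=20.0,
    max_cutoff_freq=2400.0,
    zero_phase=False,
    p=1.0
)

augmented_sound = transform(my_waveform_ndarray, sample_rate=16000)
```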
34 | 35 | ## Source code :octicons-mark-github-16: 36 | 37 | [audiomentations/augmentations/high_pass_filter.py :octicons-link-external-16:](https://github.com/iver56/audiomentations/blob/main/audiomentations/augmentations/high_pass_filter.py){target=_blank} 38 | -------------------------------------------------------------------------------- /tests/test_gain.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | 3 | import numpy as np 4 | import pytest 5 | from numpy.testing import assert_almost_equal 6 | 7 | from audiomentations import Gain 8 | from audiomentations.core.transforms_interface import WrongMultichannelAudioShape 9 | 10 | 11 | class TestGain: 12 | def test_gain(self): 13 | samples = np.array([1.0, 0.5, -0.25, -0.125, 0.0], dtype=np.float32) 14 | sample_rate = 16000 15 | 16 | augment = Gain(min_gain_db=-6, max_gain_db=-6, p=1.0) 17 | processed_samples = augment(samples=samples, sample_rate=sample_rate) 18 | assert_almost_equal( 19 | processed_samples, 20 | np.array( 21 | [0.5011872, 0.2505936, -0.1252968, -0.0626484, 0.0], dtype=np.float32 22 | ), 23 | ) 24 | assert processed_samples.dtype == np.float32 25 | 26 | def test_gain_multichannel(self): 27 | samples = np.array( 28 | [[1.0, 0.5, -0.25, -0.125, 0.0], [1.0, 0.5, -0.25, -0.125, 0.0]], 29 | dtype=np.float32, 30 | ) 31 | sample_rate = 16000 32 | 33 | augment = Gain(min_gain_db=-6, max_gain_db=-6, p=1.0) 34 | processed_samples = augment(samples=samples, sample_rate=sample_rate) 35 | assert_almost_equal( 36 | processed_samples, 37 | np.array( 38 | [ 39 | [0.5011872, 0.2505936, -0.1252968, -0.0626484, 0.0], 40 | [0.5011872, 0.2505936, -0.1252968, -0.0626484, 0.0], 41 | ], 42 | dtype=np.float32, 43 | ), 44 | ) 45 | assert processed_samples.dtype == np.float32 46 | 47 | def test_gain_multichannel_with_wrong_dimension_ordering(self): 48 | samples = np.random.uniform(low=-0.5, high=0.5, size=(2000, 2)).astype( 49 | np.float32 50 | ) 51 | 52 | augment = Gain(min_gain_db=-6, max_gain_db=-6, p=1.0) 53 | 54 | with pytest.raises(WrongMultichannelAudioShape): 55 | augment(samples=samples, sample_rate=16000) 56 | -------------------------------------------------------------------------------- /tests/test_gaussian_snr.py: -------------------------------------------------------------------------------- 1 | import json 2 | import pytest 3 | 4 | import numpy as np 5 | 6 | from audiomentations import AddGaussianSNR 7 | 8 | 9 | class TestGaussianSNR: 10 | def test_gaussian_noise_snr_defaults(self): 11 | np.random.seed(42) 12 | samples_in = np.random.normal(0, 1, size=1024).astype(np.float32) 13 | augmenter = AddGaussianSNR(p=1.0) 14 | std_in = np.mean(np.abs(samples_in)) 15 | samples_out = augmenter(samples=samples_in, sample_rate=16000) 16 | std_out = np.mean(np.abs(samples_out)) 17 | assert samples_out.dtype == np.float32 18 | assert not (float(std_out) == pytest.approx(0.0)) 19 | assert std_out > std_in 20 | 21 | def test_gaussian_noise_snr(self): 22 | np.random.seed(42) 23 | samples_in = np.random.normal(0, 1, size=1024).astype(np.float32) 24 | augmenter = AddGaussianSNR(min_snr_db=15, max_snr_db=35, p=1.0) 25 | std_in = np.mean(np.abs(samples_in)) 26 | samples_out = augmenter(samples=samples_in, sample_rate=16000) 27 | std_out = np.mean(np.abs(samples_out)) 28 | assert samples_out.dtype == np.float32 29 | assert not (float(std_out) == pytest.approx(0.0)) 30 | assert std_out > std_in 31 | 32 | def test_serialize_parameters(self): 33 | np.random.seed(42) 34 | transform = 
AddGaussianSNR(min_snr_db=15, max_snr_db=35, p=1.0) 35 | samples = np.random.normal(0, 1, size=1024).astype(np.float32) 36 | transform.randomize_parameters(samples, sample_rate=16000) 37 | json.dumps(transform.serialize_parameters()) 38 | 39 | def test_gaussian_noise_snr_multichannel(self): 40 | np.random.seed(42) 41 | samples = np.random.normal(0, 0.1, size=(3, 8888)).astype(np.float32) 42 | augmenter = AddGaussianSNR(min_snr_db=15, max_snr_db=35, p=1.0) 43 | samples_out = augmenter(samples=samples, sample_rate=16000) 44 | 45 | assert samples_out.dtype == np.float32 46 | assert float(np.sum(np.abs(samples_out))) > float(np.sum(np.abs(samples))) 47 | 48 | -------------------------------------------------------------------------------- /tests/test_spec_compose.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from numpy.testing import assert_array_equal 3 | 4 | from audiomentations import SpecChannelShuffle, SpecFrequencyMask, SpecCompose 5 | 6 | 7 | class TestSpecCompose: 8 | def test_freeze_and_unfreeze_parameters(self): 9 | spectrogram = np.random.random((256, 256, 3)) 10 | augmenter = SpecCompose( 11 | [ 12 | SpecChannelShuffle(p=1.0), 13 | SpecFrequencyMask(p=1.0), 14 | ] 15 | ) 16 | perturbed_samples1 = augmenter(magnitude_spectrogram=spectrogram) 17 | augmenter.freeze_parameters() 18 | for transform in augmenter.transforms: 19 | assert transform.are_parameters_frozen == True 20 | perturbed_samples2 = augmenter(magnitude_spectrogram=spectrogram) 21 | 22 | assert_array_equal(perturbed_samples1, perturbed_samples2) 23 | 24 | augmenter.unfreeze_parameters() 25 | for transform in augmenter.transforms: 26 | assert transform.are_parameters_frozen == False 27 | 28 | def test_randomize_parameters_and_apply(self): 29 | spectrogram = np.random.random((256, 256, 3)) 30 | augmenter = SpecCompose( 31 | [ 32 | SpecChannelShuffle(p=1.0), 33 | SpecFrequencyMask(p=1.0), 34 | ] 35 | ) 36 | augmenter.freeze_parameters() 37 | augmenter.randomize_parameters(magnitude_spectrogram=spectrogram) 38 | 39 | parameters = [transform.parameters for transform in augmenter.transforms] 40 | 41 | perturbed_samples1 = augmenter(magnitude_spectrogram=spectrogram) 42 | perturbed_samples2 = augmenter(magnitude_spectrogram=spectrogram) 43 | 44 | assert_array_equal(perturbed_samples1, perturbed_samples2) 45 | 46 | augmenter.unfreeze_parameters() 47 | 48 | for transform_parameters, transform in zip(parameters, augmenter.transforms): 49 | assert transform_parameters == transform.parameters 50 | assert transform.are_parameters_frozen == False 51 | -------------------------------------------------------------------------------- /tests/test_adjust_duration.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | import numpy as np 4 | import pytest 5 | 6 | from audiomentations import AdjustDuration 7 | 8 | 9 | class TestAdjustDuration: 10 | @pytest.mark.parametrize("mode", ["silence", "wrap", "reflect"]) 11 | @pytest.mark.parametrize("pad_section", ["start", "end"]) 12 | @pytest.mark.parametrize("sample_len", [3, 4, 5]) 13 | @pytest.mark.parametrize("ndim", [None, 1, 2]) 14 | def test_padding(self, mode, pad_section, sample_len, ndim): 15 | random.seed(546) 16 | samples = np.ones((ndim, 4) if ndim else 4, dtype=np.float32) 17 | sample_rate = 16000 18 | input_shape = samples.shape 19 | target_shape = list(input_shape) 20 | target_shape[-1] = sample_len 21 | target_shape = tuple(target_shape) 22 | augmenter = 
AdjustDuration( 23 | duration_samples=sample_len, padding_mode=mode, padding_position=pad_section, p=1.0 24 | ) 25 | samples = augmenter(samples=samples, sample_rate=sample_rate) 26 | 27 | assert samples.dtype == np.float32 28 | assert samples.shape == target_shape 29 | 30 | @pytest.mark.parametrize("mode", ["silence", "wrap", "reflect"]) 31 | @pytest.mark.parametrize("pad_section", ["start", "end"]) 32 | @pytest.mark.parametrize("second", [0.4, 0.5, 0.6]) 33 | @pytest.mark.parametrize("ndim", [None, 1, 2]) 34 | def test_padding_second(self, mode, pad_section, second, ndim): 35 | random.seed(546) 36 | sample_rate = 80 37 | samples = np.ones((ndim, 40) if ndim else 40, dtype=np.float32) 38 | input_shape = samples.shape 39 | target_shape = list(input_shape) 40 | target_shape[-1] = int(second * sample_rate) 41 | target_shape = tuple(target_shape) 42 | augmenter = AdjustDuration( 43 | duration_seconds=second, padding_mode=mode, padding_position=pad_section, p=1.0 44 | ) 45 | samples = augmenter(samples=samples, sample_rate=sample_rate) 46 | 47 | assert samples.dtype == np.float32 48 | assert samples.shape == target_shape 49 | -------------------------------------------------------------------------------- /docs/waveform_transforms/low_shelf_filter.md: -------------------------------------------------------------------------------- 1 | # `LowShelfFilter` 2 | 3 | _Added in v0.21.0_ 4 | 5 | A low shelf filter is a filter that either boosts (increases amplitude) or cuts 6 | (decreases amplitude) frequencies below a certain center frequency. This transform 7 | applies a low-shelf filter at a specific center frequency in hertz. 8 | The gain at DC frequency is controlled by `{min,max}_gain_db` (note: can be positive or negative!). 9 | Filter coefficients are taken from [the W3 Audio EQ Cookbook :octicons-link-external-16:](https://www.w3.org/TR/audio-eq-cookbook/) 10 | 11 | # LowShelfFilter API 12 | 13 | [`min_center_freq`](#min_center_freq){ #min_center_freq }: `float` • unit: hertz 14 | : :octicons-milestone-24: Default: `50.0`. The minimum center frequency of the shelving filter 15 | 16 | [`max_center_freq`](#max_center_freq){ #max_center_freq }: `float` • unit: hertz 17 | : :octicons-milestone-24: Default: `4000.0`. The maximum center frequency of the shelving filter 18 | 19 | [`min_gain_db`](#min_gain_db){ #min_gain_db }: `float` • unit: Decibel 20 | : :octicons-milestone-24: Default: `-18.0`. The minimum gain at DC (0 Hz) 21 | 22 | [`max_gain_db`](#max_gain_db){ #max_gain_db }: `float` • unit: Decibel 23 | : :octicons-milestone-24: Default: `18.0`. The maximum gain at DC (0 Hz) 24 | 25 | [`min_q`](#min_q){ #min_q }: `float` • range: (0.0, 1.0] 26 | : :octicons-milestone-24: Default: `0.1`. The minimum quality factor Q. The higher 27 | the Q, the steeper the transition band will be. 28 | 29 | [`max_q`](#max_q){ #max_q }: `float` • range: (0.0, 1.0] 30 | : :octicons-milestone-24: Default: `0.999`. The maximum quality factor Q. The higher 31 | the Q, the steeper the transition band will be. 32 | 33 | [`p`](#p){ #p }: `float` • range: [0.0, 1.0] 34 | : :octicons-milestone-24: Default: `0.5`. The probability of applying this transform. 
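
## Usage example

A minimal usage sketch. The values mirror the defaults above and are illustrative; `my_waveform_ndarray` is a placeholder for a float32 NumPy array, as in the other usage examples in these docs:

```python
from audiomentations import LowShelfFilter

transform = LowShelfFilter(
    min_center_freq=50.0,
    max_center_freq=4000.0,
    min_gain_db=-18.0,
    max_gain_db=18.0,
    p=1.0,
)

augmented_sound = transform(my_waveform_ndarray, sample_rate=44100)
```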
35 | 36 | ## Source code :octicons-mark-github-16: 37 | 38 | [audiomentations/augmentations/low_shelf_filter.py :octicons-link-external-16:](https://github.com/iver56/audiomentations/blob/main/audiomentations/augmentations/low_shelf_filter.py){target=_blank} 39 | -------------------------------------------------------------------------------- /docs/waveform_transforms/high_shelf_filter.md: -------------------------------------------------------------------------------- 1 | # `HighShelfFilter` 2 | 3 | _Added in v0.21.0_ 4 | 5 | A high shelf filter is a filter that either boosts (increases amplitude) or cuts 6 | (decreases amplitude) frequencies above a certain center frequency. This transform 7 | applies a high-shelf filter at a specific center frequency in hertz. 8 | The gain at the Nyquist frequency is controlled by `{min,max}_gain_db` (note: can be positive or negative!). 9 | Filter coefficients are taken from [the W3 Audio EQ Cookbook :octicons-link-external-16:](https://www.w3.org/TR/audio-eq-cookbook/) 10 | 11 | # HighShelfFilter API 12 | 13 | [`min_center_freq`](#min_center_freq){ #min_center_freq }: `float` • unit: hertz 14 | : :octicons-milestone-24: Default: `300.0`. The minimum center frequency of the shelving filter 15 | 16 | [`max_center_freq`](#max_center_freq){ #max_center_freq }: `float` • unit: hertz 17 | : :octicons-milestone-24: Default: `7500.0`. The maximum center frequency of the shelving filter 18 | 19 | [`min_gain_db`](#min_gain_db){ #min_gain_db }: `float` • unit: Decibel 20 | : :octicons-milestone-24: Default: `-18.0`. The minimum gain at the Nyquist frequency 21 | 22 | [`max_gain_db`](#max_gain_db){ #max_gain_db }: `float` • unit: Decibel 23 | : :octicons-milestone-24: Default: `18.0`. The maximum gain at the Nyquist frequency 24 | 25 | [`min_q`](#min_q){ #min_q }: `float` • range: (0.0, 1.0] 26 | : :octicons-milestone-24: Default: `0.1`. The minimum quality factor Q. The higher 27 | the Q, the steeper the transition band will be. 28 | 29 | [`max_q`](#max_q){ #max_q }: `float` • range: (0.0, 1.0] 30 | : :octicons-milestone-24: Default: `0.999`. The maximum quality factor Q. The higher 31 | the Q, the steeper the transition band will be. 32 | 33 | [`p`](#p){ #p }: `float` • range: [0.0, 1.0] 34 | : :octicons-milestone-24: Default: `0.5`. The probability of applying this transform. 35 | 36 | ## Source code :octicons-mark-github-16: 37 | 38 | [audiomentations/augmentations/high_shelf_filter.py :octicons-link-external-16:](https://github.com/iver56/audiomentations/blob/main/audiomentations/augmentations/high_shelf_filter.py){target=_blank} 39 | -------------------------------------------------------------------------------- /docs/waveform_transforms/loudness_normalization.md: -------------------------------------------------------------------------------- 1 | # `LoudnessNormalization` 2 | 3 | _Added in v0.14.0_ 4 | 5 | Apply a constant amount of gain to match a specific loudness (in LUFS). This is an 6 | implementation of ITU-R BS.1770-4.
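
A minimal usage sketch. The loudness targets simply repeat the defaults below and are illustrative; `my_waveform_ndarray` is a placeholder for a float32 NumPy array, as in the other usage examples in these docs:

```python
from audiomentations import LoudnessNormalization

transform = LoudnessNormalization(min_lufs=-31.0, max_lufs=-13.0, p=1.0)

# The input must be long enough for a valid loudness measurement;
# too-short inputs raise a ValueError
augmented_sound = transform(my_waveform_ndarray, sample_rate=16000)
```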
7 | 8 | For an explanation on LUFS, see [https://en.wikipedia.org/wiki/LUFS :octicons-link-external-16:](https://en.wikipedia.org/wiki/LUFS){target=_blank} 9 | 10 | See also the following web pages for more info on audio loudness normalization: 11 | 12 | * [https://github.com/csteinmetz1/pyloudnorm :octicons-link-external-16:](https://github.com/csteinmetz1/pyloudnorm){target=_blank} 13 | * [https://en.wikipedia.org/wiki/Audio_normalization :octicons-link-external-16:](https://en.wikipedia.org/wiki/Audio_normalization){target=_blank} 14 | 15 | Warning: This transform can return samples outside the [-1, 1] range, which may lead to 16 | clipping or wrap distortion, depending on what you do with the audio in a later stage. 17 | See also [https://en.wikipedia.org/wiki/Clipping_(audio)#Digital_clipping :octicons-link-external-16:](https://en.wikipedia.org/wiki/Clipping_(audio)#Digital_clipping) 18 | 19 | # LoudnessNormalization API 20 | 21 | [`min_lufs_in_db`](#min_lufs_in_db){ #min_lufs_in_db }: `float` • unit: LUFS 22 | : :warning: Deprecated as of v0.31.0. Use [`min_lufs`](#min_lufs) instead 23 | 24 | [`max_lufs_in_db`](#max_lufs_in_db){ #max_lufs_in_db }: `float` • unit: LUFS 25 | : :warning: Deprecated as of v0.31.0. Use [`max_lufs`](#max_lufs) instead 26 | 27 | [`min_lufs`](#min_lufs){ #min_lufs }: `float` • unit: LUFS 28 | : :octicons-milestone-24: Default: `-31.0`. Minimum loudness target 29 | 30 | [`max_lufs`](#max_lufs){ #max_lufs }: `float` • unit: LUFS 31 | : :octicons-milestone-24: Default: `-13.0`. Maximum loudness target 32 | 33 | [`p`](#p){ #p }: `float` • range: [0.0, 1.0] 34 | : :octicons-milestone-24: Default: `0.5`. The probability of applying this transform. 35 | 36 | ## Source code :octicons-mark-github-16: 37 | 38 | [audiomentations/augmentations/loudness_normalization.py :octicons-link-external-16:](https://github.com/iver56/audiomentations/blob/main/audiomentations/augmentations/loudness_normalization.py){target=_blank} 39 | -------------------------------------------------------------------------------- /tests/test_pitch_shift.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | 3 | import numpy as np 4 | from numpy.testing import assert_array_equal 5 | 6 | from audiomentations import PitchShift, Compose 7 | 8 | 9 | class TestPitchShift: 10 | def test_apply_pitch_shift(self): 11 | samples = np.zeros((2048,), dtype=np.float32) 12 | sample_rate = 16000 13 | augmenter = Compose([PitchShift(min_semitones=-2, max_semitones=-1, p=1.0)]) 14 | samples = augmenter(samples=samples, sample_rate=sample_rate) 15 | 16 | assert samples.dtype == np.float32 17 | assert samples.shape[-1] == 2048 18 | 19 | def test_apply_pitch_shift_multichannel(self): 20 | num_channels = 3 21 | samples = np.random.normal(0, 0.1, size=(num_channels, 5555)).astype(np.float32) 22 | sample_rate = 16000 23 | augmenter = Compose([PitchShift(min_semitones=1, max_semitones=2, p=1.0)]) 24 | samples_out = augmenter(samples=samples, sample_rate=sample_rate) 25 | 26 | assert samples_out.dtype == np.float32 27 | assert samples_out.shape == samples.shape 28 | for i in range(num_channels): 29 | assert not np.allclose(samples[i], samples_out[i]) 30 | 31 | def test_freeze_parameters(self): 32 | """ 33 | Test that the transform can freeze its parameters, e.g. to apply the effect with the 34 | same parameters to multiple sounds. 
35 | """ 36 | samples = np.sin(np.linspace(0, 440 * 2 * np.pi, 8000)).astype(np.float32) 37 | sample_rate = 16000 38 | augmenter = Compose([PitchShift(min_semitones=1, max_semitones=12, p=1.0)]) 39 | 40 | first_samples = augmenter(samples=samples, sample_rate=sample_rate) 41 | first_parameters = deepcopy(augmenter.transforms[0].parameters) 42 | 43 | augmenter.transforms[0].min_semitones = -12 44 | augmenter.transforms[0].max_semitones = -1 45 | augmenter.transforms[0].are_parameters_frozen = True 46 | second_samples = augmenter(samples=samples, sample_rate=sample_rate) 47 | 48 | assert first_parameters == augmenter.transforms[0].parameters 49 | assert_array_equal(first_samples, second_samples) 50 | -------------------------------------------------------------------------------- /tests/test_loudness_normalization.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import numpy as np 4 | from numpy.testing import assert_almost_equal 5 | 6 | from audiomentations import LoudnessNormalization 7 | 8 | 9 | class TestLoudnessNormalization: 10 | def test_loudness_normalization(self): 11 | samples = np.random.uniform(low=-0.2, high=-0.001, size=(8000,)).astype( 12 | np.float32 13 | ) 14 | sample_rate = 16000 15 | 16 | augment = LoudnessNormalization(min_lufs=-32, max_lufs=-12, p=1.0) 17 | processed_samples = augment(samples=samples, sample_rate=sample_rate) 18 | gain_factors = processed_samples / samples 19 | assert np.amin(gain_factors) == pytest.approx(np.amax(gain_factors)) 20 | assert processed_samples.dtype == np.float32 21 | 22 | def test_loudness_normalization_digital_silence(self): 23 | samples = np.zeros(8000, dtype=np.float32) 24 | sample_rate = 16000 25 | 26 | augment = LoudnessNormalization(min_lufs=-32, max_lufs=-12, p=1.0) 27 | processed_samples = augment(samples=samples, sample_rate=sample_rate) 28 | assert_almost_equal(processed_samples, np.zeros(8000, dtype=np.float32)) 29 | assert processed_samples.dtype == np.float32 30 | 31 | def test_loudness_normalization_too_short_input(self): 32 | samples = np.random.uniform(low=-0.2, high=-0.001, size=(800,)).astype( 33 | np.float32 34 | ) 35 | sample_rate = 16000 36 | 37 | augment = LoudnessNormalization(min_lufs=-32, max_lufs=-12, p=1.0) 38 | with pytest.raises(ValueError): 39 | _ = augment(samples=samples, sample_rate=sample_rate) 40 | 41 | def test_loudness_normalization_multichannel(self): 42 | samples = np.random.uniform(low=-0.2, high=-0.001, size=(3, 8000)).astype( 43 | np.float32 44 | ) 45 | sample_rate = 16000 46 | 47 | augment = LoudnessNormalization(min_lufs=-32, max_lufs=-12, p=1.0) 48 | processed_samples = augment(samples=samples, sample_rate=sample_rate) 49 | gain_factors = processed_samples / samples 50 | assert np.amin(gain_factors) == pytest.approx(np.amax(gain_factors)) 51 | assert processed_samples.dtype == np.float32 52 | -------------------------------------------------------------------------------- /audiomentations/augmentations/clipping_distortion.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | import numpy as np 4 | from numpy.typing import NDArray 5 | 6 | from audiomentations.core.transforms_interface import BaseWaveformTransform 7 | 8 | 9 | class ClippingDistortion(BaseWaveformTransform): 10 | """Distort signal by clipping a random percentage of points 11 | 12 | The percentage of points that will be clipped is drawn from a uniform distribution between 13 | the two input parameters 
min_percentile_threshold and max_percentile_threshold. If for instance 14 | 30% is drawn, the samples are clipped if they're below the 15th or above the 85th percentile. 15 | """ 16 | 17 | supports_multichannel = True 18 | 19 | def __init__( 20 | self, 21 | min_percentile_threshold: int = 0, 22 | max_percentile_threshold: int = 40, 23 | p: float = 0.5, 24 | ): 25 | """ 26 | :param min_percentile_threshold: int, A lower bound on the total percent of samples that 27 | will be clipped 28 | :param max_percentile_threshold: int, An upper bound on the total percent of samples that 29 | will be clipped 30 | :param p: The probability of applying this transform 31 | """ 32 | super().__init__(p) 33 | assert min_percentile_threshold <= max_percentile_threshold 34 | assert 0 <= min_percentile_threshold <= 100 35 | assert 0 <= max_percentile_threshold <= 100 36 | self.min_percentile_threshold = min_percentile_threshold 37 | self.max_percentile_threshold = max_percentile_threshold 38 | 39 | def randomize_parameters(self, samples: NDArray[np.float32], sample_rate: int): 40 | super().randomize_parameters(samples, sample_rate) 41 | if self.parameters["should_apply"]: 42 | self.parameters["percentile_threshold"] = random.randint( 43 | self.min_percentile_threshold, self.max_percentile_threshold 44 | ) 45 | 46 | def apply(self, samples: NDArray[np.float32], sample_rate: int): 47 | lower_percentile_threshold = int(self.parameters["percentile_threshold"] / 2) 48 | lower_threshold, upper_threshold = np.percentile( 49 | samples, [lower_percentile_threshold, 100 - lower_percentile_threshold] 50 | ) 51 | samples = np.clip(samples, lower_threshold, upper_threshold) 52 | return samples 53 | -------------------------------------------------------------------------------- /docs/waveform_transforms/bit_crush.md: -------------------------------------------------------------------------------- 1 | # `BitCrush` 2 | 3 | _Added in v0.35.0_ 4 | 5 | Apply a bit crush effect to the audio by reducing the bit depth. In other words, it 6 | reduces the number of bits that can be used for representing each audio sample. 7 | This adds quantization noise, and affects dynamic range. This transform does not apply 8 | dithering. 
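
To make the quantization idea concrete, here is a rough sketch of uniform bit depth reduction without dithering. It illustrates the principle only and is not necessarily the exact implementation of this transform:

```python
import numpy as np

def naive_bit_crush(samples: np.ndarray, bit_depth: int) -> np.ndarray:
    # Step size of a symmetric quantizer with bit_depth bits over [-1, 1]
    step = 2.0 ** -(bit_depth - 1)
    # Snap each sample to the nearest step; the rounding error is the quantization noise
    return (np.round(samples / step) * step).astype(np.float32)
```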
9 | 10 | For more information, see 11 | 12 | * [Resolution reduction :octicons-link-external-16:](https://en.wikipedia.org/wiki/Bitcrusher#Resolution_reduction){target=_blank} on Wikipedia 13 | * [Intro to bit reduction :octicons-link-external-16:](http://gdsp.hf.ntnu.no/lessons/1/4/){target=_blank} by NTNU, Department of Music, Music Technology 14 | 15 | ## Input-output example 16 | 17 | Here we reduce the bit depth from 16 to 6 bits per sample 18 | 19 | ![Input-output waveforms and spectrograms](BitCrush.webp) 20 | 21 | | Input sound | Transformed sound | 22 | |---------------------------------------------------------------------------------|---------------------------------------------------------------------------------------| 23 | | | | 24 | 25 | ## Usage example 26 | 27 | ```python 28 | from audiomentations import BitCrush 29 | 30 | transform = BitCrush(min_bit_depth=5, max_bit_depth=14, p=1.0) 31 | 32 | augmented_sound = transform(my_waveform_ndarray, sample_rate=16000) 33 | ``` 34 | 35 | # BitCrush API 36 | 37 | [`min_bit_depth`](#min_bit_depth){ #min_bit_depth }: `int` • unit: bits • range: [1, 32] 38 | : :octicons-milestone-24: Minimum bit depth the audio will be "converted" to 39 | 40 | [`max_bit_depth`](#max_bit_depth){ #max_bit_depth }: `int` • unit: bits • range: [1, 32] 41 | : :octicons-milestone-24: Maximum bit depth the audio will be "converted" to 42 | 43 | [`p`](#p){ #p }: `float` • range: [0.0, 1.0] 44 | : :octicons-milestone-24: Default: `0.5`. The probability of applying this transform. 45 | 46 | ## Source code :octicons-mark-github-16: 47 | 48 | [audiomentations/augmentations/bit_crush.py :octicons-link-external-16:](https://github.com/iver56/audiomentations/blob/main/audiomentations/augmentations/bit_crush.py){target=_blank} 49 | -------------------------------------------------------------------------------- /docs/waveform_transforms/time_mask.md: -------------------------------------------------------------------------------- 1 | # `TimeMask` 2 | 3 | _Added in v0.7.0_ 4 | 5 | Make a randomly chosen part of the audio silent. Inspired by 6 | [https://arxiv.org/pdf/1904.08779.pdf](https://arxiv.org/pdf/1904.08779.pdf) 7 | 8 | 9 | ## Input-output example 10 | 11 | Here we silence a part of a speech recording. 12 | 13 | ![Input-output waveforms and spectrograms](TimeMask.webp) 14 | 15 | | Input sound | Transformed sound | 16 | |-------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------| 17 | | | | 18 | 19 | 20 | ## Usage example 21 | 22 | ```python 23 | from audiomentations import TimeMask 24 | 25 | transform = TimeMask( 26 | min_band_part=0.1, 27 | max_band_part=0.15, 28 | fade=True, 29 | p=1.0, 30 | ) 31 | 32 | augmented_sound = transform(my_waveform_ndarray, sample_rate=16000) 33 | ``` 34 | 35 | ## TimeMask API 36 | 37 | [`min_band_part`](#min_band_part){ #min_band_part }: `float` • range: [0.0, 1.0] 38 | : :octicons-milestone-24: Default: `0.0`. Minimum length of the silent part as a 39 | fraction of the total sound length. 40 | 41 | [`max_band_part`](#max_band_part){ #max_band_part }: `float` • range: [0.0, 1.0] 42 | : :octicons-milestone-24: Default: `0.5`. Maximum length of the silent part as a 43 | fraction of the total sound length. 44 | 45 | [`fade`](#fade){ #fade }: `bool` 46 | : :octicons-milestone-24: Default: `False`. When set to `True`, add a linear fade in 47 | and fade out of the silent part. 
This can smooth out an unwanted abrupt change 48 | between two consecutive samples (which sounds like a transient/click/pop). 49 | 50 | [`p`](#p){ #p }: `float` • range: [0.0, 1.0] 51 | : :octicons-milestone-24: Default: `0.5`. The probability of applying this transform. 52 | 53 | ## Source code :octicons-mark-github-16: 54 | 55 | [audiomentations/augmentations/time_mask.py :octicons-link-external-16:](https://github.com/iver56/audiomentations/blob/main/audiomentations/augmentations/time_mask.py){target=_blank} 56 | -------------------------------------------------------------------------------- /docs/guides/transform_parameters.md: -------------------------------------------------------------------------------- 1 | # Transform parameters 2 | 3 | ## How to obtain the chosen parameters after calling a transform 4 | 5 | You can access the `parameters` property of a transform. Code example: 6 | 7 | ```python 8 | from audiomentations import Compose, AddGaussianNoise, TimeStretch, PitchShift, Shift 9 | import numpy as np 10 | 11 | augment = Compose([ 12 | AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=0.5), 13 | TimeStretch(min_rate=0.8, max_rate=1.25, p=0.5), 14 | PitchShift(min_semitones=-4, max_semitones=4, p=0.5), 15 | Shift(min_fraction=-0.5, max_fraction=0.5, p=0.5), 16 | ]) 17 | 18 | # Generate 2 seconds of dummy audio for the sake of example 19 | samples = np.random.uniform(low=-0.2, high=0.2, size=(32000,)).astype(np.float32) 20 | 21 | # Augment/transform/perturb the audio data 22 | augmented_samples = augment(samples=samples, sample_rate=16000) 23 | 24 | for transform in augment.transforms: 25 | print(f"{transform.__class__.__name__}: {transform.parameters}") 26 | ``` 27 | 28 | When running the example code above, it may print something like this: 29 | ``` 30 | AddGaussianNoise: {'should_apply': True, 'amplitude': 0.0027702725003923272} 31 | TimeStretch: {'should_apply': True, 'rate': 1.158377360016495} 32 | PitchShift: {'should_apply': False} 33 | Shift: {'should_apply': False} 34 | ``` 35 | 36 | ## How to apply a transform with the same parameters to multiple inputs 37 | 38 | This technique can be useful if you want to transform, for example, a target sound in the same way as an input sound.
Code example: 39 | 40 | ```python 41 | from audiomentations import Gain 42 | import numpy as np 43 | 44 | augment = Gain(p=1.0) 45 | 46 | samples = np.random.uniform(low=-0.2, high=0.2, size=(32000,)).astype(np.float32) 47 | samples2 = np.random.uniform(low=-0.2, high=0.2, size=(32000,)).astype(np.float32) 48 | 49 | augmented_samples = augment(samples=samples, sample_rate=16000) 50 | augment.freeze_parameters() 51 | print(augment.parameters) 52 | augmented_samples2 = augment(samples=samples2, sample_rate=16000) 53 | print(augment.parameters) 54 | augment.unfreeze_parameters() 55 | ``` 56 | 57 | When running the example code above, it may print something like this: 58 | 59 | ``` 60 | {'should_apply': True, 'amplitude_ratio': 0.9688148624484364} 61 | {'should_apply': True, 'amplitude_ratio': 0.9688148624484364} 62 | ``` 63 | 64 | In other words, this means that both sounds (`samples` and `samples2`) were gained by the same amount. 65 | -------------------------------------------------------------------------------- /audiomentations/augmentations/band_pass_filter.py: -------------------------------------------------------------------------------- 1 | from audiomentations.augmentations.base_butterword_filter import BaseButterworthFilter 2 | 3 | 4 | class BandPassFilter(BaseButterworthFilter): 5 | """ 6 | Apply band-pass filtering to the input audio. Filter steepness (6/12/18... dB / octave) 7 | is parametrized. Can also be set for zero-phase filtering (will result in a 6 dB drop at 8 | cutoffs). 9 | """ 10 | 11 | supports_multichannel = True 12 | 13 | def __init__( 14 | self, 15 | min_center_freq: float = 200.0, 16 | max_center_freq: float = 4000.0, 17 | min_bandwidth_fraction: float = 0.5, 18 | max_bandwidth_fraction: float = 1.99, 19 | min_rolloff: int = 12, 20 | max_rolloff: int = 24, 21 | zero_phase: bool = False, 22 | p: float = 0.5, 23 | ): 24 | """ 25 | :param min_center_freq: Minimum center frequency in hertz 26 | :param max_center_freq: Maximum center frequency in hertz 27 | :param min_bandwidth_fraction: Minimum bandwidth relative to center frequency 28 | :param max_bandwidth_fraction: Maximum bandwidth relative to center frequency 29 | :param min_rolloff: Minimum filter roll-off (in dB/octave). 30 | Must be a multiple of 6 31 | :param max_rolloff: Maximum filter roll-off (in dB/octave) 32 | Must be a multiple of 6 33 | :param zero_phase: Whether filtering should be zero phase. 34 | When this is set to `True` it will not affect the phase of the 35 | input signal but will sound 3 dB lower at the cutoff frequency 36 | compared to the non-zero phase case (6 dB vs 3 dB). Additionally, 37 | it is 2 times slower than in the non-zero phase case. If you 38 | absolutely want no phase distortions (e.g. want to augment an 39 | audio file with lots of transients, like a drum track), set 40 | this to `True`.
41 | :param p: The probability of applying this transform 42 | """ 43 | super().__init__( 44 | min_center_freq=min_center_freq, 45 | max_center_freq=max_center_freq, 46 | min_bandwidth_fraction=min_bandwidth_fraction, 47 | max_bandwidth_fraction=max_bandwidth_fraction, 48 | min_rolloff=min_rolloff, 49 | max_rolloff=max_rolloff, 50 | zero_phase=zero_phase, 51 | p=p, 52 | filter_type="bandpass", 53 | ) 54 | -------------------------------------------------------------------------------- /audiomentations/__init__.py: -------------------------------------------------------------------------------- 1 | from .augmentations.add_background_noise import AddBackgroundNoise 2 | from .augmentations.add_gaussian_noise import AddGaussianNoise 3 | from .augmentations.add_gaussian_snr import AddGaussianSNR 4 | from .augmentations.add_color_noise import AddColorNoise, NOISE_COLOR_DECAYS 5 | from .augmentations.add_short_noises import AddShortNoises 6 | from .augmentations.adjust_duration import AdjustDuration 7 | from .augmentations.air_absorption import AirAbsorption 8 | from .augmentations.aliasing import Aliasing 9 | from .augmentations.apply_impulse_response import ApplyImpulseResponse 10 | from .augmentations.band_pass_filter import BandPassFilter 11 | from .augmentations.band_stop_filter import BandStopFilter 12 | from .augmentations.bit_crush import BitCrush 13 | from .augmentations.clip import Clip 14 | from .augmentations.clipping_distortion import ClippingDistortion 15 | from .augmentations.gain import Gain 16 | from .augmentations.gain_transition import GainTransition 17 | from .augmentations.high_pass_filter import HighPassFilter 18 | from .augmentations.high_shelf_filter import HighShelfFilter 19 | from .augmentations.lambda_transform import Lambda 20 | from .augmentations.limiter import Limiter 21 | from .augmentations.loudness_normalization import LoudnessNormalization 22 | from .augmentations.low_pass_filter import LowPassFilter 23 | from .augmentations.low_shelf_filter import LowShelfFilter 24 | from .augmentations.mp3_compression import Mp3Compression 25 | from .augmentations.normalize import Normalize 26 | from .augmentations.padding import Padding 27 | from .augmentations.peaking_filter import PeakingFilter 28 | from .augmentations.pitch_shift import PitchShift 29 | from .augmentations.polarity_inversion import PolarityInversion 30 | from .augmentations.repeat_part import RepeatPart 31 | from .augmentations.resample import Resample 32 | from .augmentations.reverse import Reverse 33 | from .augmentations.room_simulator import RoomSimulator 34 | from .augmentations.seven_band_parametric_eq import SevenBandParametricEQ 35 | from .augmentations.shift import Shift 36 | from .augmentations.tanh_distortion import TanhDistortion 37 | from .augmentations.time_mask import TimeMask 38 | from .augmentations.time_stretch import TimeStretch 39 | from .augmentations.trim import Trim 40 | from .core.composition import Compose, SpecCompose, OneOf, SomeOf 41 | from .spec_augmentations.spec_channel_shuffle import SpecChannelShuffle 42 | from .spec_augmentations.spec_frequency_mask import SpecFrequencyMask 43 | 44 | __version__ = "0.37.0" 45 | -------------------------------------------------------------------------------- /tests/test_limiter.py: -------------------------------------------------------------------------------- 1 | import json 2 | import random 3 | 4 | import fast_align_audio 5 | import numpy as np 6 | import pytest 7 | 8 | from audiomentations import Limiter 9 | 10 | 11 | class TestLimiter: 
12 | @pytest.mark.parametrize( 13 | "samples_in", 14 | [ 15 | np.random.normal(0, 1, size=1000).astype(np.float32), 16 | np.random.normal(0, 0.001, size=(1, 250)).astype(np.float32), 17 | np.random.normal(0, 0.1, size=(3, 8888)).astype(np.float32), 18 | ], 19 | ) 20 | def test_limiter(self, samples_in): 21 | augmenter = Limiter(p=1.0, min_attack=0.0025, max_attack=0.0025) 22 | std_in = np.mean(np.abs(samples_in)) 23 | samples_out = augmenter(samples=samples_in, sample_rate=16000) 24 | std_out = np.mean(np.abs(samples_out)) 25 | length = samples_in.shape[-1] 26 | 27 | samples_in_mono = samples_in 28 | samples_out_mono = samples_out 29 | if samples_in_mono.ndim > 1: 30 | samples_in_mono = samples_in_mono[0] 31 | samples_out_mono = samples_out_mono[0] 32 | offset, _ = fast_align_audio.find_best_alignment_offset( 33 | reference_signal=samples_in_mono, 34 | delayed_signal=samples_out_mono, 35 | max_offset_samples=length // 2, 36 | lookahead_samples=length // 2, 37 | ) 38 | # Check that the output is aligned with the input, i.e. no delay was introduced 39 | assert offset == 0 40 | 41 | assert samples_out.dtype == np.float32 42 | assert samples_out.shape == samples_in.shape 43 | assert std_out < std_in 44 | 45 | def test_limiter_validation(self): 46 | with pytest.raises(AssertionError): 47 | Limiter(min_attack=-0.5) 48 | 49 | def test_serialize_parameters(self): 50 | random.seed(42) 51 | transform = Limiter(p=1.0) 52 | samples = np.random.normal(0, 1, size=1024).astype(np.float32) 53 | transform.randomize_parameters(samples, sample_rate=16000) 54 | json.dumps(transform.serialize_parameters()) 55 | 56 | def test_digital_silence(self): 57 | samples_in = np.zeros((1024,), np.float32) 58 | augmenter = Limiter(p=1.0) 59 | std_in = np.mean(np.abs(samples_in)) 60 | samples_out = augmenter(samples=samples_in, sample_rate=16000) 61 | std_out = np.mean(np.abs(samples_out)) 62 | assert samples_out.dtype == np.float32 63 | assert samples_out.shape == samples_in.shape 64 | assert std_out == std_in == 0.0 65 | -------------------------------------------------------------------------------- /audiomentations/augmentations/aliasing.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | import numpy as np 4 | from numpy.typing import NDArray 5 | 6 | from audiomentations.core.transforms_interface import BaseWaveformTransform 7 | from audiomentations.core.utils import ( 8 | convert_frequency_to_mel, 9 | convert_mel_to_frequency, 10 | ) 11 | 12 | 13 | class Aliasing(BaseWaveformTransform): 14 | """ 15 | Apply an aliasing effect to the audio by downsampling to a lower 16 | sample rate without filtering and upsampling after that. 
17 | """ 18 | 19 | supports_multichannel = True 20 | 21 | def __init__( 22 | self, min_sample_rate: int = 8000, max_sample_rate: int = 30000, p: float = 0.5 23 | ): 24 | """ 25 | :param min_sample_rate: Minimum target sample rate to downsample to 26 | :param max_sample_rate: Maximum target sample rate to downsample to 27 | :param p: The probability of applying this transform 28 | """ 29 | super().__init__(p) 30 | 31 | if min_sample_rate < 2: 32 | raise ValueError("min_sample_rate must be greater than or equal to 2") 33 | 34 | if min_sample_rate > max_sample_rate: 35 | raise ValueError("min_sample_rate must not be larger than max_sample_rate") 36 | 37 | self.min_mel = convert_frequency_to_mel(min_sample_rate) 38 | self.max_mel = convert_frequency_to_mel(max_sample_rate) 39 | 40 | def randomize_parameters(self, samples: NDArray[np.float32], sample_rate: int): 41 | super().randomize_parameters(samples, sample_rate) 42 | if self.parameters["should_apply"]: 43 | self.parameters["new_sample_rate"] = convert_mel_to_frequency( 44 | random.uniform(self.min_mel, self.max_mel) 45 | ) 46 | 47 | def apply(self, samples: NDArray[np.float32], sample_rate: int): 48 | n = samples.shape[-1] 49 | x = np.linspace(0, n, num=n) 50 | dwn_n = round(n * float(self.parameters["new_sample_rate"]) / sample_rate) 51 | dwn_x = np.linspace(0, n, num=dwn_n) 52 | if len(samples.shape) > 1: 53 | distorted_samples = np.zeros((samples.shape[0], n), dtype=np.float32) 54 | for i in range(samples.shape[0]): 55 | dwn_samples = np.interp(dwn_x, x, samples[i]) 56 | distorted_samples[i] = np.interp(x, dwn_x, dwn_samples) 57 | else: 58 | dwn_samples = np.interp(dwn_x, x, samples) 59 | distorted_samples = np.interp(x, dwn_x, dwn_samples).astype(np.float32) 60 | return distorted_samples 61 | -------------------------------------------------------------------------------- /docs/waveform_transforms/gain_transition.md: -------------------------------------------------------------------------------- 1 | # `GainTransition` 2 | 3 | _Added in v0.22.0_ 4 | 5 | Gradually change the volume up or down over a random time span. Also known as 6 | fade in and fade out. The fade works on a logarithmic scale, which is natural to 7 | human hearing. 8 | 9 | The way this works is that it picks two gains: a first gain and a second gain. 10 | Then it picks a time range for the transition between those two gains. 11 | Note that this transition can start before the audio starts and/or end after the 12 | audio ends, so the output audio can start or end in the middle of a transition. 13 | The gain starts at the first gain and is held constant until the transition start. 14 | Then it transitions to the second gain. Then that gain is held constant until the 15 | end of the sound. 16 | 17 | # GainTransition API 18 | 19 | [`min_gain_in_db`](#min_gain_in_db){ #min_gain_in_db }: `float` • unit: Decibel 20 | : :warning: Deprecated as of v0.31.0. Use [`min_gain_db`](#min_gain_db) instead 21 | 22 | [`max_gain_in_db`](#max_gain_in_db){ #max_gain_in_db }: `float` • unit: Decibel 23 | : :warning: Deprecated as of v0.31.0. Use [`max_gain_db`](#max_gain_db) instead 24 | 25 | [`min_gain_db`](#min_gain_db){ #min_gain_db }: `float` • unit: Decibel 26 | : :octicons-milestone-24: Default: `-24.0`. Minimum gain. 27 | 28 | [`max_gain_db`](#max_gain_db){ #max_gain_db }: `float` • unit: Decibel 29 | : :octicons-milestone-24: Default: `6.0`. Maximum gain. 
30 | 31 | [`min_duration`](#min_duration){ #min_duration }: `Union[float, int]` • unit: see [`duration_unit`](#duration_unit) 32 | : :octicons-milestone-24: Default: `0.2`. Minimum length of transition. 33 | 34 | [`max_duration`](#max_duration){ #max_duration }: `Union[float, int]` • unit: see [`duration_unit`](#duration_unit) 35 | : :octicons-milestone-24: Default: `6.0`. Maximum length of transition. 36 | 37 | [`duration_unit`](#duration_unit){ #duration_unit }: `str` • choices: `"fraction"`, `"samples"`, `"seconds"` 38 | : :octicons-milestone-24: Default: `"seconds"`. Defines the unit of the value of `min_duration` and `max_duration`. 39 | 40 | * `"fraction"`: Fraction of the total sound length 41 | * `"samples"`: Number of audio samples 42 | * `"seconds"`: Number of seconds 43 | 44 | [`p`](#p){ #p }: `float` • range: [0.0, 1.0] 45 | : :octicons-milestone-24: Default: `0.5`. The probability of applying this transform. 46 | 47 | ## Source code :octicons-mark-github-16: 48 | 49 | [audiomentations/augmentations/gain_transition.py :octicons-link-external-16:](https://github.com/iver56/audiomentations/blob/main/audiomentations/augmentations/gain_transition.py){target=_blank} 50 | -------------------------------------------------------------------------------- /docs/waveform_transforms/add_color_noise.md: -------------------------------------------------------------------------------- 1 | # `AddColorNoise` 2 | 3 | _Added in v0.35.0_ 4 | 5 | Mix in noise with color, optionally weighted by an [A-weighting :octicons-link-external-16:](https://en.wikipedia.org/wiki/A-weighting){target=_blank} curve. When 6 | `f_decay=0`, this is equivalent to `AddGaussianNoise`. Otherwise, see: [Colors of Noise :octicons-link-external-16:](https://en.wikipedia.org/wiki/Colors_of_noise){target=_blank}. 7 | 8 | 9 | ## AddColorNoise API 10 | 11 | [`min_snr_db`](#min_snr_db){ #min_snr_db }: `float` • unit: Decibel 12 | : :octicons-milestone-24: Default: `5.0`. Minimum signal-to-noise ratio in dB. A lower 13 | number means more noise. 14 | 15 | [`max_snr_db`](#max_snr_db){ #max_snr_db }: `float` • unit: Decibel 16 | : :octicons-milestone-24: Default: `40.0`. Maximum signal-to-noise ratio in dB. A 17 | greater number means less noise. 18 | 19 | [`min_f_decay`](#min_f_decay){ #min_f_decay }: `float` • unit: Decibels/octave 20 | : :octicons-milestone-24: Default: `-6.0`. Minimum noise decay in dB per octave. 21 | 22 | [`max_f_decay`](#max_f_decay){ #max_f_decay }: `float` • unit: Decibels/octave 23 | : :octicons-milestone-24: Default: `6.0`. Maximum noise decay in dB per octave. 24 | 25 | Those values can be chosen from the following table: 26 | 27 | | Colour | `f_decay` (dB/octave) | 28 | |----------------|-----------------------| 29 | | pink | -3.01 | 30 | | brown/brownian | -6.02 | 31 | | red | -6.02 | 32 | | blue | 3.01 | 33 | | azure | 3.01 | 34 | | violet | 6.02 | 35 | | white | 0.0 | 36 | 37 | See [Colors of noise :octicons-link-external-16:](https://en.wikipedia.org/wiki/Colors_of_noise){target=_blank} on Wikipedia about those values. 38 | 39 | [`p`](#p){ #p }: `float` • range: [0.0, 1.0] 40 | : :octicons-milestone-24: Default: `0.5`. The probability of applying this transform. 41 | 42 | [`p_apply_a_weighting`](#p_apply_a_weighting){ #p_apply_a_weighting }: `float` • range: [0.0, 1.0] 43 | : :octicons-milestone-24: Default: `0.0`. The probability of additionally weighting the transform using an `A-weighting` curve. 44 | 45 | [`n_fft`](#n_fft){ #n_fft }: `int` 46 | : :octicons-milestone-24: Default: `128`. 
The number of points at which the decay curve is computed (for coloring white noise). 47 | 48 | ## Source code :octicons-mark-github-16: 49 | 50 | [audiomentations/augmentations/add_color_noise.py :octicons-link-external-16:](https://github.com/iver56/audiomentations/blob/main/audiomentations/augmentations/add_color_noise.py){target=_blank} 51 | -------------------------------------------------------------------------------- /docs/waveform_transforms/tanh_distortion.md: -------------------------------------------------------------------------------- 1 | # `TanhDistortion` 2 | 3 | _Added in v0.19.0_ 4 | 5 | Apply tanh (hyperbolic tangent) distortion to the audio. This technique is sometimes 6 | used for adding distortion to guitar recordings. The tanh() function can give a rounded 7 | "soft clipping" kind of distortion, and the distortion amount is proportional to the 8 | loudness of the input and the pre-gain. Tanh is symmetric, so the positive and 9 | negative parts of the signal are squashed in the same way. This transform can be 10 | useful as data augmentation because it adds harmonics. In other words, it changes 11 | the timbre of the sound. 12 | 13 | See this page for examples: [http://gdsp.hf.ntnu.no/lessons/3/17/](http://gdsp.hf.ntnu.no/lessons/3/17/) 14 | 15 | ## Input-output example 16 | 17 | In this example we apply tanh distortion with the "distortion amount" (think of it as a knob that goes from 0 to 1) set to 0.25 18 | 19 | ![Input-output waveforms and spectrograms](TanhDistortion.webp) 20 | 21 | | Input sound | Transformed sound | 22 | |-----------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------| 23 | | | | 24 | 25 | ## Usage example 26 | 27 | ```python 28 | from audiomentations import TanhDistortion 29 | 30 | transform = TanhDistortion( 31 | min_distortion=0.01, 32 | max_distortion=0.7, 33 | p=1.0 34 | ) 35 | 36 | augmented_sound = transform(my_waveform_ndarray, sample_rate=16000) 37 | ``` 38 | 39 | ## TanhDistortion API 40 | 41 | [`min_distortion`](#min_distortion){ #min_distortion }: `float` • range: [0.0, 1.0] 42 | : :octicons-milestone-24: Default: `0.01`. Minimum "amount" of distortion to apply to the signal. 43 | 44 | [`max_distortion`](#max_distortion){ #max_distortion }: `float` • range: [0.0, 1.0] 45 | : :octicons-milestone-24: Default: `0.7`. Maximum "amount" of distortion to apply to the signal. 46 | 47 | [`p`](#p){ #p }: `float` • range: [0.0, 1.0] 48 | : :octicons-milestone-24: Default: `0.5`. The probability of applying this transform.
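
For intuition about what the distortion amount does under the hood: in this library's implementation (included in full later in this document, and linked below), the amount is mapped to a pre-gain via a percentile of the absolute signal. A condensed sketch of that mapping, omitting the final loudness-matching step:

```python
import numpy as np

def tanh_distort(samples: np.ndarray, distortion_amount: float) -> np.ndarray:
    # A higher distortion amount picks a lower percentile, hence a smaller
    # threshold and a larger pre-gain, which drives tanh harder
    percentile = 100 - 99 * distortion_amount
    threshold = np.percentile(np.abs(samples), percentile)
    gain_factor = 0.5 / (threshold + 1e-6)
    return np.tanh(gain_factor * samples).astype(np.float32)
```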
49 | 50 | ## Source code :octicons-mark-github-16: 51 | 52 | [audiomentations/augmentations/tanh_distortion.py :octicons-link-external-16:](https://github.com/iver56/audiomentations/blob/main/audiomentations/augmentations/tanh_distortion.py){target=_blank} 53 | -------------------------------------------------------------------------------- /tests/test_gain_transition.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | import numpy as np 4 | import pytest 5 | 6 | from audiomentations import GainTransition 7 | 8 | 9 | class TestGainTransition: 10 | @pytest.mark.parametrize( 11 | "samples", 12 | [ 13 | # Test both mono and stereo 14 | np.random.uniform(low=-0.5, high=0.5, size=(1234,)).astype(np.float32), 15 | np.random.uniform(low=-0.5, high=0.5, size=(2, 5678)).astype(np.float32), 16 | ], 17 | ) 18 | def test_gain_transition_fraction(self, samples): 19 | np.random.seed(42) 20 | random.seed(42) 21 | sample_rate = 8000 22 | 23 | augment = GainTransition(p=1.0) 24 | processed_samples = augment(samples=samples, sample_rate=sample_rate) 25 | assert not np.allclose(samples, processed_samples) 26 | assert processed_samples.shape == samples.shape 27 | assert processed_samples.dtype == np.float32 28 | 29 | def test_gain_transition_seconds(self): 30 | np.random.seed(40) 31 | random.seed(40) 32 | samples = np.random.uniform(low=-0.5, high=0.5, size=(2345,)).astype(np.float32) 33 | sample_rate = 16000 34 | 35 | augment = GainTransition( 36 | min_duration=0.2, max_duration=0.3, duration_unit="seconds", p=1.0 37 | ) 38 | processed_samples = augment(samples=samples, sample_rate=sample_rate) 39 | assert not np.allclose(samples, processed_samples) 40 | assert processed_samples.shape == samples.shape 41 | assert processed_samples.dtype == np.float32 42 | 43 | def test_gain_transition_samples(self): 44 | np.random.seed(1337) 45 | random.seed(1337) 46 | samples = np.random.uniform(low=-0.5, high=0.5, size=(3456,)).astype(np.float32) 47 | sample_rate = 32000 48 | 49 | augment = GainTransition( 50 | min_duration=1, max_duration=5000, duration_unit="samples", p=1.0 51 | ) 52 | processed_samples = augment(samples=samples, sample_rate=sample_rate) 53 | assert not np.allclose(samples, processed_samples) 54 | assert processed_samples.shape == samples.shape 55 | assert processed_samples.dtype == np.float32 56 | 57 | def test_invalid_params(self): 58 | with pytest.raises(AssertionError): 59 | GainTransition( 60 | min_duration=-12, max_duration=324, duration_unit="samples", p=1.0 61 | ) 62 | 63 | augment = GainTransition( 64 | min_duration=45, max_duration=45, duration_unit="lightyears", p=1.0 65 | ) 66 | with pytest.raises(ValueError): 67 | augment(np.zeros(40, dtype=np.float32), 16000) 68 | -------------------------------------------------------------------------------- /audiomentations/augmentations/band_stop_filter.py: -------------------------------------------------------------------------------- 1 | from audiomentations.augmentations.base_butterword_filter import BaseButterworthFilter 2 | 3 | 4 | class BandStopFilter(BaseButterworthFilter): 5 | """ 6 | Apply band-stop filtering to the input audio. Also known as notch filter or 7 | band reject filter. It relates to the frequency mask idea in the SpecAugment paper. 8 | Center frequency gets picked in mel space, so it is 9 | more aligned with human hearing, which is not linear. Filter steepness 10 | (6/12/18... dB / octave) is parametrized. 
Can also be set for zero-phase filtering 11 | (will result in a 6 dB drop at cutoffs). 12 | """ 13 | 14 | supports_multichannel = True 15 | 16 | def __init__( 17 | self, 18 | min_center_freq: float = 200.0, 19 | max_center_freq: float = 4000.0, 20 | min_bandwidth_fraction: float = 0.5, 21 | max_bandwidth_fraction: float = 1.99, 22 | min_rolloff: int = 12, 23 | max_rolloff: int = 24, 24 | zero_phase: bool = False, 25 | p: float = 0.5, 26 | ): 27 | """ 28 | :param min_center_freq: Minimum center frequency in hertz 29 | :param max_center_freq: Maximum center frequency in hertz 30 | :param min_bandwidth_fraction: Minimum bandwidth fraction relative to center 31 | frequency (number between 0 and 2) 32 | :param max_bandwidth_fraction: Maximum bandwidth fraction relative to center 33 | frequency (number between 0 and 2) 34 | :param min_rolloff: Minimum filter roll-off (in dB/octave). 35 | Must be a multiple of 6 36 | :param max_rolloff: Maximum filter roll-off (in dB/octave) 37 | Must be a multiple of 6 38 | :param zero_phase: Whether filtering should be zero phase. 39 | When this is set to `True` it will not affect the phase of the 40 | input signal but will sound 3 dB lower at the cutoff frequency 41 | compared to the non-zero phase case (6 dB vs 3 dB). Additionally, 42 | it is 2 times slower than in the non-zero phase case. If you 43 | absolutely want no phase distortions (e.g. want to augment a 44 | drum track), set this to `True`. 45 | :param p: The probability of applying this transform 46 | """ 47 | super().__init__( 48 | min_center_freq=min_center_freq, 49 | max_center_freq=max_center_freq, 50 | min_bandwidth_fraction=min_bandwidth_fraction, 51 | max_bandwidth_fraction=max_bandwidth_fraction, 52 | min_rolloff=min_rolloff, 53 | max_rolloff=max_rolloff, 54 | zero_phase=zero_phase, 55 | p=p, 56 | filter_type="bandstop", 57 | ) 58 | -------------------------------------------------------------------------------- /docs/waveform_transforms/pitch_shift.md: -------------------------------------------------------------------------------- 1 | # `PitchShift` 2 | 3 | _Added in v0.4.0_ 4 | 5 | Pitch shift the sound up or down without changing the tempo. 6 | 7 | Under the hood this does time stretching (by phase vocoding) followed by resampling. 8 | Note that phase vocoding can degrade audio quality by "smearing" transient sounds, 9 | altering the timbre of harmonic sounds, and distorting pitch modulations. This may 10 | result in a loss of sharpness, clarity, or naturalness in the transformed audio.
11 | 12 | If you need a better sounding pitch shifting method, consider the following alternatives: 13 | 14 | * [signalsmith-stretch](https://github.com/Signalsmith-Audio/signalsmith-stretch) 15 | * [Rubber Band library](https://breakfastquay.com/rubberband/) 16 | * [https://github.com/KAIST-MACLab/PyTSMod](https://github.com/KAIST-MACLab/PyTSMod) 17 | * [https://github.com/vinusankars/ESOLA](https://github.com/vinusankars/ESOLA) 18 | 19 | ## Input-output example 20 | 21 | Here we pitch down a piano recording by 4 semitones: 22 | 23 | ![Input-output waveforms and spectrograms](PitchShift.webp) 24 | 25 | | Input sound | Transformed sound | 26 | |---------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------| 27 | | | | 28 | 29 | ## Usage example 30 | 31 | ```python 32 | from audiomentations import PitchShift 33 | 34 | transform = PitchShift( 35 | min_semitones=-5.0, 36 | max_semitones=5.0, 37 | p=1.0 38 | ) 39 | 40 | augmented_sound = transform(my_waveform_ndarray, sample_rate=44100) 41 | ``` 42 | 43 | # PitchShift API 44 | 45 | [`min_semitones`](#min_semitones){ #min_semitones }: `float` • unit: semitones • range: [-12.0, 12.0] 46 | : :octicons-milestone-24: Default: `-4.0`. Minimum semitones to shift. Negative number means shift down. 47 | 48 | [`max_semitones`](#max_semitones){ #max_semitones }: `float` • unit: semitones • range: [-12.0, 12.0] 49 | : :octicons-milestone-24: Default: `4.0`. Maximum semitones to shift. Positive number means shift up. 50 | 51 | [`p`](#p){ #p }: `float` • range: [0.0, 1.0] 52 | : :octicons-milestone-24: Default: `0.5`. The probability of applying this transform. 53 | 54 | ## Source code :octicons-mark-github-16: 55 | 56 | [audiomentations/augmentations/pitch_shift.py :octicons-link-external-16:](https://github.com/iver56/audiomentations/blob/main/audiomentations/augmentations/pitch_shift.py){target=_blank} 57 | -------------------------------------------------------------------------------- /tests/test_compose.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | from numpy.testing import assert_array_equal 5 | 6 | from audiomentations import ( 7 | ClippingDistortion, 8 | AddBackgroundNoise, 9 | TimeMask, 10 | Shift, 11 | Compose, 12 | ) 13 | from demo.demo import DEMO_DIR 14 | 15 | 16 | class TestCompose: 17 | def test_freeze_and_unfreeze_parameters(self): 18 | samples = np.zeros((20,), dtype=np.float32) 19 | sample_rate = 44100 20 | augmenter = Compose( 21 | [ 22 | AddBackgroundNoise( 23 | sounds_path=os.path.join(DEMO_DIR, "background_noises"), 24 | min_snr_db=15, 25 | max_snr_db=35, 26 | p=1.0, 27 | ), 28 | ClippingDistortion(p=0.5), 29 | ] 30 | ) 31 | perturbed_samples1 = augmenter(samples=samples, sample_rate=sample_rate) 32 | augmenter.freeze_parameters() 33 | for transform in augmenter.transforms: 34 | assert transform.are_parameters_frozen 35 | perturbed_samples2 = augmenter(samples=samples, sample_rate=sample_rate) 36 | 37 | assert_array_equal(perturbed_samples1, perturbed_samples2) 38 | 39 | augmenter.unfreeze_parameters() 40 | for transform in augmenter.transforms: 41 | assert not transform.are_parameters_frozen 42 | 43 | def test_randomize_parameters_and_apply(self): 44 | samples = 1.0 / np.arange(1, 21, dtype=np.float32) 45 | sample_rate = 44100 46 | 47 | augmenter = Compose( 48 | [ 49 | AddBackgroundNoise( 50 | sounds_path=os.path.join(DEMO_DIR, 
"background_noises"), 51 | min_snr_db=15, 52 | max_snr_db=35, 53 | p=1.0, 54 | ), 55 | ClippingDistortion(p=0.5), 56 | TimeMask(min_band_part=0.2, max_band_part=0.5, p=0.5), 57 | Shift(min_shift=0.5, max_shift=0.5, p=0.5), 58 | ] 59 | ) 60 | augmenter.freeze_parameters() 61 | augmenter.randomize_parameters(samples=samples, sample_rate=sample_rate) 62 | 63 | parameters = [transform.parameters for transform in augmenter.transforms] 64 | 65 | perturbed_samples1 = augmenter(samples=samples, sample_rate=sample_rate) 66 | perturbed_samples2 = augmenter(samples=samples, sample_rate=sample_rate) 67 | 68 | assert_array_equal(perturbed_samples1, perturbed_samples2) 69 | 70 | augmenter.unfreeze_parameters() 71 | 72 | for transform_parameters, transform in zip(parameters, augmenter.transforms): 73 | assert transform_parameters == transform.parameters 74 | assert not transform.are_parameters_frozen 75 | -------------------------------------------------------------------------------- /docs/guides/multichannel_audio_array_shapes.md: -------------------------------------------------------------------------------- 1 | # Multichannel audio array shapes 2 | 3 | When working with audio files in Python, you may encounter two main formats for representing the data, especially when you are dealing with stereo (or multichannel) audio. These formats correspond to the shape of the numpy ndarray that holds the audio data. 4 | 5 | ## 1. Channels-first format 6 | 7 | This format has the shape `(channels, samples)`. In the context of a stereo audio file, the number of channels would be 2 (for left and right), and samples are the individual data points in the audio file. For example, a stereo audio file with a duration of 1 second sampled at 44100 Hz would have a shape of `(2, 44100)`. 8 | 9 | **This is the format expected by audiomentations when dealing with multichannel audio**. If you provide multichannel audio data in a different format, a `WrongMultichannelAudioShape` exception will be raised. 10 | 11 | Note that `audiomentations` also supports mono audio, i.e. shape like `(1, samples)` or `(samples,)` 12 | 13 | ## 2. Channels-last format 14 | 15 | This format has the shape `(samples, channels)`. Using the same stereo file example as above, the shape would be `(44100, 2)`. This format is commonly returned by the `soundfile` library when loading a stereo wav file, because channels last is the inherent data layout of a stereo wav file. This layout is the default in stereo wav files because it facilitates streaming audio, where data must be read and played back sequentially. 16 | 17 | ## Loading audio with different libraries 18 | 19 | Different libraries in Python may return audio data in different formats. For instance, `librosa` by default returns a mono ndarray, whereas `soundfile` will return a multichannel ndarray in channels-last format when loading a stereo wav file. 
20 | 21 | Here is an example of how to load a file with each: 22 | 23 | ```python 24 | import librosa 25 | import soundfile as sf 26 | 27 | # Librosa, mono 28 | y, sr = librosa.load("stereo_audio_example.wav", sr=None, mono=True) 29 | print(y.shape) # (117833,) 30 | 31 | # Librosa, multichannel 32 | y, sr = librosa.load("stereo_audio_example.wav", sr=None, mono=False) 33 | print(y.shape) # (2, 117833) 34 | 35 | # Soundfile 36 | y, sr = sf.read("stereo_audio_example.wav") 37 | print(y.shape) # (117833, 2) 38 | ``` 39 | 40 | ## Converting between formats 41 | 42 | If you have audio data in the channels-last format but need it in channels-first format, you can easily convert it using the transpose operation of numpy ndarrays: 43 | 44 | ```python 45 | import numpy as np 46 | 47 | # Assuming y is your audio data in channels-last format 48 | y_transposed = np.transpose(y) 49 | 50 | # Alternative, shorter syntax: 51 | y_transposed = y.T 52 | ``` 53 | 54 | Now, `y_transposed` will be in channels-first format and can be used with `audiomentations`. 55 | -------------------------------------------------------------------------------- /audiomentations/augmentations/tanh_distortion.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | import numpy as np 4 | from numpy.typing import NDArray 5 | 6 | from audiomentations.core.transforms_interface import BaseWaveformTransform 7 | from audiomentations.core.utils import calculate_rms 8 | 9 | 10 | class TanhDistortion(BaseWaveformTransform): 11 | """ 12 | Apply tanh (hyperbolic tangent) distortion to the audio. This technique is sometimes 13 | used for adding distortion to guitar recordings. The tanh() function can give a rounded 14 | "soft clipping" kind of distortion, and the distortion amount is proportional to the 15 | loudness of the input and the pre-gain. Tanh is symmetric, so the positive and 16 | negative parts of the signal are squashed in the same way. This transform can be 17 | useful as data augmentation because it adds harmonics. In other words, it changes 18 | the timbre of the sound. 
19 | 20 | See this page for examples: http://gdsp.hf.ntnu.no/lessons/3/17/ 21 | """ 22 | 23 | supports_multichannel = True 24 | 25 | def __init__( 26 | self, min_distortion: float = 0.01, max_distortion: float = 0.7, p: float = 0.5 27 | ): 28 | """ 29 | :param min_distortion: Minimum amount of distortion (between 0 and 1) 30 | :param max_distortion: Maximum amount of distortion (between 0 and 1) 31 | :param p: The probability of applying this transform 32 | """ 33 | super().__init__(p) 34 | assert 0 <= min_distortion <= 1 35 | assert 0 <= max_distortion <= 1 36 | assert min_distortion <= max_distortion 37 | self.min_distortion = min_distortion 38 | self.max_distortion = max_distortion 39 | 40 | def randomize_parameters(self, samples: NDArray[np.float32], sample_rate: int): 41 | super().randomize_parameters(samples, sample_rate) 42 | if self.parameters["should_apply"]: 43 | self.parameters["distortion_amount"] = random.uniform( 44 | self.min_distortion, self.max_distortion 45 | ) 46 | 47 | def apply(self, samples: NDArray[np.float32], sample_rate: int): 48 | # Find out how much to pre-gain the audio to get a given amount of distortion 49 | percentile = 100 - 99 * self.parameters["distortion_amount"] 50 | threshold = np.percentile(np.abs(samples), percentile) 51 | gain_factor = 0.5 / (threshold + 1e-6) 52 | 53 | # Distort the audio 54 | distorted_samples = np.tanh(gain_factor * samples) 55 | 56 | # Scale the output so its loudness matches the input 57 | rms_before = calculate_rms(samples) 58 | if rms_before > 1e-9: 59 | rms_after = calculate_rms(distorted_samples) 60 | post_gain = rms_before / rms_after 61 | distorted_samples = post_gain * distorted_samples 62 | 63 | return distorted_samples 64 | -------------------------------------------------------------------------------- /tests/test_normalize.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from numpy.testing import assert_array_equal 3 | 4 | from audiomentations import Normalize 5 | 6 | 7 | class TestNormalize: 8 | def test_normalize_positive_peak(self): 9 | samples = np.array([0.5, 0.6, -0.2, 0.0], dtype=np.float32) 10 | sample_rate = 16000 11 | augmenter = Normalize(p=1.0) 12 | samples = augmenter(samples=samples, sample_rate=sample_rate) 13 | 14 | assert np.amax(samples) == 1.0 15 | assert samples.dtype == np.float32 16 | assert samples.shape[-1] == 4 17 | 18 | def test_normalize_negative_peak(self): 19 | samples = np.array([0.5, 0.6, -0.8, 0.0], dtype=np.float32) 20 | sample_rate = 16000 21 | augmenter = Normalize(p=1.0) 22 | samples = augmenter(samples=samples, sample_rate=sample_rate) 23 | 24 | assert np.amin(samples) == -1.0 25 | assert samples[-1] == 0.0 26 | assert samples.dtype == np.float32 27 | assert samples.shape[-1] == 4 28 | 29 | def test_normalize_all_zeros(self): 30 | samples = np.array([0.0, 0.0, 0.0, 0.0], dtype=np.float32) 31 | sample_rate = 16000 32 | augmenter = Normalize(p=1.0) 33 | samples = augmenter(samples=samples, sample_rate=sample_rate) 34 | 35 | assert np.amin(samples) == 0.0 36 | assert samples[-1] == 0.0 37 | assert samples.dtype == np.float32 38 | assert samples.shape[-1] == 4 39 | 40 | def test_normalize_multichannel(self): 41 | samples = np.array( 42 | [[0.9, 0.5, -0.25, -0.125, 0.0], [0.95, 0.5, -0.25, -0.125, 0.0]], 43 | dtype=np.float32, 44 | ) 45 | sample_rate = 16000 46 | augmenter = Normalize(p=1.0) 47 | processed_samples = augmenter(samples=samples, sample_rate=sample_rate) 48 | 49 | assert_array_equal(processed_samples, samples / 
0.95) 50 | assert processed_samples.dtype == np.float32 51 | 52 | def test_normalize_multichannel_conditionally(self): 53 | sample_rate = 16000 54 | augmenter = Normalize(apply_to="only_too_loud_sounds", p=1.0) 55 | 56 | samples = np.array( 57 | [[0.9, 0.5, -0.25, -0.125, 0.0], [0.95, 0.5, -0.25, -0.125, 0.0]], 58 | dtype=np.float32, 59 | ) 60 | processed_samples = augmenter(samples=samples, sample_rate=sample_rate) 61 | assert_array_equal(processed_samples, samples) 62 | assert processed_samples.dtype == np.float32 63 | 64 | samples_too_loud = np.array( 65 | [[0.9, 0.5, -0.25, -0.125, 0.0], [1.2, 0.5, -0.25, -0.125, 0.0]], 66 | dtype=np.float32, 67 | ) 68 | processed_samples = augmenter(samples=samples_too_loud, sample_rate=sample_rate) 69 | assert_array_equal(processed_samples, samples_too_loud / 1.2) 70 | assert processed_samples.dtype == np.float32 71 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import codecs 2 | import os 3 | import re 4 | 5 | from setuptools import setup, find_packages 6 | 7 | with open("README.md", "r") as readme_file: 8 | long_description = readme_file.read() 9 | 10 | here = os.path.abspath(os.path.dirname(__file__)) 11 | 12 | 13 | def read(*parts): 14 | with codecs.open(os.path.join(here, *parts), "r") as fp: 15 | return fp.read() 16 | 17 | 18 | def find_version(*file_paths): 19 | version_file = read(*file_paths) 20 | version_match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]", version_file, re.M) 21 | if version_match: 22 | return version_match.group(1) 23 | raise RuntimeError("Unable to find version string.") 24 | 25 | 26 | setup( 27 | name="audiomentations", 28 | version=find_version("audiomentations", "__init__.py"), 29 | author="Iver Jordal", 30 | description=( 31 | "A Python library for audio data augmentation. Inspired by albumentations." 32 | " Useful for machine learning." 
33 | ), 34 | license="MIT", 35 | long_description=long_description, 36 | long_description_content_type="text/markdown", 37 | url="https://github.com/iver56/audiomentations", 38 | packages=find_packages(exclude=["demo", "tests"]), 39 | install_requires=[ 40 | "numpy>=1.21.0,<2", 41 | "numpy-minmax>=0.3.0,<1", 42 | "numpy-rms>=0.4.2,<1", 43 | "librosa>=0.8.0,!=0.10.0,<0.11.0", 44 | "scipy>=1.4,<1.13", 45 | "soxr>=0.3.2,<1.0.0", 46 | ], 47 | extras_require={ 48 | "extras": [ 49 | "cylimiter==0.3.0", 50 | "lameenc>=1.2.0,<2", 51 | "pydub>=0.22.0,<1", 52 | "pyloudnorm>=0.1.0", 53 | "pyroomacoustics>=0.6.0", 54 | ] 55 | }, 56 | python_requires=">=3.8,<3.13", 57 | classifiers=[ 58 | "Programming Language :: Python :: 3.8", 59 | "Programming Language :: Python :: 3.9", 60 | "Programming Language :: Python :: 3.10", 61 | "Programming Language :: Python :: 3.11", 62 | "Programming Language :: Python :: 3.12", 63 | "License :: OSI Approved :: MIT License", 64 | "Operating System :: OS Independent", 65 | "Development Status :: 3 - Alpha", 66 | "Intended Audience :: Developers", 67 | "Intended Audience :: Science/Research", 68 | "Topic :: Multimedia", 69 | "Topic :: Multimedia :: Sound/Audio", 70 | "Topic :: Scientific/Engineering", 71 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 72 | ], 73 | project_urls={ 74 | "Homepage": "https://github.com/iver56/audiomentations", 75 | "Documentation": "https://iver56.github.io/audiomentations/", 76 | "Changelog": "https://iver56.github.io/audiomentations/changelog/", 77 | "Issue Tracker": "https://github.com/iver56/audiomentations/issues", 78 | }, 79 | ) 80 | -------------------------------------------------------------------------------- /audiomentations/augmentations/pitch_shift.py: -------------------------------------------------------------------------------- 1 | import random 2 | import warnings 3 | 4 | import librosa 5 | import numpy as np 6 | from numpy.typing import NDArray 7 | 8 | from audiomentations.core.transforms_interface import BaseWaveformTransform 9 | 10 | 11 | class PitchShift(BaseWaveformTransform): 12 | """Pitch shift the sound up or down without changing the tempo""" 13 | 14 | supports_multichannel = True 15 | 16 | def __init__( 17 | self, min_semitones: float = -4.0, max_semitones: float = 4.0, p: float = 0.5 18 | ): 19 | """ 20 | :param min_semitones: Minimum semitones to shift. Negative number means shift down. 21 | :param max_semitones: Maximum semitones to shift. Positive number means shift up.
22 | :param p: The probability of applying this transform 23 | """ 24 | super().__init__(p) 25 | assert min_semitones >= -12 26 | assert max_semitones <= 12 27 | assert min_semitones <= max_semitones 28 | self.min_semitones = min_semitones 29 | self.max_semitones = max_semitones 30 | 31 | def randomize_parameters(self, samples: NDArray[np.float32], sample_rate: int): 32 | super().randomize_parameters(samples, sample_rate) 33 | if self.parameters["should_apply"]: 34 | self.parameters["num_semitones"] = random.uniform( 35 | self.min_semitones, self.max_semitones 36 | ) 37 | 38 | def apply(self, samples: NDArray[np.float32], sample_rate: int): 39 | try: 40 | resample_type = ( 41 | "kaiser_best" if librosa.__version__.startswith("0.8.") else "soxr_hq" 42 | ) 43 | pitch_shifted_samples = librosa.effects.pitch_shift( 44 | samples, 45 | sr=sample_rate, 46 | n_steps=self.parameters["num_semitones"], 47 | res_type=resample_type, 48 | ) 49 | except librosa.util.exceptions.ParameterError: 50 | warnings.warn( 51 | "Warning: You are probably using an old version of librosa. Upgrade" 52 | " librosa to 0.9.0 or later for better performance when applying" 53 | " PitchShift to stereo audio." 54 | ) 55 | # In librosa<0.9.0 pitch_shift doesn't natively support multichannel audio. 56 | # Here we use a workaround that simply loops over the channels instead. 57 | # TODO: Remove this workaround when we remove support for librosa<0.9.0 58 | pitch_shifted_samples = np.copy(samples) 59 | for i in range(samples.shape[0]): 60 | pitch_shifted_samples[i] = librosa.effects.pitch_shift( 61 | pitch_shifted_samples[i], 62 | sr=sample_rate, 63 | n_steps=self.parameters["num_semitones"], 64 | ) 65 | 66 | return pitch_shifted_samples 67 | -------------------------------------------------------------------------------- /audiomentations/spec_augmentations/spec_frequency_mask.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | import numpy as np 4 | 5 | from audiomentations.core.transforms_interface import BaseSpectrogramTransform 6 | 7 | 8 | class SpecFrequencyMask(BaseSpectrogramTransform): 9 | """ 10 | Mask a set of frequencies in a spectrogram, à la Google AI SpecAugment. This type of data 11 | augmentation has proved to make speech recognition models more robust. 12 | 13 | The masked frequencies can be replaced with either the mean of the original values or a 14 | given constant (e.g. zero). 
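The number of frequency bins to mask is drawn uniformly between round(min_mask_fraction * num_frequency_bins) and round(max_mask_fraction * num_frequency_bins), and the position of the masked band is then chosen randomly (see randomize_parameters below).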
15 | """ 16 | 17 | supports_multichannel = True 18 | 19 | def __init__( 20 | self, 21 | min_mask_fraction: float = 0.03, 22 | max_mask_fraction: float = 0.25, 23 | fill_mode: str = "constant", 24 | fill_constant: float = 0.0, 25 | p: float = 0.5, 26 | ): 27 | super().__init__(p) 28 | self.min_mask_fraction = min_mask_fraction 29 | self.max_mask_fraction = max_mask_fraction 30 | assert fill_mode in ("mean", "constant") 31 | self.fill_mode = fill_mode 32 | self.fill_constant = fill_constant 33 | 34 | def randomize_parameters(self, magnitude_spectrogram): 35 | super().randomize_parameters(magnitude_spectrogram) 36 | if self.parameters["should_apply"]: 37 | num_frequency_bins = magnitude_spectrogram.shape[0] 38 | min_frequencies_to_mask = int( 39 | round(self.min_mask_fraction * num_frequency_bins) 40 | ) 41 | max_frequencies_to_mask = int( 42 | round(self.max_mask_fraction * num_frequency_bins) 43 | ) 44 | num_frequencies_to_mask = random.randint( 45 | min_frequencies_to_mask, max_frequencies_to_mask 46 | ) 47 | self.parameters["start_frequency_index"] = random.randint( 48 | 0, num_frequency_bins - num_frequencies_to_mask 49 | ) 50 | self.parameters["end_frequency_index"] = ( 51 | self.parameters["start_frequency_index"] + num_frequencies_to_mask 52 | ) 53 | 54 | def apply(self, magnitude_spectrogram): 55 | if self.fill_mode == "mean": 56 | fill_value = np.mean( 57 | magnitude_spectrogram[ 58 | self.parameters["start_frequency_index"] : self.parameters[ 59 | "end_frequency_index" 60 | ] 61 | ] 62 | ) 63 | else: 64 | # self.fill_mode == "constant" 65 | fill_value = self.fill_constant 66 | magnitude_spectrogram = magnitude_spectrogram.copy() 67 | magnitude_spectrogram[ 68 | self.parameters["start_frequency_index"] : self.parameters[ 69 | "end_frequency_index" 70 | ] 71 | ] = fill_value 72 | return magnitude_spectrogram 73 | 74 | -------------------------------------------------------------------------------- /docs/guides/cpu_vs_gpu.md: -------------------------------------------------------------------------------- 1 | # CPU vs. GPU: Which to use for online data augmentation when training audio ML models? 2 | 3 | When training an audio machine learning model that includes online data augmentation as part of the training pipeline, you can choose to run the transforms on CPU or GPU. While some libraries, such as torch-audiomentations, support GPU, audiomentations is CPU-only. So, which one is better? The answer is: it depends. 4 | 5 | ## Pros of using CPU-only libraries like audiomentations 6 | 7 | There are several advantages to using CPU-only data augmentation libraries like audiomentations: 8 | 9 | * Easy to get started: Audiomentations is straightforward to install and use, which makes it a good choice for beginners or for those who want to quickly prototype an idea. 10 | * No VRAM usage: These libraries don't use valuable VRAM, which you might want to allocate to your model with large batch sizes. 11 | * Often fast enough to keep GPU(s) busy: Running augmentations on CPU on multiple threads in a data loader can be fast enough to keep your GPU(s) busy, which means that data loading doesn't become a bottleneck if the model's GPU utilization is already high. This can speed up model training. 12 | * Larger selection of transforms: Some types of transforms, such as Mp3Compression, only have CPU implementations that can't run on GPU. This means that audiomentations provides a more extensive selection of transforms than torch-audiomentations. 
13 | * Independent of specific tensor processing libraries: Audiomentations is CPU-only, which means it is not tied to a specific tensor processing library like TensorFlow or PyTorch. 14 | 15 | ## Pros of running audio augmentation transforms on GPU(s) 16 | 17 | There are also advantages to running audio augmentation transforms on GPU, for example, with the help of [torch-audiomentations :octicons-link-external-16:](https://github.com/asteroid-team/torch-audiomentations): 18 | 19 | * Faster processing: When your model is not big enough to utilize your GPU fully (in terms of processing capabilities and VRAM), running transforms on GPU can make sense, especially when the transforms are much faster on GPU than on CPU. An example of this is convolution, which can be used for applying room reverb or various filters. 20 | * Can speed up training: If running the data loader becomes a bottleneck when running the transforms on CPU, running transforms on GPU(s) instead can speed up the training. 21 | 22 | In summary, whether to use CPU-only libraries like audiomentations or GPU-accelerated libraries like torch-audiomentations depends on the specific requirements of your model and the available hardware. If your model training pipeline doesn't utilize your GPU(s) fully, running transforms on GPU might be the best choice. However, if your model's GPU utilization is already very high, running the transforms on multiple CPU threads might be the best option. It boils down to checking where your bottleneck is. 23 | -------------------------------------------------------------------------------- /audiomentations/augmentations/time_mask.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | import numpy as np 4 | from numpy.typing import NDArray 5 | 6 | from audiomentations.core.transforms_interface import BaseWaveformTransform 7 | 8 | 9 | class TimeMask(BaseWaveformTransform): 10 | """ 11 | Make a randomly chosen part of the audio silent. 12 | Inspired by https://arxiv.org/pdf/1904.08779.pdf 13 | """ 14 | 15 | supports_multichannel = True 16 | 17 | def __init__( 18 | self, 19 | min_band_part: float = 0.0, 20 | max_band_part: float = 0.5, 21 | fade: bool = False, 22 | p: float = 0.5, 23 | ): 24 | """ 25 | :param min_band_part: Minimum length of the silent part as a fraction of the 26 | total sound length. Must be between 0.0 and 1.0 27 | :param max_band_part: Maximum length of the silent part as a fraction of the 28 | total sound length. Must be between 0.0 and 1.0 29 | :param fade: When set to True, add a linear fade in and fade out of the silent 30 | part. This can smooth out an unwanted abrupt change between two consecutive 31 | samples (which sounds like a transient/click/pop). 
32 | :param p: The probability of applying this transform 33 | """ 34 | super().__init__(p) 35 | if min_band_part < 0.0 or min_band_part > 1.0: 36 | raise ValueError("min_band_part must be between 0.0 and 1.0") 37 | if max_band_part < 0.0 or max_band_part > 1.0: 38 | raise ValueError("max_band_part must be between 0.0 and 1.0") 39 | if min_band_part > max_band_part: 40 | raise ValueError("min_band_part must not be greater than max_band_part") 41 | self.min_band_part = min_band_part 42 | self.max_band_part = max_band_part 43 | self.fade = fade 44 | 45 | def randomize_parameters(self, samples: NDArray[np.float32], sample_rate: int): 46 | super().randomize_parameters(samples, sample_rate) 47 | if self.parameters["should_apply"]: 48 | num_samples = samples.shape[-1] 49 | self.parameters["t"] = random.randint( 50 | int(num_samples * self.min_band_part), 51 | int(num_samples * self.max_band_part), 52 | ) 53 | self.parameters["t0"] = random.randint( 54 | 0, num_samples - self.parameters["t"] 55 | ) 56 | 57 | def apply(self, samples: NDArray[np.float32], sample_rate: int): 58 | new_samples = samples.copy() 59 | t = self.parameters["t"] 60 | t0 = self.parameters["t0"] 61 | mask = np.zeros(t) 62 | if self.fade: 63 | fade_length = min(int(sample_rate * 0.01), int(t * 0.1)) 64 | mask[0:fade_length] = np.linspace(1, 0, num=fade_length) 65 | mask[-fade_length:] = np.linspace(0, 1, num=fade_length) 66 | new_samples[..., t0 : t0 + t] *= mask 67 | return new_samples 68 | -------------------------------------------------------------------------------- /audiomentations/augmentations/time_stretch.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | import librosa 4 | import numpy as np 5 | from numpy.typing import NDArray 6 | 7 | from audiomentations.core.transforms_interface import BaseWaveformTransform 8 | 9 | 10 | class TimeStretch(BaseWaveformTransform): 11 | """Time stretch the signal without changing the pitch""" 12 | 13 | supports_multichannel = True 14 | 15 | def __init__( 16 | self, 17 | min_rate: float = 0.8, 18 | max_rate: float = 1.25, 19 | leave_length_unchanged: bool = True, 20 | p: float = 0.5, 21 | ): 22 | super().__init__(p) 23 | assert min_rate >= 0.1 24 | assert max_rate <= 10 25 | assert min_rate <= max_rate 26 | self.min_rate = min_rate 27 | self.max_rate = max_rate 28 | self.leave_length_unchanged = leave_length_unchanged 29 | 30 | def randomize_parameters(self, samples: NDArray[np.float32], sample_rate: int): 31 | super().randomize_parameters(samples, sample_rate) 32 | if self.parameters["should_apply"]: 33 | """ 34 | If rate > 1, then the signal is sped up. 35 | If rate < 1, then the signal is slowed down. 36 | """ 37 | self.parameters["rate"] = random.uniform(self.min_rate, self.max_rate) 38 | 39 | def apply(self, samples: NDArray[np.float32], sample_rate: int): 40 | try: 41 | time_stretched_samples = librosa.effects.time_stretch( 42 | samples, rate=self.parameters["rate"] 43 | ) 44 | except librosa.util.exceptions.ParameterError: 45 | # In librosa<0.9.0 time_stretch doesn't natively support multichannel audio. 46 | # Here we use a workaround that simply loops over the channels instead. 
47 | # TODO: Remove this workaround when we remove support for librosa<0.9.0 48 | time_stretched_channels = [] 49 | for i in range(samples.shape[0]): 50 | time_stretched_samples = librosa.effects.time_stretch( 51 | samples[i], rate=self.parameters["rate"] 52 | ) 53 | time_stretched_channels.append(time_stretched_samples) 54 | time_stretched_samples = np.array( 55 | time_stretched_channels, dtype=samples.dtype 56 | ) 57 | 58 | if self.leave_length_unchanged: 59 | # Apply zero padding if the time stretched audio is not long enough to fill the 60 | # whole space, or crop the time stretched audio if it ended up too long. 61 | padded_samples = np.zeros(shape=samples.shape, dtype=samples.dtype) 62 | window = time_stretched_samples[..., : samples.shape[-1]] 63 | actual_window_length = window.shape[ 64 | -1 65 | ] # may be smaller than samples.shape[-1] 66 | padded_samples[..., :actual_window_length] = window 67 | time_stretched_samples = padded_samples 68 | return time_stretched_samples 69 | -------------------------------------------------------------------------------- /tests/test_time_mask.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | 4 | from audiomentations import TimeMask, Compose 5 | 6 | 7 | class TestTimeMask: 8 | def test_apply_time_mask(self): 9 | sample_len = 1024 10 | samples_in = np.random.normal(0, 1, size=sample_len).astype(np.float32) 11 | sample_rate = 16000 12 | augmenter = Compose([TimeMask(min_band_part=0.2, max_band_part=0.5, p=1.0)]) 13 | 14 | samples_out = augmenter(samples=samples_in, sample_rate=sample_rate) 15 | assert samples_out.dtype == np.float32 16 | assert len(samples_out) == sample_len 17 | 18 | std_in = np.mean(np.abs(samples_in)) 19 | std_out = np.mean(np.abs(samples_out)) 20 | assert std_out < std_in 21 | 22 | def test_invalid_params(self): 23 | with pytest.raises(ValueError): 24 | TimeMask(min_band_part=0.5, max_band_part=1.5) 25 | 26 | with pytest.raises(ValueError): 27 | TimeMask(min_band_part=-0.5, max_band_part=0.5) 28 | 29 | with pytest.raises(ValueError): 30 | TimeMask(min_band_part=0.6, max_band_part=0.5) 31 | 32 | def test_apply_time_mask_multichannel(self): 33 | sample_len = 1024 34 | samples_in = np.random.normal(0, 1, size=(2, sample_len)).astype(np.float32) 35 | sample_rate = 16000 36 | augmenter = TimeMask(min_band_part=0.2, max_band_part=0.5, p=1.0) 37 | 38 | samples_out = augmenter(samples=samples_in, sample_rate=sample_rate) 39 | assert samples_out.dtype == np.float32 40 | assert samples_out.shape == samples_in.shape 41 | 42 | std_in = np.mean(np.abs(samples_in)) 43 | std_out = np.mean(np.abs(samples_out)) 44 | assert std_out < std_in 45 | 46 | def test_apply_time_mask_with_fade(self): 47 | sample_len = 1024 48 | samples_in = np.random.normal(0, 1, size=sample_len).astype(np.float32) 49 | sample_rate = 16000 50 | augmenter = Compose( 51 | [TimeMask(min_band_part=0.2, max_band_part=0.5, fade=True, p=1.0)] 52 | ) 53 | 54 | samples_out = augmenter(samples=samples_in, sample_rate=sample_rate) 55 | assert samples_out.dtype == np.float32 56 | assert len(samples_out) == sample_len 57 | 58 | std_in = np.mean(np.abs(samples_in)) 59 | std_out = np.mean(np.abs(samples_out)) 60 | assert std_out < std_in 61 | 62 | def test_apply_time_mask_with_fade_short_signal(self): 63 | sample_len = 100 64 | samples_in = np.random.normal(0, 1, size=sample_len).astype(np.float32) 65 | sample_rate = 16000 66 | augmenter = Compose( 67 | [TimeMask(min_band_part=0.2, max_band_part=0.5, 
fade=True, p=1.0)] 68 | ) 69 | 70 | samples_out = augmenter(samples=samples_in, sample_rate=sample_rate) 71 | assert samples_out.dtype == np.float32 72 | assert len(samples_out) == sample_len 73 | 74 | std_in = np.mean(np.abs(samples_in)) 75 | std_out = np.mean(np.abs(samples_out)) 76 | assert std_out < std_in 77 | -------------------------------------------------------------------------------- /docs/waveform_transforms/aliasing.md: -------------------------------------------------------------------------------- 1 | # `Aliasing` 2 | 3 | _Added in v0.35.0_ 4 | 5 | Downsample the audio to a lower sample rate by linear interpolation, without low-pass 6 | filtering it first, resulting in aliasing artifacts. You get aliasing artifacts when 7 | there is high-frequency audio in the input audio that falls above the Nyquist frequency 8 | of the chosen target sample rate. Audio with frequencies above the Nyquist frequency 9 | cannot be reproduced accurately and gets "reflected"/mirrored to other frequencies. The 10 | aliasing artifacts "replace" the original high-frequency signals. The result can be 11 | described as coarse and metallic. 12 | 13 | After the downsampling, the signal gets upsampled to the original sample rate again, so the 14 | length of the output becomes the same as the length of the input. 15 | 16 | For more information, see 17 | 18 | * [Sample rate reduction :octicons-link-external-16:](https://en.wikipedia.org/wiki/Bitcrusher#Sample_rate_reduction){target=_blank} on Wikipedia 19 | * [Intro to downsampling :octicons-link-external-16:](http://gdsp.hf.ntnu.no/lessons/1/3/){target=_blank} by NTNU, Department of Music, Music Technology. Note: that article describes a slightly different downsampling technique, called sample-and-hold, while `Aliasing` in audiomentations currently implements linear interpolation. However, both methods lead to aliasing artifacts. 20 | 21 | ## Input-output example 22 | 23 | Here we target a sample rate of 12000 Hz. Note the vertical mirroring in the spectrogram in the transformed sound. 24 | 25 | ![Input-output waveforms and spectrograms](Aliasing.webp) 26 | 27 | | Input sound | Transformed sound | 28 | |---------------------------------------------------------------------------------|---------------------------------------------------------------------------------------| 29 | | | | 30 | 31 | ## Usage example 32 | 33 | ```python 34 | from audiomentations import Aliasing 35 | 36 | transform = Aliasing(min_sample_rate=8000, max_sample_rate=30000, p=1.0) 37 | 38 | augmented_sound = transform(my_waveform_ndarray, sample_rate=44100) 39 | ``` 40 | 41 | ## Aliasing API 42 | 43 | [`min_sample_rate`](#min_sample_rate){ #min_sample_rate }: `int` • unit: Hz • range: [2, ∞) 44 | : :octicons-milestone-24: Minimum target sample rate to downsample to 45 | 46 | [`max_sample_rate`](#max_sample_rate){ #max_sample_rate }: `int` • unit: Hz • range: [2, ∞) 47 | : :octicons-milestone-24: Maximum target sample rate to downsample to 48 | 49 | [`p`](#p){ #p }: `float` • range: [0.0, 1.0] 50 | : :octicons-milestone-24: Default: `0.5`. The probability of applying this transform.
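## How aliasing arises (sketch)

To build intuition for what this transform does, here is a rough numpy sketch of the same technique: downsampling by linear interpolation without an anti-aliasing low-pass filter, then upsampling back to the original length. This is an illustration rather than the transform's exact implementation, and the helper name and the 9 kHz test tone are arbitrary choices for the demonstration:

```python
import numpy as np

sample_rate = 44100
t = np.arange(sample_rate) / sample_rate
tone = np.sin(2 * np.pi * 9000 * t).astype(np.float32)  # a 9 kHz sine tone

def naive_linear_resample(samples, original_rate, target_rate):
    # Resample by linear interpolation, deliberately skipping the low-pass
    # filtering that a proper resampler would apply before downsampling
    num_target = int(round(len(samples) * target_rate / original_rate))
    positions = np.linspace(0, len(samples) - 1, num=num_target)
    return np.interp(positions, np.arange(len(samples)), samples)

# 9 kHz is above the 6 kHz Nyquist frequency of the 12 kHz target rate, so the
# tone gets mirrored down to 12 - 9 = 3 kHz instead of being preserved
downsampled = naive_linear_resample(tone, 44100, 12000)
aliased = naive_linear_resample(downsampled, 12000, 44100).astype(np.float32)
```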
51 | 52 | ## Source code :octicons-mark-github-16: 53 | 54 | [audiomentations/augmentations/aliasing.py :octicons-link-external-16:](https://github.com/iver56/audiomentations/blob/main/audiomentations/augmentations/aliasing.py){target=_blank} 55 | -------------------------------------------------------------------------------- /docs/waveform_transforms/time_stretch.md: -------------------------------------------------------------------------------- 1 | # `TimeStretch` 2 | 3 | _Added in v0.2.0_ 4 | 5 | Change the speed or duration of the signal without changing the pitch. This transform 6 | employs `librosa.effects.time_stretch` under the hood to achieve the effect. 7 | 8 | Internally, this uses phase vocoding. Note that phase vocoding can degrade audio 9 | quality by "smearing" transient sounds, altering the timbre of harmonic sounds, and 10 | distorting pitch modulations. This may result in a loss of sharpness, clarity, or 11 | naturalness in the transformed audio, especially when the rate is set to an extreme 12 | value. 13 | 14 | If you need a better-sounding time stretch method, consider the following alternatives: 15 | 16 | * [atempo in ffmpeg](https://ffmpeg.org//ffmpeg-all.html#atempo) 17 | * [Rubber Band library](https://breakfastquay.com/rubberband/) 18 | * [https://github.com/KAIST-MACLab/PyTSMod](https://github.com/KAIST-MACLab/PyTSMod) 19 | * [https://github.com/vinusankars/ESOLA](https://github.com/vinusankars/ESOLA) 20 | 21 | ## Input-output example 22 | 23 | In this example we speed up a sound by 25%. This corresponds to a rate of 1.25. 24 | 25 | ![Input-output waveforms and spectrograms](TimeStretch.webp) 26 | 27 | | Input sound | Transformed sound | 28 | |-----------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------| 29 | | | | 30 | 31 | ## Usage example 32 | 33 | ```python 34 | from audiomentations import TimeStretch 35 | 36 | transform = TimeStretch( 37 | min_rate=0.8, 38 | max_rate=1.25, 39 | leave_length_unchanged=True, 40 | p=1.0 41 | ) 42 | 43 | augmented_sound = transform(my_waveform_ndarray, sample_rate=16000) 44 | ``` 45 | 46 | ## TimeStretch API 47 | 48 | [`min_rate`](#min_rate){ #min_rate }: `float` • range: [0.1, 10.0] 49 | : :octicons-milestone-24: Default: `0.8`. Minimum rate of change of total duration of the signal. A rate below 1 means the audio is slowed down. 50 | 51 | [`max_rate`](#max_rate){ #max_rate }: `float` • range: [0.1, 10.0] 52 | : :octicons-milestone-24: Default: `1.25`. Maximum rate of change of total duration of the signal. A rate greater than 1 means the audio is sped up. 53 | 54 | [`leave_length_unchanged`](#leave_length_unchanged){ #leave_length_unchanged }: `bool` 55 | : :octicons-milestone-24: Default: `True`. The rate changes the duration and affects the samples. This flag is used to keep the total length of the generated output the same as that of the input signal. 56 | 57 | [`p`](#p){ #p }: `float` • range: [0.0, 1.0] 58 | : :octicons-milestone-24: Default: `0.5`. The probability of applying this transform.
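## A note on output length

With `leave_length_unchanged=True` (the default), the output always has the same number of samples as the input: sped-up audio gets zero-padded at the end, and slowed-down audio gets cropped. With `leave_length_unchanged=False`, the output length is roughly the input length divided by the rate. Here is a minimal sketch of the latter case, assuming a 2-second mono input at 16 kHz:

```python
import numpy as np
from audiomentations import TimeStretch

samples = np.random.uniform(low=-0.5, high=0.5, size=32000).astype(np.float32)

# Fix the rate at 1.25 (a 25% speed-up) by setting min_rate == max_rate
transform = TimeStretch(
    min_rate=1.25, max_rate=1.25, leave_length_unchanged=False, p=1.0
)

sped_up = transform(samples, sample_rate=16000)
print(sped_up.shape)  # roughly (25600,), i.e. about 32000 / 1.25
```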
59 | 60 | ## Source code :octicons-mark-github-16: 61 | 62 | [audiomentations/augmentations/time_stretch.py :octicons-link-external-16:](https://github.com/iver56/audiomentations/blob/main/audiomentations/augmentations/time_stretch.py){target=_blank} 63 | -------------------------------------------------------------------------------- /tests/test_apply_impulse_response.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | 4 | import numpy as np 5 | 6 | from audiomentations import ApplyImpulseResponse 7 | from audiomentations.core.composition import Compose 8 | from demo.demo import DEMO_DIR 9 | 10 | 11 | class TestImpulseResponse: 12 | def test_apply_impulse_response(self): 13 | sample_len = 1024 14 | samples_in = np.random.normal(0, 1, size=sample_len).astype(np.float32) 15 | sample_rate = 16000 16 | 17 | add_ir_transform = ApplyImpulseResponse( 18 | ir_path=os.path.join(DEMO_DIR, "ir"), p=1.0 19 | ) 20 | 21 | # Check that misc_file.txt is not one of the IR file candidates, as it's not audio 22 | assert len(add_ir_transform.ir_files) == 3 23 | 24 | augmenter = Compose([add_ir_transform]) 25 | 26 | assert len(samples_in) == sample_len 27 | samples_out = augmenter(samples=samples_in, sample_rate=sample_rate) 28 | 29 | # Check parameters 30 | assert augmenter.transforms[0].parameters["should_apply"] 31 | assert augmenter.transforms[0].parameters["ir_file_path"].endswith(".wav") 32 | 33 | assert samples_out.dtype == np.float32 34 | assert samples_out.shape == samples_in.shape 35 | 36 | def test_apply_impulse_response_multi_channel(self): 37 | sample_len = 1024 38 | samples_in = np.random.normal(0, 1, size=sample_len).astype(np.float32) 39 | sample_rate = 16000 40 | 41 | samples_in = np.expand_dims(samples_in, axis=0) 42 | samples_in = np.tile(samples_in, (2, 1)) 43 | 44 | add_ir_transform = ApplyImpulseResponse( 45 | ir_path=os.path.join(DEMO_DIR, "ir"), p=1.0 46 | ) 47 | 48 | # Check that misc_file.txt is not one of the IR file candidates, as it's not audio 49 | assert len(add_ir_transform.ir_files) == 3 50 | 51 | augmenter = Compose([add_ir_transform]) 52 | 53 | assert samples_in.shape[1] == sample_len 54 | samples_out = augmenter(samples=samples_in, sample_rate=sample_rate) 55 | 56 | # Check parameters 57 | assert augmenter.transforms[0].parameters["should_apply"] 58 | assert augmenter.transforms[0].parameters["ir_file_path"].endswith(".wav") 59 | 60 | assert samples_out.dtype == np.float32 61 | assert samples_out.shape == samples_in.shape 62 | 63 | def test_include_tail(self): 64 | sample_len = 1024 65 | samples_in = np.random.normal(0, 1, size=sample_len).astype(np.float32) 66 | sample_rate = 16000 67 | 68 | add_ir_transform = ApplyImpulseResponse( 69 | ir_path=os.path.join(DEMO_DIR, "ir"), p=1.0, leave_length_unchanged=False 70 | ) 71 | 72 | samples_out = add_ir_transform(samples=samples_in, sample_rate=sample_rate) 73 | 74 | assert samples_out.dtype == np.float32 75 | assert samples_out.shape[-1] > samples_in.shape[-1] 76 | 77 | def test_picklability(self): 78 | add_ir_transform = ApplyImpulseResponse( 79 | ir_path=os.path.join(DEMO_DIR, "ir"), p=1.0 80 | ) 81 | pickled = pickle.dumps(add_ir_transform) 82 | unpickled = pickle.loads(pickled) 83 | assert add_ir_transform.ir_files == unpickled.ir_files 84 | -------------------------------------------------------------------------------- /tests/test_post_gain.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 
import pyloudnorm 3 | import pytest 4 | from numpy.testing import assert_almost_equal, assert_array_equal 5 | 6 | from audiomentations import Gain 7 | from audiomentations.core.post_gain import PostGain 8 | from audiomentations.core.utils import calculate_rms, get_max_abs_amplitude 9 | 10 | 11 | class TestPostGain: 12 | def test_same_rms(self): 13 | samples = np.array([1.0, 0.5, -0.25, -0.125, 0.0], dtype=np.float32) 14 | sample_rate = 16000 15 | 16 | augment = PostGain( 17 | Gain(min_gain_db=-6, max_gain_db=-6, p=1.0), method="same_rms" 18 | ) 19 | processed_samples = augment(samples=samples, sample_rate=sample_rate) 20 | assert_almost_equal( 21 | calculate_rms(processed_samples), 22 | calculate_rms(samples), 23 | ) 24 | assert processed_samples.dtype == np.float32 25 | 26 | def test_same_lufs(self): 27 | samples = np.random.uniform(low=-0.5, high=0.5, size=(2, 8000)).astype( 28 | np.float32 29 | ) 30 | sample_rate = 16000 31 | 32 | augment = PostGain( 33 | Gain(min_gain_db=60, max_gain_db=60, p=1.0), method="same_lufs" 34 | ) 35 | processed_samples = augment(samples=samples, sample_rate=sample_rate) 36 | 37 | meter = pyloudnorm.Meter(sample_rate) # create BS.1770 meter 38 | lufs_before = meter.integrated_loudness(samples.transpose()) 39 | lufs_after = meter.integrated_loudness(processed_samples.transpose()) 40 | assert_almost_equal(lufs_after, lufs_before, decimal=6) 41 | assert processed_samples.dtype == np.float32 42 | 43 | def test_peak_normalize_always(self): 44 | samples = np.random.uniform(low=-0.5, high=0.5, size=(2, 8000)).astype( 45 | np.float32 46 | ) 47 | sample_rate = 16000 48 | 49 | augment = PostGain( 50 | Gain(min_gain_db=-55, max_gain_db=-55, p=1.0), 51 | method="peak_normalize_always", 52 | ) 53 | processed_samples = augment(samples=samples, sample_rate=sample_rate) 54 | 55 | assert get_max_abs_amplitude(processed_samples) == pytest.approx(1.0) 56 | assert processed_samples.dtype == np.float32 57 | 58 | def test_peak_normalize_if_too_loud(self): 59 | samples = np.array( 60 | [[0.9, 0.5, -0.25, -0.125, 0.0], [0.95, 0.5, -0.25, -0.125, 0.0]], 61 | dtype=np.float32, 62 | ) 63 | sample_rate = 16000 64 | augmenter = PostGain( 65 | Gain(min_gain_db=0.0, max_gain_db=0.0, p=1.0), 66 | method="peak_normalize_if_too_loud", 67 | ) 68 | 69 | processed_samples = augmenter(samples=samples, sample_rate=sample_rate) 70 | assert_array_equal(processed_samples, samples) 71 | assert processed_samples.dtype == np.float32 72 | 73 | samples_too_loud = np.array( 74 | [[0.9, 0.5, -0.25, -0.125, 0.0], [1.2, 0.5, -0.25, -0.125, 0.0]], 75 | dtype=np.float32, 76 | ) 77 | processed_samples = augmenter(samples=samples_too_loud, sample_rate=sample_rate) 78 | assert_array_equal(processed_samples, samples_too_loud / 1.2) 79 | assert processed_samples.dtype == np.float32 80 | -------------------------------------------------------------------------------- /audiomentations/augmentations/gain.py: -------------------------------------------------------------------------------- 1 | import random 2 | import warnings 3 | 4 | import numpy as np 5 | from numpy.typing import NDArray 6 | 7 | from audiomentations.core.transforms_interface import BaseWaveformTransform 8 | from audiomentations.core.utils import convert_decibels_to_amplitude_ratio 9 | 10 | 11 | class Gain(BaseWaveformTransform): 12 | """ 13 | Multiply the audio by a random amplitude factor to reduce or increase the volume. This 14 | technique can help a model become somewhat invariant to the overall gain of the input audio. 
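The gain is given in decibels (dB) and is converted to an amplitude ratio under the hood, i.e. amplitude_ratio = 10 ** (gain_db / 20). For example, a gain of -6 dB multiplies the samples by roughly 0.5, while a gain of +6 dB multiplies them by roughly 2.0.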
15 | 16 | Warning: This transform can return samples outside the [-1, 1] range, which may lead to 17 | clipping or wrap distortion, depending on what you do with the audio in a later stage. 18 | See also https://en.wikipedia.org/wiki/Clipping_(audio)#Digital_clipping 19 | """ 20 | 21 | supports_multichannel = True 22 | 23 | def __init__( 24 | self, 25 | min_gain_in_db: float = None, 26 | max_gain_in_db: float = None, 27 | min_gain_db: float = None, 28 | max_gain_db: float = None, 29 | p: float = 0.5, 30 | ): 31 | """ 32 | :param min_gain_in_db: Deprecated. Use min_gain_db instead 33 | :param max_gain_in_db: Deprecated. Use max_gain_db instead 34 | :param min_gain_db: Minimum gain 35 | :param max_gain_db: Maximum gain 36 | :param p: The probability of applying this transform 37 | """ 38 | super().__init__(p) 39 | 40 | if min_gain_db is not None and min_gain_in_db is not None: 41 | raise ValueError( 42 | "Passing both min_gain_db and min_gain_in_db is not supported. Use only" 43 | " min_gain_db." 44 | ) 45 | elif min_gain_db is not None: 46 | self.min_gain_db = min_gain_db 47 | elif min_gain_in_db is not None: 48 | warnings.warn( 49 | "The min_gain_in_db parameter is deprecated. Use min_gain_db instead.", 50 | DeprecationWarning, 51 | ) 52 | self.min_gain_db = min_gain_in_db 53 | else: 54 | self.min_gain_db = -12.0 # the default 55 | 56 | if max_gain_db is not None and max_gain_in_db is not None: 57 | raise ValueError( 58 | "Passing both max_gain_db and max_gain_in_db is not supported. Use only" 59 | " max_gain_db." 60 | ) 61 | elif max_gain_db is not None: 62 | self.max_gain_db = max_gain_db 63 | elif max_gain_in_db is not None: 64 | warnings.warn( 65 | "The max_gain_in_db parameter is deprecated. Use max_gain_db instead.", 66 | DeprecationWarning, 67 | ) 68 | self.max_gain_db = max_gain_in_db 69 | else: 70 | self.max_gain_db = 12.0 # the default 71 | 72 | assert self.min_gain_db <= self.max_gain_db 73 | 74 | def randomize_parameters(self, samples: NDArray[np.float32], sample_rate: int): 75 | super().randomize_parameters(samples, sample_rate) 76 | if self.parameters["should_apply"]: 77 | self.parameters["amplitude_ratio"] = convert_decibels_to_amplitude_ratio( 78 | random.uniform(self.min_gain_db, self.max_gain_db) 79 | ) 80 | 81 | def apply(self, samples: NDArray[np.float32], sample_rate: int): 82 | return samples * self.parameters["amplitude_ratio"] 83 | -------------------------------------------------------------------------------- /docs/waveform_transforms/adjust_duration.md: -------------------------------------------------------------------------------- 1 | # `AdjustDuration` 2 | 3 | _Added in v0.30.0_ 4 | 5 | Trim or pad the audio to the specified length/duration in samples or seconds. If the 6 | input sound is longer than the target duration, pick a random offset and crop the 7 | sound to the target duration. If the input sound is shorter than the target 8 | duration, pad the sound so the duration matches the target duration. 9 | 10 | This transform can be useful if you need audio with constant length, e.g. as input to a 11 | machine learning model. The reason for varying audio clip lengths can be e.g. 12 | 13 | * the nature of the audio dataset (different audio clips have different lengths) 14 | * data augmentation transforms that change the lengths (e.g. 
time stretching or 15 | convolving with impulse responses without cutting the tail) 16 | 17 | ## Input-output example 18 | 19 | Here we input an audio clip and remove a part of the start and the end, so the length of the result matches the specified target length. 20 | 21 | ![Input-output waveforms and spectrograms](AdjustDuration.webp) 22 | 23 | | Input sound | Transformed sound | 24 | |---------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------| 25 | | | | 26 | 27 | ## Usage examples 28 | 29 | === "Target length in samples" 30 | 31 | ```python 32 | from audiomentations import AdjustDuration 33 | 34 | transform = AdjustDuration(duration_samples=60000, p=1.0) 35 | 36 | augmented_sound = transform(my_waveform_ndarray, sample_rate=16000) 37 | ``` 38 | 39 | === "Target duration in seconds" 40 | 41 | ```python 42 | from audiomentations import AdjustDuration 43 | 44 | transform = AdjustDuration(duration_seconds=3.75, p=1.0) 45 | 46 | augmented_sound = transform(my_waveform_ndarray, sample_rate=16000) 47 | ``` 48 | 49 | ## AdjustDuration API 50 | 51 | [`duration_samples`](#duration_samples){ #duration_samples }: `int` • range: [0, ∞) 52 | : :octicons-milestone-24: Target duration in number of samples. 53 | 54 | [`duration_seconds`](#duration_seconds){ #duration_seconds }: `float` • range: [0.0, ∞) 55 | : :octicons-milestone-24: Target duration in seconds. 56 | 57 | [`padding_mode`](#padding_mode){ #padding_mode }: `str` • choices: `"silence"`, `"wrap"`, `"reflect"` 58 | : :octicons-milestone-24: Default: `"silence"`. Padding mode. Only used when audio input is shorter than the target duration. 59 | 60 | [`padding_position`](#padding_position){ #padding_position }: `str` • choices: `"start"`, `"end"` 61 | : :octicons-milestone-24: Default: `"end"`. The position of the inserted/added padding. Only used when audio input is shorter than the target duration. 62 | 63 | [`p`](#p){ #p }: `float` • range: [0.0, 1.0] 64 | : :octicons-milestone-24: Default: `0.5`. The probability of applying this transform. 65 | 66 | ## Source code :octicons-mark-github-16: 67 | 68 | [audiomentations/augmentations/adjust_duration.py :octicons-link-external-16:](https://github.com/iver56/audiomentations/blob/main/audiomentations/augmentations/adjust_duration.py){target=_blank} 69 | -------------------------------------------------------------------------------- /audiomentations/augmentations/padding.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | import numpy as np 4 | from numpy.typing import NDArray 5 | 6 | from audiomentations.core.transforms_interface import BaseWaveformTransform 7 | 8 | 9 | class Padding(BaseWaveformTransform): 10 | """ 11 | Apply padding to the audio signal - take a fraction of the end or the start of the 12 | audio and replace that part with padding. This can be useful for preparing ML models 13 | that expect a constant input length to be robust to padded inputs. 14 | """ 15 | 16 | supports_multichannel = True 17 | 18 | def __init__( 19 | self, 20 | mode: str = "silence", 21 | min_fraction: float = 0.01, 22 | max_fraction: float = 0.7, 23 | pad_section: str = "end", 24 | p: float = 0.5, 25 | ): 26 | """ 27 | :param mode: Padding mode.
Must be one of "silence", "wrap", "reflect" 28 | :param min_fraction: Minimum fraction of the signal duration to be padded 29 | :param max_fraction: Maximum fraction of the signal duration to be padded 30 | :param pad_section: Which part of the signal should be replaced with padding: 31 | "start" or "end" 32 | :param p: The probability of applying this transform 33 | """ 34 | super().__init__(p) 35 | 36 | assert mode in ("silence", "wrap", "reflect") 37 | self.mode = mode 38 | 39 | assert max_fraction <= 1.0 40 | assert min_fraction >= 0 41 | assert min_fraction <= max_fraction 42 | self.min_fraction = min_fraction 43 | self.max_fraction = max_fraction 44 | 45 | assert pad_section in ("start", "end") 46 | self.pad_section = pad_section 47 | 48 | def randomize_parameters(self, samples: NDArray[np.float32], sample_rate: int): 49 | super().randomize_parameters(samples, sample_rate) 50 | if self.parameters["should_apply"]: 51 | input_length = samples.shape[-1] 52 | self.parameters["padding_length"] = random.randint( 53 | int(round(self.min_fraction * input_length)), 54 | int(round(self.max_fraction * input_length)), 55 | ) 56 | 57 | def apply(self, samples: NDArray[np.float32], sample_rate: int): 58 | padding_length = self.parameters["padding_length"] 59 | if padding_length == 0: 60 | return samples 61 | 62 | untouched_length = samples.shape[-1] - padding_length 63 | 64 | if self.mode == "silence": 65 | samples = np.copy(samples) 66 | if self.pad_section == "start": 67 | samples[..., :padding_length] = 0.0 68 | else: 69 | samples[..., -padding_length:] = 0.0 70 | else: 71 | if samples.ndim == 1: 72 | if self.pad_section == "start": 73 | pad_width = (padding_length, 0) 74 | else: 75 | pad_width = (0, padding_length) 76 | else: 77 | if self.pad_section == "start": 78 | pad_width = ((0, 0), (padding_length, 0)) 79 | else: 80 | pad_width = ((0, 0), (0, padding_length)) 81 | 82 | if self.pad_section == "start": 83 | samples = samples[..., -untouched_length:] 84 | else: 85 | samples = samples[..., :untouched_length] 86 | 87 | samples = np.pad(samples, pad_width, self.mode) 88 | 89 | return samples 90 | -------------------------------------------------------------------------------- /audiomentations/augmentations/add_gaussian_snr.py: -------------------------------------------------------------------------------- 1 | import random 2 | import warnings 3 | 4 | import numpy as np 5 | from numpy.typing import NDArray 6 | 7 | from audiomentations.core.transforms_interface import BaseWaveformTransform 8 | from audiomentations.core.utils import calculate_desired_noise_rms, calculate_rms 9 | 10 | 11 | class AddGaussianSNR(BaseWaveformTransform): 12 | """ 13 | Add gaussian noise to the input. A random Signal to Noise Ratio (SNR) will be picked 14 | uniformly in the decibel scale. This aligns with human hearing, which is more 15 | logarithmic than linear. 16 | """ 17 | 18 | supports_multichannel = True 19 | 20 | def __init__( 21 | self, 22 | min_snr_in_db: float = None, 23 | max_snr_in_db: float = None, 24 | min_snr_db: float = None, 25 | max_snr_db: float = None, 26 | p: float = 0.5, 27 | ): 28 | """ 29 | :param min_snr_in_db: Deprecated. Use min_snr_db instead. 30 | :param max_snr_in_db: Deprecated. Use max_snr_db instead. 31 | :param min_snr_db: Minimum signal-to-noise ratio in dB. A lower number means more noise. 32 | :param max_snr_db: Maximum signal-to-noise ratio in dB. A greater number means less noise. 
33 | :param p: The probability of applying this transform 34 | """ 35 | super().__init__(p) 36 | 37 | if min_snr_db is not None and min_snr_in_db is not None: 38 | raise ValueError( 39 | "Passing both min_snr_db and min_snr_in_db is not supported. Use only" 40 | " min_snr_db." 41 | ) 42 | elif min_snr_db is not None: 43 | self.min_snr_db = min_snr_db 44 | elif min_snr_in_db is not None: 45 | warnings.warn( 46 | "The min_snr_in_db parameter is deprecated. Use min_snr_db instead.", 47 | DeprecationWarning, 48 | ) 49 | self.min_snr_db = min_snr_in_db 50 | else: 51 | self.min_snr_db = 5.0 # the default 52 | 53 | if max_snr_db is not None and max_snr_in_db is not None: 54 | raise ValueError( 55 | "Passing both max_snr_db and max_snr_in_db is not supported. Use only" 56 | " max_snr_db." 57 | ) 58 | elif max_snr_db is not None: 59 | self.max_snr_db = max_snr_db 60 | elif max_snr_in_db is not None: 61 | warnings.warn( 62 | "The max_snr_in_db parameter is deprecated. Use max_snr_db instead.", 63 | DeprecationWarning, 64 | ) 65 | self.max_snr_db = max_snr_in_db 66 | else: 67 | self.max_snr_db = 40.0 # the default 68 | 69 | def randomize_parameters(self, samples: NDArray[np.float32], sample_rate: int): 70 | super().randomize_parameters(samples, sample_rate) 71 | if self.parameters["should_apply"]: 72 | # Pick an SNR on the decibel scale 73 | snr = random.uniform(self.min_snr_db, self.max_snr_db) 74 | 75 | clean_rms = calculate_rms(samples) 76 | noise_rms = calculate_desired_noise_rms(clean_rms=clean_rms, snr=snr) 77 | 78 | # For Gaussian noise, the RMS is roughly equal to the standard deviation (std) 79 | self.parameters["noise_std"] = noise_rms 80 | 81 | def apply(self, samples: NDArray[np.float32], sample_rate: int): 82 | noise = np.random.normal( 83 | 0.0, self.parameters["noise_std"], size=samples.shape 84 | ).astype(np.float32) 85 | return samples + noise 86 | -------------------------------------------------------------------------------- /audiomentations/augmentations/adjust_duration.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from numpy.typing import NDArray 3 | 4 | from audiomentations.core.transforms_interface import BaseWaveformTransform 5 | 6 | 7 | class AdjustDuration(BaseWaveformTransform): 8 | """ 9 | Trim or pad the audio to the specified length/duration in samples or seconds. If the 10 | input sound is longer than the target duration, pick a random offset and crop the 11 | sound to the target duration. If the input sound is shorter than the target 12 | duration, pad the sound so the duration matches the target duration. 13 | """ 14 | 15 | supports_multichannel = True 16 | 17 | def __init__( 18 | self, 19 | duration_samples: int = None, 20 | duration_seconds: float = None, 21 | padding_mode: str = "silence", 22 | padding_position: str = "end", 23 | p: float = 0.5, 24 | ): 25 | """ 26 | :param duration_samples: Target duration in number of samples 27 | :param duration_seconds: Target duration in seconds 28 | :param padding_mode: Padding mode. Must be "silence", "wrap" or "reflect". Only 29 | used when audio input is shorter than the target duration. 30 | :param padding_position: The position of the inserted/added padding. Must be 31 | "start" or "end". Only used when audio input is shorter than the target duration.
32 | :param p: The probability of applying this transform 33 | """ 34 | super().__init__(p) 35 | assert padding_mode in ("silence", "wrap", "reflect") 36 | if padding_mode == "silence": 37 | padding_mode = "constant" # for numpy.pad compatibility 38 | self.padding_mode = padding_mode 39 | 40 | assert padding_position in ("start", "end") 41 | self.padding_position = padding_position 42 | 43 | assert duration_samples is not None or duration_seconds is not None 44 | if duration_samples is not None and duration_seconds is not None: 45 | raise ValueError( 46 | "should have duration_samples or duration_seconds, but not both" 47 | ) 48 | elif duration_seconds: 49 | assert duration_seconds > 0 50 | self.get_target_samples = lambda sr: int(duration_seconds * sr) 51 | elif duration_samples: 52 | assert duration_samples > 0 53 | self.get_target_samples = lambda sr: duration_samples 54 | 55 | def apply(self, samples: NDArray[np.float32], sample_rate: int): 56 | target_samples = self.get_target_samples(sample_rate) 57 | sample_length = samples.shape[-1] 58 | 59 | if sample_length == target_samples: 60 | return samples 61 | 62 | elif sample_length > target_samples: 63 | start = np.random.randint(0, sample_length - target_samples) 64 | return samples[..., start : start + target_samples] 65 | 66 | elif sample_length < target_samples: 67 | padding_length = target_samples - sample_length 68 | if samples.ndim == 1: 69 | if self.padding_position == "start": 70 | pad_width = (padding_length, 0) 71 | else: 72 | pad_width = (0, padding_length) 73 | else: 74 | if self.padding_position == "start": 75 | pad_width = ((0, 0), (padding_length, 0)) 76 | else: 77 | pad_width = ((0, 0), (0, padding_length)) 78 | return np.pad(samples, pad_width, self.padding_mode) 79 | -------------------------------------------------------------------------------- /docs/waveform_transforms/add_gaussian_snr.md: -------------------------------------------------------------------------------- 1 | # `AddGaussianSNR` 2 | 3 | _Added in v0.7.0_ 4 | 5 | The `AddGaussianSNR` transform injects Gaussian noise into an audio signal. It applies 6 | a **Signal-to-Noise Ratio (SNR)** that is chosen randomly from a **uniform distribution on the 7 | decibel scale**. This choice is consistent with the nature of human hearing, which is 8 | logarithmic rather than linear. 9 | 10 | **SNR** is a common measure used in science and engineering to compare the level of a 11 | desired signal to the level of noise. In the context of audio, the signal is the 12 | meaningful sound that you're interested in, like a person's voice, music, or other 13 | audio content, while the noise is unwanted sound that can interfere with the signal. 14 | 15 | The SNR quantifies the ratio of the power of the signal to the power of the noise. **The 16 | higher the SNR, the less noise** is present in relation to the signal. 17 | 18 | **Gaussian noise**, a kind of white noise, is a type of statistical noise where the 19 | amplitude of the noise signal follows a Gaussian distribution. This means that most of 20 | the samples are close to the mean (zero), and fewer of them are farther away. It's 21 | called Gaussian noise due to its characteristic bell-shaped distribution. 22 | 23 | Gaussian noise is similar to the sound of a radio or TV tuned to a nonexistent station: 24 | a kind of **constant, uniform hiss or static**. 25 | 26 | ## Input-output example 27 | 28 | Here we add some Gaussian noise (with SNR = 16 dB) to a speech recording.
29 | 30 | ![Input-output waveforms and spectrograms](AddGaussianSNR.webp) 31 | 32 | | Input sound | Transformed sound | 33 | |-----------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------| 34 | | | | 35 | 36 | ## Usage example 37 | 38 | ```python 39 | from audiomentations import AddGaussianSNR 40 | 41 | transform = AddGaussianSNR( 42 | min_snr_db=5.0, 43 | max_snr_db=40.0, 44 | p=1.0 45 | ) 46 | 47 | augmented_sound = transform(my_waveform_ndarray, sample_rate=16000) 48 | ``` 49 | 50 | ## AddGaussianSNR API 51 | 52 | [`min_snr_db`](#min_snr_db){ #min_snr_db }: `float` • unit: decibel 53 | : :octicons-milestone-24: Default: `5.0`. Minimum signal-to-noise ratio in dB. A lower 54 | number means more noise. 55 | 56 | [`max_snr_db`](#max_snr_db){ #max_snr_db }: `float` • unit: decibel 57 | : :octicons-milestone-24: Default: `40.0`. Maximum signal-to-noise ratio in dB. A 58 | greater number means less noise. 59 | 60 | [`min_snr_in_db`](#min_snr_in_db){ #min_snr_in_db }: `float` • unit: decibel 61 | : :warning: Deprecated as of v0.31.0. Use [`min_snr_db`](#min_snr_db) instead 62 | 63 | [`max_snr_in_db`](#max_snr_in_db){ #max_snr_in_db }: `float` • unit: decibel 64 | : :warning: Deprecated as of v0.31.0. Use [`max_snr_db`](#max_snr_db) instead 65 | 66 | [`p`](#p){ #p }: `float` • range: [0.0, 1.0] 67 | : :octicons-milestone-24: Default: `0.5`. The probability of applying this transform. 68 | 69 | ## Source code :octicons-mark-github-16: 70 | 71 | [audiomentations/augmentations/add_gaussian_snr.py :octicons-link-external-16:](https://github.com/iver56/audiomentations/blob/main/audiomentations/augmentations/add_gaussian_snr.py){target=_blank} 72 | --------------------------------------------------------------------------------