├── .gitattributes ├── test ├── noise.wav ├── voice │ ├── arctic_a0007.wav │ └── COPYING ├── trim_test.py ├── split_test.py └── vad_test.py ├── .gitignore ├── pyvad ├── __init__.py ├── vad.py └── effects.py ├── .github └── workflows │ ├── pypi.yaml │ ├── lint.yaml │ └── test.yaml ├── LICENCE ├── README.md ├── pyproject.toml └── example.ipynb /.gitattributes: -------------------------------------------------------------------------------- 1 | *.ipynb linguist-vendored 2 | *.html linguist-vendored -------------------------------------------------------------------------------- /test/noise.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/F-Tag/python-vad/HEAD/test/noise.wav -------------------------------------------------------------------------------- /test/voice/arctic_a0007.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/F-Tag/python-vad/HEAD/test/voice/arctic_a0007.wav -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | .python-version 3 | pyvad.egg-info/* 4 | *kernel*.json 5 | dist* 6 | build 7 | poetry.lock -------------------------------------------------------------------------------- /pyvad/__init__.py: -------------------------------------------------------------------------------- 1 | from pyvad import effects # noqa: F401 2 | from pyvad.effects import split, trim # noqa: F401 3 | from pyvad.vad import vad # noqa: F401 4 | -------------------------------------------------------------------------------- /.github/workflows/pypi.yaml: -------------------------------------------------------------------------------- 1 | name: Release to PyPI 2 | 3 | on: 4 | workflow_dispatch: 5 | release: 6 | types: [created] 7 | 8 | jobs: 9 | deploy: 10 | runs-on: ubuntu-latest 11 | 12 | steps: 13 | - 
uses: actions/checkout@v2 14 | with: 15 | fetch-depth: 0 16 | - name: Set up Python 17 | uses: actions/setup-python@v1 18 | with: 19 | python-version: "3.9" 20 | - name: pip install dependencies 21 | run: | 22 | python -m pip install --upgrade pip 23 | pip install poetry poetry-dynamic-versioning twine 24 | - name: PyPI Settings 25 | run: poetry config pypi-token.pypi ${{secrets.PYPI_TOKEN}} 26 | - name: Build and publish 27 | run: poetry publish --build 28 | -------------------------------------------------------------------------------- /test/trim_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from itertools import product 5 | 6 | import numpy as np 7 | from librosa import load 8 | from pyvad import trim 9 | 10 | fs_vads = (8000, 16000, 32000, 48000) 11 | hops = (10, 20, 30) 12 | vad_modes = (0, 1, 2, 3) 13 | 14 | name = "voice/arctic_a0007.wav" 15 | data, fs = load(name, sr=None) 16 | 17 | for fs_vad, hop, vad_mode in product(fs_vads, hops, vad_modes): 18 | vact = trim(data, fs, fs_vad=fs_vad, hop_length=hop, vad_mode=vad_mode) 19 | assert vact[1] - vact[0] > 0, vact 20 | 21 | 22 | data = (np.random.rand(fs * 3) - 0.5) * 0.05 23 | for fs_vad, hop, vad_mode in product(fs_vads, hops, vad_modes): 24 | vact = trim(data, fs, fs_vad=fs_vad, hop_length=hop, vad_mode=vad_mode) 25 | assert vact[1] - vact[0] == 0, vact 26 | -------------------------------------------------------------------------------- /test/split_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from itertools import product 5 | 6 | import numpy as np 7 | from librosa import load 8 | from pyvad import split 9 | 10 | fs_vads = (8000, 16000, 32000, 48000) 11 | hops = (10, 20, 30) 12 | vad_modes = (0, 1, 2, 3) 13 | 14 | name = "voice/arctic_a0007.wav" 15 | data, fs = load(name, sr=None) 16 | data = 
np.tile(data, 2) 17 | 18 | for fs_vad, hop, vad_mode in product(fs_vads, hops, vad_modes): 19 | vact = split(data, fs, fs_vad=fs_vad, hop_length=hop, vad_mode=vad_mode) 20 | assert vact.size >= 0, vact 21 | 22 | 23 | data = (np.random.rand(fs * 3) - 0.5) * 0.05 24 | for fs_vad, hop, vad_mode in product(fs_vads, hops, vad_modes): 25 | vact = split(data, fs, fs_vad=fs_vad, hop_length=hop, vad_mode=vad_mode) 26 | assert vact.size == 0, vact 27 | -------------------------------------------------------------------------------- /LICENCE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2017: Fumiaki Taguchi 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and 4 | associated documentation files (the "Software"), to deal in the Software without restriction, 5 | including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, 6 | and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, 7 | subject to the following conditions: 8 | 9 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 10 | 11 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 12 | INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 13 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 14 | WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 15 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
16 | -------------------------------------------------------------------------------- /.github/workflows/lint.yaml: -------------------------------------------------------------------------------- 1 | name: pysen lint 2 | 3 | # トリガー 4 | on: 5 | workflow_dispatch: 6 | push: 7 | branches: 8 | - master 9 | pull_request: 10 | types: [opened, reopened, synchronize] 11 | branches: 12 | - master 13 | 14 | # 実行job 15 | jobs: 16 | lint: 17 | name: Run pysen lint 18 | runs-on: ubuntu-latest 19 | steps: 20 | # リポジトリのチェックアウト 21 | - name: Checkout 22 | uses: actions/checkout@v2 23 | # Python のセットアップ 24 | - name: Set up Python ${{ matrix.python-version }} 25 | uses: actions/setup-python@v1 26 | with: 27 | python-version: 3.8 28 | # Poetry をインストールする 29 | - name: Install Poetry 30 | run: | 31 | curl -sSL https://raw.githubusercontent.com/python-poetry/poetry/master/get-poetry.py | python 32 | echo "$HOME/.poetry/bin" >> $GITHUB_PATH 33 | # インストールした Poetry を使って必要な Python パッケージをインストール 34 | - name: Install Dependencies 35 | run: poetry install --no-interaction 36 | # lint を実行 37 | - name: pysen lint 38 | run: poetry run pysen run lint 39 | -------------------------------------------------------------------------------- /test/vad_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from itertools import product 5 | 6 | from librosa import load 7 | from pyvad import vad 8 | 9 | fs_vads = (8000, 16000, 32000, 48000) 10 | hops = (10, 20, 30) 11 | vad_modes = (0, 1, 2, 3) 12 | fss = [16000, 22050] 13 | 14 | name = "voice/arctic_a0007.wav" 15 | 16 | for fs in fss: 17 | 18 | data, fs_r = load(name, sr=fs) 19 | for fs_vad, hop, vad_mode in product(fs_vads, hops, vad_modes): 20 | # print(fs, fs_vad, hop, vad_mode) 21 | vact = vad(data, fs_r, fs_vad=fs_vad, hop_length=hop, vad_mode=vad_mode) 22 | assert vact.sum() > data.size // 2, vact.sum() 23 | 24 | """ 25 | import matplotlib.pyplot as plt 26 
| plt.plot(data) 27 | plt.plot(vact) 28 | plt.savefig(("voice_"+str(fs_r)+str(fs_vad)+str(hop)+str(vad_mode)+".png")) 29 | plt.close() 30 | """ 31 | 32 | """ 33 | data = (np.random.rand(fs*3)-0.5)*0.1 34 | for fs_vad, hop, vad_mode in product(fs_vads, hops, vad_modes): 35 | print(fs, fs_vad, hop, vad_mode) 36 | vact = vad(data, fs, fs_vad=fs_vad, hop_length=hop, vad_mode=vad_mode) 37 | # assert not vact.any(), vact.sum() 38 | 39 | 40 | import matplotlib.pyplot as plt 41 | plt.plot(data) 42 | plt.plot(vact) 43 | plt.savefig(("noise_"+str(fs)+str(fs_vad)+str(hop)+str(vad_mode)+".png")) 44 | plt.close() 45 | """ 46 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # [py-webrtcvad](https://github.com/wiseman/py-webrtcvad) wrapper for trimming speech clips 2 | [![Build](https://github.com/F-Tag/python-vad/actions/workflows/test.yaml/badge.svg)](https://github.com/F-Tag/python-vad/actions/workflows/test.yaml) 3 | [![PyPI version](https://badge.fury.io/py/pyvad.svg)](https://badge.fury.io/py/pyvad) 4 | [![Python Versions](https://img.shields.io/pypi/pyversions/pyvad.svg)](https://pypi.org/project/pyvad/) 5 | 6 | ## Requirement 7 | [numpy](https://github.com/numpy/numpy), 8 | [librosa](https://github.com/librosa/librosa) and 9 | [py-webrtcvad](https://github.com/wiseman/py-webrtcvad). 10 | 11 | ## Installation 12 | via pip 13 | ```sh 14 | $ pip install pyvad 15 | ``` 16 | 17 | or 18 | 19 | from github repository 20 | ```sh 21 | $ pip install git+https://github.com/F-Tag/python-vad.git 22 | ``` 23 | 24 | ## Usage 25 | ```python 26 | from pyvad import vad 27 | vact = vad(speech_data, speech_data_fs) 28 | ``` 29 | 30 | 31 | ## Example 32 | Please see `example.ipynb` jupyter notebook. 33 | 34 | ## License 35 | MIT License (see `LICENCE` file). 36 | 37 | ## Announcement 38 | The version 0.1.0 update breaks backward compatibility.
39 | 40 | The changes are as follows: 41 | 1. The `hoplength` argument has been changed to `hop_length`. 42 | 2. The `trim` returns (start_index, end_index) (`return_sec` argument is abolished). 43 | 3. Slightly changed the method of preprocessing a waveform in `vad`. 44 | 4. End of support for python 2.x. 45 | 46 | You can see the new API in the `example.ipynb`. 47 | 48 | The previous version is 0.0.8. 49 | ```sh 50 | $ pip install pyvad==0.0.8 51 | ``` -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "pyvad" 3 | version = "0.0.0" 4 | description = "'py-webrtcvad wrapper for trimming speech clips'" 5 | authors = ["Fumiaki Taguchi"] 6 | license = "MIT" 7 | readme = "README.md" 8 | homepage = "https://github.com/F-Tag/python-vad" 9 | repository = "https://github.com/F-Tag/python-vad" 10 | classifiers=[ 11 | 'Development Status :: 4 - Beta', 12 | 'Programming Language :: Python :: 3', 13 | 'Programming Language :: Python :: 3.8', 14 | 'Programming Language :: Python :: 3.9', 15 | 'License :: OSI Approved :: MIT License', 16 | 'Operating System :: OS Independent', 17 | 'Topic :: Multimedia :: Sound/Audio', 18 | 'Topic :: Multimedia :: Sound/Audio :: Analysis', 19 | 'Topic :: Multimedia :: Sound/Audio :: Speech', 20 | 'Topic :: Scientific/Engineering :: Information Analysis', 21 | 'Topic :: Scientific/Engineering :: Human Machine Interfaces', 22 | ] 23 | 24 | [tool.poetry.dependencies] 25 | python = "^3.8" 26 | numpy = "^1.23.0" 27 | librosa = "^0.9.2" 28 | webrtcvad = "^2.0.10" 29 | 30 | [tool.poetry.dev-dependencies] 31 | pysen = {version = "0.10.2", extras = ["lint"]} 32 | pytest = "^7.1.2" 33 | pysptk = "^0.1.21" 34 | jupyterlab = "^3.4.3" 35 | 36 | [build-system] 37 | requires = ["poetry-core>=1.0.0", "poetry-dynamic-versioning"] 38 | build-backend = "poetry.core.masonry.api" 39 | 40 | 
[tool.poetry-dynamic-versioning] 41 | enable = true 42 | style = "pep440" 43 | vcs = "git" 44 | 45 | [tool.pysen] 46 | version = "0.10" 47 | 48 | [tool.pysen.lint] 49 | enable_black = true 50 | enable_flake8 = true 51 | enable_isort = true 52 | enable_mypy = false 53 | mypy_preset = "strict" 54 | line_length = 88 55 | py_version = "py38" 56 | [[tool.pysen.lint.mypy_targets]] 57 | paths = ["."] 58 | -------------------------------------------------------------------------------- /.github/workflows/test.yaml: -------------------------------------------------------------------------------- 1 | name: pytest 2 | 3 | # トリガー 4 | on: 5 | workflow_dispatch: 6 | push: 7 | branches: 8 | - master 9 | pull_request: 10 | types: [opened, reopened, synchronize] 11 | branches: 12 | - master 13 | 14 | 15 | # 実行job 16 | jobs: 17 | pytest: 18 | name: Run tests with pytest 19 | # 実行環境として `ubuntu-latest` という名前のものを選ぶ 20 | runs-on: ubuntu-latest 21 | # 複数の Python のバージョンでテストするために `strategy.matrix` を設定する 22 | strategy: 23 | matrix: 24 | python-version: [3.8, 3.9] 25 | steps: 26 | # リポジトリをチェックアウトする 27 | - name: Checkout 28 | uses: actions/checkout@v2 29 | # Python のセットアップ 30 | # See: https://github.com/actions/setup-python 31 | - name: Set up Python ${{ matrix.python-version }} 32 | uses: actions/setup-python@v1 33 | with: 34 | python-version: ${{ matrix.python-version }} 35 | # # Poetry そのものをインストールする 36 | # - name: Install Poetry 37 | # run: | 38 | # curl -sSL https://raw.githubusercontent.com/python-poetry/poetry/master/get-poetry.py | python 39 | # # Poetry へのパスを通す 40 | # - name: Add path for Poetry 41 | # run: echo "$HOME/.poetry/bin" >> $GITHUB_PATH 42 | # # インストールした Poetry を使って必要な Python パッケージをインストールする 43 | # - name: Install Dependencies 44 | # run: poetry install --no-interaction 45 | # test を実行 46 | - name: apt install dependencies 47 | run: sudo apt install libsndfile1-dev clang-9 48 | - name: pip install package 49 | run: | 50 | pip install -U pip 51 | pip install . 
52 | - name: Run Tests 53 | run: | 54 | cd test 55 | python vad_test.py 56 | python trim_test.py 57 | python split_test.py 58 | -------------------------------------------------------------------------------- /test/voice/COPYING: -------------------------------------------------------------------------------- 1 | 2 | This voice is free for use for any purpose (commercial or otherwise) 3 | subject to the pretty light restrictions detailed below. 4 | 5 | ############################################################################ 6 | ### ## 7 | ### Carnegie Mellon University ## 8 | ### Copyright (c) 2003 ## 9 | ### All Rights Reserved. ## 10 | ### ## 11 | ### Permission to use, copy, modify, and licence this software and its ## 12 | ### documentation for any purpose, is hereby granted without fee, ## 13 | ### subject to the following conditions: ## 14 | ### 1. The code must retain the above copyright notice, this list of ## 15 | ### conditions and the following disclaimer. ## 16 | ### 2. Any modifications must be clearly marked as such. ## 17 | ### 3. Original authors' names are not deleted. ## 18 | ### ## 19 | ### THE AUTHORS OF THIS WORK DISCLAIM ALL WARRANTIES WITH REGARD TO ## 20 | ### THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY ## 21 | ### AND FITNESS, IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY ## 22 | ### SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES ## 23 | ### WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN ## 24 | ### AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ## 25 | ### ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF ## 26 | ### THIS SOFTWARE. 
# ---- pyvad/vad.py ----------------------------------------------------------
from warnings import warn

import numpy as np

# Values accepted by py-webrtcvad.
_VALID_FS_VAD = (8000, 16000, 32000, 48000)
_VALID_HOP_LENGTH = (10, 20, 30)
_VALID_VAD_MODE = (0, 1, 2, 3)

# Full-scale magnitude of 16-bit PCM; float samples in [-1, 1] map onto it.
_INT16_SCALE = 2.0 ** 15


def _to_int16_pcm(samples):
    """Convert float samples in [-1, 1] to int16 PCM without wrap-around.

    A sample of exactly +1.0 scales to 32768, which does not fit in int16
    and would wrap to -32768; the scaled values are therefore clipped to the
    int16 range before the cast.
    """
    scaled = np.clip(samples * _INT16_SCALE, -_INT16_SCALE, _INT16_SCALE - 1)
    return scaled.astype("int16")


def _frames_to_samples(flags, n_samples, fs, hop_length):
    """Expand per-frame decisions to one value per input sample.

    Sample ``i`` lies at ``i / fs`` seconds and therefore belongs to frame
    ``floor(i * 1000 / (fs * hop_length))``.  Using that index (instead of
    repeating every frame ``fs * hop_length // 1000`` times) keeps the result
    aligned, and exactly ``n_samples`` long, even when a frame does not span
    a whole number of samples (e.g. 22050 Hz with 10 ms or 30 ms frames).

    Parameters
    ----------
    flags : array_like of bool
        One decision per frame.
    n_samples : int
        Number of samples of the original signal.
    fs : int
        Sampling frequency of the original signal.
    hop_length : int
        Frame length in milliseconds.

    Returns
    -------
    ndarray of float, shape (n_samples,)
        1.0 where the enclosing frame is voiced, 0.0 elsewhere.
    """
    flags = np.asarray(flags, dtype=bool)
    if n_samples == 0 or flags.size == 0:
        return np.zeros(n_samples)

    frame_idx = (np.arange(n_samples, dtype=np.int64) * 1000) // (fs * hop_length)
    # Samples past the last analysed frame inherit the last decision.
    frame_idx = np.minimum(frame_idx, flags.size - 1).astype(np.intp)
    return flags[frame_idx].astype(float)


def vad(data, fs, fs_vad=16000, hop_length=30, vad_mode=0):
    """Voice activity detection.

    This was implemented for easier use of py-webrtcvad.

    Parameters
    ----------
    data : ndarray
        numpy array of mono (1 ch) speech data.
        1-d or 2-d, if 2-d, shape must be (1, time_length) or (time_length, 1).
        if data type is int, -32768 <= data <= 32767.
        if data type is float, -1 <= data <= 1.
    fs : int
        Sampling frequency of data.
    fs_vad : int, optional
        Sampling frequency for webrtcvad.
        fs_vad must be 8000, 16000, 32000 or 48000.
        Default is 16000.
    hop_length : int, optional
        Step size [milli second].
        hop_length must be 10, 20, or 30.
        Default is 30.
    vad_mode : int, optional
        set vad aggressiveness.
        As vad_mode increases, it becomes more aggressive.
        vad_mode must be 0, 1, 2 or 3.
        Default is 0.

    Returns
    -------
    vact : ndarray
        voice activity. time length of vact is same as input data.
        If 0, it is unvoiced, 1 is voiced.

    Raises
    ------
    ValueError
        If an argument is outside the values listed above, if the dtype is
        neither int nor float, if the samples are out of range, or if the
        data is not mono.
    """
    # check argument
    if fs_vad not in _VALID_FS_VAD:
        raise ValueError("fs_vad must be 8000, 16000, 32000 or 48000.")

    if hop_length not in _VALID_HOP_LENGTH:
        raise ValueError("hop_length must be 10, 20, or 30.")

    if vad_mode not in _VALID_VAD_MODE:
        raise ValueError("vad_mode must be 0, 1, 2 or 3.")

    # check data
    data = np.asarray(data)
    kind = data.dtype.kind
    if kind not in ("i", "f"):
        raise ValueError("data.dtype must be int or float.")

    data = np.atleast_1d(data.squeeze())
    if data.ndim != 1:
        raise ValueError("data must be mono (1 ch).")

    # Nothing to analyse; also avoids max()/min() on an empty array.
    if data.size == 0:
        return np.zeros(0)

    if kind == "i":
        if data.max() > 2 ** 15 - 1 or data.min() < -(2 ** 15):
            raise ValueError(
                "When data.dtype is int, data must be -32768 <= data <= 32767."
            )
        data = data.astype("f") / _INT16_SCALE
    else:
        if np.abs(data).max() > 1:
            raise ValueError(
                "When data.dtype is float, data must be -1.0 <= data <= 1.0."
            )
        data = data.astype("f")

    # resampling
    if fs != fs_vad:
        # Imported here so librosa is only loaded when resampling is needed.
        from librosa.core import resample

        resampled = resample(data, orig_sr=fs, target_sr=fs_vad)
        peak = np.abs(resampled).max()
        if peak > 1.0:
            resampled = resampled * (0.99 / peak)
            warn("Resampling causes data clipping. data was rescaled.")
    else:
        resampled = data

    pcm = _to_int16_pcm(resampled)

    # Cut into non-overlapping frames of `hop` samples, zero-padding the tail.
    hop = fs_vad * hop_length // 1000
    n_frames = pcm.size // hop + 1
    padded = np.pad(pcm, (0, n_frames * hop - pcm.size), "constant", constant_values=0)
    frames = padded.reshape(n_frames, hop)

    # Imported here so that argument validation and the NumPy-only helpers
    # stay usable without the native extension.
    import webrtcvad

    detector = webrtcvad.Vad()
    detector.set_mode(vad_mode)
    flags = np.fromiter(
        (detector.is_speech(chunk.tobytes(), fs_vad) for chunk in frames),
        dtype=bool,
        count=n_frames,
    )

    # smoothing: a frame counts as voiced if it or a direct neighbour is
    # voiced (same as a 3-tap moving average compared against zero).
    voiced = flags.copy()
    voiced[1:] |= flags[:-1]
    voiced[:-1] |= flags[1:]

    return _frames_to_samples(voiced, data.size, fs, hop_length)


# ---- pyvad/effects.py ------------------------------------------------------
# (in the package layout this module obtains `vad` via `from .vad import vad`)

# Silent gaps shorter than this many seconds are bridged between segments.
_MAX_GAP_SEC = 0.1


def _get_edges(vact):
    """Return the (start, end) sample indices of every voiced run.

    Parameters
    ----------
    vact : array_like
        Per-sample voice activity; non-zero means voiced.

    Returns
    -------
    ndarray of int, shape (m, 2)
        ``vact[start:end]`` is voiced for each row; empty input or input
        without voiced samples yields shape (0, 2).
    """
    voiced = np.asarray(vact).astype(bool).reshape(-1)
    # Padding with "unvoiced" on both sides turns leading/trailing runs into
    # ordinary transitions, so transitions always come in (rise, fall) pairs.
    padded = np.concatenate(([False], voiced, [False]))
    transitions = np.flatnonzero(padded[1:] != padded[:-1])
    return transitions.reshape(-1, 2)


def _rms(arr):
    """Return the root mean square of *arr*, computed in float64."""
    arr = np.asarray(arr, dtype=float)
    return np.sqrt(np.mean(arr * arr))


def _drop_silence(waveform, edges, threshold_db):
    """Discard segments whose RMS level is below *threshold_db*.

    Parameters
    ----------
    waveform : ndarray
        The signal the edges refer to; 1-d, or 2-d with a singleton axis.
        Integer data is interpreted as 16-bit PCM.
    edges : ndarray, shape (m, 2)
        (start, end) sample indices of the candidate segments.
    threshold_db : float
        Minimum level in dB relative to full scale (1.0).

    Returns
    -------
    ndarray, shape (k, 2)
        The rows of *edges* whose level is at least *threshold_db*.
    """
    edges = np.asarray(edges)
    if len(edges) == 0:
        return edges

    # Flatten so (1, T) and (T, 1) inputs are sliced along time, as in vad().
    samples = np.asarray(waveform).reshape(-1)
    # Integer PCM is scaled exactly like vad() does, so the threshold means
    # the same thing for int and float input.
    scale = _INT16_SCALE if samples.dtype.kind == "i" else 1.0

    rms = np.array([_rms(samples[s:e] / scale) for s, e in edges])
    # An all-zero segment gives log10(0) = -inf, which is simply dropped.
    with np.errstate(divide="ignore"):
        level_db = 20 * np.log10(rms)

    return edges[level_db >= threshold_db]


def _merge_short_silence(edges, max_samples):
    """Merge consecutive segments separated by fewer than *max_samples*.

    Parameters
    ----------
    edges : ndarray, shape (m, 2)
        (start, end) sample indices, sorted and non-overlapping.
    max_samples : float
        Gaps strictly shorter than this are bridged.

    Returns
    -------
    ndarray, shape (k, 2)
        The merged segments.
    """
    edges = np.asarray(edges)
    if len(edges) == 0:
        return edges

    gaps = edges[1:, 0] - edges[:-1, 1]
    boundary = gaps >= max_samples  # True where two neighbours stay separate
    starts = edges[np.concatenate(([True], boundary)), 0]
    ends = edges[np.concatenate((boundary, [True])), 1]
    return np.column_stack((starts, ends))


def _speech_intervals(data, fs, fs_vad, hop_length, vad_mode, threshold_db, min_dur):
    """Run vad and return the cleaned (start, end) speech segments."""
    vact = vad(data, fs, fs_vad, hop_length, vad_mode)

    edges = _get_edges(vact)
    edges = _merge_short_silence(edges, fs * _MAX_GAP_SEC)
    edges = edges[(edges[:, 1] - edges[:, 0]) > fs * min_dur]
    return _drop_silence(data, edges, threshold_db)


def trim(
    data, fs, fs_vad=16000, hop_length=30, vad_mode=0, threshold_db=-35.0, min_dur=0.2
):
    """
    Trim leading and trailing silence from a speech waveform by using vad.

    Parameters
    ----------
    data : ndarray
        numpy array of mono (1 ch) speech data.
        1-d or 2-d, if 2-d, shape must be (1, time_length) or (time_length, 1).
        if data type is int, -32768 <= data <= 32767.
        if data type is float, -1 <= data <= 1.
    fs : int
        Sampling frequency of data.
    fs_vad : int, optional
        Sampling frequency for webrtcvad.
        fs_vad must be 8000, 16000, 32000 or 48000.
        Default is 16000.
    hop_length : int, optional
        Step size [milli second].
        hop_length must be 10, 20, or 30.
        Default is 30.
    vad_mode : int, optional
        set vad aggressiveness.
        As vad_mode increases, it becomes more aggressive.
        vad_mode must be 0, 1, 2 or 3.
        Default is 0.
    threshold_db : float, optional
        The threshold level (in dB) below reference to consider as silence.
        Default is -35.0.
    min_dur : float, optional
        The minimum duration (in seconds) of each speech segment.
        Default is 0.2.

    Returns
    -------
    (start_index, end_index) : int
        trimmed waveform is data[start_index:end_index]
        If voice activity can't be detected, return 0, 0.
    """
    edges = _speech_intervals(
        data, fs, fs_vad, hop_length, vad_mode, threshold_db, min_dur
    )

    if len(edges) == 0:
        return 0, 0
    return int(edges[0, 0]), int(edges[-1, 1])


def split(
    data, fs, fs_vad=16000, hop_length=30, vad_mode=0, threshold_db=-35.0, min_dur=0.5
):
    """
    Split a speech waveform into non-silent intervals by using vad.

    Parameters
    ----------
    data : ndarray
        numpy array of mono (1 ch) speech data.
        1-d or 2-d, if 2-d, shape must be (1, time_length) or (time_length, 1).
        if data type is int, -32768 <= data <= 32767.
        if data type is float, -1 <= data <= 1.
    fs : int
        Sampling frequency of data.
    fs_vad : int, optional
        Sampling frequency for webrtcvad.
        fs_vad must be 8000, 16000, 32000 or 48000.
        Default is 16000.
    hop_length : int, optional
        Step size [milli second].
        hop_length must be 10, 20, or 30.
        Default is 30.
    vad_mode : int, optional
        Set vad aggressiveness.
        As vad_mode increases, it becomes more aggressive.
        vad_mode must be 0, 1, 2 or 3.
        Default is 0.
    threshold_db : float, optional
        The threshold level (in dB) below reference to consider as silence.
        Default is -35.0.
    min_dur : float, optional
        The minimum duration (in seconds) of each speech segment.
        Default is 0.5.

    Returns
    -------
    edges : np.ndarray, shape=(m, 2)
        `edges[i] == (start_i, end_i)` are the start and end time
        (in samples) of non-silent interval `i`.
    """
    return _speech_intervals(
        data, fs, fs_vad, hop_length, vad_mode, threshold_db, min_dur
    )
" 40 | ] 41 | }, 42 | "metadata": { 43 | "needs_background": "light" 44 | }, 45 | "output_type": "display_data" 46 | } 47 | ], 48 | "source": [ 49 | "name = \"test/voice/arctic_a0007.wav\"\n", 50 | "data, fs = load(name)\n", 51 | "data = np.hstack((data, -data))\n", 52 | "data *=0.95 / np.abs(data).max()\n", 53 | "time = np.linspace(0, len(data)/fs, len(data)) # time axis\n", 54 | "plt.plot(time, data)\n", 55 | "plt.show()" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "# Do VAD (int)" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 3, 68 | "metadata": {}, 69 | "outputs": [ 70 | { 71 | "name": "stdout", 72 | "output_type": "stream", 73 | "text": [ 74 | "CPU times: user 166 ms, sys: 3.9 ms, total: 169 ms\n", 75 | "Wall time: 176 ms\n" 76 | ] 77 | } 78 | ], 79 | "source": [ 80 | "%time vact = vad(data, fs, fs_vad = 16000, hop_length = 30, vad_mode=3)" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": {}, 86 | "source": [ 87 | "# Plot result" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 4, 93 | "metadata": {}, 94 | "outputs": [ 95 | { 96 | "data": { 97 | "image/png": "\n", 98 | "text/plain": [ 99 | "
" 100 | ] 101 | }, 102 | "metadata": { 103 | "needs_background": "light" 104 | }, 105 | "output_type": "display_data" 106 | } 107 | ], 108 | "source": [ 109 | "fig, ax1 = plt.subplots()\n", 110 | "\n", 111 | "ax1.plot(time, data, label='speech waveform')\n", 112 | "ax1.set_xlabel(\"TIME [s]\")\n", 113 | "\n", 114 | "ax2=ax1.twinx()\n", 115 | "ax2.plot(time, vact, color=\"r\", label = 'vad')\n", 116 | "plt.yticks([1] ,['voice'])\n", 117 | "ax2.set_ylim([-0.01, 1.01])\n", 118 | "\n", 119 | "plt.legend()\n", 120 | "plt.show()" 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": {}, 126 | "source": [ 127 | "# trim" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": 5, 133 | "metadata": {}, 134 | "outputs": [ 135 | { 136 | "name": "stdout", 137 | "output_type": "stream", 138 | "text": [ 139 | "CPU times: user 173 ms, sys: 6.07 ms, total: 179 ms\n", 140 | "Wall time: 194 ms\n" 141 | ] 142 | } 143 | ], 144 | "source": [ 145 | "%time edges = trim(data, fs, fs_vad = 16000, hop_length = 30, vad_mode=3)" 146 | ] 147 | }, 148 | { 149 | "cell_type": "markdown", 150 | "metadata": {}, 151 | "source": [ 152 | "# Plot result" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": 6, 158 | "metadata": {}, 159 | "outputs": [ 160 | { 161 | "data": { 162 | "image/png": "\n", 163 | "text/plain": [ 164 | "
" 165 | ] 166 | }, 167 | "metadata": { 168 | "needs_background": "light" 169 | }, 170 | "output_type": "display_data" 171 | } 172 | ], 173 | "source": [ 174 | "trimed = data[edges[0]:edges[1]]\n", 175 | "time = np.linspace(0, len(trimed)/fs, len(trimed)) # time axis\n", 176 | "fig, ax1 = plt.subplots()\n", 177 | "\n", 178 | "ax1.plot(time, trimed, label='speech waveform')\n", 179 | "ax1.set_xlabel(\"TIME [s]\")\n", 180 | "\n", 181 | "plt.show()" 182 | ] 183 | }, 184 | { 185 | "cell_type": "markdown", 186 | "metadata": {}, 187 | "source": [ 188 | "# split" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": 7, 194 | "metadata": {}, 195 | "outputs": [ 196 | { 197 | "name": "stdout", 198 | "output_type": "stream", 199 | "text": [ 200 | "CPU times: user 171 ms, sys: 5.65 ms, total: 177 ms\n", 201 | "Wall time: 208 ms\n" 202 | ] 203 | } 204 | ], 205 | "source": [ 206 | "%time edges = split(data, fs, fs_vad = 8000, hop_length = 10, vad_mode=3)" 207 | ] 208 | }, 209 | { 210 | "cell_type": "markdown", 211 | "metadata": {}, 212 | "source": [ 213 | "# Plot result" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": 8, 219 | "metadata": {}, 220 | "outputs": [ 221 | { 222 | "data": { 223 | "image/png": "\n", 224 | "text/plain": [ 225 | "
" 226 | ] 227 | }, 228 | "metadata": { 229 | "needs_background": "light" 230 | }, 231 | "output_type": "display_data" 232 | }, 233 | { 234 | "data": { 235 | "image/png": "\n", 236 | "text/plain": [ 237 | "
" 238 | ] 239 | }, 240 | "metadata": { 241 | "needs_background": "light" 242 | }, 243 | "output_type": "display_data" 244 | } 245 | ], 246 | "source": [ 247 | "for i, edge in enumerate(edges):\n", 248 | " seg = data[edge[0]:edge[1]]\n", 249 | " time = np.linspace(0, len(seg)/fs, len(seg)) # time axis\n", 250 | " \n", 251 | " fig, ax1 = plt.subplots()\n", 252 | "\n", 253 | " ax1.plot(time, seg, label='speech waveform')\n", 254 | " ax1.set_xlabel(\"TIME [s]\")\n", 255 | "\n", 256 | " plt.show()" 257 | ] 258 | } 259 | ], 260 | "metadata": { 261 | "kernelspec": { 262 | "display_name": "Python 3", 263 | "language": "python", 264 | "name": "python3" 265 | }, 266 | "language_info": { 267 | "codemirror_mode": { 268 | "name": "ipython", 269 | "version": 3 270 | }, 271 | "file_extension": ".py", 272 | "mimetype": "text/x-python", 273 | "name": "python", 274 | "nbconvert_exporter": "python", 275 | "pygments_lexer": "ipython3", 276 | "version": "3.7.9" 277 | } 278 | }, 279 | "nbformat": 4, 280 | "nbformat_minor": 4 281 | } 282 | --------------------------------------------------------------------------------