# Repository layout:
#   ├── README.md
#   └── python_vad.py
#
# ---------------------------------------------------------------------------
# /README.md
# ---------------------------------------------------------------------------
# # PythonVAD
#
# Python Voice Activity Detection for Chat Bots
# python_vad.py listens to the microphone and captures voice activity in a
# sound chunk. My modifications just make the app run continuously and use
# sound buffers instead of reading/writing to a file.
#
# This code is the foundation of a more advanced version that actually sends
# sound chunks to Google for speech-to-text conversion.
#
# YouTube: https://youtu.be/b8dxSCz5JnU
#
# Modification of:
# https://github.com/wiseman/py-webrtcvad (MIT Copyright (c) 2016 John Wiseman)
# https://github.com/wangshub/python-vad (MIT Copyright (c) 2017 wangshub)
#
# ---------------------------------------------------------------------------
# /python_vad.py
# ---------------------------------------------------------------------------
'''

#--- Steve Cox --- 1/10/19
# Copyright (c) Stef van der Struijk
# License: GNU Lesser General Public License

# Modified code to play sound from buffer recording
# Added code to wait till sound is finished playing so no echo occurs

# Modification of:
# https://github.com/wiseman/py-webrtcvad (MIT Copyright (c) 2016 John Wiseman)
# https://github.com/wangshub/python-vad (MIT Copyright (c) 2017 wangshub)

Requirements:
+ pyaudio - `pip install pyaudio`
+ py-webrtcvad - `pip install webrtcvad`
'''
import collections
import signal
import sys
import time
import wave
from array import array
from struct import pack

import pyaudio
import pygame
import webrtcvad

FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 16000
CHUNK_DURATION_MS = 30       # supports 10, 20 and 30 (ms)
PADDING_DURATION_MS = 1500   # 1.5 sec judgement
CHUNK_SIZE = int(RATE * CHUNK_DURATION_MS / 1000)  # samples per read
CHUNK_BYTES = CHUNK_SIZE * 2  # 16bit = 2 bytes, PCM
NUM_PADDING_CHUNKS = int(PADDING_DURATION_MS / CHUNK_DURATION_MS)

#--- Steve Cox
NUM_WINDOW_CHUNKS = int(240 / CHUNK_DURATION_MS)
#NUM_WINDOW_CHUNKS = int(400 / CHUNK_DURATION_MS)  # alternative: 400 ms window

NUM_WINDOW_CHUNKS_END = NUM_WINDOW_CHUNKS * 2
START_OFFSET = int(NUM_WINDOW_CHUNKS * CHUNK_DURATION_MS * 0.5 * RATE)

# Number of chunks kept from just before the start trigger so the beginning
# of the utterance is not clipped.
PRE_ROLL_CHUNKS = 20
# Hard cap on the length of one captured utterance, counted from the moment
# speech is detected.
MAX_SENTENCE_SECONDS = 10

# Aggressiveness mode 1 (webrtcvad accepts 0..3, 3 being the most aggressive
# about filtering out non-speech).
vad = webrtcvad.Vad(1)

#------ Steve Cox
# One time Pygame init
pygame.mixer.pre_init(RATE, -16, CHANNELS, 2048)  # setup mixer to avoid sound lag
pygame.mixer.init()
pygame.init()

#--------------------------

pa = pyaudio.PyAudio()
stream = pa.open(format=FORMAT,
                 channels=CHANNELS,
                 rate=RATE,
                 input=True,
                 start=False,
                 # input_device_index=2,
                 frames_per_buffer=CHUNK_SIZE)


got_a_sentence = False


def normalize(snd_data):
    """Scale 16-bit samples so the loudest sample reaches full scale.

    Args:
        snd_data: sequence of signed 16-bit integer samples.

    Returns:
        A new ``array('h')`` with every sample multiplied by
        ``32767 / peak``. Empty or completely silent input is returned
        as an unscaled copy, since there is no peak to scale against.
    """
    MAXIMUM = 32767  # 16384
    peak = max(abs(sample) for sample in snd_data) if len(snd_data) else 0
    if peak == 0:
        # Nothing to amplify; avoids dividing by zero on pure silence.
        return array('h', snd_data)
    times = float(MAXIMUM) / peak
    return array('h', (int(sample * times) for sample in snd_data))


try:
    while True:
        # Sliding windows of the most recent voiced(1)/unvoiced(0) decisions.
        # They start filled with zeros, so a window only counts as "voiced"
        # once enough real chunks have been classified as speech.
        start_flags = collections.deque([0] * NUM_WINDOW_CHUNKS,
                                        maxlen=NUM_WINDOW_CHUNKS)
        end_flags = collections.deque([0] * NUM_WINDOW_CHUNKS_END,
                                      maxlen=NUM_WINDOW_CHUNKS_END)
        # Only the last PRE_ROLL_CHUNKS chunks are remembered while waiting
        # for speech, so memory stays bounded however long the wait is.
        ring_buffer = collections.deque(maxlen=PRE_ROLL_CHUNKS)
        triggered = False
        raw_data = array('h')
        speech_started_at = None

        print("* recording: ")
        stream.start_stream()

        while not got_a_sentence:
            chunk = stream.read(CHUNK_SIZE)
            active = vad.is_speech(chunk, RATE)

            sys.stdout.write('1' if active else '_')
            flag = 1 if active else 0
            start_flags.append(flag)
            end_flags.append(flag)

            if not triggered:
                # start point detection
                ring_buffer.append(chunk)
                if sum(start_flags) > 0.8 * NUM_WINDOW_CHUNKS:
                    sys.stdout.write(' Open ')
                    triggered = True
                    # The length cap is measured from here, not from when
                    # listening began; otherwise a long idle period would
                    # close the utterance on its very first chunk.
                    speech_started_at = time.time()
                    # Seed the recording with the pre-roll (this includes the
                    # chunk that caused the trigger).
                    for buffered in ring_buffer:
                        raw_data.extend(array('h', buffered))
                    ring_buffer.clear()
            else:
                # end point detection
                raw_data.extend(array('h', chunk))
                num_unvoiced = NUM_WINDOW_CHUNKS_END - sum(end_flags)
                elapsed = time.time() - speech_started_at

                if (num_unvoiced > 0.90 * NUM_WINDOW_CHUNKS_END
                        or elapsed > MAX_SENTENCE_SECONDS):
                    sys.stdout.write(' Close ')
                    triggered = False
                    got_a_sentence = True

            sys.stdout.flush()

        sys.stdout.write('\n')

        # Stop capturing while the clip is played back so the microphone
        # does not pick up the playback.
        stream.stop_stream()
        print("* done recording")
        got_a_sentence = False

        raw_data = normalize(raw_data)

        #--- Steve Cox
        # The buffer holds raw PCM samples straight from the input stream;
        # there is no WAV header to strip, so it is played as-is.
        sound = pygame.mixer.Sound(buffer=raw_data)
        sound.play()
        #--- Wait for the sound to finish playing or we get an echo
        while pygame.mixer.get_busy():
            time.sleep(0.01)  # yield the CPU instead of spinning

        #data = np.zeros((10, 10), dtype="uint8")
        #zmqWave.sendPlayEvent('zzzz',data)
except KeyboardInterrupt:
    # Ctrl+C is the only way out of the loop; fall through to the cleanup.
    sys.stdout.write('\n')
finally:
    stream.close()
    pa.terminate()
    pygame.quit()