├── README.md
└── python_vad.py


/README.md:
--------------------------------------------------------------------------------
 1 | # PythonVAD
 2 | 
 3 | Python Voice Activity Detection for Chat Bots
 4 | python_vad.py listens to the microphone and captures voice activity in a sound chunk. My modifications make the app run continuously and use in-memory sound buffers instead of reading from and writing to a file.
 5 | 
 6 | This code is the foundation of a more advanced version that sends sound chunks to Google for speech-to-text conversion.
 7 | 
 8 | YouTube: https://youtu.be/b8dxSCz5JnU
 9 | 
10 | Modification of:
11 | https://github.com/wiseman/py-webrtcvad (MIT Copyright (c) 2016 John Wiseman)
12 | https://github.com/wangshub/python-vad (MIT Copyright (c) 2017 wangshub)
13 | 
14 | 
15 | 
16 | 
17 | 
18 | 


--------------------------------------------------------------------------------
/python_vad.py:
--------------------------------------------------------------------------------
  1 | '''
  2 | 
  3 | #--- Steve Cox --- 1/10/19
  4 | # Copyright (c) Stef van der Struijk
  5 | # License: GNU Lesser General Public License
  6 | 
  7 | # Modified code to play sound from buffer recording
  8 | # Added code to wait until the sound has finished playing so no echo occurs
  9 | 
 10 | # Modification of:
 11 | # https://github.com/wiseman/py-webrtcvad (MIT Copyright (c) 2016 John Wiseman)
 12 | # https://github.com/wangshub/python-vad (MIT Copyright (c) 2017 wangshub)
 13 | 
 14 | Requirements:
 15 | + pyaudio - `pip install pyaudio`
 16 | + py-webrtcvad - `pip install webrtcvad`
 17 | '''
 18 | import webrtcvad
 19 | import collections
 20 | import sys
 21 | import signal
 22 | import pyaudio
 23 | 
 24 | from array import array
 25 | from struct import pack
 26 | import wave
 27 | import time
 28 | 
 29 | FORMAT = pyaudio.paInt16
 30 | CHANNELS = 1
 31 | RATE = 16000
 32 | CHUNK_DURATION_MS = 30       # supports 10, 20 and 30 (ms)
 33 | PADDING_DURATION_MS = 1500   # 1 sec jugement
 34 | CHUNK_SIZE = int(RATE * CHUNK_DURATION_MS / 1000)  # chunk to read
 35 | CHUNK_BYTES = CHUNK_SIZE * 2  # 16bit = 2 bytes, PCM
 36 | NUM_PADDING_CHUNKS = int(PADDING_DURATION_MS / CHUNK_DURATION_MS)
 37 | 
 38 | #--- Steve Cox
 39 | NUM_WINDOW_CHUNKS = int(240 / CHUNK_DURATION_MS)
 40 | #NUM_WINDOW_CHUNKS = int(400 / CHUNK_DURATION_MS)  # 400 ms/ 30ms  ge
 41 | 
 42 | NUM_WINDOW_CHUNKS_END = NUM_WINDOW_CHUNKS * 2
 43 | START_OFFSET = int(NUM_WINDOW_CHUNKS * CHUNK_DURATION_MS * 0.5 * RATE)
 44 | 
 45 | vad = webrtcvad.Vad(1)
 46 | 
 47 | #------ Steve Cox
 48 | # One time Pygame init
 49 | 
 50 | import pygame
 51 | pygame.mixer.pre_init(RATE, -16, CHANNELS, 2048) # setup mixer to avoid sound lag
 52 | pygame.mixer.init()
 53 | pygame.init()
 54 | 
 55 | #-------------------------- 
 56 | 
 57 | pa = pyaudio.PyAudio()
 58 | stream = pa.open(format=FORMAT,
 59 |                  channels=CHANNELS,
 60 |                  rate=RATE,
 61 |                  input=True,
 62 |                  start=False,
 63 |                  # input_device_index=2,
 64 |                  frames_per_buffer=CHUNK_SIZE)
 65 | 
 66 | 
 67 | got_a_sentence = False
 68 | 
 69 | def normalize(snd_data):
 70 |     "Average the volume out"
 71 |     MAXIMUM = 32767  # 16384
 72 |     times = float(MAXIMUM) / max(abs(i) for i in snd_data)
 73 |     r = array('h')
 74 |     for i in snd_data:
 75 |         r.append(int(i * times))
 76 |     return r
 77 | 
 78 | 
 79 | while True:
 80 |     ring_buffer = collections.deque(maxlen=NUM_PADDING_CHUNKS)
 81 |     triggered = False
 82 |     voiced_frames = []
 83 |     ring_buffer_flags = [0] * NUM_WINDOW_CHUNKS
 84 |     ring_buffer_index = 0
 85 | 
 86 |     ring_buffer_flags_end = [0] * NUM_WINDOW_CHUNKS_END
 87 |     ring_buffer_index_end = 0
 88 |     buffer_in = ''
 89 |     # WangS
 90 |     raw_data = array('h')
 91 |     index = 0
 92 |     start_point = 0
 93 |     StartTime = time.time()
 94 |     print("* recording: ")
 95 |     stream.start_stream()
 96 | 
 97 |     while not got_a_sentence:
 98 |         chunk = stream.read(CHUNK_SIZE)
 99 |         # add WangS
100 |         raw_data.extend(array('h', chunk))
101 |         index += CHUNK_SIZE
102 |         TimeUse = time.time() - StartTime
103 | 
104 |         active = vad.is_speech(chunk, RATE)
105 | 
106 |         sys.stdout.write('1' if active else '_')
107 |         ring_buffer_flags[ring_buffer_index] = 1 if active else 0
108 |         ring_buffer_index += 1
109 |         ring_buffer_index %= NUM_WINDOW_CHUNKS
110 | 
111 |         ring_buffer_flags_end[ring_buffer_index_end] = 1 if active else 0
112 |         ring_buffer_index_end += 1
113 |         ring_buffer_index_end %= NUM_WINDOW_CHUNKS_END
114 | 
115 |         # start point detection
116 |         if not triggered:
117 |             ring_buffer.append(chunk)
118 |             num_voiced = sum(ring_buffer_flags)
119 |             if num_voiced > 0.8 * NUM_WINDOW_CHUNKS:
120 |                 sys.stdout.write(' Open ')
121 |                 triggered = True
122 |                 start_point = index - CHUNK_SIZE * 20  # start point
123 |                 ring_buffer.clear()
124 |         # end point detection
125 |         else:
126 |             ring_buffer.append(chunk)
127 |             num_unvoiced = NUM_WINDOW_CHUNKS_END - sum(ring_buffer_flags_end)
128 |             
129 |             if num_unvoiced > 0.90 * NUM_WINDOW_CHUNKS_END or TimeUse > 10:
130 |                 sys.stdout.write(' Close ')
131 |                 triggered = False
132 |                 got_a_sentence = True
133 | 
134 |         sys.stdout.flush()
135 | 
136 |     sys.stdout.write('\n')
137 |     
138 |     stream.stop_stream()
139 |     print("* done recording")
140 |     got_a_sentence = False
141 | 
142 |     # write to file
143 |     raw_data.reverse()
144 |     for index in range(start_point):
145 |         raw_data.pop()
146 |         
147 |     raw_data.reverse()
148 |     raw_data = normalize(raw_data)
149 |     
150 |     #--- Steve Cox
151 |     #--- the wav has a header, we need to strip it off before playing
152 |     wav_data = raw_data[44:len(raw_data)] 
153 |     sound = pygame.mixer.Sound(buffer=wav_data)
154 |     sound.play()
155 |     #--- Wait for the sound to finish playing or we get an echo
156 |     while pygame.mixer.get_busy():
157 |         pass
158 |     
159 |     #data = np.zeros((10, 10), dtype="uint8")
160 |     #zmqWave.sendPlayEvent('zzzz',data)
161 |     
162 | 
163 | stream.close()
164 | 


--------------------------------------------------------------------------------