├── .gitignore
├── .streamlit
│   └── config.toml
├── README.md
├── brains.py
├── demo.png
├── interface.py
├── requirements.txt
├── session_manager.py
├── speech_module
│   ├── inference.py
│   ├── stt_model.py
│   ├── transcription.py
│   └── tts_model.py
├── system_prompt.txt
└── wake_words.txt

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Directories
__pycache__/
speech_module/__pycache__/
.venv/
.vscode/

models/
saved_audio/

# Files
.env
.DS_Store

--------------------------------------------------------------------------------
/.streamlit/config.toml:
--------------------------------------------------------------------------------
[theme]

# The preset Streamlit theme that your custom theme inherits from. One of "light" or "dark".
# base = 'light'

# Primary accent color for interactive elements
# primaryColor = ''

# Background color for the main content area
# backgroundColor = ''

# Background color for sidebar and most interactive widgets
# secondaryBackgroundColor = ''

# Color used for almost all text
# textColor = ''

# Font family for all text in the app, except code blocks
# Accepted values (serif | sans serif | monospace)
# Default: "sans serif"
font = "sans serif"

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# LLM-based Voice Assistant
This is an AI voice assistant based on Large Language Models. A user can interact with the voice assistant in natural language, currently English.

The implementation brings several deep learning models together:
- Large Language Model (GPT-4 or Alpaca, selectable)
- Speech-To-Text model (Wav2Vec2-Large)
- Text-To-Speech model (Microsoft SpeechT5)

The speech module is connected to the local microphone and produces a live transcription via a VAD (voice activity detection) process. When a wake word is detected, the transcription is sent to the chosen LLM for processing.

Once the LLM generates a response, the speech module synthesizes speech output with the TTS model and saves the resulting audio file.

The user interface is built with [Streamlit](https://docs.streamlit.io) and provides a familiar chat-like experience.

# Demo
![image](demo.png)

# Installation
Install the project dependencies:
```
pip install -r requirements.txt
```

If using GPT models, create a `.env` file with environment variables for `OPENAI_API_KEY` and `OPENAI_API_BASE`.
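For reference, a minimal `.env` might look like the snippet below. Both values are placeholders; the endpoint format assumes an Azure OpenAI deployment, since `brains.py` sets `openai.api_type = "azure"`.

```
OPENAI_API_KEY=<your-azure-openai-key>
OPENAI_API_BASE=https://<your-resource-name>.openai.azure.com/
```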
--------------------------------------------------------------------------------
/brains.py:
--------------------------------------------------------------------------------
import json
import os
import openai

from dotenv import load_dotenv

def chatgpt(content, model="gpt-4-32k-deployment"):
    # Load API credentials from the local .env file.
    load_dotenv(".env")

    # Configure the OpenAI client for an Azure OpenAI deployment.
    openai.api_type = "azure"
    openai.api_base = os.environ.get("OPENAI_API_BASE")
    openai.api_version = "2023-03-15-preview"
    openai.api_key = os.environ.get("OPENAI_API_KEY")

    # `content` is the conversation in the OpenAI chat messages format;
    # `model` names the Azure deployment to query.
    response = openai.ChatCompletion.create(
        engine=model,
        messages=content
    )

    # Return only the assistant's reply text.
    output = response['choices'][0]['message']['content']
    return output

--------------------------------------------------------------------------------
/demo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/avsrma/LLM-based-AI-Assistant/00d38fe3f5449a093aa2c0613dc214884272c4ed/demo.png

--------------------------------------------------------------------------------
/interface.py:
--------------------------------------------------------------------------------

import base64
import logging
import sys

from brains import chatgpt
from session_manager import update_conversation, fix_typos_in_wake_word, is_user_talking_to_me
from speech_module.transcription import LiveTranscription

import os

import streamlit as st
from streamlit_chat import message

from llama_cpp import Llama

from speech_module.tts_model import TextToSpeechModel

def autoplay_audio(file_path="speech.wav", idx=0):
    # Read the synthesized speech file from disk.
    print("Playing audio file: ", file_path)
    with open(file_path, "rb") as binary_audio:
        audio_bytes = binary_audio.read()

    # Base64-encode the audio so it can be embedded directly in an HTML tag.
    audio_base64 = base64.b64encode(audio_bytes).decode('utf-8')
    audio_tag = f'
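The dump cuts off mid-assignment above, so the remainder of `autoplay_audio` is not visible here. As a point of reference only, a common Streamlit pattern for this kind of helper is to inline the base64 audio into an HTML `<audio>` element and render it with `st.markdown`. The sketch below is a hypothetical reconstruction under that assumption, not the author's actual code.

```python
import base64

import streamlit as st


def autoplay_audio_sketch(file_path: str = "speech.wav") -> None:
    """Hypothetical sketch: embed a WAV file as an autoplaying HTML <audio> tag."""
    with open(file_path, "rb") as binary_audio:
        audio_bytes = binary_audio.read()

    # Inline the audio as a base64 data URI so the browser can play it
    # without requesting a separate file.
    audio_base64 = base64.b64encode(audio_bytes).decode("utf-8")
    audio_tag = (
        '<audio autoplay="true">'
        f'<source src="data:audio/wav;base64,{audio_base64}" type="audio/wav">'
        "</audio>"
    )

    # Streamlit only renders raw HTML when unsafe_allow_html=True.
    st.markdown(audio_tag, unsafe_allow_html=True)
```

In an app like this one, such a helper would presumably be called right after the TTS model writes the response audio, so the reply is played back as soon as the chat message appears.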