├── .gitignore
├── .streamlit
│   └── config.toml
├── README.md
├── brains.py
├── demo.png
├── interface.py
├── requirements.txt
├── session_manager.py
├── speech_module
│   ├── inference.py
│   ├── stt_model.py
│   ├── transcription.py
│   └── tts_model.py
├── system_prompt.txt
└── wake_words.txt

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Directories
__pycache__/
speech_module/__pycache__/
.venv/
.vscode/

models/
saved_audio/

# Files
.env
.DS_Store

--------------------------------------------------------------------------------
/.streamlit/config.toml:
--------------------------------------------------------------------------------
[theme]

# The preset Streamlit theme that your custom theme inherits from. One of "light" or "dark".
# base = 'light'

# Primary accent color for interactive elements
# primaryColor = ''

# Background color for the main content area
# backgroundColor = ''

# Background color for sidebar and most interactive widgets
# secondaryBackgroundColor = ''

# Color used for almost all text
# textColor = ''

# Font family for all text in the app, except code blocks
# Accepted values (serif | sans serif | monospace)
# Default: "sans serif"
font = "sans serif"

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# LLM-based Voice Assistant
This is an AI voice assistant based on Large Language Models. A user can interact with the voice assistant in natural language, currently English.

The implementation brings several deep learning models together:
- Large Language Model (GPT-4 or Alpaca, selectable)
- Speech-To-Text model (Wav2Vec2-Large)
- Text-To-Speech model (Microsoft SpeechT5)

The speech module is connected to the local microphone and produces a live transcription via a VAD (voice activity detection) process. When a wake word is detected, the transcription is sent to the chosen LLM for processing.

Once the LLM generates a response, the speech module synthesizes speech output with the TTS model and saves the resulting audio file.

The user interface is built with [Streamlit](https://docs.streamlit.io) and provides a familiar chat-like experience.

# Demo
![image](demo.png)

# Installation
Install the project dependencies:
```
pip install -r requirements.txt
```

If using GPT models, create a `.env` file with environment variables for `OPENAI_API_KEY` and `OPENAI_API_BASE`.
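For reference, a minimal `.env` might look like the snippet below. Both values are placeholders; the endpoint format assumes an Azure OpenAI deployment, since `brains.py` sets `openai.api_type = "azure"`.

```
OPENAI_API_KEY=<your-azure-openai-key>
OPENAI_API_BASE=https://<your-resource-name>.openai.azure.com/
```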
--------------------------------------------------------------------------------
/brains.py:
--------------------------------------------------------------------------------
import json
import os
import openai

from dotenv import load_dotenv

def chatgpt(content, model="gpt-4-32k-deployment"):
    # Load API credentials from the local .env file.
    load_dotenv(".env")

    # Configure the OpenAI client for an Azure OpenAI deployment.
    openai.api_type = "azure"
    openai.api_base = os.environ.get("OPENAI_API_BASE")
    openai.api_version = "2023-03-15-preview"
    openai.api_key = os.environ.get("OPENAI_API_KEY")

    # `content` is the conversation in the OpenAI chat messages format;
    # `model` names the Azure deployment to query.
    response = openai.ChatCompletion.create(
        engine=model,
        messages=content
    )

    # Return only the assistant's reply text.
    output = response['choices'][0]['message']['content']
    return output

--------------------------------------------------------------------------------
/demo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/avsrma/LLM-based-AI-Assistant/00d38fe3f5449a093aa2c0613dc214884272c4ed/demo.png

--------------------------------------------------------------------------------
/interface.py:
--------------------------------------------------------------------------------

import base64
import logging
import sys

from brains import chatgpt
from session_manager import update_conversation, fix_typos_in_wake_word, is_user_talking_to_me
from speech_module.transcription import LiveTranscription

import os

import streamlit as st
from streamlit_chat import message

from llama_cpp import Llama

from speech_module.tts_model import TextToSpeechModel

def autoplay_audio(file_path="speech.wav", idx=0):
    # Read the synthesized speech file from disk.
    print("Playing audio file: ", file_path)
    with open(file_path, "rb") as binary_audio:
        audio_bytes = binary_audio.read()

    # Base64-encode the audio so it can be embedded directly in an HTML tag.
    audio_base64 = base64.b64encode(audio_bytes).decode('utf-8')
    audio_tag = f'
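The dump cuts off mid-assignment above, so the remainder of `autoplay_audio` is not visible here. As a point of reference only, a common Streamlit pattern for this kind of helper is to inline the base64 audio into an HTML `<audio>` element and render it with `st.markdown`. The sketch below is a hypothetical reconstruction under that assumption, not the author's actual code.

```python
import base64

import streamlit as st


def autoplay_audio_sketch(file_path: str = "speech.wav") -> None:
    """Hypothetical sketch: embed a WAV file as an autoplaying HTML <audio> tag."""
    with open(file_path, "rb") as binary_audio:
        audio_bytes = binary_audio.read()

    # Inline the audio as a base64 data URI so the browser can play it
    # without requesting a separate file.
    audio_base64 = base64.b64encode(audio_bytes).decode("utf-8")
    audio_tag = (
        '<audio autoplay="true">'
        f'<source src="data:audio/wav;base64,{audio_base64}" type="audio/wav">'
        "</audio>"
    )

    # Streamlit only renders raw HTML when unsafe_allow_html=True.
    st.markdown(audio_tag, unsafe_allow_html=True)
```

In an app like this one, such a helper would presumably be called right after the TTS model writes the response audio, so the reply is played back as soon as the chat message appears.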