├── .gitignore
├── README.md
├── jen.py
└── requirements.txt

/.gitignore:
--------------------------------------------------------------------------------
.venv
.env
*.gguf
*.wav
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
This is a simple demo of a voice-driven AI assistant: speech is transcribed to text, passed to a local large language model, and the response is spoken back with text-to-speech.

To get this running, follow the steps below:
1. Clone this repo: `git clone https://github.com/nydasco/jen-ai.git`
2. Within the folder, create a virtual environment: `python3 -m venv .venv`
3. Activate the environment: `source .venv/bin/activate`
4. Install the dependencies: `pip3 install -r requirements.txt`
5. Download the LLM (5.13 GB): `huggingface-cli download TheBloke/OpenHermes-2.5-Mistral-7B-GGUF openhermes-2.5-mistral-7b.Q5_K_M.gguf --local-dir . --local-dir-use-symlinks False`
6. Run the script: `python3 jen.py`
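By default `jen.py` runs the LLM entirely on the CPU (`n_gpu_layers=0`). If your `llama_cpp_python` build has GPU support, you can offload layers by editing the `LlamaCpp` call in `jen.py`. A minimal sketch (`-1` offloads all layers; every other argument is unchanged from the script):

```python
llm = LlamaCpp(model_path=llm_model_file,
               temperature=0.1,
               n_gpu_layers=-1,  # offload all layers to the GPU (0 = CPU-only)
               n_batch=256,
               callback_manager=callback_manager,
               verbose=False)
```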

Note that the first time it runs it will take a while to get started: the Whisper speech-recognition model and the Jenny text-to-speech model also need to be downloaded. They're smaller than the main LLM.
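If you'd like to fetch those models ahead of time, instantiating the same pipelines that `jen.py` uses will download and cache them. A minimal sketch using the model identifiers from the script:

```python
from transformers import pipeline
from TTS.api import TTS

# Creating the objects triggers the downloads; both models are cached locally.
pipeline("automatic-speech-recognition", model="openai/whisper-tiny.en")
TTS("tts_models/en/jenny/jenny")
```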
--------------------------------------------------------------------------------
/jen.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

# general
from typing import Any, Optional
import threading
import queue
import torch

# speech recognition
from transformers import pipeline
from transformers.pipelines.audio_utils import ffmpeg_microphone_live

# text to speech
from TTS.api import TTS
from pydub import AudioSegment
from pydub.playback import play

# large language model
from langchain_community.llms import LlamaCpp
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.prompts import PromptTemplate
from langchain.schema.output_parser import StrOutputParser


# Configuration
llm_model_file = "openhermes-2.5-mistral-7b.Q5_K_M.gguf"  # needs to be downloaded from Hugging Face (see README)
asr_model_id = "openai/whisper-tiny.en"  # will download on first run
tts_model_id = "tts_models/en/jenny/jenny"  # will download on first run

guide = """
You are a smart chatbot named Jenny (or Jen for short).
You are an expert in Data Engineering and Analytics.
You are friendly, and you like to help people.
Your responses should be helpful and informative, and limited to 1 paragraph.
"""

# ChatML prompt format expected by the OpenHermes model
template = """
<|im_start|>system
{guide}<|im_end|>
<|im_start|>user
{question}<|im_end|>
<|im_start|>assistant
"""

# Model instances (initialized in init())
llm: Optional[LlamaCpp] = None
callback_manager: Any = None
transcriber: Any = None
tts: Any = None
audio_queue = queue.Queue()
mic_active = threading.Event()
device = "cuda" if torch.cuda.is_available() else "cpu"

def init():
    """
    Initialize the models.

    Sets up the callback manager and loads the large language model as a
    LlamaCpp instance; the model path, temperature, number of GPU layers,
    batch size, and verbosity can be customized as required. It then
    initializes the speech-recognition pipeline and the text-to-speech model.
    """
    global llm, callback_manager, transcriber, tts

    callback_manager = CallbackManager([CustomCallbackHandler()])

    llm = LlamaCpp(model_path=llm_model_file,
                   temperature=0.1,
                   n_gpu_layers=0,
                   n_batch=256,
                   callback_manager=callback_manager,
                   verbose=False)

    transcriber = pipeline("automatic-speech-recognition",
                           model=asr_model_id,
                           device=device)

    tts = TTS(tts_model_id).to(device)

# Automated Speech Recognition
def disable_mic():
    """Disable the microphone."""
    mic_active.clear()

def enable_mic():
    """Enable the microphone."""
    mic_active.set()

def transcribe_mic(chunk_length_s: float) -> str:
    """
    Transcribe the audio from a microphone.
    Args: chunk_length_s (float): The length of each audio chunk in seconds.
    Returns: str: The transcribed text from the microphone audio.
    """
    mic_active.wait()  # block until playback has finished and the mic is re-enabled

    sampling_rate = transcriber.feature_extractor.sampling_rate
    mic = ffmpeg_microphone_live(sampling_rate=sampling_rate,
                                 chunk_length_s=chunk_length_s,
                                 stream_chunk_s=chunk_length_s)

    result = ""
    for item in transcriber(mic):
        result = item["text"]
        if not item["partial"][0]:  # stop once the chunk is final, not a partial result
            break
    return result.strip()

# Text to Speech
def play_audio():
    """
    Playback thread that plays audio segments from the queue. The microphone
    is disabled during playback so the assistant doesn't transcribe its own voice.
    Args: None
    Returns: None
    """
    while True:
        audio_segment = audio_queue.get()
        disable_mic()
        play(audio_segment)
        audio_queue.task_done()
        enable_mic()

def text_to_speech(text: str):
    """
    Converts the given text to speech and queues the audio for playback.
    Args: text (str): The text to be converted to speech.
    Returns: None
    """
    audio = tts.tts_to_file(text=text,
                            return_audio=True,
                            split_sentences=False)

    sentence = AudioSegment.from_wav(audio)
    audio_queue.put(sentence)

# Large Language Model
def llm_start(question: str):
    """
    Ask the LLM a question.
    Args: question (str): The question to ask the LLM.
    Returns: None
    """
    if not question.strip():  # checks that the question is not just whitespace
        print("\nNo valid question received. LLM will not be invoked.\n")
        return

    prompt = PromptTemplate(template=template,
                            input_variables=["guide", "question"])

    chain = prompt | llm | StrOutputParser()

    chain.invoke({"guide": guide, "question": question}, config={})

class CustomCallbackHandler(StreamingStdOutCallbackHandler):
    """Callback handler for the LLM."""

    def on_llm_new_token(self, token: str, **kwargs: Any) -> None:
        """
        Run on each new LLM token. Concatenate tokens and send each completed
        sentence to text-to-speech.
        Args: token (str): The new token to be processed.
        Returns: None
        """
        self.concatenated_tokens = getattr(self, 'concatenated_tokens', '') + token

        if '.' in token:  # treat a full stop as the end of a sentence
            text_to_speech(self.concatenated_tokens)
            self.concatenated_tokens = ''

def main():
    init()
    enable_mic()

    playback_thread = threading.Thread(target=play_audio, daemon=True)
    playback_thread.start()

    welcome = "Hi! I'm Jen. Feel free to ask me a question."
    print(welcome)

    while True:
        question = transcribe_mic(chunk_length_s=5.0)
        if len(question) > 0:
            print(f"\n{question}\n")
            llm_start(question)
            print("\nCan I help with anything else?\n")

if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
huggingface-hub==0.20.3
langchain==0.1.3
llama_cpp_python==0.2.32
pydub==0.25.1
sentence-transformers==2.2.2
TTS==0.22.0
--------------------------------------------------------------------------------