├── .gitignore
├── README.md
├── jen.py
└── requirements.txt

/.gitignore:
--------------------------------------------------------------------------------
.venv
.env
*.gguf
*.wav
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
This is a simple demo of a voice-driven AI assistant: speech is transcribed to text, passed to a local large language model, and the response is spoken back with text-to-speech.

To get this running, follow the steps below:
1. Clone this repo: `git clone https://github.com/nydasco/jen-ai.git`
2. Within the folder, create a virtual environment: `python3 -m venv .venv`
3. Activate the environment: `source .venv/bin/activate`
4. Install the dependencies: `pip3 install -r requirements.txt`
5. Download the LLM (5.13 GB): `huggingface-cli download TheBloke/OpenHermes-2.5-Mistral-7B-GGUF openhermes-2.5-mistral-7b.Q5_K_M.gguf --local-dir . --local-dir-use-symlinks False`
6. Run the script: `python3 jen.py`
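By default `jen.py` runs the LLM entirely on the CPU (`n_gpu_layers=0`). If your `llama_cpp_python` build has GPU support, you can offload layers by editing the `LlamaCpp` call in `jen.py`. A minimal sketch (`-1` offloads all layers; every other argument is unchanged from the script):

```python
llm = LlamaCpp(model_path=llm_model_file,
               temperature=0.1,
               n_gpu_layers=-1,  # offload all layers to the GPU (0 = CPU-only)
               n_batch=256,
               callback_manager=callback_manager,
               verbose=False)
```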

Note that the first time it runs it will take a while to get started: the Whisper speech-recognition model and the Jenny text-to-speech model also need to be downloaded. They're smaller than the main LLM.
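If you'd like to fetch those models ahead of time, instantiating the same pipelines that `jen.py` uses will download and cache them. A minimal sketch using the model identifiers from the script:

```python
from transformers import pipeline
from TTS.api import TTS

# Creating the objects triggers the downloads; both models are cached locally.
pipeline("automatic-speech-recognition", model="openai/whisper-tiny.en")
TTS("tts_models/en/jenny/jenny")
```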
--------------------------------------------------------------------------------
/jen.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

# general
from typing import Any, Optional
import threading
import queue
import torch

# speech recognition
from transformers import pipeline
from transformers.pipelines.audio_utils import ffmpeg_microphone_live

# text to speech
from TTS.api import TTS
from pydub import AudioSegment
from pydub.playback import play

# large language model
from langchain_community.llms import LlamaCpp
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.prompts import PromptTemplate
from langchain.schema.output_parser import StrOutputParser


# Configuration
llm_model_file = "openhermes-2.5-mistral-7b.Q5_K_M.gguf"  # needs to be downloaded from Hugging Face (see README)
asr_model_id = "openai/whisper-tiny.en"  # will download on first run
tts_model_id = "tts_models/en/jenny/jenny"  # will download on first run

guide = """
You are a smart chatbot named Jenny (or Jen for short).
You are an expert in Data Engineering and Analytics.
You are friendly, and you like to help people.
Your responses should be helpful and informative, and limited to 1 paragraph.
"""

# ChatML prompt format expected by the OpenHermes model
template = """
<|im_start|>system
{guide}<|im_end|>
<|im_start|>user
{question}<|im_end|>
<|im_start|>assistant
"""

# Model instances (initialized in init())
llm: Optional[LlamaCpp] = None
callback_manager: Any = None
transcriber: Any = None
tts: Any = None
audio_queue = queue.Queue()
mic_active = threading.Event()
device = "cuda" if torch.cuda.is_available() else "cpu"

def init():
    """
    Initialize the models.

    Sets up the callback manager and loads the large language model as a
    LlamaCpp instance; the model path, temperature, number of GPU layers,
    batch size, and verbosity can be customized as required. It then
    initializes the speech-recognition pipeline and the text-to-speech model.
    """
    global llm, callback_manager, transcriber, tts

    callback_manager = CallbackManager([CustomCallbackHandler()])

    llm = LlamaCpp(model_path=llm_model_file,
                   temperature=0.1,
                   n_gpu_layers=0,
                   n_batch=256,
                   callback_manager=callback_manager,
                   verbose=False)

    transcriber = pipeline("automatic-speech-recognition",
                           model=asr_model_id,
                           device=device)

    tts = TTS(tts_model_id).to(device)

# Automated Speech Recognition
def disable_mic():
    """Disable the microphone."""
    mic_active.clear()

def enable_mic():
    """Enable the microphone."""
    mic_active.set()

def transcribe_mic(chunk_length_s: float) -> str:
    """
    Transcribe the audio from a microphone.
    Args: chunk_length_s (float): The length of each audio chunk in seconds.
    Returns: str: The transcribed text from the microphone audio.
    """
    mic_active.wait()  # block until playback has finished and the mic is re-enabled

    sampling_rate = transcriber.feature_extractor.sampling_rate
    mic = ffmpeg_microphone_live(sampling_rate=sampling_rate,
                                 chunk_length_s=chunk_length_s,
                                 stream_chunk_s=chunk_length_s)

    result = ""
    for item in transcriber(mic):
        result = item["text"]
        if not item["partial"][0]:  # stop once the chunk is final, not a partial result
            break
    return result.strip()

# Text to Speech
def play_audio():
    """
    Playback thread that plays audio segments from the queue. The microphone
    is disabled during playback so the assistant doesn't transcribe its own voice.
    Args: None
    Returns: None
    """
    while True:
        audio_segment = audio_queue.get()
        disable_mic()
        play(audio_segment)
        audio_queue.task_done()
        enable_mic()

def text_to_speech(text: str):
    """
    Converts the given text to speech and queues the audio for playback.
    Args: text (str): The text to be converted to speech.
    Returns: None
    """
    audio = tts.tts_to_file(text=text,
                            return_audio=True,
                            split_sentences=False)

    sentence = AudioSegment.from_wav(audio)
    audio_queue.put(sentence)

# Large Language Model
def llm_start(question: str):
    """
    Ask the LLM a question.
    Args: question (str): The question to ask the LLM.
    Returns: None
    """
    if not question.strip():  # checks that the question is not just whitespace
        print("\nNo valid question received. LLM will not be invoked.\n")
        return

    prompt = PromptTemplate(template=template,
                            input_variables=["guide", "question"])

    chain = prompt | llm | StrOutputParser()

    chain.invoke({"guide": guide, "question": question}, config={})

class CustomCallbackHandler(StreamingStdOutCallbackHandler):
    """Callback handler for the LLM."""

    def on_llm_new_token(self, token: str, **kwargs: Any) -> None:
        """
        Run on each new LLM token. Concatenate tokens and send each completed
        sentence to text-to-speech.
        Args: token (str): The new token to be processed.
        Returns: None
        """
        self.concatenated_tokens = getattr(self, 'concatenated_tokens', '') + token

        if '.' in token:  # treat a full stop as the end of a sentence
            text_to_speech(self.concatenated_tokens)
            self.concatenated_tokens = ''

def main():
    init()
    enable_mic()

    playback_thread = threading.Thread(target=play_audio, daemon=True)
    playback_thread.start()

    welcome = "Hi! I'm Jen. Feel free to ask me a question."
    print(welcome)

    while True:
        question = transcribe_mic(chunk_length_s=5.0)
        if len(question) > 0:
            print(f"\n{question}\n")
            llm_start(question)
            print("\nCan I help with anything else?\n")

if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
huggingface-hub==0.20.3
langchain==0.1.3
llama_cpp_python==0.2.32
pydub==0.25.1
sentence-transformers==2.2.2
TTS==0.22.0
--------------------------------------------------------------------------------