├── .gitignore
├── README.md
├── gradio_app.py
├── install.sh
└── requirements.txt


/.gitignore:
--------------------------------------------------------------------------------
mistral-7b-instruct-v0.2.Q4_K_M.gguf


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# personal_llm_assistant

First, create a new conda environment:
```
conda create --name assistant python=3.10
```

Activate the new conda env:
```
conda activate assistant
```

Run the `install.sh` bash script to install the required packages and libraries:
```
chmod +x install.sh
bash install.sh
```

Download the GGUF model to serve:
```
huggingface-cli download TheBloke/Mistral-7B-Instruct-v0.2-GGUF mistral-7b-instruct-v0.2.Q4_K_M.gguf --local-dir ./models/ --local-dir-use-symlinks False
```

Start the LLM engine (depending on your GPU's available RAM, you might need to change the `--n_gpu_layers` parameter value):
```
python3 -m llama_cpp.server --model ./models/mistral-7b-instruct-v0.2.Q4_K_M.gguf --n_gpu_layers -1 --chat_format chatml
```

Finally, in another terminal, run the Gradio app:
```
python gradio_app.py
```


--------------------------------------------------------------------------------
/gradio_app.py:
--------------------------------------------------------------------------------
import gradio as gr
from transformers import pipeline
from transformers import AutoProcessor, BarkModel
import torch
from openai import OpenAI
import numpy as np
from nltk.tokenize import sent_tokenize


WORDS_PER_CHUNK = 25


# Set up the Whisper speech-to-text pipeline
pipe = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-large-v2",
    torch_dtype=torch.float16,
    device="cuda:0"
)

# Set up the Bark text-to-speech processor and model
voice_processor = AutoProcessor.from_pretrained("suno/bark")
voice_model = BarkModel.from_pretrained("suno/bark", torch_dtype=torch.float16).to("cuda:0")

voice_model = voice_model.to_bettertransformer()
voice_preset = "v2/en_speaker_9"


system_prompt = "You are a helpful AI. You must answer the question the user asks briefly."
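
# sent_tokenize() below needs NLTK's "punkt" tokenizer data, which `pip install nltk`
# does not bundle. An optional one-time download guard (a minimal sketch; drop it if
# punkt is already present on the serving machine):
import nltk

try:
    nltk.data.find("tokenizers/punkt")
except LookupError:
    nltk.download("punkt")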


client = OpenAI(base_url="http://localhost:8000/v1", api_key="sk-xxx")  # Placeholder key; replace if your server requires a real one


def transcribe_and_query_llm_voice(audio_file_path):
    # Transcribe the recorded audio with Whisper
    transcription = pipe(audio_file_path)['text']

    response = client.chat.completions.create(
        model="mistral",
        messages=[
            {"role": "system", "content": system_prompt},  # Update this as per your needs
            {"role": "user", "content": transcription}
        ],
    )
    llm_response = response.choices[0].message.content

    sampling_rate = voice_model.generation_config.sample_rate
    silence = np.zeros(int(0.25 * sampling_rate))

    BATCH_SIZE = 12
    model_input = sent_tokenize(llm_response)

    pieces = []
    for i in range(0, len(model_input), BATCH_SIZE):
        # i already advances in steps of BATCH_SIZE, so slice directly
        inputs = model_input[i:i + BATCH_SIZE]

        if len(inputs) != 0:
            inputs = voice_processor(inputs, voice_preset=voice_preset)

            speech_output, output_lengths = voice_model.generate(**inputs.to("cuda:0"), return_output_lengths=True, min_eos_p=0.2)

            speech_output = [output[:length].cpu().numpy() for (output, length) in zip(speech_output, output_lengths)]

            pieces += [*speech_output, silence.copy()]

    whole_output = np.concatenate(pieces)

    audio_output = (sampling_rate, whole_output)

    return llm_response, audio_output


def transcribe_and_query_llm_text(text_input):
    transcription = text_input

    response = client.chat.completions.create(
        model="mistral",
        messages=[
            {"role": "system", "content": system_prompt},  # Update this as per your needs
            {"role": "user", "content": transcription + "\n Answer briefly."}
        ],
    )
    llm_response = response.choices[0].message.content

    sampling_rate = voice_model.generation_config.sample_rate
    silence = np.zeros(int(0.25 * sampling_rate))

    BATCH_SIZE = 12
    model_input = sent_tokenize(llm_response)

    pieces = []
    for i in range(0, len(model_input), BATCH_SIZE):
        # i already advances in steps of BATCH_SIZE, so slice directly
        inputs = model_input[i:i + BATCH_SIZE]

        if len(inputs) != 0:
            inputs = voice_processor(inputs, voice_preset=voice_preset)

            speech_output, output_lengths = voice_model.generate(**inputs.to("cuda:0"), return_output_lengths=True, min_eos_p=0.2)

            speech_output = [output[:length].cpu().numpy() for (output, length) in zip(speech_output, output_lengths)]

            pieces += [*speech_output, silence.copy()]

    whole_output = np.concatenate(pieces)

    audio_output = (sampling_rate, whole_output)

    return llm_response, audio_output


with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(label="Type your request", placeholder="Type here or use the microphone...")
            audio_input = gr.Audio(sources=["microphone"], type="filepath", label="Or record your speech")
        with gr.Column():
            output_text = gr.Textbox(label="LLM Response")
            output_audio = gr.Audio(label="LLM Response as Speech", type="numpy")

    submit_btn_text = gr.Button("Submit Text")
    submit_btn_voice = gr.Button("Submit Voice")

    submit_btn_voice.click(fn=transcribe_and_query_llm_voice, inputs=[audio_input], outputs=[output_text, output_audio])
    submit_btn_text.click(fn=transcribe_and_query_llm_text, inputs=[text_input],
                          outputs=[output_text, output_audio])
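
    # Each handler returns (text, (sample_rate, waveform)); gr.Audio(type="numpy")
    # accepts that (rate, ndarray) tuple directly. As an optional convenience,
    # pressing Enter in the textbox can trigger the same text handler:
    text_input.submit(fn=transcribe_and_query_llm_text, inputs=[text_input],
                      outputs=[output_text, output_audio])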

demo.launch(ssl_verify=False,
            share=False,
            debug=False)


--------------------------------------------------------------------------------
/install.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# Set environment variables and install the llama-cpp-python package with GPU support
export CMAKE_ARGS="-DLLAMA_CUBLAS=on"
export FORCE_CMAKE=1
pip install --upgrade --force-reinstall llama-cpp-python --no-cache-dir

# Install standard dependencies from requirements.txt
pip install -r requirements.txt


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
llama-cpp-python[server]
gradio
openai
huggingface_hub[cli]
torch
transformers
nltk
optimum
--------------------------------------------------------------------------------