├── .gitignore
├── README.md
├── gradio_app.py
├── install.sh
└── requirements.txt


/.gitignore:
--------------------------------------------------------------------------------
mistral-7b-instruct-v0.2.Q4_K_M.gguf


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# personal_llm_assistant

First, create a new conda environment:
```
conda create --name assistant python=3.10
```

Activate the new conda env:
```
conda activate assistant
```

Run the `install.sh` bash script to install the required packages and libraries:
```
chmod +x install.sh
bash install.sh
```

Download the GGUF model to serve:
```
huggingface-cli download TheBloke/Mistral-7B-Instruct-v0.2-GGUF mistral-7b-instruct-v0.2.Q4_K_M.gguf --local-dir ./models/ --local-dir-use-symlinks False
```

Start the LLM engine (depending on your GPU's available RAM, you might need to change the `--n_gpu_layers` parameter value):
```
python3 -m llama_cpp.server --model ./models/mistral-7b-instruct-v0.2.Q4_K_M.gguf --n_gpu_layers -1 --chat_format chatml
```

Finally, in another terminal, run the Gradio app:
```
python gradio_app.py
```


--------------------------------------------------------------------------------
/gradio_app.py:
--------------------------------------------------------------------------------
import gradio as gr
from transformers import pipeline
from transformers import AutoProcessor, BarkModel
import torch
from openai import OpenAI
import numpy as np
from nltk.tokenize import sent_tokenize


WORDS_PER_CHUNK = 25


# Set up the Whisper speech-to-text pipeline
pipe = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-large-v2",
    torch_dtype=torch.float16,
    device="cuda:0"
)

# Set up the Bark text-to-speech processor and model
voice_processor = AutoProcessor.from_pretrained("suno/bark")
voice_model = BarkModel.from_pretrained("suno/bark", torch_dtype=torch.float16).to("cuda:0")

voice_model = voice_model.to_bettertransformer()
voice_preset = "v2/en_speaker_9"


system_prompt = "You are a helpful AI. You must answer the question the user asks briefly."
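
# sent_tokenize() below needs NLTK's "punkt" tokenizer data, which `pip install nltk`
# does not bundle. An optional one-time download guard (a minimal sketch; drop it if
# punkt is already present on the serving machine):
import nltk

try:
    nltk.data.find("tokenizers/punkt")
except LookupError:
    nltk.download("punkt")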


client = OpenAI(base_url="http://localhost:8000/v1", api_key="sk-xxx")  # Placeholder key; replace if your server requires a real one


def transcribe_and_query_llm_voice(audio_file_path):
    # Transcribe the recorded audio with Whisper
    transcription = pipe(audio_file_path)['text']

    response = client.chat.completions.create(
        model="mistral",
        messages=[
            {"role": "system", "content": system_prompt},  # Update this as per your needs
            {"role": "user", "content": transcription}
        ],
    )
    llm_response = response.choices[0].message.content

    sampling_rate = voice_model.generation_config.sample_rate
    silence = np.zeros(int(0.25 * sampling_rate))

    BATCH_SIZE = 12
    model_input = sent_tokenize(llm_response)

    pieces = []
    for i in range(0, len(model_input), BATCH_SIZE):
        # i already advances in steps of BATCH_SIZE, so slice directly
        inputs = model_input[i:i + BATCH_SIZE]

        if len(inputs) != 0:
            inputs = voice_processor(inputs, voice_preset=voice_preset)

            speech_output, output_lengths = voice_model.generate(**inputs.to("cuda:0"), return_output_lengths=True, min_eos_p=0.2)

            speech_output = [output[:length].cpu().numpy() for (output, length) in zip(speech_output, output_lengths)]

            pieces += [*speech_output, silence.copy()]

    whole_output = np.concatenate(pieces)

    audio_output = (sampling_rate, whole_output)

    return llm_response, audio_output


def transcribe_and_query_llm_text(text_input):
    transcription = text_input

    response = client.chat.completions.create(
        model="mistral",
        messages=[
            {"role": "system", "content": system_prompt},  # Update this as per your needs
            {"role": "user", "content": transcription + "\n Answer briefly."}
        ],
    )
    llm_response = response.choices[0].message.content

    sampling_rate = voice_model.generation_config.sample_rate
    silence = np.zeros(int(0.25 * sampling_rate))

    BATCH_SIZE = 12
    model_input = sent_tokenize(llm_response)

    pieces = []
    for i in range(0, len(model_input), BATCH_SIZE):
        # i already advances in steps of BATCH_SIZE, so slice directly
        inputs = model_input[i:i + BATCH_SIZE]

        if len(inputs) != 0:
            inputs = voice_processor(inputs, voice_preset=voice_preset)

            speech_output, output_lengths = voice_model.generate(**inputs.to("cuda:0"), return_output_lengths=True, min_eos_p=0.2)

            speech_output = [output[:length].cpu().numpy() for (output, length) in zip(speech_output, output_lengths)]

            pieces += [*speech_output, silence.copy()]

    whole_output = np.concatenate(pieces)

    audio_output = (sampling_rate, whole_output)

    return llm_response, audio_output


with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(label="Type your request", placeholder="Type here or use the microphone...")
            audio_input = gr.Audio(sources=["microphone"], type="filepath", label="Or record your speech")
        with gr.Column():
            output_text = gr.Textbox(label="LLM Response")
            output_audio = gr.Audio(label="LLM Response as Speech", type="numpy")

    submit_btn_text = gr.Button("Submit Text")
    submit_btn_voice = gr.Button("Submit Voice")

    submit_btn_voice.click(fn=transcribe_and_query_llm_voice, inputs=[audio_input], outputs=[output_text, output_audio])
    submit_btn_text.click(fn=transcribe_and_query_llm_text, inputs=[text_input],
                          outputs=[output_text, output_audio])
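
    # Each handler returns (text, (sample_rate, waveform)); gr.Audio(type="numpy")
    # accepts that (rate, ndarray) tuple directly. As an optional convenience,
    # pressing Enter in the textbox can trigger the same text handler:
    text_input.submit(fn=transcribe_and_query_llm_text, inputs=[text_input],
                      outputs=[output_text, output_audio])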

demo.launch(ssl_verify=False,
            share=False,
            debug=False)


--------------------------------------------------------------------------------
/install.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# Set environment variables and install the llama-cpp-python package with GPU support
export CMAKE_ARGS="-DLLAMA_CUBLAS=on"
export FORCE_CMAKE=1
pip install --upgrade --force-reinstall llama-cpp-python --no-cache-dir

# Install standard dependencies from requirements.txt
pip install -r requirements.txt


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
llama-cpp-python[server]
gradio
openai
huggingface_hub[cli]
torch
transformers
nltk
optimum
--------------------------------------------------------------------------------