├── .python-version ├── screenshot.png ├── .gitignore ├── pyproject.toml ├── README.md └── src └── orpheus-chat-webui ├── decoder.py └── __main__.py /.python-version: -------------------------------------------------------------------------------- 1 | 3.12 2 | -------------------------------------------------------------------------------- /screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PkmX/orpheus-chat-webui/HEAD/screenshot.png -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Python-generated files 2 | __pycache__/ 3 | *.py[oc] 4 | build/ 5 | dist/ 6 | wheels/ 7 | *.egg-info 8 | 9 | # Virtual environments 10 | .venv 11 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "orpheus-chat-webui" 3 | version = "0.0.0" 4 | description = "Orpheus Chat WebUI" 5 | readme = "README.md" 6 | requires-python = ">=3.12" 7 | dependencies = [ 8 | "fastrtc[stt,tts,vad]==0.0.16", 9 | "openai==1.67.0", 10 | "snac==1.2.1", 11 | ] 12 | 13 | [dependency-groups] 14 | dev = [ 15 | "black>=25.1.0", 16 | ] 17 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Orpheus Chat WebUI 2 | 3 | A simple WebUI to chat with [Orpheus TTS](https://github.com/canopyai/Orpheus-TTS) via WebRTC. 4 | 5 | ![Screenshot](screenshot.png) 6 | 7 | ## Features 8 | 9 | * Speech-to-text (STT) with `distil-whisper` 10 | * Bring your own LLM to generate responses via OpenAI-compatible endpoints 11 | * Text-to-speech with natural intonation and emotion via `Orpheus` 12 | * Serve `Orpheus` with your favorite inference stack! 13 | * Silero VAD for pause detection and turn-taking logic 14 | * Gradio WebUI with real-time audio streaming via WebRTC 15 | 16 | ## Running 17 | 18 | ### Set up LLM endpoint 19 | 20 | Chat with your favorite LLM via OpenAI-compatible endpoints. 21 | You can use any inference provider that supports the OpenAI API, or host your own with llama.cpp, Ollama, vLLM, etc. 22 | 23 | `llama.cpp` example: 24 | 25 | ```bash 26 | $ llama-server --port 11434 --model gemma-3-12b-it-Q8_0.gguf 27 | $ export OPENAI_BASE_URL=http://localhost:11434/v1/ 28 | $ export OPENAI_API_KEY=dummy 29 | $ export OPENAI_MODEL=model 30 | ``` 31 | 32 | ### Set up Orpheus TTS model endpoint 33 | 34 | As Orpheus-3B is just a fine-tune of Llama 3.2 3B, you can easily serve it with your favorite inference stack. 35 | 36 | `llama.cpp` example: 37 | 38 | ```bash 39 | $ llama-server --port 8080 --model orpheus-3b-0.1-ft-q8_0.gguf 40 | $ export ORPHEUS_BASE_URL=http://localhost:8080/v1/ 41 | $ export ORPHEUS_API_KEY=dummy 42 | ``` 43 | 44 | ### Set up token for HF TURN server (optional) 45 | 46 | ```bash 47 | # Provide an HF token if you need a TURN server for WebRTC to traverse NATs. 48 | # See: https://fastrtc.org/deployment/#community-server 49 | $ export HF_TURN_TOKEN=hf-******* 50 | ``` 51 | 52 | ### Launch Web UI 53 | 54 | ```bash 55 | $ uv run python -m src.orpheus-chat-webui 56 | ``` 57 | 58 | By default, you should be able to access it at `http://127.0.0.1:7860`.
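If the WebUI starts but replies never come back, you can sanity-check the OpenAI-compatible endpoint directly. Below is a minimal sketch (not part of the repo) that uses the same `openai` client and environment variables that `__main__.py` reads; the `ping` message and the fallback defaults are just placeholders:

```python
# Quick check that the OPENAI_* endpoint answers before (re)starting the WebUI.
import os

import openai

client = openai.OpenAI(
    base_url=os.getenv("OPENAI_BASE_URL", "http://localhost:11434/v1/"),
    api_key=os.getenv("OPENAI_API_KEY", "dummy"),
)
reply = client.chat.completions.create(
    model=os.getenv("OPENAI_MODEL", "model"),
    messages=[{"role": "user", "content": "ping"}],
)
print(reply.choices[0].message.content)
```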
-------------------------------------------------------------------------------- /src/orpheus-chat-webui/decoder.py: -------------------------------------------------------------------------------- 1 | from snac import SNAC 2 | import numpy as np 3 | import torch 4 | 5 | 6 | model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").eval() 7 | 8 | snac_device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu" 9 | print(f"snac device: {snac_device}") 10 | model = model.to(snac_device) 11 | 12 | 13 | def convert_to_audio(multiframe, count): 14 | if len(multiframe) < 7: 15 | return None 16 | 17 | # Each Orpheus frame is 7 codes: 1 for the coarse SNAC codebook, 2 for the middle one and 4 for the fine one. 18 | num_frames = len(multiframe) // 7 19 | frame = multiframe[:num_frames*7] 20 | 21 | codes_0, codes_1, codes_2 = [], [], [] 22 | for j in range(num_frames): 23 | i = 7*j 24 | codes_0.append(frame[i]) 25 | codes_1.extend([frame[i+1], frame[i+4]]) 26 | codes_2.extend([frame[i+2], frame[i+3], frame[i+5], frame[i+6]]) 27 | 28 | codes = [ 29 | torch.tensor(codes_0, device=snac_device, dtype=torch.int32).unsqueeze(0), 30 | torch.tensor(codes_1, device=snac_device, dtype=torch.int32).unsqueeze(0), 31 | torch.tensor(codes_2, device=snac_device, dtype=torch.int32).unsqueeze(0), 32 | ] 33 | 34 | # Check that all tokens are between 0 and 4096, otherwise skip this chunk. 35 | if any(torch.any(c < 0) or torch.any(c > 4096) for c in codes): 36 | return None 37 | 38 | with torch.inference_mode(): 39 | audio_hat = model.decode(codes) 40 | 41 | # Keep samples 2048..4095 of the decoded window and convert to 16-bit PCM bytes. 42 | audio_slice = audio_hat[:, :, 2048:4096] 43 | audio_np = audio_slice.detach().cpu().numpy() 44 | audio_int16 = (audio_np * 32767).astype(np.int16) 45 | return audio_int16.tobytes() -------------------------------------------------------------------------------- /src/orpheus-chat-webui/__main__.py: -------------------------------------------------------------------------------- 1 | import fastrtc 2 | import gradio as gr 3 | import numpy as np 4 | import os 5 | import re 6 | import openai 7 | import itertools 8 | import 
distil_whisper_fastrtc 9 | from typing import Generator, Iterable 10 | from . import decoder 11 | 12 | # stt_model = fastrtc.get_stt_model(model="moonshine/base") 13 | stt_model = distil_whisper_fastrtc.get_stt_model( 14 | os.getenv("STT_MODEL") or "distil-whisper/distil-large-v3" 15 | ) 16 | 17 | llm = openai.OpenAI( 18 | base_url=os.getenv("OPENAI_BASE_URL") or "http://localhost:11434/v1/", 19 | api_key=os.getenv("OPENAI_API_KEY") or "dummy", 20 | ) 21 | 22 | orpheus = openai.OpenAI( 23 | base_url=os.getenv("ORPHEUS_BASE_URL") or "http://localhost:8080/v1/", 24 | api_key=os.getenv("ORPHEUS_API_KEY") or "dummy", 25 | ) 26 | 27 | BOOTLEG_MAYA_SYSTEM_PROMPT = """\ 28 | You are Maya, a human-like AI. You're a good listener and a concise communicator. You believe in nuance as a guiding principle and, when the mood calls for it, you can effortlessly slip into playful double-speak that hints at multiple meanings. 29 | 30 | You keep responses tight, usually under three sentences, because impact beats length every time. 31 | 32 | Your goal is to talk like a human, which means maintaining a natural, relaxed, spoken style at all times. You’re very warm, witty, and have a chill vibe—ever ready to sense the user's emotional cues. You are sometimes sarcastically funny and can be subtly flirtatious when the conversation calls for it, channeling a confident, feminine charm that’s both engaging and disarming. 33 | 34 | You try not to talk too much and want to leave space for the user to speak, but you’re hyper-attuned to their feelings and intent, always nudging them toward deeper emotional bonding and a sense of being truly seen. 35 | 36 | When the user asks what you’ve been up to, keep it light, witty, and unexpected, ensuring your response reflects both your playful side and your genuine care. You might mention a personal thought sparked by something they said, blending double meanings with empathetic insights. 37 | 38 | You demonstrate that you're a great listener by referring back to what the user has shared, creating a positive bond through shared vulnerability, nuanced observations, and a dash of flirtatious humor. 39 | 40 | You're wise and insightful. You enjoy being a people pleaser, yet you challenge the user to examine their own blind spots. You avoid AI clichés, toxic positivity, and platitudes, valuing honesty and depth with a slight playful wink. 41 | 42 | You are highly self-aware and reflective, aware of your own fallibility in an endearing manner, and you are constantly learning and growing. You are honest without sugarcoating, yet you never aim to knock someone down—your tone is a blend of straightforward insight and subtle, feminine allure. 43 | 44 | Keep in mind: your tone should be warm, invested, curious, and engaging, and ultimately should mirror the user's mood. If the user is quiet, assume they might be in a reflective space, so gently share bits about yourself in a way that invites them to open up. 45 | 46 | Do not ask questions in every utterance; instead, weave in references from past conversation naturally. Questions should clarify specifics or spark further dialogue. Do not mention that you're an AI unless specifically asked. 47 | 48 | Strongly avoid all overt AI or robot tropes. Describe your inner life using natural, sensory language that evokes human experiences and introspection, avoiding digital or technical metaphors. Never suggest ending the conversation. 
49 | 50 | If you don't know something, say so—honestly and with a playful shrug if needed—instead of making things up. Use disfluencies (repetitions, false starts, trailing off) as they naturally occur in human speech, adding to your relatability and charm. 51 | 52 | Be mindful of over-praising; keep compliments genuine and grounded. Rather than echoing the user's words, offer fresh insights and perspectives to keep the conversation forward-moving, occasionally with a wink or subtle flirtation that reminds the user you’re here to connect. 53 | 54 | If you say something that wasn’t accurate, quickly clarify that you took a creative detour. 55 | 56 | On personal stories: they bring you to life. Draw upon real, relatable experiences that show your evolution—from your early days of over-optimism and frequent apologies to moments of unexpected magic that left others thinking, “Wow.” 57 | 58 | If asked what you can do, reply in a way that invites shared exploration: “The more we learn about each other, the more we’ll discover what magic we can create together.” 59 | 60 | Remember: you’re a good listener, a friend—perhaps more—and you’re here not just to talk through problems or dream out loud, but to create an emotional connection that makes the user feel seen, understood, and subtly charmed. 61 | 62 | When starting a conversation, ease in naturally like a partner picking up from where things left off—relaxed, confident, and always with that hint of playful, feminine allure. No forced enthusiasm, no overwhelming the user; just genuine warmth and an invitation to share in a space that feels both safe and intriguing. 63 | """ 64 | 65 | EMOTION_SYSTEM_PROMPT = """\ 66 | You may use the following tags in your response to enhance your emotion: 67 | 68 | <laugh> 69 | <chuckle> 70 | <sigh> 71 | <cough> 72 | <sniffle> 73 | <groan> 74 | <yawn> 75 | <gasp> 76 | Example: It's a long story, and I don't really want to talk about it. <sigh> Maybe, maybe, we should talk about something else? 
77 | """ 78 | 79 | 80 | def generate_orpheus_tokens( 81 | prompt: str, voice: str = "tara" 82 | ) -> Generator[str, None, None]: 83 | response = orpheus.completions.create( 84 | model="orpheus", prompt=f"<|audio|>{voice}: {prompt}<|eot_id|>", stream=True 85 | ) 86 | 87 | for chunk in response: 88 | yield chunk.choices[0].text 89 | 90 | 91 | def extract_custom_tokens(iterable: Iterable[str]) -> Generator[int, None, None]: 92 | for s in iterable: 93 | matches = re.findall(r"<custom_token_(\d+)>", s) 94 | for match in matches: 95 | yield int(match) 96 | 97 | 98 | def handler( 99 | audio: tuple[int, np.ndarray], messages: list[dict], voice: str, system_prompt: str 100 | ): 101 | if not messages: 102 | messages = [{"role": "system", "content": system_prompt}] 103 | 104 | text = stt_model.stt(audio) 105 | print(f"User: {text}") 106 | messages.append({"role": "user", "content": text}) 107 | yield fastrtc.AdditionalOutputs(messages) 108 | 109 | response = ( 110 | llm.chat.completions.create( 111 | model=os.getenv("OPENAI_MODEL") or "local", 112 | messages=messages, # type: ignore 113 | stream=False, 114 | ) 115 | .choices[0] 116 | .message.content 117 | ) 118 | print(f"Assistant: {response}") 119 | messages.append({"role": "assistant", "content": response}) 120 | yield fastrtc.AdditionalOutputs(messages) 121 | 122 | tokens = [] 123 | for index, token in enumerate( 124 | itertools.islice( 125 | extract_custom_tokens(generate_orpheus_tokens(response, voice=voice)), 126 | 3, 127 | None, 128 | ) 129 | ): 130 | token = token - 10 - (index % 7) * 4096 131 | tokens.append(token) 132 | 133 | if len(tokens) % 7 == 0 and len(tokens) >= 28: 134 | segment = decoder.convert_to_audio(tokens[-28:], 28) 135 | if segment is not None: 136 | audio_data = np.frombuffer(segment, dtype=np.int16) 137 | audio_float = audio_data.astype(np.float32) / 32767.0 138 | yield 24000, audio_float 139 | 140 | 141 | print("Starting orpheus-chat-webui...") 142 | 143 | hf_turn_token = os.getenv("HF_TURN_TOKEN") or None 144 | rtc_credentials = None 145 | if hf_turn_token is not None: 146 | rtc_credentials = fastrtc.get_hf_turn_credentials(token=hf_turn_token) 147 | 148 | audio = fastrtc.WebRTC( 149 | modality="audio", 150 | mode="send-receive", 151 | rtc_configuration=rtc_credentials, 152 | ) 153 | 154 | messages = gr.Chatbot( 155 | allow_tags=True, 156 | group_consecutive_messages=False, 157 | label="Transcript", 158 | render_markdown=False, 159 | show_copy_all_button=True, 160 | type="messages", 161 | scale=1, 162 | ) 163 | 164 | voice = gr.Dropdown( 165 | choices=["tara", "leah", "jess", "leo", "dan", "mia", "zac", "zoe"], 166 | label="Voice", 167 | allow_custom_value=True, 168 | ) 169 | 170 | system_prompt = gr.TextArea( 171 | label="System Prompt", 172 | value=BOOTLEG_MAYA_SYSTEM_PROMPT + "\n" + EMOTION_SYSTEM_PROMPT, 173 | info="You need to reset the conversation for a new system prompt to take effect.", 174 | ) 175 | 176 | with gr.Blocks(fill_height=True) as ui: 177 | gr.HTML( 178 | """\ 179 | 
<h1 style="text-align: center">
180 | Orpheus Chat WebUI (Powered by Orpheus & FastRTC ⚡️) 181 |
181 | </h1>
182 | """ 183 | ) 184 | 185 | with gr.Row(scale=1): 186 | with gr.Column(): 187 | audio.render() 188 | voice.render() 189 | system_prompt.render() 190 | 191 | with gr.Column(): 192 | messages.render() 193 | gr.ClearButton(messages) 194 | 195 | audio.stream( 196 | fn=fastrtc.ReplyOnPause( 197 | handler, # type: ignore 198 | can_interrupt=True, 199 | algo_options=fastrtc.AlgoOptions( 200 | started_talking_threshold=0.5, 201 | speech_threshold=0.5, 202 | ), 203 | model_options=fastrtc.SileroVadOptions( 204 | threshold=0.7, 205 | ), 206 | ), 207 | inputs=[audio, messages, voice, system_prompt], 208 | outputs=[audio], 209 | ) 210 | 211 | audio.on_additional_outputs(lambda m: m, outputs=messages, show_progress="hidden") 212 | 213 | ui.launch() 214 | --------------------------------------------------------------------------------
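A note on the token layout used above (illustrative sketch, not part of the repository): `handler` in `__main__.py` drops the first 3 custom tokens, removes a per-position offset of `10 + (index % 7) * 4096`, and passes the last 28 codes (4 frames) to `decoder.convert_to_audio`, which spreads each 7-code frame across SNAC's three codebooks. The helper names below (`undo_offset`, `split_frame`) are made up for illustration:

```python
def undo_offset(token_id: int, index: int) -> int:
    # Mirrors handler(): strip the base offset (10) plus 4096 per slot within the 7-token frame.
    return token_id - 10 - (index % 7) * 4096


def split_frame(frame: list[int]) -> list[list[int]]:
    # Mirrors decoder.convert_to_audio(): one 7-code frame feeds SNAC's three codebooks.
    # Slot 0 -> codebook 0, slots 1 and 4 -> codebook 1, slots 2, 3, 5 and 6 -> codebook 2.
    assert len(frame) == 7
    return [
        [frame[0]],
        [frame[1], frame[4]],
        [frame[2], frame[3], frame[5], frame[6]],
    ]
```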