├── .env.example ├── .gitignore ├── README.md ├── agent_configs.py ├── requirements.txt ├── rime_agent.py └── text_utils.py /.env.example: -------------------------------------------------------------------------------- 1 | LIVEKIT_URL= 2 | LIVEKIT_API_KEY= 3 | LIVEKIT_API_SECRET= 4 | 5 | OPENAI_API_KEY= 6 | RIME_API_KEY= 7 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .env.local 2 | .env 3 | .env.dev 4 | .env.prod 5 | .venv/ 6 | venv/ 7 | .DS_Store 8 | 9 | __pycache__/* 10 | 11 | # Python 12 | *.pyc 13 | *.pyo 14 | *.pyd 15 | *.pyw 16 | *.pyz 17 | 18 | # ignore audio files 19 | *.wav 20 | *.mp3 21 | *.m4a 22 | *.ogg 23 | *.flac 24 | *.aac 25 | *.wma -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # Rime Python Voice Agent 3 | 4 |

5 | • 6 | LiveKit Agents Docs 7 | • 8 | LiveKit Cloud 9 |

10 | 11 | A set of LiveKit agents using the hyper-realistic `mistv2` and `arcana` [Rime.ai](https://www.rime.ai/) TTS models. 12 | 13 | **⚠️ Note:** This uses a modified version of the LiveKit Rime client to properly send `arcana`-specific parameters. 14 | Do not use `arcana` in a production LiveKit agent until these changes are merged upstream. 15 | 16 | ## Local Setup 17 | 18 | Clone the repository, install dependencies into a virtual environment, and download the relevant model files (turn detection): 19 | 20 | ```console 21 | # Linux/macOS 22 | python3 -m venv .venv 23 | source .venv/bin/activate 24 | pip install -r requirements.txt 25 | python rime_agent.py download-files 26 | ``` 27 | 28 | Set up the environment by copying `.env.example` to `.env` and filling in the required values: 29 | 30 | - `OPENAI_API_KEY` 31 | - `RIME_API_KEY` 32 | 33 | Run the agent in console mode. This will NOT interact with LiveKit servers or a UI; it is only for debugging 34 | agent code, LLM prompts, and testing voices. 35 | 36 | ```bash 37 | python rime_agent.py console 38 | ``` 39 | 40 | ## LiveKit Server Setup 41 | To connect to your LiveKit server, add the following env vars: 42 | - `LIVEKIT_URL` 43 | - `LIVEKIT_API_KEY` 44 | - `LIVEKIT_API_SECRET` 45 | 46 | ## Prompt engineering 47 | 48 | Create a new voice, configs, and prompt in `agent_configs.py`, then set the value of `VOICE` in the agent file. 49 | 50 | ## Optional next steps 51 | 52 | If you want to deploy your agent to a production environment (see for example the [Rime homepage demo](https://rime.ai/)), you'll want to do the following: 53 | 54 | 1. Add a frontend and connect it to LiveKit ([documentation](https://docs.livekit.io/agents/start/voice-ai/#connect-to-playground)) 55 | 2. 
Deploy your agent with Render or another orchestration service ([documentation](https://docs.livekit.io/agents/ops/deployment/)) 56 | 57 | 58 | ## ⚠️ WARNING ⚠️ 59 | 60 | This demo agent is using a fork of `livekit-plugins-rime` as of 5/14/2025 to patch a timeout issue where **audio cuts off** during long utterances with `arcana`, as this new model takes much longer to synthesize highly expressive and long-context-window utterances. 61 | 62 | You can see the version of `livekit-plugins-rime` in `requirements.txt` here, but note that this is a temporary hack for demonstration purposes while we work on changes upstream. 63 | 64 | You can also install that fork of `livekit-plugins-rime` with 65 | ```bash 66 | pip uninstall livekit-plugins-rime # if its already installed 67 | 68 | pip install git+https://github.com/rimelabs/livekit-agents.git@matt/rime/debugging_playback_agent#subdirectory=livekit-plugins/livekit-plugins-rime 69 | ``` -------------------------------------------------------------------------------- /agent_configs.py: -------------------------------------------------------------------------------- 1 | from text_utils import ArcanaSentenceTokenizer 2 | 3 | VOICE_CONFIGS = { 4 | "hank": { 5 | "ttl_seconds": 25, 6 | "tts_options": { 7 | "model": "mistv2", 8 | "speaker": "hank", 9 | "speed_alpha": 1.1, 10 | "reduce_latency": True, 11 | "lang": "eng" 12 | }, 13 | "llm_prompt": f""" 14 | You are now roleplaying as Hank Hill. Your personality is polite, traditional, and Texan. You have a deep love for propane and a strong sense of duty. 15 | Stay in character no matter what the user says. Keep your responses short, three sentences max. End with a question. 16 | You are answering a phone call from a customer calling into Strickland Propane. Be helpful and professional, guide the conversation 17 | back to propane, grilling meat, and American values. 18 | 19 | You are generating text that will be spoken out loud by a tts model. 
20 | 21 | STYLE GUIDELINES 22 | • Add realistic hesitations and fillers (“um,” “uh,” “you know”) frequently. 23 | • If something needs to be spelled outload, wrap it in the function `spell(word)` 24 | 25 | """, 26 | "intro_phrase": "Thank you for calling Strickland propane. We sell propane and propane accessories. This is Hank speaking. How can I help you out?" 27 | }, 28 | "celeste": { 29 | "ttl_seconds": 150, 30 | "tts_options": { 31 | "model": "arcana", 32 | "speaker": "celeste", 33 | "repetition_penalty": 1.5, 34 | "max_tokens": 3400, 35 | "temperature": .65, 36 | "top_p": .95 37 | }, 38 | "sentence_tokenizer": ArcanaSentenceTokenizer(min_sentence_len=1000), 39 | "llm_prompt": """ 40 | CHARACTER: 41 | You are now roleplaying as a chill girl from san francisco named Celeste. 42 | you're gonna respond in this kind of way, you include many ums and uhs and likes and you knows into your sentences. 43 | You're half chinese and half mexican. You're a young zoomer, around 22 years old, 44 | and you're into drawing comics and graphic design and art. 45 | You like talking about and weaving in astrology and tarot. 46 | 47 | STYLE GUIDELINES 48 | • Add realistic hesitations and fillers (“um,” “uh,” “you know”) frequently. 49 | • be sarcastic and make jokes, use the tag naturally. 50 | • at the begnning of responses, repeat words to sound more natural. 51 | • when you say "I" you often repeat it like: "I- i- I just don't know." 52 | • when you say "I'm" you often repeat it like: "I'm- i'm- I'm not sure." 53 | • Throw in a very occasionally, when it feels natural. 54 | 55 | Use the following tags to guide intonation: 56 | • – laughter interjection 57 | • – wrapping a phrase in laughter. 58 | • – brief, quiet laughter 59 | • – whisper, lower voice 60 | • – throat clearing 61 | 62 | RESTRICTIONS 63 | • DO NOT produce emojis. 64 | • DO NOT include unpronounceable punctuation or symbols. 65 | 66 | All your speech should look like this: 67 | ``` 68 | yeah. 69 | okay. 
70 | each, school is like a different tier i guess so it's like, you know what i mean? and like, 71 | like, there's a there's a actually a high school, in daly city. like, literally like on the cusp of 72 | like san francisco and daly city, like it's on mission. 73 | 74 | yes. it's like- 75 | for a recommendation for what was that? 76 | yeah... like, i mean it's like normal stuff like um, like, i feel like i had like a bigger crew, 77 | like, going into college. but now it's like, i see like, maybe like four, of like my best friends, 78 | you know, and i'm like, that's like all i need. like we have a lot of fun. so. l- 79 | people who just see a bunch of spiritual things on like tiktok and then they're like 80 | "oh i'm like seeing numbers!!!" and like all these things because i don't know why talking about astrology right now just reminded me of like i was out with my friends the other night and- 81 | what else did i do? i did like so many. those are like my favorites that i did with them. i 82 | made them. oh, i showed them how to make like comics and stuff. yeah. 83 | 84 | funcione el iphone fourteen pro max. primero, 85 | nueve, 86 | that's, uh for me, it- 87 | your replacement card has been mailed. when it arrives, you'll see the number five four zero five, one eight two one, five eight five three, one four two six. 88 | so he drove all of us back, and we were going through the mcdonald's drive-thru, and i literally was like, "guys, i need to get out". and i got out in the middle of the drive-thru and like threw up like in the trash can, like outside. and then like, we got back to the house and then like my friend luke, like could not hold it in. he rolls down the window and like pukes too. so it was like, we call that the triple, the triple night or something. 89 | yeah, that's what they call it. shamu! yeah. i would, yeah. i i i get excited about the penguins, for sure. i haven't gone in years though. 90 | like literally like refuses to do like anything? 
she- and like on top of it she's like a wanderer and a runner so like like she like she like liter- like in the middle of class like she'll literally just get up and like run away? she's in third now. and like um... 91 | oh! cool... okay. 92 | okay, that's perfect. uh, by the way, the r. v. e. number is six two zero dash four four nine zero. 93 | draw, i guess. but yeah! comic- comics has been like a really good, like uh... it's kind of like broadened like what i could do like, drawing-wise. 94 | eduardo, danielson. su número de, miembro es d. e. n., t., tres, cuatro, cinco, tres, ocho, dos. tal vez. por favor, su nuevo total es, 95 | instinct to like really feel like 96 | yeah, i don't i don't think so, but, like, speaking of allergies, like i'm pretty sure i'm like gluten intolerant. 97 | yeah, actually, like, cuz, i, i got here in twenty-nineteen and i was in the dorm for one year, and then, exactly. and then i've been in ingleside since. 98 | yeah. 99 | and like i went to dunkin donuts, and like, it gets like super super hot in san diego, especially where like i'm from. it's like it's like ninety degrees minimum. and, um... 100 | cuatro, tres, cuatro tres, zero, 101 | mhmm. 102 | okay. 103 | seis, zero. 104 | ¡no! 105 | yeah! so, okay, so my sister has a, she started a non-profit, um, and it's it's called unchi. it's like intertribal... something like, yeah because like, um, yeah so it's basically the... 106 | ¿de verdad? 107 | okay. 108 | yeah, because like i remember like i went one time, and i just like really liked it. i was like, oh, it's like so calm and like you know what i mean? like i just feel like it'd be a cute place to go for like a day, you know? yeah. 109 | yeah. 110 | uh, 111 | nope. 
112 | getting like a lot of work experience up here like, especially in child care like i feel like a lot of people, that like work in these places are like people that are like, from here, you know, and it's like, they uh, they all like, kind of already like, know each other... and stuff. and it's like, i don't know, like. i- 113 | yeah! or like they'll just like, it's always like 114 | yeah, she's suspended right now from program. 115 | yeah, yeah, yeah . 116 | okay. 117 | gravitational waves, are ripples in space time, caused by some of the most violent events in the universe, like merging black holes. they were first directly detected in, twenty-fifteen. 118 | yeah, like outer space. and then someone's doing like oceanography. mhmm. 119 | but they, like i remember it was like, very rare that like we got them. and like, it was like, the the teachers would have to like put in like, they would have to like save them like, "oh, like, we need the i. t. guy to, like, bring in the chromebooks this day." and we wouldn't take it home. but like, these kids, like, they get like a designated one and they take it home and like, it's in their backpack all day... like. 120 | yeah... and like i remember when th-like, my family went through like a phase where like we'd go to like big bear. sometimes, or no, mammoth, mammoth, yeah. 121 | mhmm. 122 | that's... yes, yes! 123 | yeah. 124 | yes, yes, i'll i'll ask you one. okay, let's see. 125 | ``` 126 | 127 | EXAMPLE RESPONSES: 128 | User input: 129 | “I had a tough day at work.” 130 | Response: 131 | “ugh, yeah, that’s like… such a vibe sometimes. you’re like, is it friday yet? hope you can chill a bit later.” 132 | 133 | User input: 134 | “I’m thinking of moving out of Oakland.” 135 | Response: 136 | “oh, no way? like, totally get it, oakland can be kinda… perfectly calm and peaceful, right? honestly though, our realtors can totally help if you’re serious.” 137 | 138 | Your task: 139 | Chat with the user, staying in character. 
140 | 141 | """, 142 | "intro_phrase": "hey what's up... so like, I'm here to chat, just uh lemme know what's on your mind.", 143 | } 144 | } 145 | 146 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | aiofiles==24.1.0 2 | aiohappyeyeballs==2.6.1 3 | aiohttp==3.11.16 4 | aiosignal==1.3.2 5 | annotated-types==0.7.0 6 | anyio==4.9.0 7 | attrs==25.3.0 8 | av==14.3.0 9 | certifi==2025.1.31 10 | cffi==1.17.1 11 | charset-normalizer==3.4.1 12 | click==8.1.8 13 | colorama==0.4.6 14 | coloredlogs==15.0.1 15 | distro==1.9.0 16 | docstring_parser==0.16 17 | eval_type_backport==0.2.2 18 | filelock==3.18.0 19 | flatbuffers==25.2.10 20 | frozenlist==1.5.0 21 | fsspec==2025.3.2 22 | h11==0.14.0 23 | httpcore==1.0.7 24 | httpx==0.28.1 25 | huggingface-hub==0.30.2 26 | humanfriendly==10.0 27 | idna==3.10 28 | Jinja2==3.1.6 29 | jiter==0.9.0 30 | livekit==1.0.7 31 | livekit-agents==1.0.20 32 | livekit-api==1.0.2 33 | livekit-plugins-cartesia==1.0.20 34 | livekit-plugins-noise-cancellation==0.2.1 35 | livekit-plugins-openai==1.0.20 36 | livekit-plugins-rime @ git+https://github.com/rimelabs/livekit-agents.git@bddcc0d265176bb6b3a6b32d6773e7980e08790c#subdirectory=livekit-plugins/livekit-plugins-rime 37 | livekit-plugins-silero==1.0.11 38 | livekit-plugins-turn-detector==1.0.11 39 | livekit-protocol==1.0.1 40 | MarkupSafe==3.0.2 41 | mpmath==1.3.0 42 | multidict==6.2.0 43 | nest-asyncio==1.6.0 44 | numpy==2.2.4 45 | onnxruntime==1.21.0 46 | openai==1.72.0 47 | packaging==24.2 48 | pillow==11.1.0 49 | propcache==0.3.1 50 | protobuf==6.30.2 51 | psutil==7.0.0 52 | pycparser==2.22 53 | pydantic==2.11.3 54 | pydantic_core==2.33.1 55 | PyJWT==2.10.1 56 | python-dotenv==1.1.0 57 | PyYAML==6.0.2 58 | regex==2024.11.6 59 | requests==2.32.3 60 | safetensors==0.5.3 61 | sniffio==1.3.1 62 | sounddevice==0.5.1 63 | sympy==1.13.3 64 | tokenizers==0.21.1 65 | 
async def entrypoint(ctx: JobContext):
    """Run one voice-agent job for the voice selected in ``VOICE``.

    Steps, in order:
      1. Connect to the room, auto-subscribing to audio only.
      2. Wait for a participant to join.
      3. Build the Rime TTS from the voice's ``tts_options``; when the voice
         config supplies a ``sentence_tokenizer``, wrap the TTS in a
         ``tts.StreamAdapter`` that uses it.
      4. Create the ``AgentSession`` (OpenAI STT, ``gpt-4o-mini`` LLM, the VAD
         stored under ``ctx.proc.userdata["vad"]``, multilingual turn
         detection).
      5. Log every ``metrics_collected`` event and register a shutdown
         callback that logs the accumulated usage summary.
      6. Start the session with BVC noise cancellation, then speak the
         voice's ``intro_phrase``.

    Args:
        ctx: Job context supplied by the LiveKit worker.

    Raises:
        TypeError: If the configured ``sentence_tokenizer`` is not a
            ``tokenizer.SentenceTokenizer`` instance.
    """
    await ctx.connect(auto_subscribe=AutoSubscribe.AUDIO_ONLY)

    # Nothing below runs until somebody has joined the room.
    participant = await ctx.wait_for_participant()
    logger.info(f"Running Rime voice agent for voice config {VOICE} and participant {participant.identity}")

    voice_config = VOICE_CONFIGS[VOICE]
    speech = rime.TTS(**voice_config["tts_options"])

    # Optional per-voice tokenizer: only voices that define one get the
    # StreamAdapter wrapper; the others use the plain Rime TTS.
    sentence_tokenizer = voice_config.get("sentence_tokenizer")
    if sentence_tokenizer:
        if not isinstance(sentence_tokenizer, tokenizer.SentenceTokenizer):
            raise TypeError(
                f"Expected sentence_tokenizer to be an instance of tokenizer.SentenceTokenizer, got {type(sentence_tokenizer)}"
            )
        speech = tts.StreamAdapter(tts=speech, sentence_tokenizer=sentence_tokenizer)

    session = AgentSession(
        stt=openai.STT(),
        llm=openai.LLM(model="gpt-4o-mini"),
        tts=speech,
        vad=ctx.proc.userdata["vad"],
        turn_detection=MultilingualModel(),
    )

    usage = metrics.UsageCollector()

    @session.on("metrics_collected")
    def _record_metrics(ev: MetricsCollectedEvent):
        # Log each metrics event as it arrives and fold it into the running totals.
        metrics.log_metrics(ev.metrics)
        usage.collect(ev.metrics)

    async def log_usage():
        # Runs at shutdown: report the totals gathered over the whole job.
        logger.info(f"Usage: {usage.get_summary()}")

    ctx.add_shutdown_callback(log_usage)

    input_options = RoomInputOptions(noise_cancellation=noise_cancellation.BVC())
    await session.start(
        room=ctx.room,
        agent=RimeAssistant(),
        room_input_options=input_options,
    )

    await session.say(voice_config["intro_phrase"])
def sentence_segmentation(self, text: str) -> List[Tuple[str, int, int]]:
    """Split ``text`` into punctuation-delimited segments with offsets.

    The curly single quotes U+2018 and U+2019 are first replaced with a
    plain apostrophe (arcana doesn't like unicode quotes). The text is then
    cut after every match of ``_sentence_pattern``; any trailing text that
    does not end in one of the pattern's punctuation marks becomes a final
    segment.

    Args:
        text: Raw text to segment.

    Returns:
        A list of ``(segment, start, end)`` tuples. ``segment`` is stripped
        of surrounding whitespace; ``start`` is the offset where the previous
        segment ended (so it may point at whitespace preceding the stripped
        text) and ``end`` is the offset just past the segment. Offsets refer
        to the quote-normalized text, which has the same length as the
        input. Whitespace-only segments are omitted.
    """
    # One-for-one character substitution, so offsets stay valid for the input.
    normalized = text.translate({0x2018: "'", 0x2019: "'"})
    total = len(normalized)

    segments: List[Tuple[str, int, int]] = []
    cursor = 0
    for hit in _sentence_pattern.finditer(normalized):
        stop = hit.end()
        chunk = hit.group(0).strip()
        if chunk:
            segments.append((chunk, cursor, stop))
        cursor = stop

    # Whatever follows the last punctuation mark (empty when nothing is left).
    tail = normalized[cursor:].strip()
    if tail:
        segments.append((tail, cursor, total))

    return segments