├── .gitignore ├── requirements.txt ├── README.md └── assistant.py /.gitignore: -------------------------------------------------------------------------------- 1 | .venv 2 | .env -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | livekit-agents~=0.8 2 | livekit-plugins-deepgram~=0.6 3 | livekit-plugins-openai~=0.7 4 | livekit-plugins-silero~=0.6 5 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # LiveKit Assistant 2 | 3 | First, create a virtual environment, update pip, and install the required packages: 4 | 5 | ``` 6 | $ python3 -m venv .venv 7 | $ source .venv/bin/activate 8 | $ pip install -U pip 9 | $ pip install -r requirements.txt 10 | ``` 11 | 12 | You need to set up the following environment variables: 13 | 14 | ``` 15 | LIVEKIT_URL=... 16 | LIVEKIT_API_KEY=... 17 | LIVEKIT_API_SECRET=... 18 | DEEPGRAM_API_KEY=... 19 | OPENAI_API_KEY=... 20 | ``` 21 | 22 | Then, run the assistant: 23 | 24 | ``` 25 | $ python3 assistant.py download-files 26 | $ python3 assistant.py start 27 | ``` 28 | 29 | Finally, you can load the [hosted playground](https://agents-playground.livekit.io/) and connect it. -------------------------------------------------------------------------------- /assistant.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from typing import Annotated 3 | 4 | from livekit import agents, rtc 5 | from livekit.agents import JobContext, WorkerOptions, cli, tokenize, tts 6 | from livekit.agents.llm import ( 7 | ChatContext, 8 | ChatImage, 9 | ChatMessage, 10 | ) 11 | from livekit.agents.voice_assistant import VoiceAssistant 12 | from livekit.plugins import deepgram, openai, silero 13 | 14 | 15 | class AssistantFunction(agents.llm.FunctionContext): 16 | """This class is used to define functions that will be called by the assistant.""" 17 | 18 | @agents.llm.ai_callable( 19 | description=( 20 | "Called when asked to evaluate something that would require vision capabilities," 21 | "for example, an image, video, or the webcam feed." 22 | ) 23 | ) 24 | async def image( 25 | self, 26 | user_msg: Annotated[ 27 | str, 28 | agents.llm.TypeInfo( 29 | description="The user message that triggered this function" 30 | ), 31 | ], 32 | ): 33 | print(f"Message triggering vision capabilities: {user_msg}") 34 | return None 35 | 36 | 37 | async def get_video_track(room: rtc.Room): 38 | """Get the first video track from the room. We'll use this track to process images.""" 39 | 40 | video_track = asyncio.Future[rtc.RemoteVideoTrack]() 41 | 42 | for _, participant in room.remote_participants.items(): 43 | for _, track_publication in participant.track_publications.items(): 44 | if track_publication.track is not None and isinstance( 45 | track_publication.track, rtc.RemoteVideoTrack 46 | ): 47 | video_track.set_result(track_publication.track) 48 | print(f"Using video track {track_publication.track.sid}") 49 | break 50 | 51 | return await video_track 52 | 53 | 54 | async def entrypoint(ctx: JobContext): 55 | await ctx.connect() 56 | print(f"Room name: {ctx.room.name}") 57 | 58 | chat_context = ChatContext( 59 | messages=[ 60 | ChatMessage( 61 | role="system", 62 | content=( 63 | "Your name is Alloy. You are a funny, witty bot. Your interface with users will be voice and vision." 64 | "Respond with short and concise answers. Avoid using unpronouncable punctuation or emojis." 65 | ), 66 | ) 67 | ] 68 | ) 69 | 70 | gpt = openai.LLM(model="gpt-4o") 71 | 72 | # Since OpenAI does not support streaming TTS, we'll use it with a StreamAdapter 73 | # to make it compatible with the VoiceAssistant 74 | openai_tts = tts.StreamAdapter( 75 | tts=openai.TTS(voice="alloy"), 76 | sentence_tokenizer=tokenize.basic.SentenceTokenizer(), 77 | ) 78 | 79 | latest_image: rtc.VideoFrame | None = None 80 | 81 | assistant = VoiceAssistant( 82 | vad=silero.VAD.load(), # We'll use Silero's Voice Activity Detector (VAD) 83 | stt=deepgram.STT(), # We'll use Deepgram's Speech To Text (STT) 84 | llm=gpt, 85 | tts=openai_tts, # We'll use OpenAI's Text To Speech (TTS) 86 | fnc_ctx=AssistantFunction(), 87 | chat_ctx=chat_context, 88 | ) 89 | 90 | chat = rtc.ChatManager(ctx.room) 91 | 92 | async def _answer(text: str, use_image: bool = False): 93 | """ 94 | Answer the user's message with the given text and optionally the latest 95 | image captured from the video track. 96 | """ 97 | content: list[str | ChatImage] = [text] 98 | if use_image and latest_image: 99 | content.append(ChatImage(image=latest_image)) 100 | 101 | chat_context.messages.append(ChatMessage(role="user", content=content)) 102 | 103 | stream = gpt.chat(chat_ctx=chat_context) 104 | await assistant.say(stream, allow_interruptions=True) 105 | 106 | @chat.on("message_received") 107 | def on_message_received(msg: rtc.ChatMessage): 108 | """This event triggers whenever we get a new message from the user.""" 109 | 110 | if msg.message: 111 | asyncio.create_task(_answer(msg.message, use_image=False)) 112 | 113 | @assistant.on("function_calls_finished") 114 | def on_function_calls_finished(called_functions: list[agents.llm.CalledFunction]): 115 | """This event triggers when an assistant's function call completes.""" 116 | 117 | if len(called_functions) == 0: 118 | return 119 | 120 | user_msg = called_functions[0].call_info.arguments.get("user_msg") 121 | if user_msg: 122 | asyncio.create_task(_answer(user_msg, use_image=True)) 123 | 124 | assistant.start(ctx.room) 125 | 126 | await asyncio.sleep(1) 127 | await assistant.say("Hi there! How can I help?", allow_interruptions=True) 128 | 129 | while ctx.room.connection_state == rtc.ConnectionState.CONN_CONNECTED: 130 | video_track = await get_video_track(ctx.room) 131 | 132 | async for event in rtc.VideoStream(video_track): 133 | # We'll continually grab the latest image from the video track 134 | # and store it in a variable. 135 | latest_image = event.frame 136 | 137 | 138 | if __name__ == "__main__": 139 | cli.run_app(WorkerOptions(entrypoint_fnc=entrypoint)) 140 | --------------------------------------------------------------------------------