├── .gitignore
├── requirements.txt
├── README.md
└── assistant.py


/.gitignore:
--------------------------------------------------------------------------------
1 | .venv
2 | .env


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | livekit-agents~=0.8
2 | livekit-plugins-deepgram~=0.6
3 | livekit-plugins-openai~=0.7
4 | livekit-plugins-silero~=0.6
5 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # LiveKit Assistant
 2 | 
 3 | First, create a virtual environment, update pip, and install the required packages:
 4 | 
 5 | ```
 6 | $ python3 -m venv .venv
 7 | $ source .venv/bin/activate
 8 | $ pip install -U pip
 9 | $ pip install -r requirements.txt
10 | ```
11 | 
12 | You need to set up the following environment variables:
13 | 
14 | ```
15 | LIVEKIT_URL=...
16 | LIVEKIT_API_KEY=...
17 | LIVEKIT_API_SECRET=...
18 | DEEPGRAM_API_KEY=...
19 | OPENAI_API_KEY=...
20 | ```
21 | 
22 | Then, run the assistant:
23 | 
24 | ```
25 | $ python3 assistant.py download-files
26 | $ python3 assistant.py start
27 | ```
28 | 
29 | Finally, you can load the [hosted playground](https://agents-playground.livekit.io/) and connect it.


--------------------------------------------------------------------------------
/assistant.py:
--------------------------------------------------------------------------------
  1 | import asyncio
  2 | from typing import Annotated
  3 | 
  4 | from livekit import agents, rtc
  5 | from livekit.agents import JobContext, WorkerOptions, cli, tokenize, tts
  6 | from livekit.agents.llm import (
  7 |     ChatContext,
  8 |     ChatImage,
  9 |     ChatMessage,
 10 | )
 11 | from livekit.agents.voice_assistant import VoiceAssistant
 12 | from livekit.plugins import deepgram, openai, silero
 13 | 
 14 | 
 15 | class AssistantFunction(agents.llm.FunctionContext):
 16 |     """This class is used to define functions that will be called by the assistant."""
 17 | 
 18 |     @agents.llm.ai_callable(
 19 |         description=(
 20 |             "Called when asked to evaluate something that would require vision capabilities,"
 21 |             "for example, an image, video, or the webcam feed."
 22 |         )
 23 |     )
 24 |     async def image(
 25 |         self,
 26 |         user_msg: Annotated[
 27 |             str,
 28 |             agents.llm.TypeInfo(
 29 |                 description="The user message that triggered this function"
 30 |             ),
 31 |         ],
 32 |     ):
 33 |         print(f"Message triggering vision capabilities: {user_msg}")
 34 |         return None
 35 | 
 36 | 
 37 | async def get_video_track(room: rtc.Room):
 38 |     """Get the first video track from the room. We'll use this track to process images."""
 39 | 
 40 |     video_track = asyncio.Future[rtc.RemoteVideoTrack]()
 41 | 
 42 |     for _, participant in room.remote_participants.items():
 43 |         for _, track_publication in participant.track_publications.items():
 44 |             if track_publication.track is not None and isinstance(
 45 |                 track_publication.track, rtc.RemoteVideoTrack
 46 |             ):
 47 |                 video_track.set_result(track_publication.track)
 48 |                 print(f"Using video track {track_publication.track.sid}")
 49 |                 break
 50 | 
 51 |     return await video_track
 52 | 
 53 | 
 54 | async def entrypoint(ctx: JobContext):
 55 |     await ctx.connect()
 56 |     print(f"Room name: {ctx.room.name}")
 57 | 
 58 |     chat_context = ChatContext(
 59 |         messages=[
 60 |             ChatMessage(
 61 |                 role="system",
 62 |                 content=(
 63 |                     "Your name is Alloy. You are a funny, witty bot. Your interface with users will be voice and vision."
 64 |                     "Respond with short and concise answers. Avoid using unpronouncable punctuation or emojis."
 65 |                 ),
 66 |             )
 67 |         ]
 68 |     )
 69 | 
 70 |     gpt = openai.LLM(model="gpt-4o")
 71 | 
 72 |     # Since OpenAI does not support streaming TTS, we'll use it with a StreamAdapter
 73 |     # to make it compatible with the VoiceAssistant
 74 |     openai_tts = tts.StreamAdapter(
 75 |         tts=openai.TTS(voice="alloy"),
 76 |         sentence_tokenizer=tokenize.basic.SentenceTokenizer(),
 77 |     )
 78 | 
 79 |     latest_image: rtc.VideoFrame | None = None
 80 | 
 81 |     assistant = VoiceAssistant(
 82 |         vad=silero.VAD.load(),  # We'll use Silero's Voice Activity Detector (VAD)
 83 |         stt=deepgram.STT(),  # We'll use Deepgram's Speech To Text (STT)
 84 |         llm=gpt,
 85 |         tts=openai_tts,  # We'll use OpenAI's Text To Speech (TTS)
 86 |         fnc_ctx=AssistantFunction(),
 87 |         chat_ctx=chat_context,
 88 |     )
 89 | 
 90 |     chat = rtc.ChatManager(ctx.room)
 91 | 
 92 |     async def _answer(text: str, use_image: bool = False):
 93 |         """
 94 |         Answer the user's message with the given text and optionally the latest
 95 |         image captured from the video track.
 96 |         """
 97 |         content: list[str | ChatImage] = [text]
 98 |         if use_image and latest_image:
 99 |             content.append(ChatImage(image=latest_image))
100 | 
101 |         chat_context.messages.append(ChatMessage(role="user", content=content))
102 | 
103 |         stream = gpt.chat(chat_ctx=chat_context)
104 |         await assistant.say(stream, allow_interruptions=True)
105 | 
106 |     @chat.on("message_received")
107 |     def on_message_received(msg: rtc.ChatMessage):
108 |         """This event triggers whenever we get a new message from the user."""
109 | 
110 |         if msg.message:
111 |             asyncio.create_task(_answer(msg.message, use_image=False))
112 | 
113 |     @assistant.on("function_calls_finished")
114 |     def on_function_calls_finished(called_functions: list[agents.llm.CalledFunction]):
115 |         """This event triggers when an assistant's function call completes."""
116 | 
117 |         if len(called_functions) == 0:
118 |             return
119 | 
120 |         user_msg = called_functions[0].call_info.arguments.get("user_msg")
121 |         if user_msg:
122 |             asyncio.create_task(_answer(user_msg, use_image=True))
123 | 
124 |     assistant.start(ctx.room)
125 | 
126 |     await asyncio.sleep(1)
127 |     await assistant.say("Hi there! How can I help?", allow_interruptions=True)
128 | 
129 |     while ctx.room.connection_state == rtc.ConnectionState.CONN_CONNECTED:
130 |         video_track = await get_video_track(ctx.room)
131 | 
132 |         async for event in rtc.VideoStream(video_track):
133 |             # We'll continually grab the latest image from the video track
134 |             # and store it in a variable.
135 |             latest_image = event.frame
136 | 
137 | 
138 | if __name__ == "__main__":
139 |     cli.run_app(WorkerOptions(entrypoint_fnc=entrypoint))
140 | 


--------------------------------------------------------------------------------