├── Assistant2.py
├── README.md
└── requirements.txt

/Assistant2.py:
--------------------------------------------------------------------------------
# Tested 8/19/24; see requirements.txt for the pinned package versions.
import asyncio
import os
import selectors
from typing import Annotated

from dotenv import load_dotenv

from livekit import agents, rtc
from livekit.agents import JobContext, JobRequest, WorkerOptions, cli
from livekit.agents.llm import (
    ChatContext,
    ChatImage,
    ChatMessage,
    ChatRole,
)
from livekit.agents.voice_assistant import AssistantContext, VoiceAssistant
from livekit.plugins import azure, deepgram, openai, silero


class MyPolicy(asyncio.DefaultEventLoopPolicy):
    """Event loop policy that always creates a selector-based event loop."""

    def new_event_loop(self):
        selector = selectors.SelectSelector()
        return asyncio.SelectorEventLoop(selector)


asyncio.set_event_loop_policy(MyPolicy())

# Load environment variables from the .env file.
load_dotenv()


def reload_env_variables():
    """Read the API credentials the agent needs from the environment."""
    livekit_url = os.environ.get('LIVEKIT_URL')
    livekit_api_key = os.environ.get('LIVEKIT_API_KEY')
    livekit_api_secret = os.environ.get('LIVEKIT_API_SECRET')
    eleven_api_key = os.environ.get('ELEVEN_API_KEY')
    deepgram_api_key = os.environ.get('DEEPGRAM_API_KEY')
    openai_api_key = os.environ.get('OPENAI_API_KEY')
    speech_region = os.environ.get('AZURE_SPEECH_REGION')
    speech_key = os.environ.get('AZURE_SPEECH_KEY')

    return {
        'livekit_url': livekit_url,
        'livekit_api_key': livekit_api_key,
        'livekit_api_secret': livekit_api_secret,
        'eleven_api_key': eleven_api_key,
        'deepgram_api_key': deepgram_api_key,
        'openai_api_key': openai_api_key,
        'speech_region': speech_region,
        'speech_key': speech_key
    }


def print_env_variables(env_vars):
    """Print a masked preview of each credential so you can confirm it loaded."""
    for key, value in env_vars.items():
        if value:
            print(f"{key}: {value[:2]}...{value[-2:]}")
        else:
            print(f"{key}: None")


env_vars = reload_env_variables()
print_env_variables(env_vars)


class AssistantFunction(agents.llm.FunctionContext):
    """This class is used to define functions that will be called by the assistant."""

    @agents.llm.ai_callable(
        desc=(
            "Use this function whenever asked to evaluate an image, video, or the webcam feed being shared with you"
        )
    )
    async def image(
        self,
        user_msg: Annotated[
            str,
            agents.llm.TypeInfo(desc="The user message that triggered this function"),
        ],
    ):
        print(f"Message triggering vision capabilities: {user_msg}")
        context = AssistantContext.get_current()
        context.store_metadata("user_msg", user_msg)


async def get_video_track(room: rtc.Room):
    """Get the first video track from the room. We'll use this track to process images."""
    video_track = asyncio.Future[rtc.RemoteVideoTrack]()
    for _, participant in room.participants.items():
        for _, track_publication in participant.tracks.items():
            if track_publication.track is not None and isinstance(
                track_publication.track, rtc.RemoteVideoTrack
            ):
                video_track.set_result(track_publication.track)
                print(f"Using video track {track_publication.track.sid}")
                break
    return await video_track


async def entrypoint(ctx: JobContext):
    print(f"Room name: {ctx.room.name}")

    chat_context = ChatContext(
        messages=[
            ChatMessage(
                role=ChatRole.SYSTEM,
                text=(
                    "Your name is Andrew. You are an assistant who is slightly sarcastic and witty. "
                    "You have both voice and vision capabilities. "
                    "Respond with clear and concise answers with minimal jargon. Do not use emojis."
                ),
            )
        ]
    )

    gpt = openai.LLM(model="gpt-4o")
    latest_image: rtc.VideoFrame | None = None

    assistant = VoiceAssistant(
        vad=silero.VAD(),  # We'll use Silero's Voice Activity Detector (VAD)
        stt=deepgram.STT(),  # We'll use Deepgram's Speech To Text (STT)
        # stt=azure.STT(),  # Azure STT is an alternative
        llm=gpt,  # We'll use GPT-4o
        tts=azure.TTS(voice="en-US-AvaMultilingualNeural"),  # We'll use Azure's Text To Speech (TTS)
        # tts=elevenlabs.TTS(),  # ElevenLabs TTS is an alternative
        # tts=openai_tts,  # OpenAI's TTS is another alternative
        fnc_ctx=AssistantFunction(),
        chat_ctx=chat_context,
    )

    chat = rtc.ChatManager(ctx.room)

    async def _answer(text: str, use_image: bool = False):
        """
        Answer the user's message with the given text and optionally the latest
        image captured from the video track.
        """
        args = {}
        if use_image and latest_image:
            args["images"] = [ChatImage(image=latest_image)]
        chat_context.messages.append(ChatMessage(role=ChatRole.USER, text=text, **args))
        stream = await gpt.chat(chat_context)
        await assistant.say(stream, allow_interruptions=True)

    @chat.on("message_received")
    def on_message_received(msg: rtc.ChatMessage):
        """This event triggers whenever we get a new message from the user."""
        if msg.message:
            asyncio.create_task(_answer(msg.message, use_image=False))

    @assistant.on("function_calls_finished")
    def on_function_calls_finished(ctx: AssistantContext):
        """This event triggers when an assistant's function call completes."""
        user_msg = ctx.get_metadata("user_msg")
        if user_msg:
            asyncio.create_task(_answer(user_msg, use_image=True))

    assistant.start(ctx.room)

    await asyncio.sleep(3)
    await assistant.say("Hey, how can I help you today?", allow_interruptions=True)

    while ctx.room.connection_state == rtc.ConnectionState.CONN_CONNECTED:
        video_track = await get_video_track(ctx.room)
        async for event in rtc.VideoStream(video_track):
            # We'll continually grab the latest image from the video track
            # and store it in a variable.
            latest_image = event.frame


async def request_fnc(req: JobRequest) -> None:
    await req.accept(entrypoint)


if __name__ == "__main__":
    cli.run_app(WorkerOptions(request_fnc))

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# livekitAgent

https://youtu.be/niyg3zRKO3k

![Livekit](https://github.com/user-attachments/assets/f8463adb-dcc4-410a-a758-0d49b04b1227)

See the Python code. It was built in July 2024 with the libraries available at that time and was tested and working on 8/19/2024. The key LiveKit packages and versions are listed below. The LiveKit team is shipping new releases constantly, so either check the site for the latest updates and any breaking changes, or pin the exact versions I used. Install from the full requirements.txt, as the Python environment was fussy; a small version-check sketch follows the requirements.txt contents at the end of this listing.

https://livekit.io/

Docs: https://docs.livekit.io/home/

Quickstart: https://docs.livekit.io/agents/quickstart/

Reference: https://docs.livekit.io/reference/

Requirements.txt

livekit==0.11.1

livekit-agents==0.7.2

livekit-api==0.5.1

livekit-plugins-deepgram==0.5.0

livekit-plugins-elevenlabs==0.6.0

livekit-plugins-openai==0.6.0

livekit-plugins-silero==0.5.0

livekit-protocol==0.5.1

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
aiohttp==3.9.5
aiosignal==1.3.1
annotated-types==0.7.0
anyio==4.4.0
attrs==23.2.0
av==12.2.0
azure-cognitiveservices-speech==1.38.0
certifi==2024.6.2
click==8.1.7
colorama==0.4.6
coloredlogs==15.0.1
distro==1.9.0
filelock==3.15.4
flatbuffers==24.3.25
frozenlist==1.4.1
fsspec==2024.6.1
h11==0.14.0
httpcore==1.0.5
httpx==0.27.0
humanfriendly==10.0
idna==3.7
intel-openmp==2021.4.0
Jinja2==3.1.4
livekit==0.11.1
livekit-agents==0.7.2
livekit-api==0.5.1
livekit-plugins-azure==0.2.1
livekit-plugins-deepgram==0.5.0
livekit-plugins-elevenlabs==0.6.0
livekit-plugins-openai==0.6.0
livekit-plugins-silero==0.5.3
livekit-protocol==0.5.1
MarkupSafe==2.1.5
mkl==2021.4.0
mpmath==1.3.0
multidict==6.0.5
networkx==3.3
numpy==1.26.4
onnxruntime==1.18.1
openai==1.30.5
packaging==24.1
pillow==10.3.0
protobuf==5.27.2
psutil==5.9.8
pydantic==2.7.4
pydantic_core==2.18.4
PyJWT==2.8.0
pyreadline3==3.4.1
python-dotenv==1.0.1
setuptools==70.1.1
sniffio==1.3.1
sympy==1.12.1
tbb==2021.13.0
torch==2.3.1
torchaudio==2.3.1
tqdm==4.66.4
types-protobuf==4.25.0.20240417
typing_extensions==4.12.2
watchfiles==0.22.0
wheel==0.43.0
yarl==1.9.4
--------------------------------------------------------------------------------
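The README stresses using the pinned versions because newer LiveKit releases may contain breaking changes for this example. As a quick sanity check before running Assistant2.py, a small standalone script along these lines can compare the installed LiveKit packages against the pins in requirements.txt above. This is a hypothetical helper, not part of the repo; it only reads installed package metadata.

```python
# Hypothetical helper (not in the repo): compare installed LiveKit packages
# against the versions pinned in requirements.txt.
from importlib.metadata import PackageNotFoundError, version

# Pins copied from requirements.txt above.
PINNED = {
    "livekit": "0.11.1",
    "livekit-agents": "0.7.2",
    "livekit-api": "0.5.1",
    "livekit-plugins-azure": "0.2.1",
    "livekit-plugins-deepgram": "0.5.0",
    "livekit-plugins-elevenlabs": "0.6.0",
    "livekit-plugins-openai": "0.6.0",
    "livekit-plugins-silero": "0.5.3",
    "livekit-protocol": "0.5.1",
}

for name, expected in PINNED.items():
    try:
        installed = version(name)
    except PackageNotFoundError:
        print(f"{name}: NOT INSTALLED (expected {expected})")
        continue
    status = "OK" if installed == expected else f"MISMATCH (expected {expected})"
    print(f"{name}: {installed} {status}")
```

If every line prints OK, the environment matches the versions this example was tested against on 8/19/2024.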