├── .gitignore ├── text-streaming-agent ├── requirements.txt ├── .env.example ├── agent.py └── chatgpt.py ├── hive-moderation-agent ├── requirements.txt ├── .env.example ├── hive_data_classes.py └── agent.py ├── deepgram-transcription-agent ├── requirements.txt ├── .env.example └── agent.py ├── claude-3-agent ├── .env.example ├── requirements.txt ├── claude.py ├── deepgram.py └── agent.py └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | **/.env 2 | **/.venv 3 | **/__pycache__ -------------------------------------------------------------------------------- /text-streaming-agent/requirements.txt: -------------------------------------------------------------------------------- 1 | livekit 2 | livekit-agents 3 | python-dotenv 4 | openai -------------------------------------------------------------------------------- /hive-moderation-agent/requirements.txt: -------------------------------------------------------------------------------- 1 | livekit 2 | livekit-agents 3 | python-dotenv 4 | Pillow 5 | aiohttp -------------------------------------------------------------------------------- /deepgram-transcription-agent/requirements.txt: -------------------------------------------------------------------------------- 1 | livekit 2 | livekit-agents 3 | python-dotenv 4 | livekit-plugins-deepgram -------------------------------------------------------------------------------- /hive-moderation-agent/.env.example: -------------------------------------------------------------------------------- 1 | LIVEKIT_URL=XXXXXX 2 | LIVEKIT_API_KEY=XXXXXX 3 | LIVEKIT_API_SECRET=XXXXXX 4 | 5 | HIVE_API_KEY=XXXXXX -------------------------------------------------------------------------------- /text-streaming-agent/.env.example: -------------------------------------------------------------------------------- 1 | LIVEKIT_URL=XXXXXX 2 | LIVEKIT_API_KEY=XXXXXX 3 | LIVEKIT_API_SECRET=XXXXXX 4 | 5 | OPENAI_API_KEY=XXXXXX 
-------------------------------------------------------------------------------- /deepgram-transcription-agent/.env.example: -------------------------------------------------------------------------------- 1 | LIVEKIT_URL=XXXXXX 2 | LIVEKIT_API_KEY=XXXXXX 3 | LIVEKIT_API_SECRET=XXXXXX 4 | 5 | DEEPGRAM_API_KEY=XXXXXX -------------------------------------------------------------------------------- /claude-3-agent/.env.example: -------------------------------------------------------------------------------- 1 | LIVEKIT_URL=XXXXXX 2 | LIVEKIT_API_KEY=XXXXXX 3 | LIVEKIT_API_SECRET=XXXXXX 4 | 5 | DEEPGRAM_API_KEY=XXXXXX 6 | ELEVEN_API_KEY=XXXXXX 7 | ANTHROPIC_API_KEY=XXXXXX -------------------------------------------------------------------------------- /claude-3-agent/requirements.txt: -------------------------------------------------------------------------------- 1 | livekit>=0.9.0 2 | livekit-api 3 | livekit-agents>=0.4.0 4 | livekit-plugins-deepgram>=0.2.0 5 | livekit-plugins-elevenlabs>=0.2.0 6 | python-dotenv 7 | anthropic -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # livekit-agents 2 | 3 | Example agents I've built using the LiveKit Agents (https://github.com/livekit/agents) framework 4 | 5 | ## To run any of these agents 6 | 7 | 1. `cd` into agent subdirectory 8 | 2. `cp .env.example .env` 9 | 3. open the `.env` file and replace `XXXXXX` with proper values for each environment variable 10 | 4. `python agent.py start` 11 | 12 | ## To interact with the agent in the agents playground 13 | 14 | 1. open a browser and navigate to: `https://agents-playground.livekit.io/` 15 | 2. choose to the same LiveKit Cloud project that the agent is running against (or manually enter a websocket URL and participant token if self-hosting) 16 | 3. 
click `Connect` in the top right corner of the playground UI 17 | -------------------------------------------------------------------------------- /text-streaming-agent/agent.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from livekit import rtc 3 | from livekit.agents import ( 4 | JobContext, 5 | JobRequest, 6 | WorkerOptions, 7 | cli, 8 | ) 9 | from chatgpt import ( 10 | ChatGPTMessage, 11 | ChatGPTMessageRole, 12 | ChatGPTPlugin, 13 | ) 14 | 15 | from dotenv import load_dotenv 16 | 17 | load_dotenv() 18 | 19 | CHATGPT_MODEL = "gpt-4-1106-preview" 20 | 21 | 22 | async def entrypoint(job: JobContext): 23 | tasks = [] 24 | chat = rtc.ChatManager(job.room) 25 | chatgpt_plugin = ChatGPTPlugin(prompt="", message_capacity=20, model=CHATGPT_MODEL) 26 | 27 | async def process_chatgpt_result(text_stream): 28 | first_message = True 29 | message: rtc.ChatMessage = None 30 | async for text in text_stream: 31 | if first_message: 32 | message = await chat.send_message(text) 33 | first_message = False 34 | else: 35 | message.message = message.message + text 36 | await chat.update_message(message) 37 | 38 | def on_message_received(message: rtc.ChatMessage): 39 | if message.deleted: 40 | return 41 | msg = ChatGPTMessage(role=ChatGPTMessageRole.user, content=message.message) 42 | chatgpt_result = chatgpt_plugin.add_message(msg) 43 | tasks.append(asyncio.create_task(process_chatgpt_result(chatgpt_result))) 44 | 45 | chat.on("message_received", on_message_received) 46 | 47 | 48 | async def request_fnc(req: JobRequest) -> None: 49 | await req.accept(entrypoint, auto_subscribe=None) 50 | 51 | 52 | if __name__ == "__main__": 53 | cli.run_app(WorkerOptions(request_fnc=request_fnc)) 54 | -------------------------------------------------------------------------------- /deepgram-transcription-agent/agent.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | 
from livekit import agents, rtc 4 | from livekit.plugins.deepgram import STT, SpeechStream 5 | 6 | from dotenv import load_dotenv 7 | 8 | load_dotenv() 9 | 10 | 11 | class TranscriptionAgent: 12 | @classmethod 13 | async def create(cls, ctx: agents.JobContext): 14 | agent = TranscriptionAgent(ctx) 15 | await agent.start() 16 | 17 | def __init__(self, ctx: agents.JobContext): 18 | self.ctx = ctx 19 | self.chat = rtc.ChatManager(ctx.room) 20 | self.stt = STT( 21 | min_silence_duration=100, 22 | ) 23 | 24 | async def start(self): 25 | def on_track_subscribed( 26 | track: rtc.Track, 27 | publication: rtc.TrackPublication, 28 | participant: rtc.RemoteParticipant, 29 | ): 30 | self.ctx.create_task(self.process_track(track, participant)) 31 | 32 | self.ctx.room.on("track_subscribed", on_track_subscribed) 33 | 34 | self.update_agent_state("listening") 35 | 36 | async def process_track( 37 | self, track: rtc.AudioTrack, participant: rtc.RemoteParticipant 38 | ): 39 | audio_stream = rtc.AudioStream(track) 40 | stream = self.stt.stream() 41 | self.ctx.create_task(self.process_stt(stream, participant)) 42 | async for audio_frame_event in audio_stream: 43 | stream.push_frame(audio_frame_event.frame) 44 | await stream.flush() 45 | 46 | async def process_stt( 47 | self, stream: SpeechStream, participant: rtc.RemoteParticipant 48 | ): 49 | buffered_text = "" 50 | async for event in stream: 51 | if event.alternatives[0].text == "": 52 | continue 53 | if event.is_final: 54 | buffered_text = " ".join([buffered_text, event.alternatives[0].text]) 55 | 56 | if not event.end_of_speech: 57 | continue 58 | 59 | self.ctx.create_task( 60 | self.chat.send_message(f"{participant.identity} said: {buffered_text}") 61 | ) 62 | buffered_text = "" 63 | 64 | def update_agent_state(self, state: str): 65 | metadata = json.dumps( 66 | { 67 | "agent_state": state, 68 | } 69 | ) 70 | self.ctx.create_task(self.ctx.room.local_participant.update_metadata(metadata)) 71 | 72 | 73 | if __name__ == "__main__": 
74 | logging.basicConfig(level=logging.INFO) 75 | 76 | async def job_request_cb(job_request: agents.JobRequest): 77 | await job_request.accept( 78 | TranscriptionAgent.create, 79 | identity="deepgram-transcriber", 80 | name="Transcriber", 81 | # subscribe to all video tracks automatically 82 | auto_subscribe=agents.AutoSubscribe.AUDIO_ONLY, 83 | # disconnect when the last participant leaves 84 | auto_disconnect=agents.AutoDisconnect.DEFAULT, 85 | ) 86 | 87 | worker = agents.Worker(request_handler=job_request_cb) 88 | agents.run_app(worker) 89 | -------------------------------------------------------------------------------- /hive-moderation-agent/hive_data_classes.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, is_dataclass 2 | from typing import get_type_hints, List 3 | 4 | from typing import get_type_hints, List, Any 5 | 6 | 7 | def from_dict(cls, data): 8 | if is_dataclass(cls) and isinstance(data, dict): 9 | # Get type hints for all fields in the dataclass 10 | field_types = get_type_hints(cls) 11 | # Special handling for reserved words like 'class' 12 | reserved_word_mappings = {"class": "class_"} # Map 'class' to 'class_' 13 | processed_data = {} 14 | for key, value in data.items(): 15 | # Check if the key is a reserved word and map it accordingly 16 | field_name = reserved_word_mappings.get(key, key) 17 | # Only include keys that have corresponding fields in the dataclass 18 | if field_name in field_types: 19 | field_type = field_types[field_name] 20 | # Determine if the field_type is itself a dataclass 21 | if is_dataclass(field_type): 22 | processed_value = from_dict(field_type, value) 23 | elif hasattr(field_type, "__origin__") and issubclass( 24 | field_type.__origin__, List 25 | ): 26 | # Handle List fields, assuming all elements are of the same type 27 | item_type = field_type.__args__[0] 28 | processed_value = [from_dict(item_type, item) for item in value] 29 | else: 30 | 
processed_value = value 31 | processed_data[field_name] = processed_value 32 | return cls(**processed_data) 33 | elif isinstance(data, list): 34 | # This assumes that the function was called with a list type as `cls`, 35 | # which might not work as expected without context on the list's element type. 36 | # A better approach might be needed for handling lists of dataclasses. 37 | return [ 38 | from_dict(cls.__args__[0], item) if hasattr(cls, "__args__") else item 39 | for item in data 40 | ] 41 | else: 42 | return data 43 | 44 | 45 | @dataclass 46 | class Status: 47 | code: str 48 | message: str 49 | 50 | 51 | @dataclass 52 | class ModInput: 53 | id: str 54 | charge: float 55 | config_tag: SyntaxWarning 56 | config_version: float 57 | created_on: str 58 | model: str 59 | model_type: str 60 | model_version: float 61 | project_id: int 62 | user_id: int 63 | 64 | 65 | @dataclass 66 | class ModClass: 67 | class_: str 68 | score: float 69 | 70 | 71 | @dataclass 72 | class ModOutput: 73 | time: int 74 | classes: List[ModClass] 75 | 76 | 77 | @dataclass 78 | class Response: 79 | input: ModInput 80 | output: List[ModOutput] 81 | 82 | 83 | @dataclass 84 | class ModResponse: 85 | status: Status 86 | response: Response 87 | 88 | 89 | @dataclass 90 | class HiveResponse: 91 | id: str 92 | code: int 93 | project_id: int 94 | user_id: int 95 | created_on: str 96 | status: List[ModResponse] 97 | from_cache: bool 98 | -------------------------------------------------------------------------------- /hive-moderation-agent/agent.py: -------------------------------------------------------------------------------- 1 | import aiohttp 2 | import json 3 | import logging 4 | import time 5 | from livekit import agents, rtc 6 | from PIL import Image 7 | import os 8 | from io import BytesIO 9 | from hive_data_classes import HiveResponse, from_dict 10 | 11 | 12 | from dotenv import load_dotenv 13 | 14 | load_dotenv() 15 | 16 | MOD_FRAME_INTERVAL = 5.0 # check 1 frame every 5 seconds 17 | 18 | 
hive_headers = { 19 | "Authorization": f"Token {os.getenv('HIVE_API_KEY')}", 20 | "accept": "application/json", 21 | } 22 | 23 | 24 | class ModeratorAgent: 25 | @classmethod 26 | async def create(cls, ctx: agents.JobContext): 27 | agent = ModeratorAgent(ctx) 28 | await agent.start() 29 | 30 | def __init__(self, ctx: agents.JobContext): 31 | self.ctx = ctx 32 | self.chat = rtc.ChatManager(ctx.room) 33 | 34 | async def start(self): 35 | self.ctx.create_task( 36 | self.chat.send_message( 37 | "I'm a moderation agent, I will detect and notify you of all inappropriate material you transmit in your video stream" 38 | ) 39 | ) 40 | 41 | def on_track_subscribed( 42 | track: rtc.Track, 43 | publication: rtc.TrackPublication, 44 | participant: rtc.RemoteParticipant, 45 | ): 46 | self.ctx.create_task(self.process_track(track)) 47 | 48 | self.ctx.room.on("track_subscribed", on_track_subscribed) 49 | 50 | self.update_agent_state("monitoring") 51 | 52 | async def process_track(self, track: rtc.VideoTrack): 53 | video_stream = rtc.VideoStream(track) 54 | last_processed_time = 0 55 | frame_interval = MOD_FRAME_INTERVAL 56 | async for frame in video_stream: 57 | current_time = time.time() 58 | if (current_time - last_processed_time) >= frame_interval: 59 | last_processed_time = current_time 60 | self.ctx.create_task(self.detect(frame)) 61 | 62 | async def detect(self, frame: rtc.VideoFrame): 63 | argb_frame = frame.frame.convert(rtc.VideoBufferType.RGBA) 64 | image = Image.frombytes( 65 | "RGBA", (argb_frame.width, argb_frame.height), argb_frame.data 66 | ) 67 | buffer = BytesIO() 68 | image.save(buffer, format="PNG") 69 | buffer.seek(0) # reset buffer position to beginning after writing 70 | 71 | data = aiohttp.FormData() 72 | data.add_field("image", buffer, filename="image.png", content_type="image/png") 73 | 74 | async with aiohttp.ClientSession() as session: 75 | async with session.post( 76 | "https://api.thehive.ai/api/v2/task/sync", 77 | headers=hive_headers, 78 | data=data, 
79 | ) as response: 80 | response_dict = await response.json() 81 | hive_response: HiveResponse = from_dict(HiveResponse, response_dict) 82 | if ( 83 | hive_response.code == 200 84 | and len(hive_response.status) > 0 85 | and len(hive_response.status[0].response.output) > 0 86 | ): 87 | results = hive_response.status[0].response.output[0].classes 88 | if len(results) > 0: 89 | sorted_results = sorted( 90 | results, key=lambda r: r.score, reverse=True 91 | )[:10] 92 | results_str = "Results:\n" 93 | for result in sorted_results: 94 | results_str += f"{result.class_}: {result.score}" 95 | self.ctx.create_task(self.chat.send_message(results_str)) 96 | 97 | def update_agent_state(self, state: str): 98 | metadata = json.dumps( 99 | { 100 | "agent_state": state, 101 | } 102 | ) 103 | self.ctx.create_task(self.ctx.room.local_participant.update_metadata(metadata)) 104 | 105 | 106 | if __name__ == "__main__": 107 | logging.basicConfig(level=logging.INFO) 108 | 109 | async def job_request_cb(job_request: agents.JobRequest): 110 | await job_request.accept( 111 | ModeratorAgent.create, 112 | identity="hive-moderator", 113 | name="Moderator", 114 | # subscribe to all video tracks automatically 115 | auto_subscribe=agents.AutoSubscribe.VIDEO_ONLY, 116 | # disconnect when the last participant leaves 117 | auto_disconnect=agents.AutoDisconnect.DEFAULT, 118 | ) 119 | 120 | worker = agents.Worker(request_handler=job_request_cb) 121 | agents.run_app(worker) 122 | -------------------------------------------------------------------------------- /text-streaming-agent/chatgpt.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 LiveKit, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import os 16 | import logging 17 | import asyncio 18 | import openai 19 | from dataclasses import dataclass 20 | from typing import AsyncIterable, List, Optional 21 | from enum import Enum 22 | 23 | ChatGPTMessageRole = Enum("MessageRole", ["system", "user", "assistant", "function"]) 24 | 25 | 26 | @dataclass 27 | class ChatGPTMessage: 28 | role: ChatGPTMessageRole 29 | content: str 30 | 31 | def to_api(self): 32 | return {"role": self.role.name, "content": self.content} 33 | 34 | 35 | class ChatGPTPlugin: 36 | """OpenAI ChatGPT Plugin""" 37 | 38 | def __init__(self, prompt: str, message_capacity: int, model: str): 39 | """ 40 | Args: 41 | prompt (str): First 'system' message sent to the chat that prompts the assistant 42 | message_capacity (int): Maximum number of messages to send to the chat 43 | model (str): Which model to use (i.e. 
'gpt-3.5-turbo') 44 | """ 45 | self._model = model 46 | self._client = openai.AsyncOpenAI(api_key=os.environ["OPENAI_API_KEY"]) 47 | self._prompt = prompt 48 | self._message_capacity = message_capacity 49 | self._messages: List[ChatGPTMessage] = [] 50 | self._producing_response = False 51 | self._needs_interrupt = False 52 | 53 | def interrupt(self): 54 | """Interrupt a currently streaming response (if there is one)""" 55 | if self._producing_response: 56 | self._needs_interrupt = True 57 | 58 | async def aclose(self): 59 | pass 60 | 61 | async def send_system_prompt(self) -> AsyncIterable[str]: 62 | """Send the system prompt to the chat and generate a streamed response 63 | 64 | Returns: 65 | AsyncIterable[str]: Streamed ChatGPT response 66 | """ 67 | async for text in self.add_message(None): 68 | yield text 69 | 70 | async def add_message( 71 | self, message: Optional[ChatGPTMessage] 72 | ) -> AsyncIterable[str]: 73 | """Add a message to the chat and generate a streamed response 74 | 75 | Args: 76 | message (ChatGPTMessage): The message to add 77 | 78 | Returns: 79 | AsyncIterable[str]: Streamed ChatGPT response 80 | """ 81 | 82 | if message is not None: 83 | self._messages.append(message) 84 | if len(self._messages) > self._message_capacity: 85 | self._messages.pop(0) 86 | 87 | async for text in self._generate_text_streamed(self._model): 88 | yield text 89 | 90 | async def _generate_text_streamed(self, model: str) -> AsyncIterable[str]: 91 | prompt_message = ChatGPTMessage( 92 | role=ChatGPTMessageRole.system, content=self._prompt 93 | ) 94 | try: 95 | chat_messages = [m.to_api() for m in self._messages] 96 | chat_stream = await asyncio.wait_for( 97 | self._client.chat.completions.create( 98 | model=model, 99 | n=1, 100 | stream=True, 101 | messages=[prompt_message.to_api()] + chat_messages, 102 | ), 103 | 10, 104 | ) 105 | except TimeoutError: 106 | yield "Sorry, I'm taking too long to respond. Please try again later." 
107 | return 108 | 109 | self._producing_response = True 110 | complete_response = "" 111 | 112 | async def anext_util(aiter): 113 | async for item in aiter: 114 | return item 115 | 116 | return None 117 | 118 | while True: 119 | try: 120 | chunk = await asyncio.wait_for(anext_util(chat_stream), 5) 121 | except TimeoutError: 122 | break 123 | except asyncio.CancelledError: 124 | self._producing_response = False 125 | self._needs_interrupt = False 126 | break 127 | 128 | if chunk is None: 129 | break 130 | content = chunk.choices[0].delta.content 131 | 132 | if self._needs_interrupt: 133 | self._needs_interrupt = False 134 | logging.info("ChatGPT interrupted") 135 | break 136 | 137 | if content is not None: 138 | complete_response += content 139 | yield content 140 | 141 | self._messages.append( 142 | ChatGPTMessage(role=ChatGPTMessageRole.assistant, content=complete_response) 143 | ) 144 | self._producing_response = False 145 | -------------------------------------------------------------------------------- /claude-3-agent/claude.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 LiveKit, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import os 16 | import logging 17 | import asyncio 18 | from anthropic import AsyncAnthropic 19 | from dataclasses import dataclass 20 | from typing import AsyncIterable, List, Optional 21 | from enum import Enum 22 | 23 | ClaudeMessageRole = Enum("MessageRole", ["system", "user", "assistant", "function"]) 24 | 25 | 26 | class ClaudeModels(Enum): 27 | Claude3Opus = "claude-3-opus-20240229" 28 | Claude3Sonnet = "claude-3-sonnet-20240229" 29 | Claude3Haiku = "claude-3-haiku-20240307" 30 | 31 | 32 | @dataclass 33 | class ClaudeMessage: 34 | role: ClaudeMessageRole 35 | content: str 36 | 37 | def to_api(self): 38 | return {"role": self.role.name, "content": self.content} 39 | 40 | 41 | class ClaudePlugin: 42 | """Claude Plugin""" 43 | 44 | def __init__(self, prompt: str, message_capacity: int, model: str): 45 | """ 46 | Args: 47 | prompt (str): First 'system' message sent to the chat that prompts the assistant 48 | message_capacity (int): Maximum number of messages to send to the chat 49 | model (str): Which model to use (i.e. 
'gpt-3.5-turbo') 50 | """ 51 | self._model = model 52 | self._client = AsyncAnthropic(api_key=os.environ["ANTHROPIC_API_KEY"]) 53 | self._prompt = prompt 54 | self._message_capacity = message_capacity 55 | self._messages: List[ClaudeMessage] = [] 56 | self._producing_response = False 57 | self._needs_interrupt = False 58 | 59 | def interrupt(self): 60 | """Interrupt a currently streaming response (if there is one)""" 61 | if self._producing_response: 62 | self._needs_interrupt = True 63 | 64 | async def aclose(self): 65 | pass 66 | 67 | async def send_system_prompt(self) -> AsyncIterable[str]: 68 | """Send the system prompt to the chat and generate a streamed response 69 | 70 | Returns: 71 | AsyncIterable[str]: Streamed ChatGPT response 72 | """ 73 | async for text in self.add_message(None): 74 | yield text 75 | 76 | async def add_message(self, message: Optional[ClaudeMessage]) -> AsyncIterable[str]: 77 | """Add a message to the chat and generate a streamed response 78 | 79 | Args: 80 | message (ChatGPTMessage): The message to add 81 | 82 | Returns: 83 | AsyncIterable[str]: Streamed ChatGPT response 84 | """ 85 | 86 | if message is not None: 87 | self._messages.append(message) 88 | if len(self._messages) > self._message_capacity: 89 | self._messages.pop(0) 90 | 91 | async for text in self._generate_text_streamed(self._model): 92 | yield text 93 | 94 | async def _generate_text_streamed(self, model: str) -> AsyncIterable[str]: 95 | prompt_message = ClaudeMessage( 96 | role=ClaudeMessageRole.system, content=self._prompt 97 | ) 98 | try: 99 | chat_messages = [m.to_api() for m in self._messages] 100 | chat_stream = await self._client.messages.create( 101 | model=model, 102 | stream=True, 103 | max_tokens=1024, 104 | messages=[prompt_message.to_api()] + chat_messages, 105 | ) 106 | except TimeoutError: 107 | yield "Sorry, I'm taking too long to respond. Please try again later." 
108 | return 109 | 110 | self._producing_response = True 111 | complete_response = "" 112 | 113 | async def anext_util(aiter): 114 | async for item in aiter: 115 | return item 116 | 117 | return None 118 | 119 | while True: 120 | try: 121 | chunk = await asyncio.wait_for(anext_util(chat_stream), 5) 122 | except TimeoutError: 123 | break 124 | except asyncio.CancelledError: 125 | self._producing_response = False 126 | self._needs_interrupt = False 127 | break 128 | 129 | if chunk is None: 130 | break 131 | content = chunk.choices[0].delta.content 132 | 133 | if self._needs_interrupt: 134 | self._needs_interrupt = False 135 | logging.info("ChatGPT interrupted") 136 | break 137 | 138 | if content is not None: 139 | complete_response += content 140 | yield content 141 | 142 | self._messages.append( 143 | ClaudeMessage(role=ClaudeMessageRole.assistant, content=complete_response) 144 | ) 145 | self._producing_response = False 146 | -------------------------------------------------------------------------------- /claude-3-agent/deepgram.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | import json 4 | import logging 5 | import os 6 | from urllib.parse import urlencode 7 | from dataclasses import dataclass 8 | from typing import Optional 9 | 10 | import aiohttp 11 | from livekit import rtc 12 | 13 | from enum import Enum 14 | 15 | 16 | # taken from deepgram-sdk 17 | class LiveTranscriptionEvents(str, Enum): 18 | Open: str = "Open" 19 | Close: str = "Close" 20 | Transcript: str = "Results" 21 | Metadata: str = "Metadata" 22 | UtteranceEnd: str = "UtteranceEnd" 23 | SpeechStarted: str = "SpeechStarted" 24 | Error: str = "Error" 25 | Warning: str = "Warning" 26 | 27 | 28 | STREAM_KEEPALIVE_MSG: str = json.dumps({"type": "KeepAlive"}) 29 | STREAM_CLOSE_MSG: str = json.dumps({"type": "CloseStream"}) 30 | 31 | 32 | class STTStream: 33 | @dataclass 34 | class StartedEvent: 35 | type: str = "started" 36 | 37 | 
@dataclass 38 | class InterimEvent: 39 | text: str 40 | type: str = "interim" 41 | 42 | @dataclass 43 | class FinishedEvent: 44 | text: str 45 | type: str = "finished" 46 | 47 | def __init__( 48 | self, 49 | ) -> None: 50 | super().__init__() 51 | self._api_key = os.environ["DEEPGRAM_API_KEY"] 52 | 53 | self._queue = asyncio.Queue() 54 | self._event_queue = asyncio.Queue[ 55 | STTStream.StartedEvent | STTStream.InterimEvent | STTStream.FinishedEvent 56 | ]() 57 | self._closed = False 58 | self._main_task = asyncio.create_task(self._run(max_retry=32)) 59 | 60 | def log_exception(task: asyncio.Task) -> None: 61 | if not task.cancelled() and task.exception(): 62 | logging.error(f"deepgram task failed: {task.exception()}") 63 | 64 | self._main_task.add_done_callback(log_exception) 65 | 66 | def push_frame(self, frame: rtc.AudioFrame) -> None: 67 | if self._closed: 68 | raise ValueError("cannot push frame to closed stream") 69 | 70 | self._queue.put_nowait(frame.remix_and_resample(16000, 1)) 71 | 72 | async def aclose(self) -> None: 73 | await self._queue.put(STREAM_CLOSE_MSG) 74 | await self._main_task 75 | 76 | async def _run(self, max_retry: int) -> None: 77 | """Try to connect to Deepgram with exponential backoff and forward frames""" 78 | async with aiohttp.ClientSession() as session: 79 | retry_count = 0 80 | ws: Optional[aiohttp.ClientWebSocketResponse] = None 81 | listen_task: Optional[asyncio.Task] = None 82 | keepalive_task: Optional[asyncio.Task] = None 83 | while True: 84 | try: 85 | ws = await self._try_connect(session) 86 | listen_task = asyncio.create_task(self._listen_loop(ws)) 87 | keepalive_task = asyncio.create_task(self._keepalive_loop(ws)) 88 | # break out of the retry loop if we are done 89 | if await self._send_loop(ws): 90 | keepalive_task.cancel() 91 | await asyncio.wait_for(listen_task, timeout=5) 92 | break 93 | except Exception as e: 94 | if retry_count > max_retry and max_retry > 0: 95 | logging.error(f"failed to connect to Deepgram: {e}") 
96 | break 97 | 98 | retry_delay = min(retry_count * 5, 5) # max 5s 99 | retry_count += 1 100 | logging.warning( 101 | f"failed to connect to Deepgram: {e} - retrying in {retry_delay}s" 102 | ) 103 | await asyncio.sleep(retry_delay) 104 | 105 | self._closed = True 106 | 107 | async def _send_loop(self, ws: aiohttp.ClientWebSocketResponse) -> bool: 108 | while not ws.closed: 109 | data = await self._queue.get() 110 | # fire and forget, we don't care if we miss frames in the error case 111 | self._queue.task_done() 112 | 113 | if ws.closed: 114 | raise Exception("websocket closed") 115 | 116 | if isinstance(data, rtc.AudioFrame): 117 | await ws.send_bytes(data.data.tobytes()) 118 | else: 119 | if data == STREAM_CLOSE_MSG: 120 | await ws.send_str(data) 121 | return True 122 | return False 123 | 124 | async def _keepalive_loop(self, ws: aiohttp.ClientWebSocketResponse) -> None: 125 | while not ws.closed: 126 | await ws.send_str(STREAM_KEEPALIVE_MSG) 127 | await asyncio.sleep(5) 128 | 129 | async def _listen_loop(self, ws: aiohttp.ClientWebSocketResponse) -> None: 130 | speaking = False 131 | last_transcript = "" 132 | 133 | while not ws.closed: 134 | msg = await ws.receive() 135 | if msg.type in ( 136 | aiohttp.WSMsgType.CLOSED, 137 | aiohttp.WSMsgType.CLOSE, 138 | aiohttp.WSMsgType.CLOSING, 139 | ): 140 | break 141 | 142 | try: 143 | if msg.type == aiohttp.WSMsgType.TEXT: 144 | data = json.loads(msg.data) 145 | type = data.get("type") 146 | if not type: 147 | continue 148 | 149 | if not speaking: 150 | if type == LiveTranscriptionEvents.SpeechStarted: 151 | speaking = True 152 | event = self.StartedEvent() 153 | await self._event_queue.put(event) 154 | else: 155 | if type == LiveTranscriptionEvents.UtteranceEnd: 156 | if last_transcript != "": 157 | speaking = False 158 | event = self.FinishedEvent(text=last_transcript) 159 | last_transcript = "" 160 | await self._event_queue.put(event) 161 | elif type == LiveTranscriptionEvents.Transcript: 162 | is_final_transcript = 
data["is_final"] 163 | is_endpoint = data["speech_final"] 164 | 165 | if is_final_transcript: 166 | transcript = data["channel"]["alternatives"][0][ 167 | "transcript" 168 | ] 169 | if transcript != "": 170 | last_transcript += transcript 171 | if not is_endpoint: 172 | last_transcript += " " 173 | 174 | if is_endpoint and last_transcript != "": 175 | speaking = False 176 | event = self.FinishedEvent(text=last_transcript) 177 | last_transcript = "" 178 | await self._event_queue.put(event) 179 | 180 | except Exception as e: 181 | logging.error("Error handling message %s: %s", msg, e) 182 | continue 183 | 184 | async def _try_connect( 185 | self, session: aiohttp.ClientSession 186 | ) -> aiohttp.ClientWebSocketResponse: 187 | live_config = { 188 | "model": "nova-2", 189 | "language": "en-US", 190 | "filler_words": True, 191 | "punctuate": True, 192 | "smart_format": True, 193 | "interim_results": True, 194 | "encoding": "linear16", 195 | "sample_rate": 16000, 196 | "channels": 1, 197 | "endpointing": 200, 198 | "vad_events": True, 199 | "utterance_end_ms": 1000, 200 | } 201 | 202 | query_params = urlencode(live_config).lower() 203 | 204 | url = f"wss://api.deepgram.com/v1/listen?{query_params}" 205 | ws = await session.ws_connect( 206 | url, headers={"Authorization": f"Token {self._api_key}"} 207 | ) 208 | 209 | return ws 210 | 211 | def __aiter__(self) -> "STTStream": 212 | return self 213 | 214 | async def __anext__( 215 | self, 216 | ): 217 | if self._closed and self._event_queue.empty(): 218 | raise StopAsyncIteration 219 | 220 | return await self._event_queue.get() 221 | -------------------------------------------------------------------------------- /claude-3-agent/agent.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 LiveKit, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import os 16 | import random 17 | import time 18 | import threading 19 | import asyncio 20 | from datetime import datetime 21 | from enum import Enum 22 | import json 23 | import logging 24 | from typing import AsyncIterable 25 | 26 | from livekit import rtc, agents 27 | from livekit.agents.tts import SynthesisEvent, SynthesisEventType 28 | 29 | from claude import ClaudeMessage, ClaudeMessageRole, ClaudePlugin, ClaudeModels 30 | from deepgram import STTStream 31 | from livekit.plugins.elevenlabs import TTS 32 | 33 | from dotenv import load_dotenv 34 | 35 | load_dotenv() 36 | 37 | 38 | PROMPT = "You are KITT, a friendly voice assistant powered by LiveKit. \ 39 | Conversation should be personable, and be sure to ask follow up questions. \ 40 | If your response is a question, please append a question mark symbol to the end of it.\ 41 | Don't respond with more than a few sentences." 42 | INTRO = "Hello, I am KITT, a friendly voice assistant powered by LiveKit Agents. \ 43 | You can find my source code in the top right of this screen if you're curious how I work. \ 44 | Feel free to ask me anything — I'm here to help! Just start talking or type in the chat." 45 | SIP_INTRO = "Hello, I am KITT, a friendly voice assistant powered by LiveKit Agents. \ 46 | Feel free to ask me anything — I'm here to help! Just start talking." 
# convert intro response to a stream
async def intro_text_stream(sip: bool):
    """Yield the one-shot intro text so it can flow through the same
    streaming pipeline (TTS + chat) as a Claude response.

    SIP callers get a shorter intro with no references to the on-screen UI.
    """
    if sip:
        yield SIP_INTRO
        return

    yield INTRO


AgentState = Enum("AgentState", "IDLE, LISTENING, THINKING, SPEAKING")

ELEVEN_TTS_SAMPLE_RATE = 24000
ELEVEN_TTS_CHANNELS = 1


class WorkerLifecycle:
    """Rotates the worker process: after a random uptime it stops accepting
    new jobs, then hard-kills the process a few minutes later so in-flight
    jobs have a chance to wind down."""

    def __init__(self):
        self._accepting_jobs = True
        # Stop accepting jobs after a random time between 60 and 120 minutes.
        # (Fixed comment: the old one claimed 20-30 minutes, but the randrange
        # arguments are seconds: 60*60 .. 60*120.)
        # daemon=True so this long-sleeping thread never pins the interpreter
        # alive if the main thread exits first; its only job is os._exit below,
        # which is unaffected while the process is running.
        self._stop_thread = threading.Thread(
            target=self._stop_accepting_jobs_after,
            args=(random.randrange(60 * 60, 60 * 120),),
            daemon=True,
        )
        self._stop_thread.start()

    def _stop_accepting_jobs_after(self, after: int):
        """Sleep `after` seconds, stop accepting jobs, then schedule the kill."""
        time.sleep(after)
        self._accepting_jobs = False
        self._kill_after(
            random.randrange(2 * 60, 4 * 60)
        )  # kill 2-4 minutes after stopping accepting jobs

    def _kill_after(self, after: int):
        time.sleep(after)
        self._kill()

    def should_accept_job(self):
        """True while this worker should still take on new jobs."""
        return self._accepting_jobs

    def _kill(self):
        # Hard-exit on purpose (skips atexit/cleanup) so a wedged worker
        # cannot linger past its scheduled rotation.
        os._exit(0)


class KITT:
    """Voice assistant agent: Deepgram STT -> Claude -> ElevenLabs TTS,
    wired into a LiveKit room. Also mirrors responses into text chat and
    publishes transcriptions as data messages."""

    @classmethod
    async def create(cls, ctx: agents.JobContext):
        """Factory handed to JobRequest.accept(); builds the agent and starts it."""
        kitt = KITT(ctx)
        await kitt.start()

    def __init__(self, ctx: agents.JobContext):
        # plugins
        self.claude_plugin = ClaudePlugin(
            prompt=PROMPT, message_capacity=20, model=ClaudeModels.Claude3Opus.value
        )
        self.tts_plugin = TTS(
            model_id="eleven_turbo_v2", sample_rate=ELEVEN_TTS_SAMPLE_RATE
        )

        self.ctx: agents.JobContext = ctx
        self.chat = rtc.ChatManager(ctx.room)
        self.audio_out = rtc.AudioSource(ELEVEN_TTS_SAMPLE_RATE, ELEVEN_TTS_CHANNELS)

        self._sending_audio = False
        self._processing = False
        self._agent_state: AgentState = AgentState.IDLE

        self.chat.on("message_received", self.on_chat_received)
        self.ctx.room.on("track_subscribed", self.on_track_subscribed)

    async def start(self):
        """Publish the agent's audio track and speak the intro."""
        # if you have to perform teardown cleanup, you can listen to the disconnected event
        # self.ctx.room.on("disconnected", your_cleanup_function)

        # publish audio track
        track = rtc.LocalAudioTrack.create_audio_track("agent-mic", self.audio_out)
        await self.ctx.room.local_participant.publish_track(track)

        # allow the participant to fully subscribe to the agent's audio track, so it doesn't miss
        # anything in the beginning
        await asyncio.sleep(1)

        sip = self.ctx.room.name.startswith("sip")
        await self.process_claude_result(intro_text_stream(sip))
        self.update_state()

    def on_chat_received(self, message: rtc.ChatMessage):
        """Forward a typed chat message to Claude and stream the reply back."""
        # TODO: handle deleted and updated messages in message context
        if message.deleted:
            return

        msg = ClaudeMessage(role=ClaudeMessageRole.user, content=message.message)
        claude_result = self.claude_plugin.add_message(msg)
        # BUG FIX: previously called self.process_chatgpt_result, a method that
        # does not exist on this class (leftover from a ChatGPT-based example),
        # so every chat message raised AttributeError inside the task.
        self.ctx.create_task(self.process_claude_result(claude_result))

    def on_track_subscribed(
        self,
        track: rtc.Track,
        publication: rtc.TrackPublication,
        participant: rtc.RemoteParticipant,
    ):
        # Each subscribed (audio) track gets its own STT pipeline task.
        self.ctx.create_task(self.process_track(track))

    async def process_track(self, track: rtc.Track):
        """Pump audio frames from a remote track into a Deepgram STT stream."""
        audio_stream = rtc.AudioStream(track)
        stream = STTStream()
        self.ctx.create_task(self.process_stt_stream(stream))
        async for audio_frame_event in audio_stream:
            # Drop frames while we are speaking/thinking so the agent does
            # not transcribe itself or queue stale audio.
            if self._agent_state != AgentState.LISTENING:
                continue
            stream.push_frame(audio_frame_event.frame)
        await stream.flush()

    async def process_stt_stream(self, stream: STTStream):
        """Consume STT events; on each finished utterance, publish the
        transcription to the room and hand the text to Claude."""
        buffered_text = ""
        async for event in stream:
            if event.type == "started":
                pass
            elif event.type == "interim":
                pass
            elif event.type == "finished":
                if event.text == "":
                    continue

                # BUG FIX: the old `" ".join([buffered_text, event.text])`
                # always ran with buffered_text == "" and so prefixed every
                # utterance with a stray leading space; strip() removes it
                # while still joining multi-part buffers with single spaces.
                buffered_text = f"{buffered_text} {event.text}".strip()
                await self.ctx.room.local_participant.publish_data(
                    json.dumps(
                        {
                            "text": buffered_text,
                            "timestamp": int(datetime.now().timestamp() * 1000),
                        }
                    ),
                    topic="transcription",
                )

                msg = ClaudeMessage(role=ClaudeMessageRole.user, content=buffered_text)
                claude_stream = self.claude_plugin.add_message(msg)
                self.ctx.create_task(self.process_claude_result(claude_stream))
                buffered_text = ""

    async def process_claude_result(self, text_stream):
        """Stream Claude's text into TTS as it arrives, then send the full
        response as a single chat message."""
        self.update_state(processing=True)

        stream = self.tts_plugin.stream()
        # send audio to TTS in parallel
        self.ctx.create_task(self.send_audio_stream(stream))
        all_text = ""
        async for text in text_stream:
            stream.push_text(text)
            all_text += text

        self.update_state(processing=False)
        # buffer up the entire response from Claude before sending a chat message
        # (fixed comment: previously said "Groq" — wrong model)
        await self.chat.send_message(all_text)
        await stream.flush()

    async def send_audio_stream(self, tts_stream: AsyncIterable[SynthesisEvent]):
        """Relay synthesized audio frames into the published audio track,
        flipping the SPEAKING state on start/finish events."""
        async for e in tts_stream:
            if e.type == SynthesisEventType.STARTED:
                self.update_state(sending_audio=True)
            elif e.type == SynthesisEventType.FINISHED:
                self.update_state(sending_audio=False)
            elif e.type == SynthesisEventType.AUDIO:
                await self.audio_out.capture_frame(e.audio.data)
        await tts_stream.aclose()

    def update_state(self, sending_audio: bool = None, processing: bool = None):
        """Recompute the agent state (SPEAKING > THINKING > LISTENING) and
        publish it via participant metadata. Pass-through args left as None
        keep their current values."""
        if sending_audio is not None:
            self._sending_audio = sending_audio
        if processing is not None:
            self._processing = processing

        state = AgentState.LISTENING
        if self._sending_audio:
            state = AgentState.SPEAKING
        elif self._processing:
            state = AgentState.THINKING

        self._agent_state = state
        metadata = json.dumps(
            {
                "agent_state": state.name.lower(),
            }
        )
        self.ctx.create_task(self.ctx.room.local_participant.update_metadata(metadata))


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    worker_lifecycle = WorkerLifecycle()

    async def job_request_cb(job_request: agents.JobRequest):
        # BUG FIX: the acceptance log used to fire before the lifecycle check,
        # so rejected jobs were logged as accepted. Check first, log accurately.
        if not worker_lifecycle.should_accept_job():
            logging.info("Rejecting job: worker is draining before rotation")
            await job_request.reject()
            return

        logging.info("Accepting job for KITT")
        await job_request.accept(
            KITT.create,
            identity="claude_agent",
            name="Claude",
            auto_subscribe=agents.AutoSubscribe.AUDIO_ONLY,
            auto_disconnect=agents.AutoDisconnect.DEFAULT,
        )

    worker = agents.Worker(request_handler=job_request_cb)
    agents.run_app(worker)