├── .gitignore ├── text-streaming-agent ├── requirements.txt ├── .env.example ├── agent.py └── chatgpt.py ├── hive-moderation-agent ├── requirements.txt ├── .env.example ├── hive_data_classes.py └── agent.py ├── deepgram-transcription-agent ├── requirements.txt ├── .env.example └── agent.py ├── claude-3-agent ├── .env.example ├── requirements.txt ├── claude.py ├── deepgram.py └── agent.py └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | **/.env 2 | **/.venv 3 | **/__pycache__ -------------------------------------------------------------------------------- /text-streaming-agent/requirements.txt: -------------------------------------------------------------------------------- 1 | livekit 2 | livekit-agents 3 | python-dotenv 4 | openai -------------------------------------------------------------------------------- /hive-moderation-agent/requirements.txt: -------------------------------------------------------------------------------- 1 | livekit 2 | livekit-agents 3 | python-dotenv 4 | Pillow 5 | aiohttp -------------------------------------------------------------------------------- /deepgram-transcription-agent/requirements.txt: -------------------------------------------------------------------------------- 1 | livekit 2 | livekit-agents 3 | python-dotenv 4 | livekit-plugins-deepgram -------------------------------------------------------------------------------- /hive-moderation-agent/.env.example: -------------------------------------------------------------------------------- 1 | LIVEKIT_URL=XXXXXX 2 | LIVEKIT_API_KEY=XXXXXX 3 | LIVEKIT_API_SECRET=XXXXXX 4 | 5 | HIVE_API_KEY=XXXXXX -------------------------------------------------------------------------------- /text-streaming-agent/.env.example: -------------------------------------------------------------------------------- 1 | LIVEKIT_URL=XXXXXX 2 | LIVEKIT_API_KEY=XXXXXX 3 | LIVEKIT_API_SECRET=XXXXXX 4 | 5 | OPENAI_API_KEY=XXXXXX 
-------------------------------------------------------------------------------- /deepgram-transcription-agent/.env.example: -------------------------------------------------------------------------------- 1 | LIVEKIT_URL=XXXXXX 2 | LIVEKIT_API_KEY=XXXXXX 3 | LIVEKIT_API_SECRET=XXXXXX 4 | 5 | DEEPGRAM_API_KEY=XXXXXX -------------------------------------------------------------------------------- /claude-3-agent/.env.example: -------------------------------------------------------------------------------- 1 | LIVEKIT_URL=XXXXXX 2 | LIVEKIT_API_KEY=XXXXXX 3 | LIVEKIT_API_SECRET=XXXXXX 4 | 5 | DEEPGRAM_API_KEY=XXXXXX 6 | ELEVEN_API_KEY=XXXXXX 7 | ANTHROPIC_API_KEY=XXXXXX -------------------------------------------------------------------------------- /claude-3-agent/requirements.txt: -------------------------------------------------------------------------------- 1 | livekit>=0.9.0 2 | livekit-api 3 | livekit-agents>=0.4.0 4 | livekit-plugins-deepgram>=0.2.0 5 | livekit-plugins-elevenlabs>=0.2.0 6 | python-dotenv 7 | anthropic -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # livekit-agents 2 | 3 | Example agents I've built using the LiveKit Agents (https://github.com/livekit/agents) framework 4 | 5 | ## To run any of these agents 6 | 7 | 1. `cd` into agent subdirectory 8 | 2. `cp .env.example .env` 9 | 3. open the `.env` file and replace `XXXXXX` with proper values for each environment variable 10 | 4. `python agent.py start` 11 | 12 | ## To interact with the agent in the agents playground 13 | 14 | 1. open a browser and navigate to: `https://agents-playground.livekit.io/` 15 | 2. choose to the same LiveKit Cloud project that the agent is running against (or manually enter a websocket URL and participant token if self-hosting) 16 | 3. 
click `Connect` in the top right corner of the playground UI 17 | -------------------------------------------------------------------------------- /text-streaming-agent/agent.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from livekit import rtc 3 | from livekit.agents import ( 4 | JobContext, 5 | JobRequest, 6 | WorkerOptions, 7 | cli, 8 | ) 9 | from chatgpt import ( 10 | ChatGPTMessage, 11 | ChatGPTMessageRole, 12 | ChatGPTPlugin, 13 | ) 14 | 15 | from dotenv import load_dotenv 16 | 17 | load_dotenv() 18 | 19 | CHATGPT_MODEL = "gpt-4-1106-preview" 20 | 21 | 22 | async def entrypoint(job: JobContext): 23 | tasks = [] 24 | chat = rtc.ChatManager(job.room) 25 | chatgpt_plugin = ChatGPTPlugin(prompt="", message_capacity=20, model=CHATGPT_MODEL) 26 | 27 | async def process_chatgpt_result(text_stream): 28 | first_message = True 29 | message: rtc.ChatMessage = None 30 | async for text in text_stream: 31 | if first_message: 32 | message = await chat.send_message(text) 33 | first_message = False 34 | else: 35 | message.message = message.message + text 36 | await chat.update_message(message) 37 | 38 | def on_message_received(message: rtc.ChatMessage): 39 | if message.deleted: 40 | return 41 | msg = ChatGPTMessage(role=ChatGPTMessageRole.user, content=message.message) 42 | chatgpt_result = chatgpt_plugin.add_message(msg) 43 | tasks.append(asyncio.create_task(process_chatgpt_result(chatgpt_result))) 44 | 45 | chat.on("message_received", on_message_received) 46 | 47 | 48 | async def request_fnc(req: JobRequest) -> None: 49 | await req.accept(entrypoint, auto_subscribe=None) 50 | 51 | 52 | if __name__ == "__main__": 53 | cli.run_app(WorkerOptions(request_fnc=request_fnc)) 54 | -------------------------------------------------------------------------------- /deepgram-transcription-agent/agent.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | 
from livekit import agents, rtc 4 | from livekit.plugins.deepgram import STT, SpeechStream 5 | 6 | from dotenv import load_dotenv 7 | 8 | load_dotenv() 9 | 10 | 11 | class TranscriptionAgent: 12 | @classmethod 13 | async def create(cls, ctx: agents.JobContext): 14 | agent = TranscriptionAgent(ctx) 15 | await agent.start() 16 | 17 | def __init__(self, ctx: agents.JobContext): 18 | self.ctx = ctx 19 | self.chat = rtc.ChatManager(ctx.room) 20 | self.stt = STT( 21 | min_silence_duration=100, 22 | ) 23 | 24 | async def start(self): 25 | def on_track_subscribed( 26 | track: rtc.Track, 27 | publication: rtc.TrackPublication, 28 | participant: rtc.RemoteParticipant, 29 | ): 30 | self.ctx.create_task(self.process_track(track, participant)) 31 | 32 | self.ctx.room.on("track_subscribed", on_track_subscribed) 33 | 34 | self.update_agent_state("listening") 35 | 36 | async def process_track( 37 | self, track: rtc.AudioTrack, participant: rtc.RemoteParticipant 38 | ): 39 | audio_stream = rtc.AudioStream(track) 40 | stream = self.stt.stream() 41 | self.ctx.create_task(self.process_stt(stream, participant)) 42 | async for audio_frame_event in audio_stream: 43 | stream.push_frame(audio_frame_event.frame) 44 | await stream.flush() 45 | 46 | async def process_stt( 47 | self, stream: SpeechStream, participant: rtc.RemoteParticipant 48 | ): 49 | buffered_text = "" 50 | async for event in stream: 51 | if event.alternatives[0].text == "": 52 | continue 53 | if event.is_final: 54 | buffered_text = " ".join([buffered_text, event.alternatives[0].text]) 55 | 56 | if not event.end_of_speech: 57 | continue 58 | 59 | self.ctx.create_task( 60 | self.chat.send_message(f"{participant.identity} said: {buffered_text}") 61 | ) 62 | buffered_text = "" 63 | 64 | def update_agent_state(self, state: str): 65 | metadata = json.dumps( 66 | { 67 | "agent_state": state, 68 | } 69 | ) 70 | self.ctx.create_task(self.ctx.room.local_participant.update_metadata(metadata)) 71 | 72 | 73 | if __name__ == "__main__": 
74 | logging.basicConfig(level=logging.INFO) 75 | 76 | async def job_request_cb(job_request: agents.JobRequest): 77 | await job_request.accept( 78 | TranscriptionAgent.create, 79 | identity="deepgram-transcriber", 80 | name="Transcriber", 81 | # subscribe to all video tracks automatically 82 | auto_subscribe=agents.AutoSubscribe.AUDIO_ONLY, 83 | # disconnect when the last participant leaves 84 | auto_disconnect=agents.AutoDisconnect.DEFAULT, 85 | ) 86 | 87 | worker = agents.Worker(request_handler=job_request_cb) 88 | agents.run_app(worker) 89 | -------------------------------------------------------------------------------- /hive-moderation-agent/hive_data_classes.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, is_dataclass 2 | from typing import get_type_hints, List 3 | 4 | from typing import get_type_hints, List, Any 5 | 6 | 7 | def from_dict(cls, data): 8 | if is_dataclass(cls) and isinstance(data, dict): 9 | # Get type hints for all fields in the dataclass 10 | field_types = get_type_hints(cls) 11 | # Special handling for reserved words like 'class' 12 | reserved_word_mappings = {"class": "class_"} # Map 'class' to 'class_' 13 | processed_data = {} 14 | for key, value in data.items(): 15 | # Check if the key is a reserved word and map it accordingly 16 | field_name = reserved_word_mappings.get(key, key) 17 | # Only include keys that have corresponding fields in the dataclass 18 | if field_name in field_types: 19 | field_type = field_types[field_name] 20 | # Determine if the field_type is itself a dataclass 21 | if is_dataclass(field_type): 22 | processed_value = from_dict(field_type, value) 23 | elif hasattr(field_type, "__origin__") and issubclass( 24 | field_type.__origin__, List 25 | ): 26 | # Handle List fields, assuming all elements are of the same type 27 | item_type = field_type.__args__[0] 28 | processed_value = [from_dict(item_type, item) for item in value] 29 | else: 30 | 
processed_value = value 31 | processed_data[field_name] = processed_value 32 | return cls(**processed_data) 33 | elif isinstance(data, list): 34 | # This assumes that the function was called with a list type as `cls`, 35 | # which might not work as expected without context on the list's element type. 36 | # A better approach might be needed for handling lists of dataclasses. 37 | return [ 38 | from_dict(cls.__args__[0], item) if hasattr(cls, "__args__") else item 39 | for item in data 40 | ] 41 | else: 42 | return data 43 | 44 | 45 | @dataclass 46 | class Status: 47 | code: str 48 | message: str 49 | 50 | 51 | @dataclass 52 | class ModInput: 53 | id: str 54 | charge: float 55 | config_tag: SyntaxWarning 56 | config_version: float 57 | created_on: str 58 | model: str 59 | model_type: str 60 | model_version: float 61 | project_id: int 62 | user_id: int 63 | 64 | 65 | @dataclass 66 | class ModClass: 67 | class_: str 68 | score: float 69 | 70 | 71 | @dataclass 72 | class ModOutput: 73 | time: int 74 | classes: List[ModClass] 75 | 76 | 77 | @dataclass 78 | class Response: 79 | input: ModInput 80 | output: List[ModOutput] 81 | 82 | 83 | @dataclass 84 | class ModResponse: 85 | status: Status 86 | response: Response 87 | 88 | 89 | @dataclass 90 | class HiveResponse: 91 | id: str 92 | code: int 93 | project_id: int 94 | user_id: int 95 | created_on: str 96 | status: List[ModResponse] 97 | from_cache: bool 98 | -------------------------------------------------------------------------------- /hive-moderation-agent/agent.py: -------------------------------------------------------------------------------- 1 | import aiohttp 2 | import json 3 | import logging 4 | import time 5 | from livekit import agents, rtc 6 | from PIL import Image 7 | import os 8 | from io import BytesIO 9 | from hive_data_classes import HiveResponse, from_dict 10 | 11 | 12 | from dotenv import load_dotenv 13 | 14 | load_dotenv() 15 | 16 | MOD_FRAME_INTERVAL = 5.0 # check 1 frame every 5 seconds 17 | 18 | 
hive_headers = { 19 | "Authorization": f"Token {os.getenv('HIVE_API_KEY')}", 20 | "accept": "application/json", 21 | } 22 | 23 | 24 | class ModeratorAgent: 25 | @classmethod 26 | async def create(cls, ctx: agents.JobContext): 27 | agent = ModeratorAgent(ctx) 28 | await agent.start() 29 | 30 | def __init__(self, ctx: agents.JobContext): 31 | self.ctx = ctx 32 | self.chat = rtc.ChatManager(ctx.room) 33 | 34 | async def start(self): 35 | self.ctx.create_task( 36 | self.chat.send_message( 37 | "I'm a moderation agent, I will detect and notify you of all inappropriate material you transmit in your video stream" 38 | ) 39 | ) 40 | 41 | def on_track_subscribed( 42 | track: rtc.Track, 43 | publication: rtc.TrackPublication, 44 | participant: rtc.RemoteParticipant, 45 | ): 46 | self.ctx.create_task(self.process_track(track)) 47 | 48 | self.ctx.room.on("track_subscribed", on_track_subscribed) 49 | 50 | self.update_agent_state("monitoring") 51 | 52 | async def process_track(self, track: rtc.VideoTrack): 53 | video_stream = rtc.VideoStream(track) 54 | last_processed_time = 0 55 | frame_interval = MOD_FRAME_INTERVAL 56 | async for frame in video_stream: 57 | current_time = time.time() 58 | if (current_time - last_processed_time) >= frame_interval: 59 | last_processed_time = current_time 60 | self.ctx.create_task(self.detect(frame)) 61 | 62 | async def detect(self, frame: rtc.VideoFrame): 63 | argb_frame = frame.frame.convert(rtc.VideoBufferType.RGBA) 64 | image = Image.frombytes( 65 | "RGBA", (argb_frame.width, argb_frame.height), argb_frame.data 66 | ) 67 | buffer = BytesIO() 68 | image.save(buffer, format="PNG") 69 | buffer.seek(0) # reset buffer position to beginning after writing 70 | 71 | data = aiohttp.FormData() 72 | data.add_field("image", buffer, filename="image.png", content_type="image/png") 73 | 74 | async with aiohttp.ClientSession() as session: 75 | async with session.post( 76 | "https://api.thehive.ai/api/v2/task/sync", 77 | headers=hive_headers, 78 | data=data, 
79 | ) as response: 80 | response_dict = await response.json() 81 | hive_response: HiveResponse = from_dict(HiveResponse, response_dict) 82 | if ( 83 | hive_response.code == 200 84 | and len(hive_response.status) > 0 85 | and len(hive_response.status[0].response.output) > 0 86 | ): 87 | results = hive_response.status[0].response.output[0].classes 88 | if len(results) > 0: 89 | sorted_results = sorted( 90 | results, key=lambda r: r.score, reverse=True 91 | )[:10] 92 | results_str = "Results:\n" 93 | for result in sorted_results: 94 | results_str += f"{result.class_}: {result.score}" 95 | self.ctx.create_task(self.chat.send_message(results_str)) 96 | 97 | def update_agent_state(self, state: str): 98 | metadata = json.dumps( 99 | { 100 | "agent_state": state, 101 | } 102 | ) 103 | self.ctx.create_task(self.ctx.room.local_participant.update_metadata(metadata)) 104 | 105 | 106 | if __name__ == "__main__": 107 | logging.basicConfig(level=logging.INFO) 108 | 109 | async def job_request_cb(job_request: agents.JobRequest): 110 | await job_request.accept( 111 | ModeratorAgent.create, 112 | identity="hive-moderator", 113 | name="Moderator", 114 | # subscribe to all video tracks automatically 115 | auto_subscribe=agents.AutoSubscribe.VIDEO_ONLY, 116 | # disconnect when the last participant leaves 117 | auto_disconnect=agents.AutoDisconnect.DEFAULT, 118 | ) 119 | 120 | worker = agents.Worker(request_handler=job_request_cb) 121 | agents.run_app(worker) 122 | -------------------------------------------------------------------------------- /text-streaming-agent/chatgpt.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 LiveKit, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import os 16 | import logging 17 | import asyncio 18 | import openai 19 | from dataclasses import dataclass 20 | from typing import AsyncIterable, List, Optional 21 | from enum import Enum 22 | 23 | ChatGPTMessageRole = Enum("MessageRole", ["system", "user", "assistant", "function"]) 24 | 25 | 26 | @dataclass 27 | class ChatGPTMessage: 28 | role: ChatGPTMessageRole 29 | content: str 30 | 31 | def to_api(self): 32 | return {"role": self.role.name, "content": self.content} 33 | 34 | 35 | class ChatGPTPlugin: 36 | """OpenAI ChatGPT Plugin""" 37 | 38 | def __init__(self, prompt: str, message_capacity: int, model: str): 39 | """ 40 | Args: 41 | prompt (str): First 'system' message sent to the chat that prompts the assistant 42 | message_capacity (int): Maximum number of messages to send to the chat 43 | model (str): Which model to use (i.e. 
'gpt-3.5-turbo') 44 | """ 45 | self._model = model 46 | self._client = openai.AsyncOpenAI(api_key=os.environ["OPENAI_API_KEY"]) 47 | self._prompt = prompt 48 | self._message_capacity = message_capacity 49 | self._messages: List[ChatGPTMessage] = [] 50 | self._producing_response = False 51 | self._needs_interrupt = False 52 | 53 | def interrupt(self): 54 | """Interrupt a currently streaming response (if there is one)""" 55 | if self._producing_response: 56 | self._needs_interrupt = True 57 | 58 | async def aclose(self): 59 | pass 60 | 61 | async def send_system_prompt(self) -> AsyncIterable[str]: 62 | """Send the system prompt to the chat and generate a streamed response 63 | 64 | Returns: 65 | AsyncIterable[str]: Streamed ChatGPT response 66 | """ 67 | async for text in self.add_message(None): 68 | yield text 69 | 70 | async def add_message( 71 | self, message: Optional[ChatGPTMessage] 72 | ) -> AsyncIterable[str]: 73 | """Add a message to the chat and generate a streamed response 74 | 75 | Args: 76 | message (ChatGPTMessage): The message to add 77 | 78 | Returns: 79 | AsyncIterable[str]: Streamed ChatGPT response 80 | """ 81 | 82 | if message is not None: 83 | self._messages.append(message) 84 | if len(self._messages) > self._message_capacity: 85 | self._messages.pop(0) 86 | 87 | async for text in self._generate_text_streamed(self._model): 88 | yield text 89 | 90 | async def _generate_text_streamed(self, model: str) -> AsyncIterable[str]: 91 | prompt_message = ChatGPTMessage( 92 | role=ChatGPTMessageRole.system, content=self._prompt 93 | ) 94 | try: 95 | chat_messages = [m.to_api() for m in self._messages] 96 | chat_stream = await asyncio.wait_for( 97 | self._client.chat.completions.create( 98 | model=model, 99 | n=1, 100 | stream=True, 101 | messages=[prompt_message.to_api()] + chat_messages, 102 | ), 103 | 10, 104 | ) 105 | except TimeoutError: 106 | yield "Sorry, I'm taking too long to respond. Please try again later." 
107 | return 108 | 109 | self._producing_response = True 110 | complete_response = "" 111 | 112 | async def anext_util(aiter): 113 | async for item in aiter: 114 | return item 115 | 116 | return None 117 | 118 | while True: 119 | try: 120 | chunk = await asyncio.wait_for(anext_util(chat_stream), 5) 121 | except TimeoutError: 122 | break 123 | except asyncio.CancelledError: 124 | self._producing_response = False 125 | self._needs_interrupt = False 126 | break 127 | 128 | if chunk is None: 129 | break 130 | content = chunk.choices[0].delta.content 131 | 132 | if self._needs_interrupt: 133 | self._needs_interrupt = False 134 | logging.info("ChatGPT interrupted") 135 | break 136 | 137 | if content is not None: 138 | complete_response += content 139 | yield content 140 | 141 | self._messages.append( 142 | ChatGPTMessage(role=ChatGPTMessageRole.assistant, content=complete_response) 143 | ) 144 | self._producing_response = False 145 | -------------------------------------------------------------------------------- /claude-3-agent/claude.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 LiveKit, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import os 16 | import logging 17 | import asyncio 18 | from anthropic import AsyncAnthropic 19 | from dataclasses import dataclass 20 | from typing import AsyncIterable, List, Optional 21 | from enum import Enum 22 | 23 | ClaudeMessageRole = Enum("MessageRole", ["system", "user", "assistant", "function"]) 24 | 25 | 26 | class ClaudeModels(Enum): 27 | Claude3Opus = "claude-3-opus-20240229" 28 | Claude3Sonnet = "claude-3-sonnet-20240229" 29 | Claude3Haiku = "claude-3-haiku-20240307" 30 | 31 | 32 | @dataclass 33 | class ClaudeMessage: 34 | role: ClaudeMessageRole 35 | content: str 36 | 37 | def to_api(self): 38 | return {"role": self.role.name, "content": self.content} 39 | 40 | 41 | class ClaudePlugin: 42 | """Claude Plugin""" 43 | 44 | def __init__(self, prompt: str, message_capacity: int, model: str): 45 | """ 46 | Args: 47 | prompt (str): First 'system' message sent to the chat that prompts the assistant 48 | message_capacity (int): Maximum number of messages to send to the chat 49 | model (str): Which model to use (i.e. 
'gpt-3.5-turbo') 50 | """ 51 | self._model = model 52 | self._client = AsyncAnthropic(api_key=os.environ["ANTHROPIC_API_KEY"]) 53 | self._prompt = prompt 54 | self._message_capacity = message_capacity 55 | self._messages: List[ClaudeMessage] = [] 56 | self._producing_response = False 57 | self._needs_interrupt = False 58 | 59 | def interrupt(self): 60 | """Interrupt a currently streaming response (if there is one)""" 61 | if self._producing_response: 62 | self._needs_interrupt = True 63 | 64 | async def aclose(self): 65 | pass 66 | 67 | async def send_system_prompt(self) -> AsyncIterable[str]: 68 | """Send the system prompt to the chat and generate a streamed response 69 | 70 | Returns: 71 | AsyncIterable[str]: Streamed ChatGPT response 72 | """ 73 | async for text in self.add_message(None): 74 | yield text 75 | 76 | async def add_message(self, message: Optional[ClaudeMessage]) -> AsyncIterable[str]: 77 | """Add a message to the chat and generate a streamed response 78 | 79 | Args: 80 | message (ChatGPTMessage): The message to add 81 | 82 | Returns: 83 | AsyncIterable[str]: Streamed ChatGPT response 84 | """ 85 | 86 | if message is not None: 87 | self._messages.append(message) 88 | if len(self._messages) > self._message_capacity: 89 | self._messages.pop(0) 90 | 91 | async for text in self._generate_text_streamed(self._model): 92 | yield text 93 | 94 | async def _generate_text_streamed(self, model: str) -> AsyncIterable[str]: 95 | prompt_message = ClaudeMessage( 96 | role=ClaudeMessageRole.system, content=self._prompt 97 | ) 98 | try: 99 | chat_messages = [m.to_api() for m in self._messages] 100 | chat_stream = await self._client.messages.create( 101 | model=model, 102 | stream=True, 103 | max_tokens=1024, 104 | messages=[prompt_message.to_api()] + chat_messages, 105 | ) 106 | except TimeoutError: 107 | yield "Sorry, I'm taking too long to respond. Please try again later." 
108 | return 109 | 110 | self._producing_response = True 111 | complete_response = "" 112 | 113 | async def anext_util(aiter): 114 | async for item in aiter: 115 | return item 116 | 117 | return None 118 | 119 | while True: 120 | try: 121 | chunk = await asyncio.wait_for(anext_util(chat_stream), 5) 122 | except TimeoutError: 123 | break 124 | except asyncio.CancelledError: 125 | self._producing_response = False 126 | self._needs_interrupt = False 127 | break 128 | 129 | if chunk is None: 130 | break 131 | content = chunk.choices[0].delta.content 132 | 133 | if self._needs_interrupt: 134 | self._needs_interrupt = False 135 | logging.info("ChatGPT interrupted") 136 | break 137 | 138 | if content is not None: 139 | complete_response += content 140 | yield content 141 | 142 | self._messages.append( 143 | ClaudeMessage(role=ClaudeMessageRole.assistant, content=complete_response) 144 | ) 145 | self._producing_response = False 146 | -------------------------------------------------------------------------------- /claude-3-agent/deepgram.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | import json 4 | import logging 5 | import os 6 | from urllib.parse import urlencode 7 | from dataclasses import dataclass 8 | from typing import Optional 9 | 10 | import aiohttp 11 | from livekit import rtc 12 | 13 | from enum import Enum 14 | 15 | 16 | # taken from deepgram-sdk 17 | class LiveTranscriptionEvents(str, Enum): 18 | Open: str = "Open" 19 | Close: str = "Close" 20 | Transcript: str = "Results" 21 | Metadata: str = "Metadata" 22 | UtteranceEnd: str = "UtteranceEnd" 23 | SpeechStarted: str = "SpeechStarted" 24 | Error: str = "Error" 25 | Warning: str = "Warning" 26 | 27 | 28 | STREAM_KEEPALIVE_MSG: str = json.dumps({"type": "KeepAlive"}) 29 | STREAM_CLOSE_MSG: str = json.dumps({"type": "CloseStream"}) 30 | 31 | 32 | class STTStream: 33 | @dataclass 34 | class StartedEvent: 35 | type: str = "started" 36 | 37 | 
@dataclass 38 | class InterimEvent: 39 | text: str 40 | type: str = "interim" 41 | 42 | @dataclass 43 | class FinishedEvent: 44 | text: str 45 | type: str = "finished" 46 | 47 | def __init__( 48 | self, 49 | ) -> None: 50 | super().__init__() 51 | self._api_key = os.environ["DEEPGRAM_API_KEY"] 52 | 53 | self._queue = asyncio.Queue() 54 | self._event_queue = asyncio.Queue[ 55 | STTStream.StartedEvent | STTStream.InterimEvent | STTStream.FinishedEvent 56 | ]() 57 | self._closed = False 58 | self._main_task = asyncio.create_task(self._run(max_retry=32)) 59 | 60 | def log_exception(task: asyncio.Task) -> None: 61 | if not task.cancelled() and task.exception(): 62 | logging.error(f"deepgram task failed: {task.exception()}") 63 | 64 | self._main_task.add_done_callback(log_exception) 65 | 66 | def push_frame(self, frame: rtc.AudioFrame) -> None: 67 | if self._closed: 68 | raise ValueError("cannot push frame to closed stream") 69 | 70 | self._queue.put_nowait(frame.remix_and_resample(16000, 1)) 71 | 72 | async def aclose(self) -> None: 73 | await self._queue.put(STREAM_CLOSE_MSG) 74 | await self._main_task 75 | 76 | async def _run(self, max_retry: int) -> None: 77 | """Try to connect to Deepgram with exponential backoff and forward frames""" 78 | async with aiohttp.ClientSession() as session: 79 | retry_count = 0 80 | ws: Optional[aiohttp.ClientWebSocketResponse] = None 81 | listen_task: Optional[asyncio.Task] = None 82 | keepalive_task: Optional[asyncio.Task] = None 83 | while True: 84 | try: 85 | ws = await self._try_connect(session) 86 | listen_task = asyncio.create_task(self._listen_loop(ws)) 87 | keepalive_task = asyncio.create_task(self._keepalive_loop(ws)) 88 | # break out of the retry loop if we are done 89 | if await self._send_loop(ws): 90 | keepalive_task.cancel() 91 | await asyncio.wait_for(listen_task, timeout=5) 92 | break 93 | except Exception as e: 94 | if retry_count > max_retry and max_retry > 0: 95 | logging.error(f"failed to connect to Deepgram: {e}") 
96 | break 97 | 98 | retry_delay = min(retry_count * 5, 5) # max 5s 99 | retry_count += 1 100 | logging.warning( 101 | f"failed to connect to Deepgram: {e} - retrying in {retry_delay}s" 102 | ) 103 | await asyncio.sleep(retry_delay) 104 | 105 | self._closed = True 106 | 107 | async def _send_loop(self, ws: aiohttp.ClientWebSocketResponse) -> bool: 108 | while not ws.closed: 109 | data = await self._queue.get() 110 | # fire and forget, we don't care if we miss frames in the error case 111 | self._queue.task_done() 112 | 113 | if ws.closed: 114 | raise Exception("websocket closed") 115 | 116 | if isinstance(data, rtc.AudioFrame): 117 | await ws.send_bytes(data.data.tobytes()) 118 | else: 119 | if data == STREAM_CLOSE_MSG: 120 | await ws.send_str(data) 121 | return True 122 | return False 123 | 124 | async def _keepalive_loop(self, ws: aiohttp.ClientWebSocketResponse) -> None: 125 | while not ws.closed: 126 | await ws.send_str(STREAM_KEEPALIVE_MSG) 127 | await asyncio.sleep(5) 128 | 129 | async def _listen_loop(self, ws: aiohttp.ClientWebSocketResponse) -> None: 130 | speaking = False 131 | last_transcript = "" 132 | 133 | while not ws.closed: 134 | msg = await ws.receive() 135 | if msg.type in ( 136 | aiohttp.WSMsgType.CLOSED, 137 | aiohttp.WSMsgType.CLOSE, 138 | aiohttp.WSMsgType.CLOSING, 139 | ): 140 | break 141 | 142 | try: 143 | if msg.type == aiohttp.WSMsgType.TEXT: 144 | data = json.loads(msg.data) 145 | type = data.get("type") 146 | if not type: 147 | continue 148 | 149 | if not speaking: 150 | if type == LiveTranscriptionEvents.SpeechStarted: 151 | speaking = True 152 | event = self.StartedEvent() 153 | await self._event_queue.put(event) 154 | else: 155 | if type == LiveTranscriptionEvents.UtteranceEnd: 156 | if last_transcript != "": 157 | speaking = False 158 | event = self.FinishedEvent(text=last_transcript) 159 | last_transcript = "" 160 | await self._event_queue.put(event) 161 | elif type == LiveTranscriptionEvents.Transcript: 162 | is_final_transcript = 
data["is_final"] 163 | is_endpoint = data["speech_final"] 164 | 165 | if is_final_transcript: 166 | transcript = data["channel"]["alternatives"][0][ 167 | "transcript" 168 | ] 169 | if transcript != "": 170 | last_transcript += transcript 171 | if not is_endpoint: 172 | last_transcript += " " 173 | 174 | if is_endpoint and last_transcript != "": 175 | speaking = False 176 | event = self.FinishedEvent(text=last_transcript) 177 | last_transcript = "" 178 | await self._event_queue.put(event) 179 | 180 | except Exception as e: 181 | logging.error("Error handling message %s: %s", msg, e) 182 | continue 183 | 184 | async def _try_connect( 185 | self, session: aiohttp.ClientSession 186 | ) -> aiohttp.ClientWebSocketResponse: 187 | live_config = { 188 | "model": "nova-2", 189 | "language": "en-US", 190 | "filler_words": True, 191 | "punctuate": True, 192 | "smart_format": True, 193 | "interim_results": True, 194 | "encoding": "linear16", 195 | "sample_rate": 16000, 196 | "channels": 1, 197 | "endpointing": 200, 198 | "vad_events": True, 199 | "utterance_end_ms": 1000, 200 | } 201 | 202 | query_params = urlencode(live_config).lower() 203 | 204 | url = f"wss://api.deepgram.com/v1/listen?{query_params}" 205 | ws = await session.ws_connect( 206 | url, headers={"Authorization": f"Token {self._api_key}"} 207 | ) 208 | 209 | return ws 210 | 211 | def __aiter__(self) -> "STTStream": 212 | return self 213 | 214 | async def __anext__( 215 | self, 216 | ): 217 | if self._closed and self._event_queue.empty(): 218 | raise StopAsyncIteration 219 | 220 | return await self._event_queue.get() 221 | -------------------------------------------------------------------------------- /claude-3-agent/agent.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 LiveKit, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import os 16 | import random 17 | import time 18 | import threading 19 | import asyncio 20 | from datetime import datetime 21 | from enum import Enum 22 | import json 23 | import logging 24 | from typing import AsyncIterable 25 | 26 | from livekit import rtc, agents 27 | from livekit.agents.tts import SynthesisEvent, SynthesisEventType 28 | 29 | from claude import ClaudeMessage, ClaudeMessageRole, ClaudePlugin, ClaudeModels 30 | from deepgram import STTStream 31 | from livekit.plugins.elevenlabs import TTS 32 | 33 | from dotenv import load_dotenv 34 | 35 | load_dotenv() 36 | 37 | 38 | PROMPT = "You are KITT, a friendly voice assistant powered by LiveKit. \ 39 | Conversation should be personable, and be sure to ask follow up questions. \ 40 | If your response is a question, please append a question mark symbol to the end of it.\ 41 | Don't respond with more than a few sentences." 42 | INTRO = "Hello, I am KITT, a friendly voice assistant powered by LiveKit Agents. \ 43 | You can find my source code in the top right of this screen if you're curious how I work. \ 44 | Feel free to ask me anything — I'm here to help! Just start talking or type in the chat." 45 | SIP_INTRO = "Hello, I am KITT, a friendly voice assistant powered by LiveKit Agents. \ 46 | Feel free to ask me anything — I'm here to help! Just start talking." 
# convert intro response to a stream
async def intro_text_stream(sip: bool):
    """Yield the one-shot intro text so it can flow through the same
    streaming pipeline (TTS + chat) as a Claude response.

    SIP callers get a shorter intro with no references to the on-screen UI.
    """
    if sip:
        yield SIP_INTRO
        return

    yield INTRO


AgentState = Enum("AgentState", "IDLE, LISTENING, THINKING, SPEAKING")

ELEVEN_TTS_SAMPLE_RATE = 24000
ELEVEN_TTS_CHANNELS = 1


class WorkerLifecycle:
    """Rotates the worker process: after a random uptime it stops accepting
    new jobs, then hard-kills the process a few minutes later so in-flight
    jobs have a chance to wind down."""

    def __init__(self):
        self._accepting_jobs = True
        # Stop accepting jobs after a random time between 60 and 120 minutes.
        # (Fixed comment: the old one claimed 20-30 minutes, but the randrange
        # arguments are seconds: 60*60 .. 60*120.)
        # daemon=True so this long-sleeping thread never pins the interpreter
        # alive if the main thread exits first; its only job is os._exit below,
        # which is unaffected while the process is running.
        self._stop_thread = threading.Thread(
            target=self._stop_accepting_jobs_after,
            args=(random.randrange(60 * 60, 60 * 120),),
            daemon=True,
        )
        self._stop_thread.start()

    def _stop_accepting_jobs_after(self, after: int):
        """Sleep `after` seconds, stop accepting jobs, then schedule the kill."""
        time.sleep(after)
        self._accepting_jobs = False
        self._kill_after(
            random.randrange(2 * 60, 4 * 60)
        )  # kill 2-4 minutes after stopping accepting jobs

    def _kill_after(self, after: int):
        time.sleep(after)
        self._kill()

    def should_accept_job(self):
        """True while this worker should still take on new jobs."""
        return self._accepting_jobs

    def _kill(self):
        # Hard-exit on purpose (skips atexit/cleanup) so a wedged worker
        # cannot linger past its scheduled rotation.
        os._exit(0)


class KITT:
    """Voice assistant agent: Deepgram STT -> Claude -> ElevenLabs TTS,
    wired into a LiveKit room. Also mirrors responses into text chat and
    publishes transcriptions as data messages."""

    @classmethod
    async def create(cls, ctx: agents.JobContext):
        """Factory handed to JobRequest.accept(); builds the agent and starts it."""
        kitt = KITT(ctx)
        await kitt.start()

    def __init__(self, ctx: agents.JobContext):
        # plugins
        self.claude_plugin = ClaudePlugin(
            prompt=PROMPT, message_capacity=20, model=ClaudeModels.Claude3Opus.value
        )
        self.tts_plugin = TTS(
            model_id="eleven_turbo_v2", sample_rate=ELEVEN_TTS_SAMPLE_RATE
        )

        self.ctx: agents.JobContext = ctx
        self.chat = rtc.ChatManager(ctx.room)
        self.audio_out = rtc.AudioSource(ELEVEN_TTS_SAMPLE_RATE, ELEVEN_TTS_CHANNELS)

        self._sending_audio = False
        self._processing = False
        self._agent_state: AgentState = AgentState.IDLE

        self.chat.on("message_received", self.on_chat_received)
        self.ctx.room.on("track_subscribed", self.on_track_subscribed)

    async def start(self):
        """Publish the agent's audio track and speak the intro."""
        # if you have to perform teardown cleanup, you can listen to the disconnected event
        # self.ctx.room.on("disconnected", your_cleanup_function)

        # publish audio track
        track = rtc.LocalAudioTrack.create_audio_track("agent-mic", self.audio_out)
        await self.ctx.room.local_participant.publish_track(track)

        # allow the participant to fully subscribe to the agent's audio track, so it doesn't miss
        # anything in the beginning
        await asyncio.sleep(1)

        sip = self.ctx.room.name.startswith("sip")
        await self.process_claude_result(intro_text_stream(sip))
        self.update_state()

    def on_chat_received(self, message: rtc.ChatMessage):
        """Forward a typed chat message to Claude and stream the reply back."""
        # TODO: handle deleted and updated messages in message context
        if message.deleted:
            return

        msg = ClaudeMessage(role=ClaudeMessageRole.user, content=message.message)
        claude_result = self.claude_plugin.add_message(msg)
        # BUG FIX: previously called self.process_chatgpt_result, a method that
        # does not exist on this class (leftover from a ChatGPT-based example),
        # so every chat message raised AttributeError inside the task.
        self.ctx.create_task(self.process_claude_result(claude_result))

    def on_track_subscribed(
        self,
        track: rtc.Track,
        publication: rtc.TrackPublication,
        participant: rtc.RemoteParticipant,
    ):
        # Each subscribed (audio) track gets its own STT pipeline task.
        self.ctx.create_task(self.process_track(track))

    async def process_track(self, track: rtc.Track):
        """Pump audio frames from a remote track into a Deepgram STT stream."""
        audio_stream = rtc.AudioStream(track)
        stream = STTStream()
        self.ctx.create_task(self.process_stt_stream(stream))
        async for audio_frame_event in audio_stream:
            # Drop frames while we are speaking/thinking so the agent does
            # not transcribe itself or queue stale audio.
            if self._agent_state != AgentState.LISTENING:
                continue
            stream.push_frame(audio_frame_event.frame)
        await stream.flush()

    async def process_stt_stream(self, stream: STTStream):
        """Consume STT events; on each finished utterance, publish the
        transcription to the room and hand the text to Claude."""
        buffered_text = ""
        async for event in stream:
            if event.type == "started":
                pass
            elif event.type == "interim":
                pass
            elif event.type == "finished":
                if event.text == "":
                    continue

                # BUG FIX: the old `" ".join([buffered_text, event.text])`
                # always ran with buffered_text == "" and so prefixed every
                # utterance with a stray leading space; strip() removes it
                # while still joining multi-part buffers with single spaces.
                buffered_text = f"{buffered_text} {event.text}".strip()
                await self.ctx.room.local_participant.publish_data(
                    json.dumps(
                        {
                            "text": buffered_text,
                            "timestamp": int(datetime.now().timestamp() * 1000),
                        }
                    ),
                    topic="transcription",
                )

                msg = ClaudeMessage(role=ClaudeMessageRole.user, content=buffered_text)
                claude_stream = self.claude_plugin.add_message(msg)
                self.ctx.create_task(self.process_claude_result(claude_stream))
                buffered_text = ""

    async def process_claude_result(self, text_stream):
        """Stream Claude's text into TTS as it arrives, then send the full
        response as a single chat message."""
        self.update_state(processing=True)

        stream = self.tts_plugin.stream()
        # send audio to TTS in parallel
        self.ctx.create_task(self.send_audio_stream(stream))
        all_text = ""
        async for text in text_stream:
            stream.push_text(text)
            all_text += text

        self.update_state(processing=False)
        # buffer up the entire response from Claude before sending a chat message
        # (fixed comment: previously said "Groq" — wrong model)
        await self.chat.send_message(all_text)
        await stream.flush()

    async def send_audio_stream(self, tts_stream: AsyncIterable[SynthesisEvent]):
        """Relay synthesized audio frames into the published audio track,
        flipping the SPEAKING state on start/finish events."""
        async for e in tts_stream:
            if e.type == SynthesisEventType.STARTED:
                self.update_state(sending_audio=True)
            elif e.type == SynthesisEventType.FINISHED:
                self.update_state(sending_audio=False)
            elif e.type == SynthesisEventType.AUDIO:
                await self.audio_out.capture_frame(e.audio.data)
        await tts_stream.aclose()

    def update_state(self, sending_audio: bool = None, processing: bool = None):
        """Recompute the agent state (SPEAKING > THINKING > LISTENING) and
        publish it via participant metadata. Pass-through args left as None
        keep their current values."""
        if sending_audio is not None:
            self._sending_audio = sending_audio
        if processing is not None:
            self._processing = processing

        state = AgentState.LISTENING
        if self._sending_audio:
            state = AgentState.SPEAKING
        elif self._processing:
            state = AgentState.THINKING

        self._agent_state = state
        metadata = json.dumps(
            {
                "agent_state": state.name.lower(),
            }
        )
        self.ctx.create_task(self.ctx.room.local_participant.update_metadata(metadata))


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    worker_lifecycle = WorkerLifecycle()

    async def job_request_cb(job_request: agents.JobRequest):
        # BUG FIX: the acceptance log used to fire before the lifecycle check,
        # so rejected jobs were logged as accepted. Check first, log accurately.
        if not worker_lifecycle.should_accept_job():
            logging.info("Rejecting job: worker is draining before rotation")
            await job_request.reject()
            return

        logging.info("Accepting job for KITT")
        await job_request.accept(
            KITT.create,
            identity="claude_agent",
            name="Claude",
            auto_subscribe=agents.AutoSubscribe.AUDIO_ONLY,
            auto_disconnect=agents.AutoDisconnect.DEFAULT,
        )

    worker = agents.Worker(request_handler=job_request_cb)
    agents.run_app(worker)