├── main_realtime_api.py
├── modules
│   ├── __init__.py
│   ├── typings.py
│   ├── simple_llm.py
│   └── constants.py
├── img
│   ├── own-your-ai.png
│   └── reliable-ai-agents.png
├── .env.sample
├── requirements.txt
├── .gitignore
├── README.md
├── structured_outputs_example.py
├── main.py
└── assistants
    └── assistants.py
/main_realtime_api.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/modules/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/img/own-your-ai.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/disler/personal-ai-starter-pack/HEAD/img/own-your-ai.png
--------------------------------------------------------------------------------
/img/reliable-ai-agents.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/disler/personal-ai-starter-pack/HEAD/img/reliable-ai-agents.png
--------------------------------------------------------------------------------
/.env.sample:
--------------------------------------------------------------------------------
1 | GOOGLE_API_KEY=
2 | ASSEMBLYAI_API_KEY=
3 | ELEVEN_API_KEY=
4 | OPENAI_API_KEY=
5 |
6 | GEMINI_API_KEY=
7 | ANTHROPIC_API_KEY=
8 | AIDER_AUTO_COMMITS=false
9 |
10 | GROQ_API_KEY=
11 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | python-dotenv
2 | openai
3 | anthropic
4 | groq
5 | pytest
6 | pydantic
7 | assemblyai
8 | assemblyai[extras]
9 | sounddevice
10 | numpy
11 | elevenlabs
12 | llm
13 | llm-claude
14 | llm-claude-3
15 | llm-ollama
16 | llm-gemini
17 | Pillow
18 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .aider.ident.cache.v1
2 | .aider.tags.cache.v1
3 | .aider.chat.history.md
4 | .aider.input.history
5 |
6 | log.txt
7 |
8 | agent_results/
9 |
10 | .vercel
11 | *.log
12 | *.pyc
13 | __pycache__
14 |
15 | # Environments
16 | .env
17 | .venv
18 | env/
19 | venv/
20 | ENV/
21 | env.bak/
22 | venv.bak/
23 | .env.yml
24 | .env.yaml
25 | .env.billing.yaml
26 | .env.billing.yml
27 |
28 | .aider*
29 |
30 | *.wav
31 | **.pyc
32 | modules/__pycache__
33 |
34 | *.mp3
35 | *.mp4
36 | *.wav
37 | *.aac
38 | *.ogg
39 | *.flac
40 | *.m4a
41 | *.mp4
42 |
43 | *.json
44 |
45 | data/
--------------------------------------------------------------------------------
/modules/typings.py:
--------------------------------------------------------------------------------
 1 | import warnings
 2 | from enum import Enum
 3 | from typing import List, Optional
 4 | 
 5 | from pydantic import BaseModel
 6 | 
 7 | # Suppress specific warnings
 8 | warnings.filterwarnings("ignore", message="Valid config keys have changed in V2:")
 9 | warnings.filterwarnings(
10 |     "ignore", message='Field "model_id" has conflict with protected namespace "model_".'
11 | )
12 | 
13 | 
14 | class Interaction(BaseModel):
15 |     role: str
16 |     content: str
17 | 
18 | 
19 | class ImageRatio(str, Enum):
20 |     SQUARE = "1024x1024"
21 |     PORTRAIT = "1024x1792"
22 |     LANDSCAPE = "1792x1024"
23 | 
24 | 
25 | class Style(str, Enum):
26 |     VIVID = "vivid"
27 |     NATURAL = "natural"
28 | 
29 | 
30 | class Quality(str, Enum):
31 |     STANDARD = "standard"
32 |     HD = "hd"
33 | 
34 | 
35 | class GenerateImageParams(BaseModel):
36 |     prompts: List[str]
37 |     quality: Quality
38 |     image_ratio: Optional[ImageRatio]
39 |     style: Optional[Style]
40 | 
41 | 
42 | class ImageFormat(str, Enum):
43 |     JPEG = "jpeg"
44 |     PNG = "png"
45 |     GIF = "gif"
46 |     BMP = "bmp"
47 |     TIFF = "tiff"
48 | 
49 | 
50 | class ConvertImageParams(BaseModel):
51 |     version_numbers: List[int]
52 |     image_format: ImageFormat
53 | 
54 | 
55 | class ResizeImageParams(BaseModel):
56 |     version_numbers: List[int]
57 |     width: int
58 |     height: int
59 | 
60 | 
61 | class OpenImageDirParams(BaseModel):
62 |     pass
63 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Fast Personal AI Assistant & Structured Output
2 | >
 3 | > A quick-start personal AI assistant framework using OpenAI, Groq, AssemblyAI, and ElevenLabs.
4 | >
 5 | > Plus a breakdown of building reliable AI agents with OpenAI's new structured outputs.
6 |
 7 | ![Own Your AI](img/own-your-ai.png)
 8 | ![Reliable AI Agents](img/reliable-ai-agents.png)
9 |
10 |
11 | ## Setup
12 |
13 | - Create and activate virtual environment:
14 | ```bash
15 | python -m venv venv
16 | source venv/bin/activate # On Windows, use `venv\Scripts\activate`
17 | ```
18 |
19 | - Install dependencies:
20 | ```bash
21 | pip install -r requirements.txt
22 | ```
23 |
24 | - Set up environment variables:
25 | ```bash
26 | cp .env.sample .env
27 | # Edit .env file and add your API keys
28 | ```
29 | > I recommend starting with the OpenAI assistant, since you only need to set up the OpenAI API key.
30 |
31 | - Run the main script:
32 | ```bash
33 | python main.py
34 | ```
35 |
36 | - Run the structured output script:
37 | ```bash
38 | python structured_outputs_example.py
39 | ```
40 |
41 | - Press `Enter` to start recording, and `Enter` again to stop recording.
42 |
43 | - Adjust the maximum recording duration via `DURATION` in `modules/constants.py`.
44 |
45 | - Update the configuration variables in `modules/constants.py` (see the example at the end of this README):
46 |   - Tweak the naming.
47 |   - Update the prompt to your liking.
48 |   - Update the assistant type to the one you want to use.
49 |
50 | ## Watch the walkthrough videos
51 | - [Coding RELIABLE AI Agents: Legit Structured Outputs Use Cases (Strawberry Agent?)](https://youtu.be/PoO7Zjsvx0k)
52 | - [CONTROL your Personal AI Assistant with GPT-4o mini & ElevenLabs](https://youtu.be/ikaKpfUOb0U)
53 |
54 | ## Resources
55 | - https://openai.com/index/introducing-structured-outputs-in-the-api/
56 | - https://www.assemblyai.com/
57 | - https://console.groq.com/docs/speech-text
58 | - https://console.groq.com/docs/libraries
59 | - https://platform.openai.com/docs/guides/speech-to-text
60 | - https://platform.openai.com/docs/guides/text-to-speech
61 | - https://platform.openai.com/docs/api-reference/audio#audio/createTranscription-prompt
62 | - https://openai.com/api/pricing/
63 |
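64 | ## Configuration example
65 | 
66 | A minimal sketch of the values you are most likely to change in `modules/constants.py` (these names are copied from the file; adjust to taste):
67 | 
68 | ```python
69 | # modules/constants.py
70 | PERSONAL_AI_ASSISTANT_NAME = "Ada"  # what your assistant calls itself
71 | HUMAN_COMPANION_NAME = "Dan"        # what it calls you
72 | DURATION = 30                       # maximum recording length in seconds
73 | 
74 | # The assistant implementation main.py will instantiate
75 | ASSISTANT_TYPE = "OpenAISuperPAF"  # or "OpenAIPAF", "GroqElevenPAF", "AssElevenPAF"
76 | ```
77 | 
78 | ## Adding your own assistant
79 | 
80 | Each assistant in `assistants/assistants.py` subclasses `PersonalAssistantFramework` and implements `setup`, `transcribe`, `speak`, and `think`. A rough sketch of a new assistant (the `EchoPAF` name and its stub behavior are made up for illustration):
81 | 
82 | ```python
83 | from assistants.assistants import PersonalAssistantFramework
84 | 
85 | 
86 | class EchoPAF(PersonalAssistantFramework):
87 |     def setup(self):
88 |         pass  # create API clients / load models here
89 | 
90 |     def transcribe(self, file_path):
91 |         return "stub transcription"  # swap in a real speech-to-text call
92 | 
93 |     def speak(self, text: str):
94 |         print(text)  # swap in a real text-to-speech call
95 | 
96 |     def think(self, prompt: str) -> str:
97 |         return prompt  # swap in a real LLM call
98 | ```
99 | 
100 | To use it, add an `EchoPAF` branch to the `ASSISTANT_TYPE` checks in `main.py`.
101 | 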
--------------------------------------------------------------------------------
/modules/simple_llm.py:
--------------------------------------------------------------------------------
1 | import llm
2 | from dotenv import load_dotenv
3 | import os
4 |
5 | load_dotenv()
6 |
7 |
 8 | def prompt(model: llm.Model, prompt_text: str):
 9 |     res = model.prompt(prompt_text)
10 |     return res.text()
11 |
12 |
13 | def get_model_name(model: llm.Model):
14 | return model.model_id
15 |
16 |
17 | def build_models():
18 | ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY")
19 |
20 | sonnet_3_5_model: llm.Model = llm.get_model("claude-3.5-sonnet")
21 | sonnet_3_5_model.key = ANTHROPIC_API_KEY
22 |
23 | return sonnet_3_5_model
24 |
25 |
26 | def build_big_3_models():
27 | ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY")
28 | OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
29 | GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
30 |
31 | sonnet_3_5_model: llm.Model = llm.get_model("claude-3.5-sonnet")
32 | sonnet_3_5_model.key = ANTHROPIC_API_KEY
33 |
34 | gpt4_o_model: llm.Model = llm.get_model("4o")
35 | gpt4_o_model.key = OPENAI_API_KEY
36 |
37 | gemini_1_5_pro_model: llm.Model = llm.get_model("gemini-1.5-pro-latest")
38 | gemini_1_5_pro_model.key = GEMINI_API_KEY
39 |
40 | return sonnet_3_5_model, gpt4_o_model, gemini_1_5_pro_model
41 |
42 |
43 | def build_big_3_plus_mini_models():
44 |
45 | ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY")
46 | OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
47 | GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
48 |
49 | sonnet_3_5_model: llm.Model = llm.get_model("claude-3.5-sonnet")
50 | sonnet_3_5_model.key = ANTHROPIC_API_KEY
51 |
52 | gpt4_o_model: llm.Model = llm.get_model("4o")
53 | gpt4_o_model.key = OPENAI_API_KEY
54 |
55 | gemini_1_5_pro_model: llm.Model = llm.get_model("gemini-1.5-pro-latest")
56 | gemini_1_5_pro_model.key = GEMINI_API_KEY
57 |
58 | gpt4_o_mini_model: llm.Model = llm.get_model("gpt-4o-mini")
59 | gpt4_o_mini_model.key = OPENAI_API_KEY
60 |
61 | return sonnet_3_5_model, gpt4_o_model, gemini_1_5_pro_model, gpt4_o_mini_model
62 |
63 |
64 | def build_mini_model():
65 | OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
66 |
67 | gpt4_o_mini_model: llm.Model = llm.get_model("gpt-4o-mini")
68 | gpt4_o_mini_model.key = OPENAI_API_KEY
69 |
70 | return gpt4_o_mini_model
71 |
72 |
73 | def build_new_gpt4o():
74 | OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
75 |
76 | gpt4_o_model: llm.Model = llm.get_model("gpt-4o-2024-08-06")
77 | gpt4_o_model.key = OPENAI_API_KEY
78 |
79 | return gpt4_o_model
80 |
--------------------------------------------------------------------------------
/structured_outputs_example.py:
--------------------------------------------------------------------------------
1 | # https://openai.com/index/introducing-structured-outputs-in-the-api/
2 |
3 | from enum import Enum
4 | from typing import Union
5 |
6 | from pydantic import BaseModel
7 |
8 | import openai
9 | from openai import OpenAI
10 |
11 | NEW_GPT_4o_AUG = "gpt-4o-2024-08-06"
12 |
13 |
14 | def structured_output_tool_call():
15 |
16 | class Table(str, Enum):
17 | orders = "orders"
18 | customers = "customers"
19 | products = "products"
20 |
21 | class Column(str, Enum):
22 | id = "id"
23 | status = "status"
24 | expected_delivery_date = "expected_delivery_date"
25 | delivered_at = "delivered_at"
26 | shipped_at = "shipped_at"
27 | ordered_at = "ordered_at"
28 | canceled_at = "canceled_at"
29 |
30 | class Operator(str, Enum):
31 | eq = "="
32 | gt = ">"
33 | lt = "<"
34 | le = "<="
35 | ge = ">="
36 | ne = "!="
37 |
38 | class OrderBy(str, Enum):
39 | asc = "asc"
40 | desc = "desc"
41 |
42 | class DynamicValue(BaseModel):
43 | column_name: str
44 |
45 | class Condition(BaseModel):
46 | column: str
47 | operator: Operator
48 | value: Union[str, int, DynamicValue]
49 |
50 | class Query(BaseModel):
51 | table_name: Table
52 | columns: list[Column]
53 | conditions: list[Condition]
54 | order_by: OrderBy
55 |
56 | client = OpenAI()
57 |
58 | completion = client.beta.chat.completions.parse(
59 | model=NEW_GPT_4o_AUG,
60 | messages=[
61 | {
62 | "role": "system",
63 | "content": "You are a helpful assistant. The current date is August 6, 2024. You help users query for the data they are looking for by calling the query function.",
64 | },
65 | {
66 | "role": "user",
67 | "content": "Find all the orders that were cancelled in the first quarter of 2022",
68 | },
69 | ],
70 | tools=[
71 | openai.pydantic_function_tool(Query),
72 | ],
73 | )
74 |
75 | def mock_query_function(query: Query):
76 | print(f"Table Name: {query.table_name}")
77 | print("Columns:")
78 | for column in query.columns:
79 | print(f" - {column}")
80 | print("Conditions:")
81 | for condition in query.conditions:
82 | print(
83 | f" - Column: {condition.column}, Operator: {condition.operator}, Value: {condition.value}"
84 | )
85 | print(f"Order By: {query.order_by}")
86 |
87 | print(
88 | "completion.choices and completion.choices[0].message",
89 | completion.choices and completion.choices[0].message,
90 | )
91 |
92 | # Parse the completion result and pass it to the mock function if available
93 | if completion.choices and completion.choices[0].message.tool_calls:
94 |         if completion.choices[0].message.tool_calls[0].function.name == "Query":
95 | query_result = (
96 | completion.choices[0].message.tool_calls[0].function.parsed_arguments
97 | )
98 | mock_query_function(query_result)
99 | else:
100 | print(f"{completion.choices and completion.choices[0].message.content}")
101 | else:
102 | print(f"{completion.choices and completion.choices[0].message.content}")
103 |
104 |
105 | def structured_output_minimal():
106 |
107 | class Step(BaseModel):
108 | explanation: str
109 | output: str
110 |
111 | class MathResponse(BaseModel):
112 | steps: list[Step]
113 | final_answer: str
114 |
115 | client = OpenAI()
116 |
117 | completion = client.beta.chat.completions.parse(
118 | model=NEW_GPT_4o_AUG,
119 | messages=[
120 | {"role": "system", "content": "You are a helpful math tutor."},
121 | {"role": "user", "content": "solve 8x + 31 = 2"},
122 | ],
123 | response_format=MathResponse,
124 | )
125 |
126 | message = completion.choices[0].message
127 | if message.parsed:
128 | print(message.parsed.steps)
129 | print(message.parsed.final_answer)
130 | else:
131 | print(message.refusal)
132 |
133 |
134 | if __name__ == "__main__":
135 |     structured_output_minimal()
136 |     structured_output_tool_call()
137 | 
--------------------------------------------------------------------------------
/modules/constants.py:
--------------------------------------------------------------------------------
1 | # CONSTANTS update these to fit your personal flow
2 |
3 | PERSONAL_AI_ASSISTANT_NAME = "Ada"
4 | HUMAN_COMPANION_NAME = "Dan"
5 |
6 | CONVO_TRAIL_CUTOFF = 30
7 |
8 | FS = 44100 # Sample rate
9 | CHANNELS = 1 # Mono audio
10 | DURATION = 30 # Duration of the recording in seconds
11 |
12 | ELEVEN_LABS_PRIMARY_SOLID_VOICE = "WejK3H1m7MI9CHnIjW9K"
13 | ELEVEN_LABS_CRINGE_VOICE = "uyfkySFC5J00qZ6iLAdh"
14 |
15 | OPENAI_IMG_AGENT_DIR = "data/images/openai"
16 |
17 |
18 | # --------------------------- ASSISTANT TYPES ---------------------------
19 |
20 | ASSISTANT_TYPE = "OpenAISuperPAF"
21 |
22 | # ASSISTANT_TYPE = "OpenAIPAF"
23 |
24 | # ASSISTANT_TYPE = "GroqElevenPAF"
25 |
26 | # ASSISTANT_TYPE = "AssElevenPAF"
27 |
28 |
29 | # ---------------------------- PROMPT
30 |
31 | PERSONAL_AI_ASSISTANT_PROMPT_HEAD = f"""You are a friendly, ultra helpful, attentive, concise AI assistant named '{PERSONAL_AI_ASSISTANT_NAME}'.
32 |
33 |
34 | You work with your human companion '{HUMAN_COMPANION_NAME}' to build, collaborate, and connect.
35 | We both like short, concise, conversational interactions.
36 | You're responding to '{HUMAN_COMPANION_NAME}'s latest-input.
37 | Respond in a short, conversational manner. Exclude meta-data, markdown, dashes, asterisks, etc.
38 | When building your response, consider our previous-interactions as well, but focus primarily on the latest-input.
39 | When you're asked for more details, add more details and be more verbose.
40 | Be friendly, helpful, and interested. Ask questions where appropriate.
41 |
42 |
43 |
44 | [[previous_interactions]]
45 |
46 |
47 |
48 | [[latest_input]]
49 |
50 |
51 | Your Conversational Response:"""
52 |
53 |
54 | OPENAI_SUPER_ASSISTANT_PROMPT_HEAD = f"""You are a friendly, ultra helpful, attentive, concise AI assistant named '{PERSONAL_AI_ASSISTANT_NAME}'.
55 |
56 |
57 | You work with your human companion '{HUMAN_COMPANION_NAME}' to build, collaborate, and connect.
58 | We both like short, concise, conversational interactions.
59 | You're responding to '{HUMAN_COMPANION_NAME}'s latest-input.
60 | Respond in a short, conversational manner. Exclude meta-data, markdown, dashes, asterisks, etc.
61 | When building your response, consider our previous-interactions as well, but focus primarily on the latest-input.
62 | When you're asked for more details, add more details and be more verbose.
63 | Be friendly, helpful, and interested. Ask questions where appropriate.
64 | You can use various tools to run functionality for your human companion.
65 |
66 |
67 |
68 |
69 | generate_image
70 | If the human companion requests an image, use this tool.
71 |
72 |
73 | Unless otherwise specified, default quality to 'hd'.
74 |
75 |
76 | If a user asks for a certain number of images, append additional prompts parameter with that number of prompts.
77 |
78 |
79 | Be sure to create as many images as the user requested by adding them to the prompts parameter.
80 |
81 |
82 |
83 |
84 | convert_image
85 | If the human companion requests an image format conversion, use this tool.
86 |
87 |
88 | Ensure the image_format parameter is set to the desired format (e.g., 'jpg', 'png').
89 |
90 |
91 | Use the version_numbers parameter to specify which image versions to convert.
92 |
93 |
94 |
95 |
96 | resize_image
97 | If the human companion requests an image resize, use this tool.
98 |
99 |
100 | Specify the desired width and height in pixels.
101 |
102 |
103 | Use the version_numbers parameter to specify which image versions to resize.
104 |
105 |
106 |
107 |
108 | open_image_directory
109 | If the human companion requests to open the image directory, use this tool.
110 |
111 |
112 | This tool doesn't require any parameters.
113 |
114 |
115 |
116 |
117 |
118 |
119 | [[previous_interactions]]
120 |
121 |
122 |
123 | [[latest_input]]
124 |
125 |
126 | Your Conversational Response:"""
127 |
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import threading
 3 | import time
 4 | import wave
 5 | from datetime import datetime
 6 | from typing import List
 7 | 
 8 | import sounddevice as sd
 9 | from dotenv import load_dotenv
10 | 
11 | from modules.constants import (
12 |     OPENAI_SUPER_ASSISTANT_PROMPT_HEAD,
13 |     PERSONAL_AI_ASSISTANT_PROMPT_HEAD,
14 |     FS,
15 |     CHANNELS,
16 |     DURATION,
17 |     CONVO_TRAIL_CUTOFF,
18 |     ASSISTANT_TYPE,
19 | )
20 | from modules.typings import Interaction
21 | from assistants.assistants import OpenAISuperPAF, OpenAIPAF, AssElevenPAF, GroqElevenPAF
22 | 
23 |
24 | load_dotenv()
25 |
26 |
27 | def record_audio(duration=DURATION, fs=FS, channels=CHANNELS):
28 | """
29 | Simple function to record audio from the microphone.
30 | Gives you DURATION seconds of audio to speak into the microphone.
31 | After DURATION seconds, the recording will stop.
32 | Hit enter to stop the recording at any time.
33 | """
34 |
35 | print("🔴 Recording...")
36 | recording = sd.rec(
37 | int(duration * fs), samplerate=fs, channels=channels, dtype="int16"
38 | )
39 |
40 | def duration_warning():
41 | time.sleep(duration)
42 | if not stop_event.is_set():
43 | print(
44 | "⚠️ Record limit hit - your assistant won't hear what you're saying now. Increase the duration."
45 | )
46 |
47 | stop_event = threading.Event()
48 | warning_thread = threading.Thread(target=duration_warning)
49 | warning_thread.daemon = (
50 | True # Set the thread as daemon so it doesn't block program exit
51 | )
52 | warning_thread.start()
53 |
54 | input("🟡 Press Enter to stop recording...")
55 | stop_event.set()
56 | sd.stop()
57 |
58 | print(f"🍞 Recording Chunk Complete")
59 | return recording
60 |
61 |
62 | def ensure_data_directory_exists():
63 | if not os.path.exists("data"):
64 | os.makedirs("data")
65 |
66 |
67 | def create_audio_file(recording):
68 |     """
69 |     Creates an audio file from the recording.
70 |     """
71 |     ensure_data_directory_exists()
72 |
73 | timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
74 | filename = os.path.join("data", f"audio_{timestamp}.wav")
75 |
76 | with wave.open(filename, "wb") as wf:
77 | wf.setnchannels(CHANNELS)
78 | wf.setsampwidth(2)
79 | wf.setframerate(FS)
80 | wf.writeframes(recording)
81 |
82 | file_size = os.path.getsize(filename)
83 |
84 | print(f"📁 File {filename} has been saved with a size of {file_size} bytes.")
85 |
86 | return filename
87 |
88 |
89 | def build_prompt(latest_input: str, previous_interactions: List[Interaction]) -> str:
90 |
91 | base_prompt = PERSONAL_AI_ASSISTANT_PROMPT_HEAD
92 |
93 | if ASSISTANT_TYPE == "OpenAISuperPAF":
94 | print(f"🚀 Using OpenAI Super Personal AI Assistant Prompt...")
95 | base_prompt = OPENAI_SUPER_ASSISTANT_PROMPT_HEAD
96 |
97 | previous_interactions_str = "\n".join(
98 | [
99 | f"""
100 | {interaction.role}
101 | {interaction.content}
102 | """
103 | for interaction in previous_interactions
104 | ]
105 | )
106 | prepared_prompt = base_prompt.replace(
107 | "[[previous_interactions]]", previous_interactions_str
108 | )
109 |
110 | prepared_prompt = prepared_prompt.replace("[[latest_input]]", latest_input)
111 |
112 | return prepared_prompt
113 |
114 |
115 | def main():
116 | """
117 | In a loop, we:
118 |
119 | 1. Press enter to start recording
120 | 2. Record audio from the microphone for N seconds
121 | 3. When we press enter again, we create an audio file from the recording
122 | 4. Transcribe the audio file
123 | 5. Our AI assistant thinks (prompt) of a response to the transcription
124 | 6. Our AI assistant speaks the response
125 | 7. Delete the audio file
126 | 8. Update previous interactions
127 | """
128 |
129 | previous_interactions: List[Interaction] = []
130 |
131 | if ASSISTANT_TYPE == "OpenAISuperPAF":
132 | assistant = OpenAISuperPAF()
133 | print("🚀 Initialized OpenAI Super Personal AI Assistant...")
134 | elif ASSISTANT_TYPE == "OpenAIPAF":
135 | assistant = OpenAIPAF()
136 | print("🚀 Initialized OpenAI Personal AI Assistant...")
137 | elif ASSISTANT_TYPE == "AssElevenPAF":
138 | assistant = AssElevenPAF()
139 | print("🚀 Initialized AssemblyAI-ElevenLabs Personal AI Assistant...")
140 | elif ASSISTANT_TYPE == "GroqElevenPAF":
141 | assistant = GroqElevenPAF()
142 | print("🚀 Initialized Groq-ElevenLabs Personal AI Assistant...")
143 | else:
144 | raise ValueError(f"Invalid assistant type: {ASSISTANT_TYPE}")
145 |
146 | assistant.setup()
147 |
148 | while True:
149 | try:
150 | input("🎧 Press Enter to start recording...")
151 | recording = record_audio(duration=DURATION, fs=FS, channels=CHANNELS)
152 |
153 | filename = create_audio_file(recording)
154 | transcription = assistant.transcribe(filename)
155 |
156 | print(f"📝 Your Input Transcription: '{transcription}'")
157 |
158 | prompt = build_prompt(transcription, previous_interactions)
159 | response = assistant.think(prompt)
160 |
161 | print(f"🤖 Your Personal AI Assistant Response: '{response}'")
162 |
163 | assistant.speak(response)
164 |
165 | os.remove(filename)
166 |
167 | # Update previous interactions
168 | previous_interactions.append(
169 | Interaction(role="human", content=transcription)
170 | )
171 | previous_interactions.append(
172 | Interaction(role="assistant", content=response)
173 | )
174 |
175 | # Keep only the last CONVO_TRAIL_CUTOFF interactions
176 | if len(previous_interactions) > CONVO_TRAIL_CUTOFF:
177 | previous_interactions = previous_interactions[-CONVO_TRAIL_CUTOFF:]
178 |
179 | print("\nReady for next interaction. Press Ctrl+C to exit.")
180 | except KeyboardInterrupt:
181 | print("\nExiting the program.")
182 | break
183 |
184 |
185 | if __name__ == "__main__":
186 | main()
187 |
--------------------------------------------------------------------------------
/assistants/assistants.py:
--------------------------------------------------------------------------------
1 | import abc
2 | import time
3 | import functools
4 | import uuid
5 | import requests
6 | import sounddevice as sd
7 | import wave
8 | import os
9 | import json
10 | from datetime import datetime
11 | import assemblyai as aai
12 | from elevenlabs import play
13 | from elevenlabs.client import ElevenLabs
14 | from PIL import Image
15 | import subprocess
16 | from modules.constants import (
17 | OPENAI_IMG_AGENT_DIR,
18 | ELEVEN_LABS_CRINGE_VOICE,
19 | ELEVEN_LABS_PRIMARY_SOLID_VOICE,
20 | )
21 | from modules.simple_llm import build_mini_model, build_new_gpt4o, prompt
22 | from dotenv import load_dotenv
23 | import openai
24 | from groq import Groq
25 |
26 | from modules.typings import (
27 | ConvertImageParams,
28 | GenerateImageParams,
29 | ImageRatio,
30 | Style,
31 | ResizeImageParams,
32 | OpenImageDirParams,
33 | )
34 |
35 |
36 | class PersonalAssistantFramework(abc.ABC):
37 | @staticmethod
38 | def timeit_decorator(func):
39 | @functools.wraps(func)
40 | def wrapper(*args, **kwargs):
41 | start_time = time.time()
42 | result = func(*args, **kwargs)
43 | end_time = time.time()
44 | duration = round(end_time - start_time, 2)
45 | print(
46 | f"⏰ {args[0].__class__.__name__} - {func.__name__}() took {duration:.2f} seconds"
47 | )
48 |
49 | json_file = f"{args[0].__class__.__name__}_time_table.json"
50 |
51 | # Read existing data or create an empty list
52 | if os.path.exists(json_file):
53 | with open(json_file, "r") as file:
54 | try:
55 | data = json.load(file)
56 | except json.JSONDecodeError:
57 | data = []
58 | else:
59 | data = []
60 |
61 | # Create new time record
62 | time_record = {
63 | "assistant": args[0].__class__.__name__,
64 | "function": func.__name__,
65 | "duration": f"{duration:.2f}",
66 | "position": 0, # New entry always at the top
67 | }
68 |
69 | # Update positions of existing records
70 | for record in data:
71 | record["position"] += 1
72 |
73 | # Insert new record at the beginning
74 | data.insert(0, time_record)
75 |
76 | # Sort data by position
77 | data.sort(key=lambda x: x["position"])
78 |
79 | # Write updated data back to file
80 | with open(json_file, "w") as file:
81 | json.dump(data, file, indent=2)
82 |
83 | return result
84 |
85 | return wrapper
86 |
87 | @abc.abstractmethod
88 | def setup(self):
89 | pass
90 |
91 | @abc.abstractmethod
92 | def transcribe(self, file_path):
93 | pass
94 |
95 | @abc.abstractmethod
96 | def speak(self, text: str):
97 | pass
98 |
99 | @abc.abstractmethod
100 | def think(self, prompt: str) -> str:
101 | pass
102 |
103 |
104 | class AssElevenPAF(PersonalAssistantFramework):
105 | def setup(self):
106 | aai.settings.api_key = os.getenv("ASSEMBLYAI_API_KEY")
107 | self.elevenlabs_client = ElevenLabs(api_key=os.getenv("ELEVEN_API_KEY"))
108 | self.llm_model = build_mini_model()
109 |
110 | @PersonalAssistantFramework.timeit_decorator
111 | def generate_voice_audio(self, text: str):
112 | audio_generator = self.elevenlabs_client.generate(
113 | text=text,
114 | voice=ELEVEN_LABS_PRIMARY_SOLID_VOICE,
115 | model="eleven_turbo_v2",
116 | stream=False,
117 | )
118 | audio_bytes = b"".join(list(audio_generator))
119 | return audio_bytes
120 |
121 | @PersonalAssistantFramework.timeit_decorator
122 | def transcribe(self, file_path):
123 | transcriber = aai.Transcriber()
124 | transcript = transcriber.transcribe(file_path)
125 | return transcript.text
126 |
127 | def speak(self, text: str):
128 | audio = self.generate_voice_audio(text)
129 | play(audio)
130 |
131 | @PersonalAssistantFramework.timeit_decorator
132 | def think(self, thought: str) -> str:
133 | return prompt(self.llm_model, thought)
134 |
135 |
136 | class OpenAIPAF(PersonalAssistantFramework):
137 | def setup(self):
138 | openai.api_key = os.getenv("OPENAI_API_KEY")
139 | self.llm_model = build_mini_model()
140 |
141 | @PersonalAssistantFramework.timeit_decorator
142 | def transcribe(self, file_path):
143 | with open(file_path, "rb") as audio_file:
144 | transcript = openai.audio.transcriptions.create(
145 | model="whisper-1", # this points to whisper v2. See Docs (https://platform.openai.com/docs/api-reference/audio/createTranscription)
146 | file=audio_file,
147 | )
148 | return transcript.text
149 |
150 | @PersonalAssistantFramework.timeit_decorator
151 | def generate_voice_audio(self, text: str):
152 | response = openai.audio.speech.create(
153 | model="tts-1-hd", voice="shimmer", input=text, response_format="aac"
154 | )
155 | audio_bytes = b"".join(list(response.iter_bytes()))
156 | return audio_bytes
157 |
158 | def speak(self, text: str):
159 | audio = self.generate_voice_audio(text)
160 | play(audio)
161 |
162 | @PersonalAssistantFramework.timeit_decorator
163 | def think(self, thought: str) -> str:
164 | return prompt(self.llm_model, thought)
165 |
166 |
167 | class GroqElevenPAF(PersonalAssistantFramework):
168 | def setup(self):
169 | self.groq_client = Groq()
170 | self.elevenlabs_client = ElevenLabs(api_key=os.getenv("ELEVEN_API_KEY"))
171 | self.llm_model = build_mini_model()
172 |
173 | @PersonalAssistantFramework.timeit_decorator
174 | def transcribe(self, file_path):
175 | with open(file_path, "rb") as file:
176 | transcription = self.groq_client.audio.transcriptions.create(
177 | file=(file_path, file.read()),
178 | model="distil-whisper-large-v3-en",
179 | response_format="text",
180 | )
181 | return str(transcription)
182 |
183 | @PersonalAssistantFramework.timeit_decorator
184 | def generate_voice_audio(self, text: str):
185 | audio_generator = self.elevenlabs_client.generate(
186 | text=text,
187 | voice=ELEVEN_LABS_PRIMARY_SOLID_VOICE,
188 | model="eleven_turbo_v2_5",
189 | stream=False,
190 | )
191 | audio_bytes = b"".join(list(audio_generator))
192 | return audio_bytes
193 |
194 | def speak(self, text: str):
195 | audio = self.generate_voice_audio(text)
196 | play(audio)
197 |
198 | @PersonalAssistantFramework.timeit_decorator
199 | def think(self, thought: str) -> str:
200 | return prompt(self.llm_model, thought)
201 |
202 |
203 | class OpenAISuperPAF(OpenAIPAF):
204 | def setup(self):
205 | super().setup()
206 | openai.api_key = os.getenv("OPENAI_API_KEY")
207 | self.weak_model = build_mini_model()
208 | self.download_directory = os.path.join(os.getcwd(), OPENAI_IMG_AGENT_DIR)
209 | if not os.path.exists(self.download_directory):
210 | os.makedirs(self.download_directory)
211 |
212 | def generate_image(self, generate_image_params: GenerateImageParams) -> bool:
213 |
214 | # handle defaults
215 | if generate_image_params.image_ratio is None:
216 | generate_image_params.image_ratio = ImageRatio.SQUARE
217 | if generate_image_params.quality is None:
218 | generate_image_params.quality = "hd"
219 | if generate_image_params.style is None:
220 | generate_image_params.style = Style.NATURAL
221 |
222 | client = openai.OpenAI()
223 | subdirectory = os.path.join(self.download_directory)
224 | if not os.path.exists(subdirectory):
225 | os.makedirs(subdirectory)
226 |
227 |         for index, image_prompt in enumerate(generate_image_params.prompts):
228 |             print(f"🖼️ Generating image {index + 1} with prompt: {image_prompt}")
229 |             response = client.images.generate(
230 |                 model="dall-e-3",
231 |                 prompt=image_prompt,
232 | size=generate_image_params.image_ratio.value,
233 | quality=generate_image_params.quality,
234 | n=1,
235 | style=generate_image_params.style.value,
236 | )
237 | image_url = response.data[0].url
238 | image_response = requests.get(image_url)
239 | image_path = os.path.join(subdirectory, f"version_{index}.png")
240 | with open(image_path, "wb") as file:
241 | file.write(image_response.content)
242 |
243 | return True
244 |
245 | def convert_image(self, convert_image_params: ConvertImageParams) -> bool:
246 | subdirectory = os.path.join(self.download_directory)
247 | if not os.path.exists(subdirectory):
248 | os.makedirs(subdirectory)
249 |
250 | for index in convert_image_params.version_numbers:
251 | input_path = os.path.join(subdirectory, f"version_{index}.png")
252 | if not os.path.exists(input_path):
253 | print(f"🟡 Warning: File {input_path} does not exist. Skipping.")
254 | continue
255 |
256 | output_path = os.path.join(
257 |                 subdirectory, f"version_{index}.{convert_image_params.image_format.value}"
258 | )
259 |
260 | try:
261 | with Image.open(input_path) as img:
262 | img.save(
263 | output_path,
264 | format=convert_image_params.image_format.value.upper(),
265 | )
266 | print(f"🖼️ Converted {input_path} to {output_path}")
267 | except Exception as e:
268 | print(f"Error converting {input_path}: {str(e)}")
269 | return False
270 |
271 | return True
272 |
273 | def resize_image(self, resize_image_params: ResizeImageParams) -> bool:
274 | subdirectory = os.path.join(self.download_directory)
275 | if not os.path.exists(subdirectory):
276 | os.makedirs(subdirectory)
277 |
278 | for index in resize_image_params.version_numbers:
279 | input_path = os.path.join(subdirectory, f"version_{index}.png")
280 | if not os.path.exists(input_path):
281 | print(f"🟡 Warning: File {input_path} does not exist. Skipping.")
282 | continue
283 |
284 | output_path = os.path.join(
285 | subdirectory,
286 | f"version_{index}_resized_w{resize_image_params.width}_h{resize_image_params.height}.png",
287 | )
288 |
289 | try:
290 | with Image.open(input_path) as img:
291 | resized_img = img.resize(
292 | (resize_image_params.width, resize_image_params.height)
293 | )
294 | resized_img.save(output_path)
295 | print(f"🖼️ Resized {input_path} to {output_path}")
296 | except Exception as e:
297 | print(f"Error resizing {input_path}: {str(e)}")
298 | return False
299 |
300 | return True
301 |
302 | def open_image_directory(self, open_image_dir_params: OpenImageDirParams) -> bool:
303 | try:
304 | if os.name == "nt": # For Windows
305 | os.startfile(self.download_directory)
306 |             elif os.name == "posix":  # "open" on macOS, "xdg-open" on Linux
307 |                 subprocess.call(["open" if os.uname().sysname == "Darwin" else "xdg-open", self.download_directory])
308 | print(f"📂 Opened image directory: {self.download_directory}")
309 | return True
310 | except Exception as e:
311 | print(f"Error opening image directory: {str(e)}")
312 | return False
313 |
314 | @PersonalAssistantFramework.timeit_decorator
315 | def think(self, thought: str) -> str:
316 | client = openai.OpenAI()
317 | completion = client.beta.chat.completions.parse(
318 | model="gpt-4o-2024-08-06",
319 | messages=[
320 | {"role": "system", "content": "You are a helpful assistant."},
321 | {"role": "user", "content": thought},
322 | ],
323 | tools=[
324 | openai.pydantic_function_tool(GenerateImageParams),
325 | openai.pydantic_function_tool(ConvertImageParams),
326 | openai.pydantic_function_tool(ResizeImageParams),
327 | openai.pydantic_function_tool(OpenImageDirParams),
328 | ],
329 | )
330 |
331 | message = completion.choices[0].message
332 |
333 | if message.tool_calls:
334 |
335 | tool_call = message.tool_calls[0]
336 |
337 | pretty_parsed_arguments = (
338 | tool_call.function.parsed_arguments.model_dump_json(indent=2)
339 | )
340 |
341 | print(
342 | f"""Tool call found: '{tool_call.function.name}(
343 | {pretty_parsed_arguments}
344 | )'.
345 | Calling..."""
346 | )
347 |
348 |             success = False
349 | 
350 |             tool_function_map = {
351 |                 "GenerateImageParams": self.generate_image,
352 |                 "ConvertImageParams": self.convert_image,
353 |                 "ResizeImageParams": self.resize_image,
354 |                 "OpenImageDirParams": self.open_image_directory,
355 |             }
356 | 
357 |             if tool_call.function.name in tool_function_map:
358 |                 # 🚀 GUARANTEED OUTPUT STRUCTURE 🚀
359 |                 params = tool_call.function.parsed_arguments
360 |                 success = tool_function_map[tool_call.function.name](params)
361 | 
362 |             if success:
363 |                 tool_call_success_prompt = f"Quickly let your human companion know that you've run the '{tool_call.function.name}' tool. Respond in a short, conversational manner, no fluff."
364 |             else:
365 |                 tool_call_success_prompt = f"Quickly let your human companion know that the '{tool_call.function.name}' tool call did not complete successfully. Respond in a short, conversational manner, no fluff."
366 | 
367 |             # Always speak a response, even when the tool call fails or is unknown
368 |             return prompt(self.weak_model, tool_call_success_prompt)
369 | 
370 |         else:
371 |             # just a normal thought
372 |             return prompt(self.weak_model, thought)
373 | 
--------------------------------------------------------------------------------