├── main_realtime_api.py ├── modules ├── __init__.py ├── typings.py ├── simple_llm.py └── constants.py ├── img ├── own-your-ai.png └── reliable-ai-agents.png ├── .env.sample ├── requirements.txt ├── .gitignore ├── README.md ├── structured_outputs_example.py ├── main.py └── assistants └── assistants.py /main_realtime_api.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /modules/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /img/own-your-ai.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/disler/personal-ai-starter-pack/HEAD/img/own-your-ai.png -------------------------------------------------------------------------------- /img/reliable-ai-agents.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/disler/personal-ai-starter-pack/HEAD/img/reliable-ai-agents.png -------------------------------------------------------------------------------- /.env.sample: -------------------------------------------------------------------------------- 1 | GOOGLE_API_KEY= 2 | ASSEMBLYAI_API_KEY= 3 | ELEVEN_API_KEY= 4 | OPENAI_API_KEY= 5 | 6 | GEMINI_API_KEY= 7 | ANTHROPIC_API_KEY= 8 | AIDER_AUTO_COMMITS=false 9 | 10 | GROQ_API_KEY= 11 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | python-dotenv 2 | openai 3 | anthropic 4 | groq 5 | pytest 6 | pydantic 7 | assemblyai 8 | assemblyai[extras] 9 | sounddevice 10 | numpy 11 | elevenlabs 12 | llm 13 | llm-claude 14 | llm-claude-3 15 | llm-ollama 16 | llm-gemini 17 | Pillow 18 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .aider.ident.cache.v1 2 | .aider.tags.cache.v1 3 | .aider.chat.history.md 4 | .aider.input.history 5 | 6 | log.txt 7 | 8 | agent_results/ 9 | 10 | .vercel 11 | *.log 12 | *.pyc 13 | __pycache__ 14 | 15 | # Environments 16 | .env 17 | .venv 18 | env/ 19 | venv/ 20 | ENV/ 21 | env.bak/ 22 | venv.bak/ 23 | .env.yml 24 | .env.yaml 25 | .env.billing.yaml 26 | .env.billing.yml 27 | 28 | .aider* 29 | 30 | *.wav 31 | **.pyc 32 | modules/__pycache__ 33 | 34 | *.mp3 35 | *.mp4 36 | *.wav 37 | *.aac 38 | *.ogg 39 | *.flac 40 | *.m4a 41 | *.mp4 42 | 43 | *.json 44 | 45 | data/ -------------------------------------------------------------------------------- /modules/typings.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | from typing import Any, Dict, List, Optional, Union 3 | 4 | import warnings 5 | 6 | # Suppress specific warnings 7 | warnings.filterwarnings("ignore", message="Valid config keys have changed in V2:") 8 | warnings.filterwarnings( 9 | "ignore", message='Field "model_id" has conflict with protected namespace "model_".' 
10 | ) 11 | 12 | 13 | class Interaction(BaseModel): 14 | role: str 15 | content: str 16 | 17 | 18 | from enum import Enum 19 | 20 | 21 | class ImageRatio(str, Enum): 22 | SQUARE = "1024x1024" 23 | PORTRAIT = "1024x1792" 24 | LANDSCAPE = "1792x1024" 25 | 26 | 27 | class Style(str, Enum): 28 | VIVID = "vivid" 29 | NATURAL = "natural" 30 | 31 | 32 | class Quality(str, Enum): 33 | STANDARD = "standard" 34 | HD = "hd" 35 | 36 | 37 | class GenerateImageParams(BaseModel): 38 | prompts: List[str] 39 | quality: Quality 40 | image_ratio: Optional[ImageRatio] 41 | style: Optional[Style] 42 | 43 | 44 | class ImageFormat(str, Enum): 45 | JPEG = "jpeg" 46 | PNG = "png" 47 | GIF = "gif" 48 | BMP = "bmp" 49 | TIFF = "tiff" 50 | 51 | 52 | class ConvertImageParams(BaseModel): 53 | version_numbers: List[int] 54 | image_format: ImageFormat 55 | 56 | 57 | class ResizeImageParams(BaseModel): 58 | version_numbers: List[int] 59 | width: int 60 | height: int 61 | 62 | 63 | class OpenImageDirParams(BaseModel): 64 | pass 65 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Fast Personal AI Assistant & Structured Output 2 | > 3 | > A quick-start personal AI assistant framework using OpenAI, Groq, AssemblyAI, and ElevenLabs. 4 | > 5 | > And a breakdown of the reliability of AI agents with the new structured outputs. 6 | 7 | ![reliable-ai-agents.png](./img/reliable-ai-agents.png) 8 | ![own-your-ai](./img/own-your-ai.png) 9 | 10 | 11 | ## Setup 12 | 13 | - Create and activate virtual environment: 14 | ```bash 15 | python -m venv venv 16 | source venv/bin/activate # On Windows, use `venv\Scripts\activate` 17 | ``` 18 | 19 | - Install dependencies: 20 | ```bash 21 | pip install -r requirements.txt 22 | ``` 23 | 24 | - Set up environment variables: 25 | ```bash 26 | cp .env.sample .env 27 | # Edit .env file and add your API keys 28 | ``` 29 | I recommend starting with the OpenAI assistant, since you only need to set up the OpenAI API key. 30 | 31 | - Run the main script: 32 | ```bash 33 | python main.py 34 | ``` 35 | 36 | - Run the structured output script: 37 | ```bash 38 | python structured_outputs_example.py 39 | ``` 40 | 41 | - Press `Enter` to start recording, and `Enter` again to stop recording. 42 | 43 | - Adjust the maximum duration of the recording in `constants.py: DURATION` 44 | 45 | - Update configuration variables in `constants.py` 46 | - Tweak naming. 47 | - Update the prompt to your liking. 48 | - Update the assistant type (`ASSISTANT_TYPE`) to the one you want to use, as shown in the sketch below.
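For reference, a minimal sketch of the `modules/constants.py` values you are most likely to touch (the values shown are the repo defaults):

```python
# modules/constants.py -- the settings most people change first (repo defaults shown)
PERSONAL_AI_ASSISTANT_NAME = "Ada"  # what the assistant calls itself
HUMAN_COMPANION_NAME = "Dan"        # what the assistant calls you
DURATION = 30                       # maximum recording length in seconds
CONVO_TRAIL_CUTOFF = 30             # how many past interactions stay in the prompt
ASSISTANT_TYPE = "OpenAISuperPAF"   # or "OpenAIPAF", "GroqElevenPAF", "AssElevenPAF"
```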
49 | 50 | ## Watch the walk through video 51 | - [Coding RELIABLE AI Agents: Legit Structured Outputs Use Cases (Strawberry Agent?)](https://youtu.be/PoO7Zjsvx0k) 52 | - [CONTROL your Personal AI Assistant with GPT-4o mini & ElevenLabs](https://youtu.be/ikaKpfUOb0U) 53 | 54 | ## Resources 55 | - https://openai.com/index/introducing-structured-outputs-in-the-api/ 56 | - https://www.assemblyai.com/ 57 | - https://console.groq.com/docs/speech-text 58 | - https://console.groq.com/docs/libraries 59 | - https://platform.openai.com/docs/guides/speech-to-text 60 | - https://platform.openai.com/docs/guides/text-to-speech 61 | - https://platform.openai.com/docs/api-reference/audio#audio/createTranscription-prompt 62 | - https://openai.com/api/pricing/ 63 | -------------------------------------------------------------------------------- /modules/simple_llm.py: -------------------------------------------------------------------------------- 1 | import llm 2 | from dotenv import load_dotenv 3 | import os 4 | 5 | load_dotenv() 6 | 7 | 8 | def prompt(model: llm.Model, prompt: str): 9 | res = model.prompt(prompt) 10 | return res.text() 11 | 12 | 13 | def get_model_name(model: llm.Model): 14 | return model.model_id 15 | 16 | 17 | def build_models(): 18 | ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY") 19 | 20 | sonnet_3_5_model: llm.Model = llm.get_model("claude-3.5-sonnet") 21 | sonnet_3_5_model.key = ANTHROPIC_API_KEY 22 | 23 | return sonnet_3_5_model 24 | 25 | 26 | def build_big_3_models(): 27 | ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY") 28 | OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") 29 | GEMINI_API_KEY = os.getenv("GEMINI_API_KEY") 30 | 31 | sonnet_3_5_model: llm.Model = llm.get_model("claude-3.5-sonnet") 32 | sonnet_3_5_model.key = ANTHROPIC_API_KEY 33 | 34 | gpt4_o_model: llm.Model = llm.get_model("4o") 35 | gpt4_o_model.key = OPENAI_API_KEY 36 | 37 | gemini_1_5_pro_model: llm.Model = llm.get_model("gemini-1.5-pro-latest") 38 | gemini_1_5_pro_model.key = GEMINI_API_KEY 39 | 40 | return sonnet_3_5_model, gpt4_o_model, gemini_1_5_pro_model 41 | 42 | 43 | def build_big_3_plus_mini_models(): 44 | 45 | ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY") 46 | OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") 47 | GEMINI_API_KEY = os.getenv("GEMINI_API_KEY") 48 | 49 | sonnet_3_5_model: llm.Model = llm.get_model("claude-3.5-sonnet") 50 | sonnet_3_5_model.key = ANTHROPIC_API_KEY 51 | 52 | gpt4_o_model: llm.Model = llm.get_model("4o") 53 | gpt4_o_model.key = OPENAI_API_KEY 54 | 55 | gemini_1_5_pro_model: llm.Model = llm.get_model("gemini-1.5-pro-latest") 56 | gemini_1_5_pro_model.key = GEMINI_API_KEY 57 | 58 | gpt4_o_mini_model: llm.Model = llm.get_model("gpt-4o-mini") 59 | gpt4_o_mini_model.key = OPENAI_API_KEY 60 | 61 | return sonnet_3_5_model, gpt4_o_model, gemini_1_5_pro_model, gpt4_o_mini_model 62 | 63 | 64 | def build_mini_model(): 65 | OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") 66 | 67 | gpt4_o_mini_model: llm.Model = llm.get_model("gpt-4o-mini") 68 | gpt4_o_mini_model.key = OPENAI_API_KEY 69 | 70 | return gpt4_o_mini_model 71 | 72 | 73 | def build_new_gpt4o(): 74 | OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") 75 | 76 | gpt4_o_model: llm.Model = llm.get_model("gpt-4o-2024-08-06") 77 | gpt4_o_model.key = OPENAI_API_KEY 78 | 79 | return gpt4_o_model 80 | -------------------------------------------------------------------------------- /structured_outputs_example.py: -------------------------------------------------------------------------------- 1 | # 
https://openai.com/index/introducing-structured-outputs-in-the-api/ 2 | 3 | from enum import Enum 4 | from typing import Union 5 | 6 | from pydantic import BaseModel 7 | 8 | import openai 9 | from openai import OpenAI 10 | 11 | NEW_GPT_4o_AUG = "gpt-4o-2024-08-06" 12 | 13 | 14 | def structured_output_tool_call(): 15 | 16 | class Table(str, Enum): 17 | orders = "orders" 18 | customers = "customers" 19 | products = "products" 20 | 21 | class Column(str, Enum): 22 | id = "id" 23 | status = "status" 24 | expected_delivery_date = "expected_delivery_date" 25 | delivered_at = "delivered_at" 26 | shipped_at = "shipped_at" 27 | ordered_at = "ordered_at" 28 | canceled_at = "canceled_at" 29 | 30 | class Operator(str, Enum): 31 | eq = "=" 32 | gt = ">" 33 | lt = "<" 34 | le = "<=" 35 | ge = ">=" 36 | ne = "!=" 37 | 38 | class OrderBy(str, Enum): 39 | asc = "asc" 40 | desc = "desc" 41 | 42 | class DynamicValue(BaseModel): 43 | column_name: str 44 | 45 | class Condition(BaseModel): 46 | column: str 47 | operator: Operator 48 | value: Union[str, int, DynamicValue] 49 | 50 | class Query(BaseModel): 51 | table_name: Table 52 | columns: list[Column] 53 | conditions: list[Condition] 54 | order_by: OrderBy 55 | 56 | client = OpenAI() 57 | 58 | completion = client.beta.chat.completions.parse( 59 | model=NEW_GPT_4o_AUG, 60 | messages=[ 61 | { 62 | "role": "system", 63 | "content": "You are a helpful assistant. The current date is August 6, 2024. You help users query for the data they are looking for by calling the query function.", 64 | }, 65 | { 66 | "role": "user", 67 | "content": "Find all the orders that were cancelled in the first quarter of 2022", 68 | }, 69 | ], 70 | tools=[ 71 | openai.pydantic_function_tool(Query), 72 | ], 73 | ) 74 | 75 | def mock_query_function(query: Query): 76 | print(f"Table Name: {query.table_name}") 77 | print("Columns:") 78 | for column in query.columns: 79 | print(f" - {column}") 80 | print("Conditions:") 81 | for condition in query.conditions: 82 | print( 83 | f" - Column: {condition.column}, Operator: {condition.operator}, Value: {condition.value}" 84 | ) 85 | print(f"Order By: {query.order_by}") 86 | 87 | print( 88 | "completion.choices and completion.choices[0].message", 89 | completion.choices and completion.choices[0].message, 90 | ) 91 | 92 | # Parse the completion result and pass it to the mock function if available 93 | if completion.choices and completion.choices[0].message.tool_calls: 94 | if completion.choices[0].message.tool_calls[0].function.name == "Query": # tool name matches the Query model's class name 95 | query_result = ( 96 | completion.choices[0].message.tool_calls[0].function.parsed_arguments 97 | ) 98 | mock_query_function(query_result) 99 | else: 100 | print(f"{completion.choices and completion.choices[0].message.content}") 101 | else: 102 | print(f"{completion.choices and completion.choices[0].message.content}") 103 | 104 | 105 | def structured_output_minimal(): 106 | 107 | class Step(BaseModel): 108 | explanation: str 109 | output: str 110 | 111 | class MathResponse(BaseModel): 112 | steps: list[Step] 113 | final_answer: str 114 | 115 | client = OpenAI() 116 | 117 | completion = client.beta.chat.completions.parse( 118 | model=NEW_GPT_4o_AUG, 119 | messages=[ 120 | {"role": "system", "content": "You are a helpful math tutor."}, 121 | {"role": "user", "content": "solve 8x + 31 = 2"}, 122 | ], 123 | response_format=MathResponse, 124 | ) 125 | 126 | message = completion.choices[0].message 127 | if message.parsed: 128 | print(message.parsed.steps) 129 | print(message.parsed.final_answer) 130 | else:
131 | print(message.refusal) 132 | 133 | 134 | structured_output_minimal() 135 | structured_output_tool_call() 136 | -------------------------------------------------------------------------------- /modules/constants.py: -------------------------------------------------------------------------------- 1 | # CONSTANTS update these to fit your personal flow 2 | 3 | PERSONAL_AI_ASSISTANT_NAME = "Ada" 4 | HUMAN_COMPANION_NAME = "Dan" 5 | 6 | CONVO_TRAIL_CUTOFF = 30 7 | 8 | FS = 44100 # Sample rate 9 | CHANNELS = 1 # Mono audio 10 | DURATION = 30 # Duration of the recording in seconds 11 | 12 | ELEVEN_LABS_PRIMARY_SOLID_VOICE = "WejK3H1m7MI9CHnIjW9K" 13 | ELEVEN_LABS_CRINGE_VOICE = "uyfkySFC5J00qZ6iLAdh" 14 | 15 | OPENAI_IMG_AGENT_DIR = "data/images/openai" 16 | 17 | 18 | # --------------------------- ASSISTANT TYPES --------------------------- 19 | 20 | ASSISTANT_TYPE = "OpenAISuperPAF" 21 | 22 | # ASSISTANT_TYPE = "OpenAIPAF" 23 | 24 | # ASSISTANT_TYPE = "GroqElevenPAF" 25 | 26 | # ASSISTANT_TYPE = "AssElevenPAF" 27 | 28 | 29 | # ---------------------------- PROMPT 30 | 31 | PERSONAL_AI_ASSISTANT_PROMPT_HEAD = f"""You are a friendly, ultra helpful, attentive, concise AI assistant named '{PERSONAL_AI_ASSISTANT_NAME}'. 32 | 33 | 34 | You work with your human companion '{HUMAN_COMPANION_NAME}' to build, collaborate, and connect. 35 | We both like short, concise, conversational interactions. 36 | You're responding to '{HUMAN_COMPANION_NAME}'s latest-input. 37 | Respond in a short, conversational matter. Exclude meta-data, markdown, dashes, asterisks, etc. 38 | When building your response, consider our previous-interactions as well, but focus primarily on the latest-input. 39 | When you're asked for more details, add more details and be more verbose. 40 | Be friendly, helpful, and interested. Ask questions where appropriate. 41 | 42 | 43 | 44 | [[previous_interactions]] 45 | 46 | 47 | 48 | [[latest_input]] 49 | 50 | 51 | Your Conversational Response:""" 52 | 53 | 54 | OPENAI_SUPER_ASSISTANT_PROMPT_HEAD = f"""You are a friendly, ultra helpful, attentive, concise AI assistant named '{PERSONAL_AI_ASSISTANT_NAME}'. 55 | 56 | 57 | You work with your human companion '{HUMAN_COMPANION_NAME}' to build, collaborate, and connect. 58 | We both like short, concise, conversational interactions. 59 | You're responding to '{HUMAN_COMPANION_NAME}'s latest-input. 60 | Respond in a short, conversational matter. Exclude meta-data, markdown, dashes, asterisks, etc. 61 | When building your response, consider our previous-interactions as well, but focus primarily on the latest-input. 62 | When you're asked for more details, add more details and be more verbose. 63 | Be friendly, helpful, and interested. Ask questions where appropriate. 64 | You can use various tools to run functionality for your human companion. 65 | 66 | 67 | 68 | 69 | generate_image 70 | If the human companion requests an image, use this tool. 71 | 72 | 73 | Unless otherwise specified, default quality to 'hd'. 74 | 75 | 76 | If a user asks for a certain number of images, append additional prompts parameter with that number of prompts. 77 | 78 | 79 | Be sure to create as many images as the user requested by adding them to the prompts parameter. 80 | 81 | 82 | 83 | 84 | convert_image 85 | If the human companion requests an image format conversion, use this tool. 86 | 87 | 88 | Ensure the image_format parameter is set to the desired format (e.g., 'jpg', 'png'). 89 | 90 | 91 | Use the version_numbers parameter to specify which image versions to convert. 
92 | 93 | 94 | 95 | 96 | resize_image 97 | If the human companion requests an image resize, use this tool. 98 | 99 | 100 | Specify the desired width and height in pixels. 101 | 102 | 103 | Use the version_numbers parameter to specify which image versions to resize. 104 | 105 | 106 | 107 | 108 | open_image_directory 109 | If the human companion requests to open the image directory, use this tool. 110 | 111 | 112 | This tool doesn't require any parameters. 113 | 114 | 115 | 116 | 117 | 118 | 119 | [[previous_interactions]] 120 | 121 | 122 | 123 | [[latest_input]] 124 | 125 | 126 | Your Conversational Response:""" 127 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import time 2 | from typing import List 3 | from modules.typings import Interaction 4 | import sounddevice as sd 5 | import wave 6 | import os 7 | from datetime import datetime 8 | from assistants.assistants import OpenAISuperPAF 9 | import threading 10 | from dotenv import load_dotenv 11 | from modules.constants import ( 12 | OPENAI_SUPER_ASSISTANT_PROMPT_HEAD, 13 | PERSONAL_AI_ASSISTANT_PROMPT_HEAD, 14 | FS, 15 | CHANNELS, 16 | DURATION, 17 | CONVO_TRAIL_CUTOFF, 18 | ASSISTANT_TYPE, 19 | ) 20 | 21 | from modules.typings import Interaction 22 | from assistants.assistants import OpenAISuperPAF, OpenAIPAF, AssElevenPAF, GroqElevenPAF 23 | 24 | load_dotenv() 25 | 26 | 27 | def record_audio(duration=DURATION, fs=FS, channels=CHANNELS): 28 | """ 29 | Simple function to record audio from the microphone. 30 | Gives you DURATION seconds of audio to speak into the microphone. 31 | After DURATION seconds, the recording will stop. 32 | Hit enter to stop the recording at any time. 33 | """ 34 | 35 | print("🔴 Recording...") 36 | recording = sd.rec( 37 | int(duration * fs), samplerate=fs, channels=channels, dtype="int16" 38 | ) 39 | 40 | def duration_warning(): 41 | time.sleep(duration) 42 | if not stop_event.is_set(): 43 | print( 44 | "⚠️ Record limit hit - your assistant won't hear what you're saying now. Increase the duration." 45 | ) 46 | 47 | stop_event = threading.Event() 48 | warning_thread = threading.Thread(target=duration_warning) 49 | warning_thread.daemon = ( 50 | True # Set the thread as daemon so it doesn't block program exit 51 | ) 52 | warning_thread.start() 53 | 54 | input("🟡 Press Enter to stop recording...") 55 | stop_event.set() 56 | sd.stop() 57 | 58 | print(f"🍞 Recording Chunk Complete") 59 | return recording 60 | 61 | 62 | def ensure_data_directory_exists(): 63 | if not os.path.exists("data"): 64 | os.makedirs("data") 65 | 66 | 67 | def create_audio_file(recording): 68 | ensure_data_directory_exists() 69 | """ 70 | Creates an audio file from the recording. 
71 | """ 72 | 73 | timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") 74 | filename = os.path.join("data", f"audio_{timestamp}.wav") 75 | 76 | with wave.open(filename, "wb") as wf: 77 | wf.setnchannels(CHANNELS) 78 | wf.setsampwidth(2) 79 | wf.setframerate(FS) 80 | wf.writeframes(recording) 81 | 82 | file_size = os.path.getsize(filename) 83 | 84 | print(f"📁 File {filename} has been saved with a size of {file_size} bytes.") 85 | 86 | return filename 87 | 88 | 89 | def build_prompt(latest_input: str, previous_interactions: List[Interaction]) -> str: 90 | 91 | base_prompt = PERSONAL_AI_ASSISTANT_PROMPT_HEAD 92 | 93 | if ASSISTANT_TYPE == "OpenAISuperPAF": 94 | print(f"🚀 Using OpenAI Super Personal AI Assistant Prompt...") 95 | base_prompt = OPENAI_SUPER_ASSISTANT_PROMPT_HEAD 96 | 97 | previous_interactions_str = "\n".join( 98 | [ 99 | f""" 100 | {interaction.role} 101 | {interaction.content} 102 | """ 103 | for interaction in previous_interactions 104 | ] 105 | ) 106 | prepared_prompt = base_prompt.replace( 107 | "[[previous_interactions]]", previous_interactions_str 108 | ) 109 | 110 | prepared_prompt = prepared_prompt.replace("[[latest_input]]", latest_input) 111 | 112 | return prepared_prompt 113 | 114 | 115 | def main(): 116 | """ 117 | In a loop, we: 118 | 119 | 1. Press enter to start recording 120 | 2. Record audio from the microphone for N seconds 121 | 3. When we press enter again, we create an audio file from the recording 122 | 4. Transcribe the audio file 123 | 5. Our AI assistant thinks (prompt) of a response to the transcription 124 | 6. Our AI assistant speaks the response 125 | 7. Delete the audio file 126 | 8. Update previous interactions 127 | """ 128 | 129 | previous_interactions: List[Interaction] = [] 130 | 131 | if ASSISTANT_TYPE == "OpenAISuperPAF": 132 | assistant = OpenAISuperPAF() 133 | print("🚀 Initialized OpenAI Super Personal AI Assistant...") 134 | elif ASSISTANT_TYPE == "OpenAIPAF": 135 | assistant = OpenAIPAF() 136 | print("🚀 Initialized OpenAI Personal AI Assistant...") 137 | elif ASSISTANT_TYPE == "AssElevenPAF": 138 | assistant = AssElevenPAF() 139 | print("🚀 Initialized AssemblyAI-ElevenLabs Personal AI Assistant...") 140 | elif ASSISTANT_TYPE == "GroqElevenPAF": 141 | assistant = GroqElevenPAF() 142 | print("🚀 Initialized Groq-ElevenLabs Personal AI Assistant...") 143 | else: 144 | raise ValueError(f"Invalid assistant type: {ASSISTANT_TYPE}") 145 | 146 | assistant.setup() 147 | 148 | while True: 149 | try: 150 | input("🎧 Press Enter to start recording...") 151 | recording = record_audio(duration=DURATION, fs=FS, channels=CHANNELS) 152 | 153 | filename = create_audio_file(recording) 154 | transcription = assistant.transcribe(filename) 155 | 156 | print(f"📝 Your Input Transcription: '{transcription}'") 157 | 158 | prompt = build_prompt(transcription, previous_interactions) 159 | response = assistant.think(prompt) 160 | 161 | print(f"🤖 Your Personal AI Assistant Response: '{response}'") 162 | 163 | assistant.speak(response) 164 | 165 | os.remove(filename) 166 | 167 | # Update previous interactions 168 | previous_interactions.append( 169 | Interaction(role="human", content=transcription) 170 | ) 171 | previous_interactions.append( 172 | Interaction(role="assistant", content=response) 173 | ) 174 | 175 | # Keep only the last CONVO_TRAIL_CUTOFF interactions 176 | if len(previous_interactions) > CONVO_TRAIL_CUTOFF: 177 | previous_interactions = previous_interactions[-CONVO_TRAIL_CUTOFF:] 178 | 179 | print("\nReady for next interaction. 
Press Ctrl+C to exit.") 180 | except KeyboardInterrupt: 181 | print("\nExiting the program.") 182 | break 183 | 184 | 185 | if __name__ == "__main__": 186 | main() 187 | -------------------------------------------------------------------------------- /assistants/assistants.py: -------------------------------------------------------------------------------- 1 | import abc 2 | import time 3 | import functools 4 | import uuid 5 | import requests 6 | import sounddevice as sd 7 | import wave 8 | import os 9 | import json 10 | from datetime import datetime 11 | import assemblyai as aai 12 | from elevenlabs import play 13 | from elevenlabs.client import ElevenLabs 14 | from PIL import Image 15 | import subprocess 16 | from modules.constants import ( 17 | OPENAI_IMG_AGENT_DIR, 18 | ELEVEN_LABS_CRINGE_VOICE, 19 | ELEVEN_LABS_PRIMARY_SOLID_VOICE, 20 | ) 21 | from modules.simple_llm import build_mini_model, build_new_gpt4o, prompt 22 | from dotenv import load_dotenv 23 | import openai 24 | from groq import Groq 25 | 26 | from modules.typings import ( 27 | ConvertImageParams, 28 | GenerateImageParams, 29 | ImageRatio, 30 | Style, 31 | ResizeImageParams, 32 | OpenImageDirParams, 33 | ) 34 | 35 | 36 | class PersonalAssistantFramework(abc.ABC): 37 | @staticmethod 38 | def timeit_decorator(func): 39 | @functools.wraps(func) 40 | def wrapper(*args, **kwargs): 41 | start_time = time.time() 42 | result = func(*args, **kwargs) 43 | end_time = time.time() 44 | duration = round(end_time - start_time, 2) 45 | print( 46 | f"⏰ {args[0].__class__.__name__} - {func.__name__}() took {duration:.2f} seconds" 47 | ) 48 | 49 | json_file = f"{args[0].__class__.__name__}_time_table.json" 50 | 51 | # Read existing data or create an empty list 52 | if os.path.exists(json_file): 53 | with open(json_file, "r") as file: 54 | try: 55 | data = json.load(file) 56 | except json.JSONDecodeError: 57 | data = [] 58 | else: 59 | data = [] 60 | 61 | # Create new time record 62 | time_record = { 63 | "assistant": args[0].__class__.__name__, 64 | "function": func.__name__, 65 | "duration": f"{duration:.2f}", 66 | "position": 0, # New entry always at the top 67 | } 68 | 69 | # Update positions of existing records 70 | for record in data: 71 | record["position"] += 1 72 | 73 | # Insert new record at the beginning 74 | data.insert(0, time_record) 75 | 76 | # Sort data by position 77 | data.sort(key=lambda x: x["position"]) 78 | 79 | # Write updated data back to file 80 | with open(json_file, "w") as file: 81 | json.dump(data, file, indent=2) 82 | 83 | return result 84 | 85 | return wrapper 86 | 87 | @abc.abstractmethod 88 | def setup(self): 89 | pass 90 | 91 | @abc.abstractmethod 92 | def transcribe(self, file_path): 93 | pass 94 | 95 | @abc.abstractmethod 96 | def speak(self, text: str): 97 | pass 98 | 99 | @abc.abstractmethod 100 | def think(self, prompt: str) -> str: 101 | pass 102 | 103 | 104 | class AssElevenPAF(PersonalAssistantFramework): 105 | def setup(self): 106 | aai.settings.api_key = os.getenv("ASSEMBLYAI_API_KEY") 107 | self.elevenlabs_client = ElevenLabs(api_key=os.getenv("ELEVEN_API_KEY")) 108 | self.llm_model = build_mini_model() 109 | 110 | @PersonalAssistantFramework.timeit_decorator 111 | def generate_voice_audio(self, text: str): 112 | audio_generator = self.elevenlabs_client.generate( 113 | text=text, 114 | voice=ELEVEN_LABS_PRIMARY_SOLID_VOICE, 115 | model="eleven_turbo_v2", 116 | stream=False, 117 | ) 118 | audio_bytes = b"".join(list(audio_generator)) 119 | return audio_bytes 120 | 121 | 
@PersonalAssistantFramework.timeit_decorator 122 | def transcribe(self, file_path): 123 | transcriber = aai.Transcriber() 124 | transcript = transcriber.transcribe(file_path) 125 | return transcript.text 126 | 127 | def speak(self, text: str): 128 | audio = self.generate_voice_audio(text) 129 | play(audio) 130 | 131 | @PersonalAssistantFramework.timeit_decorator 132 | def think(self, thought: str) -> str: 133 | return prompt(self.llm_model, thought) 134 | 135 | 136 | class OpenAIPAF(PersonalAssistantFramework): 137 | def setup(self): 138 | openai.api_key = os.getenv("OPENAI_API_KEY") 139 | self.llm_model = build_mini_model() 140 | 141 | @PersonalAssistantFramework.timeit_decorator 142 | def transcribe(self, file_path): 143 | with open(file_path, "rb") as audio_file: 144 | transcript = openai.audio.transcriptions.create( 145 | model="whisper-1", # this points to whisper v2. See Docs (https://platform.openai.com/docs/api-reference/audio/createTranscription) 146 | file=audio_file, 147 | ) 148 | return transcript.text 149 | 150 | @PersonalAssistantFramework.timeit_decorator 151 | def generate_voice_audio(self, text: str): 152 | response = openai.audio.speech.create( 153 | model="tts-1-hd", voice="shimmer", input=text, response_format="aac" 154 | ) 155 | audio_bytes = b"".join(list(response.iter_bytes())) 156 | return audio_bytes 157 | 158 | def speak(self, text: str): 159 | audio = self.generate_voice_audio(text) 160 | play(audio) 161 | 162 | @PersonalAssistantFramework.timeit_decorator 163 | def think(self, thought: str) -> str: 164 | return prompt(self.llm_model, thought) 165 | 166 | 167 | class GroqElevenPAF(PersonalAssistantFramework): 168 | def setup(self): 169 | self.groq_client = Groq() 170 | self.elevenlabs_client = ElevenLabs(api_key=os.getenv("ELEVEN_API_KEY")) 171 | self.llm_model = build_mini_model() 172 | 173 | @PersonalAssistantFramework.timeit_decorator 174 | def transcribe(self, file_path): 175 | with open(file_path, "rb") as file: 176 | transcription = self.groq_client.audio.transcriptions.create( 177 | file=(file_path, file.read()), 178 | model="distil-whisper-large-v3-en", 179 | response_format="text", 180 | ) 181 | return str(transcription) 182 | 183 | @PersonalAssistantFramework.timeit_decorator 184 | def generate_voice_audio(self, text: str): 185 | audio_generator = self.elevenlabs_client.generate( 186 | text=text, 187 | voice=ELEVEN_LABS_PRIMARY_SOLID_VOICE, 188 | model="eleven_turbo_v2_5", 189 | stream=False, 190 | ) 191 | audio_bytes = b"".join(list(audio_generator)) 192 | return audio_bytes 193 | 194 | def speak(self, text: str): 195 | audio = self.generate_voice_audio(text) 196 | play(audio) 197 | 198 | @PersonalAssistantFramework.timeit_decorator 199 | def think(self, thought: str) -> str: 200 | return prompt(self.llm_model, thought) 201 | 202 | 203 | class OpenAISuperPAF(OpenAIPAF): 204 | def setup(self): 205 | super().setup() 206 | openai.api_key = os.getenv("OPENAI_API_KEY") 207 | self.weak_model = build_mini_model() 208 | self.download_directory = os.path.join(os.getcwd(), OPENAI_IMG_AGENT_DIR) 209 | if not os.path.exists(self.download_directory): 210 | os.makedirs(self.download_directory) 211 | 212 | def generate_image(self, generate_image_params: GenerateImageParams) -> bool: 213 | 214 | # handle defaults 215 | if generate_image_params.image_ratio is None: 216 | generate_image_params.image_ratio = ImageRatio.SQUARE 217 | if generate_image_params.quality is None: 218 | generate_image_params.quality = "hd" 219 | if generate_image_params.style is None: 220 | 
generate_image_params.style = Style.NATURAL 221 | 222 | client = openai.OpenAI() 223 | subdirectory = os.path.join(self.download_directory) 224 | if not os.path.exists(subdirectory): 225 | os.makedirs(subdirectory) 226 | 227 | for index, prompt in enumerate(generate_image_params.prompts): 228 | print(f"🖼️ Generating image {index + 1} with prompt: {prompt}") 229 | response = client.images.generate( 230 | model="dall-e-3", 231 | prompt=prompt, 232 | size=generate_image_params.image_ratio.value, 233 | quality=generate_image_params.quality, 234 | n=1, 235 | style=generate_image_params.style.value, 236 | ) 237 | image_url = response.data[0].url 238 | image_response = requests.get(image_url) 239 | image_path = os.path.join(subdirectory, f"version_{index}.png") 240 | with open(image_path, "wb") as file: 241 | file.write(image_response.content) 242 | 243 | return True 244 | 245 | def convert_image(self, convert_image_params: ConvertImageParams) -> bool: 246 | subdirectory = os.path.join(self.download_directory) 247 | if not os.path.exists(subdirectory): 248 | os.makedirs(subdirectory) 249 | 250 | for index in convert_image_params.version_numbers: 251 | input_path = os.path.join(subdirectory, f"version_{index}.png") 252 | if not os.path.exists(input_path): 253 | print(f"🟡 Warning: File {input_path} does not exist. Skipping.") 254 | continue 255 | 256 | output_path = os.path.join( 257 | subdirectory, f"version_{index}.{convert_image_params.image_format}" 258 | ) 259 | 260 | try: 261 | with Image.open(input_path) as img: 262 | img.save( 263 | output_path, 264 | format=convert_image_params.image_format.value.upper(), 265 | ) 266 | print(f"🖼️ Converted {input_path} to {output_path}") 267 | except Exception as e: 268 | print(f"Error converting {input_path}: {str(e)}") 269 | return False 270 | 271 | return True 272 | 273 | def resize_image(self, resize_image_params: ResizeImageParams) -> bool: 274 | subdirectory = os.path.join(self.download_directory) 275 | if not os.path.exists(subdirectory): 276 | os.makedirs(subdirectory) 277 | 278 | for index in resize_image_params.version_numbers: 279 | input_path = os.path.join(subdirectory, f"version_{index}.png") 280 | if not os.path.exists(input_path): 281 | print(f"🟡 Warning: File {input_path} does not exist. 
Skipping.") 282 | continue 283 | 284 | output_path = os.path.join( 285 | subdirectory, 286 | f"version_{index}_resized_w{resize_image_params.width}_h{resize_image_params.height}.png", 287 | ) 288 | 289 | try: 290 | with Image.open(input_path) as img: 291 | resized_img = img.resize( 292 | (resize_image_params.width, resize_image_params.height) 293 | ) 294 | resized_img.save(output_path) 295 | print(f"🖼️ Resized {input_path} to {output_path}") 296 | except Exception as e: 297 | print(f"Error resizing {input_path}: {str(e)}") 298 | return False 299 | 300 | return True 301 | 302 | def open_image_directory(self, open_image_dir_params: OpenImageDirParams) -> bool: 303 | try: 304 | if os.name == "nt": # For Windows 305 | os.startfile(self.download_directory) 306 | elif os.name == "posix": # For macOS and Linux 307 | subprocess.call(["open", self.download_directory]) 308 | print(f"📂 Opened image directory: {self.download_directory}") 309 | return True 310 | except Exception as e: 311 | print(f"Error opening image directory: {str(e)}") 312 | return False 313 | 314 | @PersonalAssistantFramework.timeit_decorator 315 | def think(self, thought: str) -> str: 316 | client = openai.OpenAI() 317 | completion = client.beta.chat.completions.parse( 318 | model="gpt-4o-2024-08-06", 319 | messages=[ 320 | {"role": "system", "content": "You are a helpful assistant."}, 321 | {"role": "user", "content": thought}, 322 | ], 323 | tools=[ 324 | openai.pydantic_function_tool(GenerateImageParams), 325 | openai.pydantic_function_tool(ConvertImageParams), 326 | openai.pydantic_function_tool(ResizeImageParams), 327 | openai.pydantic_function_tool(OpenImageDirParams), 328 | ], 329 | ) 330 | 331 | message = completion.choices[0].message 332 | 333 | if message.tool_calls: 334 | 335 | tool_call = message.tool_calls[0] 336 | 337 | pretty_parsed_arguments = ( 338 | tool_call.function.parsed_arguments.model_dump_json(indent=2) 339 | ) 340 | 341 | print( 342 | f"""Tool call found: '{tool_call.function.name}( 343 | {pretty_parsed_arguments} 344 | )'. 345 | Calling...""" 346 | ) 347 | 348 | success = False 349 | 350 | tool_call_success_prompt = f"Quickly let your human companion know that you've run the '{tool_call.function.name}' tool. Respond in a short, conversational manner, no fluff." 351 | 352 | tool_function_map = { 353 | "GenerateImageParams": self.generate_image, 354 | "ConvertImageParams": self.convert_image, 355 | "ResizeImageParams": self.resize_image, 356 | "OpenImageDirParams": self.open_image_directory, 357 | } 358 | 359 | if tool_call.function.name in tool_function_map: 360 | # 🚀 GUARANTEED OUTPUT STRUCTURE 🚀 361 | params = tool_call.function.parsed_arguments 362 | success = tool_function_map[tool_call.function.name](params) 363 | tool_call_success_prompt = f"Quickly let your human companion know that you've run the '{tool_call.function.name}' tool. Respond in a short, conversational manner, no fluff." 364 | else: 365 | success = False 366 | tool_call_success_prompt = ( 367 | "An unknown tool was called. Please try again." 368 | ) 369 | 370 | if success: 371 | return prompt(self.weak_model, tool_call_success_prompt) 372 | 373 | else: 374 | # just a normal thought 375 | return prompt(self.weak_model, thought) 376 | --------------------------------------------------------------------------------