├── main_realtime_api.py ├── modules ├── __init__.py ├── typings.py ├── simple_llm.py └── constants.py ├── img ├── own-your-ai.png └── reliable-ai-agents.png ├── .env.sample ├── requirements.txt ├── .gitignore ├── README.md ├── structured_outputs_example.py ├── main.py └── assistants └── assistants.py /main_realtime_api.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /modules/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /img/own-your-ai.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/disler/personal-ai-starter-pack/HEAD/img/own-your-ai.png -------------------------------------------------------------------------------- /img/reliable-ai-agents.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/disler/personal-ai-starter-pack/HEAD/img/reliable-ai-agents.png -------------------------------------------------------------------------------- /.env.sample: -------------------------------------------------------------------------------- 1 | GOOGLE_API_KEY= 2 | ASSEMBLYAI_API_KEY= 3 | ELEVEN_API_KEY= 4 | OPENAI_API_KEY= 5 | 6 | GEMINI_API_KEY= 7 | ANTHROPIC_API_KEY= 8 | AIDER_AUTO_COMMITS=false 9 | 10 | GROQ_API_KEY= 11 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | python-dotenv 2 | openai 3 | anthropic 4 | groq 5 | pytest 6 | pydantic 7 | assemblyai 8 | assemblyai[extras] 9 | sounddevice 10 | numpy 11 | elevenlabs 12 | llm 13 | llm-claude 14 | llm-claude-3 15 | llm-ollama 16 | llm-gemini 17 | Pillow 18 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .aider.ident.cache.v1 2 | .aider.tags.cache.v1 3 | .aider.chat.history.md 4 | .aider.input.history 5 | 6 | log.txt 7 | 8 | agent_results/ 9 | 10 | .vercel 11 | *.log 12 | *.pyc 13 | __pycache__ 14 | 15 | # Environments 16 | .env 17 | .venv 18 | env/ 19 | venv/ 20 | ENV/ 21 | env.bak/ 22 | venv.bak/ 23 | .env.yml 24 | .env.yaml 25 | .env.billing.yaml 26 | .env.billing.yml 27 | 28 | .aider* 29 | 30 | *.wav 31 | **.pyc 32 | modules/__pycache__ 33 | 34 | *.mp3 35 | *.mp4 36 | *.wav 37 | *.aac 38 | *.ogg 39 | *.flac 40 | *.m4a 41 | *.mp4 42 | 43 | *.json 44 | 45 | data/ -------------------------------------------------------------------------------- /modules/typings.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | from typing import Any, Dict, List, Optional, Union 3 | 4 | import warnings 5 | 6 | # Suppress specific warnings 7 | warnings.filterwarnings("ignore", message="Valid config keys have changed in V2:") 8 | warnings.filterwarnings( 9 | "ignore", message='Field "model_id" has conflict with protected namespace "model_".' 
10 | ) 11 | 12 | 13 | class Interaction(BaseModel): 14 | role: str 15 | content: str 16 | 17 | 18 | from enum import Enum 19 | 20 | 21 | class ImageRatio(str, Enum): 22 | SQUARE = "1024x1024" 23 | PORTRAIT = "1024x1792" 24 | LANDSCAPE = "1792x1024" 25 | 26 | 27 | class Style(str, Enum): 28 | VIVID = "vivid" 29 | NATURAL = "natural" 30 | 31 | 32 | class Quality(str, Enum): 33 | STANDARD = "standard" 34 | HD = "hd" 35 | 36 | 37 | class GenerateImageParams(BaseModel): 38 | prompts: List[str] 39 | quality: Quality 40 | image_ratio: Optional[ImageRatio] 41 | style: Optional[Style] 42 | 43 | 44 | class ImageFormat(str, Enum): 45 | JPEG = "jpeg" 46 | PNG = "png" 47 | GIF = "gif" 48 | BMP = "bmp" 49 | TIFF = "tiff" 50 | 51 | 52 | class ConvertImageParams(BaseModel): 53 | version_numbers: List[int] 54 | image_format: ImageFormat 55 | 56 | 57 | class ResizeImageParams(BaseModel): 58 | version_numbers: List[int] 59 | width: int 60 | height: int 61 | 62 | 63 | class OpenImageDirParams(BaseModel): 64 | pass 65 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Fast Personal AI Assistant & Structured Output 2 | > 3 | > A quick-start personal AI assistant framework using OpenAI, Groq, AssemblyAI, and ElevenLabs. 4 | > 5 | > And a breakdown of the reliability of AI agents with the new structured outputs. 6 | 7 | ![reliable-ai-agents.png](./img/reliable-ai-agents.png) 8 | ![own-your-ai](./img/own-your-ai.png) 9 | 10 | 11 | ## Setup 12 | 13 | - Create and activate virtual environment: 14 | ```bash 15 | python -m venv venv 16 | source venv/bin/activate # On Windows, use `venv\Scripts\activate` 17 | ``` 18 | 19 | - Install dependencies: 20 | ```bash 21 | pip install -r requirements.txt 22 | ``` 23 | 24 | - Set up environment variables: 25 | ```bash 26 | cp .env.sample .env 27 | # Edit .env file and add your API keys 28 | ``` 29 | I recommend starting with the OpenAI assistant, since you only need to set up the OpenAI API key. 30 | 31 | - Run the main script: 32 | ```bash 33 | python main.py 34 | ``` 35 | 36 | - Run the structured output script: 37 | ```bash 38 | python structured_outputs_example.py 39 | ``` 40 | 41 | - Press `Enter` to start recording, and `Enter` again to stop recording. 42 | 43 | - Adjust the maximum duration of the recording in `constants.py: DURATION` 44 | 45 | - Update configuration variables in `constants.py` 46 | - Tweak naming. 47 | - Update the prompt to your liking. 48 | - Update the assistant type (`ASSISTANT_TYPE`) to the one you want to use, as shown in the sketch below.
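For reference, a minimal sketch of the `modules/constants.py` values you are most likely to touch (the values shown are the repo defaults):

```python
# modules/constants.py -- the settings most people change first (repo defaults shown)
PERSONAL_AI_ASSISTANT_NAME = "Ada"  # what the assistant calls itself
HUMAN_COMPANION_NAME = "Dan"        # what the assistant calls you
DURATION = 30                       # maximum recording length in seconds
CONVO_TRAIL_CUTOFF = 30             # how many past interactions stay in the prompt
ASSISTANT_TYPE = "OpenAISuperPAF"   # or "OpenAIPAF", "GroqElevenPAF", "AssElevenPAF"
```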
49 | 50 | ## Watch the walk through video 51 | - [Coding RELIABLE AI Agents: Legit Structured Outputs Use Cases (Strawberry Agent?)](https://youtu.be/PoO7Zjsvx0k) 52 | - [CONTROL your Personal AI Assistant with GPT-4o mini & ElevenLabs](https://youtu.be/ikaKpfUOb0U) 53 | 54 | ## Resources 55 | - https://openai.com/index/introducing-structured-outputs-in-the-api/ 56 | - https://www.assemblyai.com/ 57 | - https://console.groq.com/docs/speech-text 58 | - https://console.groq.com/docs/libraries 59 | - https://platform.openai.com/docs/guides/speech-to-text 60 | - https://platform.openai.com/docs/guides/text-to-speech 61 | - https://platform.openai.com/docs/api-reference/audio#audio/createTranscription-prompt 62 | - https://openai.com/api/pricing/ 63 | -------------------------------------------------------------------------------- /modules/simple_llm.py: -------------------------------------------------------------------------------- 1 | import llm 2 | from dotenv import load_dotenv 3 | import os 4 | 5 | load_dotenv() 6 | 7 | 8 | def prompt(model: llm.Model, prompt: str): 9 | res = model.prompt(prompt) 10 | return res.text() 11 | 12 | 13 | def get_model_name(model: llm.Model): 14 | return model.model_id 15 | 16 | 17 | def build_models(): 18 | ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY") 19 | 20 | sonnet_3_5_model: llm.Model = llm.get_model("claude-3.5-sonnet") 21 | sonnet_3_5_model.key = ANTHROPIC_API_KEY 22 | 23 | return sonnet_3_5_model 24 | 25 | 26 | def build_big_3_models(): 27 | ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY") 28 | OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") 29 | GEMINI_API_KEY = os.getenv("GEMINI_API_KEY") 30 | 31 | sonnet_3_5_model: llm.Model = llm.get_model("claude-3.5-sonnet") 32 | sonnet_3_5_model.key = ANTHROPIC_API_KEY 33 | 34 | gpt4_o_model: llm.Model = llm.get_model("4o") 35 | gpt4_o_model.key = OPENAI_API_KEY 36 | 37 | gemini_1_5_pro_model: llm.Model = llm.get_model("gemini-1.5-pro-latest") 38 | gemini_1_5_pro_model.key = GEMINI_API_KEY 39 | 40 | return sonnet_3_5_model, gpt4_o_model, gemini_1_5_pro_model 41 | 42 | 43 | def build_big_3_plus_mini_models(): 44 | 45 | ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY") 46 | OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") 47 | GEMINI_API_KEY = os.getenv("GEMINI_API_KEY") 48 | 49 | sonnet_3_5_model: llm.Model = llm.get_model("claude-3.5-sonnet") 50 | sonnet_3_5_model.key = ANTHROPIC_API_KEY 51 | 52 | gpt4_o_model: llm.Model = llm.get_model("4o") 53 | gpt4_o_model.key = OPENAI_API_KEY 54 | 55 | gemini_1_5_pro_model: llm.Model = llm.get_model("gemini-1.5-pro-latest") 56 | gemini_1_5_pro_model.key = GEMINI_API_KEY 57 | 58 | gpt4_o_mini_model: llm.Model = llm.get_model("gpt-4o-mini") 59 | gpt4_o_mini_model.key = OPENAI_API_KEY 60 | 61 | return sonnet_3_5_model, gpt4_o_model, gemini_1_5_pro_model, gpt4_o_mini_model 62 | 63 | 64 | def build_mini_model(): 65 | OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") 66 | 67 | gpt4_o_mini_model: llm.Model = llm.get_model("gpt-4o-mini") 68 | gpt4_o_mini_model.key = OPENAI_API_KEY 69 | 70 | return gpt4_o_mini_model 71 | 72 | 73 | def build_new_gpt4o(): 74 | OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") 75 | 76 | gpt4_o_model: llm.Model = llm.get_model("gpt-4o-2024-08-06") 77 | gpt4_o_model.key = OPENAI_API_KEY 78 | 79 | return gpt4_o_model 80 | -------------------------------------------------------------------------------- /structured_outputs_example.py: -------------------------------------------------------------------------------- 1 | # 
https://openai.com/index/introducing-structured-outputs-in-the-api/ 2 | 3 | from enum import Enum 4 | from typing import Union 5 | 6 | from pydantic import BaseModel 7 | 8 | import openai 9 | from openai import OpenAI 10 | 11 | NEW_GPT_4o_AUG = "gpt-4o-2024-08-06" 12 | 13 | 14 | def structured_output_tool_call(): 15 | 16 | class Table(str, Enum): 17 | orders = "orders" 18 | customers = "customers" 19 | products = "products" 20 | 21 | class Column(str, Enum): 22 | id = "id" 23 | status = "status" 24 | expected_delivery_date = "expected_delivery_date" 25 | delivered_at = "delivered_at" 26 | shipped_at = "shipped_at" 27 | ordered_at = "ordered_at" 28 | canceled_at = "canceled_at" 29 | 30 | class Operator(str, Enum): 31 | eq = "=" 32 | gt = ">" 33 | lt = "<" 34 | le = "<=" 35 | ge = ">=" 36 | ne = "!=" 37 | 38 | class OrderBy(str, Enum): 39 | asc = "asc" 40 | desc = "desc" 41 | 42 | class DynamicValue(BaseModel): 43 | column_name: str 44 | 45 | class Condition(BaseModel): 46 | column: str 47 | operator: Operator 48 | value: Union[str, int, DynamicValue] 49 | 50 | class Query(BaseModel): 51 | table_name: Table 52 | columns: list[Column] 53 | conditions: list[Condition] 54 | order_by: OrderBy 55 | 56 | client = OpenAI() 57 | 58 | completion = client.beta.chat.completions.parse( 59 | model=NEW_GPT_4o_AUG, 60 | messages=[ 61 | { 62 | "role": "system", 63 | "content": "You are a helpful assistant. The current date is August 6, 2024. You help users query for the data they are looking for by calling the query function.", 64 | }, 65 | { 66 | "role": "user", 67 | "content": "Find all the orders that were cancelled in the first quarter of 2022", 68 | }, 69 | ], 70 | tools=[ 71 | openai.pydantic_function_tool(Query), 72 | ], 73 | ) 74 | 75 | def mock_query_function(query: Query): 76 | print(f"Table Name: {query.table_name}") 77 | print("Columns:") 78 | for column in query.columns: 79 | print(f" - {column}") 80 | print("Conditions:") 81 | for condition in query.conditions: 82 | print( 83 | f" - Column: {condition.column}, Operator: {condition.operator}, Value: {condition.value}" 84 | ) 85 | print(f"Order By: {query.order_by}") 86 | 87 | print( 88 | "completion.choices and completion.choices[0].message", 89 | completion.choices and completion.choices[0].message, 90 | ) 91 | 92 | # Parse the completion result and pass it to the mock function if available 93 | if completion.choices and completion.choices[0].message.tool_calls: 94 | if completion.choices[0].message.tool_calls[0].function.name == "Query": # tool name matches the Query model's class name 95 | query_result = ( 96 | completion.choices[0].message.tool_calls[0].function.parsed_arguments 97 | ) 98 | mock_query_function(query_result) 99 | else: 100 | print(f"{completion.choices and completion.choices[0].message.content}") 101 | else: 102 | print(f"{completion.choices and completion.choices[0].message.content}") 103 | 104 | 105 | def structured_output_minimal(): 106 | 107 | class Step(BaseModel): 108 | explanation: str 109 | output: str 110 | 111 | class MathResponse(BaseModel): 112 | steps: list[Step] 113 | final_answer: str 114 | 115 | client = OpenAI() 116 | 117 | completion = client.beta.chat.completions.parse( 118 | model=NEW_GPT_4o_AUG, 119 | messages=[ 120 | {"role": "system", "content": "You are a helpful math tutor."}, 121 | {"role": "user", "content": "solve 8x + 31 = 2"}, 122 | ], 123 | response_format=MathResponse, 124 | ) 125 | 126 | message = completion.choices[0].message 127 | if message.parsed: 128 | print(message.parsed.steps) 129 | print(message.parsed.final_answer) 130 | else:
131 | print(message.refusal) 132 | 133 | 134 | structured_output_minimal() 135 | structured_output_tool_call() 136 | -------------------------------------------------------------------------------- /modules/constants.py: -------------------------------------------------------------------------------- 1 | # CONSTANTS update these to fit your personal flow 2 | 3 | PERSONAL_AI_ASSISTANT_NAME = "Ada" 4 | HUMAN_COMPANION_NAME = "Dan" 5 | 6 | CONVO_TRAIL_CUTOFF = 30 7 | 8 | FS = 44100 # Sample rate 9 | CHANNELS = 1 # Mono audio 10 | DURATION = 30 # Duration of the recording in seconds 11 | 12 | ELEVEN_LABS_PRIMARY_SOLID_VOICE = "WejK3H1m7MI9CHnIjW9K" 13 | ELEVEN_LABS_CRINGE_VOICE = "uyfkySFC5J00qZ6iLAdh" 14 | 15 | OPENAI_IMG_AGENT_DIR = "data/images/openai" 16 | 17 | 18 | # --------------------------- ASSISTANT TYPES --------------------------- 19 | 20 | ASSISTANT_TYPE = "OpenAISuperPAF" 21 | 22 | # ASSISTANT_TYPE = "OpenAIPAF" 23 | 24 | # ASSISTANT_TYPE = "GroqElevenPAF" 25 | 26 | # ASSISTANT_TYPE = "AssElevenPAF" 27 | 28 | 29 | # ---------------------------- PROMPT 30 | 31 | PERSONAL_AI_ASSISTANT_PROMPT_HEAD = f"""You are a friendly, ultra helpful, attentive, concise AI assistant named '{PERSONAL_AI_ASSISTANT_NAME}'. 32 | 33 | 34 | You work with your human companion '{HUMAN_COMPANION_NAME}' to build, collaborate, and connect. 35 | We both like short, concise, conversational interactions. 36 | You're responding to '{HUMAN_COMPANION_NAME}'s latest-input. 37 | Respond in a short, conversational matter. Exclude meta-data, markdown, dashes, asterisks, etc. 38 | When building your response, consider our previous-interactions as well, but focus primarily on the latest-input. 39 | When you're asked for more details, add more details and be more verbose. 40 | Be friendly, helpful, and interested. Ask questions where appropriate. 41 | 42 | 43 | 44 | [[previous_interactions]] 45 | 46 | 47 | 48 | [[latest_input]] 49 | 50 | 51 | Your Conversational Response:""" 52 | 53 | 54 | OPENAI_SUPER_ASSISTANT_PROMPT_HEAD = f"""You are a friendly, ultra helpful, attentive, concise AI assistant named '{PERSONAL_AI_ASSISTANT_NAME}'. 55 | 56 | 57 | You work with your human companion '{HUMAN_COMPANION_NAME}' to build, collaborate, and connect. 58 | We both like short, concise, conversational interactions. 59 | You're responding to '{HUMAN_COMPANION_NAME}'s latest-input. 60 | Respond in a short, conversational matter. Exclude meta-data, markdown, dashes, asterisks, etc. 61 | When building your response, consider our previous-interactions as well, but focus primarily on the latest-input. 62 | When you're asked for more details, add more details and be more verbose. 63 | Be friendly, helpful, and interested. Ask questions where appropriate. 64 | You can use various tools to run functionality for your human companion. 65 | 66 | 67 | 68 | 69 | generate_image 70 | If the human companion requests an image, use this tool. 71 | 72 | 73 | Unless otherwise specified, default quality to 'hd'. 74 | 75 | 76 | If a user asks for a certain number of images, append additional prompts parameter with that number of prompts. 77 | 78 | 79 | Be sure to create as many images as the user requested by adding them to the prompts parameter. 80 | 81 | 82 | 83 | 84 | convert_image 85 | If the human companion requests an image format conversion, use this tool. 86 | 87 | 88 | Ensure the image_format parameter is set to the desired format (e.g., 'jpg', 'png'). 89 | 90 | 91 | Use the version_numbers parameter to specify which image versions to convert. 
92 | 93 | 94 | 95 | 96 | resize_image 97 | If the human companion requests an image resize, use this tool. 98 | 99 | 100 | Specify the desired width and height in pixels. 101 | 102 | 103 | Use the version_numbers parameter to specify which image versions to resize. 104 | 105 | 106 | 107 | 108 | open_image_directory 109 | If the human companion requests to open the image directory, use this tool. 110 | 111 | 112 | This tool doesn't require any parameters. 113 | 114 | 115 | 116 | 117 | 118 | 119 | [[previous_interactions]] 120 | 121 | 122 | 123 | [[latest_input]] 124 | 125 | 126 | Your Conversational Response:""" 127 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import time 2 | from typing import List 3 | from modules.typings import Interaction 4 | import sounddevice as sd 5 | import wave 6 | import os 7 | from datetime import datetime 8 | from assistants.assistants import OpenAISuperPAF 9 | import threading 10 | from dotenv import load_dotenv 11 | from modules.constants import ( 12 | OPENAI_SUPER_ASSISTANT_PROMPT_HEAD, 13 | PERSONAL_AI_ASSISTANT_PROMPT_HEAD, 14 | FS, 15 | CHANNELS, 16 | DURATION, 17 | CONVO_TRAIL_CUTOFF, 18 | ASSISTANT_TYPE, 19 | ) 20 | 21 | from modules.typings import Interaction 22 | from assistants.assistants import OpenAISuperPAF, OpenAIPAF, AssElevenPAF, GroqElevenPAF 23 | 24 | load_dotenv() 25 | 26 | 27 | def record_audio(duration=DURATION, fs=FS, channels=CHANNELS): 28 | """ 29 | Simple function to record audio from the microphone. 30 | Gives you DURATION seconds of audio to speak into the microphone. 31 | After DURATION seconds, the recording will stop. 32 | Hit enter to stop the recording at any time. 33 | """ 34 | 35 | print("🔴 Recording...") 36 | recording = sd.rec( 37 | int(duration * fs), samplerate=fs, channels=channels, dtype="int16" 38 | ) 39 | 40 | def duration_warning(): 41 | time.sleep(duration) 42 | if not stop_event.is_set(): 43 | print( 44 | "⚠️ Record limit hit - your assistant won't hear what you're saying now. Increase the duration." 45 | ) 46 | 47 | stop_event = threading.Event() 48 | warning_thread = threading.Thread(target=duration_warning) 49 | warning_thread.daemon = ( 50 | True # Set the thread as daemon so it doesn't block program exit 51 | ) 52 | warning_thread.start() 53 | 54 | input("🟡 Press Enter to stop recording...") 55 | stop_event.set() 56 | sd.stop() 57 | 58 | print(f"🍞 Recording Chunk Complete") 59 | return recording 60 | 61 | 62 | def ensure_data_directory_exists(): 63 | if not os.path.exists("data"): 64 | os.makedirs("data") 65 | 66 | 67 | def create_audio_file(recording): 68 | ensure_data_directory_exists() 69 | """ 70 | Creates an audio file from the recording. 
71 | """ 72 | 73 | timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") 74 | filename = os.path.join("data", f"audio_{timestamp}.wav") 75 | 76 | with wave.open(filename, "wb") as wf: 77 | wf.setnchannels(CHANNELS) 78 | wf.setsampwidth(2) 79 | wf.setframerate(FS) 80 | wf.writeframes(recording) 81 | 82 | file_size = os.path.getsize(filename) 83 | 84 | print(f"📁 File {filename} has been saved with a size of {file_size} bytes.") 85 | 86 | return filename 87 | 88 | 89 | def build_prompt(latest_input: str, previous_interactions: List[Interaction]) -> str: 90 | 91 | base_prompt = PERSONAL_AI_ASSISTANT_PROMPT_HEAD 92 | 93 | if ASSISTANT_TYPE == "OpenAISuperPAF": 94 | print(f"🚀 Using OpenAI Super Personal AI Assistant Prompt...") 95 | base_prompt = OPENAI_SUPER_ASSISTANT_PROMPT_HEAD 96 | 97 | previous_interactions_str = "\n".join( 98 | [ 99 | f""" 100 | {interaction.role} 101 | {interaction.content} 102 | """ 103 | for interaction in previous_interactions 104 | ] 105 | ) 106 | prepared_prompt = base_prompt.replace( 107 | "[[previous_interactions]]", previous_interactions_str 108 | ) 109 | 110 | prepared_prompt = prepared_prompt.replace("[[latest_input]]", latest_input) 111 | 112 | return prepared_prompt 113 | 114 | 115 | def main(): 116 | """ 117 | In a loop, we: 118 | 119 | 1. Press enter to start recording 120 | 2. Record audio from the microphone for N seconds 121 | 3. When we press enter again, we create an audio file from the recording 122 | 4. Transcribe the audio file 123 | 5. Our AI assistant thinks (prompt) of a response to the transcription 124 | 6. Our AI assistant speaks the response 125 | 7. Delete the audio file 126 | 8. Update previous interactions 127 | """ 128 | 129 | previous_interactions: List[Interaction] = [] 130 | 131 | if ASSISTANT_TYPE == "OpenAISuperPAF": 132 | assistant = OpenAISuperPAF() 133 | print("🚀 Initialized OpenAI Super Personal AI Assistant...") 134 | elif ASSISTANT_TYPE == "OpenAIPAF": 135 | assistant = OpenAIPAF() 136 | print("🚀 Initialized OpenAI Personal AI Assistant...") 137 | elif ASSISTANT_TYPE == "AssElevenPAF": 138 | assistant = AssElevenPAF() 139 | print("🚀 Initialized AssemblyAI-ElevenLabs Personal AI Assistant...") 140 | elif ASSISTANT_TYPE == "GroqElevenPAF": 141 | assistant = GroqElevenPAF() 142 | print("🚀 Initialized Groq-ElevenLabs Personal AI Assistant...") 143 | else: 144 | raise ValueError(f"Invalid assistant type: {ASSISTANT_TYPE}") 145 | 146 | assistant.setup() 147 | 148 | while True: 149 | try: 150 | input("🎧 Press Enter to start recording...") 151 | recording = record_audio(duration=DURATION, fs=FS, channels=CHANNELS) 152 | 153 | filename = create_audio_file(recording) 154 | transcription = assistant.transcribe(filename) 155 | 156 | print(f"📝 Your Input Transcription: '{transcription}'") 157 | 158 | prompt = build_prompt(transcription, previous_interactions) 159 | response = assistant.think(prompt) 160 | 161 | print(f"🤖 Your Personal AI Assistant Response: '{response}'") 162 | 163 | assistant.speak(response) 164 | 165 | os.remove(filename) 166 | 167 | # Update previous interactions 168 | previous_interactions.append( 169 | Interaction(role="human", content=transcription) 170 | ) 171 | previous_interactions.append( 172 | Interaction(role="assistant", content=response) 173 | ) 174 | 175 | # Keep only the last CONVO_TRAIL_CUTOFF interactions 176 | if len(previous_interactions) > CONVO_TRAIL_CUTOFF: 177 | previous_interactions = previous_interactions[-CONVO_TRAIL_CUTOFF:] 178 | 179 | print("\nReady for next interaction. 
Press Ctrl+C to exit.") 180 | except KeyboardInterrupt: 181 | print("\nExiting the program.") 182 | break 183 | 184 | 185 | if __name__ == "__main__": 186 | main() 187 | -------------------------------------------------------------------------------- /assistants/assistants.py: -------------------------------------------------------------------------------- 1 | import abc 2 | import time 3 | import functools 4 | import uuid 5 | import requests 6 | import sounddevice as sd 7 | import wave 8 | import os 9 | import json 10 | from datetime import datetime 11 | import assemblyai as aai 12 | from elevenlabs import play 13 | from elevenlabs.client import ElevenLabs 14 | from PIL import Image 15 | import subprocess 16 | from modules.constants import ( 17 | OPENAI_IMG_AGENT_DIR, 18 | ELEVEN_LABS_CRINGE_VOICE, 19 | ELEVEN_LABS_PRIMARY_SOLID_VOICE, 20 | ) 21 | from modules.simple_llm import build_mini_model, build_new_gpt4o, prompt 22 | from dotenv import load_dotenv 23 | import openai 24 | from groq import Groq 25 | 26 | from modules.typings import ( 27 | ConvertImageParams, 28 | GenerateImageParams, 29 | ImageRatio, 30 | Style, 31 | ResizeImageParams, 32 | OpenImageDirParams, 33 | ) 34 | 35 | 36 | class PersonalAssistantFramework(abc.ABC): 37 | @staticmethod 38 | def timeit_decorator(func): 39 | @functools.wraps(func) 40 | def wrapper(*args, **kwargs): 41 | start_time = time.time() 42 | result = func(*args, **kwargs) 43 | end_time = time.time() 44 | duration = round(end_time - start_time, 2) 45 | print( 46 | f"⏰ {args[0].__class__.__name__} - {func.__name__}() took {duration:.2f} seconds" 47 | ) 48 | 49 | json_file = f"{args[0].__class__.__name__}_time_table.json" 50 | 51 | # Read existing data or create an empty list 52 | if os.path.exists(json_file): 53 | with open(json_file, "r") as file: 54 | try: 55 | data = json.load(file) 56 | except json.JSONDecodeError: 57 | data = [] 58 | else: 59 | data = [] 60 | 61 | # Create new time record 62 | time_record = { 63 | "assistant": args[0].__class__.__name__, 64 | "function": func.__name__, 65 | "duration": f"{duration:.2f}", 66 | "position": 0, # New entry always at the top 67 | } 68 | 69 | # Update positions of existing records 70 | for record in data: 71 | record["position"] += 1 72 | 73 | # Insert new record at the beginning 74 | data.insert(0, time_record) 75 | 76 | # Sort data by position 77 | data.sort(key=lambda x: x["position"]) 78 | 79 | # Write updated data back to file 80 | with open(json_file, "w") as file: 81 | json.dump(data, file, indent=2) 82 | 83 | return result 84 | 85 | return wrapper 86 | 87 | @abc.abstractmethod 88 | def setup(self): 89 | pass 90 | 91 | @abc.abstractmethod 92 | def transcribe(self, file_path): 93 | pass 94 | 95 | @abc.abstractmethod 96 | def speak(self, text: str): 97 | pass 98 | 99 | @abc.abstractmethod 100 | def think(self, prompt: str) -> str: 101 | pass 102 | 103 | 104 | class AssElevenPAF(PersonalAssistantFramework): 105 | def setup(self): 106 | aai.settings.api_key = os.getenv("ASSEMBLYAI_API_KEY") 107 | self.elevenlabs_client = ElevenLabs(api_key=os.getenv("ELEVEN_API_KEY")) 108 | self.llm_model = build_mini_model() 109 | 110 | @PersonalAssistantFramework.timeit_decorator 111 | def generate_voice_audio(self, text: str): 112 | audio_generator = self.elevenlabs_client.generate( 113 | text=text, 114 | voice=ELEVEN_LABS_PRIMARY_SOLID_VOICE, 115 | model="eleven_turbo_v2", 116 | stream=False, 117 | ) 118 | audio_bytes = b"".join(list(audio_generator)) 119 | return audio_bytes 120 | 121 | 
@PersonalAssistantFramework.timeit_decorator 122 | def transcribe(self, file_path): 123 | transcriber = aai.Transcriber() 124 | transcript = transcriber.transcribe(file_path) 125 | return transcript.text 126 | 127 | def speak(self, text: str): 128 | audio = self.generate_voice_audio(text) 129 | play(audio) 130 | 131 | @PersonalAssistantFramework.timeit_decorator 132 | def think(self, thought: str) -> str: 133 | return prompt(self.llm_model, thought) 134 | 135 | 136 | class OpenAIPAF(PersonalAssistantFramework): 137 | def setup(self): 138 | openai.api_key = os.getenv("OPENAI_API_KEY") 139 | self.llm_model = build_mini_model() 140 | 141 | @PersonalAssistantFramework.timeit_decorator 142 | def transcribe(self, file_path): 143 | with open(file_path, "rb") as audio_file: 144 | transcript = openai.audio.transcriptions.create( 145 | model="whisper-1", # this points to whisper v2. See Docs (https://platform.openai.com/docs/api-reference/audio/createTranscription) 146 | file=audio_file, 147 | ) 148 | return transcript.text 149 | 150 | @PersonalAssistantFramework.timeit_decorator 151 | def generate_voice_audio(self, text: str): 152 | response = openai.audio.speech.create( 153 | model="tts-1-hd", voice="shimmer", input=text, response_format="aac" 154 | ) 155 | audio_bytes = b"".join(list(response.iter_bytes())) 156 | return audio_bytes 157 | 158 | def speak(self, text: str): 159 | audio = self.generate_voice_audio(text) 160 | play(audio) 161 | 162 | @PersonalAssistantFramework.timeit_decorator 163 | def think(self, thought: str) -> str: 164 | return prompt(self.llm_model, thought) 165 | 166 | 167 | class GroqElevenPAF(PersonalAssistantFramework): 168 | def setup(self): 169 | self.groq_client = Groq() 170 | self.elevenlabs_client = ElevenLabs(api_key=os.getenv("ELEVEN_API_KEY")) 171 | self.llm_model = build_mini_model() 172 | 173 | @PersonalAssistantFramework.timeit_decorator 174 | def transcribe(self, file_path): 175 | with open(file_path, "rb") as file: 176 | transcription = self.groq_client.audio.transcriptions.create( 177 | file=(file_path, file.read()), 178 | model="distil-whisper-large-v3-en", 179 | response_format="text", 180 | ) 181 | return str(transcription) 182 | 183 | @PersonalAssistantFramework.timeit_decorator 184 | def generate_voice_audio(self, text: str): 185 | audio_generator = self.elevenlabs_client.generate( 186 | text=text, 187 | voice=ELEVEN_LABS_PRIMARY_SOLID_VOICE, 188 | model="eleven_turbo_v2_5", 189 | stream=False, 190 | ) 191 | audio_bytes = b"".join(list(audio_generator)) 192 | return audio_bytes 193 | 194 | def speak(self, text: str): 195 | audio = self.generate_voice_audio(text) 196 | play(audio) 197 | 198 | @PersonalAssistantFramework.timeit_decorator 199 | def think(self, thought: str) -> str: 200 | return prompt(self.llm_model, thought) 201 | 202 | 203 | class OpenAISuperPAF(OpenAIPAF): 204 | def setup(self): 205 | super().setup() 206 | openai.api_key = os.getenv("OPENAI_API_KEY") 207 | self.weak_model = build_mini_model() 208 | self.download_directory = os.path.join(os.getcwd(), OPENAI_IMG_AGENT_DIR) 209 | if not os.path.exists(self.download_directory): 210 | os.makedirs(self.download_directory) 211 | 212 | def generate_image(self, generate_image_params: GenerateImageParams) -> bool: 213 | 214 | # handle defaults 215 | if generate_image_params.image_ratio is None: 216 | generate_image_params.image_ratio = ImageRatio.SQUARE 217 | if generate_image_params.quality is None: 218 | generate_image_params.quality = "hd" 219 | if generate_image_params.style is None: 220 | 
generate_image_params.style = Style.NATURAL 221 | 222 | client = openai.OpenAI() 223 | subdirectory = os.path.join(self.download_directory) 224 | if not os.path.exists(subdirectory): 225 | os.makedirs(subdirectory) 226 | 227 | for index, prompt in enumerate(generate_image_params.prompts): 228 | print(f"🖼️ Generating image {index + 1} with prompt: {prompt}") 229 | response = client.images.generate( 230 | model="dall-e-3", 231 | prompt=prompt, 232 | size=generate_image_params.image_ratio.value, 233 | quality=generate_image_params.quality, 234 | n=1, 235 | style=generate_image_params.style.value, 236 | ) 237 | image_url = response.data[0].url 238 | image_response = requests.get(image_url) 239 | image_path = os.path.join(subdirectory, f"version_{index}.png") 240 | with open(image_path, "wb") as file: 241 | file.write(image_response.content) 242 | 243 | return True 244 | 245 | def convert_image(self, convert_image_params: ConvertImageParams) -> bool: 246 | subdirectory = os.path.join(self.download_directory) 247 | if not os.path.exists(subdirectory): 248 | os.makedirs(subdirectory) 249 | 250 | for index in convert_image_params.version_numbers: 251 | input_path = os.path.join(subdirectory, f"version_{index}.png") 252 | if not os.path.exists(input_path): 253 | print(f"🟡 Warning: File {input_path} does not exist. Skipping.") 254 | continue 255 | 256 | output_path = os.path.join( 257 | subdirectory, f"version_{index}.{convert_image_params.image_format}" 258 | ) 259 | 260 | try: 261 | with Image.open(input_path) as img: 262 | img.save( 263 | output_path, 264 | format=convert_image_params.image_format.value.upper(), 265 | ) 266 | print(f"🖼️ Converted {input_path} to {output_path}") 267 | except Exception as e: 268 | print(f"Error converting {input_path}: {str(e)}") 269 | return False 270 | 271 | return True 272 | 273 | def resize_image(self, resize_image_params: ResizeImageParams) -> bool: 274 | subdirectory = os.path.join(self.download_directory) 275 | if not os.path.exists(subdirectory): 276 | os.makedirs(subdirectory) 277 | 278 | for index in resize_image_params.version_numbers: 279 | input_path = os.path.join(subdirectory, f"version_{index}.png") 280 | if not os.path.exists(input_path): 281 | print(f"🟡 Warning: File {input_path} does not exist. 
Skipping.") 282 | continue 283 | 284 | output_path = os.path.join( 285 | subdirectory, 286 | f"version_{index}_resized_w{resize_image_params.width}_h{resize_image_params.height}.png", 287 | ) 288 | 289 | try: 290 | with Image.open(input_path) as img: 291 | resized_img = img.resize( 292 | (resize_image_params.width, resize_image_params.height) 293 | ) 294 | resized_img.save(output_path) 295 | print(f"🖼️ Resized {input_path} to {output_path}") 296 | except Exception as e: 297 | print(f"Error resizing {input_path}: {str(e)}") 298 | return False 299 | 300 | return True 301 | 302 | def open_image_directory(self, open_image_dir_params: OpenImageDirParams) -> bool: 303 | try: 304 | if os.name == "nt": # For Windows 305 | os.startfile(self.download_directory) 306 | elif os.name == "posix": # For macOS and Linux 307 | subprocess.call(["open", self.download_directory]) 308 | print(f"📂 Opened image directory: {self.download_directory}") 309 | return True 310 | except Exception as e: 311 | print(f"Error opening image directory: {str(e)}") 312 | return False 313 | 314 | @PersonalAssistantFramework.timeit_decorator 315 | def think(self, thought: str) -> str: 316 | client = openai.OpenAI() 317 | completion = client.beta.chat.completions.parse( 318 | model="gpt-4o-2024-08-06", 319 | messages=[ 320 | {"role": "system", "content": "You are a helpful assistant."}, 321 | {"role": "user", "content": thought}, 322 | ], 323 | tools=[ 324 | openai.pydantic_function_tool(GenerateImageParams), 325 | openai.pydantic_function_tool(ConvertImageParams), 326 | openai.pydantic_function_tool(ResizeImageParams), 327 | openai.pydantic_function_tool(OpenImageDirParams), 328 | ], 329 | ) 330 | 331 | message = completion.choices[0].message 332 | 333 | if message.tool_calls: 334 | 335 | tool_call = message.tool_calls[0] 336 | 337 | pretty_parsed_arguments = ( 338 | tool_call.function.parsed_arguments.model_dump_json(indent=2) 339 | ) 340 | 341 | print( 342 | f"""Tool call found: '{tool_call.function.name}( 343 | {pretty_parsed_arguments} 344 | )'. 345 | Calling...""" 346 | ) 347 | 348 | success = False 349 | 350 | tool_call_success_prompt = f"Quickly let your human companion know that you've run the '{tool_call.function.name}' tool. Respond in a short, conversational manner, no fluff." 351 | 352 | tool_function_map = { 353 | "GenerateImageParams": self.generate_image, 354 | "ConvertImageParams": self.convert_image, 355 | "ResizeImageParams": self.resize_image, 356 | "OpenImageDirParams": self.open_image_directory, 357 | } 358 | 359 | if tool_call.function.name in tool_function_map: 360 | # 🚀 GUARANTEED OUTPUT STRUCTURE 🚀 361 | params = tool_call.function.parsed_arguments 362 | success = tool_function_map[tool_call.function.name](params) 363 | tool_call_success_prompt = f"Quickly let your human companion know that you've run the '{tool_call.function.name}' tool. Respond in a short, conversational manner, no fluff." 364 | else: 365 | success = False 366 | tool_call_success_prompt = ( 367 | "An unknown tool was called. Please try again." 368 | ) 369 | 370 | if success: 371 | return prompt(self.weak_model, tool_call_success_prompt) 372 | 373 | else: 374 | # just a normal thought 375 | return prompt(self.weak_model, thought) 376 | --------------------------------------------------------------------------------