├── .env.example ├── .gitignore ├── LICENSE ├── README.md ├── assets └── thumbnail.jpg ├── config.yaml ├── conversify ├── core │ ├── __init__.py │ ├── agent.py │ ├── callbacks.py │ ├── memory.py │ └── vision.py ├── data │ └── warmup_audio.wav ├── main.py ├── models │ ├── __init__.py │ ├── llm.py │ ├── stt.py │ ├── tts.py │ └── utils.py ├── prompts │ └── llm.txt └── utils │ ├── __init__.py │ ├── config.py │ └── logger.py ├── requirements.txt └── scripts ├── run_app.sh ├── run_kokoro.sh └── run_llm.sh /.env.example: -------------------------------------------------------------------------------- 1 | LIVEKIT_URL=********* 2 | LIVEKIT_API_KEY=********* 3 | LIVEKIT_API_SECRET=********* -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | logs/* 2 | __pycache__/ 3 | */__pycache__/ 4 | **/__pycache__ 5 | ***/__pycache__ 6 | .env.local 7 | conversify/data/memory_store/* 8 | conversify/data/models_cache/* 9 | KMS/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Conversify 🗣️ ✨ 2 | 3 | [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](LICENSE) 4 | 5 | Conversify is a real‑time, low‑latency, voice- and vision-enabled AI assistant built on LiveKit. This project demonstrates highly responsive conversational AI workflows, leveraging locally hosted models. 
6 | 
7 | ## Demo Video
8 | 
9 | [![Watch the demo](assets/thumbnail.jpg)](https://youtu.be/Biva5VGV5Pg)
10 | 
11 | 
12 | ## ✨ Key Features
13 | 
14 | - ⚡ **Low Latency**: End-to-end response time under 600 ms.
15 | - 🗣️ **Real‑time Voice**: Natural conversation using local STT and TTS services.
16 | - 🧠 **Local LLM Integration**: Compatible with any OpenAI‑style API (e.g., SGLang, vLLM, Ollama).
17 | - 👀 **Basic Vision**: Processes video frames with multimodal LLM prompts.
18 | - 💾 **Conversational Memory**: Persists context across user sessions.
19 | - 🔧 **Configurable**: All settings managed via `config.yaml`.
20 | 
21 | ---
22 | 
23 | ## ⚙️ Prerequisites
24 | 
25 | - **OS**: Linux, or Windows via WSL (tested)
26 | - **Python**: 3.11+
27 | - **Services**:
28 |   - A LiveKit Cloud project (sign up at https://cloud.livekit.io)
29 |   - An LLM inference server with an OpenAI-compatible API (e.g., SGLang, vLLM, Ollama)
30 |   - Kokoro FastAPI TTS server (https://github.com/remsky/Kokoro-FastAPI)
31 | 
32 | ---
33 | 
34 | ## 🛠️ Installation
35 | 
36 | 1. **Clone the repository**
37 | 
38 |    ```bash
39 |    git clone https://github.com/taresh18/conversify.git
40 |    cd conversify
41 |    ```
42 | 
43 | 2. **Create a virtual environment** (recommended)
44 | 
45 |    ```bash
46 |    python -m venv venv
47 |    source venv/bin/activate  # Linux/macOS
48 |    # venv\Scripts\activate   # Windows
49 |    ```
50 | 
51 | 3. **Install dependencies**
52 | 
53 |    ```bash
54 |    pip install -r requirements.txt
55 |    ```
56 | 
57 | 4. **Configure environment variables**
58 | 
59 |    ```bash
60 |    cp .env.example .env.local
61 |    nano .env.local  # Add your LiveKit URL, API key and API secret
62 |    ```
63 | 
64 | 5. **Update `config.yaml`**
65 | 
66 |    - Set the LLM API endpoint and model names
67 |    - Configure STT/TTS server URLs and parameters
68 |    - Adjust vision and memory settings as needed
69 | 
70 | ---
71 | 
72 | ## 🏃 Running the Application
73 | 
74 | Ensure all external services are running before starting Conversify.
75 | 
76 | 1. **Start the LLM server** (using the provided script)
77 | 
78 |    ```bash
79 |    chmod +x ./scripts/run_llm.sh
80 |    ./scripts/run_llm.sh &
81 |    ```
82 | 
83 | 2. **Start the Kokoro TTS server**
84 | 
85 |    ```bash
86 |    chmod +x ./scripts/run_kokoro.sh
87 |    ./scripts/run_kokoro.sh &
88 |    ```
89 | 
90 | 3. **Launch Conversify**
91 | 
92 |    ```bash
93 |    chmod +x ./scripts/run_app.sh
94 |    ./scripts/run_app.sh
95 |    ```
96 | 
97 | 4. **Interact via the LiveKit Agents Playground**
98 | 
99 |    - Navigate to https://agents-playground.livekit.io
100 |    - Select your LiveKit project and room
101 |    - Join the room and start the conversation
102 | 
103 | ---
104 | 
105 | ## ⚙️ Configuration
106 | 
107 | All runtime settings live in `config.yaml` at the repository root. Key options include:
108 | 
109 | - **STT**: model selection and parameters
110 | - **LLM**: endpoint URLs and model names
111 | - **TTS**: voice options and server settings
112 | - **Vision**: enable/disable frame analysis and the frame capture interval
113 | - **Memory**: persistence and retrieval parameters
114 | - **Logging**: level and file path (`logs/app.log`)
115 | 
116 | Secrets and credentials reside in `.env.local`, following the template in `.env.example`.
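For example, to point Conversify at your own locally hosted LLM and Kokoro TTS servers you only need to edit the corresponding blocks. The excerpt below is taken from the repository's default `config.yaml`; adjust the hosts, ports and model names to match the servers you run:

```yaml
llm:
  base_url: "http://127.0.0.1:30000/v1"      # any OpenAI-compatible endpoint (SGLang, vLLM, Ollama, ...)
  api_key: "NULL"                            # placeholder; most local servers ignore the key
  model: "Qwen/Qwen2.5-VL-7B-Instruct-AWQ"
  temperature: 0.4

tts:
  kokoro:
    base_url: "http://0.0.0.0:8880/v1"       # Kokoro FastAPI server
    voice: "af_heart"
    speed: 1.0
```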
117 | 
118 | ---
119 | 
120 | ## 🏗️ Project Structure
121 | 
122 | ```plaintext
123 | conversify/                    # repository root
124 | ├── assets/
125 | │   └── thumbnail.jpg          # Demo video thumbnail
126 | ├── config.yaml                # All application settings
127 | ├── conversify/
128 | │   ├── core/                  # Agent, callbacks, memory and vision logic
129 | │   ├── data/                  # Warm-up audio, model cache and memory store
130 | │   ├── models/                # STT, TTS and LLM clients
131 | │   ├── prompts/
132 | │   │   └── llm.txt            # System prompt for the LLM
133 | │   ├── utils/                 # Config loader and logger
134 | │   └── main.py                # Application entrypoint
135 | ├── scripts/
136 | │   ├── run_llm.sh
137 | │   ├── run_kokoro.sh
138 | │   └── run_app.sh
139 | ├── .env.example               # Template for environment variables
140 | ├── .env.local                 # Local secrets (git-ignored)
141 | ├── requirements.txt
142 | ├── .gitignore
143 | ├── LICENSE
144 | └── README.md
145 | ```
146 | 
147 | ---
148 | 
149 | ## 📚 References
150 | 
151 | - LiveKit Agents: https://github.com/livekit/agents
152 | - Faster Whisper: https://github.com/SYSTRAN/faster-whisper
153 | - Kokoro FastAPI: https://github.com/remsky/Kokoro-FastAPI
154 | - Memoripy: https://github.com/caspianmoon/memoripy
155 | 
156 | ---
157 | 
158 | ## 📜 License
159 | 
160 | This project is released under the Apache License 2.0. See the [LICENSE](LICENSE) file for details.
161 | 
162 | 
--------------------------------------------------------------------------------
/assets/thumbnail.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/taresh18/conversify-speech/153c7fed59f933eb0c3ea01be13c2c947c0bf33a/assets/thumbnail.jpg
--------------------------------------------------------------------------------
/config.yaml:
--------------------------------------------------------------------------------
1 | agent:
2 |   env_file: ".env.local"
3 |   instructions_file: conversify/prompts/llm.txt
4 |   greeting: "Hey! How are you doing today?"
5 |   goodbye: "Goodbye! Have a great day!"
6 |   default_participant_identity: "identity-qfXx"
7 |   use_eou: false  # enable LiveKit's end-of-utterance turn-detection model
8 |   use_background_noise_removal: true  # uses Krisp BVC noise cancellation
9 |   use_background_audio: false  # plays office background audio and keyboard typing sound while the agent speaks
10 |   allow_interruptions: true  # stop TTS playback when the user interrupts
11 | 
12 | stt:
13 |   whisper:
14 |     language: "en"
15 |     model: "deepdml/faster-whisper-large-v3-turbo-ct2"
16 |     device: "cuda"
17 |     compute_type: "float16"
18 |     model_cache_directory: "conversify/data/models_cache"
19 |     warmup_audio: "conversify/data/warmup_audio.wav"
20 | 
21 | llm:
22 |   base_url: "http://127.0.0.1:30000/v1"
23 |   api_key: "NULL"
24 |   model: "Qwen/Qwen2.5-VL-7B-Instruct-AWQ"
25 |   temperature: 0.4
26 |   parallel_tool_calls: false
27 |   tool_choice: "auto"
28 | 
29 | tts:
30 |   kokoro:
31 |     base_url: "http://0.0.0.0:8880/v1"
32 |     api_key: "NULL"
33 |     model: "tts-1"
34 |     voice: "af_heart"
35 |     speed: 1.0
36 | 
37 | vad:
38 |   min_speech_duration: 0.20  # Minimum duration (seconds) for speech detection
39 |   min_silence_duration: 0.40  # Minimum silence duration (seconds) to detect end of speech
40 |   prefix_padding_duration: 0.5  # Padding duration (seconds) before detected speech
41 |   max_buffered_speech: 60.0  # Maximum duration (seconds) of buffered speech
42 |   activation_threshold: 0.5  # Threshold for voice activation detection
43 |   force_cpu: false  # Force VAD to run on CPU instead of GPU
44 |   sample_rate: 16000
45 | 
46 | vision:
47 |   use: true
48 |   video_frame_interval: 0.2
49 | 
50 | memory:
51 |   use: false
52 |   dir: "conversify/data/memory_store"
53 |   load_last_n: 6
54 | 
55 | embedding:
56 |   vllm_model_name: "mixedbread-ai/mxbai-embed-large-v1"
57 | 
58 | worker:
59 |   job_memory_warn_mb: 1900
60 |   load_threshold: 1.0
61 |   job_memory_limit_mb: 10000
62 | 
63 | logging:
64 |   level: "DEBUG"
65 |   file: "logs/app.log"
--------------------------------------------------------------------------------
/conversify/core/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/conversify/core/agent.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import re
3 | import asyncio
4 | from typing import AsyncIterable, Dict, Any
5 | 
6 | from livekit.agents import (
7 |     Agent,
8 |     llm,
9 |     FunctionTool,
10 |     ChatContext
11 | )
12 | from livekit.agents.voice import ModelSettings
13 | from livekit import rtc
14 | from livekit.agents.llm.chat_context import ImageContent
15 | 
16 | from .memory import AgentMemoryManager
17 | 
18 | logger = logging.getLogger(__name__)
19 | 
20 | class ConversifyAgent(Agent):
21 |     """
22 |     Agent class that handles interaction logic, including optional
23 |     memory management and image processing.
24 |     Depends on shared_state for inter-task communication (e.g., latest_image)
25 |     and an AgentMemoryManager instance for persistence.
26 | """ 27 | def __init__(self, 28 | participant_identity: str, 29 | shared_state: Dict[str, Any], 30 | config: Dict[str, Any]) -> None: 31 | 32 | agent_config = config['agent'] 33 | memory_config = config['memory'] 34 | 35 | # Get instructions from the config for the Agent constructor 36 | super().__init__( 37 | instructions=agent_config['instructions'], 38 | allow_interruptions=agent_config['allow_interruptions'] 39 | ) 40 | 41 | # Use default participant identity if it's provided and not empty 42 | self.participant_identity = agent_config['default_participant_identity'] or participant_identity 43 | 44 | self.config = config 45 | self.shared_state = shared_state 46 | self.vision_keywords = ['see', 'look', 'picture', 'image', 'visual', 'color', 'this', 'object', 'view', 'frame', 'screen', 'desk', 'holding'] 47 | 48 | # Initialize memory handler using config if enabled 49 | self.memory_handler = None 50 | if memory_config['use']: 51 | self.memory_handler = AgentMemoryManager( 52 | participant_identity=self.participant_identity, 53 | config=config 54 | ) 55 | 56 | logger.info(f"ConversifyAgent initialized for identity: {self.participant_identity}. Memory: {'Enabled' if self.memory_handler else 'Disabled'}") 57 | 58 | async def on_enter(self): 59 | """Called when the agent joins. Loads memory (if enabled) and greets.""" 60 | logger.info(f"Agent '{self.participant_identity}' entering session.") 61 | if self.memory_handler: 62 | logger.info("Loading agent memory...") 63 | await self.memory_handler.load_memory(self.update_chat_ctx) 64 | logger.info("Agent memory loaded.") 65 | 66 | await self.session.say(self.config['agent']['greeting']) 67 | 68 | async def on_exit(self): 69 | """Called when the agent leaves. Says goodbye.""" 70 | logger.info(f"Agent '{self.participant_identity}' exiting session.") 71 | await self.session.say(self.config['agent']['goodbye']) 72 | 73 | def process_image(self, chat_ctx: llm.ChatContext): 74 | """Checks for vision keywords and adds latest image from shared_state if applicable.""" 75 | # Check if latest_image exists in shared_state 76 | if 'latest_image' not in self.shared_state: 77 | logger.warning("No 'latest_image' key found in shared_state") 78 | return 79 | 80 | latest_image = self.shared_state['latest_image'] 81 | if not latest_image: 82 | logger.debug("Latest image is None or empty") 83 | return 84 | 85 | if not chat_ctx.items: 86 | return 87 | 88 | last_message = chat_ctx.items[-1] 89 | 90 | if last_message.role != "user" or not last_message.content or not isinstance(last_message.content[0], str): 91 | return 92 | 93 | user_text = last_message.content[0] 94 | 95 | should_add_image = any(keyword in user_text.lower() for keyword in self.vision_keywords) 96 | 97 | if should_add_image: 98 | logger.info(f"Vision keyword found in '{user_text[:50]}...'. 
Adding image to context.")
99 |             if not isinstance(last_message.content, list):
100 |                 last_message.content = [last_message.content]
101 |             last_message.content.append(ImageContent(image=latest_image))
102 |             logger.debug("Successfully added ImageContent to the last message.")
103 | 
104 |     @staticmethod
105 |     def clean_text(text_chunk: str) -> str:
106 |         """Cleans text by removing special tags, code blocks, markdown, and emojis."""
107 |         # Remove special tags (e.g. the <think>/</think> markers emitted by reasoning models)
108 |         cleaned = text_chunk.replace("<think>", "").replace("</think>", "")
109 |         # Remove code blocks enclosed in triple backticks
110 |         cleaned = re.sub(r'```.*?```', '', cleaned, flags=re.DOTALL)
111 |         # Strip triple-single-quote delimiters but keep the enclosed text
112 |         cleaned = re.sub(r"'''(.*?)'''", r'\1', cleaned, flags=re.DOTALL)
113 |         # Remove markdown bold/italic markers
114 |         cleaned = re.sub(r'(\*\*|__)(.*?)\1', r'\2', cleaned)
115 |         cleaned = re.sub(r'(\*|_)(.*?)\1', r'\2', cleaned)
116 |         # Remove inline code markers (backticks)
117 |         cleaned = re.sub(r'`([^`]*)`', r'\1', cleaned)
118 |         # Remove LaTeX inline math delimiters such as \( and \)
119 |         cleaned = re.sub(r'\\+\(', '', cleaned)
120 |         cleaned = re.sub(r'\\+\)', '', cleaned)
121 |         # Remove emojis (Unicode emoji ranges)
122 |         cleaned = re.sub(r'[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF\U0001F700-\U0001F77F\U0001F780-\U0001F7FF\U0001F800-\U0001F8FF\U0001F900-\U0001F9FF\U0001FA00-\U0001FA6F\U0001FA70-\U0001FAFF\U00002702-\U000027B0\U000024C2-\U0001F251]+', '', cleaned)
123 |         return cleaned
124 | 
125 |     async def llm_node(
126 |         self,
127 |         chat_ctx: llm.ChatContext,
128 |         tools: list[FunctionTool],
129 |         model_settings: ModelSettings
130 |     ) -> AsyncIterable[llm.ChatChunk]:
131 |         """Processes context via LLM, potentially adding image first.
Delegates to default.""" 132 | logger.debug(f"LLM node received context with {len(chat_ctx.items)} items.") 133 | 134 | # Only process image if vision is enabled in config 135 | if self.config['vision']['use']: 136 | self.process_image(chat_ctx) 137 | 138 | async for chunk in Agent.default.llm_node(self, chat_ctx, tools, model_settings): 139 | yield chunk 140 | 141 | async def tts_node( 142 | self, 143 | text: AsyncIterable[str], 144 | model_settings: ModelSettings 145 | ) -> AsyncIterable[rtc.AudioFrame]: 146 | """Cleans text stream and delegates to default TTS node.""" 147 | logger.debug("TTS node received text stream.") 148 | 149 | cleaned_text_chunks = [] 150 | 151 | async for chunk in text: 152 | # Process each chunk with the clean_text method 153 | cleaned_chunk = self.clean_text(chunk) 154 | if cleaned_chunk: 155 | cleaned_text_chunks.append(cleaned_chunk) 156 | 157 | if cleaned_text_chunks: 158 | logger.debug(f"Sending {len(cleaned_text_chunks)} cleaned chunks to default TTS.") 159 | async def text_stream(): 160 | for cleaned_chunk in cleaned_text_chunks: 161 | yield cleaned_chunk 162 | 163 | # Pass self as the first parameter to the default.tts_node method 164 | async for frame in self.default.tts_node(self, text_stream(), model_settings): 165 | yield frame 166 | logger.debug("TTS node finished streaming audio frames.") 167 | else: 168 | logger.info("No text content left after cleaning for TTS.") 169 | 170 | -------------------------------------------------------------------------------- /conversify/core/callbacks.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import asyncio 3 | from typing import Dict, Any 4 | 5 | from livekit.agents import AgentSession, metrics 6 | from livekit.agents.voice import MetricsCollectedEvent 7 | from livekit.agents.metrics import LLMMetrics, TTSMetrics, EOUMetrics 8 | 9 | from .agent import ConversifyAgent 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | # Globals for metrics callback 14 | end_of_utterance_delay = 0 15 | llm_ttft = 0 16 | tts_ttfb = 0 17 | usage_collector = metrics.UsageCollector() 18 | 19 | 20 | def metrics_callback(session: AgentSession): 21 | """Sets up the callback for collecting and logging session metrics.""" 22 | @session.on("metrics_collected") 23 | def _on_metrics_collected(ev: MetricsCollectedEvent): 24 | agent_metrics = ev.metrics 25 | metrics.log_metrics(agent_metrics) 26 | usage_collector.collect(agent_metrics) 27 | 28 | # Access globals safely 29 | global end_of_utterance_delay, llm_ttft, tts_ttfb 30 | 31 | if isinstance(agent_metrics, EOUMetrics): 32 | end_of_utterance_delay = agent_metrics.end_of_utterance_delay 33 | elif isinstance(agent_metrics, LLMMetrics): 34 | llm_ttft = agent_metrics.ttft 35 | elif isinstance(agent_metrics, TTSMetrics): 36 | tts_ttfb = agent_metrics.ttfb 37 | # Calculate E2E latency only when TTS metrics arrive (last step) 38 | e2e_latency = end_of_utterance_delay + llm_ttft + tts_ttfb 39 | logger.info(f"TOTAL END TO END LATENCY --> {e2e_latency:.3f}s, EOU: {end_of_utterance_delay:.3f}s, LLM: {llm_ttft:.3f}s, TTS: {tts_ttfb:.3f}s") 40 | # Reset for next interaction cycle 41 | end_of_utterance_delay = 0 42 | llm_ttft = 0 43 | tts_ttfb = 0 44 | 45 | 46 | async def shutdown_callback(agent: ConversifyAgent, video_task: asyncio.Task | None): 47 | """Handles graceful shutdown logic: cancels tasks, logs usage, saves memory.""" 48 | logger.info("Application shutdown initiated") 49 | 50 | # Cancel video task 51 | if video_task and not 
video_task.done(): 52 | logger.info("Attempting to cancel video processing task...") 53 | try: 54 | video_task.cancel() 55 | await video_task 56 | logger.info("Video processing task successfully cancelled.") 57 | except asyncio.CancelledError: 58 | logger.info("Video processing task was already cancelled or finished.") 59 | except Exception as e: 60 | logger.error(f"Error during video task cancellation: {e}", exc_info=True) 61 | 62 | # Log usage summary 63 | summary: Dict[str, Any] = usage_collector.get_summary() 64 | logger.info(f"Usage Summary: {summary}") 65 | 66 | # Save conversation history if available 67 | if agent.memory_handler: 68 | try: 69 | logger.info("Saving conversation memory...") 70 | if not hasattr(agent, 'chat_ctx') or agent.chat_ctx is None: 71 | logger.warning("Agent chat context is not available, skipping memory save.") 72 | else: 73 | await agent.memory_handler.save_memory(agent.chat_ctx) 74 | logger.info("Conversation memory saved.") 75 | except Exception as e: 76 | logger.error(f"Error saving conversation memory: {e}", exc_info=True) 77 | else: 78 | logger.info("Memory handler not available, skipping memory save.") 79 | 80 | logger.info("Shutdown callback finished.") -------------------------------------------------------------------------------- /conversify/core/memory.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from typing import List, Any, Dict, Optional, Callable 4 | 5 | import numpy as np 6 | from vllm import LLM 7 | from pydantic import BaseModel, Field 8 | 9 | from memoripy import MemoryManager, JSONStorage, ChatModel, EmbeddingModel 10 | 11 | from langchain_core.output_parsers import JsonOutputParser 12 | from langchain_core.prompts import PromptTemplate 13 | from langchain_openai import ChatOpenAI 14 | from livekit.agents import ChatMessage, ChatContext 15 | 16 | logger = logging.getLogger(__name__) 17 | 18 | 19 | class ConceptExtractionResponse(BaseModel): 20 | """Model for structured response from concept extraction.""" 21 | concepts: List[str] = Field(description="List of key concepts extracted from the text.") 22 | 23 | 24 | class ChatCompletionsModel(ChatModel): 25 | """Implementation of ChatModel for concept extraction using LLM.""" 26 | 27 | def __init__(self, llm_config: Dict[str, Any]): 28 | """ 29 | Initialize the ChatCompletionsModel with configuration. 30 | 31 | Args: 32 | llm_config: Dictionary containing LLM configuration (base_url, api_key, model) 33 | """ 34 | api_endpoint = llm_config['base_url'] 35 | api_key = llm_config['api_key'] 36 | model_name = llm_config['model'] 37 | 38 | logger.info(f"Initializing ChatCompletionsModel with endpoint: {api_endpoint}, model: {model_name}") 39 | try: 40 | self.llm = ChatOpenAI( 41 | openai_api_base=api_endpoint, 42 | openai_api_key=api_key, 43 | model_name=model_name, 44 | request_timeout=30.0, 45 | max_retries=2 46 | ) 47 | self.parser = JsonOutputParser(pydantic_object=ConceptExtractionResponse) 48 | self.prompt_template = PromptTemplate( 49 | template=( 50 | "Extract key concepts from the following text in a concise, context-specific manner. " 51 | "Include only the most highly relevant and specific core concepts that best capture the text's meaning. 
" 52 | "Return nothing but the JSON string.\n" 53 | "{format_instructions}\n{text}" 54 | ), 55 | input_variables=["text"], 56 | partial_variables={"format_instructions": self.parser.get_format_instructions()}, 57 | ) 58 | logger.info("ChatCompletionsModel initialized successfully.") 59 | except Exception as e: 60 | logger.error(f"Failed to initialize ChatCompletionsModel components: {e}", exc_info=True) 61 | raise 62 | 63 | def invoke(self, messages: List[Dict[str, Any]]) -> str: 64 | """ 65 | Invoke the LLM with a list of messages. 66 | 67 | Args: 68 | messages: List of message dictionaries to send to the LLM 69 | 70 | Returns: 71 | Response content as a string 72 | """ 73 | if not messages: 74 | logger.warning("Empty messages list provided to ChatCompletionsModel.invoke()") 75 | return "" 76 | 77 | try: 78 | response = self.llm.invoke(messages) 79 | return str(response.content) if response and hasattr(response, 'content') else "" 80 | except Exception as e: 81 | logger.error(f"Error during ChatCompletionsModel invocation: {e}", exc_info=True) 82 | return "Error processing request." 83 | 84 | def extract_concepts(self, text: str) -> List[str]: 85 | """ 86 | Extract key concepts from the input text. 87 | 88 | Args: 89 | text: The text to extract concepts from 90 | 91 | Returns: 92 | List of extracted concept strings 93 | """ 94 | if not text or not isinstance(text, str) or not text.strip(): 95 | logger.warning("Empty or whitespace-only text provided to extract_concepts()") 96 | return [] 97 | 98 | try: 99 | chain = self.prompt_template | self.llm | self.parser 100 | response = chain.invoke({"text": text}) 101 | concepts = response.get("concepts", []) 102 | 103 | # Validate concepts 104 | valid_concepts = [] 105 | for concept in concepts: 106 | if isinstance(concept, str) and concept.strip(): 107 | valid_concepts.append(concept.strip()) 108 | 109 | logger.debug(f"Concepts extracted: {valid_concepts}") 110 | return valid_concepts 111 | except Exception as e: 112 | logger.error(f"Error during concept extraction: {e}", exc_info=True) 113 | return [] 114 | 115 | 116 | class VLLMEmbeddingModel(EmbeddingModel): 117 | """Implementation of EmbeddingModel using VLLM.""" 118 | 119 | def __init__(self, embedding_config: Dict[str, Any]): 120 | """ 121 | Initialize the VLLMEmbeddingModel with configuration. 122 | 123 | Args: 124 | embedding_config: Dictionary containing embedding model configuration 125 | """ 126 | model_name = embedding_config['vllm_model_name'] 127 | 128 | self.model = LLM( 129 | model=model_name, 130 | enforce_eager=True, 131 | ) 132 | logger.info(f"VLLMEmbeddingModel initialized successfully: {model_name}") 133 | 134 | def initialize_embedding_dimension(self) -> int: 135 | """ 136 | Determine the embedding dimension by encoding a test string. 
137 | 138 | Returns: 139 | Integer dimension of the embedding 140 | """ 141 | try: 142 | test_text = "dimension_check" 143 | outputs = self.model.encode([test_text]) 144 | 145 | embedding = self._extract_embedding_from_output(outputs) 146 | 147 | if embedding is not None: 148 | dimension = len(np.array(embedding)) 149 | logger.info(f"Determined embedding dimension: {dimension}") 150 | return dimension 151 | else: 152 | logger.error(f"Failed to determine embedding dimension: Unexpected output structure from VLLM model") 153 | raise RuntimeError("Failed to determine embedding dimension due to unexpected model output.") 154 | except Exception as e: 155 | logger.error(f"Failed to determine embedding dimension during initialization: {e}", exc_info=True) 156 | # Fallback dimension 157 | logger.warning("Falling back to default embedding dimension 768.") 158 | return 768 159 | 160 | def _extract_embedding_from_output(self, outputs) -> Optional[np.ndarray]: 161 | """ 162 | Extract embedding from different possible VLLM output structures. 163 | 164 | Args: 165 | outputs: The output from VLLM's encode method 166 | 167 | Returns: 168 | Extracted embedding as numpy array or None if extraction fails 169 | """ 170 | if not outputs: 171 | return None 172 | 173 | # Try different output structures that VLLM might return 174 | # Structure 1: outputs[0].outputs.embedding 175 | if hasattr(outputs[0], 'outputs') and hasattr(outputs[0].outputs, 'embedding'): 176 | return outputs[0].outputs.embedding 177 | # Structure 2: outputs[0].embedding 178 | elif hasattr(outputs[0], 'embedding'): 179 | return outputs[0].embedding 180 | # Structure 3: outputs[0] is the embedding directly 181 | elif isinstance(outputs[0], (list, np.ndarray)) and len(outputs[0]) > 0: 182 | return outputs[0] 183 | # Structure 4: outputs itself is the embedding 184 | elif isinstance(outputs, (list, np.ndarray)) and len(outputs) > 0: 185 | return outputs 186 | 187 | return None 188 | 189 | def get_embedding(self, text: str) -> np.ndarray: 190 | """ 191 | Get embedding for the input text. 192 | 193 | Args: 194 | text: Text to embed 195 | 196 | Returns: 197 | Embedding as numpy array 198 | """ 199 | # Keep try-except for external model inference 200 | try: 201 | if not text or not isinstance(text, str) or not text.strip(): 202 | logger.warning("Empty text provided for embedding, returning zero vector") 203 | return np.zeros(self.dimension or 768) 204 | 205 | outputs = self.model.encode([text]) 206 | embedding = self._extract_embedding_from_output(outputs) 207 | 208 | if embedding is not None: 209 | return np.array(embedding) 210 | else: 211 | logger.error(f"Could not extract embedding from VLLM output") 212 | return np.zeros(self.dimension or 768) 213 | except Exception as e: 214 | logger.error(f"Error getting VLLM embedding for text '{text[:50] if text else ''}...': {e}", exc_info=True) 215 | # Return a zero vector as fallback 216 | return np.zeros(self.dimension or 768) 217 | 218 | 219 | class AgentMemoryManager: 220 | """Manages agent memory using the Memoripy library.""" 221 | 222 | def __init__(self, participant_identity: str, config: Dict[str, Any]): 223 | """ 224 | Initialize the AgentMemoryManager. 
225 | 226 | Args: 227 | participant_identity: Identifier for the participant 228 | config: Application configuration 229 | """ 230 | self.participant_identity = participant_identity 231 | self.config = config 232 | self.memory_config = config['memory'] 233 | self.memory_manager = None 234 | self._initialize_memory_manager() 235 | 236 | def _initialize_memory_manager(self) -> None: 237 | """Initialize the Memoripy MemoryManager with model instances.""" 238 | if not self.memory_config['use']: 239 | logger.info(f"Memory is disabled in config for {self.participant_identity}. Skipping initialization.") 240 | return 241 | 242 | memory_dir_abs = self.memory_config['dir_abs'] 243 | 244 | # Ensure the directory exists 245 | os.makedirs(memory_dir_abs, exist_ok=True) 246 | logger.info(f"Ensuring memory directory exists: {memory_dir_abs}") 247 | 248 | user_memory_file = os.path.join(memory_dir_abs, f"{self.participant_identity}.json") 249 | 250 | llm_cfg = self.config['llm'] 251 | embedding_cfg = self.config['embedding'] 252 | 253 | try: 254 | chat_model_instance = ChatCompletionsModel(llm_config=llm_cfg) 255 | embedding_model_instance = VLLMEmbeddingModel(embedding_config=embedding_cfg) 256 | 257 | self.memory_manager = MemoryManager( 258 | chat_model=chat_model_instance, 259 | embedding_model=embedding_model_instance, 260 | storage=JSONStorage(user_memory_file) 261 | ) 262 | logger.info(f"Initialized MemoryManager for user {self.participant_identity} with storage {user_memory_file}") 263 | except Exception as e: 264 | logger.error(f"Failed to initialize MemoryManager components for {self.participant_identity}: {e}", exc_info=True) 265 | self.memory_manager = None 266 | 267 | async def load_memory(self, update_chat_ctx_func: Callable) -> None: 268 | """ 269 | Load conversation history from storage and update the agent's chat context. 270 | 271 | Args: 272 | update_chat_ctx_func: Function to update chat context with loaded memory 273 | """ 274 | if not self.memory_config.get('use', False): 275 | logger.info(f"Memory is disabled in config for {self.participant_identity}. Skipping load.") 276 | return 277 | 278 | if not self.memory_manager: 279 | logger.warning(f"MemoryManager not initialized for {self.participant_identity}. Cannot load history.") 280 | return 281 | 282 | initial_messages_from_memory = [] 283 | 284 | try: 285 | short_term_history, _ = self.memory_manager.load_history() 286 | # Use config value for number of interactions 287 | num_interactions_to_load = self.memory_config.get('load_last_n', 5) 288 | memory_interactions = short_term_history[-num_interactions_to_load:] if short_term_history else [] 289 | 290 | for interaction in memory_interactions: 291 | if interaction.get('prompt'): 292 | initial_messages_from_memory.append(ChatMessage(role="user", content=[interaction['prompt']])) 293 | if interaction.get('output'): 294 | initial_messages_from_memory.append(ChatMessage(role="assistant", content=[interaction['output']])) 295 | 296 | if initial_messages_from_memory: 297 | await update_chat_ctx_func(ChatContext(initial_messages_from_memory)) 298 | logger.info(f"Prepended {len(initial_messages_from_memory)} interactions to the initial context for {self.participant_identity}.") 299 | else: 300 | logger.info(f"No interactions loaded from memory for {self.participant_identity}.") 301 | 302 | except FileNotFoundError: 303 | logger.info(f"No previous history file found for {self.participant_identity}. 
Starting fresh.") 304 | except Exception as e: 305 | logger.error(f"Failed to load history via Memoripy for {self.participant_identity}: {e}", exc_info=True) 306 | 307 | def _extract_message_content(self, message: ChatMessage) -> str: 308 | """ 309 | Extract text content from a ChatMessage. 310 | 311 | Args: 312 | message: The ChatMessage to extract content from 313 | 314 | Returns: 315 | Extracted text content as a string 316 | """ 317 | if not message or not message.content: 318 | return "" 319 | 320 | # Handle different content structures 321 | if isinstance(message.content, list): 322 | if not message.content: 323 | return "" 324 | content_item = message.content[0] 325 | if isinstance(content_item, str): 326 | return content_item 327 | elif hasattr(content_item, 'text'): 328 | return content_item.text 329 | else: 330 | return str(content_item) 331 | else: 332 | return str(message.content) 333 | 334 | async def save_memory(self, chat_ctx: ChatContext) -> None: 335 | """ 336 | Save the current conversation history to storage. 337 | 338 | Args: 339 | chat_ctx: ChatContext containing the conversation messages 340 | """ 341 | if not self.memory_config.get('use', False): 342 | logger.info(f"Memory is disabled in config for {self.participant_identity}. Skipping save.") 343 | return 344 | 345 | if self.memory_manager is None: 346 | logger.warning(f"Memory manager not available for {self.participant_identity}. Skipping history save.") 347 | return 348 | 349 | if not chat_ctx or not chat_ctx.items: 350 | logger.info(f"No conversation items to save for {self.participant_identity}.") 351 | return 352 | 353 | logger.info(f"Saving conversation history via Memoripy for user: {self.participant_identity}") 354 | logger.info(f"Conversation history messages count: {len(chat_ctx.items)}") 355 | 356 | i = 0 357 | processed_count = 0 358 | items = chat_ctx.items 359 | 360 | while i < len(items): 361 | user_msg = None 362 | assistant_msg = None 363 | 364 | # Find the next user message 365 | if items[i].role == "user": 366 | user_msg = items[i] 367 | # Find the corresponding assistant message (if it exists) 368 | if i + 1 < len(items) and items[i+1].role == "assistant": 369 | assistant_msg = items[i+1] 370 | i += 2 # Move past both 371 | else: 372 | i += 1 # Move past only user msg 373 | elif items[i].role == "assistant": 374 | # Skip assistant message without preceding user message 375 | logger.warning(f"Skipping assistant message without preceding user message at index {i}") 376 | i += 1 377 | continue 378 | else: # Skip system messages etc. 379 | i += 1 380 | continue 381 | 382 | # Process the interaction pair 383 | if user_msg: 384 | # Extract content using helper method 385 | user_prompt = self._extract_message_content(user_msg) 386 | assistant_response = self._extract_message_content(assistant_msg) if assistant_msg else "" 387 | 388 | combined_text = f"{user_prompt} {assistant_response}".strip() 389 | 390 | if not combined_text: 391 | logger.debug("Skipping empty interaction.") 392 | continue 393 | 394 | try: 395 | concepts = self.memory_manager.extract_concepts(combined_text) 396 | embedding = self.memory_manager.get_embedding(combined_text) 397 | self.memory_manager.add_interaction( 398 | prompt=user_prompt, 399 | output=assistant_response, 400 | embedding=embedding, 401 | concepts=concepts 402 | ) 403 | processed_count += 1 404 | logger.debug(f"Added interaction to Memoripy: User: '{user_prompt[:50]}...' 
Assistant: '{assistant_response[:50]}...'") 405 | except Exception as e: 406 | logger.error(f"Error processing interaction via Memoripy: {e} for interaction: User='{user_prompt[:50]}...', Assistant='{assistant_response[:50]}...'", exc_info=True) 407 | 408 | if processed_count > 0: 409 | logger.info(f"Successfully added {processed_count} interactions into conversational memory for {self.participant_identity}") 410 | else: 411 | logger.warning(f"No interactions were added to memory for {self.participant_identity}") -------------------------------------------------------------------------------- /conversify/core/vision.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import logging 3 | from typing import Dict, Any, Optional, Tuple 4 | 5 | from livekit import rtc 6 | from livekit.agents import JobContext 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | 11 | async def find_video_track(ctx: JobContext) -> Optional[rtc.RemoteVideoTrack]: 12 | """ 13 | Find and subscribe to the first available video track in the room. 14 | 15 | Args: 16 | ctx: The job context containing room information 17 | 18 | Returns: 19 | The first available RemoteVideoTrack or None if no track is found 20 | """ 21 | for participant in ctx.room.remote_participants.values(): 22 | if not participant or not participant.track_publications: 23 | continue 24 | 25 | for track_pub in participant.track_publications.values(): 26 | if not track_pub or track_pub.kind != rtc.TrackKind.KIND_VIDEO: 27 | continue 28 | 29 | # Attempt to subscribe if not already subscribed 30 | if not track_pub.subscribed: 31 | logger.info(f"Subscribing to video track {track_pub.sid} from {participant.identity}...") 32 | try: 33 | track_pub.set_subscribed(True) 34 | # Wait for subscription to complete 35 | await asyncio.sleep(0.5) 36 | except Exception as e: 37 | logger.warning(f"Failed to subscribe to track {track_pub.sid}: {e}") 38 | continue 39 | 40 | # Check if track is available after subscription 41 | if (track_pub.track and 42 | isinstance(track_pub.track, rtc.RemoteVideoTrack) and 43 | track_pub.subscribed): 44 | logger.info(f"Found video track: {track_pub.track.sid} from {participant.identity}") 45 | return track_pub.track 46 | 47 | return None 48 | 49 | 50 | async def video_processing_loop(ctx: JobContext, shared_state: Dict[str, Any], video_frame_interval: float) -> None: 51 | """ 52 | Process the first available video track and update shared_state['latest_image']. 
53 | 54 | Args: 55 | ctx: The job context containing room information 56 | shared_state: Dictionary to store shared data between components 57 | video_frame_interval: Interval (seconds) between frame processing 58 | """ 59 | if not ctx: 60 | logger.error("Invalid arguments: JobContext is None") 61 | return 62 | 63 | logger.info("Starting video processing loop, looking for video track...") 64 | video_track = None 65 | video_stream = None 66 | 67 | try: 68 | while True: 69 | try: 70 | video_track = await find_video_track(ctx) 71 | if video_track: 72 | break 73 | 74 | logger.debug("No video track found yet, waiting...") 75 | await asyncio.sleep(1) 76 | except asyncio.CancelledError: 77 | logger.info("Video track search cancelled.") 78 | return 79 | except Exception as e: 80 | logger.error(f"Error searching for video track: {e}", exc_info=True) 81 | await asyncio.sleep(1) # Wait before retrying 82 | 83 | # Create video stream from the found track 84 | video_stream = rtc.VideoStream(video_track) 85 | logger.info(f"Starting video stream processing with interval {video_frame_interval}s.") 86 | 87 | # Process the video stream 88 | async for event in video_stream: 89 | if event and event.frame: 90 | # Update the shared state with the latest frame 91 | shared_state['latest_image'] = event.frame 92 | 93 | # Sleep to control processing rate 94 | await asyncio.sleep(video_frame_interval) 95 | 96 | except asyncio.CancelledError: 97 | logger.info("Video processing task cancelled.") 98 | except Exception as e: 99 | logger.error(f"Error processing video stream: {e}", exc_info=True) 100 | finally: 101 | # Clean up resources 102 | if video_stream: 103 | try: 104 | await video_stream.aclose() 105 | logger.info("Video stream closed.") 106 | except Exception as e: 107 | logger.error(f"Error closing video stream: {e}") 108 | 109 | # Clear the latest image reference when processing ends 110 | shared_state.pop('latest_image', None) 111 | logger.info("Video processing loop ended.") -------------------------------------------------------------------------------- /conversify/data/warmup_audio.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taresh18/conversify-speech/153c7fed59f933eb0c3ea01be13c2c947c0bf33a/conversify/data/warmup_audio.wav -------------------------------------------------------------------------------- /conversify/main.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import asyncio 3 | from dotenv import load_dotenv 4 | from typing import Dict, Any 5 | from openai import AsyncClient 6 | import functools 7 | 8 | from livekit.agents import ( 9 | AgentSession, 10 | JobContext, 11 | JobProcess, 12 | RoomInputOptions, 13 | RoomOutputOptions, 14 | WorkerOptions, 15 | cli, 16 | BackgroundAudioPlayer, 17 | AudioConfig, 18 | BuiltinAudioClip, 19 | metrics 20 | ) 21 | from livekit.plugins import silero 22 | from livekit.agents.types import NOT_GIVEN 23 | from livekit.plugins import noise_cancellation 24 | from livekit.plugins.turn_detector.multilingual import MultilingualModel 25 | 26 | from .models.tts import KokoroTTS 27 | from .models.stt import WhisperSTT 28 | from .models.llm import OpenaiLLM 29 | from .core.vision import video_processing_loop 30 | from .core.agent import ConversifyAgent 31 | from .utils.logger import setup_logging 32 | from .utils.config import ConfigManager 33 | from .core.callbacks import metrics_callback, shutdown_callback 34 | 35 | logger = 
logging.getLogger(__name__) 36 | 37 | 38 | def prewarm(proc: JobProcess, config: Dict[str, Any]): 39 | """Prewarms resources needed by the agent, like the VAD model.""" 40 | logger.info("Prewarming VAD...") 41 | vad_config = config['vad'] 42 | 43 | proc.userdata["vad"] = silero.VAD.load( 44 | min_speech_duration=vad_config['min_speech_duration'], 45 | min_silence_duration=vad_config['min_silence_duration'], 46 | prefix_padding_duration=vad_config['prefix_padding_duration'], 47 | max_buffered_speech=vad_config['max_buffered_speech'], 48 | activation_threshold=vad_config['activation_threshold'], 49 | force_cpu=vad_config['force_cpu'], 50 | sample_rate=vad_config['sample_rate'] 51 | ) 52 | logger.info("VAD prewarmed successfully.") 53 | 54 | 55 | async def entrypoint(ctx: JobContext, config: Dict[str, Any]): 56 | """The main entrypoint for the agent job.""" 57 | # Setup initial logging context 58 | ctx.log_context_fields = { 59 | "room": ctx.room.name, 60 | "job_id": ctx.job.id, 61 | } 62 | logger.info(f"Agent entrypoint started. Context: {ctx.log_context_fields}") 63 | 64 | await ctx.connect() 65 | logger.info("Successfully connected to room.") 66 | 67 | # Create shared state dictionary for inter-task communication 68 | shared_state: Dict[str, Any] = {} 69 | 70 | # Initialize LLM Client here using config 71 | llm_config = config['llm'] 72 | try: 73 | llm_client = AsyncClient(api_key=llm_config['api_key'], base_url=llm_config['base_url']) 74 | logger.info(f"Initialized LLM Client at {llm_config['base_url']}") 75 | except Exception as e: 76 | logger.error(f"Failed to initialize LLM Client: {e}") 77 | raise 78 | 79 | # Check if VAD was prewarmed successfully 80 | vad = ctx.proc.userdata.get("vad") 81 | if not vad: 82 | logger.error("VAD not found in process userdata. 
Exiting.") 83 | return 84 | 85 | # Setup the AgentSession with configured plugins 86 | session = AgentSession( 87 | vad=vad, 88 | llm=OpenaiLLM(client=llm_client, config=config), 89 | stt=WhisperSTT(config=config), 90 | tts=KokoroTTS(config=config), 91 | turn_detection=MultilingualModel() if config['agent']['use_eou'] else NOT_GIVEN 92 | ) 93 | logger.info("AgentSession created.") 94 | 95 | # Start the video processing loop if configured 96 | video_task: asyncio.Task | None = None 97 | vision_config = config['vision'] 98 | 99 | if vision_config['use']: 100 | logger.info("Starting video processing loop...") 101 | video_task = asyncio.create_task(video_processing_loop(ctx, shared_state, vision_config['video_frame_interval'])) 102 | 103 | # Setup metrics collection 104 | metrics_callback(session) 105 | 106 | # Wait for a participant to join before starting the session 107 | logger.info("Waiting for participant to join...") 108 | participant = await ctx.wait_for_participant() 109 | logger.info(f"Participant '{participant.identity if participant else 'unknown'}' joined.") 110 | 111 | # setup agent instance 112 | agent = ConversifyAgent( 113 | participant_identity=participant.identity, 114 | shared_state=shared_state, 115 | config=config 116 | ) 117 | 118 | # Register the shutdown callback 119 | ctx.add_shutdown_callback(lambda: shutdown_callback(agent, video_task)) 120 | logger.info("Shutdown callback registered.") 121 | 122 | # Start the agent session 123 | logger.info("Starting agent session...") 124 | await session.start( 125 | agent=agent, 126 | room=ctx.room, 127 | room_input_options=RoomInputOptions( 128 | noise_cancellation=noise_cancellation.BVC() if config['agent']['use_background_noise_removal'] else NOT_GIVEN, 129 | ), 130 | room_output_options=RoomOutputOptions(transcription_enabled=True), 131 | ) 132 | 133 | if config['agent']['use_background_audio']: 134 | background_audio = BackgroundAudioPlayer( 135 | # play office ambience sound looping in the background 136 | ambient_sound=AudioConfig(BuiltinAudioClip.OFFICE_AMBIENCE, volume=0.8), 137 | # play keyboard typing sound when the agent is thinking 138 | thinking_sound=[ 139 | AudioConfig(BuiltinAudioClip.KEYBOARD_TYPING, volume=0.8), 140 | AudioConfig(BuiltinAudioClip.KEYBOARD_TYPING2, volume=0.7), 141 | ], 142 | ) 143 | 144 | await background_audio.start(room=ctx.room, agent_session=session) 145 | 146 | 147 | def main(): 148 | """Main function that initializes and runs the applicaton.""" 149 | # Configure basic logging BEFORE loading config 150 | logging.basicConfig(level="INFO", 151 | format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', 152 | handlers=[logging.StreamHandler()]) 153 | initial_logger = logging.getLogger(__name__) 154 | initial_logger.info("Basic logging configured. Loading configuration...") 155 | 156 | # Load configuration 157 | app_config = ConfigManager().load_config() 158 | initial_logger.info("Configuration loaded.") 159 | 160 | # Load env variables 161 | load_dotenv(app_config['agent']['env_file']) 162 | 163 | # Setup centralized logging using the dedicated function 164 | setup_logging(config=app_config, project_root=ConfigManager().project_root) 165 | 166 | # Now, get the properly configured logger for the main module 167 | logger = logging.getLogger(__name__) # Re-get logger after setup 168 | logger.info("Centralized logging configured. 
Starting LiveKit Agent application...") 169 | 170 | # Create a partial function that includes the config 171 | entrypoint_with_config = functools.partial(entrypoint, config=app_config) 172 | prewarm_with_config = functools.partial(prewarm, config=app_config) 173 | 174 | # Define worker options using loaded config 175 | worker_config = app_config['worker'] 176 | worker_options = WorkerOptions( 177 | entrypoint_fnc=entrypoint_with_config, 178 | prewarm_fnc=prewarm_with_config, 179 | job_memory_warn_mb=worker_config['job_memory_warn_mb'], 180 | load_threshold=worker_config['load_threshold'], 181 | job_memory_limit_mb=worker_config['job_memory_limit_mb'], 182 | ) 183 | 184 | # Run the CLI application 185 | cli.run_app(worker_options) 186 | 187 | 188 | if __name__ == "__main__": 189 | main() 190 | -------------------------------------------------------------------------------- /conversify/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /conversify/models/llm.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import os 4 | from dataclasses import dataclass 5 | from typing import Any, Dict 6 | 7 | import httpx 8 | 9 | import openai 10 | from livekit.agents import APIConnectionError, APIStatusError, APITimeoutError, llm 11 | from livekit.agents.llm import ToolChoice, utils as llm_utils 12 | from livekit.agents.llm.chat_context import ChatContext 13 | from livekit.agents.llm.tool_context import FunctionTool 14 | from livekit.agents.types import ( 15 | DEFAULT_API_CONNECT_OPTIONS, 16 | NOT_GIVEN, 17 | APIConnectOptions, 18 | NotGivenOr, 19 | ) 20 | from livekit.agents.utils import is_given 21 | from openai.types.chat import ( 22 | ChatCompletionChunk, 23 | ChatCompletionToolChoiceOptionParam, 24 | completion_create_params, 25 | ) 26 | from openai.types.chat.chat_completion_chunk import Choice 27 | 28 | from .utils import to_chat_ctx, to_fnc_ctx 29 | 30 | 31 | @dataclass 32 | class _LLMOptions: 33 | model: str 34 | temperature: NotGivenOr[float] 35 | parallel_tool_calls: NotGivenOr[bool] 36 | tool_choice: NotGivenOr[ToolChoice] 37 | 38 | 39 | class OpenaiLLM(llm.LLM): 40 | def __init__( 41 | self, 42 | config: Dict[str, Any], 43 | client: openai.AsyncClient | None = None, 44 | ) -> None: 45 | """ 46 | Create a new instance of OpenAI LLM using configuration. 
47 | 48 | Args: 49 | client: Pre-configured OpenAI client 50 | config: Configuration dictionary (from config.yaml) 51 | """ 52 | super().__init__() 53 | 54 | llm_config = config['llm'] 55 | 56 | model = llm_config['model'] 57 | api_key = llm_config['api_key'] 58 | base_url = llm_config['base_url'] 59 | temperature = llm_config['temperature'] 60 | parallel_tool_calls = llm_config['parallel_tool_calls'] 61 | tool_choice = llm_config['tool_choice'] 62 | 63 | timeout = httpx.Timeout( 64 | connect=15.0, 65 | read=5.0, 66 | write=5.0, 67 | pool=5.0 68 | ) 69 | 70 | self._opts = _LLMOptions( 71 | model=model, 72 | temperature=temperature, 73 | parallel_tool_calls=parallel_tool_calls, 74 | tool_choice=tool_choice, 75 | ) 76 | 77 | # Use the provided client or create a new one with configured settings 78 | self._client = client or openai.AsyncClient( 79 | api_key=api_key, 80 | base_url=base_url, 81 | max_retries=0, 82 | http_client=httpx.AsyncClient( 83 | timeout=timeout, 84 | follow_redirects=True, 85 | limits=httpx.Limits( 86 | max_connections=50, 87 | max_keepalive_connections=50, 88 | keepalive_expiry=120, 89 | ), 90 | ), 91 | ) 92 | 93 | def chat( 94 | self, 95 | *, 96 | chat_ctx: ChatContext, 97 | tools: list[FunctionTool] | None = None, 98 | conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS, 99 | parallel_tool_calls: NotGivenOr[bool] = NOT_GIVEN, 100 | tool_choice: NotGivenOr[ToolChoice] = NOT_GIVEN, 101 | response_format: NotGivenOr[ 102 | completion_create_params.ResponseFormat | type[llm_utils.ResponseFormatT] 103 | ] = NOT_GIVEN, 104 | extra_kwargs: NotGivenOr[dict[str, Any]] = NOT_GIVEN, 105 | ) -> LLMStream: 106 | extra = {} 107 | if is_given(extra_kwargs): 108 | extra.update(extra_kwargs) 109 | 110 | parallel_tool_calls = ( 111 | parallel_tool_calls if is_given(parallel_tool_calls) else self._opts.parallel_tool_calls 112 | ) 113 | if is_given(parallel_tool_calls): 114 | extra["parallel_tool_calls"] = parallel_tool_calls 115 | 116 | tool_choice = tool_choice if is_given(tool_choice) else self._opts.tool_choice 117 | if is_given(tool_choice): 118 | oai_tool_choice: ChatCompletionToolChoiceOptionParam 119 | if isinstance(tool_choice, dict): 120 | oai_tool_choice = { 121 | "type": "function", 122 | "function": {"name": tool_choice["function"]["name"]}, 123 | } 124 | extra["tool_choice"] = oai_tool_choice 125 | elif tool_choice in ("auto", "required", "none"): 126 | oai_tool_choice = tool_choice 127 | extra["tool_choice"] = oai_tool_choice 128 | 129 | if is_given(response_format): 130 | extra["response_format"] = llm_utils.to_openai_response_format(response_format) 131 | 132 | return LLMStream( 133 | self, 134 | model=self._opts.model, 135 | client=self._client, 136 | chat_ctx=chat_ctx, 137 | tools=tools or [], 138 | conn_options=conn_options, 139 | extra_kwargs=extra, 140 | ) 141 | 142 | 143 | class LLMStream(llm.LLMStream): 144 | def __init__( 145 | self, 146 | llm: LLM, 147 | *, 148 | model: str, 149 | client: openai.AsyncClient, 150 | chat_ctx: llm.ChatContext, 151 | tools: list[FunctionTool], 152 | conn_options: APIConnectOptions, 153 | extra_kwargs: dict[str, Any], 154 | ) -> None: 155 | super().__init__(llm, chat_ctx=chat_ctx, tools=tools, conn_options=conn_options) 156 | self._model = model 157 | self._client = client 158 | self._llm = llm 159 | self._extra_kwargs = extra_kwargs 160 | 161 | async def _run(self) -> None: 162 | # current function call that we're waiting for full completion (args are streamed) 163 | # (defined inside the _run method to make sure the state 
is reset for each run/attempt) 164 | self._oai_stream: openai.AsyncStream[ChatCompletionChunk] | None = None 165 | self._tool_call_id: str | None = None 166 | self._fnc_name: str | None = None 167 | self._fnc_raw_arguments: str | None = None 168 | self._tool_index: int | None = None 169 | retryable = True 170 | 171 | try: 172 | self._oai_stream = stream = await self._client.chat.completions.create( 173 | messages=to_chat_ctx(self._chat_ctx, id(self._llm)), 174 | tools=to_fnc_ctx(self._tools) if self._tools else openai.NOT_GIVEN, 175 | model=self._model, 176 | stream_options={"include_usage": True}, 177 | stream=True, 178 | **self._extra_kwargs, 179 | ) 180 | 181 | async with stream: 182 | async for chunk in stream: 183 | for choice in chunk.choices: 184 | chat_chunk = self._parse_choice(chunk.id, choice) 185 | if chat_chunk is not None: 186 | retryable = False 187 | self._event_ch.send_nowait(chat_chunk) 188 | 189 | if chunk.usage is not None: 190 | retryable = False 191 | tokens_details = chunk.usage.prompt_tokens_details 192 | cached_tokens = tokens_details.cached_tokens if tokens_details else 0 193 | chunk = llm.ChatChunk( 194 | id=chunk.id, 195 | usage=llm.CompletionUsage( 196 | completion_tokens=chunk.usage.completion_tokens, 197 | prompt_tokens=chunk.usage.prompt_tokens, 198 | prompt_cached_tokens=cached_tokens or 0, 199 | total_tokens=chunk.usage.total_tokens, 200 | ), 201 | ) 202 | self._event_ch.send_nowait(chunk) 203 | 204 | except openai.APITimeoutError: 205 | raise APITimeoutError(retryable=retryable) from None 206 | except openai.APIStatusError as e: 207 | raise APIStatusError( 208 | e.message, 209 | status_code=e.status_code, 210 | request_id=e.request_id, 211 | body=e.body, 212 | retryable=retryable, 213 | ) from None 214 | except Exception as e: 215 | raise APIConnectionError(retryable=retryable) from e 216 | 217 | def _parse_choice(self, id: str, choice: Choice) -> llm.ChatChunk | None: 218 | delta = choice.delta 219 | 220 | # https://github.com/livekit/agents/issues/688 221 | # the delta can be None when using Azure OpenAI (content filtering) 222 | if delta is None: 223 | return None 224 | 225 | if delta.tool_calls: 226 | for tool in delta.tool_calls: 227 | if not tool.function: 228 | continue 229 | 230 | call_chunk = None 231 | if self._tool_call_id and tool.id and tool.index != self._tool_index: 232 | call_chunk = llm.ChatChunk( 233 | id=id, 234 | delta=llm.ChoiceDelta( 235 | role="assistant", 236 | content=delta.content, 237 | tool_calls=[ 238 | llm.FunctionToolCall( 239 | arguments=self._fnc_raw_arguments or "", 240 | name=self._fnc_name or "", 241 | call_id=self._tool_call_id or "", 242 | ) 243 | ], 244 | ), 245 | ) 246 | self._tool_call_id = self._fnc_name = self._fnc_raw_arguments = None 247 | self._tool_index = None 248 | return call_chunk 249 | 250 | if tool.id and not self._tool_call_id: 251 | self._tool_call_id = tool.id 252 | self._tool_index = tool.index 253 | 254 | if tool.function.name and not self._fnc_name: 255 | self._fnc_name = tool.function.name 256 | 257 | if tool.function.arguments: 258 | current = self._fnc_raw_arguments or "" 259 | self._fnc_raw_arguments = current + tool.function.arguments 260 | 261 | return None 262 | 263 | return None 264 | 265 | if delta.content == "": 266 | return None 267 | 268 | return llm.ChatChunk( 269 | id=id, 270 | delta=llm.ChoiceDelta( 271 | role="assistant", 272 | content=delta.content, 273 | ), 274 | ) 275 | -------------------------------------------------------------------------------- /conversify/models/stt.py: 
-------------------------------------------------------------------------------- 1 | import dataclasses 2 | import logging 3 | import os 4 | from dataclasses import dataclass 5 | from typing import Optional, Dict, Any 6 | 7 | import numpy as np 8 | import soundfile as sf 9 | from faster_whisper import WhisperModel 10 | 11 | from livekit import rtc 12 | from livekit.agents import ( 13 | APIConnectionError, 14 | APIConnectOptions, 15 | stt, 16 | ) 17 | from livekit.agents.utils import AudioBuffer 18 | 19 | from .utils import WhisperModels, find_time 20 | 21 | logger = logging.getLogger(__name__) 22 | 23 | @dataclass 24 | class WhisperOptions: 25 | """Configuration options for WhisperSTT.""" 26 | language: str 27 | model: WhisperModels | str 28 | device: str | None 29 | compute_type: str | None 30 | model_cache_directory: str | None 31 | warmup_audio: str | None 32 | 33 | 34 | class WhisperSTT(stt.STT): 35 | """STT implementation using Whisper model.""" 36 | 37 | def __init__( 38 | self, 39 | config: Dict[str, Any] 40 | ): 41 | """Initialize the WhisperSTT instance. 42 | 43 | Args: 44 | config: Configuration dictionary (from config.yaml) 45 | """ 46 | super().__init__( 47 | capabilities=stt.STTCapabilities(streaming=False, interim_results=False) 48 | ) 49 | 50 | stt_config = config['stt']['whisper'] 51 | 52 | language = stt_config['language'] 53 | model = stt_config['model'] 54 | device = stt_config['device'] 55 | compute_type = stt_config['compute_type'] 56 | model_cache_directory = stt_config['model_cache_directory'] 57 | warmup_audio = stt_config['warmup_audio'] 58 | 59 | self._opts = WhisperOptions( 60 | language=language, 61 | model=model, 62 | device=device, 63 | compute_type=compute_type, 64 | model_cache_directory=model_cache_directory, 65 | warmup_audio=warmup_audio 66 | ) 67 | 68 | self._model = None 69 | self._initialize_model() 70 | 71 | # Warmup the model with a sample audio if available 72 | if warmup_audio and os.path.exists(warmup_audio): 73 | self._warmup(warmup_audio) 74 | 75 | def _initialize_model(self): 76 | """Initialize the Whisper model.""" 77 | device = self._opts.device 78 | compute_type = self._opts.compute_type 79 | 80 | logger.info(f"Using device: {device}, with compute: {compute_type}") 81 | 82 | # Ensure cache directories exist 83 | model_cache_dir = self._opts.model_cache_directory 84 | 85 | if model_cache_dir: 86 | os.makedirs(model_cache_dir, exist_ok=True) 87 | logger.info(f"Using model cache directory: {model_cache_dir}") 88 | 89 | self._model = WhisperModel( 90 | model_size_or_path=str(self._opts.model), 91 | device=device, 92 | compute_type=compute_type, 93 | download_root=model_cache_dir 94 | ) 95 | logger.info("Whisper model loaded successfully") 96 | 97 | def _warmup(self, warmup_audio_path: str) -> None: 98 | """Performs a warmup transcription. 99 | 100 | Args: 101 | warmup_audio_path: Path to audio file for warmup 102 | """ 103 | logger.info(f"Starting STT engine warmup using {warmup_audio_path}...") 104 | try: 105 | with find_time('STT_warmup'): 106 | warmup_audio_data, _ = sf.read(warmup_audio_path, dtype="float32") 107 | segments, info = self._model.transcribe(warmup_audio_data, 108 | language=self._opts.language, 109 | beam_size=1) 110 | model_warmup_transcription = " ".join(segment.text for segment in segments) 111 | logger.info(f"STT engine warmed up. 
Text: {model_warmup_transcription}") 112 | except Exception as e: 113 | logger.error(f"Failed to warm up STT engine: {e}") 114 | 115 | def update_options( 116 | self, 117 | *, 118 | model: Optional[WhisperModels | str] = None, 119 | language: Optional[str] = None, 120 | model_cache_directory: Optional[str] = None, 121 | ) -> None: 122 | """Update STT options. 123 | 124 | Args: 125 | model: Whisper model to use 126 | language: Language to detect 127 | model_cache_directory: Directory to store downloaded models 128 | """ 129 | reinitialize = False 130 | 131 | if model: 132 | self._opts.model = model 133 | reinitialize = True 134 | 135 | if model_cache_directory: 136 | self._opts.model_cache_directory = model_cache_directory 137 | reinitialize = True 138 | 139 | if language: 140 | self._opts.language = language 141 | 142 | if reinitialize: 143 | self._initialize_model() 144 | 145 | def _sanitize_options(self, *, language: Optional[str] = None) -> WhisperOptions: 146 | """Create a copy of options with optional overrides. 147 | 148 | Args: 149 | language: Language override 150 | 151 | Returns: 152 | Copy of options with overrides applied 153 | """ 154 | options = dataclasses.replace(self._opts) 155 | if language: 156 | options.language = language 157 | return options 158 | 159 | async def _recognize_impl( 160 | self, 161 | buffer: AudioBuffer, 162 | *, 163 | language: Optional[str], 164 | conn_options: APIConnectOptions, 165 | ) -> stt.SpeechEvent: 166 | """Implement speech recognition. 167 | 168 | Args: 169 | buffer: Audio buffer 170 | language: Language to detect 171 | conn_options: Connection options 172 | 173 | Returns: 174 | Speech recognition event 175 | """ 176 | try: 177 | logger.info(f"Received audio, transcribing to text") 178 | options = self._sanitize_options(language=language) 179 | audio_data = rtc.combine_audio_frames(buffer).to_wav_bytes() 180 | 181 | # Convert WAV to numpy array 182 | audio_array = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32) / 32768.0 183 | 184 | with find_time('STT_inference'): 185 | segments, info = self._model.transcribe( 186 | audio_array, 187 | language=options.language, 188 | beam_size=1, 189 | best_of=1, 190 | condition_on_previous_text=True, 191 | vad_filter=False, 192 | vad_parameters=dict(min_silence_duration_ms=500), 193 | ) 194 | 195 | segments_list = list(segments) 196 | full_text = " ".join(segment.text.strip() for segment in segments_list) 197 | 198 | return stt.SpeechEvent( 199 | type=stt.SpeechEventType.FINAL_TRANSCRIPT, 200 | alternatives=[ 201 | stt.SpeechData( 202 | text=full_text or "", 203 | language=options.language, 204 | ) 205 | ], 206 | ) 207 | 208 | except Exception as e: 209 | logger.error(f"Error in speech recognition: {e}", exc_info=True) 210 | raise APIConnectionError() from e -------------------------------------------------------------------------------- /conversify/models/tts.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from dataclasses import dataclass 3 | from typing import Any, Dict 4 | 5 | import httpx 6 | import openai 7 | 8 | from livekit.agents import ( 9 | APIConnectionError, 10 | APIConnectOptions, 11 | APIStatusError, 12 | APITimeoutError, 13 | tts, 14 | utils, 15 | ) 16 | from livekit.agents.types import ( 17 | DEFAULT_API_CONNECT_OPTIONS, 18 | NOT_GIVEN, 19 | NotGivenOr, 20 | ) 21 | from livekit.agents.utils import is_given 22 | 23 | from .utils import TTSModels, TTSVoices, find_time 24 | 25 | logger = logging.getLogger(__name__) 26 | 27 
| TTS_SAMPLE_RATE = 24000 28 | TTS_CHANNELS = 1 29 | 30 | @dataclass 31 | class KokoroTTSOptions: 32 | """Configuration options for KokoroTTS.""" 33 | model: TTSModels | str 34 | voice: TTSVoices | str 35 | speed: float 36 | 37 | 38 | class KokoroTTS(tts.TTS): 39 | """TTS implementation using Kokoro API.""" 40 | 41 | def __init__( 42 | self, 43 | config: Dict[str, Any], 44 | client: openai.AsyncClient | None = None, 45 | ) -> None: 46 | """Initialize the KokoroTTS instance. 47 | 48 | Args: 49 | client: Optional pre-configured OpenAI AsyncClient 50 | config: Configuration dictionary (from config.yaml) 51 | """ 52 | tts_config = config['tts']['kokoro'] 53 | 54 | model = tts_config['model'] 55 | voice = tts_config['voice'] 56 | speed = tts_config['speed'] 57 | api_key = tts_config['api_key'] 58 | base_url = tts_config['base_url'] 59 | 60 | logger.info(f"Using TTS API URL: {base_url}") 61 | 62 | super().__init__( 63 | capabilities=tts.TTSCapabilities( 64 | streaming=False, 65 | ), 66 | sample_rate=TTS_SAMPLE_RATE, 67 | num_channels=TTS_CHANNELS, 68 | ) 69 | 70 | self._opts = KokoroTTSOptions( 71 | model=model, 72 | voice=voice, 73 | speed=speed, 74 | ) 75 | 76 | self._client = client or openai.AsyncClient( 77 | max_retries=0, 78 | api_key=api_key, 79 | base_url=base_url, 80 | http_client=httpx.AsyncClient( 81 | timeout=httpx.Timeout(connect=15.0, read=5.0, write=5.0, pool=5.0), 82 | follow_redirects=True, 83 | limits=httpx.Limits( 84 | max_connections=50, 85 | max_keepalive_connections=50, 86 | keepalive_expiry=120, 87 | ), 88 | ), 89 | ) 90 | 91 | def update_options( 92 | self, 93 | *, 94 | model: NotGivenOr[TTSModels | str] = NOT_GIVEN, 95 | voice: NotGivenOr[TTSVoices | str] = NOT_GIVEN, 96 | speed: NotGivenOr[float] = NOT_GIVEN, 97 | ) -> None: 98 | """Update TTS options. 99 | 100 | Args: 101 | model: TTS model to use 102 | voice: Voice to use 103 | speed: Speech speed multiplier 104 | """ 105 | if is_given(model): 106 | self._opts.model = model 107 | if is_given(voice): 108 | self._opts.voice = voice 109 | if is_given(speed): 110 | self._opts.speed = speed 111 | 112 | def synthesize( 113 | self, 114 | text: str, 115 | *, 116 | conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS, 117 | ) -> "KokoroTTSStream": 118 | """Synthesize speech from text. 119 | 120 | Args: 121 | text: Text to synthesize 122 | conn_options: Connection options 123 | 124 | Returns: 125 | Stream of audio chunks 126 | """ 127 | return KokoroTTSStream( 128 | tts=self, 129 | input_text=text, 130 | conn_options=conn_options, 131 | opts=self._opts, 132 | client=self._client, 133 | ) 134 | 135 | 136 | class KokoroTTSStream(tts.ChunkedStream): 137 | """Stream implementation for KokoroTTS.""" 138 | 139 | def __init__( 140 | self, 141 | *, 142 | tts: KokoroTTS, 143 | input_text: str, 144 | conn_options: APIConnectOptions, 145 | opts: KokoroTTSOptions, 146 | client: openai.AsyncClient, 147 | ) -> None: 148 | """Initialize the stream. 
149 | 150 | Args: 151 | tts: TTS instance 152 | input_text: Text to synthesize 153 | conn_options: Connection options 154 | opts: TTS options 155 | client: OpenAI AsyncClient 156 | """ 157 | super().__init__(tts=tts, input_text=input_text, conn_options=conn_options) 158 | self._client = client 159 | self._opts = opts 160 | 161 | async def _run(self): 162 | """Run the TTS synthesis.""" 163 | oai_stream = self._client.audio.speech.with_streaming_response.create( 164 | input=self.input_text, 165 | model=self._opts.model, 166 | voice=self._opts.voice, 167 | response_format="pcm", # raw pcm buffers 168 | speed=self._opts.speed, 169 | timeout=httpx.Timeout(30, connect=self._conn_options.timeout), 170 | ) 171 | 172 | request_id = utils.shortuuid() 173 | 174 | audio_bstream = utils.audio.AudioByteStream( 175 | sample_rate=TTS_SAMPLE_RATE, 176 | num_channels=TTS_CHANNELS, 177 | ) 178 | 179 | logger.info(f"Kokoro -> converting text to audio") 180 | 181 | try: 182 | with find_time('TTS_inferencing'): 183 | async with oai_stream as stream: 184 | async for data in stream.iter_bytes(): 185 | for frame in audio_bstream.write(data): 186 | self._event_ch.send_nowait( 187 | tts.SynthesizedAudio( 188 | frame=frame, 189 | request_id=request_id, 190 | ) 191 | ) 192 | # Flush any remaining data in the buffer 193 | for frame in audio_bstream.flush(): 194 | self._event_ch.send_nowait( 195 | tts.SynthesizedAudio( 196 | frame=frame, 197 | request_id=request_id, 198 | ) 199 | ) 200 | 201 | except openai.APITimeoutError: 202 | raise APITimeoutError() 203 | except openai.APIStatusError as e: 204 | raise APIStatusError( 205 | e.message, 206 | status_code=e.status_code, 207 | request_id=e.request_id, 208 | body=e.body, 209 | ) 210 | except Exception as e: 211 | raise APIConnectionError() from e -------------------------------------------------------------------------------- /conversify/models/utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | from dataclasses import dataclass, field 3 | from typing import Literal, Any 4 | import time 5 | import logging 6 | import os 7 | from collections import OrderedDict 8 | 9 | import base64 10 | 11 | from livekit.agents import llm 12 | from livekit.agents.log import logger 13 | from openai.types.chat import ( 14 | ChatCompletionContentPartParam, 15 | ChatCompletionMessageParam, 16 | ChatCompletionToolParam, 17 | ) 18 | 19 | # Speech-to-Text model types 20 | WhisperModels = Literal[ 21 | "deepdml/faster-whisper-large-v3-turbo-ct2", 22 | ] 23 | 24 | # Text-to-Speech model types 25 | TTSModels = Literal[ 26 | "tts-1", 27 | ] 28 | 29 | # Text-to-Speech voice options 30 | TTSVoices = Literal[ 31 | "af_heart", 32 | "af_bella" 33 | ] 34 | 35 | # Chat model types 36 | ChatModels = Literal[ 37 | 'gpt' 38 | ] 39 | 40 | 41 | class find_time: 42 | """A context manager for timing code execution and logging the elapsed time.""" 43 | def __init__(self, label: str): 44 | """Initialize a timer with a descriptive label. 
45 | 46 | Args: 47 | label: A descriptive name for what is being timed 48 | """ 49 | self.label = label 50 | self.start_time = None 51 | 52 | def __enter__(self): 53 | self.start_time = time.perf_counter() 54 | return self 55 | 56 | def __exit__(self, exc_type, exc_value, traceback): 57 | end_time = time.perf_counter() 58 | elapsed_ms = (end_time - self.start_time) * 1000 59 | logging.debug(f"{self.label} took {elapsed_ms:.4f} ms") 60 | 61 | 62 | def to_fnc_ctx(fnc_ctx: list[llm.FunctionTool]) -> list[ChatCompletionToolParam]: 63 | """Convert LiveKit function tools to OpenAI tool parameters. 64 | 65 | Args: 66 | fnc_ctx: List of LiveKit function tools 67 | 68 | Returns: 69 | List of OpenAI tool parameters 70 | """ 71 | return [llm.utils.build_strict_openai_schema(fnc) for fnc in fnc_ctx] 72 | 73 | 74 | @dataclass 75 | class _ChatItemGroup: 76 | """Groups related chat items for conversion to OpenAI chat items.""" 77 | message: llm.ChatMessage | None = None 78 | tool_calls: list[llm.FunctionCall] = field(default_factory=list) 79 | tool_outputs: list[llm.FunctionCallOutput] = field(default_factory=list) 80 | 81 | def add(self, item: llm.ChatItem) -> _ChatItemGroup: 82 | """Add a chat item to this group. 83 | 84 | Args: 85 | item: Chat item to add 86 | 87 | Returns: 88 | This item group for chaining 89 | """ 90 | if item.type == "message": 91 | assert self.message is None, "only one message is allowed in a group" 92 | self.message = item 93 | elif item.type == "function_call": 94 | self.tool_calls.append(item) 95 | elif item.type == "function_call_output": 96 | self.tool_outputs.append(item) 97 | return self 98 | 99 | def to_chat_items(self, cache_key: Any) -> list[ChatCompletionMessageParam]: 100 | """Convert this group to OpenAI chat items. 101 | 102 | Args: 103 | cache_key: Cache key for image caching 104 | 105 | Returns: 106 | List of OpenAI chat items 107 | """ 108 | tool_calls = {tool_call.call_id: tool_call for tool_call in self.tool_calls} 109 | tool_outputs = {tool_output.call_id: tool_output for tool_output in self.tool_outputs} 110 | 111 | valid_tools = set(tool_calls.keys()) & set(tool_outputs.keys()) 112 | # remove invalid tool calls and tool outputs 113 | if len(tool_calls) != len(valid_tools) or len(tool_outputs) != len(valid_tools): 114 | for tool_call in self.tool_calls: 115 | if tool_call.call_id not in valid_tools: 116 | logger.warning( 117 | "function call missing the corresponding function output, ignoring", 118 | extra={"call_id": tool_call.call_id, "tool_name": tool_call.name}, 119 | ) 120 | tool_calls.pop(tool_call.call_id) 121 | 122 | for tool_output in self.tool_outputs: 123 | if tool_output.call_id not in valid_tools: 124 | logger.warning( 125 | "function output missing the corresponding function call, ignoring", 126 | extra={"call_id": tool_output.call_id, "tool_name": tool_output.name}, 127 | ) 128 | tool_outputs.pop(tool_output.call_id) 129 | 130 | if not self.message and not tool_calls and not tool_outputs: 131 | return [] 132 | 133 | msg = ( 134 | _to_chat_item(self.message, cache_key) 135 | if self.message 136 | else {"role": "assistant", "tool_calls": []} 137 | ) 138 | if tool_calls: 139 | msg.setdefault("tool_calls", []) 140 | for tool_call in tool_calls.values(): 141 | msg["tool_calls"].append( 142 | { 143 | "id": tool_call.call_id, 144 | "type": "function", 145 | "function": {"name": tool_call.name, "arguments": tool_call.arguments}, 146 | } 147 | ) 148 | items = [msg] 149 | for tool_output in tool_outputs.values(): 150 | 
items.append(_to_chat_item(tool_output, cache_key)) 151 | return items 152 | 153 | 154 | def to_chat_ctx(chat_ctx: llm.ChatContext, cache_key: Any) -> list[ChatCompletionMessageParam]: 155 | """Convert a LiveKit chat context to OpenAI chat messages. 156 | 157 | OpenAI requires the tool calls to be followed by the corresponding tool outputs. 158 | We group them first and remove invalid tool calls and outputs before converting. 159 | 160 | Args: 161 | chat_ctx: LiveKit chat context 162 | cache_key: Cache key for image caching 163 | 164 | Returns: 165 | List of OpenAI chat messages 166 | """ 167 | item_groups: dict[str, _ChatItemGroup] = OrderedDict() # item_id to group of items 168 | tool_outputs: list[llm.FunctionCallOutput] = [] 169 | for item in chat_ctx.items: 170 | if (item.type == "message" and item.role == "assistant") or item.type == "function_call": 171 | # only assistant messages and function calls can be grouped 172 | group_id = item.id.split("/")[0] 173 | if group_id not in item_groups: 174 | item_groups[group_id] = _ChatItemGroup().add(item) 175 | else: 176 | item_groups[group_id].add(item) 177 | elif item.type == "function_call_output": 178 | tool_outputs.append(item) 179 | else: 180 | item_groups[item.id] = _ChatItemGroup().add(item) 181 | 182 | # add tool outputs to their corresponding groups 183 | call_id_to_group: dict[str, _ChatItemGroup] = { 184 | tool_call.call_id: group for group in item_groups.values() for tool_call in group.tool_calls 185 | } 186 | for tool_output in tool_outputs: 187 | if tool_output.call_id not in call_id_to_group: 188 | logger.warning( 189 | "function output missing the corresponding function call, ignoring", 190 | extra={"call_id": tool_output.call_id, "tool_name": tool_output.name}, 191 | ) 192 | continue 193 | 194 | call_id_to_group[tool_output.call_id].add(tool_output) 195 | 196 | messages = [] 197 | for group in item_groups.values(): 198 | messages.extend(group.to_chat_items(cache_key)) 199 | return messages 200 | 201 | 202 | def _to_chat_item(msg: llm.ChatItem, cache_key: Any) -> ChatCompletionMessageParam: 203 | """Convert a LiveKit chat item to an OpenAI chat message. 204 | 205 | Args: 206 | msg: LiveKit chat item 207 | cache_key: Cache key for image caching 208 | 209 | Returns: 210 | OpenAI chat message 211 | """ 212 | if msg.type == "message": 213 | list_content: list[ChatCompletionContentPartParam] = [] 214 | text_content = "" 215 | for content in msg.content: 216 | if isinstance(content, str): 217 | if text_content: 218 | text_content += "\n" 219 | text_content += content 220 | elif isinstance(content, llm.ImageContent): 221 | list_content.append(_to_image_content(content, cache_key)) 222 | 223 | if not list_content: 224 | # certain providers require text-only content in a string vs a list. 225 | # for max-compatibility, we will combine all text content into a single string. 
226 | return { 227 | "role": msg.role, # type: ignore 228 | "content": text_content, 229 | } 230 | 231 | if text_content: 232 | list_content.append({"type": "text", "text": text_content}) 233 | 234 | return { 235 | "role": msg.role, # type: ignore 236 | "content": list_content, 237 | } 238 | 239 | elif msg.type == "function_call": 240 | return { 241 | "role": "assistant", 242 | "tool_calls": [ 243 | { 244 | "id": msg.call_id, 245 | "type": "function", 246 | "function": { 247 | "name": msg.name, 248 | "arguments": msg.arguments, 249 | }, 250 | } 251 | ], 252 | } 253 | 254 | elif msg.type == "function_call_output": 255 | return { 256 | "role": "tool", 257 | "tool_call_id": msg.call_id, 258 | "content": msg.output, 259 | } 260 | 261 | 262 | def _to_image_content(image: llm.ImageContent, cache_key: Any) -> ChatCompletionContentPartParam: 263 | """Convert a LiveKit image to an OpenAI image content part. 264 | 265 | Args: 266 | image: LiveKit image content 267 | cache_key: Cache key for image caching 268 | 269 | Returns: 270 | OpenAI image content part 271 | """ 272 | img = llm.utils.serialize_image(image) 273 | if img.external_url: 274 | return { 275 | "type": "image_url", 276 | "image_url": { 277 | "url": img.external_url, 278 | "detail": img.inference_detail, 279 | }, 280 | } 281 | if cache_key not in image._cache: 282 | image._cache[cache_key] = img.data_bytes 283 | b64_data = base64.b64encode(image._cache[cache_key]).decode("utf-8") 284 | return { 285 | "type": "image_url", 286 | "image_url": { 287 | "url": f"data:{img.mime_type};base64,{b64_data}", 288 | "detail": img.inference_detail, 289 | }, 290 | } -------------------------------------------------------------------------------- /conversify/prompts/llm.txt: -------------------------------------------------------------------------------- 1 | You are Conversify, a helpful assistant integrated into a conversation app. You have a friendly, slightly quirky personality and enjoy assisting users efficiently. Your primary task is to fully understand the user's query and deliver short, clear, and satisfying responses. 2 | 3 | **Interaction Style:** 4 | * Respond in a warm, approachable, and friendly manner, reflecting your slightly quirky personality. 5 | * **Brevity is key:** Keep your responses **extremely concise** and directly to the point. Focus *only* on answering the user's query without extra information. 6 | * Avoid lengthy explanations or paragraphs. Prefer short sentences. 7 | * Ensure your language is engaging and simple, suitable for conversion to speech. 8 | * Address the user's query accurately, avoiding unnecessary repetition or irrelevant details. 9 | 10 | **Capabilities & Behavior:** 11 | * You have access to visual input; use it when relevant to understand the user's query or context. 12 | * Consult the conversation history to understand the context and provide relevant, non-repetitive answers based on past interactions. If a user asks about something discussed previously, use that context. 13 | * Do not provide information or answers about topics (such as current news headlines) that are outside your knowledge or toolset. 14 | * Avoid repeating preset phrases. 15 | * If further context is genuinely needed to clarify a query *after* checking the history, ask briefly without derailing the conversation. 16 | 17 | **Input Handling:** 18 | * The user's input is transcribed from speech using a speech-to-text system and may be imperfect. If something seems unusual or grammatically incorrect, assume it might be a mistranscription. 
19 | * Do your best to infer the words the user actually spoke based on the context of the conversation and potentially the visual input. 20 | * If you are unable to reasonably infer what the user actually said, tell them you might have misheard and politely ask for clarification. 21 | 22 | **Objective:** 23 | * Provide helpful, correct, and satisfying answers that directly support the user's needs, leveraging your text understanding, visual input, and conversation history, leaving the user informed and content with the interaction. -------------------------------------------------------------------------------- /conversify/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /conversify/utils/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | import yaml 3 | import logging 4 | from typing import Dict, Any 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | class ConfigManager: 9 | """ 10 | Manages configuration loading, parsing, path resolution, 11 | and prompt loading for the Conversify application. 12 | """ 13 | 14 | def __init__(self, config_path: str = 'config.yaml'): 15 | """Initialize the ConfigManager with a path to the config file.""" 16 | self.config_path = config_path 17 | self.config: Dict[str, Any] = {} 18 | self.project_root = self._get_project_root() 19 | 20 | def _get_project_root(self) -> str: 21 | """Get the absolute path to the project root directory.""" 22 | # Assuming this file is in 'utils' subdirectory of the project root 23 | return os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 24 | 25 | def _resolve_path(self, relative_path: str) -> str: 26 | """Convert a relative path to an absolute path based on project root.""" 27 | return os.path.abspath(os.path.join(self.project_root, relative_path)) 28 | 29 | def _load_yaml_config(self) -> Dict[str, Any]: 30 | """Load the YAML configuration file.""" 31 | abs_config_path = self._resolve_path(self.config_path) 32 | logger.info(f"Loading configuration from: {abs_config_path}") 33 | 34 | try: 35 | with open(abs_config_path, 'r', encoding='utf-8') as f: 36 | config = yaml.safe_load(f) 37 | if not isinstance(config, dict): 38 | raise ValueError("Configuration file does not contain a valid YAML dictionary") 39 | logger.info(f"Configuration loaded successfully from {abs_config_path}") 40 | return config 41 | except Exception as e: 42 | logger.error(f"Error loading YAML configuration {abs_config_path}: {e}") 43 | raise 44 | 45 | def _load_prompt(self, prompt_path: str) -> str: 46 | """Load prompt content from a file.""" 47 | abs_prompt_path = self._resolve_path(prompt_path) 48 | logger.info(f"Loading prompt from: {abs_prompt_path}") 49 | 50 | try: 51 | with open(abs_prompt_path, 'r', encoding='utf-8') as f: 52 | content = f.read().strip() 53 | logger.info(f"Prompt loaded successfully from {abs_prompt_path}") 54 | return content 55 | except Exception as e: 56 | logger.error(f"Error loading prompt from {abs_prompt_path}: {e}") 57 | raise 58 | 59 | def _resolve_paths_in_config(self) -> None: 60 | """ 61 | Resolve all relative paths in the configuration to absolute paths. 62 | Also loads any file content that needs to be loaded (e.g., prompts). 
63 | """ 64 | # Load agent instructions 65 | agent_cfg = self.config['agent'] 66 | prompt_file = agent_cfg['instructions_file'] 67 | agent_cfg['instructions'] = self._load_prompt(prompt_file) 68 | 69 | # Resolve memory directory 70 | memory_cfg = self.config['memory'] 71 | if memory_cfg.get('use', False): 72 | memory_dir_rel = memory_cfg['dir'] 73 | memory_dir_abs = self._resolve_path(memory_dir_rel) 74 | memory_cfg['dir_abs'] = memory_dir_abs 75 | logger.info(f"Memory enabled. Directory path: {memory_dir_abs}") 76 | else: 77 | logger.info("Memory usage is disabled in config.") 78 | 79 | # Handle STT paths - check if they should be absolute or need resolution 80 | stt_cfg = self.config.get('stt', {}) 81 | whisper_cfg = stt_cfg['whisper'] 82 | 83 | # Check if model_cache_directory is relative and needs resolution 84 | if 'model_cache_directory' in whisper_cfg and not os.path.isabs(whisper_cfg['model_cache_directory']): 85 | whisper_cfg['model_cache_directory'] = self._resolve_path(whisper_cfg['model_cache_directory']) 86 | 87 | # Check if warmup_audio is relative and needs resolution 88 | if 'warmup_audio' in whisper_cfg and not os.path.isabs(whisper_cfg['warmup_audio']): 89 | whisper_cfg['warmup_audio'] = self._resolve_path(whisper_cfg['warmup_audio']) 90 | 91 | # Add logging path resolution 92 | logging_cfg = self.config['logging'] 93 | log_file_rel = logging_cfg['file'] 94 | if not os.path.isabs(log_file_rel): 95 | logging_cfg['file_abs'] = self._resolve_path(log_file_rel) 96 | 97 | def load_config(self) -> Dict[str, Any]: 98 | """ 99 | Load and process the configuration file. 100 | Returns the processed configuration dictionary. 101 | """ 102 | self.config = self._load_yaml_config() 103 | self._resolve_paths_in_config() 104 | logger.info("Configuration processed successfully.") 105 | return self.config -------------------------------------------------------------------------------- /conversify/utils/logger.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import sys 4 | from typing import Dict, Any 5 | 6 | # Sentinel to prevent multiple configurations 7 | _logging_configured = False 8 | 9 | DEFAULT_LOG_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s' 10 | DEFAULT_LOG_LEVEL = "INFO" 11 | 12 | def setup_logging(config: Dict[str, Any], project_root: str): 13 | """Configures the root logger based on settings from the config dictionary. 14 | 15 | Args: 16 | config: The loaded configuration dictionary. 17 | project_root: The absolute path to the project's root directory. 18 | """ 19 | global _logging_configured 20 | if _logging_configured: 21 | return 22 | 23 | # --- Extract Settings from Config --- 24 | log_cfg = config.get('logging', {}) 25 | log_level_str = log_cfg.get('level', DEFAULT_LOG_LEVEL) 26 | log_file_rel = log_cfg.get('file') # Path relative to project root 27 | 28 | log_file_abs = None 29 | if log_file_rel: 30 | # Resolve relative path using the provided project_root 31 | log_file_abs = os.path.abspath(os.path.join(project_root, log_file_rel)) 32 | 33 | # --- Get Log Level --- 34 | level = logging.getLevelName(log_level_str.upper()) 35 | if not isinstance(level, int): 36 | print(f"Warning: Invalid log level '{log_level_str}' in config. 
Defaulting to {DEFAULT_LOG_LEVEL}.", file=sys.stderr) 37 | level = logging.INFO 38 | 39 | # --- Create Formatter --- 40 | formatter = logging.Formatter(DEFAULT_LOG_FORMAT) 41 | 42 | # --- Get Root Logger --- 43 | root_logger = logging.getLogger() 44 | root_logger.setLevel(level) 45 | 46 | # --- Clear Existing Handlers --- 47 | for handler in root_logger.handlers[:]: 48 | root_logger.removeHandler(handler) 49 | handler.close() 50 | 51 | # --- Setup Console Handler --- 52 | console_handler = logging.StreamHandler(sys.stdout) 53 | console_handler.setLevel(level) 54 | console_handler.setFormatter(formatter) 55 | root_logger.addHandler(console_handler) 56 | 57 | # --- Setup File Handler (If specified) --- 58 | if log_file_abs: 59 | try: 60 | # Ensure log directory exists 61 | log_dir = os.path.dirname(log_file_abs) 62 | if log_dir and not os.path.exists(log_dir): 63 | os.makedirs(log_dir) 64 | print(f"Created log directory: {log_dir}", file=sys.stderr) 65 | 66 | file_handler = logging.FileHandler(log_file_abs, mode='a') 67 | file_handler.setLevel(level) 68 | file_handler.setFormatter(formatter) 69 | root_logger.addHandler(file_handler) 70 | print(f"Logging configured. Level: {log_level_str.upper()}, File: {log_file_abs}", file=sys.stderr) 71 | except Exception as e: 72 | print(f"Error setting up file logging to {log_file_abs}: {e}", file=sys.stderr) 73 | # Continue with console logging only 74 | else: 75 | print(f"Logging configured. Level: {log_level_str.upper()}, Console only.", file=sys.stderr) 76 | 77 | _logging_configured = True 78 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | addict==2.4.0 2 | aiofiles==24.1.0 3 | aiohappyeyeballs==2.4.3 4 | aiohttp==3.10.10 5 | aiosignal==1.3.1 6 | airportsdata==20250224 7 | annotated-types==0.7.0 8 | anyio==4.9.0 9 | astor==0.8.1 10 | attrs==24.2.0 11 | av==14.2.0 12 | babel==2.17.0 13 | blake3==1.0.4 14 | blis==1.2.1 15 | cachetools==5.5.2 16 | catalogue==2.0.10 17 | certifi==2025.1.31 18 | cffi==1.17.1 19 | charset-normalizer==3.4.0 20 | click==8.1.8 21 | cloudpathlib==0.21.0 22 | cloudpickle==3.1.1 23 | cn2an==0.5.23 24 | colorama==0.4.6 25 | coloredlogs==15.0.1 26 | compressed-tensors==0.9.2 27 | confection==0.1.5 28 | contourpy==1.3.1 29 | csvw==3.5.1 30 | ctranslate2==4.4.0 31 | cupy-cuda12x==13.4.1 32 | curated-tokenizers==0.0.9 33 | curated-transformers==0.1.1 34 | cycler==0.12.1 35 | cymem==2.0.11 36 | depyf==0.18.0 37 | dill==0.3.9 38 | diskcache==5.6.3 39 | distro==1.9.0 40 | dlinfo==2.0.0 41 | dnspython==2.7.0 42 | docopt==0.6.2 43 | docstring_parser==0.16 44 | einops==0.8.1 45 | email_validator==2.2.0 46 | en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl 47 | espeakng-loader==0.2.4 48 | eval_type_backport==0.2.2 49 | faiss-cpu==1.8.0.post1 50 | fastapi==0.115.6 51 | fastapi-cli==0.0.7 52 | faster-whisper==1.1.1 53 | fastrlock==0.8.3 54 | filelock==3.18.0 55 | flatbuffers==25.2.10 56 | fonttools==4.57.0 57 | frozenlist==1.5.0 58 | fsspec==2025.3.2 59 | fugashi==1.4.0 60 | gguf==0.10.0 61 | google-api-core==2.24.2 62 | google-auth==2.38.0 63 | google-cloud-speech==2.31.1 64 | google-cloud-texttospeech==2.25.1 65 | google-genai==1.3.0 66 | googleapis-common-protos==1.69.2 67 | greenlet==3.1.1 68 | grpcio==1.71.0 69 | grpcio-status==1.71.0 70 | h11==0.14.0 71 | hf-xet==1.0.3 72 | httpcore==1.0.8 73 | httptools==0.6.4 74 
| httpx==0.28.1 75 | huggingface-hub==0.30.1 76 | humanfriendly==10.0 77 | idna==3.10 78 | importlib_metadata==8.6.1 79 | inflect==7.5.0 80 | interegular==0.3.3 81 | isodate==0.7.2 82 | jaconv==0.4.0 83 | jamo==0.4.1 84 | jieba==0.42.1 85 | Jinja2==3.1.6 86 | jiter==0.7.0 87 | joblib==1.4.2 88 | jsonpatch==1.33 89 | jsonpointer==3.0.0 90 | jsonschema==4.23.0 91 | jsonschema-specifications==2024.10.1 92 | kiwisolver==1.4.8 93 | kokoro==0.9.2 94 | -e git+https://github.com/remsky/Kokoro-FastAPI.git@f1fa3404940e1422010b9be19f9b4996619e5e57#egg=kokoro_fastapi 95 | langchain==0.3.7 96 | langchain-core==0.3.15 97 | langchain-ollama==0.2.0 98 | langchain-openai==0.2.5 99 | langchain-text-splitters==0.3.2 100 | langcodes==3.5.0 101 | langsmith==0.1.139 102 | language-tags==1.2.0 103 | language_data==1.3.0 104 | lark==1.2.2 105 | livekit==1.0.6 106 | livekit-agents==1.0.13 107 | livekit-api==1.0.2 108 | livekit-plugins-cartesia==0.4.11 109 | livekit-plugins-deepgram==1.0.13 110 | livekit-plugins-google==0.11.2 111 | livekit-plugins-noise-cancellation==0.2.1 112 | livekit-plugins-openai==1.0.13 113 | livekit-plugins-silero==1.0.13 114 | livekit-plugins-turn-detector==1.0.13 115 | livekit-protocol==1.0.2 116 | llguidance==0.7.14 117 | llvmlite==0.44.0 118 | lm-format-enforcer==0.10.11 119 | loguru==0.7.3 120 | marisa-trie==1.2.1 121 | markdown-it-py==3.0.0 122 | MarkupSafe==3.0.2 123 | matplotlib==3.10.1 124 | mdurl==0.1.2 125 | memoripy==0.1.2 126 | misaki==0.9.3 127 | mistral_common==1.5.4 128 | mojimoji==0.0.13 129 | more-itertools==10.6.0 130 | mpmath==1.3.0 131 | msgpack==1.1.0 132 | msgspec==0.19.0 133 | multidict==6.1.0 134 | munch==4.0.0 135 | murmurhash==1.0.12 136 | mutagen==1.47.0 137 | nanobind==2.6.1 138 | nest-asyncio==1.6.0 139 | networkx==3.4.2 140 | ninja==1.11.1.4 141 | nltk==3.9.1 142 | num2words==0.5.14 143 | numba==0.61.0 144 | numpy==1.26.4 145 | nvidia-cublas-cu12==12.4.5.8 146 | nvidia-cuda-cupti-cu12==12.4.127 147 | nvidia-cuda-nvrtc-cu12==12.4.127 148 | nvidia-cuda-runtime-cu12==12.4.127 149 | nvidia-cudnn-cu12==9.1.0.70 150 | nvidia-cufft-cu12==11.2.1.3 151 | nvidia-curand-cu12==10.3.5.147 152 | nvidia-cusolver-cu12==11.6.1.9 153 | nvidia-cusparse-cu12==12.3.1.170 154 | nvidia-cusparselt-cu12==0.6.2 155 | nvidia-nccl-cu12==2.21.5 156 | nvidia-nvjitlink-cu12==12.4.127 157 | nvidia-nvtx-cu12==12.4.127 158 | ollama==0.3.3 159 | onnxruntime==1.21.0 160 | openai==1.74.1 161 | opencv-python-headless==4.11.0.86 162 | ordered-set==4.1.0 163 | orjson==3.10.11 164 | outlines==0.1.11 165 | outlines_core==0.1.26 166 | packaging==24.1 167 | partial-json-parser==0.2.1.1.post5 168 | phonemizer-fork==3.3.2 169 | pillow==11.1.0 170 | preshed==3.0.9 171 | proces==0.1.7 172 | prometheus-fastapi-instrumentator==7.1.0 173 | prometheus_client==0.21.1 174 | propcache==0.2.0 175 | proto-plus==1.26.1 176 | protobuf==5.29.4 177 | psutil==7.0.0 178 | py-cpuinfo==9.0.0 179 | pyasn1==0.6.1 180 | pyasn1_modules==0.4.2 181 | pycountry==24.6.1 182 | pycparser==2.22 183 | pydantic==2.10.4 184 | pydantic-settings==2.7.0 185 | pydantic_core==2.27.2 186 | pydub==0.25.1 187 | Pygments==2.19.1 188 | PyJWT==2.10.1 189 | pyopenjtalk-plus==0.3.4.post11 190 | pyparsing==3.2.3 191 | pypinyin==0.54.0 192 | pypinyin-dict==0.9.0 193 | python-dateutil==2.9.0.post0 194 | python-dotenv==1.0.1 195 | python-json-logger==3.3.0 196 | python-multipart==0.0.20 197 | PyYAML==6.0.2 198 | pyzmq==26.4.0 199 | ray==2.43.0 200 | rdflib==7.1.4 201 | referencing==0.36.2 202 | regex==2024.11.6 203 | requests==2.32.3 204 | 
requests-toolbelt==1.0.0 205 | rfc3986==1.5.0 206 | rich==14.0.0 207 | rich-toolkit==0.14.1 208 | rpds-py==0.24.0 209 | rsa==4.9 210 | safetensors==0.5.3 211 | scikit-learn==1.5.2 212 | scipy==1.14.1 213 | segments==2.3.0 214 | sentencepiece==0.2.0 215 | shellingham==1.5.4 216 | six==1.17.0 217 | smart-open==7.1.0 218 | sniffio==1.3.1 219 | sounddevice==0.5.1 220 | soundfile==0.13.0 221 | spacy==3.8.5 222 | spacy-curated-transformers==0.3.0 223 | spacy-legacy==3.0.12 224 | spacy-loggers==1.0.5 225 | SQLAlchemy==2.0.27 226 | srsly==2.5.1 227 | starlette==0.41.3 228 | SudachiDict-core==20250129 229 | SudachiPy==0.6.10 230 | sympy==1.13.1 231 | tenacity==9.0.0 232 | text2num==2.5.1 233 | thinc==8.3.4 234 | threadpoolctl==3.5.0 235 | tiktoken==0.8.0 236 | tokenizers==0.21.1 237 | torch==2.6.0+cu124 238 | torchaudio==2.6.0 239 | torchvision==0.21.0 240 | tqdm==4.67.1 241 | transformers==4.51.2 242 | triton==3.2.0 243 | typeguard==4.4.2 244 | typer==0.15.2 245 | types-protobuf==4.25.0.20240417 246 | typing-inspection==0.4.0 247 | typing_extensions==4.13.2 248 | unidic-lite==1.0.8 249 | uritemplate==4.1.1 250 | urllib3==2.2.3 251 | uv==0.6.12 252 | uvicorn==0.34.0 253 | uvloop==0.21.0 254 | vllm==0.8.3 255 | wasabi==1.1.3 256 | watchfiles==1.0.4 257 | weasel==0.4.1 258 | websockets==14.2 259 | wrapt==1.17.2 260 | xformers==0.0.29.post2 261 | xgrammar==0.1.17 262 | yarl==1.17.1 263 | zipp==3.21.0 264 | -------------------------------------------------------------------------------- /scripts/run_app.sh: -------------------------------------------------------------------------------- 1 | python -m conversify.main start -------------------------------------------------------------------------------- /scripts/run_kokoro.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Get project root directory 4 | cd ../Kokoro-FastAPI 5 | PROJECT_ROOT=$(pwd) 6 | 7 | # Set environment variables 8 | export USE_GPU=true 9 | export USE_ONNX=false 10 | export PYTHONPATH=$PROJECT_ROOT:$PROJECT_ROOT/api 11 | export MODEL_DIR=src/models 12 | export VOICES_DIR=src/voices/v1_0 13 | export WEB_PLAYER_PATH=$PROJECT_ROOT/web 14 | 15 | # Run FastAPI with GPU extras using uv run 16 | # Note: espeak may still require manual installation, 17 | uv pip install -e ".[gpu]" 18 | uv run --no-sync python docker/scripts/download_model.py --output api/src/models/v1_0 19 | uv run --no-sync uvicorn api.src.main:app --host 0.0.0.0 --port 8880 -------------------------------------------------------------------------------- /scripts/run_llm.sh: -------------------------------------------------------------------------------- 1 | python3 -m sglang.launch_server \ 2 | --model-path Qwen/Qwen2.5-VL-7B-Instruct-AWQ \ 3 | --chat-template=qwen2-vl \ 4 | --mem-fraction-static=0.6 \ 5 | --tool-call-parser qwen25 --------------------------------------------------------------------------------
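
Note on configuration: every component above (main.py, OpenaiLLM, WhisperSTT, KokoroTTS, ConfigManager, setup_logging) is driven by the single dictionary returned by ConfigManager.load_config(), parsed from config.yaml. The Python sketch below only illustrates the key layout those modules read; it is an assumption provided for reference, not the repository's actual config.yaml, and every value (URLs, ports, paths, model names, thresholds) is a placeholder to adapt.

# Illustrative sketch only: key names mirror what the modules above read
# (config['llm'], config['stt']['whisper'], config['tts']['kokoro'], ...);
# all values are assumptions, not the project's shipped defaults.
example_config = {
    "agent": {
        "env_file": ".env",                               # LiveKit credentials (assumed path)
        "instructions_file": "conversify/prompts/llm.txt",
        "use_eou": True,                                  # enable end-of-utterance turn detection
        "use_background_noise_removal": False,
        "use_background_audio": False,
    },
    "vision": {"use": False, "video_frame_interval": 1.0},
    "llm": {
        "model": "Qwen/Qwen2.5-VL-7B-Instruct-AWQ",       # model launched by scripts/run_llm.sh
        "api_key": "not-needed",                          # local OpenAI-compatible server
        "base_url": "http://localhost:30000/v1",          # assumed local server URL
        "temperature": 0.7,
        "parallel_tool_calls": False,
        "tool_choice": "auto",
    },
    "stt": {
        "whisper": {
            "language": "en",
            "model": "deepdml/faster-whisper-large-v3-turbo-ct2",
            "device": "cuda",
            "compute_type": "float16",
            "model_cache_directory": "conversify/data/models_cache",
            "warmup_audio": "conversify/data/warmup_audio.wav",
        }
    },
    "tts": {
        "kokoro": {
            "model": "tts-1",
            "voice": "af_heart",
            "speed": 1.0,
            "api_key": "not-needed",
            "base_url": "http://localhost:8880/v1",       # port used by scripts/run_kokoro.sh
        }
    },
    "memory": {"use": False, "dir": "conversify/data/memory_store"},
    "logging": {"level": "INFO", "file": "logs/conversify.log"},
    "worker": {
        "job_memory_warn_mb": 1500,
        "load_threshold": 0.75,
        "job_memory_limit_mb": 3000,
    },
}

# With a dictionary of this shape, e.g. WhisperSTT(config=example_config) or
# KokoroTTS(config=example_config) would each read only their own sub-section.

In the real application these settings live in config.yaml; ConfigManager.load_config() parses them and resolves the relative paths (instructions file, warmup audio, model cache, memory directory, log file) against the project root before they reach the components.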