├── frames
│   └── frame.jpg
├── requirements.txt
├── templates
│   └── index.html
├── static
│   ├── css
│   │   └── styles.css
│   └── js
│       └── script.js
├── README.md
└── app.py

--------------------------------------------------------------------------------
/frames/frame.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Doriandarko/DIY-Astra/HEAD/frames/frame.jpg
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
flask
flask-socketio
opencv-python
pydub
pillow
requests
--------------------------------------------------------------------------------
/templates/index.html:
--------------------------------------------------------------------------------
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Live Camera Feed</title>
    <link rel="stylesheet" href="{{ url_for('static', filename='css/styles.css') }}">
</head>
<body>
    <div class="container">
        <div class="video-container">
            <img id="video" alt="Live camera feed">
        </div>
        <div class="text-container" id="text-container"></div>
        <div class="control-container">
            <div class="interval-container">
                <input type="number" id="interval-input" min="1" placeholder="Interval (s)">
                <button onclick="updateInterval()">Set Interval</button>
            </div>
            <button id="control-button" onclick="toggleApp()">⏹ Stop</button>
        </div>
    </div>
    <script src="https://cdn.socket.io/4.7.2/socket.io.min.js"></script>
    <script src="{{ url_for('static', filename='js/script.js') }}"></script>
</body>
</html>
--------------------------------------------------------------------------------
/static/css/styles.css:
--------------------------------------------------------------------------------
body {
    display: flex;
    justify-content: center;
    align-items: center;
    height: 100vh;
    margin: 0;
    font-family: Arial, sans-serif;
    background-color: #f0f0f0;
}

.container {
    display: flex;
    flex-direction: column;
    align-items: center;
    background-color: white;
    border-radius: 10px;
    box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
    padding: 20px;
}

.video-container {
    width: 640px;
    height: 480px;
    background: #000;
    display: flex;
    justify-content: center;
    align-items: center;
    border-radius: 10px;
    overflow: hidden;
}

#video {
    width: 100%;
    height: 100%;
    object-fit: cover;
}

.text-container {
    width: 640px;
    height: 200px;
    background: #f0f0f0; /* grey */
    overflow-y: auto;
    margin-top: 20px;
    border: 1px solid #ddd;
    padding: 10px;
    border-radius: 10px;
    color: rgb(35, 35, 35);
    font-size: 16px;
}

.message {
    margin-bottom: 10px; /* Add vertical space between messages */
    white-space: pre-line; /* Preserve line breaks and spaces */
}

.control-container {
    display: flex;
    justify-content: space-between;
    align-items: center; /* Align items vertically centered */
    width: 640px;
    margin-top: 20px;
}

.interval-container {
    display: flex;
    align-items: center;
}

#interval-input {
    padding-left: 12px;
    font-size: 16px;
    margin-right: 10px;
    border: 1px solid #ddd;
    border-radius: 20px;
    height: 42px; /* Same height as the buttons */
}

button {
    padding: 10px 20px;
    font-size: 16px;
    background-color: black;
    color: white;
    border: none;
    border-radius: 20px;
    cursor: pointer;
    display: flex;
    align-items: center;
    justify-content: center;
    height: 42px; /* Ensure the buttons have the same height */
}

button:hover {
    background-color: #333;
}

button span {
    margin-left: 10px;
}
--------------------------------------------------------------------------------
/static/js/script.js:
--------------------------------------------------------------------------------
var socket = io();
var running = true;

// Update the video frame whenever the server pushes a new base64-encoded JPEG.
socket.on('stream', function(data) {
    var img = document.getElementById('video');
    img.src = 'data:image/jpeg;base64,' + data.image;
});

// Append each generated response to the text container and keep it scrolled down.
socket.on('text', function(data) {
    var textContainer = document.getElementById('text-container');
    var newMessage = document.createElement('div');
    newMessage.classList.add('message');
    newMessage.textContent = data.message;
    textContainer.appendChild(newMessage);
    textContainer.scrollTop = textContainer.scrollHeight;
});
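// Optional: a minimal sketch for surfacing the socket connection state.
// 'connect' and 'disconnect' are standard Socket.IO client events; these
// handlers are an illustrative addition, not part of the original app.
socket.on('connect', function() {
    console.log('Connected to DIY-Astra server.');
});

socket.on('disconnect', function(reason) {
    console.log('Disconnected from server:', reason);
});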
function toggleApp() {
    var controlButton = document.getElementById('control-button');
    if (running) {
        fetch('/stop')
            .then(response => response.json())
            .then(data => {
                console.log('App stopped:', data);
                alert('The application has been stopped.');
                controlButton.innerHTML = '▶️ Resume';
                running = false;
            })
            .catch((error) => {
                console.error('Error stopping the app:', error);
            });
    } else {
        fetch('/resume')
            .then(response => response.json())
            .then(data => {
                console.log('App resumed:', data);
                alert('The application has resumed.');
                controlButton.innerHTML = '⏹ Stop';
                running = true;
            })
            .catch((error) => {
                console.error('Error resuming the app:', error);
            });
    }
}

// Named updateInterval rather than setInterval so it does not shadow the
// built-in window.setInterval.
function updateInterval() {
    var intervalInput = document.getElementById('interval-input').value;
    fetch('/set_interval', {
        method: 'POST',
        headers: {
            'Content-Type': 'application/json'
        },
        body: JSON.stringify({ interval: parseInt(intervalInput, 10) })
    })
        .then(response => response.json())
        .then(data => {
            if (data.status === 'interval updated') {
                alert('Capture interval updated to ' + data.interval + ' seconds.');
            } else {
                alert('Failed to update interval: ' + data.message);
            }
        })
        .catch(error => {
            console.error('Error setting interval:', error);
        });
}
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------

# DIY-Astra

DIY-Astra is a Flask application that combines computer vision and natural language processing to create an interactive AI assistant. The application captures a live video feed from a webcam, analyzes the captured frames through the OpenRouter API (with Claude Sonnet 4), and generates text responses based on the visual input. The generated responses are then converted to audio with the ElevenLabs API and played back to the user.

## Features
- Live video feed capture from the webcam
- Image analysis using the OpenRouter API with Claude Sonnet 4 (supports 400+ AI models)
- Text generation based on visual input with context-aware conversation memory
- Text-to-speech conversion using the ElevenLabs API
- Real-time audio playback of generated responses
- Web-based user interface for interaction and control
- Flexible model selection (Claude Sonnet 4, Claude 3.5 Sonnet, Claude 3.7 Sonnet, and more)

## Requirements
To run the DIY-Astra application, you need the following dependencies installed:
- Python 3.x
- Flask
- Flask-SocketIO
- OpenCV (cv2)
- Pydub
- Pillow (PIL)
- Requests

You also need valid API keys for the following services:
- OpenRouter API (OPENROUTER_API_KEY) - Get your free key at https://openrouter.ai/keys
- ElevenLabs API (ELEVENLABS_API_KEY) - For text-to-speech conversion

## Installation
1. Clone the repository:
```bash
git clone https://github.com/your-username/diy-astra.git
```

2. Navigate to the project directory:
```bash
cd diy-astra
```

3. Install the required dependencies:
```bash
pip install -r requirements.txt
```

4. Set up the API keys (the exact lines to edit are shown below):
- Replace `OPENROUTER_API_KEY` in `app.py` with your OpenRouter API key.
- Replace `ELEVENLABS_API_KEY` in `app.py` with your ElevenLabs API key.
- (Optional) Change `MODEL_NAME` to use a different AI model, such as:
  - `anthropic/claude-sonnet-4` (default, most capable)
  - `anthropic/claude-3.5-sonnet` (faster, cheaper)
  - `anthropic/claude-3.7-sonnet` (balanced)
  - Or any other vision-capable model from OpenRouter's 400+ models
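For reference, these are the configuration lines from the top of `app.py` that the steps above refer to:

```python
OPENROUTER_API_KEY = 'YOUR KEY HERE'
ELEVENLABS_API_KEY = 'YOUR KEY HERE'
VOICE_ID = 'lNHyfbhlVgOTtlbts3eH'
MODEL_NAME = "anthropic/claude-sonnet-4"
```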
5. Run the application:
```bash
python app.py
```

6. Open your web browser and navigate to `http://localhost:5001` to access the DIY-Astra interface.

## Usage
1. Make sure your webcam is connected and accessible.
2. Launch the DIY-Astra application by running `python app.py`.
3. The application will open in your default web browser.
4. The live video feed from your webcam will be displayed in the interface.
5. DIY-Astra will continuously capture images, analyze them using the OpenRouter API (Claude Sonnet 4), and generate text responses based on the visual input.
6. The generated text responses will be displayed in the text container below the video feed.
7. The text responses will also be converted to audio using the ElevenLabs API and played back in real time.
8. You can stop the application by clicking the "Stop" button in the interface. To resume, click the "Resume" button.
9. Adjust the capture interval (in seconds) to control how frequently Astra analyzes new images.

## File Structure
- `app.py`: The main Flask application file containing the server-side logic.
- `templates/index.html`: The HTML template for the user interface.
- `static/css/styles.css`: The CSS stylesheet for styling the user interface.
- `static/js/script.js`: The JavaScript file for client-side interactions and socket communication.
- `requirements.txt`: The list of required Python dependencies.

## Contributing
Contributions are welcome! If you find any issues or have suggestions for improvements, please open an issue or submit a pull request.

## License
This project is licensed under the MIT License.

--------------------------------------------------------------------------------
/app.py:
--------------------------------------------------------------------------------
import os
import base64
import errno
import threading
import time
import webbrowser
from queue import Queue

import cv2
import numpy as np
import requests
from flask import Flask, render_template, jsonify, request
from flask_socketio import SocketIO
from pydub import AudioSegment
from pydub.playback import play
from PIL import Image

# Initialize Flask app and SocketIO
app = Flask(__name__)
socketio = SocketIO(app, cors_allowed_origins='*')

# Set the API keys for OpenRouter and ElevenLabs
OPENROUTER_API_KEY = 'YOUR KEY HERE'
ELEVENLABS_API_KEY = 'YOUR KEY HERE'
# Voice ID for the ElevenLabs API (I'm using a standard voice, but make sure you have access to it)
VOICE_ID = 'lNHyfbhlVgOTtlbts3eH'

# Configure the OpenRouter API
OPENROUTER_API_URL = "https://openrouter.ai/api/v1/chat/completions"
# Model options: anthropic/claude-sonnet-4, anthropic/claude-3.5-sonnet, anthropic/claude-3.7-sonnet
MODEL_NAME = "anthropic/claude-sonnet-4"
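# Optional: instead of hardcoding the keys above, you could read them from
# environment variables. A minimal sketch (assumes OPENROUTER_API_KEY and
# ELEVENLABS_API_KEY are exported in your shell); this is an illustrative
# alternative, not part of the original app:
#
#   OPENROUTER_API_KEY = os.environ.get('OPENROUTER_API_KEY', 'YOUR KEY HERE')
#   ELEVENLABS_API_KEY = os.environ.get('ELEVENLABS_API_KEY', 'YOUR KEY HERE')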
# Folder to save frames
folder = "frames"
if not os.path.exists(folder):
    os.makedirs(folder)

# Check that the webcam opens correctly, then release it; capture_images()
# opens its own handle so the capture thread can be restarted after /stop
# without holding two handles to the same camera.
cap = cv2.VideoCapture(0)
if not cap.isOpened():
    raise IOError("Cannot open webcam")
cap.release()

# Queue of text responses awaiting audio playback
text_queue = Queue()

# Flag to indicate when audio playback is in progress
audio_playing = threading.Event()

# Global state
running = True
capture_interval = 2  # Default capture interval in seconds

def encode_image(image_path):
    """Read an image file and return it as a base64-encoded string."""
    while True:
        try:
            with open(image_path, "rb") as image_file:
                return base64.b64encode(image_file.read()).decode("utf-8")
        except IOError as e:
            if e.errno == errno.EACCES:
                print("Permission denied, retrying in 5 seconds...")
                time.sleep(5)
            else:
                print(f"Error {e.errno}: {e.strerror}")
                return None

def generate_audio(text, filename):
    """Convert text to speech with the ElevenLabs API and save it as an MP3."""
    if len(text) > 2500:
        raise ValueError("Text exceeds the character limit of 2500 characters.")

    url = f"https://api.elevenlabs.io/v1/text-to-speech/{VOICE_ID}"
    headers = {
        "Accept": "audio/mpeg",
        "Content-Type": "application/json",
        "xi-api-key": ELEVENLABS_API_KEY
    }
    data = {
        "text": text,
        "model_id": "eleven_turbo_v2",
        "voice_settings": {
            "stability": 0.5,
            "similarity_boost": 0.5
        }
    }

    response = requests.post(url, json=data, headers=headers)
    # Fail loudly rather than writing an API error body into the MP3 file
    response.raise_for_status()
    with open(filename, 'wb') as f:
        f.write(response.content)

def play_audio():
    """Worker thread: pull responses off the queue and play them as audio."""
    current_audio = "voice_current.mp3"
    next_audio = "voice_next.mp3"
    while True:
        text = text_queue.get()
        if text is None:  # Sentinel value signals shutdown
            break
        audio_playing.set()
        try:
            generate_audio(text, next_audio)
            os.rename(next_audio, current_audio)

            audio = AudioSegment.from_file(current_audio, format="mp3")
            play(audio)
        except Exception as e:
            print(f"Error in play_audio: {e}")
        finally:
            audio_playing.clear()

def generate_new_line(encoded_image):
    """Build a new user message containing the image for the OpenRouter API."""
    return {
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": "Please describe what you see in max 30 words. You are a helpful and friendly assistant called Astra. If you see questions visually answer them - this is very important!"
            },
            {
                "type": "image_url",
                "image_url": {
                    "url": f"data:image/jpeg;base64,{encoded_image}"
                }
            }
        ]
    }

def analyze_image(encoded_image, script):
    """Send the image to the OpenRouter API (Claude) for analysis."""
    try:
        # Build the messages array: conversation history plus the new image message
        messages = script + [generate_new_line(encoded_image)]

        headers = {
            "Authorization": f"Bearer {OPENROUTER_API_KEY}",
            "Content-Type": "application/json",
            "HTTP-Referer": "http://localhost:5001",  # Optional, for rankings
            "X-Title": "DIY-Astra"  # Optional, shows in rankings
        }

        payload = {
            "model": MODEL_NAME,
            "messages": messages
        }

        response = requests.post(
            OPENROUTER_API_URL,
            headers=headers,
            json=payload,
            timeout=30
        )
        response.raise_for_status()

        # Extract the response text
        result = response.json()
        return result['choices'][0]['message']['content']

    except Exception as e:
        print(f"Error in analyze_image: {e}")
        if hasattr(e, 'response') and e.response is not None:
            print(f"Response content: {e.response.text}")
        return ""
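# For reference, OpenRouter's chat completions endpoint returns an
# OpenAI-compatible JSON body, and analyze_image() above relies on that shape.
# A trimmed sketch of the assumed structure (only the fields used here):
#
#   {
#     "choices": [
#       {"message": {"role": "assistant", "content": "I see a person..."}}
#     ]
#   }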
def capture_images():
    """Worker thread: capture frames, stream them to the browser, and analyze them."""
    global capture_interval
    global script
    script = []  # Conversation history of assistant responses
    cap = cv2.VideoCapture(0)  # Thread-local handle, released when the loop exits
    while running:
        try:
            ret, frame = cap.read()
            if ret:
                # Resize and compress the image so the longest side is at most
                # 250 px, e.g. a 640x480 frame becomes 250x187 (ratio 250/640)
                pil_img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
                max_size = 250
                ratio = max_size / max(pil_img.size)
                new_size = tuple(int(x * ratio) for x in pil_img.size)
                resized_img = pil_img.resize(new_size, Image.LANCZOS)
                frame = cv2.cvtColor(np.array(resized_img), cv2.COLOR_RGB2BGR)

                path = f"{folder}/frame.jpg"
                cv2.imwrite(path, frame)
                print("📸 Saving photo.")

                # Check for a failed encode before touching the result, so a
                # None return cannot crash the debug print below
                encoded_image = encode_image(path)
                if not encoded_image:
                    print("Failed to encode image. Retrying in 5 seconds...")
                    time.sleep(5)
                    continue
                print(f"Encoded image: {encoded_image[:30]}...")  # Debug print

                socketio.emit('stream', {'image': encoded_image})

                response_text = analyze_image(encoded_image, script)
                print(f"Astra's response: {response_text}")

                with text_queue.mutex:
                    text_queue.queue.clear()  # Drop any unplayed responses

                text_queue.put(response_text)
                socketio.emit('text', {'message': response_text})
                # Add the assistant's response to the conversation history (OpenAI format)
                script.append(
                    {
                        "role": "assistant",
                        "content": response_text
                    }
                )
            else:
                print("Failed to capture image")

            time.sleep(capture_interval)
        except Exception as e:
            print(f"Error in capture_images: {e}")
    cap.release()

@app.route('/')
def index():
    return render_template('index.html')

@app.route('/stop')
def stop():
    global running
    running = False
    return jsonify({"status": "stopped"})

@app.route('/resume')
def resume():
    global running
    global capture_thread
    running = True
    if not capture_thread.is_alive():
        capture_thread = threading.Thread(target=capture_images)
        capture_thread.start()
    return jsonify({"status": "resumed"})

@app.route('/set_interval', methods=['POST'])
def set_interval():
    global capture_interval
    interval = request.json.get('interval')
    if interval:
        capture_interval = interval
        return jsonify({"status": "interval updated", "interval": capture_interval})
    return jsonify({"status": "failed", "message": "Invalid interval"}), 400
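# The control routes above can also be exercised directly from the command
# line, e.g.:
#
#   curl http://localhost:5001/stop
#   curl http://localhost:5001/resume
#   curl -X POST http://localhost:5001/set_interval \
#        -H "Content-Type: application/json" -d '{"interval": 5}'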
if __name__ == '__main__':
    running = True
    capture_thread = threading.Thread(target=capture_images)
    capture_thread.start()
    audio_thread = threading.Thread(target=play_audio)
    audio_thread.start()

    # Open the default web browser to the server link
    webbrowser.open('http://localhost:5001')

    socketio.run(app, host='0.0.0.0', port=5001)

    # Signal the worker threads to stop once the server exits, then wait
    running = False
    capture_thread.join()
    text_queue.put(None)  # Sentinel so play_audio() breaks out of its loop
    audio_thread.join()
--------------------------------------------------------------------------------