├── frames
│   └── frame.jpg
├── requirements.txt
├── templates
│   └── index.html
├── static
│   ├── css
│   │   └── styles.css
│   └── js
│       └── script.js
├── README.md
└── app.py
/frames/frame.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Doriandarko/DIY-Astra/HEAD/frames/frame.jpg
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
flask
flask-socketio
opencv-python
numpy
pydub
pillow
requests
--------------------------------------------------------------------------------
/templates/index.html:
--------------------------------------------------------------------------------
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Live Camera Feed</title>
    <link rel="stylesheet" href="{{ url_for('static', filename='css/styles.css') }}">
</head>
<body>
    <div class="container">
        <!-- Frames pushed over Socket.IO are rendered into this <img> -->
        <div class="video-container">
            <img id="video" src="" alt="Live camera feed">
        </div>
        <!-- Astra's text responses are appended here by script.js -->
        <div id="text-container" class="text-container"></div>
        <div class="control-container">
            <div class="interval-container">
                <input type="number" id="interval-input" min="1" placeholder="Interval (seconds)">
                <button onclick="setCaptureInterval()">Set Interval</button>
            </div>
            <button id="control-button" onclick="toggleApp()">⏹ Stop</button>
        </div>
    </div>
    <script src="https://cdn.socket.io/4.7.5/socket.io.min.js"></script>
    <script src="{{ url_for('static', filename='js/script.js') }}"></script>
</body>
</html>
--------------------------------------------------------------------------------
/static/css/styles.css:
--------------------------------------------------------------------------------
body {
    display: flex;
    justify-content: center;
    align-items: center;
    height: 100vh;
    margin: 0;
    font-family: Arial, sans-serif;
    background-color: #f0f0f0;
}

.container {
    display: flex;
    flex-direction: column;
    align-items: center;
    background-color: white;
    border-radius: 10px;
    box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
    padding: 20px;
}

.video-container {
    width: 640px;
    height: 480px;
    background: #000;
    display: flex;
    justify-content: center;
    align-items: center;
    border-radius: 10px;
    overflow: hidden;
}

#video {
    width: 100%;
    height: 100%;
    object-fit: cover;
}

.text-container {
    width: 640px;
    height: 200px;
    background: #f0f0f0; /* grey */
    overflow-y: auto;
    margin-top: 20px;
    border: 1px solid #ddd;
    padding: 10px;
    border-radius: 10px;
    color: rgb(35, 35, 35);
    font-size: 16px;
}

.message {
    margin-bottom: 10px; /* Add vertical space between messages */
    white-space: pre-line; /* Preserve line breaks and spaces */
}

.control-container {
    display: flex;
    justify-content: space-between;
    align-items: center; /* Align items vertically centered */
    width: 640px;
    margin-top: 20px;
}

.interval-container {
    display: flex;
    align-items: center;
}

#interval-input {
    padding-left: 12px;
    font-size: 16px;
    margin-right: 10px;
    border: 1px solid #ddd;
    border-radius: 20px;
    height: 42px; /* Same height as the buttons */
}

button {
    padding: 10px 20px;
    font-size: 16px;
    background-color: black;
    color: white;
    border: none;
    border-radius: 20px;
    cursor: pointer;
    display: flex;
    align-items: center;
    justify-content: center;
    height: 42px; /* Ensure the buttons have the same height */
}

button:hover {
    background-color: #333;
}

button span {
    margin-left: 10px;
}
--------------------------------------------------------------------------------
/static/js/script.js:
--------------------------------------------------------------------------------
// Socket.IO connection back to the Flask-SocketIO server
var socket = io();
var running = true;

// Receive base64-encoded JPEG frames and show them in the <img id="video">
socket.on('stream', function(data) {
    var img = document.getElementById('video');
    img.src = 'data:image/jpeg;base64,' + data.image;
});

// Append each new text response and keep the container scrolled to the bottom
socket.on('text', function(data) {
    var textContainer = document.getElementById('text-container');
    var newMessage = document.createElement('div');
    newMessage.classList.add('message');
    newMessage.textContent = data.message;
    textContainer.appendChild(newMessage);
    textContainer.scrollTop = textContainer.scrollHeight;
});

// Toggle the capture loop via the /stop and /resume endpoints
function toggleApp() {
    var controlButton = document.getElementById('control-button');
    if (running) {
        fetch('/stop')
            .then(response => response.json())
            .then(data => {
                console.log('App stopped:', data);
                alert('The application has been stopped.');
                controlButton.innerHTML = '▶️ Resume';
                running = false;
            })
            .catch((error) => {
                console.error('Error stopping the app:', error);
            });
    } else {
        fetch('/resume')
            .then(response => response.json())
            .then(data => {
                console.log('App resumed:', data);
                alert('The application has resumed.');
                controlButton.innerHTML = '⏹ Stop';
                running = true;
            })
            .catch((error) => {
                console.error('Error resuming the app:', error);
            });
    }
}

// Named setCaptureInterval (rather than setInterval) so it does not shadow window.setInterval
function setCaptureInterval() {
    var intervalInput = document.getElementById('interval-input').value;
    fetch('/set_interval', {
        method: 'POST',
        headers: {
            'Content-Type': 'application/json'
        },
        body: JSON.stringify({ interval: parseInt(intervalInput, 10) })
    })
        .then(response => response.json())
        .then(data => {
            if (data.status === 'interval updated') {
                alert('Capture interval updated to ' + data.interval + ' seconds.');
            } else {
                alert('Failed to update interval: ' + data.message);
            }
        })
        .catch(error => {
            console.error('Error setting interval:', error);
        });
}
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# DIY-Astra

DIY-Astra is a Flask application that combines computer vision and a large language model to create an interactive AI assistant. It captures a live video feed from a webcam, analyzes the captured frames with the OpenRouter API (Claude Sonnet 4 by default), and generates text responses based on the visual input. Each response is then converted to speech with the ElevenLabs API and played back to the user.
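
Under the hood, each captured frame is posted to OpenRouter's chat-completions endpoint as a base64-encoded `image_url` content part. Here is a minimal sketch of that request, mirroring what `analyze_image` in `app.py` does (the key and prompt below are placeholders):

```python
import base64
import requests

API_KEY = "YOUR KEY HERE"  # placeholder; app.py uses its OPENROUTER_API_KEY constant

# Encode a saved frame as base64 so it can travel inside the JSON payload
with open("frames/frame.jpg", "rb") as f:
    encoded_image = base64.b64encode(f.read()).decode("utf-8")

response = requests.post(
    "https://openrouter.ai/api/v1/chat/completions",
    headers={"Authorization": f"Bearer {API_KEY}"},
    json={
        "model": "anthropic/claude-sonnet-4",
        "messages": [{
            "role": "user",
            "content": [
                {"type": "text", "text": "Please describe what you see in max 30 words."},
                {"type": "image_url",
                 "image_url": {"url": f"data:image/jpeg;base64,{encoded_image}"}},
            ],
        }],
    },
    timeout=30,
)
print(response.json()["choices"][0]["message"]["content"])
```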

## Features
- Live video feed captured from the webcam
- Image analysis via the OpenRouter API with Claude Sonnet 4 (OpenRouter hosts 400+ models)
- Text generation based on visual input, with context-aware conversation memory
- Text-to-speech conversion using the ElevenLabs API
- Real-time audio playback of generated responses
- Web-based user interface for interaction and control
- Flexible model selection (Claude Sonnet 4, Claude 3.5 Sonnet, Claude 3.7 Sonnet, and more)

## Requirements
To run the DIY-Astra application, you need the following dependencies installed:
- Python 3.x
- Flask
- Flask-SocketIO
- OpenCV (cv2)
- Pydub
- Pillow (PIL)
- Requests

You also need valid API keys for the following services:
- OpenRouter API (`OPENROUTER_API_KEY`) - get your free key at https://openrouter.ai/keys
- ElevenLabs API (`ELEVENLABS_API_KEY`) - for text-to-speech conversion
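
As shipped, `app.py` holds these keys as hardcoded constants. If you would rather not keep secrets in the file, one common alternative (not wired into `app.py` by default; shown only as a sketch) is to read them from environment variables:

```python
import os

# Hypothetical replacement for the hardcoded constants in app.py;
# raises KeyError if the variables are not exported before launch
OPENROUTER_API_KEY = os.environ["OPENROUTER_API_KEY"]
ELEVENLABS_API_KEY = os.environ["ELEVENLABS_API_KEY"]
```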

## Installation
1. Clone the repository:
```bash
git clone https://github.com/your-username/diy-astra.git
```

2. Navigate to the project directory:
```bash
cd diy-astra
```

3. Install the required dependencies:
```bash
pip install -r requirements.txt
```

4. Set up the API keys (the relevant configuration block from `app.py` is shown below):
   - Replace `OPENROUTER_API_KEY` in `app.py` with your OpenRouter API key.
   - Replace `ELEVENLABS_API_KEY` in `app.py` with your ElevenLabs API key.
   - (Optional) Change `MODEL_NAME` to use a different AI model, such as:
     - `anthropic/claude-sonnet-4` (default, most capable)
     - `anthropic/claude-3.5-sonnet` (faster, cheaper)
     - `anthropic/claude-3.7-sonnet` (balanced)
     - or any other vision-capable model from OpenRouter's 400+ models
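
For reference, the key configuration constants near the top of `app.py`:

```python
# Set the API keys for OpenRouter and ElevenLabs
OPENROUTER_API_KEY = 'YOUR KEY HERE'
ELEVENLABS_API_KEY = 'YOUR KEY HERE'
# Voice ID for the ElevenLabs API
VOICE_ID = 'lNHyfbhlVgOTtlbts3eH'

# Configure OpenRouter API
OPENROUTER_API_URL = "https://openrouter.ai/api/v1/chat/completions"
MODEL_NAME = "anthropic/claude-sonnet-4"
```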

5. Run the application:
```bash
python app.py
```

6. Open your web browser and navigate to `http://localhost:5001` to access the DIY-Astra interface.

## Usage
1. Make sure your webcam is connected and accessible.
2. Launch the DIY-Astra application by running `python app.py`.
3. The application will open in your default web browser.
4. The live video feed from your webcam is displayed in the interface.
5. DIY-Astra continuously captures frames, analyzes them via the OpenRouter API (Claude Sonnet 4), and generates text responses based on the visual input.
6. The generated text responses are displayed in the text container below the video feed.
7. The text responses are also converted to audio using the ElevenLabs API and played back in real time.
8. Stop the application by clicking the "Stop" button in the interface; click "Resume" to continue.
9. Adjust the capture interval (in seconds) to control how frequently Astra analyzes new images; the interval can also be set programmatically, as shown below.
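
The interval control simply POSTs JSON to the `/set_interval` route, so anything that can send an HTTP request works as well, for example:

```python
import requests

# Set the capture interval to 5 seconds (the server must be running on port 5001)
r = requests.post("http://localhost:5001/set_interval", json={"interval": 5}, timeout=5)
print(r.json())  # e.g. {'interval': 5, 'status': 'interval updated'}
```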

## File Structure
- `app.py`: The main Flask application file containing the server-side logic.
- `templates/index.html`: The HTML template for the user interface.
- `static/css/styles.css`: The CSS stylesheet for styling the user interface.
- `static/js/script.js`: The JavaScript file for client-side interactions and socket communication.
- `requirements.txt`: The list of required Python dependencies.
- `frames/`: The folder where the most recent webcam frame is saved as `frame.jpg`.

## Contributing
Contributions are welcome! If you find any issues or have suggestions for improvements, please open an issue or submit a pull request.

## License
This project is licensed under the MIT License.
--------------------------------------------------------------------------------
/app.py:
--------------------------------------------------------------------------------
import os
import cv2
import threading
import base64
import time
import webbrowser
import requests
from flask import Flask, render_template, jsonify, request
from flask_socketio import SocketIO, emit
from queue import Queue
from pydub import AudioSegment
from pydub.playback import play
from PIL import Image
import numpy as np
import errno

# Initialize Flask app and SocketIO
app = Flask(__name__)
socketio = SocketIO(app, cors_allowed_origins='*')

# Set the API keys for OpenRouter and ElevenLabs
OPENROUTER_API_KEY = 'YOUR KEY HERE'
ELEVENLABS_API_KEY = 'YOUR KEY HERE'
# Voice ID for the ElevenLabs API (a standard voice, but make sure your account has access to it)
VOICE_ID = 'lNHyfbhlVgOTtlbts3eH'

# Configure OpenRouter API
OPENROUTER_API_URL = "https://openrouter.ai/api/v1/chat/completions"
# Model options: anthropic/claude-sonnet-4, anthropic/claude-3.5-sonnet, anthropic/claude-3.7-sonnet
MODEL_NAME = "anthropic/claude-sonnet-4"

# Folder to save frames
folder = "frames"
if not os.path.exists(folder):
    os.makedirs(folder)

# Check that the webcam can be opened, then release it again;
# capture_images() opens its own handle on the capture thread
cap = cv2.VideoCapture(0)
if not cap.isOpened():
    raise IOError("Cannot open webcam")
cap.release()

# Queue to store text responses
text_queue = Queue()

# Flag to indicate when audio playback is in progress
audio_playing = threading.Event()

# Global variables
running = True
capture_interval = 2  # Default interval in seconds

def encode_image(image_path):
    """Read an image file and return it base64-encoded, retrying while the file is locked."""
    while True:
        try:
            with open(image_path, "rb") as image_file:
                return base64.b64encode(image_file.read()).decode("utf-8")
        except IOError as e:
            if e.errno == errno.EACCES:
                print("Permission denied, retrying in 5 seconds...")
                time.sleep(5)
            else:
                print(f"Error {e.errno}: {e.strerror}")
                return None

def generate_audio(text, filename):
    """Convert text to speech with the ElevenLabs API and save it as an MP3."""
    if len(text) > 2500:
        raise ValueError("Text exceeds the character limit of 2500 characters.")

    url = f"https://api.elevenlabs.io/v1/text-to-speech/{VOICE_ID}"
    headers = {
        "Accept": "audio/mpeg",
        "Content-Type": "application/json",
        "xi-api-key": ELEVENLABS_API_KEY
    }
    data = {
        "text": text,
        "model_id": "eleven_turbo_v2",
        "voice_settings": {
            "stability": 0.5,
            "similarity_boost": 0.5
        }
    }

    response = requests.post(url, json=data, headers=headers)
    # Fail loudly instead of silently writing an error payload into the .mp3 file
    response.raise_for_status()
    with open(filename, 'wb') as f:
        f.write(response.content)

def play_audio():
    """Consume text from the queue, synthesize it, and play it back; a None entry shuts the thread down."""
    current_audio = "voice_current.mp3"
    next_audio = "voice_next.mp3"
    while True:
        text = text_queue.get()
        if text is None:  # Sentinel: exit the audio thread
            break
        audio_playing.set()
        try:
            generate_audio(text, next_audio)
            # os.replace overwrites the destination if it exists (os.rename would fail on Windows)
            os.replace(next_audio, current_audio)

            audio = AudioSegment.from_file(current_audio, format="mp3")
            play(audio)
        except Exception as e:
            print(f"Error in play_audio: {e}")
        finally:
            audio_playing.clear()

def generate_new_line(encoded_image):
    """Generate a new message with image for OpenRouter API"""
    return {
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": "Please describe what you see in max 30 words. You are a helpful and friendly assistant called Astra. If you see questions visually answer them - this is very important!"
            },
            {
                "type": "image_url",
                "image_url": {
                    "url": f"data:image/jpeg;base64,{encoded_image}"
                }
            }
        ]
    }

def analyze_image(encoded_image, script):
    """Send image to OpenRouter API (Claude) for analysis"""
    try:
        # Build messages array - conversation history plus the new image message
        messages = script + [generate_new_line(encoded_image)]

        # Prepare headers
        headers = {
            "Authorization": f"Bearer {OPENROUTER_API_KEY}",
            "Content-Type": "application/json",
            "HTTP-Referer": "http://localhost:5001",  # Optional, for rankings
            "X-Title": "DIY-Astra"  # Optional, shows in rankings
        }

        # Prepare request payload
        payload = {
            "model": MODEL_NAME,
            "messages": messages
        }

        # Make API request
        response = requests.post(
            OPENROUTER_API_URL,
            headers=headers,
            json=payload,
            timeout=30
        )
        response.raise_for_status()

        # Extract response text
        result = response.json()
        return result['choices'][0]['message']['content']

    except Exception as e:
        print(f"Error in analyze_image: {e}")
        if hasattr(e, 'response') and e.response is not None:
            print(f"Response content: {e.response.text}")
        return ""

def capture_images():
    """Main loop: grab a frame, stream it to the browser, analyze it, and queue the spoken response."""
    global capture_interval
    global script
    script = []  # Conversation history (assistant turns) sent back to the model for context
    cap = cv2.VideoCapture(0)
    while running:
        try:
            ret, frame = cap.read()
            if ret:
                # Resize and compress the image so the API request stays small
                pil_img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
                max_size = 250
                ratio = max_size / max(pil_img.size)
                new_size = tuple(int(x * ratio) for x in pil_img.size)
                resized_img = pil_img.resize(new_size, Image.LANCZOS)
                frame = cv2.cvtColor(np.array(resized_img), cv2.COLOR_RGB2BGR)

                path = f"{folder}/frame.jpg"
                cv2.imwrite(path, frame)
                print("📸 Saving photo.")

                encoded_image = encode_image(path)
                if not encoded_image:
                    print("Failed to encode image. Retrying in 5 seconds...")
                    time.sleep(5)
                    continue
                print(f"Encoded image: {encoded_image[:30]}...")  # Debug print

                socketio.emit('stream', {'image': encoded_image})

                response_text = analyze_image(encoded_image, script)
                print(f"Astra's response: {response_text}")

                # Drop any unplayed responses so the audio never lags behind the video
                with text_queue.mutex:
                    text_queue.queue.clear()

                text_queue.put(response_text)
                socketio.emit('text', {'message': response_text})
                # Add the assistant's response to the conversation history (OpenAI format)
                script.append(
                    {
                        "role": "assistant",
                        "content": response_text
                    }
                )
            else:
                print("Failed to capture image")

            time.sleep(capture_interval)
        except Exception as e:
            print(f"Error in capture_images: {e}")
    cap.release()

@app.route('/')
def index():
    return render_template('index.html')

@app.route('/stop')
def stop():
    global running
    running = False
    return jsonify({"status": "stopped"})

@app.route('/resume')
def resume():
    global running
    global capture_thread
    running = True
    if not capture_thread.is_alive():
        capture_thread = threading.Thread(target=capture_images)
        capture_thread.start()
    return jsonify({"status": "resumed"})

@app.route('/set_interval', methods=['POST'])
def set_interval():
    global capture_interval
    interval = request.json.get('interval')
    # Only accept positive numbers for the interval
    if isinstance(interval, (int, float)) and interval > 0:
        capture_interval = interval
        return jsonify({"status": "interval updated", "interval": capture_interval})
    return jsonify({"status": "failed", "message": "Invalid interval"}), 400

if __name__ == '__main__':
    running = True
    capture_thread = threading.Thread(target=capture_images)
    capture_thread.start()
    audio_thread = threading.Thread(target=play_audio)
    audio_thread.start()

    # Open the default web browser to the server link
    webbrowser.open('http://localhost:5001')

    socketio.run(app, host='0.0.0.0', port=5001)

    # The server has exited (e.g. Ctrl+C): stop the worker threads cleanly
    running = False
    capture_thread.join()
    text_queue.put(None)  # Sentinel unblocks the audio thread so it can exit
    audio_thread.join()
--------------------------------------------------------------------------------